Diffstat (limited to 'VexRiscv/src/main/scala/vexriscv/ip')
-rw-r--r--  VexRiscv/src/main/scala/vexriscv/ip/DataCache.scala         1184
-rw-r--r--  VexRiscv/src/main/scala/vexriscv/ip/InstructionCache.scala   487
-rw-r--r--  VexRiscv/src/main/scala/vexriscv/ip/fpu/FpuCore.scala       1944
-rw-r--r--  VexRiscv/src/main/scala/vexriscv/ip/fpu/FpuDiv.scala         140
-rw-r--r--  VexRiscv/src/main/scala/vexriscv/ip/fpu/FpuSqrt.scala        116
-rw-r--r--  VexRiscv/src/main/scala/vexriscv/ip/fpu/Interface.scala      186
6 files changed, 4057 insertions, 0 deletions
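
The six new files below add the data cache, the instruction cache, and the FPU as parameterizable SpinalHDL components. For orientation, a data cache is built from a DataCacheConfig plus an MMU bus parameter; the sketch below shows a minimal instantiation (the parameter values are illustrative assumptions, only the names come from DataCache.scala):

    // Hypothetical configuration: a direct-mapped 4 KiB cache with 32-byte lines.
    val dCacheConfig = DataCacheConfig(
      cacheSize        = 4096,
      bytePerLine      = 32,
      wayCount         = 1,
      addressWidth     = 32,
      cpuDataWidth     = 32,
      memDataWidth     = 32,
      catchAccessError = true,
      catchIllegal     = true,
      catchUnaligned   = true
    )
    // mmuParameter is assumed to be a MemoryTranslatorBusParameter provided by the pipeline.
    val dCache = new DataCache(dCacheConfig, mmuParameter)
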
diff --git a/VexRiscv/src/main/scala/vexriscv/ip/DataCache.scala b/VexRiscv/src/main/scala/vexriscv/ip/DataCache.scala new file mode 100644 index 0000000..2b70400 --- /dev/null +++ b/VexRiscv/src/main/scala/vexriscv/ip/DataCache.scala @@ -0,0 +1,1184 @@ +package vexriscv.ip + +import vexriscv._ +import spinal.core._ +import spinal.lib._ +import spinal.lib.bus.amba4.axi.{Axi4Config, Axi4Shared} +import spinal.lib.bus.avalon.{AvalonMM, AvalonMMConfig} +import spinal.lib.bus.bmb.{Bmb, BmbAccessParameter, BmbCmd, BmbInvalidationParameter, BmbParameter, BmbSourceParameter} +import spinal.lib.bus.wishbone.{Wishbone, WishboneConfig} +import spinal.lib.bus.simple._ +import vexriscv.plugin.DBusSimpleBus + + +case class DataCacheConfig(cacheSize : Int, + bytePerLine : Int, + wayCount : Int, + addressWidth : Int, + cpuDataWidth : Int, + var rfDataWidth : Int = -1, //-1 mean cpuDataWidth + memDataWidth : Int, + catchAccessError : Boolean, + catchIllegal : Boolean, + catchUnaligned : Boolean, + earlyWaysHits : Boolean = true, + earlyDataMux : Boolean = false, + tagSizeShift : Int = 0, //Used to force infering ram + withLrSc : Boolean = false, + withAmo : Boolean = false, + withExclusive : Boolean = false, + withInvalidate : Boolean = false, + pendingMax : Int = 64, + directTlbHit : Boolean = false, + mergeExecuteMemory : Boolean = false, + asyncTagMemory : Boolean = false, + withWriteAggregation : Boolean = false){ + + if(rfDataWidth == -1) rfDataWidth = cpuDataWidth + assert(!(mergeExecuteMemory && (earlyDataMux || earlyWaysHits))) + assert(!(earlyDataMux && !earlyWaysHits)) + assert(isPow2(pendingMax)) + assert(rfDataWidth <= memDataWidth) + + def lineCount = cacheSize/bytePerLine/wayCount + def sizeMax = log2Up(bytePerLine) + def sizeWidth = log2Up(sizeMax + 1) + val aggregationWidth = if(withWriteAggregation) log2Up(memDataBytes+1) else 0 + def withWriteResponse = withExclusive + def burstSize = bytePerLine*8/memDataWidth + val burstLength = bytePerLine/(cpuDataWidth/8) + def catchSomething = catchUnaligned || catchIllegal || catchAccessError + def withInternalAmo = withAmo && !withExclusive + def withInternalLrSc = withLrSc && !withExclusive + def withExternalLrSc = withLrSc && withExclusive + def withExternalAmo = withAmo && withExclusive + def cpuDataBytes = cpuDataWidth/8 + def rfDataBytes = rfDataWidth/8 + def memDataBytes = memDataWidth/8 + def getAxi4SharedConfig() = Axi4Config( + addressWidth = addressWidth, + dataWidth = memDataWidth, + useId = false, + useRegion = false, + useBurst = false, + useLock = false, + useQos = false + ) + + + def getAvalonConfig() = AvalonMMConfig.bursted( + addressWidth = addressWidth, + dataWidth = memDataWidth, + burstCountWidth = log2Up(burstSize + 1)).copy( + useByteEnable = true, + constantBurstBehavior = true, + burstOnBurstBoundariesOnly = true, + useResponse = true, + maximumPendingReadTransactions = 2 + ) + + def getWishboneConfig() = WishboneConfig( + addressWidth = 32-log2Up(memDataWidth/8), + dataWidth = memDataWidth, + selWidth = memDataBytes, + useSTALL = false, + useLOCK = false, + useERR = true, + useRTY = false, + tgaWidth = 0, + tgcWidth = 0, + tgdWidth = 0, + useBTE = true, + useCTI = true + ) + + def getBmbParameter() = BmbParameter( + BmbAccessParameter( + addressWidth = 32, + dataWidth = memDataWidth + ).addSources(1, BmbSourceParameter( + lengthWidth = log2Up(this.bytePerLine), + contextWidth = (if(!withWriteResponse) 1 else 0) + aggregationWidth, + alignment = BmbParameter.BurstAlignement.LENGTH, + canExclusive = withExclusive, + 
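        // Note: the BMB context field travels with each command and returns with the
        // response; it carries one bit to tag writes when the bus gives no write
        // response (withWriteResponse == false), plus the write-aggregation beat
        // counter when withWriteAggregation is enabled (see the Context bundle below).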
withCachedRead = true, + canInvalidate = withInvalidate, + canSync = withInvalidate + )), + BmbInvalidationParameter( + invalidateLength = log2Up(this.bytePerLine), + invalidateAlignment = BmbParameter.BurstAlignement.LENGTH + ) + ) +} + +object DataCacheCpuExecute{ + implicit def implArgs(that : DataCacheCpuExecute) = that.args +} + +case class DataCacheCpuExecute(p : DataCacheConfig) extends Bundle with IMasterSlave{ + val isValid = Bool + val address = UInt(p.addressWidth bit) + val haltIt = Bool + val args = DataCacheCpuExecuteArgs(p) + val refilling = Bool + + override def asMaster(): Unit = { + out(isValid, args, address) + in(haltIt, refilling) + } +} + +case class DataCacheCpuExecuteArgs(p : DataCacheConfig) extends Bundle{ + val wr = Bool + val size = UInt(log2Up(log2Up(p.cpuDataBytes)+1) bits) + val isLrsc = p.withLrSc generate Bool() + val isAmo = p.withAmo generate Bool() + val amoCtrl = p.withAmo generate new Bundle { + val swap = Bool() + val alu = Bits(3 bits) + } + + val totalyConsistent = Bool() //Only for AMO/LRSC +} + +case class DataCacheCpuMemory(p : DataCacheConfig, mmu : MemoryTranslatorBusParameter) extends Bundle with IMasterSlave{ + val isValid = Bool + val isStuck = Bool + val isWrite = Bool + val address = UInt(p.addressWidth bit) + val mmuRsp = MemoryTranslatorRsp(mmu) + + override def asMaster(): Unit = { + out(isValid, isStuck, address) + in(isWrite) + out(mmuRsp) + } +} + + +case class FenceFlags() extends Bundle { + val SW,SR,SO,SI,PW,PR,PO,PI = Bool() + val FM = Bits(4 bits) + + def SL = SR || SI + def SS = SW || SO + def PL = PR || PI + def PS = PW || PO + def forceAll(): Unit ={ + List(SW,SR,SO,SI,PW,PR,PO,PI).foreach(_ := True) + } + def clearAll(): Unit ={ + List(SW,SR,SO,SI,PW,PR,PO,PI).foreach(_ := False) + } +} + +case class DataCacheCpuWriteBack(p : DataCacheConfig) extends Bundle with IMasterSlave{ + val isValid = Bool() + val isStuck = Bool() + val isFiring = Bool() + val isUser = Bool() + val haltIt = Bool() + val isWrite = Bool() + val storeData = Bits(p.cpuDataWidth bit) + val data = Bits(p.cpuDataWidth bit) + val address = UInt(p.addressWidth bit) + val mmuException, unalignedAccess, accessError = Bool() + val keepMemRspData = Bool() //Used by external AMO to avoid having an internal buffer + val fence = FenceFlags() + val exclusiveOk = Bool() + + override def asMaster(): Unit = { + out(isValid,isStuck,isUser, address, fence, storeData, isFiring) + in(haltIt, data, mmuException, unalignedAccess, accessError, isWrite, keepMemRspData, exclusiveOk) + } +} + +case class DataCacheFlush(lineCount : Int) extends Bundle{ + val singleLine = Bool() + val lineId = UInt(log2Up(lineCount) bits) +} + +case class DataCacheCpuBus(p : DataCacheConfig, mmu : MemoryTranslatorBusParameter) extends Bundle with IMasterSlave{ + val execute = DataCacheCpuExecute(p) + val memory = DataCacheCpuMemory(p, mmu) + val writeBack = DataCacheCpuWriteBack(p) + + val redo = Bool() + val flush = Stream(DataCacheFlush(p.lineCount)) + + override def asMaster(): Unit = { + master(execute) + master(memory) + master(writeBack) + master(flush) + in(redo) + } +} + + +case class DataCacheMemCmd(p : DataCacheConfig) extends Bundle{ + val wr = Bool + val uncached = Bool + val address = UInt(p.addressWidth bit) + val data = Bits(p.cpuDataWidth bits) + val mask = Bits(p.cpuDataWidth/8 bits) + val size = UInt(p.sizeWidth bits) //... 1 => 2 bytes ... 2 => 4 bytes ... 
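  //The size field is log2(byte count): 0 => 1 byte, 1 => 2 bytes, 2 => 4 bytes, up to
  //log2Up(bytePerLine), which denotes a full cache-line burst (see isBurst below).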
+ val exclusive = p.withExclusive generate Bool() + val last = Bool + +// def beatCountMinusOne = size.muxListDc((0 to p.sizeMax).map(i => i -> U((1 << i)/p.memDataBytes))) +// def beatCount = size.muxListDc((0 to p.sizeMax).map(i => i -> U((1 << i)/p.memDataBytes-1))) + + //Utilities which does quite a few assumtions about the bus utilisation + def byteCountMinusOne = size.muxListDc((0 to p.sizeMax).map(i => i -> U((1 << i)-1, log2Up(p.bytePerLine) bits))) + def beatCountMinusOne = (size === log2Up(p.bytePerLine)) ? U(p.burstSize-1) | U(0) + def beatCount = (size === log2Up(p.bytePerLine)) ? U(p.burstSize) | U(1) + def isBurst = size === log2Up(p.bytePerLine) +} +case class DataCacheMemRsp(p : DataCacheConfig) extends Bundle{ + val aggregated = UInt(p.aggregationWidth bits) + val last = Bool() + val data = Bits(p.memDataWidth bit) + val error = Bool + val exclusive = p.withExclusive generate Bool() +} +case class DataCacheInv(p : DataCacheConfig) extends Bundle{ + val enable = Bool() + val address = UInt(p.addressWidth bit) +} +case class DataCacheAck(p : DataCacheConfig) extends Bundle{ + val hit = Bool() +} + +case class DataCacheSync(p : DataCacheConfig) extends Bundle{ + val aggregated = UInt(p.aggregationWidth bits) +} + +case class DataCacheMemBus(p : DataCacheConfig) extends Bundle with IMasterSlave{ + val cmd = Stream (DataCacheMemCmd(p)) + val rsp = Flow (DataCacheMemRsp(p)) + + val inv = p.withInvalidate generate Stream(Fragment(DataCacheInv(p))) + val ack = p.withInvalidate generate Stream(Fragment(DataCacheAck(p))) + val sync = p.withInvalidate generate Stream(DataCacheSync(p)) + + override def asMaster(): Unit = { + master(cmd) + slave(rsp) + + if(p.withInvalidate) { + slave(inv) + master(ack) + slave(sync) + } + } + + def toAxi4Shared(stageCmd : Boolean = false, pendingWritesMax : Int = 7): Axi4Shared = { + val axi = Axi4Shared(p.getAxi4SharedConfig()).setName("dbus_axi") + + val cmdPreFork = if (stageCmd) cmd.stage.stage().s2mPipe() else cmd + + val pendingWrites = CounterUpDown( + stateCount = pendingWritesMax + 1, + incWhen = cmdPreFork.fire && cmdPreFork.wr, + decWhen = axi.writeRsp.fire + ) + + val hazard = (pendingWrites =/= 0 && !cmdPreFork.wr) || pendingWrites === pendingWritesMax + val (cmdFork, dataFork) = StreamFork2(cmdPreFork.haltWhen(hazard)) + val cmdStage = cmdFork.throwWhen(RegNextWhen(!cmdFork.last,cmdFork.fire).init(False)) + val dataStage = dataFork.throwWhen(!dataFork.wr) + + axi.sharedCmd.arbitrationFrom(cmdStage) + axi.sharedCmd.write := cmdStage.wr + axi.sharedCmd.prot := "010" + axi.sharedCmd.cache := "1111" + axi.sharedCmd.size := log2Up(p.memDataBytes) + axi.sharedCmd.addr := cmdStage.address + axi.sharedCmd.len := cmdStage.beatCountMinusOne.resized + + axi.writeData.arbitrationFrom(dataStage) + axi.writeData.data := dataStage.data + axi.writeData.strb := dataStage.mask + axi.writeData.last := dataStage.last + + rsp.valid := axi.r.valid + rsp.error := !axi.r.isOKAY() + rsp.data := axi.r.data + + axi.r.ready := True + axi.b.ready := True + + axi + } + + + def toAvalon(): AvalonMM = { + val avalonConfig = p.getAvalonConfig() + val mm = AvalonMM(avalonConfig) + mm.read := cmd.valid && !cmd.wr + mm.write := cmd.valid && cmd.wr + mm.address := cmd.address(cmd.address.high downto log2Up(p.memDataWidth/8)) @@ U(0,log2Up(p.memDataWidth/8) bits) + mm.burstCount := cmd.beatCount + mm.byteEnable := cmd.mask + mm.writeData := cmd.data + + cmd.ready := mm.waitRequestn + rsp.valid := mm.readDataValid + rsp.data := mm.readData + rsp.error := mm.response =/= 
AvalonMM.Response.OKAY + + mm + } + + def toWishbone(): Wishbone = { + val wishboneConfig = p.getWishboneConfig() + val bus = Wishbone(wishboneConfig) + val counter = Reg(UInt(log2Up(p.burstSize) bits)) init(0) + val addressShift = log2Up(p.memDataWidth/8) + + val cmdBridge = Stream (DataCacheMemCmd(p)) + val isBurst = cmdBridge.isBurst + cmdBridge.valid := cmd.valid + cmdBridge.address := (isBurst ? (cmd.address(31 downto widthOf(counter) + addressShift) @@ counter @@ U(0, addressShift bits)) | (cmd.address(31 downto addressShift) @@ U(0, addressShift bits))) + cmdBridge.wr := cmd.wr + cmdBridge.mask := cmd.mask + cmdBridge.data := cmd.data + cmdBridge.size := cmd.size + cmdBridge.last := !isBurst || counter === p.burstSize-1 + cmd.ready := cmdBridge.ready && (cmdBridge.wr || cmdBridge.last) + + + when(cmdBridge.fire){ + counter := counter + 1 + when(cmdBridge.last){ + counter := 0 + } + } + + + bus.ADR := cmdBridge.address >> addressShift + bus.CTI := Mux(isBurst, cmdBridge.last ? B"111" | B"010", B"000") + bus.BTE := B"00" + bus.SEL := cmdBridge.wr ? cmdBridge.mask | B((1 << p.memDataBytes)-1) + bus.WE := cmdBridge.wr + bus.DAT_MOSI := cmdBridge.data + + cmdBridge.ready := cmdBridge.valid && bus.ACK + bus.CYC := cmdBridge.valid + bus.STB := cmdBridge.valid + + rsp.valid := RegNext(cmdBridge.valid && !bus.WE && bus.ACK) init(False) + rsp.data := RegNext(bus.DAT_MISO) + rsp.error := False //TODO + bus + } + + + + def toPipelinedMemoryBus(): PipelinedMemoryBus = { + val bus = PipelinedMemoryBus(32,32) + + val counter = Reg(UInt(log2Up(p.burstSize) bits)) init(0) + when(bus.cmd.fire){ counter := counter + 1 } + when( cmd.fire && cmd.last){ counter := 0 } + + bus.cmd.valid := cmd.valid + bus.cmd.address := (cmd.address(31 downto 2) | counter.resized) @@ U"00" + bus.cmd.write := cmd.wr + bus.cmd.mask := cmd.mask + bus.cmd.data := cmd.data + cmd.ready := bus.cmd.ready && (cmd.wr || counter === p.burstSize-1) + rsp.valid := bus.rsp.valid + rsp.data := bus.rsp.payload.data + rsp.error := False + bus + } + + + def toBmb(syncPendingMax : Int = 32, + timeoutCycles : Int = 16) : Bmb = new Area{ + setCompositeName(DataCacheMemBus.this, "Bridge", true) + val pipelinedMemoryBusConfig = p.getBmbParameter() + val bus = Bmb(pipelinedMemoryBusConfig).setCompositeName(this,"toBmb", true) + + case class Context() extends Bundle{ + val isWrite = !p.withWriteResponse generate Bool() + val rspCount = (p.aggregationWidth != 0) generate UInt(p.aggregationWidth bits) + } + + + def sizeToLength(size : UInt) = size.muxListDc((0 to log2Up(p.cpuDataBytes)).map(i => U(i) -> U((1 << i)-1, log2Up(p.cpuDataBytes) bits))) + + val withoutWriteBuffer = if(p.aggregationWidth == 0) new Area { + val busCmdContext = Context() + + bus.cmd.valid := cmd.valid + bus.cmd.last := cmd.last + bus.cmd.opcode := (cmd.wr ? 
B(Bmb.Cmd.Opcode.WRITE) | B(Bmb.Cmd.Opcode.READ)) + bus.cmd.address := cmd.address.resized + bus.cmd.data := cmd.data + bus.cmd.length := cmd.byteCountMinusOne + bus.cmd.mask := cmd.mask + if (p.withExclusive) bus.cmd.exclusive := cmd.exclusive + if (!p.withWriteResponse) busCmdContext.isWrite := cmd.wr + bus.cmd.context := B(busCmdContext) + + cmd.ready := bus.cmd.ready + if(p.withInvalidate) sync.arbitrationFrom(bus.sync) + } + + val withWriteBuffer = if(p.aggregationWidth != 0) new Area { + val buffer = new Area { + val stream = cmd.toEvent().m2sPipe() + val address = Reg(UInt(p.addressWidth bits)) + val length = Reg(UInt(pipelinedMemoryBusConfig.access.lengthWidth bits)) + val write = Reg(Bool) + val exclusive = Reg(Bool) + val data = Reg(Bits(p.memDataWidth bits)) + val mask = Reg(Bits(p.memDataWidth/8 bits)) init(0) + } + + val aggregationRange = log2Up(p.memDataWidth/8)-1 downto log2Up(p.cpuDataWidth/8) + val tagRange = p.addressWidth-1 downto aggregationRange.high+1 + val aggregationEnabled = Reg(Bool) + val aggregationCounter = Reg(UInt(p.aggregationWidth bits)) init(0) + val aggregationCounterFull = aggregationCounter === aggregationCounter.maxValue + val timer = Reg(UInt(log2Up(timeoutCycles)+1 bits)) init(0) + val timerFull = timer.msb + val hit = cmd.address(tagRange) === buffer.address(tagRange) + val cmdExclusive = if(p.withExclusive) cmd.exclusive else False + val canAggregate = cmd.valid && cmd.wr && !cmd.uncached && !cmdExclusive && !timerFull && !aggregationCounterFull && (!buffer.stream.valid || aggregationEnabled && hit) + val doFlush = cmd.valid && !canAggregate || timerFull || aggregationCounterFull || !aggregationEnabled +// val canAggregate = False +// val doFlush = True + val busCmdContext = Context() + val halt = False + + when(cmd.fire){ + aggregationCounter := aggregationCounter + 1 + } + when(buffer.stream.valid && !timerFull){ + timer := timer + 1 + } + when(bus.cmd.fire || !buffer.stream.valid){ + buffer.mask := 0 + aggregationCounter := 0 + timer := 0 + } + + buffer.stream.ready := (bus.cmd.ready && doFlush || canAggregate) && !halt + bus.cmd.valid := buffer.stream.valid && doFlush && !halt + bus.cmd.last := True + bus.cmd.opcode := (buffer.write ? 
B(Bmb.Cmd.Opcode.WRITE) | B(Bmb.Cmd.Opcode.READ)) + bus.cmd.address := buffer.address + bus.cmd.length := buffer.length + bus.cmd.data := buffer.data + bus.cmd.mask := buffer.mask + + if (p.withExclusive) bus.cmd.exclusive := buffer.exclusive + bus.cmd.context.removeAssignments() := B(busCmdContext) + if (!p.withWriteResponse) busCmdContext.isWrite := bus.cmd.isWrite + busCmdContext.rspCount := aggregationCounter + + val aggregationSel = cmd.address(aggregationRange) + when(cmd.fire){ + val dIn = cmd.data.subdivideIn(8 bits) + val dReg = buffer.data.subdivideIn(8 bits) + for(byteId <- 0 until p.memDataBytes){ + when(aggregationSel === byteId / p.cpuDataBytes && cmd.mask(byteId % p.cpuDataBytes)){ + dReg.write(byteId, dIn(byteId % p.cpuDataBytes)) + buffer.mask(byteId) := True + } + } + } + + when(cmd.fire){ + buffer.write := cmd.wr + buffer.address := cmd.address.resized + buffer.length := cmd.byteCountMinusOne + if (p.withExclusive) buffer.exclusive := cmd.exclusive + + when(cmd.wr && !cmd.uncached && !cmdExclusive){ + aggregationEnabled := True + buffer.address(aggregationRange.high downto 0) := 0 + buffer.length := p.memDataBytes-1 + } otherwise { + aggregationEnabled := False + } + } + + + val rspCtx = bus.rsp.context.as(Context()) + rsp.aggregated := rspCtx.rspCount + + val syncLogic = p.withInvalidate generate new Area{ + val cmdCtx = Stream(UInt(p.aggregationWidth bits)) + cmdCtx.valid := bus.cmd.fire && bus.cmd.isWrite + cmdCtx.payload := aggregationCounter + halt setWhen(!cmdCtx.ready) + + val syncCtx = cmdCtx.queue(syncPendingMax).s2mPipe().m2sPipe() //Assume latency of sync is at least 3 cycles + syncCtx.ready := bus.sync.fire + + sync.arbitrationFrom(bus.sync) + sync.aggregated := syncCtx.payload + } + } + + + rsp.valid := bus.rsp.valid + if(!p.withWriteResponse) rsp.valid clearWhen(bus.rsp.context(0)) + rsp.data := bus.rsp.data + rsp.error := bus.rsp.isError + rsp.last := bus.rsp.last + if(p.withExclusive) rsp.exclusive := bus.rsp.exclusive + bus.rsp.ready := True + + val invalidateLogic = p.withInvalidate generate new Area{ + val beatCountMinusOne = bus.inv.transferBeatCountMinusOne(p.bytePerLine) + val counter = Reg(UInt(widthOf(beatCountMinusOne) bits)) init(0) + + inv.valid := bus.inv.valid + inv.address := bus.inv.address + (counter << log2Up(p.bytePerLine)) + inv.enable := bus.inv.all + inv.last := counter === beatCountMinusOne + bus.inv.ready := inv.last && inv.ready + + if(widthOf(counter) != 0) when(inv.fire){ + counter := counter + 1 + when(inv.last){ + counter := 0 + } + } + + bus.ack.arbitrationFrom(ack.throwWhen(!ack.last)) + } + }.bus + +} + +object DataCacheExternalAmoStates extends SpinalEnum{ + val LR_CMD, LR_RSP, SC_CMD, SC_RSP = newElement(); +} + +//If external amo, mem rsp should stay +class DataCache(val p : DataCacheConfig, mmuParameter : MemoryTranslatorBusParameter) extends Component{ + import p._ + + val io = new Bundle{ + val cpu = slave(DataCacheCpuBus(p, mmuParameter)) + val mem = master(DataCacheMemBus(p)) + } + + val haltCpu = False + val lineWidth = bytePerLine*8 + val lineCount = cacheSize/bytePerLine + val wordWidth = cpuDataWidth + val wordWidthLog2 = log2Up(wordWidth) + val wordPerLine = lineWidth/wordWidth + val bytePerWord = wordWidth/8 + val wayLineCount = lineCount/wayCount + val wayLineLog2 = log2Up(wayLineCount) + val wayWordCount = wayLineCount * wordPerLine + val memWordPerLine = lineWidth/memDataWidth + val memTransactionPerLine = p.bytePerLine / (p.memDataWidth/8) + val bytePerMemWord = memDataWidth/8 + val wayMemWordCount = 
wayLineCount * memWordPerLine + + val tagRange = addressWidth-1 downto log2Up(wayLineCount*bytePerLine) + val lineRange = tagRange.low-1 downto log2Up(bytePerLine) + val cpuWordRange = log2Up(bytePerLine)-1 downto log2Up(bytePerWord) + val memWordRange = log2Up(bytePerLine)-1 downto log2Up(bytePerMemWord) + val hitRange = tagRange.high downto lineRange.low + val memWordToCpuWordRange = log2Up(bytePerMemWord)-1 downto log2Up(bytePerWord) + val cpuWordToRfWordRange = log2Up(bytePerWord)-1 downto log2Up(p.rfDataBytes) + + + class LineInfo() extends Bundle{ + val valid, error = Bool() + val address = UInt(tagRange.length bit) + } + + val tagsReadCmd = Flow(UInt(log2Up(wayLineCount) bits)) + val tagsInvReadCmd = withInvalidate generate Flow(UInt(log2Up(wayLineCount) bits)) + val tagsWriteCmd = Flow(new Bundle{ + val way = Bits(wayCount bits) + val address = UInt(log2Up(wayLineCount) bits) + val data = new LineInfo() + }) + + val tagsWriteLastCmd = RegNext(tagsWriteCmd) + + val dataReadCmd = Flow(UInt(log2Up(wayMemWordCount) bits)) + val dataWriteCmd = Flow(new Bundle{ + val way = Bits(wayCount bits) + val address = UInt(log2Up(wayMemWordCount) bits) + val data = Bits(memDataWidth bits) + val mask = Bits(memDataWidth/8 bits) + }) + + + val ways = for(i <- 0 until wayCount) yield new Area{ + val tags = Mem(new LineInfo(), wayLineCount) + val data = Mem(Bits(memDataWidth bit), wayMemWordCount) + + //Reads + val tagsReadRsp = asyncTagMemory match { + case false => tags.readSync(tagsReadCmd.payload, tagsReadCmd.valid && !io.cpu.memory.isStuck) + case true => tags.readAsync(RegNextWhen(tagsReadCmd.payload, io.cpu.execute.isValid && !io.cpu.memory.isStuck)) + } + val dataReadRspMem = data.readSync(dataReadCmd.payload, dataReadCmd.valid && !io.cpu.memory.isStuck) + val dataReadRspSel = if(mergeExecuteMemory) io.cpu.writeBack.address else io.cpu.memory.address + val dataReadRsp = dataReadRspMem.subdivideIn(cpuDataWidth bits).read(dataReadRspSel(memWordToCpuWordRange)) + + val tagsInvReadRsp = withInvalidate generate(asyncTagMemory match { + case false => tags.readSync(tagsInvReadCmd.payload, tagsInvReadCmd.valid) + case true => tags.readAsync(RegNextWhen(tagsInvReadCmd.payload, tagsInvReadCmd.valid)) + }) + + //Writes + when(tagsWriteCmd.valid && tagsWriteCmd.way(i)){ + tags.write(tagsWriteCmd.address, tagsWriteCmd.data) + } + when(dataWriteCmd.valid && dataWriteCmd.way(i)){ + data.write( + address = dataWriteCmd.address, + data = dataWriteCmd.data, + mask = dataWriteCmd.mask + ) + } + } + + + tagsReadCmd.valid := False + tagsReadCmd.payload.assignDontCare() + dataReadCmd.valid := False + dataReadCmd.payload.assignDontCare() + tagsWriteCmd.valid := False + tagsWriteCmd.payload.assignDontCare() + dataWriteCmd.valid := False + dataWriteCmd.payload.assignDontCare() + + when(io.cpu.execute.isValid && !io.cpu.memory.isStuck){ + tagsReadCmd.valid := True + dataReadCmd.valid := True + tagsReadCmd.payload := io.cpu.execute.address(lineRange) + dataReadCmd.payload := io.cpu.execute.address(lineRange.high downto memWordRange.low) + } + + def collisionProcess(readAddress : UInt, readMask : Bits): Bits ={ + val ret = Bits(wayCount bits) + val readAddressAligned = (readAddress >> log2Up(memDataWidth/cpuDataWidth)) + val dataWriteMaskAligned = dataWriteCmd.mask.subdivideIn(memDataWidth/cpuDataWidth slices).read(readAddress(log2Up(memDataWidth/cpuDataWidth)-1 downto 0)) + for(i <- 0 until wayCount){ + ret(i) := dataWriteCmd.valid && dataWriteCmd.way(i) && dataWriteCmd.address === readAddressAligned && (readMask & 
dataWriteMaskAligned) =/= 0 + } + ret + } + + + io.cpu.execute.haltIt := False + + val rspSync = True + val rspLast = True + val memCmdSent = RegInit(False) setWhen (io.mem.cmd.fire) clearWhen (!io.cpu.writeBack.isStuck) + val pending = withExclusive generate new Area{ + val counter = Reg(UInt(log2Up(pendingMax) + 1 bits)) init(0) + val counterNext = counter + U(io.mem.cmd.fire && io.mem.cmd.last) - ((io.mem.rsp.valid && io.mem.rsp.last) ? (io.mem.rsp.aggregated +^ 1) | 0) + counter := counterNext + + val done = RegNext(counterNext === 0) + val full = RegNext(counter.msb) //Has margin + val last = RegNext(counterNext === 1) //Equivalent to counter === 1 but pipelined + + if(!withInvalidate) { + io.cpu.execute.haltIt setWhen(full) + } + + rspSync clearWhen (!last || !memCmdSent) + rspLast clearWhen (!last) + } + + val sync = withInvalidate generate new Area{ + io.mem.sync.ready := True + val syncCount = io.mem.sync.aggregated +^ 1 + val syncContext = new Area{ + val history = Mem(Bool, pendingMax) + val wPtr, rPtr = Reg(UInt(log2Up(pendingMax)+1 bits)) init(0) + when(io.mem.cmd.fire && io.mem.cmd.wr){ + history.write(wPtr.resized, io.mem.cmd.uncached) + wPtr := wPtr + 1 + } + + when(io.mem.sync.fire){ + rPtr := rPtr + syncCount + } + val uncached = history.readAsync(rPtr.resized) + val full = RegNext(wPtr - rPtr >= pendingMax-1) + io.cpu.execute.haltIt setWhen(full) + } + + def pending(inc : Bool, dec : Bool) = new Area { + val pendingSync = Reg(UInt(log2Up(pendingMax) + 1 bits)) init(0) + val pendingSyncNext = pendingSync + U(io.mem.cmd.fire && io.mem.cmd.wr && inc) - ((io.mem.sync.fire && dec) ? syncCount | 0) + pendingSync := pendingSyncNext + } + + val writeCached = pending(inc = !io.mem.cmd.uncached, dec = !syncContext.uncached) + val writeUncached = pending(inc = io.mem.cmd.uncached, dec = syncContext.uncached) + + def track(load : Bool, uncached : Boolean) = new Area { + val counter = Reg(UInt(log2Up(pendingMax) + 1 bits)) init(0) + counter := counter - ((io.mem.sync.fire && counter =/= 0 && (if(uncached) syncContext.uncached else !syncContext.uncached)) ? 
syncCount | 0) + when(load){ counter := (if(uncached) writeUncached.pendingSyncNext else writeCached.pendingSyncNext) } + + val busy = counter =/= 0 + } + + val w2w = track(load = io.cpu.writeBack.fence.PW && io.cpu.writeBack.fence.SW, uncached = false) + val w2r = track(load = io.cpu.writeBack.fence.PW && io.cpu.writeBack.fence.SR, uncached = false) + val w2i = track(load = io.cpu.writeBack.fence.PW && io.cpu.writeBack.fence.SI, uncached = false) + val w2o = track(load = io.cpu.writeBack.fence.PW && io.cpu.writeBack.fence.SO, uncached = false) + val o2w = track(load = io.cpu.writeBack.fence.PO && io.cpu.writeBack.fence.SW, uncached = true) + val o2r = track(load = io.cpu.writeBack.fence.PO && io.cpu.writeBack.fence.SR, uncached = true) + //Assume o2i and o2o are ordered by the interconnect + + val notTotalyConsistent = w2w.busy || w2r.busy || w2i.busy || w2o.busy || o2w.busy || o2r.busy + } + + + + + val stage0 = new Area{ +// val mask = io.cpu.execute.size.mux ( +// U(0) -> B"0001", +// U(1) -> B"0011", +// default -> B"1111" +// ) |<< io.cpu.execute.address(1 downto 0) + + val mask = io.cpu.execute.size.muxListDc((0 to log2Up(p.cpuDataBytes)).map(i => U(i) -> B((1 << (1 << i)) -1, p.cpuDataBytes bits))) |<< io.cpu.execute.address(log2Up(p.cpuDataBytes)-1 downto 0) + + + val dataColisions = collisionProcess(io.cpu.execute.address(lineRange.high downto cpuWordRange.low), mask) + val wayInvalidate = B(0, wayCount bits) //Used if invalidate enabled + + val isAmo = if(withAmo) io.cpu.execute.isAmo else False + } + + val stageA = new Area{ + def stagePipe[T <: Data](that : T) = if(mergeExecuteMemory) CombInit(that) else RegNextWhen(that, !io.cpu.memory.isStuck) + val request = stagePipe(io.cpu.execute.args) + val mask = stagePipe(stage0.mask) + io.cpu.memory.isWrite := request.wr + + val isAmo = if(withAmo) request.isAmo else False + val isLrsc = if(withAmo) request.isLrsc else False + val consistancyCheck = (withInvalidate || withWriteResponse) generate new Area { + val hazard = False + val w = sync.w2w.busy || sync.o2w.busy + val r = stagePipe(sync.w2r.busy || sync.o2r.busy) || sync.w2r.busy || sync.o2r.busy // As it use the cache, need to check against the execute stage status too + val o = CombInit(sync.w2o.busy) + val i = CombInit(sync.w2i.busy) + + val s = io.cpu.memory.mmuRsp.isIoAccess ? o | w + val l = io.cpu.memory.mmuRsp.isIoAccess ? i | r + + when(isAmo? (s || l) | (request.wr ? s | l)){ + hazard := True + } + when(request.totalyConsistent && (sync.notTotalyConsistent || io.cpu.writeBack.isValid && io.cpu.writeBack.isWrite)){ + hazard := True + } + } + + val wayHits = earlyWaysHits generate Bits(wayCount bits) + val indirectTlbHitGen = (earlyWaysHits && !directTlbHit) generate new Area { + wayHits := B(ways.map(way => (io.cpu.memory.mmuRsp.physicalAddress(tagRange) === way.tagsReadRsp.address && way.tagsReadRsp.valid))) + } + val directTlbHitGen = (earlyWaysHits && directTlbHit) generate new Area { + val wayTlbHits = for (way <- ways) yield for (tlb <- io.cpu.memory.mmuRsp.ways) yield { + way.tagsReadRsp.address === tlb.physical(tagRange) && tlb.sel + } + val translatedHits = B(wayTlbHits.map(_.orR)) + val bypassHits = B(ways.map(_.tagsReadRsp.address === io.cpu.memory.address(tagRange))) + wayHits := (io.cpu.memory.mmuRsp.bypassTranslation ? bypassHits | translatedHits) & B(ways.map(_.tagsReadRsp.valid)) + } + + val dataMux = earlyDataMux generate MuxOH(wayHits, ways.map(_.dataReadRsp)) + val wayInvalidate = stagePipe(stage0. 
wayInvalidate) + val dataColisions = if(mergeExecuteMemory){ + stagePipe(stage0.dataColisions) + } else { + //Assume the writeback stage will never be unstall memory acces while memory stage is stalled + stagePipe(stage0.dataColisions) | collisionProcess(io.cpu.memory.address(lineRange.high downto cpuWordRange.low), mask) + } + } + + val stageB = new Area { + def stagePipe[T <: Data](that : T) = RegNextWhen(that, !io.cpu.writeBack.isStuck) + def ramPipe[T <: Data](that : T) = if(mergeExecuteMemory) CombInit(that) else RegNextWhen(that, !io.cpu.writeBack.isStuck) + val request = RegNextWhen(stageA.request, !io.cpu.writeBack.isStuck) + val mmuRspFreeze = False + val mmuRsp = RegNextWhen(io.cpu.memory.mmuRsp, !io.cpu.writeBack.isStuck && !mmuRspFreeze) + val tagsReadRsp = ways.map(w => ramPipe(w.tagsReadRsp)) + val dataReadRsp = !earlyDataMux generate ways.map(w => ramPipe(w.dataReadRsp)) + val wayInvalidate = stagePipe(stageA. wayInvalidate) + val consistancyHazard = if(stageA.consistancyCheck != null) stagePipe(stageA.consistancyCheck.hazard) else False + val dataColisions = stagePipe(stageA.dataColisions) +// val unaligned = if(!catchUnaligned) False else stagePipe((stageA.request.size === 2 && io.cpu.memory.address(1 downto 0) =/= 0) || (stageA.request.size === 1 && io.cpu.memory.address(0 downto 0) =/= 0)) + val unaligned = if(!catchUnaligned) False else stagePipe((1 to log2Up(p.cpuDataBytes)).map(i => stageA.request.size === i && io.cpu.memory.address(i-1 downto 0) =/= 0).orR) + val waysHitsBeforeInvalidate = if(earlyWaysHits) stagePipe(B(stageA.wayHits)) else B(tagsReadRsp.map(tag => mmuRsp.physicalAddress(tagRange) === tag.address && tag.valid).asBits()) + val waysHits = waysHitsBeforeInvalidate & ~wayInvalidate + val waysHit = waysHits.orR + val dataMux = if(earlyDataMux) stagePipe(stageA.dataMux) else MuxOH(waysHits, dataReadRsp) + val mask = stagePipe(stageA.mask) + + //Loader interface + val loaderValid = False + + val ioMemRspMuxed = io.mem.rsp.data.subdivideIn(cpuDataWidth bits).read(io.cpu.writeBack.address(memWordToCpuWordRange)) + + io.cpu.writeBack.haltIt := True + + //Evict the cache after reset logics + val flusher = new Area { + val waitDone = RegInit(False) clearWhen(io.cpu.flush.ready) + val hold = False + val counter = Reg(UInt(lineRange.size + 1 bits)) init(0) + when(!counter.msb) { + tagsWriteCmd.valid := True + tagsWriteCmd.address := counter.resized + tagsWriteCmd.way.setAll() + tagsWriteCmd.data.valid := False + io.cpu.execute.haltIt := True + when(!hold) { + counter := counter + 1 + when(io.cpu.flush.singleLine){ + counter.msb := True + } + } + } + + io.cpu.flush.ready := waitDone && counter.msb + + val start = RegInit(True) //Used to relax timings + start := !waitDone && !start && io.cpu.flush.valid && !io.cpu.execute.isValid && !io.cpu.memory.isValid && !io.cpu.writeBack.isValid && !io.cpu.redo + + when(start){ + waitDone := True + counter := 0 + when(io.cpu.flush.singleLine){ + counter := U"0" @@ io.cpu.flush.lineId + } + } + } + + val lrSc = withInternalLrSc generate new Area{ + val reserved = RegInit(False) + when(io.cpu.writeBack.isValid && io.cpu.writeBack.isFiring){ + reserved setWhen(request.isLrsc) + reserved clearWhen(request.wr) + } + } + + val isAmo = if(withAmo) request.isAmo else False + val isAmoCached = if(withInternalAmo) isAmo else False + val isExternalLsrc = if(withExternalLrSc) request.isLrsc else False + val isExternalAmo = if(withExternalAmo) request.isAmo else False + + val requestDataBypass = CombInit(io.cpu.writeBack.storeData) + import 
DataCacheExternalAmoStates._ + val amo = withAmo generate new Area{ + def rf = io.cpu.writeBack.storeData(p.rfDataWidth-1 downto 0) + def memLarger = if(withInternalAmo) dataMux else ioMemRspMuxed + def mem = memLarger.subdivideIn(rfDataWidth bits).read(io.cpu.writeBack.address(cpuWordToRfWordRange)) + val compare = request.amoCtrl.alu.msb + val unsigned = request.amoCtrl.alu(2 downto 1) === B"11" + val addSub = (rf.asSInt + Mux(compare, ~mem, mem).asSInt + Mux(compare, S(1), S(0))).asBits + val less = Mux(rf.msb === mem.msb, addSub.msb, Mux(unsigned, mem.msb, rf.msb)) + val selectRf = request.amoCtrl.swap ? True | (request.amoCtrl.alu.lsb ^ less) + + val result = (request.amoCtrl.alu | (request.amoCtrl.swap ## B"00")).mux( + B"000" -> addSub, + B"001" -> (rf ^ mem), + B"010" -> (rf | mem), + B"011" -> (rf & mem), + default -> (selectRf ? rf | mem) + ) + // val resultRegValid = RegNext(True) clearWhen(!io.cpu.writeBack.isStuck) + // val resultReg = RegNext(result) + val resultReg = Reg(Bits(32 bits)) + + val internal = withInternalAmo generate new Area{ + val resultRegValid = RegNext(io.cpu.writeBack.isStuck) + resultReg := result + } + val external = !withInternalAmo generate new Area{ + val state = RegInit(LR_CMD) + } + } + + + val cpuWriteToCache = False + when(cpuWriteToCache){ + dataWriteCmd.valid setWhen(request.wr && waysHit) + dataWriteCmd.address := mmuRsp.physicalAddress(lineRange.high downto memWordRange.low) + dataWriteCmd.data.subdivideIn(cpuDataWidth bits).foreach(_ := requestDataBypass) + dataWriteCmd.mask := 0 + dataWriteCmd.mask.subdivideIn(cpuDataWidth/8 bits).write(io.cpu.writeBack.address(memWordToCpuWordRange), mask) + dataWriteCmd.way := waysHits + } + + val badPermissions = (!mmuRsp.allowWrite && request.wr) || (!mmuRsp.allowRead && (!request.wr || isAmo)) + val loadStoreFault = io.cpu.writeBack.isValid && (mmuRsp.exception || badPermissions) + + io.cpu.redo := False + io.cpu.writeBack.accessError := False + io.cpu.writeBack.mmuException := loadStoreFault && (if(catchIllegal) mmuRsp.isPaging else False) + io.cpu.writeBack.unalignedAccess := io.cpu.writeBack.isValid && unaligned + io.cpu.writeBack.isWrite := request.wr + + + io.mem.cmd.valid := False + io.mem.cmd.address := mmuRsp.physicalAddress + io.mem.cmd.last := True + io.mem.cmd.wr := request.wr + io.mem.cmd.mask := mask + io.mem.cmd.data := requestDataBypass + io.mem.cmd.uncached := mmuRsp.isIoAccess + io.mem.cmd.size := request.size.resized + if(withExternalLrSc) io.mem.cmd.exclusive := request.isLrsc || isAmo + + + val bypassCache = mmuRsp.isIoAccess || isExternalLsrc || isExternalAmo + + io.cpu.writeBack.keepMemRspData := False + when(io.cpu.writeBack.isValid) { + when(isExternalAmo){ + if(withExternalAmo) switch(amo.external.state){ + is(LR_CMD){ + io.mem.cmd.valid := True + io.mem.cmd.wr := False + when(io.mem.cmd.ready) { + amo.external.state := LR_RSP + } + } + is(LR_RSP){ + when(io.mem.rsp.valid && pending.last) { + amo.external.state := SC_CMD + amo.resultReg := amo.result + } + } + is(SC_CMD){ + io.mem.cmd.valid := True + when(io.mem.cmd.ready) { + amo.external.state := SC_RSP + } + } + is(SC_RSP){ + io.cpu.writeBack.keepMemRspData := True + when(io.mem.rsp.valid) { + amo.external.state := LR_CMD + when(io.mem.rsp.exclusive){ //Success + cpuWriteToCache := True + io.cpu.writeBack.haltIt := False + } + } + } + } + } elsewhen(mmuRsp.isIoAccess || isExternalLsrc) { + val waitResponse = !request.wr + if(withExternalLrSc) waitResponse setWhen(request.isLrsc) + + 
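          //Uncached/IO path: bypass the cache arrays and issue a single memory command.
          //A write releases the CPU once io.mem.cmd is accepted; a read (or an
          //externally handled LR/SC) holds writeBack until the memory response arrives: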
io.cpu.writeBack.haltIt.clearWhen(waitResponse ? (io.mem.rsp.valid && rspSync) | io.mem.cmd.ready) + + io.mem.cmd.valid := !memCmdSent + + if(withInternalLrSc) when(request.isLrsc && !lrSc.reserved){ + io.mem.cmd.valid := False + io.cpu.writeBack.haltIt := False + } + } otherwise { + when(waysHit || request.wr && !isAmoCached) { //Do not require a cache refill ? + cpuWriteToCache := True + + //Write through + io.mem.cmd.valid setWhen(request.wr) + io.cpu.writeBack.haltIt clearWhen(!request.wr || io.mem.cmd.ready) + + if(withInternalAmo) when(isAmo){ + when(!amo.internal.resultRegValid) { + io.mem.cmd.valid := False + dataWriteCmd.valid := False + io.cpu.writeBack.haltIt := True + } + } + + //On write to read dataColisions + when((!request.wr || isAmoCached) && (dataColisions & waysHits) =/= 0){ + io.cpu.redo := True + if(withAmo) io.mem.cmd.valid := False + } + + if(withInternalLrSc) when(request.isLrsc && !lrSc.reserved){ + io.mem.cmd.valid := False + dataWriteCmd.valid := False + io.cpu.writeBack.haltIt := False + } + } otherwise { //Do refill + //Emit cmd + io.mem.cmd.valid setWhen(!memCmdSent) + io.mem.cmd.wr := False + io.mem.cmd.address(0, lineRange.low bits) := 0 + io.mem.cmd.size := log2Up(p.bytePerLine) + + loaderValid setWhen(io.mem.cmd.ready) + } + } + } + + when(bypassCache){ + io.cpu.writeBack.data := ioMemRspMuxed + def isLast = if(pending != null) pending.last else True + if(catchAccessError) io.cpu.writeBack.accessError := !request.wr && isLast && io.mem.rsp.valid && io.mem.rsp.error + } otherwise { + io.cpu.writeBack.data := dataMux + if(catchAccessError) io.cpu.writeBack.accessError := (waysHits & B(tagsReadRsp.map(_.error))) =/= 0 || (loadStoreFault && !mmuRsp.isPaging) + } + + if(withLrSc) { + val success = if(withInternalLrSc)lrSc.reserved else io.mem.rsp.exclusive + io.cpu.writeBack.exclusiveOk := success + when(request.isLrsc && request.wr){ + // io.cpu.writeBack.data := B(!success).resized + if(withExternalLrSc) when(io.cpu.writeBack.isValid && io.mem.rsp.valid && rspSync && success && waysHit){ + cpuWriteToCache := True + } + } + } + if(withAmo) when(request.isAmo){ + requestDataBypass.subdivideIn(p.rfDataWidth bits).foreach(_ := amo.resultReg) + } + + //remove side effects on exceptions + when(consistancyHazard || mmuRsp.refilling || io.cpu.writeBack.accessError || io.cpu.writeBack.mmuException || io.cpu.writeBack.unalignedAccess){ + io.mem.cmd.valid := False + tagsWriteCmd.valid := False + dataWriteCmd.valid := False + loaderValid := False + io.cpu.writeBack.haltIt := False + if(withInternalLrSc) lrSc.reserved := lrSc.reserved + if(withExternalAmo) amo.external.state := LR_CMD + } + io.cpu.redo setWhen(io.cpu.writeBack.isValid && (mmuRsp.refilling || consistancyHazard)) + + assert(!(io.cpu.writeBack.isValid && !io.cpu.writeBack.haltIt && io.cpu.writeBack.isStuck), "writeBack stuck by another plugin is not allowed", ERROR) + } + + val loader = new Area{ + val valid = RegInit(False) setWhen(stageB.loaderValid) + val baseAddress = stageB.mmuRsp.physicalAddress + + val counter = Counter(memTransactionPerLine) + val waysAllocator = Reg(Bits(wayCount bits)) init(1) + val error = RegInit(False) + val kill = False + val killReg = RegInit(False) setWhen(kill) + + when(valid && io.mem.rsp.valid && rspLast){ + dataWriteCmd.valid := True + dataWriteCmd.address := baseAddress(lineRange) @@ counter + dataWriteCmd.data := io.mem.rsp.data + dataWriteCmd.mask.setAll() + dataWriteCmd.way := waysAllocator + error := error | io.mem.rsp.error + counter.increment() + } + + val done = 
CombInit(counter.willOverflow) + if(withInvalidate) done setWhen(valid && pending.counter === 0) //Used to solve invalidate write request at the same time + + when(done){ + valid := False + + //Update tags + tagsWriteCmd.valid := True + tagsWriteCmd.address := baseAddress(lineRange) + tagsWriteCmd.data.valid := !(kill || killReg) + tagsWriteCmd.data.address := baseAddress(tagRange) + tagsWriteCmd.data.error := error || (io.mem.rsp.valid && io.mem.rsp.error) + tagsWriteCmd.way := waysAllocator + + error := False + killReg := False + } + + when(!valid){ + waysAllocator := (waysAllocator ## waysAllocator.msb).resized + } + + io.cpu.redo setWhen(valid.rise()) + io.cpu.execute.refilling := valid + + stageB.mmuRspFreeze setWhen(stageB.loaderValid || valid) + } + + val invalidate = withInvalidate generate new Area{ + val s0 = new Area{ + val input = io.mem.inv + tagsInvReadCmd.valid := input.fire + tagsInvReadCmd.payload := input.address(lineRange) + + val loaderTagHit = input.address(tagRange) === loader.baseAddress(tagRange) + val loaderLineHit = input.address(lineRange) === loader.baseAddress(lineRange) + when(input.valid && input.enable && loader.valid && loaderLineHit && loaderTagHit){ + loader.kill := True + } + } + val s1 = new Area{ + val input = s0.input.stage() + val loaderValid = RegNextWhen(loader.valid, s0.input.ready) + val loaderWay = RegNextWhen(loader.waysAllocator, s0.input.ready) + val loaderTagHit = RegNextWhen(s0.loaderTagHit, s0.input.ready) + val loaderLineHit = RegNextWhen(s0.loaderLineHit, s0.input.ready) + val invalidations = Bits(wayCount bits) + + var wayHits = B(ways.map(way => (input.address(tagRange) === way.tagsInvReadRsp.address && way.tagsInvReadRsp.valid))) & ~invalidations + + //Handle invalider read during loader write hazard + when(loaderValid && loaderLineHit && !loaderTagHit){ + wayHits \= wayHits & ~loaderWay + } + } + val s2 = new Area{ + val input = s1.input.stage() + val wayHits = RegNextWhen(s1.wayHits, s1.input.ready) + val wayHit = wayHits.orR + + when(input.valid && input.enable) { + //Manage invalidate write during cpu read hazard + when(input.address(lineRange) === io.cpu.execute.address(lineRange)) { + stage0.wayInvalidate := wayHits + } + + //Invalidate cache tag + when(wayHit) { + tagsWriteCmd.valid := True + stageB.flusher.hold := True + tagsWriteCmd.address := input.address(lineRange) + tagsWriteCmd.data.valid := False + tagsWriteCmd.way := wayHits + loader.done := False //Hold loader tags write + } + } + io.mem.ack.arbitrationFrom(input) + io.mem.ack.hit := wayHit + io.mem.ack.last := input.last + + //Manage invalidation read during write hazard + s1.invalidations := RegNextWhen((input.valid && input.enable && input.address(lineRange) === s0.input.address(lineRange)) ? 
wayHits | 0, s0.input.ready) + } + } +} diff --git a/VexRiscv/src/main/scala/vexriscv/ip/InstructionCache.scala b/VexRiscv/src/main/scala/vexriscv/ip/InstructionCache.scala new file mode 100644 index 0000000..e09712c --- /dev/null +++ b/VexRiscv/src/main/scala/vexriscv/ip/InstructionCache.scala @@ -0,0 +1,487 @@ +package vexriscv.ip + +import vexriscv._ +import spinal.core._ +import spinal.lib._ +import spinal.lib.bus.amba4.axi.{Axi4Config, Axi4ReadOnly} +import spinal.lib.bus.avalon.{AvalonMM, AvalonMMConfig} +import spinal.lib.bus.bmb.{Bmb, BmbAccessParameter, BmbParameter, BmbSourceParameter} +import spinal.lib.bus.wishbone.{Wishbone, WishboneConfig} +import spinal.lib.bus.simple._ +import vexriscv.plugin.{IBusSimpleBus, IBusSimplePlugin} + + +case class InstructionCacheConfig( cacheSize : Int, + bytePerLine : Int, + wayCount : Int, + addressWidth : Int, + cpuDataWidth : Int, + memDataWidth : Int, + catchIllegalAccess : Boolean, + catchAccessFault : Boolean, + asyncTagMemory : Boolean, + twoCycleCache : Boolean = true, + twoCycleRam : Boolean = false, + twoCycleRamInnerMux : Boolean = false, + preResetFlush : Boolean = false, + bypassGen : Boolean = false, + reducedBankWidth : Boolean = false){ + + assert(!(twoCycleRam && !twoCycleCache)) + + def burstSize = bytePerLine*8/memDataWidth + def catchSomething = catchAccessFault || catchIllegalAccess + + def getAxi4Config() = Axi4Config( + addressWidth = addressWidth, + dataWidth = memDataWidth, + useId = false, + useRegion = false, + useLock = false, + useQos = false, + useSize = false + ) + + def getAvalonConfig() = AvalonMMConfig.bursted( + addressWidth = addressWidth, + dataWidth = memDataWidth, + burstCountWidth = log2Up(burstSize + 1)).getReadOnlyConfig.copy( + useResponse = true, + constantBurstBehavior = true + ) + + def getPipelinedMemoryBusConfig() = PipelinedMemoryBusConfig( + addressWidth = 32, + dataWidth = 32 + ) + + def getWishboneConfig() = WishboneConfig( + addressWidth = 32-log2Up(memDataWidth/8), + dataWidth = memDataWidth, + selWidth = memDataWidth/8, + useSTALL = false, + useLOCK = false, + useERR = true, + useRTY = false, + tgaWidth = 0, + tgcWidth = 0, + tgdWidth = 0, + useBTE = true, + useCTI = true + ) + + def getBmbParameter() = BmbParameter( + BmbAccessParameter( + addressWidth = 32, + dataWidth = memDataWidth + ).addSources(1, BmbSourceParameter( + lengthWidth = log2Up(this.bytePerLine), + contextWidth = 0, + canWrite = false, + alignment = BmbParameter.BurstAlignement.LENGTH, + maximumPendingTransaction = 1 + )) + ) +} + + + +case class InstructionCacheCpuPrefetch(p : InstructionCacheConfig) extends Bundle with IMasterSlave{ + val isValid = Bool + val haltIt = Bool + val pc = UInt(p.addressWidth bit) + + override def asMaster(): Unit = { + out(isValid, pc) + in(haltIt) + } +} + +trait InstructionCacheCommons{ + val isValid : Bool + val isStuck : Bool + val pc : UInt + val physicalAddress : UInt + val data : Bits + val cacheMiss, error, mmuRefilling, mmuException, isUser : Bool +} + +case class InstructionCacheCpuFetch(p : InstructionCacheConfig, mmuParameter : MemoryTranslatorBusParameter) extends Bundle with IMasterSlave with InstructionCacheCommons { + val isValid = Bool() + val isStuck = Bool() + val isRemoved = Bool() + val pc = UInt(p.addressWidth bits) + val data = Bits(p.cpuDataWidth bits) + val dataBypassValid = p.bypassGen generate Bool() + val dataBypass = p.bypassGen generate Bits(p.cpuDataWidth bits) + val mmuRsp = MemoryTranslatorRsp(mmuParameter) + val physicalAddress = UInt(p.addressWidth bits) + 
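  //With a single-cycle cache (twoCycleCache == false) these status flags are resolved
  //here in the fetch stage; otherwise the decode-stage bundle below carries them.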
val cacheMiss, error, mmuRefilling, mmuException, isUser = ifGen(!p.twoCycleCache)(Bool) + + override def asMaster(): Unit = { + out(isValid, isStuck, isRemoved, pc) + inWithNull(error,mmuRefilling,mmuException,data, cacheMiss,physicalAddress) + outWithNull(isUser, dataBypass, dataBypassValid) + out(mmuRsp) + } +} + + +case class InstructionCacheCpuDecode(p : InstructionCacheConfig) extends Bundle with IMasterSlave with InstructionCacheCommons { + val isValid = Bool + val isStuck = Bool + val pc = UInt(p.addressWidth bits) + val physicalAddress = UInt(p.addressWidth bits) + val data = Bits(p.cpuDataWidth bits) + val cacheMiss, error, mmuRefilling, mmuException, isUser = ifGen(p.twoCycleCache)(Bool) + + override def asMaster(): Unit = { + out(isValid, isStuck, pc) + outWithNull(isUser) + inWithNull(error, mmuRefilling, mmuException,data, cacheMiss, physicalAddress) + } +} + +case class InstructionCacheCpuBus(p : InstructionCacheConfig, mmuParameter : MemoryTranslatorBusParameter) extends Bundle with IMasterSlave{ + val prefetch = InstructionCacheCpuPrefetch(p) + val fetch = InstructionCacheCpuFetch(p, mmuParameter) + val decode = InstructionCacheCpuDecode(p) + val fill = Flow(UInt(p.addressWidth bits)) + + override def asMaster(): Unit = { + master(prefetch, fetch, decode, fill) + } +} + +case class InstructionCacheMemCmd(p : InstructionCacheConfig) extends Bundle{ + val address = UInt(p.addressWidth bit) + val size = UInt(log2Up(log2Up(p.bytePerLine) + 1) bits) +} + +case class InstructionCacheMemRsp(p : InstructionCacheConfig) extends Bundle{ + val data = Bits(p.memDataWidth bit) + val error = Bool +} + +case class InstructionCacheMemBus(p : InstructionCacheConfig) extends Bundle with IMasterSlave{ + val cmd = Stream (InstructionCacheMemCmd(p)) + val rsp = Flow (InstructionCacheMemRsp(p)) + + override def asMaster(): Unit = { + master(cmd) + slave(rsp) + } + + def toAxi4ReadOnly(): Axi4ReadOnly = { + val axiConfig = p.getAxi4Config() + val mm = Axi4ReadOnly(axiConfig) + + mm.readCmd.valid := cmd.valid + mm.readCmd.len := p.burstSize-1 + mm.readCmd.addr := cmd.address + mm.readCmd.prot := "110" + mm.readCmd.cache := "1111" + mm.readCmd.setBurstINCR() + cmd.ready := mm.readCmd.ready + rsp.valid := mm.readRsp.valid + rsp.data := mm.readRsp.data + rsp.error := !mm.readRsp.isOKAY() + mm.readRsp.ready := True + mm + } + + def toAvalon(): AvalonMM = { + val avalonConfig = p.getAvalonConfig() + val mm = AvalonMM(avalonConfig) + mm.read := cmd.valid + mm.burstCount := U(p.burstSize) + mm.address := cmd.address + cmd.ready := mm.waitRequestn + rsp.valid := mm.readDataValid + rsp.data := mm.readData + rsp.error := mm.response =/= AvalonMM.Response.OKAY + mm + } + + + def toPipelinedMemoryBus(): PipelinedMemoryBus = { + val pipelinedMemoryBusConfig = p.getPipelinedMemoryBusConfig() + val bus = PipelinedMemoryBus(pipelinedMemoryBusConfig) + val counter = Counter(p.burstSize, bus.cmd.fire) + bus.cmd.valid := cmd.valid + bus.cmd.address := cmd.address(31 downto widthOf(counter.value) + 2) @@ counter @@ U"00" + bus.cmd.write := False + bus.cmd.mask.assignDontCare() + bus.cmd.data.assignDontCare() + cmd.ready := counter.willOverflow + rsp.valid := bus.rsp.valid + rsp.data := bus.rsp.payload.data + rsp.error := False + bus + } + + + def toWishbone(): Wishbone = { + val wishboneConfig = p.getWishboneConfig() + val bus = Wishbone(wishboneConfig) + val counter = Reg(UInt(log2Up(p.burstSize) bits)) init(0) + val pending = counter =/= 0 + val lastCycle = counter === counter.maxValue + + bus.ADR := (cmd.address 
>> widthOf(counter) + log2Up(p.memDataWidth/8)) @@ counter + bus.CTI := lastCycle ? B"111" | B"010" + bus.BTE := "00" + bus.SEL.setAll() + bus.WE := False + bus.DAT_MOSI.assignDontCare() + bus.CYC := False + bus.STB := False + when(cmd.valid || pending){ + bus.CYC := True + bus.STB := True + when(bus.ACK){ + counter := counter + 1 + } + } + + cmd.ready := cmd.valid && bus.ACK + rsp.valid := RegNext(bus.CYC && bus.ACK) init(False) + rsp.data := RegNext(bus.DAT_MISO) + rsp.error := False //TODO + bus + } + + def toBmb() : Bmb = { + val busParameter = p.getBmbParameter + val bus = Bmb(busParameter).setCompositeName(this,"toBmb", true) + bus.cmd.arbitrationFrom(cmd) + bus.cmd.opcode := Bmb.Cmd.Opcode.READ + bus.cmd.address := cmd.address.resized + bus.cmd.length := p.bytePerLine - 1 + bus.cmd.last := True + rsp.valid := bus.rsp.valid + rsp.data := bus.rsp.data + rsp.error := bus.rsp.isError + bus.rsp.ready := True + bus + } +} + + +case class InstructionCacheFlushBus() extends Bundle with IMasterSlave{ + val cmd = Event + val rsp = Bool + + override def asMaster(): Unit = { + master(cmd) + in(rsp) + } +} + +class InstructionCache(p : InstructionCacheConfig, mmuParameter : MemoryTranslatorBusParameter) extends Component{ + import p._ + val io = new Bundle{ + val flush = in Bool() + val cpu = slave(InstructionCacheCpuBus(p, mmuParameter)) + val mem = master(InstructionCacheMemBus(p)) + } + + val lineWidth = bytePerLine*8 + val lineCount = cacheSize/bytePerLine + val cpuWordWidth = cpuDataWidth + val memWordPerLine = lineWidth/memDataWidth + val bytePerCpuWord = cpuWordWidth/8 + val wayLineCount = lineCount/wayCount + + val tagRange = addressWidth-1 downto log2Up(wayLineCount*bytePerLine) + val lineRange = tagRange.low-1 downto log2Up(bytePerLine) + + case class LineTag() extends Bundle{ + val valid = Bool + val error = Bool + val address = UInt(tagRange.length bit) + } + + val bankCount = wayCount + val bankWidth = if(!reducedBankWidth) memDataWidth else Math.max(cpuDataWidth, memDataWidth/wayCount) + val bankByteSize = cacheSize/bankCount + val bankWordCount = bankByteSize*8/bankWidth + val bankWordToCpuWordRange = log2Up(bankWidth/8)-1 downto log2Up(bytePerCpuWord) + val memToBankRatio = bankWidth*bankCount / memDataWidth + + val banks = Seq.fill(bankCount)(Mem(Bits(bankWidth bits), bankWordCount)) + + val ways = Seq.fill(wayCount)(new Area{ + val tags = Mem(LineTag(),wayLineCount) + + if(preResetFlush){ + tags.initBigInt(List.fill(wayLineCount)(BigInt(0))) + } + }) + + + val lineLoader = new Area{ + val fire = False + val valid = RegInit(False) clearWhen(fire) + val address = KeepAttribute(Reg(UInt(addressWidth bits))) + val hadError = RegInit(False) clearWhen(fire) + val flushPending = RegInit(True) + + when(io.cpu.fill.valid){ + valid := True + address := io.cpu.fill.payload + } + + io.cpu.prefetch.haltIt := valid || flushPending + + val flushCounter = Reg(UInt(log2Up(wayLineCount) + 1 bit)) + when(!flushCounter.msb){ + io.cpu.prefetch.haltIt := True + flushCounter := flushCounter + 1 + } + when(!RegNext(flushCounter.msb)){ + io.cpu.prefetch.haltIt := True + } + + when(io.flush){ + io.cpu.prefetch.haltIt := True + flushPending := True + } + + when(flushPending && !(valid || io.cpu.fetch.isValid) ){ + flushCounter := 0 + flushPending := False + } + + + + val cmdSent = RegInit(False) setWhen(io.mem.cmd.fire) clearWhen(fire) + io.mem.cmd.valid := valid && !cmdSent + io.mem.cmd.address := address(tagRange.high downto lineRange.low) @@ U(0,lineRange.low bit) + io.mem.cmd.size := 
log2Up(p.bytePerLine) + + val wayToAllocate = Counter(wayCount, !valid) + val wordIndex = KeepAttribute(Reg(UInt(log2Up(memWordPerLine) bits)) init(0)) + + + val write = new Area{ + val tag = ways.map(_.tags.writePort) + val data = banks.map(_.writePort) + } + + for(wayId <- 0 until wayCount){ + val wayHit = wayToAllocate === wayId + val tag = write.tag(wayId) + tag.valid := ((wayHit && fire) || !flushCounter.msb) + tag.address := (flushCounter.msb ? address(lineRange) | flushCounter(flushCounter.high-1 downto 0)) + tag.data.valid := flushCounter.msb + tag.data.error := hadError || io.mem.rsp.error + tag.data.address := address(tagRange) + } + + for((writeBank, bankId) <- write.data.zipWithIndex){ + if(!reducedBankWidth) { + writeBank.valid := io.mem.rsp.valid && wayToAllocate === bankId + writeBank.address := address(lineRange) @@ wordIndex + writeBank.data := io.mem.rsp.data + } else { + val sel = U(bankId) - wayToAllocate.value + val groupSel = wayToAllocate(log2Up(bankCount)-1 downto log2Up(bankCount/memToBankRatio)) + val subSel = sel(log2Up(bankCount/memToBankRatio) -1 downto 0) + writeBank.valid := io.mem.rsp.valid && groupSel === (bankId >> log2Up(bankCount/memToBankRatio)) + writeBank.address := address(lineRange) @@ wordIndex @@ (subSel) + writeBank.data := io.mem.rsp.data.subdivideIn(bankCount/memToBankRatio slices)(subSel) + } + } + + + when(io.mem.rsp.valid) { + wordIndex := (wordIndex + 1).resized + hadError.setWhen(io.mem.rsp.error) + when(wordIndex === wordIndex.maxValue) { + fire := True + } + } + } + + val fetchStage = new Area{ + val read = new Area{ + val banksValue = for(bank <- banks) yield new Area{ + val dataMem = bank.readSync(io.cpu.prefetch.pc(lineRange.high downto log2Up(bankWidth/8)), !io.cpu.fetch.isStuck) + val data = if(!twoCycleRamInnerMux) dataMem.subdivideIn(cpuDataWidth bits).read(io.cpu.fetch.pc(bankWordToCpuWordRange)) else dataMem + } + + val waysValues = for((way, wayId) <- ways.zipWithIndex) yield new Area{ + val tag = if(asyncTagMemory) { + way.tags.readAsync(io.cpu.fetch.pc(lineRange)) + }else { + way.tags.readSync(io.cpu.prefetch.pc(lineRange), !io.cpu.fetch.isStuck) + } +// val data = CombInit(banksValue(wayId).data) + } + } + + + val hit = (!twoCycleRam) generate new Area{ + val hits = read.waysValues.map(way => way.tag.valid && way.tag.address === io.cpu.fetch.mmuRsp.physicalAddress(tagRange)) + val valid = Cat(hits).orR + val wayId = OHToUInt(hits) + val bankId = if(!reducedBankWidth) wayId else (wayId >> log2Up(bankCount/memToBankRatio)) @@ ((wayId + (io.cpu.fetch.mmuRsp.physicalAddress(log2Up(bankWidth/8), log2Up(bankCount) bits))).resize(log2Up(bankCount/memToBankRatio))) + val error = read.waysValues.map(_.tag.error).read(wayId) + val data = read.banksValue.map(_.data).read(bankId) + val word = if(cpuDataWidth == memDataWidth || !twoCycleRamInnerMux) CombInit(data) else data.subdivideIn(cpuDataWidth bits).read(io.cpu.fetch.pc(bankWordToCpuWordRange)) + io.cpu.fetch.data := (if(p.bypassGen) (io.cpu.fetch.dataBypassValid ? io.cpu.fetch.dataBypass | word) else word) + if(twoCycleCache){ + io.cpu.decode.data := RegNextWhen(io.cpu.fetch.data,!io.cpu.decode.isStuck) + } + } + + if(twoCycleRam && wayCount == 1){ + val cacheData = if(cpuDataWidth == memDataWidth || !twoCycleRamInnerMux) CombInit(read.banksValue.head.data) else read.banksValue.head.data.subdivideIn(cpuDataWidth bits).read(io.cpu.fetch.pc(bankWordToCpuWordRange)) + io.cpu.fetch.data := (if(p.bypassGen) (io.cpu.fetch.dataBypassValid ? 
io.cpu.fetch.dataBypass | cacheData) else cacheData) + } + + io.cpu.fetch.physicalAddress := io.cpu.fetch.mmuRsp.physicalAddress + + val resolution = ifGen(!twoCycleCache)( new Area{ + val mmuRsp = io.cpu.fetch.mmuRsp + + io.cpu.fetch.cacheMiss := !hit.valid + io.cpu.fetch.error := hit.error || (!mmuRsp.isPaging && (mmuRsp.exception || !mmuRsp.allowExecute)) + io.cpu.fetch.mmuRefilling := mmuRsp.refilling + io.cpu.fetch.mmuException := !mmuRsp.refilling && mmuRsp.isPaging && (mmuRsp.exception || !mmuRsp.allowExecute) + }) + } + + + + val decodeStage = ifGen(twoCycleCache) (new Area{ + def stage[T <: Data](that : T) = RegNextWhen(that,!io.cpu.decode.isStuck) + val mmuRsp = stage(io.cpu.fetch.mmuRsp) + + val hit = if(!twoCycleRam) new Area{ + val valid = stage(fetchStage.hit.valid) + val error = stage(fetchStage.hit.error) + } else new Area{ + val tags = fetchStage.read.waysValues.map(way => stage(way.tag)) + val hits = tags.map(tag => tag.valid && tag.address === mmuRsp.physicalAddress(tagRange)) + val valid = Cat(hits).orR + val wayId = OHToUInt(hits) + val bankId = if(!reducedBankWidth) wayId else (wayId >> log2Up(bankCount/memToBankRatio)) @@ ((wayId + (mmuRsp.physicalAddress(log2Up(bankWidth/8), log2Up(bankCount) bits))).resize(log2Up(bankCount/memToBankRatio))) + val error = tags(wayId).error + val data = fetchStage.read.banksValue.map(bank => stage(bank.data)).read(bankId) + val word = if(cpuDataWidth == memDataWidth || !twoCycleRamInnerMux) data else data.subdivideIn(cpuDataWidth bits).read(io.cpu.decode.pc(bankWordToCpuWordRange)) + if(p.bypassGen) when(stage(io.cpu.fetch.dataBypassValid)){ + word := stage(io.cpu.fetch.dataBypass) + } + io.cpu.decode.data := word + } + + io.cpu.decode.cacheMiss := !hit.valid + io.cpu.decode.error := hit.error || (!mmuRsp.isPaging && (mmuRsp.exception || !mmuRsp.allowExecute)) + io.cpu.decode.mmuRefilling := mmuRsp.refilling + io.cpu.decode.mmuException := !mmuRsp.refilling && mmuRsp.isPaging && (mmuRsp.exception || !mmuRsp.allowExecute) + io.cpu.decode.physicalAddress := mmuRsp.physicalAddress + }) +} + diff --git a/VexRiscv/src/main/scala/vexriscv/ip/fpu/FpuCore.scala b/VexRiscv/src/main/scala/vexriscv/ip/fpu/FpuCore.scala new file mode 100644 index 0000000..657b2fb --- /dev/null +++ b/VexRiscv/src/main/scala/vexriscv/ip/fpu/FpuCore.scala @@ -0,0 +1,1944 @@ +package vexriscv.ip.fpu + +import spinal.core._ +import spinal.lib._ +import spinal.lib.eda.bench.{Bench, Rtl, XilinxStdTargets} +import spinal.lib.math.UnsignedDivider + +import scala.collection.mutable.ArrayBuffer + +object FpuDivSqrtIterationState extends SpinalEnum{ + val IDLE, YY, XYY, Y2_XYY, DIV, _15_XYY2, Y_15_XYY2, Y_15_XYY2_RESULT, SQRT = newElement() +} + + +case class FpuCore( portCount : Int, p : FpuParameter) extends Component{ + val io = new Bundle { + val port = Vec(slave(FpuPort(p)), portCount) + } + + val portCountWidth = log2Up(portCount) + val Source = HardType(UInt(portCountWidth bits)) + val exponentOne = (1 << p.internalExponentSize-1) - 1 + val exponentF32Subnormal = exponentOne-127 + val exponentF64Subnormal = exponentOne-1023 + val exponentF32Infinity = exponentOne+127+1 + val exponentF64Infinity = exponentOne+1023+1 + + + + def whenDouble(format : FpuFormat.C)(yes : => Unit)(no : => Unit): Unit ={ + if(p.withDouble) when(format === FpuFormat.DOUBLE) { yes } otherwise{ no } + if(!p.withDouble) no + } + + def muxDouble[T <: Data](format : FpuFormat.C)(yes : => T)(no : => T): T ={ + if(p.withDouble) ((format === FpuFormat.DOUBLE) ? 
{ yes } | { no }) + else no + } + + case class RfReadInput() extends Bundle{ + val source = Source() + val opcode = p.Opcode() + val rs1, rs2, rs3 = p.rfAddress() + val rd = p.rfAddress() + val arg = p.Arg() + val roundMode = FpuRoundMode() + val format = p.withDouble generate FpuFormat() + } + + case class RfReadOutput() extends Bundle{ + val source = Source() + val opcode = p.Opcode() + val rs1, rs2, rs3 = p.internalFloating() + val rd = p.rfAddress() + val arg = p.Arg() + val roundMode = FpuRoundMode() + val format = p.withDouble generate FpuFormat() + val rs1Boxed, rs2Boxed = p.withDouble generate Bool() + } + + + case class LoadInput() extends Bundle{ + val source = Source() + val rd = p.rfAddress() + val i2f = Bool() + val arg = Bits(2 bits) + val roundMode = FpuRoundMode() + val format = p.withDouble generate FpuFormat() + } + + case class ShortPipInput() extends Bundle{ + val source = Source() + val opcode = p.Opcode() + val rs1, rs2 = p.internalFloating() + val rd = p.rfAddress() + val value = Bits(32 bits) + val arg = Bits(2 bits) + val roundMode = FpuRoundMode() + val format = p.withDouble generate FpuFormat() + val rs1Boxed, rs2Boxed = p.withDouble generate Bool() + } + + class MulInput() extends Bundle{ + val source = Source() + val rs1, rs2, rs3 = p.internalFloating() + val rd = p.rfAddress() + val add = Bool() + val divSqrt = Bool() + val msb1, msb2 = Bool() //allow usage of msb bits of mul + val roundMode = FpuRoundMode() + val format = p.withDouble generate FpuFormat() + } + + + case class DivSqrtInput() extends Bundle{ + val source = Source() + val rs1, rs2 = p.internalFloating() + val rd = p.rfAddress() + val div = Bool() + val roundMode = FpuRoundMode() + val format = p.withDouble generate FpuFormat() + } + + case class DivInput() extends Bundle{ + val source = Source() + val rs1, rs2 = p.internalFloating() + val rd = p.rfAddress() + val roundMode = FpuRoundMode() + val format = p.withDouble generate FpuFormat() + } + + + case class SqrtInput() extends Bundle{ + val source = Source() + val rs1 = p.internalFloating() + val rd = p.rfAddress() + val roundMode = FpuRoundMode() + val format = p.withDouble generate FpuFormat() + } + + + val addExtraBits = 2 + case class AddInput() extends Bundle{ + val source = Source() + val rs1, rs2 = FpuFloat(exponentSize = p.internalExponentSize, mantissaSize = p.internalMantissaSize+addExtraBits) + val rd = p.rfAddress() + val roundMode = FpuRoundMode() + val format = p.withDouble generate FpuFormat() + val needCommit = Bool() + } + + + class MergeInput() extends Bundle{ + val source = Source() + val rd = p.rfAddress() + val value = p.writeFloating() + val scrap = Bool() + val roundMode = FpuRoundMode() + val format = p.withDouble generate FpuFormat() + val NV = Bool() + val DZ = Bool() + } + + case class RoundOutput() extends Bundle{ + val source = Source() + val rd = p.rfAddress() + val value = p.internalFloating() + val format = p.withDouble generate FpuFormat() + val NV, NX, OF, UF, DZ = Bool() + val write = Bool() + } + + val rf = new Area{ + case class Entry() extends Bundle{ + val value = p.internalFloating() + val boxed = p.withDouble generate Bool() + } + val ram = Mem(Entry(), 32*portCount) + + val init = new Area{ + val counter = Reg(UInt(6 bits)) init(0) + val done = CombInit(counter.msb) + when(!done){ + counter := counter + 1 + } + def apply(port : Flow[MemWriteCmd[Bool]]) = { + port.valid := !done + port.address := counter.resized + port.data := False + port + } + } + + val scoreboards = Array.fill(portCount)(new Area{ + 
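+      // The two 1-bit memories below form an XOR scoreboard: register rd is
+      // busy while target(rd) differs from hit(rd). The scheduler toggles
+      // target(rd) at issue and the writeback stage toggles hit(rd) on
+      // completion, so the two writers never race on a shared "busy" bit.
+      // A minimal behavioural sketch of the idea (plain Scala, illustrative
+      // names only, never elaborated into hardware):
+      def xorScoreboardModel(): Unit = {
+        val tgt, ht = Array.fill(32)(false)
+        def busy(rd: Int) = tgt(rd) ^ ht(rd)
+        def issue(rd: Int) = tgt(rd) = !tgt(rd)   // producer side: mark rd pending
+        def complete(rd: Int) = ht(rd) = !ht(rd)  // consumer side: release rd
+        issue(5);    require(busy(5))
+        complete(5); require(!busy(5))
+      }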
val target, hit = Mem(Bool, 32) // XOR + val writes = Mem(Bool, 32) + + val targetWrite = init(target.writePort) + val hitWrite = init(hit.writePort) + }) + } + + val commitFork = new Area{ + val load, commit = Vec(Stream(FpuCommit(p)), portCount) + for(i <- 0 until portCount){ + val fork = new StreamFork(FpuCommit(p), 2, synchronous = true) + fork.io.input << io.port(i).commit + fork.io.outputs(0) >> load(i) + fork.io.outputs(1).pipelined(m2s = false, s2m = true) >> commit(i) //Pipelining here is light, as it only use the flags of the payload + } + } + + class Tracker(width : Int) extends Area{ + val counter = Reg(UInt(width bits)) init(0) + val full = counter.andR + val notEmpty = counter.orR + val inc = False + val dec = False + counter := counter + U(inc) - U(dec) + } + + class CommitArea(source : Int) extends Area{ + val pending = new Tracker(4) + val add, mul, div, sqrt, short = new Tracker(4) + val input = commitFork.commit(source).haltWhen(List(add, mul, div, sqrt, short).map(_.full).orR || !pending.notEmpty).toFlow + + when(input.fire){ + add.inc setWhen(List(FpuOpcode.ADD).map(input.opcode === _).orR) + mul.inc setWhen(List(FpuOpcode.MUL, FpuOpcode.FMA).map(input.opcode === _).orR) + div.inc setWhen(List(FpuOpcode.DIV).map(input.opcode === _).orR) + sqrt.inc setWhen(List(FpuOpcode.SQRT).map(input.opcode === _).orR) + short.inc setWhen(List(FpuOpcode.SGNJ, FpuOpcode.MIN_MAX, FpuOpcode.FCVT_X_X).map(input.opcode === _).orR) + rf.scoreboards(source).writes(input.rd) := input.write + pending.dec := True + } + } + + val commitLogic = for(source <- 0 until portCount) yield new CommitArea(source) + + def commitConsume(what : CommitArea => Tracker, source : UInt, fire : Bool) : Bool = { + for(i <- 0 until portCount) what(commitLogic(i)).dec setWhen(fire && source === i) + commitLogic.map(what(_).notEmpty).read(source) + } + + + val scheduler = for(portId <- 0 until portCount; + scoreboard = rf.scoreboards(portId)) yield new Area{ + val input = io.port(portId).cmd.pipelined(s2m = true) + val useRs1, useRs2, useRs3, useRd = False + switch(input.opcode){ + is(p.Opcode.LOAD) { useRd := True } + is(p.Opcode.STORE) { useRs2 := True } + is(p.Opcode.ADD) { useRd := True; useRs1 := True; useRs2 := True } + is(p.Opcode.MUL) { useRd := True; useRs1 := True; useRs2 := True } + is(p.Opcode.DIV) { useRd := True; useRs1 := True; useRs2 := True } + is(p.Opcode.SQRT) { useRd := True; useRs1 := True } + is(p.Opcode.FMA) { useRd := True; useRs1 := True; useRs2 := True; useRs3 := True } + is(p.Opcode.I2F) { useRd := True } + is(p.Opcode.F2I) { useRs1 := True } + is(p.Opcode.MIN_MAX) { useRd := True; useRs1 := True; useRs2 := True } + is(p.Opcode.CMP) { useRs1 := True; useRs2 := True } + is(p.Opcode.SGNJ) { useRd := True; useRs1 := True; useRs2 := True } + is(p.Opcode.FMV_X_W) { useRs1 := True } + is(p.Opcode.FMV_W_X) { useRd := True } + is(p.Opcode.FCLASS ) { useRs1 := True } + is(p.Opcode.FCVT_X_X ) { useRd := True; useRs1 := True } + } + + val uses = List(useRs1, useRs2, useRs3, useRd) + val regs = List(input.rs1, input.rs2, input.rs3, input.rd) + val rfHits = regs.map(scoreboard.hit.readAsync(_)) + val rfTargets = regs.map(scoreboard.target.readAsync(_)) + val rfBusy = (rfHits, rfTargets).zipped.map(_ ^ _) + + val hits = (0 to 3).map(id => uses(id) && rfBusy(id)) + val hazard = hits.orR || !rf.init.done || commitLogic(portId).pending.full + val output = input.haltWhen(hazard) + when(input.opcode === p.Opcode.STORE){ + output.rs1 := input.rs2 //Datapath optimisation to unify rs source in the store 
pipeline + } + when(input.valid && rf.init.done){ + scoreboard.targetWrite.address := input.rd + scoreboard.targetWrite.data := !rfTargets.last + } + when(output.fire && useRd){ + scoreboard.targetWrite.valid := True + commitLogic(portId).pending.inc := True + } + } + + + val cmdArbiter = new Area{ + val arbiter = StreamArbiterFactory.noLock.roundRobin.build(FpuCmd(p), portCount) + arbiter.io.inputs <> Vec(scheduler.map(_.output.pipelined(m2s = p.schedulerM2sPipe))) + + val output = arbiter.io.output.swapPayload(RfReadInput()) + output.source := arbiter.io.chosen + output.payload.assignSomeByName(arbiter.io.output.payload) + } + + val read = new Area{ + val s0 = cmdArbiter.output.pipelined() + val s1 = s0.m2sPipe() + val output = s1.swapPayload(RfReadOutput()) + val rs = if(p.asyncRegFile){ + List(s1.rs1, s1.rs2, s1.rs3).map(a => rf.ram.readAsync(s1.source @@ a)) + } else { + List(s0.rs1, s0.rs2, s0.rs3).map(a => rf.ram.readSync(s0.source @@ a, enable = !output.isStall)) + } + output.source := s1.source + output.opcode := s1.opcode + output.arg := s1.arg + output.roundMode := s1.roundMode + output.rd := s1.rd + output.rs1 := rs(0).value + output.rs2 := rs(1).value + output.rs3 := rs(2).value + if(p.withDouble){ + output.rs1Boxed := rs(0).boxed + output.rs2Boxed := rs(1).boxed + output.format := s1.format + val store = s1.opcode === FpuOpcode.STORE ||s1.opcode === FpuOpcode.FMV_X_W + val sgnjBypass = s1.opcode === FpuOpcode.SGNJ && s1.format === FpuFormat.DOUBLE + when(!sgnjBypass) { + when(store) { //Pass through + output.format := rs(0).boxed ? FpuFormat.FLOAT | FpuFormat.DOUBLE + } elsewhen (s1.format === FpuFormat.FLOAT =/= rs(0).boxed) { + output.rs1.setNanQuiet + output.rs1.sign := False + } + } + when(s1.format === FpuFormat.FLOAT =/= rs(1).boxed) { + output.rs2.setNanQuiet + output.rs2.sign := False + } + when(s1.format === FpuFormat.FLOAT =/= rs(2).boxed) { + output.rs3.setNanQuiet + } + } + } + + val decode = new Area{ + val input = read.output/*.s2mPipe()*/.combStage() + input.ready := False + + val loadHit = List(FpuOpcode.LOAD, FpuOpcode.FMV_W_X, FpuOpcode.I2F).map(input.opcode === _).orR + val load = Stream(LoadInput()) + load.valid := input.valid && loadHit + input.ready setWhen(loadHit && load.ready) + load.payload.assignSomeByName(input.payload) + load.i2f := input.opcode === FpuOpcode.I2F + + val shortPipHit = List(FpuOpcode.STORE, FpuOpcode.F2I, FpuOpcode.CMP, FpuOpcode.MIN_MAX, FpuOpcode.SGNJ, FpuOpcode.FMV_X_W, FpuOpcode.FCLASS, FpuOpcode.FCVT_X_X).map(input.opcode === _).orR + val shortPip = Stream(ShortPipInput()) + input.ready setWhen(shortPipHit && shortPip.ready) + shortPip.valid := input.valid && shortPipHit + shortPip.payload.assignSomeByName(input.payload) + + val divSqrtHit = input.opcode === p.Opcode.DIV || input.opcode === p.Opcode.SQRT + val divSqrt = Stream(DivSqrtInput()) + if(p.withDivSqrt) { + input.ready setWhen (divSqrtHit && divSqrt.ready) + divSqrt.valid := input.valid && divSqrtHit + divSqrt.payload.assignSomeByName(input.payload) + divSqrt.div := input.opcode === p.Opcode.DIV + } + + val divHit = input.opcode === p.Opcode.DIV + val div = Stream(DivInput()) + if(p.withDiv) { + input.ready setWhen (divHit && div.ready) + div.valid := input.valid && divHit + div.payload.assignSomeByName(input.payload) + } + + val sqrtHit = input.opcode === p.Opcode.SQRT + val sqrt = Stream(SqrtInput()) + if(p.withSqrt) { + input.ready setWhen (sqrtHit && sqrt.ready) + sqrt.valid := input.valid && sqrtHit + sqrt.payload.assignSomeByName(input.payload) + } + + + val 
fmaHit = input.opcode === p.Opcode.FMA + val mulHit = input.opcode === p.Opcode.MUL || fmaHit + val mul = Stream(new MulInput()) + val divSqrtToMul = Stream(new MulInput()) + if(!p.withDivSqrt){ + divSqrtToMul.valid := False + divSqrtToMul.payload.assignDontCare() + } + + if(p.withMul) { + input.ready setWhen (mulHit && mul.ready && !divSqrtToMul.valid) + mul.valid := input.valid && mulHit || divSqrtToMul.valid + + divSqrtToMul.ready := mul.ready + mul.payload := divSqrtToMul.payload + when(!divSqrtToMul.valid) { + mul.payload.assignSomeByName(input.payload) + mul.add := fmaHit + mul.divSqrt := False + mul.msb1 := True + mul.msb2 := True + mul.rs2.sign.allowOverride(); + mul.rs2.sign := input.rs2.sign ^ input.arg(0) + mul.rs3.sign.allowOverride(); + mul.rs3.sign := input.rs3.sign ^ input.arg(1) + } + } + + val addHit = input.opcode === p.Opcode.ADD + val add = Stream(AddInput()) + val mulToAdd = Stream(AddInput()) + + + if(p.withAdd) { + input.ready setWhen (addHit && add.ready && !mulToAdd.valid) + add.valid := input.valid && addHit || mulToAdd.valid + + mulToAdd.ready := add.ready + add.payload := mulToAdd.payload + when(!mulToAdd.valid) { + add.source := input.source + add.rd := input.rd + add.roundMode := input.roundMode + if(p.withDouble) add.format := input.format + add.needCommit := True + add.rs1.special := input.rs1.special + add.rs2.special := input.rs2.special + add.rs1.exponent := input.rs1.exponent + add.rs2.exponent := input.rs2.exponent + add.rs1.sign := input.rs1.sign + add.rs2.sign := input.rs2.sign ^ input.arg(0) + add.rs1.mantissa := input.rs1.mantissa << addExtraBits + add.rs2.mantissa := input.rs2.mantissa << addExtraBits + } + } + } + + val load = new Area{ + + case class S0() extends Bundle{ + val source = Source() + val rd = p.rfAddress() + val value = p.storeLoadType() + val i2f = Bool() + val arg = Bits(2 bits) + val roundMode = FpuRoundMode() + val format = p.withDouble generate FpuFormat() + } + + val s0 = new Area{ + val input = decode.load.pipelined(m2s = true, s2m = true).stage() + val filtred = commitFork.load.map(port => port.takeWhen(List(FpuOpcode.LOAD, FpuOpcode.FMV_W_X, FpuOpcode.I2F).map(_ === port.opcode).orR)) + def feed = filtred(input.source) + val hazard = !feed.valid + + + val output = input.haltWhen(hazard).swapPayload(S0()) + filtred.foreach(_.ready := False) + feed.ready := input.valid && output.ready + output.source := input.source + output.rd := input.rd + output.value := feed.value + output.i2f := input.i2f + output.arg := input.arg + output.roundMode := input.roundMode + if(p.withDouble) { + output.format := input.format + when(!input.i2f && input.format === FpuFormat.DOUBLE && output.value(63 downto 32).andR){ //Detect boxing + output.format := FpuFormat.FLOAT + } + } + } + + val s1 = new Area{ + val input = s0.output.stage() + val busy = False + + val f32 = new Area{ + val mantissa = input.value(0, 23 bits).asUInt + val exponent = input.value(23, 8 bits).asUInt + val sign = input.value(31) + } + val f64 = p.withDouble generate new Area{ + val mantissa = input.value(0, 52 bits).asUInt + val exponent = input.value(52, 11 bits).asUInt + val sign = input.value(63) + } + + val recodedExpOffset = UInt(p.internalExponentSize bits) + val passThroughFloat = p.internalFloating() + passThroughFloat.special := False + + whenDouble(input.format){ + passThroughFloat.sign := f64.sign + passThroughFloat.exponent := f64.exponent.resized + passThroughFloat.mantissa := f64.mantissa + recodedExpOffset := exponentF64Subnormal + } { + passThroughFloat.sign := 
f32.sign + passThroughFloat.exponent := f32.exponent.resized + passThroughFloat.mantissa := f32.mantissa << (if (p.withDouble) 29 else 0) + recodedExpOffset := exponentF32Subnormal + } + + + val manZero = passThroughFloat.mantissa === 0 + val expZero = passThroughFloat.exponent === 0 + val expOne = passThroughFloat.exponent(7 downto 0).andR + if(p.withDouble) { + expZero.clearWhen(input.format === FpuFormat.DOUBLE && input.value(62 downto 60) =/= 0) + expOne.clearWhen(input.format === FpuFormat.DOUBLE && input.value(62 downto 60) =/= 7) + } + + val isZero = expZero && manZero + val isSubnormal = expZero && !manZero + val isInfinity = expOne && manZero + val isNan = expOne && !manZero + + + val fsm = new Area{ + val done, boot, patched = Reg(Bool()) + val ohInputWidth = 32 max p.internalMantissaSize + val ohInput = Bits(ohInputWidth bits).assignDontCare() + when(!input.i2f) { + if(!p.withDouble) ohInput := input.value(0, 23 bits) << 9 + if( p.withDouble) ohInput := passThroughFloat.mantissa.asBits + } otherwise { + ohInput(ohInputWidth-32-1 downto 0) := 0 + ohInput(ohInputWidth-32, 32 bits) := input.value(31 downto 0) + } + + val i2fZero = Reg(Bool) + + val shift = new Area{ + val by = Reg(UInt(log2Up(ohInputWidth) bits)) + val input = UInt(ohInputWidth bits).assignDontCare() + var logic = input + for(i <- by.range){ + logic \= by(i) ? (logic |<< (BigInt(1) << i)) | logic + } + val output = RegNextWhen(logic, !done) + } + shift.input := (ohInput.asUInt |<< 1).resized + + when(input.valid && (input.i2f || isSubnormal) && !done){ + busy := True + when(boot){ + when(input.i2f && !patched && input.value(31) && input.arg(0)){ + input.value.getDrivingReg(0, 32 bits) := B(input.value.asUInt.twoComplement(True).resize(32 bits)) + patched := True + } otherwise { + shift.by := OHToUInt(OHMasking.first((ohInput).reversed)) + boot := False + i2fZero := input.value(31 downto 0) === 0 + } + } otherwise { + done := True + } + } + + val expOffset = (UInt(p.internalExponentSize bits)) + expOffset := 0 + when(isSubnormal){ + expOffset := shift.by.resized + } + + when(!input.isStall){ + done := False + boot := True + patched := False + } + } + + + val i2fSign = fsm.patched + val (i2fHigh, i2fLow) = fsm.shift.output.splitAt(if(p.withDouble) 0 else widthOf(input.value)-24) + val scrap = i2fLow =/= 0 + + val recoded = p.internalFloating() + recoded.mantissa := passThroughFloat.mantissa + recoded.exponent := (passThroughFloat.exponent -^ fsm.expOffset + recodedExpOffset).resized + recoded.sign := passThroughFloat.sign + recoded.setNormal + when(isZero){recoded.setZero} + when(isInfinity){recoded.setInfinity} + when(isNan){recoded.setNan} + + val output = input.haltWhen(busy).swapPayload(new MergeInput()) + output.source := input.source + output.roundMode := input.roundMode + if(p.withDouble) { + output.format := input.format + } + output.rd := input.rd + output.value.sign := recoded.sign + output.value.exponent := recoded.exponent + output.value.mantissa := recoded.mantissa @@ U"0" + output.value.special := recoded.special + output.scrap := False + output.NV := False + output.DZ := False + when(input.i2f){ + output.value.sign := i2fSign + output.value.exponent := (U(exponentOne+31) - fsm.shift.by).resized + output.value.setNormal + output.scrap := scrap + when(fsm.i2fZero) { output.value.setZero } + } + + when(input.i2f || isSubnormal){ + output.value.mantissa := U(i2fHigh) @@ (if(p.withDouble) U"0" else U"") + } + } + + } + + val shortPip = new Area{ + val input = decode.shortPip.stage() + + val toFpuRf = 
List(FpuOpcode.MIN_MAX, FpuOpcode.SGNJ, FpuOpcode.FCVT_X_X).map(input.opcode === _).orR + val rfOutput = Stream(new MergeInput()) + + val isCommited = commitConsume(_.short, input.source, input.fire && toFpuRf) + val output = rfOutput.haltWhen(!isCommited) + + val result = p.storeLoadType().assignDontCare() + + val halt = False + val recodedResult = p.storeLoadType() + val f32 = new Area{ + val exp = (input.rs1.exponent - (exponentOne-127)).resize(8 bits) + val man = CombInit(input.rs1.mantissa(if(p.withDouble) 51 downto 29 else 22 downto 0)) + } + val f64 = p.withDouble generate new Area{ + val exp = (input.rs1.exponent - (exponentOne-1023)).resize(11 bits) + val man = CombInit(input.rs1.mantissa) + } + + whenDouble(input.format){ + recodedResult := input.rs1.sign ## f64.exp ## f64.man + } { + recodedResult := (if(p.withDouble) B"xFFFFFFFF" else B"") ## input.rs1.sign ## f32.exp ## f32.man + } + + val expSubnormalThreshold = muxDouble[UInt](input.format)(exponentF64Subnormal)(exponentF32Subnormal) + val expInSubnormalRange = input.rs1.exponent <= expSubnormalThreshold + val isSubnormal = !input.rs1.special && expInSubnormalRange + val isNormal = !input.rs1.special && !expInSubnormalRange + val fsm = new Area{ + val f2iShift = input.rs1.exponent - U(exponentOne) + val isF2i = input.opcode === FpuOpcode.F2I + val needRecoding = List(FpuOpcode.FMV_X_W, FpuOpcode.STORE).map(_ === input.opcode).orR && isSubnormal + val done, boot = Reg(Bool()) + val isZero = input.rs1.isZero// || input.rs1.exponent < exponentOne-1 + + val shift = new Area{ + val by = Reg(UInt(log2Up(p.internalMantissaSize+1 max 33) bits)) + val input = UInt(p.internalMantissaSize+1 max 33 bits).assignDontCare() + var logic = input + val scrap = Reg(Bool) + for(i <- by.range.reverse){ + scrap setWhen(by(i) && logic(0, 1 << i bits) =/= 0) + logic \= by(i) ? 
(logic |>> (BigInt(1) << i)) | logic + } + when(boot){ + scrap := False + } + val output = RegNextWhen(logic, !done) + } + + shift.input := (U(!isZero) @@ input.rs1.mantissa) << (if(p.withDouble) 0 else 9) + + val formatShiftOffset = muxDouble[UInt](input.format)(exponentOne-1023+1)(exponentOne - (if(p.withDouble) (127+34) else (127-10))) + when(input.valid && (needRecoding || isF2i) && !done){ + halt := True + when(boot){ + when(isF2i){ + shift.by := ((U(exponentOne + 31) - input.rs1.exponent).min(U(33)) + (if(p.withDouble) 20 else 0)).resized //TODO merge + } otherwise { + shift.by := (formatShiftOffset - input.rs1.exponent).resized + } + boot := False + } otherwise { + done := True + } + } + + when(!input.isStall){ + done := False + boot := True + } + } + + val mantissaForced = False + val exponentForced = False + val mantissaForcedValue = Bool().assignDontCare() + val exponentForcedValue = Bool().assignDontCare() + val cononicalForced = False + + + when(input.rs1.special){ + switch(input.rs1.exponent(1 downto 0)){ + is(FpuFloat.ZERO){ + mantissaForced := True + exponentForced := True + mantissaForcedValue := False + exponentForcedValue := False + } + is(FpuFloat.INFINITY){ + mantissaForced := True + exponentForced := True + mantissaForcedValue := False + exponentForcedValue := True + } + is(FpuFloat.NAN){ + exponentForced := True + exponentForcedValue := True + when(input.rs1.isCanonical){ + cononicalForced := True + mantissaForced := True + mantissaForcedValue := False + } + } + } + } + + + + when(isSubnormal){ + exponentForced := True + exponentForcedValue := False + recodedResult(0,23 bits) := fsm.shift.output(22 downto 0).asBits + whenDouble(input.format){ + recodedResult(51 downto 23) := fsm.shift.output(51 downto 23).asBits + }{} + } + when(mantissaForced){ + recodedResult(0,23 bits) := (default -> mantissaForcedValue) + whenDouble(input.format){ + recodedResult(23, 52-23 bits) := (default -> mantissaForcedValue) + }{} + } + when(exponentForced){ + whenDouble(input.format){ + recodedResult(52, 11 bits) := (default -> exponentForcedValue) + } { + recodedResult(23, 8 bits) := (default -> exponentForcedValue) + } + } + when(cononicalForced){ + whenDouble(input.format){ + recodedResult(63) := False + recodedResult(51) := True + } { + recodedResult(31) := False + recodedResult(22) := True + } + } + + val rspNv = False + val rspNx = False + + val f2i = new Area{ //Will not work for 64 bits float max value rounding + val unsigned = fsm.shift.output(32 downto 0) >> 1 + val resign = input.arg(0) && input.rs1.sign + val round = fsm.shift.output(0) ## fsm.shift.scrap + val increment = input.roundMode.mux( + FpuRoundMode.RNE -> (round(1) && (round(0) || unsigned(0))), + FpuRoundMode.RTZ -> False, + FpuRoundMode.RDN -> (round =/= 0 && input.rs1.sign), + FpuRoundMode.RUP -> (round =/= 0 && !input.rs1.sign), + FpuRoundMode.RMM -> (round(1)) + ) + val result = (Mux(resign, ~unsigned, unsigned) + (resign ^ increment).asUInt) + val overflow = (input.rs1.exponent > (input.arg(0) ? 
U(exponentOne+30) | U(exponentOne+31)) || input.rs1.isInfinity) && !input.rs1.sign || input.rs1.isNan + val underflow = (input.rs1.exponent > U(exponentOne+31) || input.arg(0) && unsigned.msb && (unsigned(30 downto 0) =/= 0 || increment) || !input.arg(0) && (unsigned =/= 0 || increment) || input.rs1.isInfinity) && input.rs1.sign + val isZero = input.rs1.isZero + if(p.withDouble){ + overflow setWhen(!input.rs1.sign && increment && unsigned(30 downto 0).andR && (input.arg(0) || unsigned(31))) + } + when(isZero){ + result := 0 + } elsewhen(underflow || overflow) { + val low = overflow + val high = input.arg(0) ^ overflow + result := (31 -> high, default -> low) + rspNv := input.valid && input.opcode === FpuOpcode.F2I && fsm.done && !isZero + } otherwise { + rspNx := input.valid && input.opcode === FpuOpcode.F2I && fsm.done && round =/= 0 + } + } + + val bothZero = input.rs1.isZero && input.rs2.isZero + val rs1Equal = input.rs1 === input.rs2 + val rs1AbsSmaller = (input.rs1.exponent @@ input.rs1.mantissa) < (input.rs2.exponent @@ input.rs2.mantissa) + rs1AbsSmaller.setWhen(input.rs2.isInfinity) + rs1AbsSmaller.setWhen(input.rs1.isZero) + rs1AbsSmaller.clearWhen(input.rs2.isZero) + rs1AbsSmaller.clearWhen(input.rs1.isInfinity) + rs1Equal setWhen(input.rs1.sign === input.rs2.sign && input.rs1.isInfinity && input.rs2.isInfinity) + val rs1Smaller = (input.rs1.sign ## input.rs2.sign).mux( + 0 -> rs1AbsSmaller, + 1 -> False, + 2 -> True, + 3 -> (!rs1AbsSmaller && !rs1Equal) + ) + + + val minMaxSelectRs2 = !(((rs1Smaller ^ input.arg(0)) && !input.rs1.isNan || input.rs2.isNan)) + val minMaxSelectNanQuiet = input.rs1.isNan && input.rs2.isNan + val cmpResult = B(rs1Smaller && !bothZero && !input.arg(1) || (rs1Equal || bothZero) && !input.arg(0)) + when(input.rs1.isNan || input.rs2.isNan) { cmpResult := 0 } + val sgnjRs1Sign = CombInit(input.rs1.sign) + val sgnjRs2Sign = CombInit(input.rs2.sign) + if(p.withDouble){ + sgnjRs2Sign setWhen(input.rs2Boxed && input.format === FpuFormat.DOUBLE) + } + val sgnjResult = (sgnjRs1Sign && input.arg(1)) ^ sgnjRs2Sign ^ input.arg(0) + val fclassResult = B(0, 32 bits) + val decoded = input.rs1.decode() + fclassResult(0) := input.rs1.sign && decoded.isInfinity + fclassResult(1) := input.rs1.sign && isNormal + fclassResult(2) := input.rs1.sign && isSubnormal + fclassResult(3) := input.rs1.sign && decoded.isZero + fclassResult(4) := !input.rs1.sign && decoded.isZero + fclassResult(5) := !input.rs1.sign && isSubnormal + fclassResult(6) := !input.rs1.sign && isNormal + fclassResult(7) := !input.rs1.sign && decoded.isInfinity + fclassResult(8) := decoded.isNan && !decoded.isQuiet + fclassResult(9) := decoded.isNan && decoded.isQuiet + + + switch(input.opcode){ + is(FpuOpcode.STORE) { result := recodedResult } + is(FpuOpcode.FMV_X_W) { result := recodedResult } + is(FpuOpcode.F2I) { result(31 downto 0) := f2i.result.asBits } + is(FpuOpcode.CMP) { result(31 downto 0) := cmpResult.resized } + is(FpuOpcode.FCLASS) { result(31 downto 0) := fclassResult.resized } + } + + + rfOutput.valid := input.valid && toFpuRf && !halt + rfOutput.source := input.source + rfOutput.rd := input.rd + rfOutput.roundMode := input.roundMode + if(p.withDouble) rfOutput.format := input.format + rfOutput.scrap := False + rfOutput.value.sign := input.rs1.sign + rfOutput.value.exponent := input.rs1.exponent + rfOutput.value.mantissa := input.rs1.mantissa @@ U"0" + rfOutput.value.special := input.rs1.special + + switch(input.opcode){ + is(FpuOpcode.MIN_MAX){ + when(minMaxSelectRs2) { + rfOutput.value.sign 
:= input.rs2.sign + rfOutput.value.exponent := input.rs2.exponent + rfOutput.value.mantissa := input.rs2.mantissa @@ U"0" + rfOutput.value.special := input.rs2.special + } + when(minMaxSelectNanQuiet){ + rfOutput.value.setNanQuiet + } + } + is(FpuOpcode.SGNJ){ + when(!input.rs1.isNan) { + rfOutput.value.sign := sgnjResult + } + if(p.withDouble) when(input.rs1Boxed && input.format === FpuFormat.DOUBLE){ + rfOutput.value.sign := input.rs1.sign + rfOutput.format := FpuFormat.FLOAT + } + } + if(p.withDouble) is(FpuOpcode.FCVT_X_X){ + rfOutput.format := ((input.format === FpuFormat.FLOAT) ? FpuFormat.DOUBLE | FpuFormat.FLOAT) + when(input.rs1.isNan){ + rfOutput.value.setNanQuiet + } + } + } + + val signalQuiet = input.opcode === FpuOpcode.CMP && input.arg =/= 2 + val rs1Nan = input.rs1.isNan + val rs2Nan = input.rs2.isNan + val rs1NanNv = input.rs1.isNan && (!input.rs1.isQuiet || signalQuiet) + val rs2NanNv = input.rs2.isNan && (!input.rs2.isQuiet || signalQuiet) + val NV = List(FpuOpcode.CMP, FpuOpcode.MIN_MAX, FpuOpcode.FCVT_X_X).map(input.opcode === _).orR && rs1NanNv || + List(FpuOpcode.CMP, FpuOpcode.MIN_MAX).map(input.opcode === _).orR && rs2NanNv + rspNv setWhen(NV) + + val rspStreams = Vec(Stream(FpuRsp(p)), portCount) + input.ready := !halt && (toFpuRf ? rfOutput.ready | rspStreams.map(_.ready).read(input.source)) + for(i <- 0 until portCount){ + def rsp = rspStreams(i) + rsp.valid := input.valid && input.source === i && !toFpuRf && !halt + rsp.value := result + rsp.NV := rspNv + rsp.NX := rspNx + io.port(i).rsp << rsp.stage() + } + + + rfOutput.NV := NV + rfOutput.DZ := False + } + + val mul = p.withMul generate new Area{ + val inWidthA = p.internalMantissaSize+1 + val inWidthB = p.internalMantissaSize+1 + val outWidth = p.internalMantissaSize*2+2 + + case class MulSplit(offsetA : Int, offsetB : Int, widthA : Int, widthB : Int, id : Int){ + val offsetC = offsetA+offsetB + val widthC = widthA + widthB + val endC = offsetC+widthC + } + val splitsUnordered = for(offsetA <- 0 until inWidthA by p.mulWidthA; + offsetB <- 0 until inWidthB by p.mulWidthB; + widthA = (inWidthA - offsetA) min p.mulWidthA; + widthB = (inWidthB - offsetB) min p.mulWidthB) yield { + MulSplit(offsetA, offsetB, widthA, widthB, -1) + } + val splits = splitsUnordered.sortWith(_.endC < _.endC).zipWithIndex.map(e => e._1.copy(id=e._2)) + + class MathWithExp extends MulInput{ + val exp = UInt(p.internalExponentSize+1 bits) + } + val preMul = new Area{ + val input = decode.mul.stage() + val output = input.swapPayload(new MathWithExp()) + output.payload.assignSomeByName(input.payload) + output.exp := input.rs1.exponent +^ input.rs2.exponent + } + class MathWithMul extends MathWithExp{ + val muls = Vec(splits.map(e => UInt(e.widthA + e.widthB bits))) + } + val mul = new Area{ + val input = preMul.output.stage() + val output = input.swapPayload(new MathWithMul()) + val mulA = U(input.msb1) @@ input.rs1.mantissa + val mulB = U(input.msb2) @@ input.rs2.mantissa + output.payload.assignSomeByName(input.payload) + splits.foreach(e => output.muls(e.id) := mulA(e.offsetA, e.widthA bits) * mulB(e.offsetB, e.widthB bits)) + } + + val sumSplitAt = splits.size/2//splits.filter(e => e.endC <= p.internalMantissaSize).size + + class Sum1Output extends MathWithExp{ + val muls2 = Vec(splits.drop(sumSplitAt).map(e => UInt(e.widthA + e.widthB bits))) + val mulC2 = UInt(p.internalMantissaSize*2+2 bits) + } + class Sum2Output extends MathWithExp{ + val mulC = UInt(p.internalMantissaSize*2+2 bits) + } + + val sum1 = new Area { + val input = 
mul.output.stage() + val sum = splits.take(sumSplitAt).map(e => (input.muls(e.id) << e.offsetC).resize(outWidth)).reduceBalancedTree(_ + _) + + val output = input.swapPayload(new Sum1Output()) + output.payload.assignSomeByName(input.payload) + output.mulC2 := sum.resized + output.muls2 := Vec(input.muls.drop(sumSplitAt)) + } + + val sum2 = new Area { + val input = sum1.output.stage() + val sum = input.mulC2 + splits.drop(sumSplitAt).map(e => (input.muls2(e.id-sumSplitAt) << e.offsetC).resize(outWidth)).reduceBalancedTree(_ + _) + + val isCommited = commitConsume(_.mul, input.source, input.fire) + val output = input.haltWhen(!isCommited).swapPayload(new Sum2Output()) + output.payload.assignSomeByName(input.payload) + output.mulC := sum + } + + val norm = new Area{ + val input = sum2.output.stage() + val (mulHigh, mulLow) = input.mulC.splitAt(p.internalMantissaSize-1) + val scrap = mulLow =/= 0 + val needShift = mulHigh.msb + val exp = input.exp + U(needShift) + val man = needShift ? mulHigh(1, p.internalMantissaSize+1 bits) | mulHigh(0, p.internalMantissaSize+1 bits) + scrap setWhen(needShift && mulHigh(0)) + val forceZero = input.rs1.isZero || input.rs2.isZero + val underflowThreshold = muxDouble[UInt](input.format)(exponentOne + exponentOne - 1023 - 53) (exponentOne + exponentOne - 127 - 24) + val underflowExp = muxDouble[UInt](input.format)(exponentOne - 1023 - 54) (exponentOne - 127 - 25) + val forceUnderflow = exp < underflowThreshold + val forceOverflow = input.rs1.isInfinity || input.rs2.isInfinity + val infinitynan = ((input.rs1.isInfinity || input.rs2.isInfinity) && (input.rs1.isZero || input.rs2.isZero)) + val forceNan = input.rs1.isNan || input.rs2.isNan || infinitynan + + val output = p.writeFloating() + output.sign := input.rs1.sign ^ input.rs2.sign + output.exponent := (exp - exponentOne).resized + output.mantissa := man.asUInt + output.setNormal + val NV = False + + when(exp(exp.getWidth-3, 3 bits) >= 5) { output.exponent(p.internalExponentSize-2, 2 bits) := 3 } + + when(forceNan) { + output.setNanQuiet + NV setWhen(infinitynan || input.rs1.isNanSignaling || input.rs2.isNanSignaling) + } elsewhen(forceOverflow) { + output.setInfinity + } elsewhen(forceZero) { + output.setZero + } elsewhen(forceUnderflow) { + output.exponent := underflowExp.resized + } + } + + val result = new Area { + def input = norm.input + def NV = norm.NV + + val notMul = new Area { + val output = Flow(UInt(p.internalMantissaSize + 1 bits)) + output.valid := input.valid && input.divSqrt + output.payload := input.mulC(p.internalMantissaSize, p.internalMantissaSize + 1 bits) + } + + val output = Stream(new MergeInput()) + output.valid := input.valid && !input.add && !input.divSqrt + output.source := input.source + output.rd := input.rd + if (p.withDouble) output.format := input.format + output.roundMode := input.roundMode + output.scrap := norm.scrap + output.value := norm.output + output.NV := NV + output.DZ := False + + val mulToAdd = Stream(AddInput()) + decode.mulToAdd << mulToAdd.stage() + + mulToAdd.valid := input.valid && input.add + mulToAdd.source := input.source + mulToAdd.rs1.mantissa := norm.output.mantissa @@ norm.scrap //FMA Precision lost + mulToAdd.rs1.exponent := norm.output.exponent + mulToAdd.rs1.sign := norm.output.sign + mulToAdd.rs1.special := norm.output.special + mulToAdd.rs2 := input.rs3 + mulToAdd.rs2.mantissa.removeAssignments() := input.rs3.mantissa << addExtraBits + mulToAdd.rd := input.rd + mulToAdd.roundMode := input.roundMode + mulToAdd.needCommit := False + if (p.withDouble) 
mulToAdd.format := input.format + + when(NV){ + mulToAdd.rs1.mantissa.msb := False + } + + input.ready := (input.add ? mulToAdd.ready | output.ready) || input.divSqrt + } + } + + + val div = p.withDiv generate new Area{ + val input = decode.div.halfPipe() + val haltIt = True + val isCommited = RegNext(commitConsume(_.div, input.source, input.fire)) + val output = input.haltWhen(haltIt || !isCommited).swapPayload(new MergeInput()) + + val dividerShift = if(p.withDouble) 0 else 1 + val divider = FpuDiv(p.internalMantissaSize + dividerShift) + divider.io.input.a := input.rs1.mantissa << dividerShift + divider.io.input.b := input.rs2.mantissa << dividerShift + val dividerResult = divider.io.output.result >> dividerShift + val dividerScrap = divider.io.output.remain =/= 0 || divider.io.output.result(0, dividerShift bits) =/= 0 + + val cmdSent = RegInit(False) setWhen(divider.io.input.fire) clearWhen(!haltIt) + divider.io.input.valid := input.valid && !cmdSent + divider.io.output.ready := input.ready + output.payload.assignSomeByName(input.payload) + + val needShift = !dividerResult.msb + val mantissa = needShift ? dividerResult(0, p.internalMantissaSize + 1 bits) | dividerResult(1, p.internalMantissaSize + 1 bits) + val scrap = dividerScrap || !needShift && dividerResult(0) + val exponentOffset = 1 << (p.internalExponentSize + 1) + val exponent = input.rs1.exponent + U(exponentOffset | exponentOne) - input.rs2.exponent - U(needShift) + + output.value.setNormal + output.value.sign := input.rs1.sign ^ input.rs2.sign + output.value.exponent := exponent.resized + output.value.mantissa := mantissa + output.scrap := scrap + when(exponent.takeHigh(2) === 3){ output.value.exponent(p.internalExponentSize-3, 3 bits) := 7} //Handle overflow + + + + val underflowThreshold = muxDouble[UInt](input.format)(exponentOne + exponentOffset - 1023 - 53) (exponentOne + exponentOffset - 127 - 24) + val underflowExp = muxDouble[UInt](input.format)(exponentOne + exponentOffset - 1023 - 54) (exponentOne + exponentOffset - 127 - 25) + val forceUnderflow = exponent < underflowThreshold + val forceOverflow = input.rs1.isInfinity || input.rs2.isZero + val infinitynan = input.rs1.isZero && input.rs2.isZero || input.rs1.isInfinity && input.rs2.isInfinity + val forceNan = input.rs1.isNan || input.rs2.isNan || infinitynan + val forceZero = input.rs1.isZero || input.rs2.isInfinity + + + + output.NV := False + output.DZ := !forceNan && !input.rs1.isInfinity && input.rs2.isZero + + when(exponent(exponent.getWidth-3, 3 bits) === 7) { output.value.exponent(p.internalExponentSize-2, 2 bits) := 3 } + + when(forceNan) { + output.value.setNanQuiet + output.NV setWhen((infinitynan || input.rs1.isNanSignaling || input.rs2.isNanSignaling)) + } elsewhen(forceOverflow) { + output.value.setInfinity + } elsewhen(forceZero) { + output.value.setZero + } elsewhen(forceUnderflow) { + output.value.exponent := underflowExp.resized + } + + + haltIt clearWhen(divider.io.output.valid) + } + + + + val sqrt = p.withSqrt generate new Area{ + val input = decode.sqrt.halfPipe() + val haltIt = True + val isCommited = RegNext(commitConsume(_.sqrt, input.source, input.fire)) + val output = input.haltWhen(haltIt || !isCommited).swapPayload(new MergeInput()) + + val needShift = !input.rs1.exponent.lsb + val sqrt = FpuSqrt(p.internalMantissaSize) + sqrt.io.input.a := (needShift ? 
(U"1" @@ input.rs1.mantissa @@ U"0") | (U"01" @@ input.rs1.mantissa)) + + val cmdSent = RegInit(False) setWhen(sqrt.io.input.fire) clearWhen(!haltIt) + sqrt.io.input.valid := input.valid && !cmdSent + sqrt.io.output.ready := input.ready + output.payload.assignSomeByName(input.payload) + + + val scrap = sqrt.io.output.remain =/= 0 + val exponent = RegNext(exponentOne-exponentOne/2 -1 +^ (input.rs1.exponent >> 1) + U(input.rs1.exponent.lsb)) + + output.value.setNormal + output.value.sign := input.rs1.sign + output.value.exponent := exponent + output.value.mantissa := sqrt.io.output.result + output.scrap := scrap + output.NV := False + output.DZ := False + + val negative = !input.rs1.isNan && !input.rs1.isZero && input.rs1.sign + + when(input.rs1.isInfinity){ + output.value.setInfinity + } + when(negative){ + output.value.setNanQuiet + output.NV := True + } + when(input.rs1.isNan){ + output.value.setNanQuiet + output.NV := !input.rs1.isQuiet + } + when(input.rs1.isZero){ + output.value.setZero + } + + +// val underflowThreshold = muxDouble[UInt](input.format)(exponentOne + exponentOffset - 1023 - 53) (exponentOne + exponentOffset - 127 - 24) +// val underflowExp = muxDouble[UInt](input.format)(exponentOne + exponentOffset - 1023 - 54) (exponentOne + exponentOffset - 127 - 25) +// val forceUnderflow = exponent < underflowThreshold +// val forceOverflow = input.rs1.isInfinity// || input.rs2.isInfinity +// val infinitynan = input.rs1.isZero && input.rs2.isZero +// val forceNan = input.rs1.isNan || input.rs2.isNan || infinitynan +// val forceZero = input.rs1.isZero +// +// +// +// output.NV := False +// output.DZ := !forceNan && input.rs2.isZero +// +// when(exponent(exponent.getWidth-3, 3 bits) === 7) { output.value.exponent(p.internalExponentSize-2, 2 bits) := 3 } +// +// when(forceNan) { +// output.value.setNanQuiet +// output.NV setWhen((infinitynan || input.rs1.isNanSignaling || input.rs2.isNanSignaling)) +// } elsewhen(forceOverflow) { +// output.value.setInfinity +// } elsewhen(forceZero) { +// output.value.setZero +// } elsewhen(forceUnderflow) { +// output.value.exponent := underflowExp.resized +// } + + + haltIt clearWhen(sqrt.io.output.valid) + } + + //divSqrt isn't realy used anymore + val divSqrt = p.withDivSqrt generate new Area { + val input = decode.divSqrt.halfPipe() + assert(false, "Need to implement commit tracking") + val aproxWidth = 8 + val aproxDepth = 64 + val divIterationCount = 3 + val sqrtIterationCount = 3 + + val mulWidth = p.internalMantissaSize + 1 + + import FpuDivSqrtIterationState._ + val state = RegInit(FpuDivSqrtIterationState.IDLE()) + val iteration = Reg(UInt(log2Up(divIterationCount max sqrtIterationCount) bits)) + + decode.divSqrtToMul.valid := False + decode.divSqrtToMul.source := input.source + decode.divSqrtToMul.rs1.assignDontCare() + decode.divSqrtToMul.rs2.assignDontCare() + decode.divSqrtToMul.rs3.assignDontCare() + decode.divSqrtToMul.rd := input.rd + decode.divSqrtToMul.add := False + decode.divSqrtToMul.divSqrt := True + decode.divSqrtToMul.msb1 := True + decode.divSqrtToMul.msb2 := True + decode.divSqrtToMul.rs1.special := False //TODO + decode.divSqrtToMul.rs2.special := False + decode.divSqrtToMul.roundMode := input.roundMode + if(p.withDouble) decode.divSqrtToMul.format := input.format + + + val aprox = new Area { + val rom = Mem(UInt(aproxWidth bits), aproxDepth * 2) + val divTable, sqrtTable = ArrayBuffer[Double]() + for(i <- 0 until aproxDepth){ + val value = 1+(i+0.5)/aproxDepth + divTable += 1/value + } + for(i <- 0 until aproxDepth){ + 
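+        // This loop fills the 1/sqrt(x) seed table used by the Newton-Raphson
+        // iterations: entries with i < aproxDepth/2 sample x across [2, 4)
+        // (scale = 2) and the remaining entries sample x across [1, 2), so the
+        // top bit of the sqrt index (the operand exponent's LSB) picks the
+        // matching half. With aproxDepth = 64, i = 0 samples x of about 2.031
+        // and i = 32 samples x of about 1.016. Every seed (and every 1/x seed
+        // above) lies in (0.5, 1], which is why romElaboration stores
+        // (v - 0.5) * 2 on aproxWidth bits and aprox.result prepends the
+        // implicit leading "01" when reading the table back.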
val scale = if(i < aproxDepth/2) 2 else 1 + val value = scale+(scale*(i%(aproxDepth/2)+0.5)/aproxDepth*2) +// println(s"$i => $value" ) + sqrtTable += 1/Math.sqrt(value) + } + val romElaboration = (sqrtTable ++ divTable).map(v => BigInt(((v-0.5)*2*(1 << aproxWidth)).round)) + + rom.initBigInt(romElaboration) + val div = input.rs2.mantissa.takeHigh(log2Up(aproxDepth)) + val sqrt = U(input.rs1.exponent.lsb ## input.rs1.mantissa).takeHigh(log2Up(aproxDepth)) + val address = U(input.div ## (input.div ? div | sqrt)) + val raw = rom.readAsync(address) + val result = U"01" @@ (raw << (mulWidth-aproxWidth-2)) + } + + val divExp = new Area{ + val value = (1 << p.internalExponentSize) - 3 - input.rs2.exponent + } + val sqrtExp = new Area{ + val value = ((1 << p.internalExponentSize-1) + (1 << p.internalExponentSize-2) - 2 -1) - (input.rs1.exponent >> 1) + U(!input.rs1.exponent.lsb) + } + + def mulArg(rs1 : UInt, rs2 : UInt): Unit ={ + decode.divSqrtToMul.rs1.mantissa := rs1.resized + decode.divSqrtToMul.rs2.mantissa := rs2.resized + decode.divSqrtToMul.msb1 := rs1.msb + decode.divSqrtToMul.msb2 := rs2.msb + } + + val mulBuffer = mul.result.notMul.output.toStream.stage + mulBuffer.ready := False + + val iterationValue = Reg(UInt(mulWidth bits)) + + input.ready := False + switch(state){ + is(IDLE){ + iterationValue := aprox.result + iteration := 0 + when(input.valid) { + state := YY + } + } + is(YY){ + decode.divSqrtToMul.valid := True + mulArg(iterationValue, iterationValue) + when(decode.divSqrtToMul.ready) { + state := XYY + } + } + is(XYY){ + decode.divSqrtToMul.valid := mulBuffer.valid + val sqrtIn = !input.rs1.exponent.lsb ? (U"1" @@ input.rs1.mantissa) | ((U"1" @@ input.rs1.mantissa) |>> 1) + val divIn = U"1" @@ input.rs2.mantissa + mulArg(input.div ? divIn| sqrtIn, mulBuffer.payload) + when(mulBuffer.valid && decode.divSqrtToMul.ready) { + state := (input.div ? 
Y2_XYY | _15_XYY2) + mulBuffer.ready := True + } + } + is(Y2_XYY){ + mulBuffer.ready := True + when(mulBuffer.valid) { + iterationValue := ((iterationValue << 1) - mulBuffer.payload).resized + mulBuffer.ready := True + iteration := iteration + 1 + when(iteration =/= divIterationCount-1){ //TODO + state := YY + } otherwise { + state := DIV + } + } + } + is(DIV){ + decode.divSqrtToMul.valid := True + decode.divSqrtToMul.divSqrt := False + decode.divSqrtToMul.rs1 := input.rs1 + decode.divSqrtToMul.rs2.sign := input.rs2.sign + decode.divSqrtToMul.rs2.exponent := divExp.value + iterationValue.msb.asUInt + decode.divSqrtToMul.rs2.mantissa := (iterationValue << 1).resized + val zero = input.rs2.isInfinity + val overflow = input.rs2.isZero + val nan = input.rs2.isNan || (input.rs1.isZero && input.rs2.isZero) + + when(nan){ + decode.divSqrtToMul.rs2.setNanQuiet + } elsewhen(overflow) { + decode.divSqrtToMul.rs2.setInfinity + } elsewhen(zero) { + decode.divSqrtToMul.rs2.setZero + } + when(decode.divSqrtToMul.ready) { + state := IDLE + input.ready := True + } + } + is(_15_XYY2){ + when(mulBuffer.valid) { + state := Y_15_XYY2 + mulBuffer.payload.getDrivingReg := (U"11" << mulWidth-2) - (mulBuffer.payload) + } + } + is(Y_15_XYY2){ + decode.divSqrtToMul.valid := True + mulArg(iterationValue, mulBuffer.payload) + when(decode.divSqrtToMul.ready) { + mulBuffer.ready := True + state := Y_15_XYY2_RESULT + } + } + is(Y_15_XYY2_RESULT){ + iterationValue := mulBuffer.payload + mulBuffer.ready := True + when(mulBuffer.valid) { + iteration := iteration + 1 + when(iteration =/= sqrtIterationCount-1){ + state := YY + } otherwise { + state := SQRT + } + } + } + is(SQRT){ + decode.divSqrtToMul.valid := True + decode.divSqrtToMul.divSqrt := False + decode.divSqrtToMul.rs1 := input.rs1 + decode.divSqrtToMul.rs2.sign := False + decode.divSqrtToMul.rs2.exponent := sqrtExp.value + iterationValue.msb.asUInt + decode.divSqrtToMul.rs2.mantissa := (iterationValue << 1).resized + + val nan = input.rs1.sign && !input.rs1.isZero + + when(nan){ + decode.divSqrtToMul.rs2.setNanQuiet + } + + when(decode.divSqrtToMul.ready) { + state := IDLE + input.ready := True + } + } + } + } + + val add = p.withAdd generate new Area{ + + + class PreShifterOutput extends AddInput{ + val absRs1Bigger = Bool() + val rs1ExponentBigger = Bool() + } + + val preShifter = new Area{ + val input = decode.add.combStage() + val output = input.swapPayload(new PreShifterOutput) + + val exp21 = input.rs2.exponent -^ input.rs1.exponent + val rs1ExponentBigger = (exp21.msb || input.rs2.isZero) && !input.rs1.isZero + val rs1ExponentEqual = input.rs1.exponent === input.rs2.exponent + val rs1MantissaBigger = input.rs1.mantissa > input.rs2.mantissa + val absRs1Bigger = ((rs1ExponentBigger || rs1ExponentEqual && rs1MantissaBigger) && !input.rs1.isZero || input.rs1.isInfinity) && !input.rs2.isInfinity + + output.payload.assignSomeByName(input.payload) + output.absRs1Bigger := absRs1Bigger + output.rs1ExponentBigger := rs1ExponentBigger + } + + class ShifterOutput extends AddInput{ + val xSign, ySign = Bool() + val xMantissa, yMantissa = UInt(p.internalMantissaSize+1+addExtraBits bits) + val xyExponent = UInt(p.internalExponentSize bits) + val xySign = Bool() + val roundingScrap = Bool() + } + + val shifter = new Area { + val input = preShifter.output.stage() + val output = input.swapPayload(new ShifterOutput) + output.payload.assignSomeByName(input.payload) + + val exp21 = input.rs2.exponent -^ input.rs1.exponent + val shiftBy = exp21.asSInt.abs//rs1ExponentBigger ? 
(0-exp21) | exp21 + val shiftOverflow = (shiftBy >= p.internalMantissaSize+1+addExtraBits) + val passThrough = shiftOverflow || (input.rs1.isZero) || (input.rs2.isZero) + + def absRs1Bigger = input.absRs1Bigger + def rs1ExponentBigger = input.rs1ExponentBigger + + //Note that rs1ExponentBigger can be replaced by absRs1Bigger bellow to avoid xsigned two complement in math block at expense of combinatorial path + val xySign = absRs1Bigger ? input.rs1.sign | input.rs2.sign + output.xSign := xySign ^ (rs1ExponentBigger ? input.rs1.sign | input.rs2.sign) + output.ySign := xySign ^ (rs1ExponentBigger ? input.rs2.sign | input.rs1.sign) + val xMantissa = U"1" @@ (rs1ExponentBigger ? input.rs1.mantissa | input.rs2.mantissa) + val yMantissaUnshifted = U"1" @@ (rs1ExponentBigger ? input.rs2.mantissa | input.rs1.mantissa) + var yMantissa = CombInit(yMantissaUnshifted) + val roundingScrap = False + for(i <- log2Up(p.internalMantissaSize) - 1 downto 0){ + roundingScrap setWhen(shiftBy(i) && yMantissa(0, 1 << i bits) =/= 0) + yMantissa \= shiftBy(i) ? (yMantissa |>> (BigInt(1) << i)) | yMantissa + } + when(passThrough) { yMantissa := 0 } + when(shiftOverflow) { roundingScrap := True } + when(input.rs1.special || input.rs2.special){ roundingScrap := False } + output.xyExponent := rs1ExponentBigger ? input.rs1.exponent | input.rs2.exponent + output.xMantissa := xMantissa + output.yMantissa := yMantissa + output.xySign := xySign + output.roundingScrap := roundingScrap + } + + class MathOutput extends ShifterOutput{ + val xyMantissa = UInt(p.internalMantissaSize+1+addExtraBits+1 bits) + } + + val math = new Area { + val input = shifter.output.stage() + val output = input.swapPayload(new MathOutput) + output.payload.assignSomeByName(input.payload) + import input.payload._ + + val xSigned = xMantissa.twoComplement(xSign) //TODO Is that necessary ? 
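+        // ySigned below folds negation and rounding into a single add: when y is
+        // negative the RTL uses ~yMantissa (= -y - 1) and only adds the +1 that
+        // completes the two's complement when the alignment shift lost no bits
+        // (roundingScrap clear). That keeps the sum a floor of the exact result,
+        // leaving the sticky bit to the rounding stage. A small integer model of
+        // this step (illustrative names, not part of the pipeline):
+        def signedMantissaAddModel(xMan: BigInt, xNeg: Boolean,
+                                   yMan: BigInt, yNeg: Boolean,
+                                   sticky: Boolean): BigInt = {
+          val x = if (xNeg) -xMan else xMan
+          // When sticky bits were shifted out, -yMan - 1 under-approximates the
+          // true -(yMan + epsilon) contribution by strictly less than one ULP.
+          val y = if (yNeg) (if (sticky) -yMan - 1 else -yMan) else yMan
+          x + y
+        }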
+ val ySigned = ((ySign ## Mux(ySign, ~yMantissa, yMantissa)).asUInt + (ySign && !roundingScrap).asUInt).asSInt //rounding here + output.xyMantissa := U(xSigned +^ ySigned).trim(1 bits) + + } + + class OhOutput extends MathOutput{ + val shift = UInt(log2Up(p.internalMantissaSize+1+addExtraBits+1) bits) + } + + val oh = new Area { + val input = math.output.stage() + val isCommited = commitConsume(_.add, input.source, input.fire && input.needCommit) + val output = input.haltWhen(input.needCommit && !isCommited).swapPayload(new OhOutput) + output.payload.assignSomeByName(input.payload) + import input.payload._ + + val shiftOh = OHMasking.first(output.xyMantissa.asBools.reverse) //The OhMasking.first can be processed in parallel to the xyMantissa carry chaine +// output.shiftOh := shiftOh + + val shift = OHToUInt(shiftOh) + output.shift := shift + } + + + class NormOutput extends AddInput{ + val mantissa = UInt(p.internalMantissaSize+1+addExtraBits+1 bits) + val exponent = UInt(p.internalExponentSize+1 bits) + val infinityNan, forceNan, forceZero, forceInfinity = Bool() + val xySign, roundingScrap = Bool() + val xyMantissaZero = Bool() + } + + val norm = new Area{ + val input = oh.output.stage() + val output = input.swapPayload(new NormOutput) + output.payload.assignSomeByName(input.payload) + import input.payload._ + + output.mantissa := (xyMantissa |<< shift) + output.exponent := xyExponent -^ shift + 1 + output.forceInfinity := (input.rs1.isInfinity || input.rs2.isInfinity) + output.forceZero := xyMantissa === 0 || (input.rs1.isZero && input.rs2.isZero) + output.infinityNan := (input.rs1.isInfinity && input.rs2.isInfinity && (input.rs1.sign ^ input.rs2.sign)) + output.forceNan := input.rs1.isNan || input.rs2.isNan || output.infinityNan + output.xyMantissaZero := xyMantissa === 0 + } + + val result = new Area { + val input = norm.output.pipelined() + val output = input.swapPayload(new MergeInput()) + import input.payload._ + + output.source := input.source + output.rd := input.rd + output.value.sign := xySign + output.value.mantissa := (mantissa >> addExtraBits).resized + output.value.exponent := exponent.resized + output.value.special := False + output.roundMode := input.roundMode + if (p.withDouble) output.format := input.format + output.scrap := (mantissa(1) | mantissa(0) | roundingScrap) + + output.NV := infinityNan || input.rs1.isNanSignaling || input.rs2.isNanSignaling + output.DZ := False + when(forceNan) { + output.value.setNanQuiet + } elsewhen (forceInfinity) { + output.value.setInfinity + } elsewhen (forceZero) { + output.value.setZero + when(xyMantissaZero || input.rs1.isZero && input.rs2.isZero) { + output.value.sign := input.rs1.sign && input.rs2.sign + } + when((input.rs1.sign || input.rs2.sign) && input.roundMode === FpuRoundMode.RDN) { + output.value.sign := True + } + } + } + } + + + val merge = new Area { + val inputs = ArrayBuffer[Stream[MergeInput]]() + inputs += load.s1.output.stage() + if(p.withSqrt) (inputs += sqrt.output) + if(p.withDiv) (inputs += div.output) + if(p.withAdd) (inputs += add.result.output) + if(p.withMul) (inputs += mul.result.output) + if(p.withShortPipMisc) (inputs += shortPip.output.pipelined(m2s = true)) + val arbitrated = StreamArbiterFactory.lowerFirst.noLock.on(inputs).toFlow + } + + class RoundFront extends MergeInput{ + val mantissaIncrement = Bool() + val roundAdjusted = Bits(2 bits) + val exactMask = UInt(p.internalMantissaSize + 2 bits) + } + + val roundFront = new Area { + val input = merge.arbitrated.stage() + val output = 
input.swapPayload(new RoundFront()) + output.payload.assignSomeByName(input.payload) + + val manAggregate = input.value.mantissa @@ input.scrap + val expBase = muxDouble[UInt](input.format)(exponentF64Subnormal + 1)(exponentF32Subnormal + 1) + val expDif = expBase -^ input.value.exponent + val expSubnormal = !expDif.msb + var discardCount = (expSubnormal ? expDif.resize(log2Up(p.internalMantissaSize) bits) | U(0)) + if (p.withDouble) when(input.format === FpuFormat.FLOAT) { + discardCount \= discardCount + 29 + } + val exactMask = (List(True) ++ (0 until p.internalMantissaSize + 1).map(_ < discardCount)).asBits.asUInt + val roundAdjusted = (True ## (manAggregate >> 1)) (discardCount) ## ((manAggregate & exactMask) =/= 0) + + val mantissaIncrement = !input.value.special && input.roundMode.mux( + FpuRoundMode.RNE -> (roundAdjusted(1) && (roundAdjusted(0) || (U"01" ## (manAggregate >> 2)) (discardCount))), + FpuRoundMode.RTZ -> False, + FpuRoundMode.RDN -> (roundAdjusted =/= 0 && input.value.sign), + FpuRoundMode.RUP -> (roundAdjusted =/= 0 && !input.value.sign), + FpuRoundMode.RMM -> (roundAdjusted(1)) + ) + + output.mantissaIncrement := mantissaIncrement + output.roundAdjusted := roundAdjusted + output.exactMask := exactMask + } + + val roundBack = new Area{ + val input = roundFront.output.stage() + val output = input.swapPayload(RoundOutput()) + import input.payload._ + + val math = p.internalFloating() + val mantissaRange = p.internalMantissaSize downto 1 + val adderMantissa = input.value.mantissa(mantissaRange) & (mantissaIncrement ? ~(exactMask.trim(1) >> 1) | input.value.mantissa(mantissaRange).maxValue) + val adderRightOp = (mantissaIncrement ? (exactMask >> 1)| U(0)).resize(p.internalMantissaSize bits) + val adder = KeepAttribute(KeepAttribute(input.value.exponent @@ adderMantissa) + KeepAttribute(adderRightOp) + KeepAttribute(U(mantissaIncrement))) + math.special := input.value.special + math.sign := input.value.sign + math.exponent := adder(p.internalMantissaSize, p.internalExponentSize bits) + math.mantissa := adder(0, p.internalMantissaSize bits) + + val patched = CombInit(math) + val nx,of,uf = False + + val ufSubnormalThreshold = muxDouble[UInt](input.format)(exponentF64Subnormal)(exponentF32Subnormal) + val ufThreshold = muxDouble[UInt](input.format)(exponentF64Subnormal-52+1)(exponentF32Subnormal-23+1) + val ofThreshold = muxDouble[UInt](input.format)(exponentF64Infinity-1)(exponentF32Infinity-1) + + //catch exact 1.17549435E-38 underflow, but, who realy care ? +// val borringCase = input.value.exponent === ufSubnormalThreshold && roundAdjusted.asUInt < U"11" +// when(!math.special && (math.exponent <= ufSubnormalThreshold || borringCase) && roundAdjusted.asUInt =/= 0){ +// uf := True +// } + val threshold = input.roundMode.mux( + FpuRoundMode.RNE -> U"110", + FpuRoundMode.RTZ -> U"110", + FpuRoundMode.RDN -> (input.value.sign ? U"101" | U"111"), + FpuRoundMode.RUP -> (input.value.sign ? 
U"111" | U"101"), + FpuRoundMode.RMM -> U"110" + ) + val borringRound = (input.value.mantissa(1 downto 0) ## input.scrap) + if(p.withDouble) when(input.format === FpuFormat.FLOAT) { borringRound := (input.value.mantissa(30 downto 29) ## input.value.mantissa(28 downto 0).orR)} + + val borringCase = input.value.exponent === ufSubnormalThreshold && borringRound.asUInt < threshold + when(!math.special && (math.exponent <= ufSubnormalThreshold || borringCase) && roundAdjusted.asUInt =/= 0){ + uf := True + } + when(!math.special && math.exponent > ofThreshold){ + nx := True + of := True + val doMax = input.roundMode.mux( + FpuRoundMode.RNE -> (False), + FpuRoundMode.RTZ -> (True), + FpuRoundMode.RDN -> (!math.sign), + FpuRoundMode.RUP -> (math.sign), + FpuRoundMode.RMM -> (False) + ) + when(doMax){ + patched.exponent := ofThreshold + patched.mantissa.setAll() + } otherwise { + patched.setInfinity + } + } + + + when(!math.special && math.exponent < ufThreshold){ + nx := True + uf := True + val doMin = input.roundMode.mux( + FpuRoundMode.RNE -> (False), + FpuRoundMode.RTZ -> (False), + FpuRoundMode.RDN -> (math.sign), + FpuRoundMode.RUP -> (!math.sign), + FpuRoundMode.RMM -> (False) + ) + when(doMin){ + patched.exponent := ufThreshold.resized + patched.mantissa := 0 + } otherwise { + patched.setZero + } + } + + + nx setWhen(!input.value.special && (roundAdjusted =/= 0)) + val writes = rf.scoreboards.map(_.writes.readAsync(input.rd)) + val write = writes.toList.read(input.source) + output.NX := nx & write + output.OF := of & write + output.UF := uf & write + output.NV := input.NV & write + output.DZ := input.DZ & write + output.source := input.source + output.rd := input.rd + output.write := write + if(p.withDouble) output.format := input.format + output.value := patched + } + + val writeback = new Area{ + val input = roundBack.output.stage() + + for(i <- 0 until portCount){ + val c = io.port(i).completion + c.valid := input.fire && input.source === i + c.flags.NX := input.NX + c.flags.OF := input.OF + c.flags.UF := input.UF + c.flags.NV := input.NV + c.flags.DZ := input.DZ + c.written := input.write + } + + when(input.valid){ + for(i <- 0 until portCount) { + val port = rf.scoreboards(i).hitWrite + port.valid setWhen(input.source === i) + port.address := input.rd + port.data := !rf.scoreboards(i).hit(input.rd) //TODO improve + } + } + + val port = rf.ram.writePort + port.valid := input.valid && input.write + port.address := input.source @@ input.rd + port.data.value := input.value + if(p.withDouble) port.data.boxed := input.format === FpuFormat.FLOAT + + val randomSim = p.sim generate (in UInt(p.internalMantissaSize bits)) + if(p.sim) when(port.data.value.isZero || port.data.value.isInfinity){ + port.data.value.mantissa := randomSim + } + if(p.sim) when(input.value.special){ + port.data.value.exponent(p.internalExponentSize-1 downto 3) := randomSim.resized + when(!input.value.isNan){ + port.data.value.exponent(2 downto 2) := randomSim.resized + } + } + + when(port.valid){ + assert(!(port.data.value.exponent === 0 && !port.data.value.special), "Special violation") + assert(!(port.data.value.exponent === port.data.value.exponent.maxValue && !port.data.value.special), "Special violation") + } + } +} + + + + +object FpuSynthesisBench extends App{ + val payloadType = HardType(Bits(8 bits)) + class Fpu(name : String, portCount : Int, p : FpuParameter) extends Rtl{ + override def getName(): String = "Fpu_" + name + override def getRtlPath(): String = getName() + ".v" + SpinalVerilog(new 
FpuCore(portCount, p){ + + setDefinitionName(Fpu.this.getName()) + }) + } + + class Shifter(width : Int) extends Rtl{ + override def getName(): String = "shifter_" + width + override def getRtlPath(): String = getName() + ".v" + SpinalVerilog(new Component{ + val a = in UInt(width bits) + val sel = in UInt(log2Up(width) bits) + val result = out(a >> sel) + setDefinitionName(Shifter.this.getName()) + }) + } + + class Rotate(width : Int) extends Rtl{ + override def getName(): String = "rotate_" + width + override def getRtlPath(): String = getName() + ".v" + SpinalVerilog(new Component{ + val a = in UInt(width bits) + val sel = in UInt(log2Up(width) bits) + val result = out(Delay(Delay(a,3).rotateLeft(Delay(sel,3)),3)) + setDefinitionName(Rotate.this.getName()) + }) + } + +// rotate2_24 -> +// Artix 7 -> 233 Mhz 96 LUT 167 FF +// Artix 7 -> 420 Mhz 86 LUT 229 FF +// rotate2_32 -> +// Artix 7 -> 222 Mhz 108 LUT 238 FF +// Artix 7 -> 399 Mhz 110 LUT 300 FF +// rotate2_52 -> +// Artix 7 -> 195 Mhz 230 LUT 362 FF +// Artix 7 -> 366 Mhz 225 LUT 486 FF +// rotate2_64 -> +// Artix 7 -> 182 Mhz 257 LUT 465 FF +// Artix 7 -> 359 Mhz 266 LUT 591 FF + class Rotate2(width : Int) extends Rtl{ + override def getName(): String = "rotate2_" + width + override def getRtlPath(): String = getName() + ".v" + SpinalVerilog(new Component{ + val a = in UInt(width bits) + val sel = in UInt(log2Up(width) bits) + val result = out(Delay((U(0, width bits) @@ Delay(a,3)).rotateLeft(Delay(sel,3)),3)) + setDefinitionName(Rotate2.this.getName()) + }) + } + + class Rotate3(width : Int) extends Rtl{ + override def getName(): String = "rotate3_" + width + override def getRtlPath(): String = getName() + ".v" + SpinalVerilog(new Component{ + val a = Delay(in UInt(width bits), 3) + val sel = Delay(in UInt(log2Up(width) bits),3) + // val result = + // val output = Delay(result, 3) + setDefinitionName(Rotate3.this.getName()) + }) + } + + class Div(width : Int) extends Rtl{ + override def getName(): String = "div_" + width + override def getRtlPath(): String = getName() + ".v" + SpinalVerilog(new UnsignedDivider(width,width, false).setDefinitionName(Div.this.getName())) + } + + class Add(width : Int) extends Rtl{ + override def getName(): String = "add_" + width + override def getRtlPath(): String = getName() + ".v" + SpinalVerilog(new Component{ + val a, b = in UInt(width bits) + val result = out(a + b) + setDefinitionName(Add.this.getName()) + }) + } + + class DivSqrtRtl(width : Int) extends Rtl{ + override def getName(): String = "DivSqrt_" + width + override def getRtlPath(): String = getName() + ".v" + SpinalVerilog(new FpuDiv(width).setDefinitionName(DivSqrtRtl.this.getName())) + } + + val rtls = ArrayBuffer[Rtl]() + rtls += new Fpu( + "32", + portCount = 1, + FpuParameter( +// withDivSqrt = false, + withDouble = false + ) + ) + rtls += new Fpu( + "64", + portCount = 1, + FpuParameter( +// withDivSqrt = false, + withDouble = true + ) + ) + +// rtls += new Div(52) +// rtls += new Div(23) +// rtls += new Add(64) +// rtls += new DivSqrtRtl(52) +// rtls += new DivSqrtRtl(23) + + // rtls += new Shifter(24) +// rtls += new Shifter(32) +// rtls += new Shifter(52) +// rtls += new Shifter(64) +// rtls += new Rotate(24) +// rtls += new Rotate(32) +// rtls += new Rotate(52) +// rtls += new Rotate(64) +// rtls += new Rotate3(24) +// rtls += new Rotate3(32) +// rtls += new Rotate3(52) +// rtls += new Rotate3(64) + + val targets = XilinxStdTargets()// ++ AlteraStdTargets() + + + Bench(rtls, targets) +} + +//Fpu_32 -> +//Artix 7 -> 136 Mhz 
1471 LUT 1336 FF +//Artix 7 -> 196 Mhz 1687 LUT 1371 FF +//Fpu_64 -> +//Artix 7 -> 105 Mhz 2822 LUT 2132 FF +//Artix 7 -> 161 Mhz 3114 LUT 2272 FF +// +// +// +//Fpu_32 -> +//Artix 7 -> 128 Mhz 1693 LUT 1481 FF +//Artix 7 -> 203 Mhz 1895 LUT 1481 FF +//Fpu_64 -> +//Artix 7 -> 99 Mhz 3073 LUT 2396 FF +//Artix 7 -> 164 Mhz 3433 LUT 2432 FF + + +//Fpu_32 -> +//Artix 7 -> 112 Mhz 1790 LUT 1666 FF +//Artix 7 -> 158 Mhz 1989 LUT 1701 FF +//Fpu_64 -> +//Artix 7 -> 100 Mhz 3294 LUT 2763 FF +//Artix 7 -> 151 Mhz 3708 LUT 2904 FF + +//Fpu_32 -> +//Artix 7 -> 139 Mhz 1879 LUT 1713 FF +//Artix 7 -> 206 Mhz 2135 LUT 1723 FF +//Fpu_64 -> +//Artix 7 -> 106 Mhz 3502 LUT 2811 FF +//Artix 7 -> 163 Mhz 3905 LUT 2951 FF + +//Fpu_32 -> +//Artix 7 -> 130 Mhz 1889 LUT 1835 FF +//Artix 7 -> 210 Mhz 2131 LUT 1845 FF +//Fpu_64 -> +//Artix 7 -> 106 Mhz 3322 LUT 3023 FF +//Artix 7 -> 161 Mhz 3675 LUT 3163 FF + +//Fpu_32 -> +//Artix 7 -> 132 Mhz 1891 LUT 1837 FF +//Artix 7 -> 209 Mhz 2132 LUT 1847 FF +//Fpu_64 -> +//Artix 7 -> 105 Mhz 3348 LUT 3024 FF +//Artix 7 -> 162 Mhz 3712 LUT 3165 FF + +//Fpu_32 -> +//Artix 7 -> 128 Mhz 1796 LUT 1727 FF +//Artix 7 -> 208 Mhz 2049 LUT 1727 FF +//Fpu_64 -> +//Artix 7 -> 109 Mhz 3417 LUT 2913 FF +//Artix 7 -> 168 Mhz 3844 LUT 3053 FF + +/* +testfloat -tininessafter -all1 > all1.txt +cat all1.txt | grep "Errors found in" + +testfloat -tininessafter -all2 > all2.txt +cat all2.txt | grep "Errors found in" + +testfloat -tininessafter -f32_mulAdd > fma.txt + +testfloat -tininessafter -all2 -level 2 -checkall > all2.txt + + + +all1 => +Errors found in f32_to_ui64_rx_minMag: +Errors found in f32_to_i64_rx_minMag: +Errors found in f64_to_ui64_rx_minMag: +Errors found in f64_to_i64_rx_minMag: + +all2 => + + +Errors found in f32_mulAdd, rounding min: ++00.7FFFFF +67.000001 -01.000000 + => -01.000000 ...ux expected -01.000000 ....x ++67.000001 +00.7FFFFF -01.000000 + => -01.000000 ...ux expected -01.000000 ....x +-00.7FFFFF -67.000001 -01.000000 + => -01.000000 ...ux expected -01.000000 ....x +-67.000001 -00.7FFFFF -01.000000 + => -01.000000 ...ux expected -01.000000 ....x +Errors found in f32_mulAdd, rounding max: ++00.7FFFFF -67.000001 +01.000000 + => +01.000000 ...ux expected +01.000000 ....x ++67.000001 -00.7FFFFF +01.000000 + => +01.000000 ...ux expected +01.000000 ....x ++66.7FFFFE -01.000001 +01.000000 + => +01.000000 ...ux expected +01.000000 ....x +-00.7FFFFF +67.000001 +01.000000 + => +01.000000 ...ux expected +01.000000 ....x +-67.000001 +00.7FFFFF +01.000000 + => +01.000000 ...ux expected +01.000000 ....x + + + + */
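The rounding path above reduces to a small decision table: roundFront collapses the discarded mantissa bits into a guard bit and a sticky bit (roundAdjusted), and the round mode plus the sign select whether the kept mantissa is incremented; roundBack then applies that increment with a single carry-propagate adder and patches the result against the of/uf thresholds. The plain-Scala sketch below restates the increment decision; RoundModel and its parameter names are illustrative, not part of the source.

object RoundModel {
  sealed trait Mode
  case object RNE extends Mode
  case object RTZ extends Mode
  case object RDN extends Mode
  case object RUP extends Mode
  case object RMM extends Mode

  // lsb    : lowest mantissa bit that is kept
  // guard  : first discarded bit (roundAdjusted(1) in the hardware)
  // sticky : OR of the remaining discarded bits (roundAdjusted(0))
  def increment(mode: Mode, sign: Boolean, lsb: Boolean, guard: Boolean, sticky: Boolean): Boolean = {
    val discarded = guard || sticky
    mode match {
      case RNE => guard && (sticky || lsb)   // nearest, ties to even
      case RTZ => false                      // truncate
      case RDN => discarded && sign          // toward minus infinity
      case RUP => discarded && !sign         // toward plus infinity
      case RMM => guard                      // nearest, ties away from zero
    }
  }
}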
\ No newline at end of file diff --git a/VexRiscv/src/main/scala/vexriscv/ip/fpu/FpuDiv.scala b/VexRiscv/src/main/scala/vexriscv/ip/fpu/FpuDiv.scala new file mode 100644 index 0000000..7c9e713 --- /dev/null +++ b/VexRiscv/src/main/scala/vexriscv/ip/fpu/FpuDiv.scala @@ -0,0 +1,140 @@ +package vexriscv.ip.fpu + + +import spinal.core._ +import spinal.lib.math.{UnsignedDividerCmd, UnsignedDividerRsp} +import spinal.lib._ +import spinal.lib.sim.{StreamDriver, StreamMonitor, StreamReadyRandomizer} + +import scala.collection.mutable +import scala.util.Random + +case class FpuDivCmd(mantissaWidth : Int) extends Bundle{ + val a,b = UInt(mantissaWidth bits) +} + +case class FpuDivRsp(mantissaWidth : Int) extends Bundle{ + val result = UInt(mantissaWidth+1 + 2 bits) + val remain = UInt(mantissaWidth+1 bits) +} + +case class FpuDiv(val mantissaWidth : Int) extends Component { + assert(mantissaWidth % 2 == 0) + val io = new Bundle{ + val input = slave Stream(FpuDivCmd(mantissaWidth)) + val output = master Stream(FpuDivRsp(mantissaWidth)) + } + + val iterations = (mantissaWidth+2+2)/2 + val counter = Reg(UInt(log2Up(iterations) bits)) + val busy = RegInit(False) clearWhen(io.output.fire) + val done = RegInit(False) setWhen(busy && counter === iterations-1) clearWhen(io.output.fire) + + val shifter = Reg(UInt(mantissaWidth + 3 bits)) + val result = Reg(UInt(mantissaWidth+1+2 bits)) + + val div1, div3 = Reg(UInt(mantissaWidth+3 bits)) + val div2 = div1 |<< 1 + + val sub1 = shifter -^ div1 + val sub2 = shifter -^ div2 + val sub3 = shifter -^ div3 + + io.output.valid := done + io.output.result := (result << 0).resized + io.output.remain := (shifter >> 2).resized + io.input.ready := !busy + + when(!done){ + counter := counter + 1 + val sel = CombInit(shifter) + result := result |<< 2 + when(!sub1.msb){ + sel := sub1.resized + result(1 downto 0) := 1 + } + when(!sub2.msb){ + sel := sub2.resized + result(1 downto 0) := 2 + } + when(!sub3.msb){ + sel := sub3.resized + result(1 downto 0) := 3 + } + shifter := sel |<< 2 + } + + when(!busy){ + counter := 0 + shifter := (U"1" @@ io.input.a @@ U"").resized + div1 := (U"1" @@ io.input.b).resized + div3 := (U"1" @@ io.input.b) +^ (((U"1" @@ io.input.b)) << 1) + busy := io.input.valid + } +} + + +object FpuDivTester extends App{ + import spinal.core.sim._ + + for(w <- List(16, 20)) { + val config = SimConfig + config.withFstWave + config.compile(new FpuDiv(w)).doSim(seed=2){dut => + dut.clockDomain.forkStimulus(10) + + + val (cmdDriver, cmdQueue) = StreamDriver.queue(dut.io.input, dut.clockDomain) + val rspQueue = mutable.Queue[FpuDivRsp => Unit]() + StreamMonitor(dut.io.output, dut.clockDomain)( rspQueue.dequeue()(_)) + StreamReadyRandomizer(dut.io.output, dut.clockDomain) + + def test(a : Int, b : Int): Unit ={ + cmdQueue +={p => + p.a #= a + p.b #= b + } + rspQueue += {p => + val x = (a | (1 << dut.mantissaWidth)).toLong + val y = (b | (1 << dut.mantissaWidth)).toLong + val result = (x << dut.mantissaWidth+2) / y + val remain = (x << dut.mantissaWidth+2) % y + + assert(p.result.toLong == result, f"$x%x/$y%x=${p.result.toLong}%x instead of $result%x") + assert(p.remain.toLong == remain, f"$x%x %% $y%x=${p.remain.toLong}%x instead of $remain%x") + } + } + + val s = dut.mantissaWidth-16 + val f = (1 << dut.mantissaWidth)-1 + test(0xE000 << s, 0x8000 << s) + test(0xC000 << s, 0x4000 << s) + test(0xC835 << s, 0x4742 << s) + test(0,0) + test(0,f) + test(f,0) + test(f,f) + + for(i <- 0 until 10000){ + test(Random.nextInt(1 << dut.mantissaWidth), Random.nextInt(1 << 
dut.mantissaWidth)) + } + + waitUntil(rspQueue.isEmpty) + + dut.clockDomain.waitSampling(100) + + } + } +} + +//Scratchpad: manually replays one 52 bit division through the same golden model as the tester above. +object FpuDivTester2 extends App{ + val mantissaWidth = 52 + val a = BigInt(0xfffffff810000l) + val b = BigInt(0x0000000000FF0l) + val x = (a | (1l << mantissaWidth)) + val y = (b | (1l << mantissaWidth)) + val result = (x << mantissaWidth+2) / y + val remain = (x << mantissaWidth+2) % y + println(s"result=${result.toString(16)} remain=${remain.toString(16)}") + +}
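FpuDiv above is a radix-4 restoring divider: each cycle it compares the partial remainder against 1x, 2x and 3x the divisor (sub1/sub2/sub3), keeps the largest subtraction that does not go negative, and retires two quotient bits. A plain-Scala golden model of the same recurrence, matching the (x << mantissaWidth+2) / y reference checked by FpuDivTester, could look as follows; radix4DivModel is an illustrative name, not part of the source.

// a, b are raw mantissas; the implicit leading one is OR-ed in, as in the tester.
def radix4DivModel(a: BigInt, b: BigInt, width: Int): (BigInt, BigInt) = {
  val x = a | (BigInt(1) << width)
  val y = b | (BigInt(1) << width)
  val dividend = x << (width + 2)          // same pre-shift as the DUT
  val steps = width + 2                    // enough 2-bit steps to consume every
                                           // dividend bit (the DUT instead preloads
                                           // the dividend's top bits into `shifter`)
  var rem  = BigInt(0)
  var quot = BigInt(0)
  for (i <- (steps - 1) to 0 by -1) {
    rem = (rem << 2) | ((dividend >> (2 * i)) & 3)  // bring down two bits
    val digit = rem / y                    // 0..3, the hardware's sub1/sub2/sub3 pick
    rem -= digit * y
    quot = (quot << 2) | digit
  }
  (quot, rem)                              // == (dividend / y, dividend % y)
}

For example, radix4DivModel(0xC835, 0x4742, 16) returns exactly the result/remain pair the tester asserts for that vector.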
\ No newline at end of file diff --git a/VexRiscv/src/main/scala/vexriscv/ip/fpu/FpuSqrt.scala b/VexRiscv/src/main/scala/vexriscv/ip/fpu/FpuSqrt.scala new file mode 100644 index 0000000..0f80905 --- /dev/null +++ b/VexRiscv/src/main/scala/vexriscv/ip/fpu/FpuSqrt.scala @@ -0,0 +1,116 @@ +package vexriscv.ip.fpu + +import spinal.core._ +import spinal.lib._ +import spinal.lib.sim.{StreamDriver, StreamMonitor, StreamReadyRandomizer} + +import scala.collection.mutable +import scala.util.Random + +case class FpuSqrtCmd(mantissaWidth : Int) extends Bundle{ + val a = UInt(mantissaWidth+2 bits) +} + +case class FpuSqrtRsp(mantissaWidth : Int) extends Bundle{ + val result = UInt(mantissaWidth+1 bits) + val remain = UInt(mantissaWidth+5 bits) +} + +case class FpuSqrt(val mantissaWidth : Int) extends Component { + val io = new Bundle{ + val input = slave Stream(FpuSqrtCmd(mantissaWidth)) + val output = master Stream(FpuSqrtRsp(mantissaWidth)) + } + + val iterations = mantissaWidth+2 + val counter = Reg(UInt(log2Up(iterations ) bits)) + val busy = RegInit(False) clearWhen(io.output.fire) + val done = RegInit(False) setWhen(busy && counter === iterations-1) clearWhen(io.output.fire) + + val a = Reg(UInt(mantissaWidth+5 bits)) + val x = Reg(UInt(mantissaWidth bits)) + val q = Reg(UInt(mantissaWidth+1 bits)) + val t = a-(q @@ U"01") + + + io.output.valid := done + io.output.result := (q << 0).resized + io.output.remain := a + io.input.ready := !busy + + when(!done){ + counter := counter + 1 + val sel = CombInit(a) + when(!t.msb){ + sel := t.resized + } + q := (q @@ !t.msb).resized + a := (sel @@ x(widthOf(x)-2,2 bits)).resized + x := x |<< 2 + } + + when(!busy){ + q := 0 + a := io.input.a(widthOf(io.input.a)-2,2 bits).resized + x := (io.input.a).resized + counter := 0 + when(io.input.valid){ + busy := True + } + } +} + + +object FpuSqrtTester extends App{ + import spinal.core.sim._ + + for(w <- List(16)) { + val config = SimConfig + config.withFstWave + config.compile(new FpuSqrt(w)).doSim(seed=2){dut => + dut.clockDomain.forkStimulus(10) + + + val (cmdDriver, cmdQueue) = StreamDriver.queue(dut.io.input, dut.clockDomain) + val rspQueue = mutable.Queue[FpuSqrtRsp => Unit]() + StreamMonitor(dut.io.output, dut.clockDomain)( rspQueue.dequeue()(_)) + StreamReadyRandomizer(dut.io.output, dut.clockDomain) + + def test(a : Int): Unit ={ + cmdQueue +={p => + p.a #= a + } + rspQueue += {p => +// val x = (a * (1l << dut.mantissaWidth)).toLong +// val result = Math.sqrt(x).toLong/(1 << dut.mantissaWidth/2) +// val remain = a-x*x + val x = a.toDouble / (1 << dut.mantissaWidth) + val result = (Math.sqrt(x)*(1 << dut.mantissaWidth+1)).toLong + val filtred = result % (1 << dut.mantissaWidth+1) +// val remain = (a-(result*result)).toLong + assert(p.result.toLong == filtred, f"$a%x=${p.result.toLong}%x instead of $filtred%x") +// assert(p.remain.toLong == remain, f"$a%x=${p.remain.toLong}%x instead of $remain%x") + } + } + + val s = dut.mantissaWidth-16 + val f = (1 << dut.mantissaWidth)-1 +// test(121) + test(0x20000) + test(0x18000) +// test(0,0) +// test(0,f) +// test(f,0) +// test(f,f) + + for(i <- 0 until 10000){ + test(Random.nextInt(3 << dut.mantissaWidth) + (1 << dut.mantissaWidth)) + } + + waitUntil(rspQueue.isEmpty) + + dut.clockDomain.waitSampling(100) + + } + } +}
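FpuSqrt above implements the textbook radix-2 digit-recurrence square root: t = a - (q @@ U"01") tentatively subtracts the trial term, and the sign of t decides whether the next result bit is 1 (subtraction accepted) or 0 (remainder restored). The same recurrence in plain Scala, as a hedged reference model (sqrtModel is an illustrative name; resultBits plays the role of the hardware's iterations):

// For value < 4^resultBits this returns (q, rem) with q*q + rem == value,
// i.e. the integer square root and its exact remainder.
def sqrtModel(value: BigInt, resultBits: Int): (BigInt, BigInt) = {
  var rem = BigInt(0)
  var q   = BigInt(0)
  for (i <- (resultBits - 1) to 0 by -1) {
    rem = (rem << 2) | ((value >> (2 * i)) & 3)  // bring down two operand bits
    val trial = (q << 2) | 1                     // the hardware's (q @@ U"01")
    if (rem >= trial) {                          // t non-negative: accept the bit
      rem -= trial
      q = (q << 1) | 1
    } else {                                     // t negative: keep remainder, bit is 0
      q = q << 1
    }
  }
  (q, rem)
}

The hardware version interleaves exactly these steps with the Stream handshake, one result bit per cycle.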
\ No newline at end of file diff --git a/VexRiscv/src/main/scala/vexriscv/ip/fpu/Interface.scala b/VexRiscv/src/main/scala/vexriscv/ip/fpu/Interface.scala new file mode 100644 index 0000000..9338c35 --- /dev/null +++ b/VexRiscv/src/main/scala/vexriscv/ip/fpu/Interface.scala @@ -0,0 +1,186 @@ +package vexriscv.ip.fpu + +import spinal.core._ +import spinal.lib._ + + +object Fpu{ + + object Function{ + val MUL = 0 + val ADD = 1 + } + +} + + +case class FpuFloatDecoded() extends Bundle{ + val isNan = Bool() + val isNormal = Bool() + val isSubnormal = Bool() + val isZero = Bool() + val isInfinity = Bool() + val isQuiet = Bool() +} + +object FpuFloat{ + val ZERO = 0 + val INFINITY = 1 + val NAN = 2 + val NAN_CANONICAL_BIT = 2 +} + +case class FpuFloat(exponentSize: Int, + mantissaSize: Int) extends Bundle { + val mantissa = UInt(mantissaSize bits) + val exponent = UInt(exponentSize bits) + val sign = Bool() + val special = Bool() + + def withInvertSign : FpuFloat ={ + val ret = FpuFloat(exponentSize,mantissaSize) + ret.sign := !sign + ret.exponent := exponent + ret.mantissa := mantissa + ret + } + + def isNormal = !special + def isZero = special && exponent(1 downto 0) === FpuFloat.ZERO + def isInfinity = special && exponent(1 downto 0) === FpuFloat.INFINITY + def isNan = special && exponent(1 downto 0) === FpuFloat.NAN + def isQuiet = mantissa.msb + def isNanSignaling = special && exponent(1 downto 0) === FpuFloat.NAN && !isQuiet + def isCanonical = exponent(FpuFloat.NAN_CANONICAL_BIT) + + def setNormal = { special := False } + def setZero = { special := True; exponent(1 downto 0) := FpuFloat.ZERO } + def setInfinity = { special := True; exponent(1 downto 0) := FpuFloat.INFINITY } + def setNan = { special := True; exponent(1 downto 0) := FpuFloat.NAN; exponent(FpuFloat.NAN_CANONICAL_BIT) := False} + def setNanQuiet = { special := True; exponent(1 downto 0) := FpuFloat.NAN; exponent(FpuFloat.NAN_CANONICAL_BIT) := True; mantissa.msb := True; } + + def decode() = { + val ret = FpuFloatDecoded() + ret.isZero := isZero + ret.isNormal := isNormal + ret.isInfinity := isInfinity + ret.isNan := isNan + ret.isQuiet := mantissa.msb + ret + } + + def decodeIeee754() = { + val ret = FpuFloatDecoded() + val expZero = exponent === 0 + val expOne = exponent === exponent.maxValue + val manZero = mantissa === 0 + ret.isZero := expZero && manZero + ret.isSubnormal := expZero && !manZero + ret.isNormal := !expOne && !expZero + ret.isInfinity := expOne && manZero + ret.isNan := expOne && !manZero + ret.isQuiet := mantissa.msb + ret + } +} + +object FpuOpcode extends SpinalEnum{ + val LOAD, STORE, MUL, ADD, FMA, I2F, F2I, CMP, DIV, SQRT, MIN_MAX, SGNJ, FMV_X_W, FMV_W_X, FCLASS, FCVT_X_X = newElement() +} + +object FpuFormat extends SpinalEnum{ + val FLOAT, DOUBLE = newElement() +} + +object FpuRoundMode extends SpinalEnum(){ + val RNE, RTZ, RDN, RUP, RMM = newElement() + defaultEncoding = SpinalEnumEncoding("opt")( + RNE -> 0, + RTZ -> 1, + RDN -> 2, + RUP -> 3, + RMM -> 4 + ) +} +object FpuRoundModeInstr extends SpinalEnum(){ + val RNE, RTZ, RDN, RUP, RMM, DYN = newElement() + defaultEncoding = SpinalEnumEncoding("opt")( + RNE -> 0, + RTZ -> 1, + RDN -> 2, + RUP -> 3, + RMM -> 4, + DYN -> 7 + ) +} + + +case class FpuParameter( withDouble : Boolean, + asyncRegFile : Boolean = false, + mulWidthA : Int = 18, + mulWidthB : Int = 18, + schedulerM2sPipe : Boolean = false, + sim : Boolean = false, + withAdd : Boolean = true, + withMul : Boolean = true, + withDivSqrt : Boolean = false, + withDiv : Boolean = true, + 
withSqrt : Boolean = true, + withShortPipMisc : Boolean = true){ + + val internalMantissaSize = if(withDouble) 52 else 23 + val storeLoadType = HardType(Bits(if(withDouble) 64 bits else 32 bits)) + val internalExponentSize = (if(withDouble) 11 else 8) + 1 + val internalFloating = HardType(FpuFloat(exponentSize = internalExponentSize, mantissaSize = internalMantissaSize)) + val writeFloating = HardType(FpuFloat(exponentSize = internalExponentSize, mantissaSize = internalMantissaSize+1)) + + val rfAddress = HardType(UInt(5 bits)) + + val Opcode = FpuOpcode + val Format = FpuFormat + val argWidth = 2 + val Arg = HardType(Bits(2 bits)) +} + +case class FpuFlags() extends Bundle{ + val NX, UF, OF, DZ, NV = Bool() +} + +case class FpuCompletion() extends Bundle{ + val flags = FpuFlags() + val written = Bool() //Used for verification purposes +} + +case class FpuCmd(p : FpuParameter) extends Bundle{ + val opcode = p.Opcode() + val arg = Bits(2 bits) + val rs1, rs2, rs3 = p.rfAddress() + val rd = p.rfAddress() + val format = p.Format() + val roundMode = FpuRoundMode() +} + +case class FpuCommit(p : FpuParameter) extends Bundle{ + val opcode = FpuOpcode() + val rd = UInt(5 bits) + val write = Bool() + val value = p.storeLoadType() // IEEE 754 +} + +case class FpuRsp(p : FpuParameter) extends Bundle{ + val value = p.storeLoadType() // IEEE754 store || Integer + val NV, NX = Bool() +} + +case class FpuPort(p : FpuParameter) extends Bundle with IMasterSlave { + val cmd = Stream(FpuCmd(p)) + val commit = Stream(FpuCommit(p)) + val rsp = Stream(FpuRsp(p)) + val completion = Flow(FpuCompletion()) + + override def asMaster(): Unit = { + master(cmd, commit) + slave(rsp) + in(completion) + } +} |
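Interface.scala above only declares bundles; wiring them is the integrator's job. Below is a minimal, hypothetical sketch of a component that exposes an FpuPort as a master and statically issues one single-precision ADD. FpuPortDemo and its register indices are invented for illustration; the real handshake lives in VexRiscv's FpuPlugin.

import spinal.core._
import spinal.lib._
import vexriscv.ip.fpu._

// Hypothetical integration sketch: drive one FADD.S (f3 := f1 + f2)
// through a master FpuPort, with the other channels tied off.
class FpuPortDemo extends Component {
  val p = FpuParameter(withDouble = false)
  val io = new Bundle {
    val fpu = master(FpuPort(p))
  }

  io.fpu.cmd.valid     := True              // demo only: issue unconditionally
  io.fpu.cmd.opcode    := FpuOpcode.ADD
  io.fpu.cmd.arg       := 0
  io.fpu.cmd.rs1       := 1
  io.fpu.cmd.rs2       := 2
  io.fpu.cmd.rs3       := 0
  io.fpu.cmd.rd        := 3
  io.fpu.cmd.format    := FpuFormat.FLOAT
  io.fpu.cmd.roundMode := FpuRoundMode.RNE

  io.fpu.commit.valid := False              // no load/store commit traffic here
  io.fpu.commit.payload.assignDontCare()
  io.fpu.rsp.ready := True                  // always accept responses
}

object FpuPortDemo extends App {
  SpinalVerilog(new FpuPortDemo)
}

Note how asMaster() above makes cmd and commit outputs of the master, rsp a stream whose ready the master drives, and completion a plain input Flow.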