aboutsummaryrefslogtreecommitdiff
path: root/VexRiscv/src/main/scala/vexriscv/ip
diff options
context:
space:
mode:
Diffstat (limited to 'VexRiscv/src/main/scala/vexriscv/ip')
-rw-r--r--VexRiscv/src/main/scala/vexriscv/ip/DataCache.scala1184
-rw-r--r--VexRiscv/src/main/scala/vexriscv/ip/InstructionCache.scala487
-rw-r--r--VexRiscv/src/main/scala/vexriscv/ip/fpu/FpuCore.scala1944
-rw-r--r--VexRiscv/src/main/scala/vexriscv/ip/fpu/FpuDiv.scala140
-rw-r--r--VexRiscv/src/main/scala/vexriscv/ip/fpu/FpuSqrt.scala116
-rw-r--r--VexRiscv/src/main/scala/vexriscv/ip/fpu/Interface.scala186
6 files changed, 4057 insertions, 0 deletions
diff --git a/VexRiscv/src/main/scala/vexriscv/ip/DataCache.scala b/VexRiscv/src/main/scala/vexriscv/ip/DataCache.scala
new file mode 100644
index 0000000..2b70400
--- /dev/null
+++ b/VexRiscv/src/main/scala/vexriscv/ip/DataCache.scala
@@ -0,0 +1,1184 @@
+package vexriscv.ip
+
+import vexriscv._
+import spinal.core._
+import spinal.lib._
+import spinal.lib.bus.amba4.axi.{Axi4Config, Axi4Shared}
+import spinal.lib.bus.avalon.{AvalonMM, AvalonMMConfig}
+import spinal.lib.bus.bmb.{Bmb, BmbAccessParameter, BmbCmd, BmbInvalidationParameter, BmbParameter, BmbSourceParameter}
+import spinal.lib.bus.wishbone.{Wishbone, WishboneConfig}
+import spinal.lib.bus.simple._
+import vexriscv.plugin.DBusSimpleBus
+
+
+case class DataCacheConfig(cacheSize : Int,
+ bytePerLine : Int,
+ wayCount : Int,
+ addressWidth : Int,
+ cpuDataWidth : Int,
+ var rfDataWidth : Int = -1, //-1 mean cpuDataWidth
+ memDataWidth : Int,
+ catchAccessError : Boolean,
+ catchIllegal : Boolean,
+ catchUnaligned : Boolean,
+ earlyWaysHits : Boolean = true,
+ earlyDataMux : Boolean = false,
+ tagSizeShift : Int = 0, //Used to force infering ram
+ withLrSc : Boolean = false,
+ withAmo : Boolean = false,
+ withExclusive : Boolean = false,
+ withInvalidate : Boolean = false,
+ pendingMax : Int = 64,
+ directTlbHit : Boolean = false,
+ mergeExecuteMemory : Boolean = false,
+ asyncTagMemory : Boolean = false,
+ withWriteAggregation : Boolean = false){
+
+ if(rfDataWidth == -1) rfDataWidth = cpuDataWidth
+ assert(!(mergeExecuteMemory && (earlyDataMux || earlyWaysHits)))
+ assert(!(earlyDataMux && !earlyWaysHits))
+ assert(isPow2(pendingMax))
+ assert(rfDataWidth <= memDataWidth)
+
+ def lineCount = cacheSize/bytePerLine/wayCount
+ def sizeMax = log2Up(bytePerLine)
+ def sizeWidth = log2Up(sizeMax + 1)
+ val aggregationWidth = if(withWriteAggregation) log2Up(memDataBytes+1) else 0
+ def withWriteResponse = withExclusive
+ def burstSize = bytePerLine*8/memDataWidth
+ val burstLength = bytePerLine/(cpuDataWidth/8)
+ def catchSomething = catchUnaligned || catchIllegal || catchAccessError
+ def withInternalAmo = withAmo && !withExclusive
+ def withInternalLrSc = withLrSc && !withExclusive
+ def withExternalLrSc = withLrSc && withExclusive
+ def withExternalAmo = withAmo && withExclusive
+ def cpuDataBytes = cpuDataWidth/8
+ def rfDataBytes = rfDataWidth/8
+ def memDataBytes = memDataWidth/8
+ def getAxi4SharedConfig() = Axi4Config(
+ addressWidth = addressWidth,
+ dataWidth = memDataWidth,
+ useId = false,
+ useRegion = false,
+ useBurst = false,
+ useLock = false,
+ useQos = false
+ )
+
+
+ def getAvalonConfig() = AvalonMMConfig.bursted(
+ addressWidth = addressWidth,
+ dataWidth = memDataWidth,
+ burstCountWidth = log2Up(burstSize + 1)).copy(
+ useByteEnable = true,
+ constantBurstBehavior = true,
+ burstOnBurstBoundariesOnly = true,
+ useResponse = true,
+ maximumPendingReadTransactions = 2
+ )
+
+ def getWishboneConfig() = WishboneConfig(
+ addressWidth = 32-log2Up(memDataWidth/8),
+ dataWidth = memDataWidth,
+ selWidth = memDataBytes,
+ useSTALL = false,
+ useLOCK = false,
+ useERR = true,
+ useRTY = false,
+ tgaWidth = 0,
+ tgcWidth = 0,
+ tgdWidth = 0,
+ useBTE = true,
+ useCTI = true
+ )
+
+ def getBmbParameter() = BmbParameter(
+ BmbAccessParameter(
+ addressWidth = 32,
+ dataWidth = memDataWidth
+ ).addSources(1, BmbSourceParameter(
+ lengthWidth = log2Up(this.bytePerLine),
+ contextWidth = (if(!withWriteResponse) 1 else 0) + aggregationWidth,
+ alignment = BmbParameter.BurstAlignement.LENGTH,
+ canExclusive = withExclusive,
+ withCachedRead = true,
+ canInvalidate = withInvalidate,
+ canSync = withInvalidate
+ )),
+ BmbInvalidationParameter(
+ invalidateLength = log2Up(this.bytePerLine),
+ invalidateAlignment = BmbParameter.BurstAlignement.LENGTH
+ )
+ )
+}
+
+object DataCacheCpuExecute{
+ implicit def implArgs(that : DataCacheCpuExecute) = that.args
+}
+
+case class DataCacheCpuExecute(p : DataCacheConfig) extends Bundle with IMasterSlave{
+ val isValid = Bool
+ val address = UInt(p.addressWidth bit)
+ val haltIt = Bool
+ val args = DataCacheCpuExecuteArgs(p)
+ val refilling = Bool
+
+ override def asMaster(): Unit = {
+ out(isValid, args, address)
+ in(haltIt, refilling)
+ }
+}
+
+case class DataCacheCpuExecuteArgs(p : DataCacheConfig) extends Bundle{
+ val wr = Bool
+ val size = UInt(log2Up(log2Up(p.cpuDataBytes)+1) bits)
+ val isLrsc = p.withLrSc generate Bool()
+ val isAmo = p.withAmo generate Bool()
+ val amoCtrl = p.withAmo generate new Bundle {
+ val swap = Bool()
+ val alu = Bits(3 bits)
+ }
+
+ val totalyConsistent = Bool() //Only for AMO/LRSC
+}
+
+case class DataCacheCpuMemory(p : DataCacheConfig, mmu : MemoryTranslatorBusParameter) extends Bundle with IMasterSlave{
+ val isValid = Bool
+ val isStuck = Bool
+ val isWrite = Bool
+ val address = UInt(p.addressWidth bit)
+ val mmuRsp = MemoryTranslatorRsp(mmu)
+
+ override def asMaster(): Unit = {
+ out(isValid, isStuck, address)
+ in(isWrite)
+ out(mmuRsp)
+ }
+}
+
+
+case class FenceFlags() extends Bundle {
+ val SW,SR,SO,SI,PW,PR,PO,PI = Bool()
+ val FM = Bits(4 bits)
+
+ def SL = SR || SI
+ def SS = SW || SO
+ def PL = PR || PI
+ def PS = PW || PO
+ def forceAll(): Unit ={
+ List(SW,SR,SO,SI,PW,PR,PO,PI).foreach(_ := True)
+ }
+ def clearAll(): Unit ={
+ List(SW,SR,SO,SI,PW,PR,PO,PI).foreach(_ := False)
+ }
+}
+
+case class DataCacheCpuWriteBack(p : DataCacheConfig) extends Bundle with IMasterSlave{
+ val isValid = Bool()
+ val isStuck = Bool()
+ val isFiring = Bool()
+ val isUser = Bool()
+ val haltIt = Bool()
+ val isWrite = Bool()
+ val storeData = Bits(p.cpuDataWidth bit)
+ val data = Bits(p.cpuDataWidth bit)
+ val address = UInt(p.addressWidth bit)
+ val mmuException, unalignedAccess, accessError = Bool()
+ val keepMemRspData = Bool() //Used by external AMO to avoid having an internal buffer
+ val fence = FenceFlags()
+ val exclusiveOk = Bool()
+
+ override def asMaster(): Unit = {
+ out(isValid,isStuck,isUser, address, fence, storeData, isFiring)
+ in(haltIt, data, mmuException, unalignedAccess, accessError, isWrite, keepMemRspData, exclusiveOk)
+ }
+}
+
+case class DataCacheFlush(lineCount : Int) extends Bundle{
+ val singleLine = Bool()
+ val lineId = UInt(log2Up(lineCount) bits)
+}
+
+case class DataCacheCpuBus(p : DataCacheConfig, mmu : MemoryTranslatorBusParameter) extends Bundle with IMasterSlave{
+ val execute = DataCacheCpuExecute(p)
+ val memory = DataCacheCpuMemory(p, mmu)
+ val writeBack = DataCacheCpuWriteBack(p)
+
+ val redo = Bool()
+ val flush = Stream(DataCacheFlush(p.lineCount))
+
+ override def asMaster(): Unit = {
+ master(execute)
+ master(memory)
+ master(writeBack)
+ master(flush)
+ in(redo)
+ }
+}
+
+
+case class DataCacheMemCmd(p : DataCacheConfig) extends Bundle{
+ val wr = Bool
+ val uncached = Bool
+ val address = UInt(p.addressWidth bit)
+ val data = Bits(p.cpuDataWidth bits)
+ val mask = Bits(p.cpuDataWidth/8 bits)
+ val size = UInt(p.sizeWidth bits) //... 1 => 2 bytes ... 2 => 4 bytes ...
+ val exclusive = p.withExclusive generate Bool()
+ val last = Bool
+
+// def beatCountMinusOne = size.muxListDc((0 to p.sizeMax).map(i => i -> U((1 << i)/p.memDataBytes)))
+// def beatCount = size.muxListDc((0 to p.sizeMax).map(i => i -> U((1 << i)/p.memDataBytes-1)))
+
+ //Utilities which does quite a few assumtions about the bus utilisation
+ def byteCountMinusOne = size.muxListDc((0 to p.sizeMax).map(i => i -> U((1 << i)-1, log2Up(p.bytePerLine) bits)))
+ def beatCountMinusOne = (size === log2Up(p.bytePerLine)) ? U(p.burstSize-1) | U(0)
+ def beatCount = (size === log2Up(p.bytePerLine)) ? U(p.burstSize) | U(1)
+ def isBurst = size === log2Up(p.bytePerLine)
+}
+case class DataCacheMemRsp(p : DataCacheConfig) extends Bundle{
+ val aggregated = UInt(p.aggregationWidth bits)
+ val last = Bool()
+ val data = Bits(p.memDataWidth bit)
+ val error = Bool
+ val exclusive = p.withExclusive generate Bool()
+}
+case class DataCacheInv(p : DataCacheConfig) extends Bundle{
+ val enable = Bool()
+ val address = UInt(p.addressWidth bit)
+}
+case class DataCacheAck(p : DataCacheConfig) extends Bundle{
+ val hit = Bool()
+}
+
+case class DataCacheSync(p : DataCacheConfig) extends Bundle{
+ val aggregated = UInt(p.aggregationWidth bits)
+}
+
+case class DataCacheMemBus(p : DataCacheConfig) extends Bundle with IMasterSlave{
+ val cmd = Stream (DataCacheMemCmd(p))
+ val rsp = Flow (DataCacheMemRsp(p))
+
+ val inv = p.withInvalidate generate Stream(Fragment(DataCacheInv(p)))
+ val ack = p.withInvalidate generate Stream(Fragment(DataCacheAck(p)))
+ val sync = p.withInvalidate generate Stream(DataCacheSync(p))
+
+ override def asMaster(): Unit = {
+ master(cmd)
+ slave(rsp)
+
+ if(p.withInvalidate) {
+ slave(inv)
+ master(ack)
+ slave(sync)
+ }
+ }
+
+ def toAxi4Shared(stageCmd : Boolean = false, pendingWritesMax : Int = 7): Axi4Shared = {
+ val axi = Axi4Shared(p.getAxi4SharedConfig()).setName("dbus_axi")
+
+ val cmdPreFork = if (stageCmd) cmd.stage.stage().s2mPipe() else cmd
+
+ val pendingWrites = CounterUpDown(
+ stateCount = pendingWritesMax + 1,
+ incWhen = cmdPreFork.fire && cmdPreFork.wr,
+ decWhen = axi.writeRsp.fire
+ )
+
+ val hazard = (pendingWrites =/= 0 && !cmdPreFork.wr) || pendingWrites === pendingWritesMax
+ val (cmdFork, dataFork) = StreamFork2(cmdPreFork.haltWhen(hazard))
+ val cmdStage = cmdFork.throwWhen(RegNextWhen(!cmdFork.last,cmdFork.fire).init(False))
+ val dataStage = dataFork.throwWhen(!dataFork.wr)
+
+ axi.sharedCmd.arbitrationFrom(cmdStage)
+ axi.sharedCmd.write := cmdStage.wr
+ axi.sharedCmd.prot := "010"
+ axi.sharedCmd.cache := "1111"
+ axi.sharedCmd.size := log2Up(p.memDataBytes)
+ axi.sharedCmd.addr := cmdStage.address
+ axi.sharedCmd.len := cmdStage.beatCountMinusOne.resized
+
+ axi.writeData.arbitrationFrom(dataStage)
+ axi.writeData.data := dataStage.data
+ axi.writeData.strb := dataStage.mask
+ axi.writeData.last := dataStage.last
+
+ rsp.valid := axi.r.valid
+ rsp.error := !axi.r.isOKAY()
+ rsp.data := axi.r.data
+
+ axi.r.ready := True
+ axi.b.ready := True
+
+ axi
+ }
+
+
+ def toAvalon(): AvalonMM = {
+ val avalonConfig = p.getAvalonConfig()
+ val mm = AvalonMM(avalonConfig)
+ mm.read := cmd.valid && !cmd.wr
+ mm.write := cmd.valid && cmd.wr
+ mm.address := cmd.address(cmd.address.high downto log2Up(p.memDataWidth/8)) @@ U(0,log2Up(p.memDataWidth/8) bits)
+ mm.burstCount := cmd.beatCount
+ mm.byteEnable := cmd.mask
+ mm.writeData := cmd.data
+
+ cmd.ready := mm.waitRequestn
+ rsp.valid := mm.readDataValid
+ rsp.data := mm.readData
+ rsp.error := mm.response =/= AvalonMM.Response.OKAY
+
+ mm
+ }
+
+ def toWishbone(): Wishbone = {
+ val wishboneConfig = p.getWishboneConfig()
+ val bus = Wishbone(wishboneConfig)
+ val counter = Reg(UInt(log2Up(p.burstSize) bits)) init(0)
+ val addressShift = log2Up(p.memDataWidth/8)
+
+ val cmdBridge = Stream (DataCacheMemCmd(p))
+ val isBurst = cmdBridge.isBurst
+ cmdBridge.valid := cmd.valid
+ cmdBridge.address := (isBurst ? (cmd.address(31 downto widthOf(counter) + addressShift) @@ counter @@ U(0, addressShift bits)) | (cmd.address(31 downto addressShift) @@ U(0, addressShift bits)))
+ cmdBridge.wr := cmd.wr
+ cmdBridge.mask := cmd.mask
+ cmdBridge.data := cmd.data
+ cmdBridge.size := cmd.size
+ cmdBridge.last := !isBurst || counter === p.burstSize-1
+ cmd.ready := cmdBridge.ready && (cmdBridge.wr || cmdBridge.last)
+
+
+ when(cmdBridge.fire){
+ counter := counter + 1
+ when(cmdBridge.last){
+ counter := 0
+ }
+ }
+
+
+ bus.ADR := cmdBridge.address >> addressShift
+ bus.CTI := Mux(isBurst, cmdBridge.last ? B"111" | B"010", B"000")
+ bus.BTE := B"00"
+ bus.SEL := cmdBridge.wr ? cmdBridge.mask | B((1 << p.memDataBytes)-1)
+ bus.WE := cmdBridge.wr
+ bus.DAT_MOSI := cmdBridge.data
+
+ cmdBridge.ready := cmdBridge.valid && bus.ACK
+ bus.CYC := cmdBridge.valid
+ bus.STB := cmdBridge.valid
+
+ rsp.valid := RegNext(cmdBridge.valid && !bus.WE && bus.ACK) init(False)
+ rsp.data := RegNext(bus.DAT_MISO)
+ rsp.error := False //TODO
+ bus
+ }
+
+
+
+ def toPipelinedMemoryBus(): PipelinedMemoryBus = {
+ val bus = PipelinedMemoryBus(32,32)
+
+ val counter = Reg(UInt(log2Up(p.burstSize) bits)) init(0)
+ when(bus.cmd.fire){ counter := counter + 1 }
+ when( cmd.fire && cmd.last){ counter := 0 }
+
+ bus.cmd.valid := cmd.valid
+ bus.cmd.address := (cmd.address(31 downto 2) | counter.resized) @@ U"00"
+ bus.cmd.write := cmd.wr
+ bus.cmd.mask := cmd.mask
+ bus.cmd.data := cmd.data
+ cmd.ready := bus.cmd.ready && (cmd.wr || counter === p.burstSize-1)
+ rsp.valid := bus.rsp.valid
+ rsp.data := bus.rsp.payload.data
+ rsp.error := False
+ bus
+ }
+
+
+ def toBmb(syncPendingMax : Int = 32,
+ timeoutCycles : Int = 16) : Bmb = new Area{
+ setCompositeName(DataCacheMemBus.this, "Bridge", true)
+ val pipelinedMemoryBusConfig = p.getBmbParameter()
+ val bus = Bmb(pipelinedMemoryBusConfig).setCompositeName(this,"toBmb", true)
+
+ case class Context() extends Bundle{
+ val isWrite = !p.withWriteResponse generate Bool()
+ val rspCount = (p.aggregationWidth != 0) generate UInt(p.aggregationWidth bits)
+ }
+
+
+ def sizeToLength(size : UInt) = size.muxListDc((0 to log2Up(p.cpuDataBytes)).map(i => U(i) -> U((1 << i)-1, log2Up(p.cpuDataBytes) bits)))
+
+ val withoutWriteBuffer = if(p.aggregationWidth == 0) new Area {
+ val busCmdContext = Context()
+
+ bus.cmd.valid := cmd.valid
+ bus.cmd.last := cmd.last
+ bus.cmd.opcode := (cmd.wr ? B(Bmb.Cmd.Opcode.WRITE) | B(Bmb.Cmd.Opcode.READ))
+ bus.cmd.address := cmd.address.resized
+ bus.cmd.data := cmd.data
+ bus.cmd.length := cmd.byteCountMinusOne
+ bus.cmd.mask := cmd.mask
+ if (p.withExclusive) bus.cmd.exclusive := cmd.exclusive
+ if (!p.withWriteResponse) busCmdContext.isWrite := cmd.wr
+ bus.cmd.context := B(busCmdContext)
+
+ cmd.ready := bus.cmd.ready
+ if(p.withInvalidate) sync.arbitrationFrom(bus.sync)
+ }
+
+ val withWriteBuffer = if(p.aggregationWidth != 0) new Area {
+ val buffer = new Area {
+ val stream = cmd.toEvent().m2sPipe()
+ val address = Reg(UInt(p.addressWidth bits))
+ val length = Reg(UInt(pipelinedMemoryBusConfig.access.lengthWidth bits))
+ val write = Reg(Bool)
+ val exclusive = Reg(Bool)
+ val data = Reg(Bits(p.memDataWidth bits))
+ val mask = Reg(Bits(p.memDataWidth/8 bits)) init(0)
+ }
+
+ val aggregationRange = log2Up(p.memDataWidth/8)-1 downto log2Up(p.cpuDataWidth/8)
+ val tagRange = p.addressWidth-1 downto aggregationRange.high+1
+ val aggregationEnabled = Reg(Bool)
+ val aggregationCounter = Reg(UInt(p.aggregationWidth bits)) init(0)
+ val aggregationCounterFull = aggregationCounter === aggregationCounter.maxValue
+ val timer = Reg(UInt(log2Up(timeoutCycles)+1 bits)) init(0)
+ val timerFull = timer.msb
+ val hit = cmd.address(tagRange) === buffer.address(tagRange)
+ val cmdExclusive = if(p.withExclusive) cmd.exclusive else False
+ val canAggregate = cmd.valid && cmd.wr && !cmd.uncached && !cmdExclusive && !timerFull && !aggregationCounterFull && (!buffer.stream.valid || aggregationEnabled && hit)
+ val doFlush = cmd.valid && !canAggregate || timerFull || aggregationCounterFull || !aggregationEnabled
+// val canAggregate = False
+// val doFlush = True
+ val busCmdContext = Context()
+ val halt = False
+
+ when(cmd.fire){
+ aggregationCounter := aggregationCounter + 1
+ }
+ when(buffer.stream.valid && !timerFull){
+ timer := timer + 1
+ }
+ when(bus.cmd.fire || !buffer.stream.valid){
+ buffer.mask := 0
+ aggregationCounter := 0
+ timer := 0
+ }
+
+ buffer.stream.ready := (bus.cmd.ready && doFlush || canAggregate) && !halt
+ bus.cmd.valid := buffer.stream.valid && doFlush && !halt
+ bus.cmd.last := True
+ bus.cmd.opcode := (buffer.write ? B(Bmb.Cmd.Opcode.WRITE) | B(Bmb.Cmd.Opcode.READ))
+ bus.cmd.address := buffer.address
+ bus.cmd.length := buffer.length
+ bus.cmd.data := buffer.data
+ bus.cmd.mask := buffer.mask
+
+ if (p.withExclusive) bus.cmd.exclusive := buffer.exclusive
+ bus.cmd.context.removeAssignments() := B(busCmdContext)
+ if (!p.withWriteResponse) busCmdContext.isWrite := bus.cmd.isWrite
+ busCmdContext.rspCount := aggregationCounter
+
+ val aggregationSel = cmd.address(aggregationRange)
+ when(cmd.fire){
+ val dIn = cmd.data.subdivideIn(8 bits)
+ val dReg = buffer.data.subdivideIn(8 bits)
+ for(byteId <- 0 until p.memDataBytes){
+ when(aggregationSel === byteId / p.cpuDataBytes && cmd.mask(byteId % p.cpuDataBytes)){
+ dReg.write(byteId, dIn(byteId % p.cpuDataBytes))
+ buffer.mask(byteId) := True
+ }
+ }
+ }
+
+ when(cmd.fire){
+ buffer.write := cmd.wr
+ buffer.address := cmd.address.resized
+ buffer.length := cmd.byteCountMinusOne
+ if (p.withExclusive) buffer.exclusive := cmd.exclusive
+
+ when(cmd.wr && !cmd.uncached && !cmdExclusive){
+ aggregationEnabled := True
+ buffer.address(aggregationRange.high downto 0) := 0
+ buffer.length := p.memDataBytes-1
+ } otherwise {
+ aggregationEnabled := False
+ }
+ }
+
+
+ val rspCtx = bus.rsp.context.as(Context())
+ rsp.aggregated := rspCtx.rspCount
+
+ val syncLogic = p.withInvalidate generate new Area{
+ val cmdCtx = Stream(UInt(p.aggregationWidth bits))
+ cmdCtx.valid := bus.cmd.fire && bus.cmd.isWrite
+ cmdCtx.payload := aggregationCounter
+ halt setWhen(!cmdCtx.ready)
+
+ val syncCtx = cmdCtx.queue(syncPendingMax).s2mPipe().m2sPipe() //Assume latency of sync is at least 3 cycles
+ syncCtx.ready := bus.sync.fire
+
+ sync.arbitrationFrom(bus.sync)
+ sync.aggregated := syncCtx.payload
+ }
+ }
+
+
+ rsp.valid := bus.rsp.valid
+ if(!p.withWriteResponse) rsp.valid clearWhen(bus.rsp.context(0))
+ rsp.data := bus.rsp.data
+ rsp.error := bus.rsp.isError
+ rsp.last := bus.rsp.last
+ if(p.withExclusive) rsp.exclusive := bus.rsp.exclusive
+ bus.rsp.ready := True
+
+ val invalidateLogic = p.withInvalidate generate new Area{
+ val beatCountMinusOne = bus.inv.transferBeatCountMinusOne(p.bytePerLine)
+ val counter = Reg(UInt(widthOf(beatCountMinusOne) bits)) init(0)
+
+ inv.valid := bus.inv.valid
+ inv.address := bus.inv.address + (counter << log2Up(p.bytePerLine))
+ inv.enable := bus.inv.all
+ inv.last := counter === beatCountMinusOne
+ bus.inv.ready := inv.last && inv.ready
+
+ if(widthOf(counter) != 0) when(inv.fire){
+ counter := counter + 1
+ when(inv.last){
+ counter := 0
+ }
+ }
+
+ bus.ack.arbitrationFrom(ack.throwWhen(!ack.last))
+ }
+ }.bus
+
+}
+
+object DataCacheExternalAmoStates extends SpinalEnum{
+ val LR_CMD, LR_RSP, SC_CMD, SC_RSP = newElement();
+}
+
+//If external amo, mem rsp should stay
+class DataCache(val p : DataCacheConfig, mmuParameter : MemoryTranslatorBusParameter) extends Component{
+ import p._
+
+ val io = new Bundle{
+ val cpu = slave(DataCacheCpuBus(p, mmuParameter))
+ val mem = master(DataCacheMemBus(p))
+ }
+
+ val haltCpu = False
+ val lineWidth = bytePerLine*8
+ val lineCount = cacheSize/bytePerLine
+ val wordWidth = cpuDataWidth
+ val wordWidthLog2 = log2Up(wordWidth)
+ val wordPerLine = lineWidth/wordWidth
+ val bytePerWord = wordWidth/8
+ val wayLineCount = lineCount/wayCount
+ val wayLineLog2 = log2Up(wayLineCount)
+ val wayWordCount = wayLineCount * wordPerLine
+ val memWordPerLine = lineWidth/memDataWidth
+ val memTransactionPerLine = p.bytePerLine / (p.memDataWidth/8)
+ val bytePerMemWord = memDataWidth/8
+ val wayMemWordCount = wayLineCount * memWordPerLine
+
+ val tagRange = addressWidth-1 downto log2Up(wayLineCount*bytePerLine)
+ val lineRange = tagRange.low-1 downto log2Up(bytePerLine)
+ val cpuWordRange = log2Up(bytePerLine)-1 downto log2Up(bytePerWord)
+ val memWordRange = log2Up(bytePerLine)-1 downto log2Up(bytePerMemWord)
+ val hitRange = tagRange.high downto lineRange.low
+ val memWordToCpuWordRange = log2Up(bytePerMemWord)-1 downto log2Up(bytePerWord)
+ val cpuWordToRfWordRange = log2Up(bytePerWord)-1 downto log2Up(p.rfDataBytes)
+
+
+ class LineInfo() extends Bundle{
+ val valid, error = Bool()
+ val address = UInt(tagRange.length bit)
+ }
+
+ val tagsReadCmd = Flow(UInt(log2Up(wayLineCount) bits))
+ val tagsInvReadCmd = withInvalidate generate Flow(UInt(log2Up(wayLineCount) bits))
+ val tagsWriteCmd = Flow(new Bundle{
+ val way = Bits(wayCount bits)
+ val address = UInt(log2Up(wayLineCount) bits)
+ val data = new LineInfo()
+ })
+
+ val tagsWriteLastCmd = RegNext(tagsWriteCmd)
+
+ val dataReadCmd = Flow(UInt(log2Up(wayMemWordCount) bits))
+ val dataWriteCmd = Flow(new Bundle{
+ val way = Bits(wayCount bits)
+ val address = UInt(log2Up(wayMemWordCount) bits)
+ val data = Bits(memDataWidth bits)
+ val mask = Bits(memDataWidth/8 bits)
+ })
+
+
+ val ways = for(i <- 0 until wayCount) yield new Area{
+ val tags = Mem(new LineInfo(), wayLineCount)
+ val data = Mem(Bits(memDataWidth bit), wayMemWordCount)
+
+ //Reads
+ val tagsReadRsp = asyncTagMemory match {
+ case false => tags.readSync(tagsReadCmd.payload, tagsReadCmd.valid && !io.cpu.memory.isStuck)
+ case true => tags.readAsync(RegNextWhen(tagsReadCmd.payload, io.cpu.execute.isValid && !io.cpu.memory.isStuck))
+ }
+ val dataReadRspMem = data.readSync(dataReadCmd.payload, dataReadCmd.valid && !io.cpu.memory.isStuck)
+ val dataReadRspSel = if(mergeExecuteMemory) io.cpu.writeBack.address else io.cpu.memory.address
+ val dataReadRsp = dataReadRspMem.subdivideIn(cpuDataWidth bits).read(dataReadRspSel(memWordToCpuWordRange))
+
+ val tagsInvReadRsp = withInvalidate generate(asyncTagMemory match {
+ case false => tags.readSync(tagsInvReadCmd.payload, tagsInvReadCmd.valid)
+ case true => tags.readAsync(RegNextWhen(tagsInvReadCmd.payload, tagsInvReadCmd.valid))
+ })
+
+ //Writes
+ when(tagsWriteCmd.valid && tagsWriteCmd.way(i)){
+ tags.write(tagsWriteCmd.address, tagsWriteCmd.data)
+ }
+ when(dataWriteCmd.valid && dataWriteCmd.way(i)){
+ data.write(
+ address = dataWriteCmd.address,
+ data = dataWriteCmd.data,
+ mask = dataWriteCmd.mask
+ )
+ }
+ }
+
+
+ tagsReadCmd.valid := False
+ tagsReadCmd.payload.assignDontCare()
+ dataReadCmd.valid := False
+ dataReadCmd.payload.assignDontCare()
+ tagsWriteCmd.valid := False
+ tagsWriteCmd.payload.assignDontCare()
+ dataWriteCmd.valid := False
+ dataWriteCmd.payload.assignDontCare()
+
+ when(io.cpu.execute.isValid && !io.cpu.memory.isStuck){
+ tagsReadCmd.valid := True
+ dataReadCmd.valid := True
+ tagsReadCmd.payload := io.cpu.execute.address(lineRange)
+ dataReadCmd.payload := io.cpu.execute.address(lineRange.high downto memWordRange.low)
+ }
+
+ def collisionProcess(readAddress : UInt, readMask : Bits): Bits ={
+ val ret = Bits(wayCount bits)
+ val readAddressAligned = (readAddress >> log2Up(memDataWidth/cpuDataWidth))
+ val dataWriteMaskAligned = dataWriteCmd.mask.subdivideIn(memDataWidth/cpuDataWidth slices).read(readAddress(log2Up(memDataWidth/cpuDataWidth)-1 downto 0))
+ for(i <- 0 until wayCount){
+ ret(i) := dataWriteCmd.valid && dataWriteCmd.way(i) && dataWriteCmd.address === readAddressAligned && (readMask & dataWriteMaskAligned) =/= 0
+ }
+ ret
+ }
+
+
+ io.cpu.execute.haltIt := False
+
+ val rspSync = True
+ val rspLast = True
+ val memCmdSent = RegInit(False) setWhen (io.mem.cmd.fire) clearWhen (!io.cpu.writeBack.isStuck)
+ val pending = withExclusive generate new Area{
+ val counter = Reg(UInt(log2Up(pendingMax) + 1 bits)) init(0)
+ val counterNext = counter + U(io.mem.cmd.fire && io.mem.cmd.last) - ((io.mem.rsp.valid && io.mem.rsp.last) ? (io.mem.rsp.aggregated +^ 1) | 0)
+ counter := counterNext
+
+ val done = RegNext(counterNext === 0)
+ val full = RegNext(counter.msb) //Has margin
+ val last = RegNext(counterNext === 1) //Equivalent to counter === 1 but pipelined
+
+ if(!withInvalidate) {
+ io.cpu.execute.haltIt setWhen(full)
+ }
+
+ rspSync clearWhen (!last || !memCmdSent)
+ rspLast clearWhen (!last)
+ }
+
+ val sync = withInvalidate generate new Area{
+ io.mem.sync.ready := True
+ val syncCount = io.mem.sync.aggregated +^ 1
+ val syncContext = new Area{
+ val history = Mem(Bool, pendingMax)
+ val wPtr, rPtr = Reg(UInt(log2Up(pendingMax)+1 bits)) init(0)
+ when(io.mem.cmd.fire && io.mem.cmd.wr){
+ history.write(wPtr.resized, io.mem.cmd.uncached)
+ wPtr := wPtr + 1
+ }
+
+ when(io.mem.sync.fire){
+ rPtr := rPtr + syncCount
+ }
+ val uncached = history.readAsync(rPtr.resized)
+ val full = RegNext(wPtr - rPtr >= pendingMax-1)
+ io.cpu.execute.haltIt setWhen(full)
+ }
+
+ def pending(inc : Bool, dec : Bool) = new Area {
+ val pendingSync = Reg(UInt(log2Up(pendingMax) + 1 bits)) init(0)
+ val pendingSyncNext = pendingSync + U(io.mem.cmd.fire && io.mem.cmd.wr && inc) - ((io.mem.sync.fire && dec) ? syncCount | 0)
+ pendingSync := pendingSyncNext
+ }
+
+ val writeCached = pending(inc = !io.mem.cmd.uncached, dec = !syncContext.uncached)
+ val writeUncached = pending(inc = io.mem.cmd.uncached, dec = syncContext.uncached)
+
+ def track(load : Bool, uncached : Boolean) = new Area {
+ val counter = Reg(UInt(log2Up(pendingMax) + 1 bits)) init(0)
+ counter := counter - ((io.mem.sync.fire && counter =/= 0 && (if(uncached) syncContext.uncached else !syncContext.uncached)) ? syncCount | 0)
+ when(load){ counter := (if(uncached) writeUncached.pendingSyncNext else writeCached.pendingSyncNext) }
+
+ val busy = counter =/= 0
+ }
+
+ val w2w = track(load = io.cpu.writeBack.fence.PW && io.cpu.writeBack.fence.SW, uncached = false)
+ val w2r = track(load = io.cpu.writeBack.fence.PW && io.cpu.writeBack.fence.SR, uncached = false)
+ val w2i = track(load = io.cpu.writeBack.fence.PW && io.cpu.writeBack.fence.SI, uncached = false)
+ val w2o = track(load = io.cpu.writeBack.fence.PW && io.cpu.writeBack.fence.SO, uncached = false)
+ val o2w = track(load = io.cpu.writeBack.fence.PO && io.cpu.writeBack.fence.SW, uncached = true)
+ val o2r = track(load = io.cpu.writeBack.fence.PO && io.cpu.writeBack.fence.SR, uncached = true)
+ //Assume o2i and o2o are ordered by the interconnect
+
+ val notTotalyConsistent = w2w.busy || w2r.busy || w2i.busy || w2o.busy || o2w.busy || o2r.busy
+ }
+
+
+
+
+ val stage0 = new Area{
+// val mask = io.cpu.execute.size.mux (
+// U(0) -> B"0001",
+// U(1) -> B"0011",
+// default -> B"1111"
+// ) |<< io.cpu.execute.address(1 downto 0)
+
+ val mask = io.cpu.execute.size.muxListDc((0 to log2Up(p.cpuDataBytes)).map(i => U(i) -> B((1 << (1 << i)) -1, p.cpuDataBytes bits))) |<< io.cpu.execute.address(log2Up(p.cpuDataBytes)-1 downto 0)
+
+
+ val dataColisions = collisionProcess(io.cpu.execute.address(lineRange.high downto cpuWordRange.low), mask)
+ val wayInvalidate = B(0, wayCount bits) //Used if invalidate enabled
+
+ val isAmo = if(withAmo) io.cpu.execute.isAmo else False
+ }
+
+ val stageA = new Area{
+ def stagePipe[T <: Data](that : T) = if(mergeExecuteMemory) CombInit(that) else RegNextWhen(that, !io.cpu.memory.isStuck)
+ val request = stagePipe(io.cpu.execute.args)
+ val mask = stagePipe(stage0.mask)
+ io.cpu.memory.isWrite := request.wr
+
+ val isAmo = if(withAmo) request.isAmo else False
+ val isLrsc = if(withAmo) request.isLrsc else False
+ val consistancyCheck = (withInvalidate || withWriteResponse) generate new Area {
+ val hazard = False
+ val w = sync.w2w.busy || sync.o2w.busy
+ val r = stagePipe(sync.w2r.busy || sync.o2r.busy) || sync.w2r.busy || sync.o2r.busy // As it use the cache, need to check against the execute stage status too
+ val o = CombInit(sync.w2o.busy)
+ val i = CombInit(sync.w2i.busy)
+
+ val s = io.cpu.memory.mmuRsp.isIoAccess ? o | w
+ val l = io.cpu.memory.mmuRsp.isIoAccess ? i | r
+
+ when(isAmo? (s || l) | (request.wr ? s | l)){
+ hazard := True
+ }
+ when(request.totalyConsistent && (sync.notTotalyConsistent || io.cpu.writeBack.isValid && io.cpu.writeBack.isWrite)){
+ hazard := True
+ }
+ }
+
+ val wayHits = earlyWaysHits generate Bits(wayCount bits)
+ val indirectTlbHitGen = (earlyWaysHits && !directTlbHit) generate new Area {
+ wayHits := B(ways.map(way => (io.cpu.memory.mmuRsp.physicalAddress(tagRange) === way.tagsReadRsp.address && way.tagsReadRsp.valid)))
+ }
+ val directTlbHitGen = (earlyWaysHits && directTlbHit) generate new Area {
+ val wayTlbHits = for (way <- ways) yield for (tlb <- io.cpu.memory.mmuRsp.ways) yield {
+ way.tagsReadRsp.address === tlb.physical(tagRange) && tlb.sel
+ }
+ val translatedHits = B(wayTlbHits.map(_.orR))
+ val bypassHits = B(ways.map(_.tagsReadRsp.address === io.cpu.memory.address(tagRange)))
+ wayHits := (io.cpu.memory.mmuRsp.bypassTranslation ? bypassHits | translatedHits) & B(ways.map(_.tagsReadRsp.valid))
+ }
+
+ val dataMux = earlyDataMux generate MuxOH(wayHits, ways.map(_.dataReadRsp))
+ val wayInvalidate = stagePipe(stage0. wayInvalidate)
+ val dataColisions = if(mergeExecuteMemory){
+ stagePipe(stage0.dataColisions)
+ } else {
+ //Assume the writeback stage will never be unstall memory acces while memory stage is stalled
+ stagePipe(stage0.dataColisions) | collisionProcess(io.cpu.memory.address(lineRange.high downto cpuWordRange.low), mask)
+ }
+ }
+
+ val stageB = new Area {
+ def stagePipe[T <: Data](that : T) = RegNextWhen(that, !io.cpu.writeBack.isStuck)
+ def ramPipe[T <: Data](that : T) = if(mergeExecuteMemory) CombInit(that) else RegNextWhen(that, !io.cpu.writeBack.isStuck)
+ val request = RegNextWhen(stageA.request, !io.cpu.writeBack.isStuck)
+ val mmuRspFreeze = False
+ val mmuRsp = RegNextWhen(io.cpu.memory.mmuRsp, !io.cpu.writeBack.isStuck && !mmuRspFreeze)
+ val tagsReadRsp = ways.map(w => ramPipe(w.tagsReadRsp))
+ val dataReadRsp = !earlyDataMux generate ways.map(w => ramPipe(w.dataReadRsp))
+ val wayInvalidate = stagePipe(stageA. wayInvalidate)
+ val consistancyHazard = if(stageA.consistancyCheck != null) stagePipe(stageA.consistancyCheck.hazard) else False
+ val dataColisions = stagePipe(stageA.dataColisions)
+// val unaligned = if(!catchUnaligned) False else stagePipe((stageA.request.size === 2 && io.cpu.memory.address(1 downto 0) =/= 0) || (stageA.request.size === 1 && io.cpu.memory.address(0 downto 0) =/= 0))
+ val unaligned = if(!catchUnaligned) False else stagePipe((1 to log2Up(p.cpuDataBytes)).map(i => stageA.request.size === i && io.cpu.memory.address(i-1 downto 0) =/= 0).orR)
+ val waysHitsBeforeInvalidate = if(earlyWaysHits) stagePipe(B(stageA.wayHits)) else B(tagsReadRsp.map(tag => mmuRsp.physicalAddress(tagRange) === tag.address && tag.valid).asBits())
+ val waysHits = waysHitsBeforeInvalidate & ~wayInvalidate
+ val waysHit = waysHits.orR
+ val dataMux = if(earlyDataMux) stagePipe(stageA.dataMux) else MuxOH(waysHits, dataReadRsp)
+ val mask = stagePipe(stageA.mask)
+
+ //Loader interface
+ val loaderValid = False
+
+ val ioMemRspMuxed = io.mem.rsp.data.subdivideIn(cpuDataWidth bits).read(io.cpu.writeBack.address(memWordToCpuWordRange))
+
+ io.cpu.writeBack.haltIt := True
+
+ //Evict the cache after reset logics
+ val flusher = new Area {
+ val waitDone = RegInit(False) clearWhen(io.cpu.flush.ready)
+ val hold = False
+ val counter = Reg(UInt(lineRange.size + 1 bits)) init(0)
+ when(!counter.msb) {
+ tagsWriteCmd.valid := True
+ tagsWriteCmd.address := counter.resized
+ tagsWriteCmd.way.setAll()
+ tagsWriteCmd.data.valid := False
+ io.cpu.execute.haltIt := True
+ when(!hold) {
+ counter := counter + 1
+ when(io.cpu.flush.singleLine){
+ counter.msb := True
+ }
+ }
+ }
+
+ io.cpu.flush.ready := waitDone && counter.msb
+
+ val start = RegInit(True) //Used to relax timings
+ start := !waitDone && !start && io.cpu.flush.valid && !io.cpu.execute.isValid && !io.cpu.memory.isValid && !io.cpu.writeBack.isValid && !io.cpu.redo
+
+ when(start){
+ waitDone := True
+ counter := 0
+ when(io.cpu.flush.singleLine){
+ counter := U"0" @@ io.cpu.flush.lineId
+ }
+ }
+ }
+
+ val lrSc = withInternalLrSc generate new Area{
+ val reserved = RegInit(False)
+ when(io.cpu.writeBack.isValid && io.cpu.writeBack.isFiring){
+ reserved setWhen(request.isLrsc)
+ reserved clearWhen(request.wr)
+ }
+ }
+
+ val isAmo = if(withAmo) request.isAmo else False
+ val isAmoCached = if(withInternalAmo) isAmo else False
+ val isExternalLsrc = if(withExternalLrSc) request.isLrsc else False
+ val isExternalAmo = if(withExternalAmo) request.isAmo else False
+
+ val requestDataBypass = CombInit(io.cpu.writeBack.storeData)
+ import DataCacheExternalAmoStates._
+ val amo = withAmo generate new Area{
+ def rf = io.cpu.writeBack.storeData(p.rfDataWidth-1 downto 0)
+ def memLarger = if(withInternalAmo) dataMux else ioMemRspMuxed
+ def mem = memLarger.subdivideIn(rfDataWidth bits).read(io.cpu.writeBack.address(cpuWordToRfWordRange))
+ val compare = request.amoCtrl.alu.msb
+ val unsigned = request.amoCtrl.alu(2 downto 1) === B"11"
+ val addSub = (rf.asSInt + Mux(compare, ~mem, mem).asSInt + Mux(compare, S(1), S(0))).asBits
+ val less = Mux(rf.msb === mem.msb, addSub.msb, Mux(unsigned, mem.msb, rf.msb))
+ val selectRf = request.amoCtrl.swap ? True | (request.amoCtrl.alu.lsb ^ less)
+
+ val result = (request.amoCtrl.alu | (request.amoCtrl.swap ## B"00")).mux(
+ B"000" -> addSub,
+ B"001" -> (rf ^ mem),
+ B"010" -> (rf | mem),
+ B"011" -> (rf & mem),
+ default -> (selectRf ? rf | mem)
+ )
+ // val resultRegValid = RegNext(True) clearWhen(!io.cpu.writeBack.isStuck)
+ // val resultReg = RegNext(result)
+ val resultReg = Reg(Bits(32 bits))
+
+ val internal = withInternalAmo generate new Area{
+ val resultRegValid = RegNext(io.cpu.writeBack.isStuck)
+ resultReg := result
+ }
+ val external = !withInternalAmo generate new Area{
+ val state = RegInit(LR_CMD)
+ }
+ }
+
+
+ val cpuWriteToCache = False
+ when(cpuWriteToCache){
+ dataWriteCmd.valid setWhen(request.wr && waysHit)
+ dataWriteCmd.address := mmuRsp.physicalAddress(lineRange.high downto memWordRange.low)
+ dataWriteCmd.data.subdivideIn(cpuDataWidth bits).foreach(_ := requestDataBypass)
+ dataWriteCmd.mask := 0
+ dataWriteCmd.mask.subdivideIn(cpuDataWidth/8 bits).write(io.cpu.writeBack.address(memWordToCpuWordRange), mask)
+ dataWriteCmd.way := waysHits
+ }
+
+ val badPermissions = (!mmuRsp.allowWrite && request.wr) || (!mmuRsp.allowRead && (!request.wr || isAmo))
+ val loadStoreFault = io.cpu.writeBack.isValid && (mmuRsp.exception || badPermissions)
+
+ io.cpu.redo := False
+ io.cpu.writeBack.accessError := False
+ io.cpu.writeBack.mmuException := loadStoreFault && (if(catchIllegal) mmuRsp.isPaging else False)
+ io.cpu.writeBack.unalignedAccess := io.cpu.writeBack.isValid && unaligned
+ io.cpu.writeBack.isWrite := request.wr
+
+
+ io.mem.cmd.valid := False
+ io.mem.cmd.address := mmuRsp.physicalAddress
+ io.mem.cmd.last := True
+ io.mem.cmd.wr := request.wr
+ io.mem.cmd.mask := mask
+ io.mem.cmd.data := requestDataBypass
+ io.mem.cmd.uncached := mmuRsp.isIoAccess
+ io.mem.cmd.size := request.size.resized
+ if(withExternalLrSc) io.mem.cmd.exclusive := request.isLrsc || isAmo
+
+
+ val bypassCache = mmuRsp.isIoAccess || isExternalLsrc || isExternalAmo
+
+ io.cpu.writeBack.keepMemRspData := False
+ when(io.cpu.writeBack.isValid) {
+ when(isExternalAmo){
+ if(withExternalAmo) switch(amo.external.state){
+ is(LR_CMD){
+ io.mem.cmd.valid := True
+ io.mem.cmd.wr := False
+ when(io.mem.cmd.ready) {
+ amo.external.state := LR_RSP
+ }
+ }
+ is(LR_RSP){
+ when(io.mem.rsp.valid && pending.last) {
+ amo.external.state := SC_CMD
+ amo.resultReg := amo.result
+ }
+ }
+ is(SC_CMD){
+ io.mem.cmd.valid := True
+ when(io.mem.cmd.ready) {
+ amo.external.state := SC_RSP
+ }
+ }
+ is(SC_RSP){
+ io.cpu.writeBack.keepMemRspData := True
+ when(io.mem.rsp.valid) {
+ amo.external.state := LR_CMD
+ when(io.mem.rsp.exclusive){ //Success
+ cpuWriteToCache := True
+ io.cpu.writeBack.haltIt := False
+ }
+ }
+ }
+ }
+ } elsewhen(mmuRsp.isIoAccess || isExternalLsrc) {
+ val waitResponse = !request.wr
+ if(withExternalLrSc) waitResponse setWhen(request.isLrsc)
+
+ io.cpu.writeBack.haltIt.clearWhen(waitResponse ? (io.mem.rsp.valid && rspSync) | io.mem.cmd.ready)
+
+ io.mem.cmd.valid := !memCmdSent
+
+ if(withInternalLrSc) when(request.isLrsc && !lrSc.reserved){
+ io.mem.cmd.valid := False
+ io.cpu.writeBack.haltIt := False
+ }
+ } otherwise {
+ when(waysHit || request.wr && !isAmoCached) { //Do not require a cache refill ?
+ cpuWriteToCache := True
+
+ //Write through
+ io.mem.cmd.valid setWhen(request.wr)
+ io.cpu.writeBack.haltIt clearWhen(!request.wr || io.mem.cmd.ready)
+
+ if(withInternalAmo) when(isAmo){
+ when(!amo.internal.resultRegValid) {
+ io.mem.cmd.valid := False
+ dataWriteCmd.valid := False
+ io.cpu.writeBack.haltIt := True
+ }
+ }
+
+ //On write to read dataColisions
+ when((!request.wr || isAmoCached) && (dataColisions & waysHits) =/= 0){
+ io.cpu.redo := True
+ if(withAmo) io.mem.cmd.valid := False
+ }
+
+ if(withInternalLrSc) when(request.isLrsc && !lrSc.reserved){
+ io.mem.cmd.valid := False
+ dataWriteCmd.valid := False
+ io.cpu.writeBack.haltIt := False
+ }
+ } otherwise { //Do refill
+ //Emit cmd
+ io.mem.cmd.valid setWhen(!memCmdSent)
+ io.mem.cmd.wr := False
+ io.mem.cmd.address(0, lineRange.low bits) := 0
+ io.mem.cmd.size := log2Up(p.bytePerLine)
+
+ loaderValid setWhen(io.mem.cmd.ready)
+ }
+ }
+ }
+
+ when(bypassCache){
+ io.cpu.writeBack.data := ioMemRspMuxed
+ def isLast = if(pending != null) pending.last else True
+ if(catchAccessError) io.cpu.writeBack.accessError := !request.wr && isLast && io.mem.rsp.valid && io.mem.rsp.error
+ } otherwise {
+ io.cpu.writeBack.data := dataMux
+ if(catchAccessError) io.cpu.writeBack.accessError := (waysHits & B(tagsReadRsp.map(_.error))) =/= 0 || (loadStoreFault && !mmuRsp.isPaging)
+ }
+
+ if(withLrSc) {
+ val success = if(withInternalLrSc)lrSc.reserved else io.mem.rsp.exclusive
+ io.cpu.writeBack.exclusiveOk := success
+ when(request.isLrsc && request.wr){
+ // io.cpu.writeBack.data := B(!success).resized
+ if(withExternalLrSc) when(io.cpu.writeBack.isValid && io.mem.rsp.valid && rspSync && success && waysHit){
+ cpuWriteToCache := True
+ }
+ }
+ }
+ if(withAmo) when(request.isAmo){
+ requestDataBypass.subdivideIn(p.rfDataWidth bits).foreach(_ := amo.resultReg)
+ }
+
+ //remove side effects on exceptions
+ when(consistancyHazard || mmuRsp.refilling || io.cpu.writeBack.accessError || io.cpu.writeBack.mmuException || io.cpu.writeBack.unalignedAccess){
+ io.mem.cmd.valid := False
+ tagsWriteCmd.valid := False
+ dataWriteCmd.valid := False
+ loaderValid := False
+ io.cpu.writeBack.haltIt := False
+ if(withInternalLrSc) lrSc.reserved := lrSc.reserved
+ if(withExternalAmo) amo.external.state := LR_CMD
+ }
+ io.cpu.redo setWhen(io.cpu.writeBack.isValid && (mmuRsp.refilling || consistancyHazard))
+
+ assert(!(io.cpu.writeBack.isValid && !io.cpu.writeBack.haltIt && io.cpu.writeBack.isStuck), "writeBack stuck by another plugin is not allowed", ERROR)
+ }
+
+ val loader = new Area{
+ val valid = RegInit(False) setWhen(stageB.loaderValid)
+ val baseAddress = stageB.mmuRsp.physicalAddress
+
+ val counter = Counter(memTransactionPerLine)
+ val waysAllocator = Reg(Bits(wayCount bits)) init(1)
+ val error = RegInit(False)
+ val kill = False
+ val killReg = RegInit(False) setWhen(kill)
+
+ when(valid && io.mem.rsp.valid && rspLast){
+ dataWriteCmd.valid := True
+ dataWriteCmd.address := baseAddress(lineRange) @@ counter
+ dataWriteCmd.data := io.mem.rsp.data
+ dataWriteCmd.mask.setAll()
+ dataWriteCmd.way := waysAllocator
+ error := error | io.mem.rsp.error
+ counter.increment()
+ }
+
+ val done = CombInit(counter.willOverflow)
+ if(withInvalidate) done setWhen(valid && pending.counter === 0) //Used to solve invalidate write request at the same time
+
+ when(done){
+ valid := False
+
+ //Update tags
+ tagsWriteCmd.valid := True
+ tagsWriteCmd.address := baseAddress(lineRange)
+ tagsWriteCmd.data.valid := !(kill || killReg)
+ tagsWriteCmd.data.address := baseAddress(tagRange)
+ tagsWriteCmd.data.error := error || (io.mem.rsp.valid && io.mem.rsp.error)
+ tagsWriteCmd.way := waysAllocator
+
+ error := False
+ killReg := False
+ }
+
+ when(!valid){
+ waysAllocator := (waysAllocator ## waysAllocator.msb).resized
+ }
+
+ io.cpu.redo setWhen(valid.rise())
+ io.cpu.execute.refilling := valid
+
+ stageB.mmuRspFreeze setWhen(stageB.loaderValid || valid)
+ }
+
+ val invalidate = withInvalidate generate new Area{
+ val s0 = new Area{
+ val input = io.mem.inv
+ tagsInvReadCmd.valid := input.fire
+ tagsInvReadCmd.payload := input.address(lineRange)
+
+ val loaderTagHit = input.address(tagRange) === loader.baseAddress(tagRange)
+ val loaderLineHit = input.address(lineRange) === loader.baseAddress(lineRange)
+ when(input.valid && input.enable && loader.valid && loaderLineHit && loaderTagHit){
+ loader.kill := True
+ }
+ }
+ val s1 = new Area{
+ val input = s0.input.stage()
+ val loaderValid = RegNextWhen(loader.valid, s0.input.ready)
+ val loaderWay = RegNextWhen(loader.waysAllocator, s0.input.ready)
+ val loaderTagHit = RegNextWhen(s0.loaderTagHit, s0.input.ready)
+ val loaderLineHit = RegNextWhen(s0.loaderLineHit, s0.input.ready)
+ val invalidations = Bits(wayCount bits)
+
+ var wayHits = B(ways.map(way => (input.address(tagRange) === way.tagsInvReadRsp.address && way.tagsInvReadRsp.valid))) & ~invalidations
+
+ //Handle invalider read during loader write hazard
+ when(loaderValid && loaderLineHit && !loaderTagHit){
+ wayHits \= wayHits & ~loaderWay
+ }
+ }
+ val s2 = new Area{
+ val input = s1.input.stage()
+ val wayHits = RegNextWhen(s1.wayHits, s1.input.ready)
+ val wayHit = wayHits.orR
+
+ when(input.valid && input.enable) {
+ //Manage invalidate write during cpu read hazard
+ when(input.address(lineRange) === io.cpu.execute.address(lineRange)) {
+ stage0.wayInvalidate := wayHits
+ }
+
+ //Invalidate cache tag
+ when(wayHit) {
+ tagsWriteCmd.valid := True
+ stageB.flusher.hold := True
+ tagsWriteCmd.address := input.address(lineRange)
+ tagsWriteCmd.data.valid := False
+ tagsWriteCmd.way := wayHits
+ loader.done := False //Hold loader tags write
+ }
+ }
+ io.mem.ack.arbitrationFrom(input)
+ io.mem.ack.hit := wayHit
+ io.mem.ack.last := input.last
+
+ //Manage invalidation read during write hazard
+ s1.invalidations := RegNextWhen((input.valid && input.enable && input.address(lineRange) === s0.input.address(lineRange)) ? wayHits | 0, s0.input.ready)
+ }
+ }
+}
diff --git a/VexRiscv/src/main/scala/vexriscv/ip/InstructionCache.scala b/VexRiscv/src/main/scala/vexriscv/ip/InstructionCache.scala
new file mode 100644
index 0000000..e09712c
--- /dev/null
+++ b/VexRiscv/src/main/scala/vexriscv/ip/InstructionCache.scala
@@ -0,0 +1,487 @@
+package vexriscv.ip
+
+import vexriscv._
+import spinal.core._
+import spinal.lib._
+import spinal.lib.bus.amba4.axi.{Axi4Config, Axi4ReadOnly}
+import spinal.lib.bus.avalon.{AvalonMM, AvalonMMConfig}
+import spinal.lib.bus.bmb.{Bmb, BmbAccessParameter, BmbParameter, BmbSourceParameter}
+import spinal.lib.bus.wishbone.{Wishbone, WishboneConfig}
+import spinal.lib.bus.simple._
+import vexriscv.plugin.{IBusSimpleBus, IBusSimplePlugin}
+
+
+case class InstructionCacheConfig( cacheSize : Int,
+ bytePerLine : Int,
+ wayCount : Int,
+ addressWidth : Int,
+ cpuDataWidth : Int,
+ memDataWidth : Int,
+ catchIllegalAccess : Boolean,
+ catchAccessFault : Boolean,
+ asyncTagMemory : Boolean,
+ twoCycleCache : Boolean = true,
+ twoCycleRam : Boolean = false,
+ twoCycleRamInnerMux : Boolean = false,
+ preResetFlush : Boolean = false,
+ bypassGen : Boolean = false,
+ reducedBankWidth : Boolean = false){
+
+ assert(!(twoCycleRam && !twoCycleCache))
+
+ def burstSize = bytePerLine*8/memDataWidth
+ def catchSomething = catchAccessFault || catchIllegalAccess
+
+ def getAxi4Config() = Axi4Config(
+ addressWidth = addressWidth,
+ dataWidth = memDataWidth,
+ useId = false,
+ useRegion = false,
+ useLock = false,
+ useQos = false,
+ useSize = false
+ )
+
+ def getAvalonConfig() = AvalonMMConfig.bursted(
+ addressWidth = addressWidth,
+ dataWidth = memDataWidth,
+ burstCountWidth = log2Up(burstSize + 1)).getReadOnlyConfig.copy(
+ useResponse = true,
+ constantBurstBehavior = true
+ )
+
+ def getPipelinedMemoryBusConfig() = PipelinedMemoryBusConfig(
+ addressWidth = 32,
+ dataWidth = 32
+ )
+
+ def getWishboneConfig() = WishboneConfig(
+ addressWidth = 32-log2Up(memDataWidth/8),
+ dataWidth = memDataWidth,
+ selWidth = memDataWidth/8,
+ useSTALL = false,
+ useLOCK = false,
+ useERR = true,
+ useRTY = false,
+ tgaWidth = 0,
+ tgcWidth = 0,
+ tgdWidth = 0,
+ useBTE = true,
+ useCTI = true
+ )
+
+ def getBmbParameter() = BmbParameter(
+ BmbAccessParameter(
+ addressWidth = 32,
+ dataWidth = memDataWidth
+ ).addSources(1, BmbSourceParameter(
+ lengthWidth = log2Up(this.bytePerLine),
+ contextWidth = 0,
+ canWrite = false,
+ alignment = BmbParameter.BurstAlignement.LENGTH,
+ maximumPendingTransaction = 1
+ ))
+ )
+}
+
+
+
+case class InstructionCacheCpuPrefetch(p : InstructionCacheConfig) extends Bundle with IMasterSlave{
+ val isValid = Bool
+ val haltIt = Bool
+ val pc = UInt(p.addressWidth bit)
+
+ override def asMaster(): Unit = {
+ out(isValid, pc)
+ in(haltIt)
+ }
+}
+
+trait InstructionCacheCommons{
+ val isValid : Bool
+ val isStuck : Bool
+ val pc : UInt
+ val physicalAddress : UInt
+ val data : Bits
+ val cacheMiss, error, mmuRefilling, mmuException, isUser : Bool
+}
+
+case class InstructionCacheCpuFetch(p : InstructionCacheConfig, mmuParameter : MemoryTranslatorBusParameter) extends Bundle with IMasterSlave with InstructionCacheCommons {
+ val isValid = Bool()
+ val isStuck = Bool()
+ val isRemoved = Bool()
+ val pc = UInt(p.addressWidth bits)
+ val data = Bits(p.cpuDataWidth bits)
+ val dataBypassValid = p.bypassGen generate Bool()
+ val dataBypass = p.bypassGen generate Bits(p.cpuDataWidth bits)
+ val mmuRsp = MemoryTranslatorRsp(mmuParameter)
+ val physicalAddress = UInt(p.addressWidth bits)
+ val cacheMiss, error, mmuRefilling, mmuException, isUser = ifGen(!p.twoCycleCache)(Bool)
+
+ override def asMaster(): Unit = {
+ out(isValid, isStuck, isRemoved, pc)
+ inWithNull(error,mmuRefilling,mmuException,data, cacheMiss,physicalAddress)
+ outWithNull(isUser, dataBypass, dataBypassValid)
+ out(mmuRsp)
+ }
+}
+
+
+case class InstructionCacheCpuDecode(p : InstructionCacheConfig) extends Bundle with IMasterSlave with InstructionCacheCommons {
+ val isValid = Bool
+ val isStuck = Bool
+ val pc = UInt(p.addressWidth bits)
+ val physicalAddress = UInt(p.addressWidth bits)
+ val data = Bits(p.cpuDataWidth bits)
+ val cacheMiss, error, mmuRefilling, mmuException, isUser = ifGen(p.twoCycleCache)(Bool)
+
+ override def asMaster(): Unit = {
+ out(isValid, isStuck, pc)
+ outWithNull(isUser)
+ inWithNull(error, mmuRefilling, mmuException,data, cacheMiss, physicalAddress)
+ }
+}
+
+case class InstructionCacheCpuBus(p : InstructionCacheConfig, mmuParameter : MemoryTranslatorBusParameter) extends Bundle with IMasterSlave{
+ val prefetch = InstructionCacheCpuPrefetch(p)
+ val fetch = InstructionCacheCpuFetch(p, mmuParameter)
+ val decode = InstructionCacheCpuDecode(p)
+ val fill = Flow(UInt(p.addressWidth bits))
+
+ override def asMaster(): Unit = {
+ master(prefetch, fetch, decode, fill)
+ }
+}
+
+case class InstructionCacheMemCmd(p : InstructionCacheConfig) extends Bundle{
+ val address = UInt(p.addressWidth bit)
+ val size = UInt(log2Up(log2Up(p.bytePerLine) + 1) bits)
+}
+
+case class InstructionCacheMemRsp(p : InstructionCacheConfig) extends Bundle{
+ val data = Bits(p.memDataWidth bit)
+ val error = Bool
+}
+
+case class InstructionCacheMemBus(p : InstructionCacheConfig) extends Bundle with IMasterSlave{
+ val cmd = Stream (InstructionCacheMemCmd(p))
+ val rsp = Flow (InstructionCacheMemRsp(p))
+
+ override def asMaster(): Unit = {
+ master(cmd)
+ slave(rsp)
+ }
+
+ def toAxi4ReadOnly(): Axi4ReadOnly = {
+ val axiConfig = p.getAxi4Config()
+ val mm = Axi4ReadOnly(axiConfig)
+
+ mm.readCmd.valid := cmd.valid
+ mm.readCmd.len := p.burstSize-1
+ mm.readCmd.addr := cmd.address
+ mm.readCmd.prot := "110"
+ mm.readCmd.cache := "1111"
+ mm.readCmd.setBurstINCR()
+ cmd.ready := mm.readCmd.ready
+ rsp.valid := mm.readRsp.valid
+ rsp.data := mm.readRsp.data
+ rsp.error := !mm.readRsp.isOKAY()
+ mm.readRsp.ready := True
+ mm
+ }
+
+ def toAvalon(): AvalonMM = {
+ val avalonConfig = p.getAvalonConfig()
+ val mm = AvalonMM(avalonConfig)
+ mm.read := cmd.valid
+ mm.burstCount := U(p.burstSize)
+ mm.address := cmd.address
+ cmd.ready := mm.waitRequestn
+ rsp.valid := mm.readDataValid
+ rsp.data := mm.readData
+ rsp.error := mm.response =/= AvalonMM.Response.OKAY
+ mm
+ }
+
+
+ def toPipelinedMemoryBus(): PipelinedMemoryBus = {
+ val pipelinedMemoryBusConfig = p.getPipelinedMemoryBusConfig()
+ val bus = PipelinedMemoryBus(pipelinedMemoryBusConfig)
+ val counter = Counter(p.burstSize, bus.cmd.fire)
+ bus.cmd.valid := cmd.valid
+ bus.cmd.address := cmd.address(31 downto widthOf(counter.value) + 2) @@ counter @@ U"00"
+ bus.cmd.write := False
+ bus.cmd.mask.assignDontCare()
+ bus.cmd.data.assignDontCare()
+ cmd.ready := counter.willOverflow
+ rsp.valid := bus.rsp.valid
+ rsp.data := bus.rsp.payload.data
+ rsp.error := False
+ bus
+ }
+
+
+ def toWishbone(): Wishbone = {
+ val wishboneConfig = p.getWishboneConfig()
+ val bus = Wishbone(wishboneConfig)
+ val counter = Reg(UInt(log2Up(p.burstSize) bits)) init(0)
+ val pending = counter =/= 0
+ val lastCycle = counter === counter.maxValue
+
+ bus.ADR := (cmd.address >> widthOf(counter) + log2Up(p.memDataWidth/8)) @@ counter
+ bus.CTI := lastCycle ? B"111" | B"010"
+ bus.BTE := "00"
+ bus.SEL.setAll()
+ bus.WE := False
+ bus.DAT_MOSI.assignDontCare()
+ bus.CYC := False
+ bus.STB := False
+ when(cmd.valid || pending){
+ bus.CYC := True
+ bus.STB := True
+ when(bus.ACK){
+ counter := counter + 1
+ }
+ }
+
+ cmd.ready := cmd.valid && bus.ACK
+ rsp.valid := RegNext(bus.CYC && bus.ACK) init(False)
+ rsp.data := RegNext(bus.DAT_MISO)
+ rsp.error := False //TODO
+ bus
+ }
+
+ def toBmb() : Bmb = {
+ val busParameter = p.getBmbParameter
+ val bus = Bmb(busParameter).setCompositeName(this,"toBmb", true)
+ bus.cmd.arbitrationFrom(cmd)
+ bus.cmd.opcode := Bmb.Cmd.Opcode.READ
+ bus.cmd.address := cmd.address.resized
+ bus.cmd.length := p.bytePerLine - 1
+ bus.cmd.last := True
+ rsp.valid := bus.rsp.valid
+ rsp.data := bus.rsp.data
+ rsp.error := bus.rsp.isError
+ bus.rsp.ready := True
+ bus
+ }
+}
+
+
+case class InstructionCacheFlushBus() extends Bundle with IMasterSlave{
+ val cmd = Event
+ val rsp = Bool
+
+ override def asMaster(): Unit = {
+ master(cmd)
+ in(rsp)
+ }
+}
+
+class InstructionCache(p : InstructionCacheConfig, mmuParameter : MemoryTranslatorBusParameter) extends Component{
+ import p._
+ val io = new Bundle{
+ val flush = in Bool()
+ val cpu = slave(InstructionCacheCpuBus(p, mmuParameter))
+ val mem = master(InstructionCacheMemBus(p))
+ }
+
+ val lineWidth = bytePerLine*8
+ val lineCount = cacheSize/bytePerLine
+ val cpuWordWidth = cpuDataWidth
+ val memWordPerLine = lineWidth/memDataWidth
+ val bytePerCpuWord = cpuWordWidth/8
+ val wayLineCount = lineCount/wayCount
+
+ val tagRange = addressWidth-1 downto log2Up(wayLineCount*bytePerLine)
+ val lineRange = tagRange.low-1 downto log2Up(bytePerLine)
+
+ case class LineTag() extends Bundle{
+ val valid = Bool
+ val error = Bool
+ val address = UInt(tagRange.length bit)
+ }
+
+ val bankCount = wayCount
+ val bankWidth = if(!reducedBankWidth) memDataWidth else Math.max(cpuDataWidth, memDataWidth/wayCount)
+ val bankByteSize = cacheSize/bankCount
+ val bankWordCount = bankByteSize*8/bankWidth
+ val bankWordToCpuWordRange = log2Up(bankWidth/8)-1 downto log2Up(bytePerCpuWord)
+ val memToBankRatio = bankWidth*bankCount / memDataWidth
+
+ val banks = Seq.fill(bankCount)(Mem(Bits(bankWidth bits), bankWordCount))
+
+ val ways = Seq.fill(wayCount)(new Area{
+ val tags = Mem(LineTag(),wayLineCount)
+
+ if(preResetFlush){
+ tags.initBigInt(List.fill(wayLineCount)(BigInt(0)))
+ }
+ })
+
+
+ val lineLoader = new Area{
+ val fire = False
+ val valid = RegInit(False) clearWhen(fire)
+ val address = KeepAttribute(Reg(UInt(addressWidth bits)))
+ val hadError = RegInit(False) clearWhen(fire)
+ val flushPending = RegInit(True)
+
+ when(io.cpu.fill.valid){
+ valid := True
+ address := io.cpu.fill.payload
+ }
+
+ io.cpu.prefetch.haltIt := valid || flushPending
+
+ val flushCounter = Reg(UInt(log2Up(wayLineCount) + 1 bit))
+ when(!flushCounter.msb){
+ io.cpu.prefetch.haltIt := True
+ flushCounter := flushCounter + 1
+ }
+ when(!RegNext(flushCounter.msb)){
+ io.cpu.prefetch.haltIt := True
+ }
+
+ when(io.flush){
+ io.cpu.prefetch.haltIt := True
+ flushPending := True
+ }
+
+ when(flushPending && !(valid || io.cpu.fetch.isValid) ){
+ flushCounter := 0
+ flushPending := False
+ }
+
+
+
+ val cmdSent = RegInit(False) setWhen(io.mem.cmd.fire) clearWhen(fire)
+ io.mem.cmd.valid := valid && !cmdSent
+ io.mem.cmd.address := address(tagRange.high downto lineRange.low) @@ U(0,lineRange.low bit)
+ io.mem.cmd.size := log2Up(p.bytePerLine)
+
+ val wayToAllocate = Counter(wayCount, !valid)
+ val wordIndex = KeepAttribute(Reg(UInt(log2Up(memWordPerLine) bits)) init(0))
+
+
+ val write = new Area{
+ val tag = ways.map(_.tags.writePort)
+ val data = banks.map(_.writePort)
+ }
+
+ for(wayId <- 0 until wayCount){
+ val wayHit = wayToAllocate === wayId
+ val tag = write.tag(wayId)
+ tag.valid := ((wayHit && fire) || !flushCounter.msb)
+ tag.address := (flushCounter.msb ? address(lineRange) | flushCounter(flushCounter.high-1 downto 0))
+ tag.data.valid := flushCounter.msb
+ tag.data.error := hadError || io.mem.rsp.error
+ tag.data.address := address(tagRange)
+ }
+
+ for((writeBank, bankId) <- write.data.zipWithIndex){
+ if(!reducedBankWidth) {
+ writeBank.valid := io.mem.rsp.valid && wayToAllocate === bankId
+ writeBank.address := address(lineRange) @@ wordIndex
+ writeBank.data := io.mem.rsp.data
+ } else {
+ val sel = U(bankId) - wayToAllocate.value
+ val groupSel = wayToAllocate(log2Up(bankCount)-1 downto log2Up(bankCount/memToBankRatio))
+ val subSel = sel(log2Up(bankCount/memToBankRatio) -1 downto 0)
+ writeBank.valid := io.mem.rsp.valid && groupSel === (bankId >> log2Up(bankCount/memToBankRatio))
+ writeBank.address := address(lineRange) @@ wordIndex @@ (subSel)
+ writeBank.data := io.mem.rsp.data.subdivideIn(bankCount/memToBankRatio slices)(subSel)
+ }
+ }
+
+
+ when(io.mem.rsp.valid) {
+ wordIndex := (wordIndex + 1).resized
+ hadError.setWhen(io.mem.rsp.error)
+ when(wordIndex === wordIndex.maxValue) {
+ fire := True
+ }
+ }
+ }
+
+ val fetchStage = new Area{
+ val read = new Area{
+ val banksValue = for(bank <- banks) yield new Area{
+ val dataMem = bank.readSync(io.cpu.prefetch.pc(lineRange.high downto log2Up(bankWidth/8)), !io.cpu.fetch.isStuck)
+ val data = if(!twoCycleRamInnerMux) dataMem.subdivideIn(cpuDataWidth bits).read(io.cpu.fetch.pc(bankWordToCpuWordRange)) else dataMem
+ }
+
+ val waysValues = for((way, wayId) <- ways.zipWithIndex) yield new Area{
+ val tag = if(asyncTagMemory) {
+ way.tags.readAsync(io.cpu.fetch.pc(lineRange))
+ }else {
+ way.tags.readSync(io.cpu.prefetch.pc(lineRange), !io.cpu.fetch.isStuck)
+ }
+// val data = CombInit(banksValue(wayId).data)
+ }
+ }
+
+
+ val hit = (!twoCycleRam) generate new Area{
+ val hits = read.waysValues.map(way => way.tag.valid && way.tag.address === io.cpu.fetch.mmuRsp.physicalAddress(tagRange))
+ val valid = Cat(hits).orR
+ val wayId = OHToUInt(hits)
+ val bankId = if(!reducedBankWidth) wayId else (wayId >> log2Up(bankCount/memToBankRatio)) @@ ((wayId + (io.cpu.fetch.mmuRsp.physicalAddress(log2Up(bankWidth/8), log2Up(bankCount) bits))).resize(log2Up(bankCount/memToBankRatio)))
+ val error = read.waysValues.map(_.tag.error).read(wayId)
+ val data = read.banksValue.map(_.data).read(bankId)
+ val word = if(cpuDataWidth == memDataWidth || !twoCycleRamInnerMux) CombInit(data) else data.subdivideIn(cpuDataWidth bits).read(io.cpu.fetch.pc(bankWordToCpuWordRange))
+ io.cpu.fetch.data := (if(p.bypassGen) (io.cpu.fetch.dataBypassValid ? io.cpu.fetch.dataBypass | word) else word)
+ if(twoCycleCache){
+ io.cpu.decode.data := RegNextWhen(io.cpu.fetch.data,!io.cpu.decode.isStuck)
+ }
+ }
+
+ if(twoCycleRam && wayCount == 1){
+ val cacheData = if(cpuDataWidth == memDataWidth || !twoCycleRamInnerMux) CombInit(read.banksValue.head.data) else read.banksValue.head.data.subdivideIn(cpuDataWidth bits).read(io.cpu.fetch.pc(bankWordToCpuWordRange))
+ io.cpu.fetch.data := (if(p.bypassGen) (io.cpu.fetch.dataBypassValid ? io.cpu.fetch.dataBypass | cacheData) else cacheData)
+ }
+
+ io.cpu.fetch.physicalAddress := io.cpu.fetch.mmuRsp.physicalAddress
+
+ val resolution = ifGen(!twoCycleCache)( new Area{
+ val mmuRsp = io.cpu.fetch.mmuRsp
+
+ io.cpu.fetch.cacheMiss := !hit.valid
+ io.cpu.fetch.error := hit.error || (!mmuRsp.isPaging && (mmuRsp.exception || !mmuRsp.allowExecute))
+ io.cpu.fetch.mmuRefilling := mmuRsp.refilling
+ io.cpu.fetch.mmuException := !mmuRsp.refilling && mmuRsp.isPaging && (mmuRsp.exception || !mmuRsp.allowExecute)
+ })
+ }
+
+
+
+ val decodeStage = ifGen(twoCycleCache) (new Area{
+ def stage[T <: Data](that : T) = RegNextWhen(that,!io.cpu.decode.isStuck)
+ val mmuRsp = stage(io.cpu.fetch.mmuRsp)
+
+ val hit = if(!twoCycleRam) new Area{
+ val valid = stage(fetchStage.hit.valid)
+ val error = stage(fetchStage.hit.error)
+ } else new Area{
+ val tags = fetchStage.read.waysValues.map(way => stage(way.tag))
+ val hits = tags.map(tag => tag.valid && tag.address === mmuRsp.physicalAddress(tagRange))
+ val valid = Cat(hits).orR
+ val wayId = OHToUInt(hits)
+ val bankId = if(!reducedBankWidth) wayId else (wayId >> log2Up(bankCount/memToBankRatio)) @@ ((wayId + (mmuRsp.physicalAddress(log2Up(bankWidth/8), log2Up(bankCount) bits))).resize(log2Up(bankCount/memToBankRatio)))
+ val error = tags(wayId).error
+ val data = fetchStage.read.banksValue.map(bank => stage(bank.data)).read(bankId)
+ val word = if(cpuDataWidth == memDataWidth || !twoCycleRamInnerMux) data else data.subdivideIn(cpuDataWidth bits).read(io.cpu.decode.pc(bankWordToCpuWordRange))
+ if(p.bypassGen) when(stage(io.cpu.fetch.dataBypassValid)){
+ word := stage(io.cpu.fetch.dataBypass)
+ }
+ io.cpu.decode.data := word
+ }
+
+ io.cpu.decode.cacheMiss := !hit.valid
+ io.cpu.decode.error := hit.error || (!mmuRsp.isPaging && (mmuRsp.exception || !mmuRsp.allowExecute))
+ io.cpu.decode.mmuRefilling := mmuRsp.refilling
+ io.cpu.decode.mmuException := !mmuRsp.refilling && mmuRsp.isPaging && (mmuRsp.exception || !mmuRsp.allowExecute)
+ io.cpu.decode.physicalAddress := mmuRsp.physicalAddress
+ })
+}
+
diff --git a/VexRiscv/src/main/scala/vexriscv/ip/fpu/FpuCore.scala b/VexRiscv/src/main/scala/vexriscv/ip/fpu/FpuCore.scala
new file mode 100644
index 0000000..657b2fb
--- /dev/null
+++ b/VexRiscv/src/main/scala/vexriscv/ip/fpu/FpuCore.scala
@@ -0,0 +1,1944 @@
+package vexriscv.ip.fpu
+
+import spinal.core._
+import spinal.lib._
+import spinal.lib.eda.bench.{Bench, Rtl, XilinxStdTargets}
+import spinal.lib.math.UnsignedDivider
+
+import scala.collection.mutable.ArrayBuffer
+
+object FpuDivSqrtIterationState extends SpinalEnum{
+ val IDLE, YY, XYY, Y2_XYY, DIV, _15_XYY2, Y_15_XYY2, Y_15_XYY2_RESULT, SQRT = newElement()
+}
+
+
+case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
+ val io = new Bundle {
+ val port = Vec(slave(FpuPort(p)), portCount)
+ }
+
+ val portCountWidth = log2Up(portCount)
+ val Source = HardType(UInt(portCountWidth bits))
+ val exponentOne = (1 << p.internalExponentSize-1) - 1
+ val exponentF32Subnormal = exponentOne-127
+ val exponentF64Subnormal = exponentOne-1023
+ val exponentF32Infinity = exponentOne+127+1
+ val exponentF64Infinity = exponentOne+1023+1
+
+
+
+ def whenDouble(format : FpuFormat.C)(yes : => Unit)(no : => Unit): Unit ={
+ if(p.withDouble) when(format === FpuFormat.DOUBLE) { yes } otherwise{ no }
+ if(!p.withDouble) no
+ }
+
+ def muxDouble[T <: Data](format : FpuFormat.C)(yes : => T)(no : => T): T ={
+ if(p.withDouble) ((format === FpuFormat.DOUBLE) ? { yes } | { no })
+ else no
+ }
+
+ case class RfReadInput() extends Bundle{
+ val source = Source()
+ val opcode = p.Opcode()
+ val rs1, rs2, rs3 = p.rfAddress()
+ val rd = p.rfAddress()
+ val arg = p.Arg()
+ val roundMode = FpuRoundMode()
+ val format = p.withDouble generate FpuFormat()
+ }
+
+ case class RfReadOutput() extends Bundle{
+ val source = Source()
+ val opcode = p.Opcode()
+ val rs1, rs2, rs3 = p.internalFloating()
+ val rd = p.rfAddress()
+ val arg = p.Arg()
+ val roundMode = FpuRoundMode()
+ val format = p.withDouble generate FpuFormat()
+ val rs1Boxed, rs2Boxed = p.withDouble generate Bool()
+ }
+
+
+ case class LoadInput() extends Bundle{
+ val source = Source()
+ val rd = p.rfAddress()
+ val i2f = Bool()
+ val arg = Bits(2 bits)
+ val roundMode = FpuRoundMode()
+ val format = p.withDouble generate FpuFormat()
+ }
+
+ case class ShortPipInput() extends Bundle{
+ val source = Source()
+ val opcode = p.Opcode()
+ val rs1, rs2 = p.internalFloating()
+ val rd = p.rfAddress()
+ val value = Bits(32 bits)
+ val arg = Bits(2 bits)
+ val roundMode = FpuRoundMode()
+ val format = p.withDouble generate FpuFormat()
+ val rs1Boxed, rs2Boxed = p.withDouble generate Bool()
+ }
+
+ class MulInput() extends Bundle{
+ val source = Source()
+ val rs1, rs2, rs3 = p.internalFloating()
+ val rd = p.rfAddress()
+ val add = Bool()
+ val divSqrt = Bool()
+ val msb1, msb2 = Bool() //allow usage of msb bits of mul
+ val roundMode = FpuRoundMode()
+ val format = p.withDouble generate FpuFormat()
+ }
+
+
+ case class DivSqrtInput() extends Bundle{
+ val source = Source()
+ val rs1, rs2 = p.internalFloating()
+ val rd = p.rfAddress()
+ val div = Bool()
+ val roundMode = FpuRoundMode()
+ val format = p.withDouble generate FpuFormat()
+ }
+
+ case class DivInput() extends Bundle{
+ val source = Source()
+ val rs1, rs2 = p.internalFloating()
+ val rd = p.rfAddress()
+ val roundMode = FpuRoundMode()
+ val format = p.withDouble generate FpuFormat()
+ }
+
+
+ case class SqrtInput() extends Bundle{
+ val source = Source()
+ val rs1 = p.internalFloating()
+ val rd = p.rfAddress()
+ val roundMode = FpuRoundMode()
+ val format = p.withDouble generate FpuFormat()
+ }
+
+
+ val addExtraBits = 2
+ case class AddInput() extends Bundle{
+ val source = Source()
+ val rs1, rs2 = FpuFloat(exponentSize = p.internalExponentSize, mantissaSize = p.internalMantissaSize+addExtraBits)
+ val rd = p.rfAddress()
+ val roundMode = FpuRoundMode()
+ val format = p.withDouble generate FpuFormat()
+ val needCommit = Bool()
+ }
+
+
+ class MergeInput() extends Bundle{
+ val source = Source()
+ val rd = p.rfAddress()
+ val value = p.writeFloating()
+ val scrap = Bool()
+ val roundMode = FpuRoundMode()
+ val format = p.withDouble generate FpuFormat()
+ val NV = Bool()
+ val DZ = Bool()
+ }
+
+ case class RoundOutput() extends Bundle{
+ val source = Source()
+ val rd = p.rfAddress()
+ val value = p.internalFloating()
+ val format = p.withDouble generate FpuFormat()
+ val NV, NX, OF, UF, DZ = Bool()
+ val write = Bool()
+ }
+
+ val rf = new Area{
+ case class Entry() extends Bundle{
+ val value = p.internalFloating()
+ val boxed = p.withDouble generate Bool()
+ }
+ val ram = Mem(Entry(), 32*portCount)
+
+ val init = new Area{
+ val counter = Reg(UInt(6 bits)) init(0)
+ val done = CombInit(counter.msb)
+ when(!done){
+ counter := counter + 1
+ }
+ def apply(port : Flow[MemWriteCmd[Bool]]) = {
+ port.valid := !done
+ port.address := counter.resized
+ port.data := False
+ port
+ }
+ }
+
+ val scoreboards = Array.fill(portCount)(new Area{
+ val target, hit = Mem(Bool, 32) // XOR
+ val writes = Mem(Bool, 32)
+
+ val targetWrite = init(target.writePort)
+ val hitWrite = init(hit.writePort)
+ })
+ }
+
+ val commitFork = new Area{
+ val load, commit = Vec(Stream(FpuCommit(p)), portCount)
+ for(i <- 0 until portCount){
+ val fork = new StreamFork(FpuCommit(p), 2, synchronous = true)
+ fork.io.input << io.port(i).commit
+ fork.io.outputs(0) >> load(i)
+ fork.io.outputs(1).pipelined(m2s = false, s2m = true) >> commit(i) //Pipelining here is light, as it only use the flags of the payload
+ }
+ }
+
+ class Tracker(width : Int) extends Area{
+ val counter = Reg(UInt(width bits)) init(0)
+ val full = counter.andR
+ val notEmpty = counter.orR
+ val inc = False
+ val dec = False
+ counter := counter + U(inc) - U(dec)
+ }
+
+ class CommitArea(source : Int) extends Area{
+ val pending = new Tracker(4)
+ val add, mul, div, sqrt, short = new Tracker(4)
+ val input = commitFork.commit(source).haltWhen(List(add, mul, div, sqrt, short).map(_.full).orR || !pending.notEmpty).toFlow
+
+ when(input.fire){
+ add.inc setWhen(List(FpuOpcode.ADD).map(input.opcode === _).orR)
+ mul.inc setWhen(List(FpuOpcode.MUL, FpuOpcode.FMA).map(input.opcode === _).orR)
+ div.inc setWhen(List(FpuOpcode.DIV).map(input.opcode === _).orR)
+ sqrt.inc setWhen(List(FpuOpcode.SQRT).map(input.opcode === _).orR)
+ short.inc setWhen(List(FpuOpcode.SGNJ, FpuOpcode.MIN_MAX, FpuOpcode.FCVT_X_X).map(input.opcode === _).orR)
+ rf.scoreboards(source).writes(input.rd) := input.write
+ pending.dec := True
+ }
+ }
+
+ val commitLogic = for(source <- 0 until portCount) yield new CommitArea(source)
+
+ def commitConsume(what : CommitArea => Tracker, source : UInt, fire : Bool) : Bool = {
+ for(i <- 0 until portCount) what(commitLogic(i)).dec setWhen(fire && source === i)
+ commitLogic.map(what(_).notEmpty).read(source)
+ }
+
+
+ val scheduler = for(portId <- 0 until portCount;
+ scoreboard = rf.scoreboards(portId)) yield new Area{
+ val input = io.port(portId).cmd.pipelined(s2m = true)
+ val useRs1, useRs2, useRs3, useRd = False
+ switch(input.opcode){
+ is(p.Opcode.LOAD) { useRd := True }
+ is(p.Opcode.STORE) { useRs2 := True }
+ is(p.Opcode.ADD) { useRd := True; useRs1 := True; useRs2 := True }
+ is(p.Opcode.MUL) { useRd := True; useRs1 := True; useRs2 := True }
+ is(p.Opcode.DIV) { useRd := True; useRs1 := True; useRs2 := True }
+ is(p.Opcode.SQRT) { useRd := True; useRs1 := True }
+ is(p.Opcode.FMA) { useRd := True; useRs1 := True; useRs2 := True; useRs3 := True }
+ is(p.Opcode.I2F) { useRd := True }
+ is(p.Opcode.F2I) { useRs1 := True }
+ is(p.Opcode.MIN_MAX) { useRd := True; useRs1 := True; useRs2 := True }
+ is(p.Opcode.CMP) { useRs1 := True; useRs2 := True }
+ is(p.Opcode.SGNJ) { useRd := True; useRs1 := True; useRs2 := True }
+ is(p.Opcode.FMV_X_W) { useRs1 := True }
+ is(p.Opcode.FMV_W_X) { useRd := True }
+ is(p.Opcode.FCLASS ) { useRs1 := True }
+ is(p.Opcode.FCVT_X_X ) { useRd := True; useRs1 := True }
+ }
+
+ val uses = List(useRs1, useRs2, useRs3, useRd)
+ val regs = List(input.rs1, input.rs2, input.rs3, input.rd)
+ val rfHits = regs.map(scoreboard.hit.readAsync(_))
+ val rfTargets = regs.map(scoreboard.target.readAsync(_))
+ val rfBusy = (rfHits, rfTargets).zipped.map(_ ^ _)
+
+ val hits = (0 to 3).map(id => uses(id) && rfBusy(id))
+ val hazard = hits.orR || !rf.init.done || commitLogic(portId).pending.full
+ val output = input.haltWhen(hazard)
+ when(input.opcode === p.Opcode.STORE){
+ output.rs1 := input.rs2 //Datapath optimisation to unify rs source in the store pipeline
+ }
+ when(input.valid && rf.init.done){
+ scoreboard.targetWrite.address := input.rd
+ scoreboard.targetWrite.data := !rfTargets.last
+ }
+ when(output.fire && useRd){
+ scoreboard.targetWrite.valid := True
+ commitLogic(portId).pending.inc := True
+ }
+ }
+
+
+ val cmdArbiter = new Area{
+ val arbiter = StreamArbiterFactory.noLock.roundRobin.build(FpuCmd(p), portCount)
+ arbiter.io.inputs <> Vec(scheduler.map(_.output.pipelined(m2s = p.schedulerM2sPipe)))
+
+ val output = arbiter.io.output.swapPayload(RfReadInput())
+ output.source := arbiter.io.chosen
+ output.payload.assignSomeByName(arbiter.io.output.payload)
+ }
+
+ val read = new Area{
+ val s0 = cmdArbiter.output.pipelined()
+ val s1 = s0.m2sPipe()
+ val output = s1.swapPayload(RfReadOutput())
+ val rs = if(p.asyncRegFile){
+ List(s1.rs1, s1.rs2, s1.rs3).map(a => rf.ram.readAsync(s1.source @@ a))
+ } else {
+ List(s0.rs1, s0.rs2, s0.rs3).map(a => rf.ram.readSync(s0.source @@ a, enable = !output.isStall))
+ }
+ output.source := s1.source
+ output.opcode := s1.opcode
+ output.arg := s1.arg
+ output.roundMode := s1.roundMode
+ output.rd := s1.rd
+ output.rs1 := rs(0).value
+ output.rs2 := rs(1).value
+ output.rs3 := rs(2).value
+ if(p.withDouble){
+ output.rs1Boxed := rs(0).boxed
+ output.rs2Boxed := rs(1).boxed
+ output.format := s1.format
+ val store = s1.opcode === FpuOpcode.STORE ||s1.opcode === FpuOpcode.FMV_X_W
+ val sgnjBypass = s1.opcode === FpuOpcode.SGNJ && s1.format === FpuFormat.DOUBLE
+ when(!sgnjBypass) {
+ when(store) { //Pass through
+ output.format := rs(0).boxed ? FpuFormat.FLOAT | FpuFormat.DOUBLE
+ } elsewhen (s1.format === FpuFormat.FLOAT =/= rs(0).boxed) {
+ output.rs1.setNanQuiet
+ output.rs1.sign := False
+ }
+ }
+ when(s1.format === FpuFormat.FLOAT =/= rs(1).boxed) {
+ output.rs2.setNanQuiet
+ output.rs2.sign := False
+ }
+ when(s1.format === FpuFormat.FLOAT =/= rs(2).boxed) {
+ output.rs3.setNanQuiet
+ }
+ }
+ }
+
+ val decode = new Area{
+ val input = read.output/*.s2mPipe()*/.combStage()
+ input.ready := False
+
+ val loadHit = List(FpuOpcode.LOAD, FpuOpcode.FMV_W_X, FpuOpcode.I2F).map(input.opcode === _).orR
+ val load = Stream(LoadInput())
+ load.valid := input.valid && loadHit
+ input.ready setWhen(loadHit && load.ready)
+ load.payload.assignSomeByName(input.payload)
+ load.i2f := input.opcode === FpuOpcode.I2F
+
+ val shortPipHit = List(FpuOpcode.STORE, FpuOpcode.F2I, FpuOpcode.CMP, FpuOpcode.MIN_MAX, FpuOpcode.SGNJ, FpuOpcode.FMV_X_W, FpuOpcode.FCLASS, FpuOpcode.FCVT_X_X).map(input.opcode === _).orR
+ val shortPip = Stream(ShortPipInput())
+ input.ready setWhen(shortPipHit && shortPip.ready)
+ shortPip.valid := input.valid && shortPipHit
+ shortPip.payload.assignSomeByName(input.payload)
+
+ val divSqrtHit = input.opcode === p.Opcode.DIV || input.opcode === p.Opcode.SQRT
+ val divSqrt = Stream(DivSqrtInput())
+ if(p.withDivSqrt) {
+ input.ready setWhen (divSqrtHit && divSqrt.ready)
+ divSqrt.valid := input.valid && divSqrtHit
+ divSqrt.payload.assignSomeByName(input.payload)
+ divSqrt.div := input.opcode === p.Opcode.DIV
+ }
+
+ val divHit = input.opcode === p.Opcode.DIV
+ val div = Stream(DivInput())
+ if(p.withDiv) {
+ input.ready setWhen (divHit && div.ready)
+ div.valid := input.valid && divHit
+ div.payload.assignSomeByName(input.payload)
+ }
+
+ val sqrtHit = input.opcode === p.Opcode.SQRT
+ val sqrt = Stream(SqrtInput())
+ if(p.withSqrt) {
+ input.ready setWhen (sqrtHit && sqrt.ready)
+ sqrt.valid := input.valid && sqrtHit
+ sqrt.payload.assignSomeByName(input.payload)
+ }
+
+
+ val fmaHit = input.opcode === p.Opcode.FMA
+ val mulHit = input.opcode === p.Opcode.MUL || fmaHit
+ val mul = Stream(new MulInput())
+ val divSqrtToMul = Stream(new MulInput())
+ if(!p.withDivSqrt){
+ divSqrtToMul.valid := False
+ divSqrtToMul.payload.assignDontCare()
+ }
+
+ if(p.withMul) {
+ input.ready setWhen (mulHit && mul.ready && !divSqrtToMul.valid)
+ mul.valid := input.valid && mulHit || divSqrtToMul.valid
+
+ divSqrtToMul.ready := mul.ready
+ mul.payload := divSqrtToMul.payload
+ when(!divSqrtToMul.valid) {
+ mul.payload.assignSomeByName(input.payload)
+ mul.add := fmaHit
+ mul.divSqrt := False
+ mul.msb1 := True
+ mul.msb2 := True
+ mul.rs2.sign.allowOverride();
+ mul.rs2.sign := input.rs2.sign ^ input.arg(0)
+ mul.rs3.sign.allowOverride();
+ mul.rs3.sign := input.rs3.sign ^ input.arg(1)
+ }
+ }
+
+ val addHit = input.opcode === p.Opcode.ADD
+ val add = Stream(AddInput())
+ val mulToAdd = Stream(AddInput())
+
+
+ if(p.withAdd) {
+ input.ready setWhen (addHit && add.ready && !mulToAdd.valid)
+ add.valid := input.valid && addHit || mulToAdd.valid
+
+ mulToAdd.ready := add.ready
+ add.payload := mulToAdd.payload
+ when(!mulToAdd.valid) {
+ add.source := input.source
+ add.rd := input.rd
+ add.roundMode := input.roundMode
+ if(p.withDouble) add.format := input.format
+ add.needCommit := True
+ add.rs1.special := input.rs1.special
+ add.rs2.special := input.rs2.special
+ add.rs1.exponent := input.rs1.exponent
+ add.rs2.exponent := input.rs2.exponent
+ add.rs1.sign := input.rs1.sign
+ add.rs2.sign := input.rs2.sign ^ input.arg(0)
+ add.rs1.mantissa := input.rs1.mantissa << addExtraBits
+ add.rs2.mantissa := input.rs2.mantissa << addExtraBits
+ }
+ }
+ }
+
+ val load = new Area{
+
+ case class S0() extends Bundle{
+ val source = Source()
+ val rd = p.rfAddress()
+ val value = p.storeLoadType()
+ val i2f = Bool()
+ val arg = Bits(2 bits)
+ val roundMode = FpuRoundMode()
+ val format = p.withDouble generate FpuFormat()
+ }
+
+ val s0 = new Area{
+ val input = decode.load.pipelined(m2s = true, s2m = true).stage()
+ val filtred = commitFork.load.map(port => port.takeWhen(List(FpuOpcode.LOAD, FpuOpcode.FMV_W_X, FpuOpcode.I2F).map(_ === port.opcode).orR))
+ def feed = filtred(input.source)
+ val hazard = !feed.valid
+
+
+ val output = input.haltWhen(hazard).swapPayload(S0())
+ filtred.foreach(_.ready := False)
+ feed.ready := input.valid && output.ready
+ output.source := input.source
+ output.rd := input.rd
+ output.value := feed.value
+ output.i2f := input.i2f
+ output.arg := input.arg
+ output.roundMode := input.roundMode
+ if(p.withDouble) {
+ output.format := input.format
+ when(!input.i2f && input.format === FpuFormat.DOUBLE && output.value(63 downto 32).andR){ //Detect boxing
+ output.format := FpuFormat.FLOAT
+ }
+ }
+ }
+
+ val s1 = new Area{
+ val input = s0.output.stage()
+ val busy = False
+
+ val f32 = new Area{
+ val mantissa = input.value(0, 23 bits).asUInt
+ val exponent = input.value(23, 8 bits).asUInt
+ val sign = input.value(31)
+ }
+ val f64 = p.withDouble generate new Area{
+ val mantissa = input.value(0, 52 bits).asUInt
+ val exponent = input.value(52, 11 bits).asUInt
+ val sign = input.value(63)
+ }
+
+ val recodedExpOffset = UInt(p.internalExponentSize bits)
+ val passThroughFloat = p.internalFloating()
+ passThroughFloat.special := False
+
+ whenDouble(input.format){
+ passThroughFloat.sign := f64.sign
+ passThroughFloat.exponent := f64.exponent.resized
+ passThroughFloat.mantissa := f64.mantissa
+ recodedExpOffset := exponentF64Subnormal
+ } {
+ passThroughFloat.sign := f32.sign
+ passThroughFloat.exponent := f32.exponent.resized
+ passThroughFloat.mantissa := f32.mantissa << (if (p.withDouble) 29 else 0)
+ recodedExpOffset := exponentF32Subnormal
+ }
+
+
+ val manZero = passThroughFloat.mantissa === 0
+ val expZero = passThroughFloat.exponent === 0
+ val expOne = passThroughFloat.exponent(7 downto 0).andR
+ if(p.withDouble) {
+ expZero.clearWhen(input.format === FpuFormat.DOUBLE && input.value(62 downto 60) =/= 0)
+ expOne.clearWhen(input.format === FpuFormat.DOUBLE && input.value(62 downto 60) =/= 7)
+ }
+
+ val isZero = expZero && manZero
+ val isSubnormal = expZero && !manZero
+ val isInfinity = expOne && manZero
+ val isNan = expOne && !manZero
+
+
+ val fsm = new Area{
+ val done, boot, patched = Reg(Bool())
+ val ohInputWidth = 32 max p.internalMantissaSize
+ val ohInput = Bits(ohInputWidth bits).assignDontCare()
+ when(!input.i2f) {
+ if(!p.withDouble) ohInput := input.value(0, 23 bits) << 9
+ if( p.withDouble) ohInput := passThroughFloat.mantissa.asBits
+ } otherwise {
+ ohInput(ohInputWidth-32-1 downto 0) := 0
+ ohInput(ohInputWidth-32, 32 bits) := input.value(31 downto 0)
+ }
+
+ val i2fZero = Reg(Bool)
+
+ val shift = new Area{
+ val by = Reg(UInt(log2Up(ohInputWidth) bits))
+ val input = UInt(ohInputWidth bits).assignDontCare()
+ var logic = input
+ for(i <- by.range){
+ logic \= by(i) ? (logic |<< (BigInt(1) << i)) | logic
+ }
+ val output = RegNextWhen(logic, !done)
+ }
+ shift.input := (ohInput.asUInt |<< 1).resized
+
+ when(input.valid && (input.i2f || isSubnormal) && !done){
+ busy := True
+ when(boot){
+ when(input.i2f && !patched && input.value(31) && input.arg(0)){
+ input.value.getDrivingReg(0, 32 bits) := B(input.value.asUInt.twoComplement(True).resize(32 bits))
+ patched := True
+ } otherwise {
+ shift.by := OHToUInt(OHMasking.first((ohInput).reversed))
+ boot := False
+ i2fZero := input.value(31 downto 0) === 0
+ }
+ } otherwise {
+ done := True
+ }
+ }
+
+ val expOffset = (UInt(p.internalExponentSize bits))
+ expOffset := 0
+ when(isSubnormal){
+ expOffset := shift.by.resized
+ }
+
+ when(!input.isStall){
+ done := False
+ boot := True
+ patched := False
+ }
+ }
+
+
+ val i2fSign = fsm.patched
+ val (i2fHigh, i2fLow) = fsm.shift.output.splitAt(if(p.withDouble) 0 else widthOf(input.value)-24)
+ val scrap = i2fLow =/= 0
+
+ val recoded = p.internalFloating()
+ recoded.mantissa := passThroughFloat.mantissa
+ recoded.exponent := (passThroughFloat.exponent -^ fsm.expOffset + recodedExpOffset).resized
+ recoded.sign := passThroughFloat.sign
+ recoded.setNormal
+ when(isZero){recoded.setZero}
+ when(isInfinity){recoded.setInfinity}
+ when(isNan){recoded.setNan}
+
+ val output = input.haltWhen(busy).swapPayload(new MergeInput())
+ output.source := input.source
+ output.roundMode := input.roundMode
+ if(p.withDouble) {
+ output.format := input.format
+ }
+ output.rd := input.rd
+ output.value.sign := recoded.sign
+ output.value.exponent := recoded.exponent
+ output.value.mantissa := recoded.mantissa @@ U"0"
+ output.value.special := recoded.special
+ output.scrap := False
+ output.NV := False
+ output.DZ := False
+ when(input.i2f){
+ output.value.sign := i2fSign
+ output.value.exponent := (U(exponentOne+31) - fsm.shift.by).resized
+ output.value.setNormal
+ output.scrap := scrap
+ when(fsm.i2fZero) { output.value.setZero }
+ }
+
+ when(input.i2f || isSubnormal){
+ output.value.mantissa := U(i2fHigh) @@ (if(p.withDouble) U"0" else U"")
+ }
+ }
+
+ }
+
+ val shortPip = new Area{
+ val input = decode.shortPip.stage()
+
+ val toFpuRf = List(FpuOpcode.MIN_MAX, FpuOpcode.SGNJ, FpuOpcode.FCVT_X_X).map(input.opcode === _).orR
+ val rfOutput = Stream(new MergeInput())
+
+ val isCommited = commitConsume(_.short, input.source, input.fire && toFpuRf)
+ val output = rfOutput.haltWhen(!isCommited)
+
+ val result = p.storeLoadType().assignDontCare()
+
+ val halt = False
+ val recodedResult = p.storeLoadType()
+ val f32 = new Area{
+ val exp = (input.rs1.exponent - (exponentOne-127)).resize(8 bits)
+ val man = CombInit(input.rs1.mantissa(if(p.withDouble) 51 downto 29 else 22 downto 0))
+ }
+ val f64 = p.withDouble generate new Area{
+ val exp = (input.rs1.exponent - (exponentOne-1023)).resize(11 bits)
+ val man = CombInit(input.rs1.mantissa)
+ }
+
+ whenDouble(input.format){
+ recodedResult := input.rs1.sign ## f64.exp ## f64.man
+ } {
+ recodedResult := (if(p.withDouble) B"xFFFFFFFF" else B"") ## input.rs1.sign ## f32.exp ## f32.man
+ }
+
+ val expSubnormalThreshold = muxDouble[UInt](input.format)(exponentF64Subnormal)(exponentF32Subnormal)
+ val expInSubnormalRange = input.rs1.exponent <= expSubnormalThreshold
+ val isSubnormal = !input.rs1.special && expInSubnormalRange
+ val isNormal = !input.rs1.special && !expInSubnormalRange
+ val fsm = new Area{
+ val f2iShift = input.rs1.exponent - U(exponentOne)
+ val isF2i = input.opcode === FpuOpcode.F2I
+ val needRecoding = List(FpuOpcode.FMV_X_W, FpuOpcode.STORE).map(_ === input.opcode).orR && isSubnormal
+ val done, boot = Reg(Bool())
+ val isZero = input.rs1.isZero// || input.rs1.exponent < exponentOne-1
+
+ val shift = new Area{
+ val by = Reg(UInt(log2Up(p.internalMantissaSize+1 max 33) bits))
+ val input = UInt(p.internalMantissaSize+1 max 33 bits).assignDontCare()
+ var logic = input
+ val scrap = Reg(Bool)
+ for(i <- by.range.reverse){
+ scrap setWhen(by(i) && logic(0, 1 << i bits) =/= 0)
+ logic \= by(i) ? (logic |>> (BigInt(1) << i)) | logic
+ }
+ when(boot){
+ scrap := False
+ }
+ val output = RegNextWhen(logic, !done)
+ }
+
+ shift.input := (U(!isZero) @@ input.rs1.mantissa) << (if(p.withDouble) 0 else 9)
+
+ val formatShiftOffset = muxDouble[UInt](input.format)(exponentOne-1023+1)(exponentOne - (if(p.withDouble) (127+34) else (127-10)))
+ when(input.valid && (needRecoding || isF2i) && !done){
+ halt := True
+ when(boot){
+ when(isF2i){
+ shift.by := ((U(exponentOne + 31) - input.rs1.exponent).min(U(33)) + (if(p.withDouble) 20 else 0)).resized //TODO merge
+ } otherwise {
+ shift.by := (formatShiftOffset - input.rs1.exponent).resized
+ }
+ boot := False
+ } otherwise {
+ done := True
+ }
+ }
+
+ when(!input.isStall){
+ done := False
+ boot := True
+ }
+ }
+
+ val mantissaForced = False
+ val exponentForced = False
+ val mantissaForcedValue = Bool().assignDontCare()
+ val exponentForcedValue = Bool().assignDontCare()
+ val cononicalForced = False
+
+
+ when(input.rs1.special){
+ switch(input.rs1.exponent(1 downto 0)){
+ is(FpuFloat.ZERO){
+ mantissaForced := True
+ exponentForced := True
+ mantissaForcedValue := False
+ exponentForcedValue := False
+ }
+ is(FpuFloat.INFINITY){
+ mantissaForced := True
+ exponentForced := True
+ mantissaForcedValue := False
+ exponentForcedValue := True
+ }
+ is(FpuFloat.NAN){
+ exponentForced := True
+ exponentForcedValue := True
+ when(input.rs1.isCanonical){
+ cononicalForced := True
+ mantissaForced := True
+ mantissaForcedValue := False
+ }
+ }
+ }
+ }
+
+
+
+ when(isSubnormal){
+ exponentForced := True
+ exponentForcedValue := False
+ recodedResult(0,23 bits) := fsm.shift.output(22 downto 0).asBits
+ whenDouble(input.format){
+ recodedResult(51 downto 23) := fsm.shift.output(51 downto 23).asBits
+ }{}
+ }
+ when(mantissaForced){
+ recodedResult(0,23 bits) := (default -> mantissaForcedValue)
+ whenDouble(input.format){
+ recodedResult(23, 52-23 bits) := (default -> mantissaForcedValue)
+ }{}
+ }
+ when(exponentForced){
+ whenDouble(input.format){
+ recodedResult(52, 11 bits) := (default -> exponentForcedValue)
+ } {
+ recodedResult(23, 8 bits) := (default -> exponentForcedValue)
+ }
+ }
+ when(cononicalForced){
+ whenDouble(input.format){
+ recodedResult(63) := False
+ recodedResult(51) := True
+ } {
+ recodedResult(31) := False
+ recodedResult(22) := True
+ }
+ }
+
+ val rspNv = False
+ val rspNx = False
+
+ val f2i = new Area{ //Will not work for 64 bits float max value rounding
+ val unsigned = fsm.shift.output(32 downto 0) >> 1
+ val resign = input.arg(0) && input.rs1.sign
+ val round = fsm.shift.output(0) ## fsm.shift.scrap
+ val increment = input.roundMode.mux(
+ FpuRoundMode.RNE -> (round(1) && (round(0) || unsigned(0))),
+ FpuRoundMode.RTZ -> False,
+ FpuRoundMode.RDN -> (round =/= 0 && input.rs1.sign),
+ FpuRoundMode.RUP -> (round =/= 0 && !input.rs1.sign),
+ FpuRoundMode.RMM -> (round(1))
+ )
+ val result = (Mux(resign, ~unsigned, unsigned) + (resign ^ increment).asUInt)
+ val overflow = (input.rs1.exponent > (input.arg(0) ? U(exponentOne+30) | U(exponentOne+31)) || input.rs1.isInfinity) && !input.rs1.sign || input.rs1.isNan
+ val underflow = (input.rs1.exponent > U(exponentOne+31) || input.arg(0) && unsigned.msb && (unsigned(30 downto 0) =/= 0 || increment) || !input.arg(0) && (unsigned =/= 0 || increment) || input.rs1.isInfinity) && input.rs1.sign
+ val isZero = input.rs1.isZero
+ if(p.withDouble){
+ overflow setWhen(!input.rs1.sign && increment && unsigned(30 downto 0).andR && (input.arg(0) || unsigned(31)))
+ }
+ when(isZero){
+ result := 0
+ } elsewhen(underflow || overflow) {
+ val low = overflow
+ val high = input.arg(0) ^ overflow
+ result := (31 -> high, default -> low)
+ rspNv := input.valid && input.opcode === FpuOpcode.F2I && fsm.done && !isZero
+ } otherwise {
+ rspNx := input.valid && input.opcode === FpuOpcode.F2I && fsm.done && round =/= 0
+ }
+ }
+
+ val bothZero = input.rs1.isZero && input.rs2.isZero
+ val rs1Equal = input.rs1 === input.rs2
+ val rs1AbsSmaller = (input.rs1.exponent @@ input.rs1.mantissa) < (input.rs2.exponent @@ input.rs2.mantissa)
+ rs1AbsSmaller.setWhen(input.rs2.isInfinity)
+ rs1AbsSmaller.setWhen(input.rs1.isZero)
+ rs1AbsSmaller.clearWhen(input.rs2.isZero)
+ rs1AbsSmaller.clearWhen(input.rs1.isInfinity)
+ rs1Equal setWhen(input.rs1.sign === input.rs2.sign && input.rs1.isInfinity && input.rs2.isInfinity)
+ val rs1Smaller = (input.rs1.sign ## input.rs2.sign).mux(
+ 0 -> rs1AbsSmaller,
+ 1 -> False,
+ 2 -> True,
+ 3 -> (!rs1AbsSmaller && !rs1Equal)
+ )
+
+
+ val minMaxSelectRs2 = !(((rs1Smaller ^ input.arg(0)) && !input.rs1.isNan || input.rs2.isNan))
+ val minMaxSelectNanQuiet = input.rs1.isNan && input.rs2.isNan
+ val cmpResult = B(rs1Smaller && !bothZero && !input.arg(1) || (rs1Equal || bothZero) && !input.arg(0))
+ when(input.rs1.isNan || input.rs2.isNan) { cmpResult := 0 }
+ val sgnjRs1Sign = CombInit(input.rs1.sign)
+ val sgnjRs2Sign = CombInit(input.rs2.sign)
+ if(p.withDouble){
+ sgnjRs2Sign setWhen(input.rs2Boxed && input.format === FpuFormat.DOUBLE)
+ }
+ val sgnjResult = (sgnjRs1Sign && input.arg(1)) ^ sgnjRs2Sign ^ input.arg(0)
+ val fclassResult = B(0, 32 bits)
+ val decoded = input.rs1.decode()
+ fclassResult(0) := input.rs1.sign && decoded.isInfinity
+ fclassResult(1) := input.rs1.sign && isNormal
+ fclassResult(2) := input.rs1.sign && isSubnormal
+ fclassResult(3) := input.rs1.sign && decoded.isZero
+ fclassResult(4) := !input.rs1.sign && decoded.isZero
+ fclassResult(5) := !input.rs1.sign && isSubnormal
+ fclassResult(6) := !input.rs1.sign && isNormal
+ fclassResult(7) := !input.rs1.sign && decoded.isInfinity
+ fclassResult(8) := decoded.isNan && !decoded.isQuiet
+ fclassResult(9) := decoded.isNan && decoded.isQuiet
+
+
+ switch(input.opcode){
+ is(FpuOpcode.STORE) { result := recodedResult }
+ is(FpuOpcode.FMV_X_W) { result := recodedResult }
+ is(FpuOpcode.F2I) { result(31 downto 0) := f2i.result.asBits }
+ is(FpuOpcode.CMP) { result(31 downto 0) := cmpResult.resized }
+ is(FpuOpcode.FCLASS) { result(31 downto 0) := fclassResult.resized }
+ }
+
+
+ rfOutput.valid := input.valid && toFpuRf && !halt
+ rfOutput.source := input.source
+ rfOutput.rd := input.rd
+ rfOutput.roundMode := input.roundMode
+ if(p.withDouble) rfOutput.format := input.format
+ rfOutput.scrap := False
+ rfOutput.value.sign := input.rs1.sign
+ rfOutput.value.exponent := input.rs1.exponent
+ rfOutput.value.mantissa := input.rs1.mantissa @@ U"0"
+ rfOutput.value.special := input.rs1.special
+
+ switch(input.opcode){
+ is(FpuOpcode.MIN_MAX){
+ when(minMaxSelectRs2) {
+ rfOutput.value.sign := input.rs2.sign
+ rfOutput.value.exponent := input.rs2.exponent
+ rfOutput.value.mantissa := input.rs2.mantissa @@ U"0"
+ rfOutput.value.special := input.rs2.special
+ }
+ when(minMaxSelectNanQuiet){
+ rfOutput.value.setNanQuiet
+ }
+ }
+ is(FpuOpcode.SGNJ){
+ when(!input.rs1.isNan) {
+ rfOutput.value.sign := sgnjResult
+ }
+ if(p.withDouble) when(input.rs1Boxed && input.format === FpuFormat.DOUBLE){
+ rfOutput.value.sign := input.rs1.sign
+ rfOutput.format := FpuFormat.FLOAT
+ }
+ }
+ if(p.withDouble) is(FpuOpcode.FCVT_X_X){
+ rfOutput.format := ((input.format === FpuFormat.FLOAT) ? FpuFormat.DOUBLE | FpuFormat.FLOAT)
+ when(input.rs1.isNan){
+ rfOutput.value.setNanQuiet
+ }
+ }
+ }
+
+ val signalQuiet = input.opcode === FpuOpcode.CMP && input.arg =/= 2
+ val rs1Nan = input.rs1.isNan
+ val rs2Nan = input.rs2.isNan
+ val rs1NanNv = input.rs1.isNan && (!input.rs1.isQuiet || signalQuiet)
+ val rs2NanNv = input.rs2.isNan && (!input.rs2.isQuiet || signalQuiet)
+ val NV = List(FpuOpcode.CMP, FpuOpcode.MIN_MAX, FpuOpcode.FCVT_X_X).map(input.opcode === _).orR && rs1NanNv ||
+ List(FpuOpcode.CMP, FpuOpcode.MIN_MAX).map(input.opcode === _).orR && rs2NanNv
+ rspNv setWhen(NV)
+
+ val rspStreams = Vec(Stream(FpuRsp(p)), portCount)
+ input.ready := !halt && (toFpuRf ? rfOutput.ready | rspStreams.map(_.ready).read(input.source))
+ for(i <- 0 until portCount){
+ def rsp = rspStreams(i)
+ rsp.valid := input.valid && input.source === i && !toFpuRf && !halt
+ rsp.value := result
+ rsp.NV := rspNv
+ rsp.NX := rspNx
+ io.port(i).rsp << rsp.stage()
+ }
+
+
+ rfOutput.NV := NV
+ rfOutput.DZ := False
+ }
+
+ val mul = p.withMul generate new Area{
+ val inWidthA = p.internalMantissaSize+1
+ val inWidthB = p.internalMantissaSize+1
+ val outWidth = p.internalMantissaSize*2+2
+
+ case class MulSplit(offsetA : Int, offsetB : Int, widthA : Int, widthB : Int, id : Int){
+ val offsetC = offsetA+offsetB
+ val widthC = widthA + widthB
+ val endC = offsetC+widthC
+ }
+ val splitsUnordered = for(offsetA <- 0 until inWidthA by p.mulWidthA;
+ offsetB <- 0 until inWidthB by p.mulWidthB;
+ widthA = (inWidthA - offsetA) min p.mulWidthA;
+ widthB = (inWidthB - offsetB) min p.mulWidthB) yield {
+ MulSplit(offsetA, offsetB, widthA, widthB, -1)
+ }
+ val splits = splitsUnordered.sortWith(_.endC < _.endC).zipWithIndex.map(e => e._1.copy(id=e._2))
+
+ class MathWithExp extends MulInput{
+ val exp = UInt(p.internalExponentSize+1 bits)
+ }
+ val preMul = new Area{
+ val input = decode.mul.stage()
+ val output = input.swapPayload(new MathWithExp())
+ output.payload.assignSomeByName(input.payload)
+ output.exp := input.rs1.exponent +^ input.rs2.exponent
+ }
+ class MathWithMul extends MathWithExp{
+ val muls = Vec(splits.map(e => UInt(e.widthA + e.widthB bits)))
+ }
+ val mul = new Area{
+ val input = preMul.output.stage()
+ val output = input.swapPayload(new MathWithMul())
+ val mulA = U(input.msb1) @@ input.rs1.mantissa
+ val mulB = U(input.msb2) @@ input.rs2.mantissa
+ output.payload.assignSomeByName(input.payload)
+ splits.foreach(e => output.muls(e.id) := mulA(e.offsetA, e.widthA bits) * mulB(e.offsetB, e.widthB bits))
+ }
+
+ val sumSplitAt = splits.size/2//splits.filter(e => e.endC <= p.internalMantissaSize).size
+
+ class Sum1Output extends MathWithExp{
+ val muls2 = Vec(splits.drop(sumSplitAt).map(e => UInt(e.widthA + e.widthB bits)))
+ val mulC2 = UInt(p.internalMantissaSize*2+2 bits)
+ }
+ class Sum2Output extends MathWithExp{
+ val mulC = UInt(p.internalMantissaSize*2+2 bits)
+ }
+
+ val sum1 = new Area {
+ val input = mul.output.stage()
+ val sum = splits.take(sumSplitAt).map(e => (input.muls(e.id) << e.offsetC).resize(outWidth)).reduceBalancedTree(_ + _)
+
+ val output = input.swapPayload(new Sum1Output())
+ output.payload.assignSomeByName(input.payload)
+ output.mulC2 := sum.resized
+ output.muls2 := Vec(input.muls.drop(sumSplitAt))
+ }
+
+ val sum2 = new Area {
+ val input = sum1.output.stage()
+ val sum = input.mulC2 + splits.drop(sumSplitAt).map(e => (input.muls2(e.id-sumSplitAt) << e.offsetC).resize(outWidth)).reduceBalancedTree(_ + _)
+
+ val isCommited = commitConsume(_.mul, input.source, input.fire)
+ val output = input.haltWhen(!isCommited).swapPayload(new Sum2Output())
+ output.payload.assignSomeByName(input.payload)
+ output.mulC := sum
+ }
+
+ val norm = new Area{
+ val input = sum2.output.stage()
+ val (mulHigh, mulLow) = input.mulC.splitAt(p.internalMantissaSize-1)
+ val scrap = mulLow =/= 0
+ val needShift = mulHigh.msb
+ val exp = input.exp + U(needShift)
+ val man = needShift ? mulHigh(1, p.internalMantissaSize+1 bits) | mulHigh(0, p.internalMantissaSize+1 bits)
+ scrap setWhen(needShift && mulHigh(0))
+ val forceZero = input.rs1.isZero || input.rs2.isZero
+ val underflowThreshold = muxDouble[UInt](input.format)(exponentOne + exponentOne - 1023 - 53) (exponentOne + exponentOne - 127 - 24)
+ val underflowExp = muxDouble[UInt](input.format)(exponentOne - 1023 - 54) (exponentOne - 127 - 25)
+ val forceUnderflow = exp < underflowThreshold
+ val forceOverflow = input.rs1.isInfinity || input.rs2.isInfinity
+ val infinitynan = ((input.rs1.isInfinity || input.rs2.isInfinity) && (input.rs1.isZero || input.rs2.isZero))
+ val forceNan = input.rs1.isNan || input.rs2.isNan || infinitynan
+
+ val output = p.writeFloating()
+ output.sign := input.rs1.sign ^ input.rs2.sign
+ output.exponent := (exp - exponentOne).resized
+ output.mantissa := man.asUInt
+ output.setNormal
+ val NV = False
+
+ when(exp(exp.getWidth-3, 3 bits) >= 5) { output.exponent(p.internalExponentSize-2, 2 bits) := 3 }
+
+ when(forceNan) {
+ output.setNanQuiet
+ NV setWhen(infinitynan || input.rs1.isNanSignaling || input.rs2.isNanSignaling)
+ } elsewhen(forceOverflow) {
+ output.setInfinity
+ } elsewhen(forceZero) {
+ output.setZero
+ } elsewhen(forceUnderflow) {
+ output.exponent := underflowExp.resized
+ }
+ }
+
+ val result = new Area {
+ def input = norm.input
+ def NV = norm.NV
+
+ val notMul = new Area {
+ val output = Flow(UInt(p.internalMantissaSize + 1 bits))
+ output.valid := input.valid && input.divSqrt
+ output.payload := input.mulC(p.internalMantissaSize, p.internalMantissaSize + 1 bits)
+ }
+
+ val output = Stream(new MergeInput())
+ output.valid := input.valid && !input.add && !input.divSqrt
+ output.source := input.source
+ output.rd := input.rd
+ if (p.withDouble) output.format := input.format
+ output.roundMode := input.roundMode
+ output.scrap := norm.scrap
+ output.value := norm.output
+ output.NV := NV
+ output.DZ := False
+
+ val mulToAdd = Stream(AddInput())
+ decode.mulToAdd << mulToAdd.stage()
+
+ mulToAdd.valid := input.valid && input.add
+ mulToAdd.source := input.source
+ mulToAdd.rs1.mantissa := norm.output.mantissa @@ norm.scrap //FMA Precision lost
+ mulToAdd.rs1.exponent := norm.output.exponent
+ mulToAdd.rs1.sign := norm.output.sign
+ mulToAdd.rs1.special := norm.output.special
+ mulToAdd.rs2 := input.rs3
+ mulToAdd.rs2.mantissa.removeAssignments() := input.rs3.mantissa << addExtraBits
+ mulToAdd.rd := input.rd
+ mulToAdd.roundMode := input.roundMode
+ mulToAdd.needCommit := False
+ if (p.withDouble) mulToAdd.format := input.format
+
+ when(NV){
+ mulToAdd.rs1.mantissa.msb := False
+ }
+
+ input.ready := (input.add ? mulToAdd.ready | output.ready) || input.divSqrt
+ }
+ }
+
+
+ val div = p.withDiv generate new Area{
+ val input = decode.div.halfPipe()
+ val haltIt = True
+ val isCommited = RegNext(commitConsume(_.div, input.source, input.fire))
+ val output = input.haltWhen(haltIt || !isCommited).swapPayload(new MergeInput())
+
+ val dividerShift = if(p.withDouble) 0 else 1
+ val divider = FpuDiv(p.internalMantissaSize + dividerShift)
+ divider.io.input.a := input.rs1.mantissa << dividerShift
+ divider.io.input.b := input.rs2.mantissa << dividerShift
+ val dividerResult = divider.io.output.result >> dividerShift
+ val dividerScrap = divider.io.output.remain =/= 0 || divider.io.output.result(0, dividerShift bits) =/= 0
+
+ val cmdSent = RegInit(False) setWhen(divider.io.input.fire) clearWhen(!haltIt)
+ divider.io.input.valid := input.valid && !cmdSent
+ divider.io.output.ready := input.ready
+ output.payload.assignSomeByName(input.payload)
+
+ val needShift = !dividerResult.msb
+ val mantissa = needShift ? dividerResult(0, p.internalMantissaSize + 1 bits) | dividerResult(1, p.internalMantissaSize + 1 bits)
+ val scrap = dividerScrap || !needShift && dividerResult(0)
+ val exponentOffset = 1 << (p.internalExponentSize + 1)
+ val exponent = input.rs1.exponent + U(exponentOffset | exponentOne) - input.rs2.exponent - U(needShift)
+
+ output.value.setNormal
+ output.value.sign := input.rs1.sign ^ input.rs2.sign
+ output.value.exponent := exponent.resized
+ output.value.mantissa := mantissa
+ output.scrap := scrap
+ when(exponent.takeHigh(2) === 3){ output.value.exponent(p.internalExponentSize-3, 3 bits) := 7} //Handle overflow
+
+
+
+ val underflowThreshold = muxDouble[UInt](input.format)(exponentOne + exponentOffset - 1023 - 53) (exponentOne + exponentOffset - 127 - 24)
+ val underflowExp = muxDouble[UInt](input.format)(exponentOne + exponentOffset - 1023 - 54) (exponentOne + exponentOffset - 127 - 25)
+ val forceUnderflow = exponent < underflowThreshold
+ val forceOverflow = input.rs1.isInfinity || input.rs2.isZero
+ val infinitynan = input.rs1.isZero && input.rs2.isZero || input.rs1.isInfinity && input.rs2.isInfinity
+ val forceNan = input.rs1.isNan || input.rs2.isNan || infinitynan
+ val forceZero = input.rs1.isZero || input.rs2.isInfinity
+
+
+
+ output.NV := False
+ output.DZ := !forceNan && !input.rs1.isInfinity && input.rs2.isZero
+
+ when(exponent(exponent.getWidth-3, 3 bits) === 7) { output.value.exponent(p.internalExponentSize-2, 2 bits) := 3 }
+
+ when(forceNan) {
+ output.value.setNanQuiet
+ output.NV setWhen((infinitynan || input.rs1.isNanSignaling || input.rs2.isNanSignaling))
+ } elsewhen(forceOverflow) {
+ output.value.setInfinity
+ } elsewhen(forceZero) {
+ output.value.setZero
+ } elsewhen(forceUnderflow) {
+ output.value.exponent := underflowExp.resized
+ }
+
+
+ haltIt clearWhen(divider.io.output.valid)
+ }
+
+
+
+ val sqrt = p.withSqrt generate new Area{
+ val input = decode.sqrt.halfPipe()
+ val haltIt = True
+ val isCommited = RegNext(commitConsume(_.sqrt, input.source, input.fire))
+ val output = input.haltWhen(haltIt || !isCommited).swapPayload(new MergeInput())
+
+ val needShift = !input.rs1.exponent.lsb
+ val sqrt = FpuSqrt(p.internalMantissaSize)
+ sqrt.io.input.a := (needShift ? (U"1" @@ input.rs1.mantissa @@ U"0") | (U"01" @@ input.rs1.mantissa))
+
+ val cmdSent = RegInit(False) setWhen(sqrt.io.input.fire) clearWhen(!haltIt)
+ sqrt.io.input.valid := input.valid && !cmdSent
+ sqrt.io.output.ready := input.ready
+ output.payload.assignSomeByName(input.payload)
+
+
+ val scrap = sqrt.io.output.remain =/= 0
+ val exponent = RegNext(exponentOne-exponentOne/2 -1 +^ (input.rs1.exponent >> 1) + U(input.rs1.exponent.lsb))
+
+ output.value.setNormal
+ output.value.sign := input.rs1.sign
+ output.value.exponent := exponent
+ output.value.mantissa := sqrt.io.output.result
+ output.scrap := scrap
+ output.NV := False
+ output.DZ := False
+
+ val negative = !input.rs1.isNan && !input.rs1.isZero && input.rs1.sign
+
+ when(input.rs1.isInfinity){
+ output.value.setInfinity
+ }
+ when(negative){
+ output.value.setNanQuiet
+ output.NV := True
+ }
+ when(input.rs1.isNan){
+ output.value.setNanQuiet
+ output.NV := !input.rs1.isQuiet
+ }
+ when(input.rs1.isZero){
+ output.value.setZero
+ }
+
+
+// val underflowThreshold = muxDouble[UInt](input.format)(exponentOne + exponentOffset - 1023 - 53) (exponentOne + exponentOffset - 127 - 24)
+// val underflowExp = muxDouble[UInt](input.format)(exponentOne + exponentOffset - 1023 - 54) (exponentOne + exponentOffset - 127 - 25)
+// val forceUnderflow = exponent < underflowThreshold
+// val forceOverflow = input.rs1.isInfinity// || input.rs2.isInfinity
+// val infinitynan = input.rs1.isZero && input.rs2.isZero
+// val forceNan = input.rs1.isNan || input.rs2.isNan || infinitynan
+// val forceZero = input.rs1.isZero
+//
+//
+//
+// output.NV := False
+// output.DZ := !forceNan && input.rs2.isZero
+//
+// when(exponent(exponent.getWidth-3, 3 bits) === 7) { output.value.exponent(p.internalExponentSize-2, 2 bits) := 3 }
+//
+// when(forceNan) {
+// output.value.setNanQuiet
+// output.NV setWhen((infinitynan || input.rs1.isNanSignaling || input.rs2.isNanSignaling))
+// } elsewhen(forceOverflow) {
+// output.value.setInfinity
+// } elsewhen(forceZero) {
+// output.value.setZero
+// } elsewhen(forceUnderflow) {
+// output.value.exponent := underflowExp.resized
+// }
+
+
+ haltIt clearWhen(sqrt.io.output.valid)
+ }
+
+ //divSqrt isn't realy used anymore
+ val divSqrt = p.withDivSqrt generate new Area {
+ val input = decode.divSqrt.halfPipe()
+ assert(false, "Need to implement commit tracking")
+ val aproxWidth = 8
+ val aproxDepth = 64
+ val divIterationCount = 3
+ val sqrtIterationCount = 3
+
+ val mulWidth = p.internalMantissaSize + 1
+
+ import FpuDivSqrtIterationState._
+ val state = RegInit(FpuDivSqrtIterationState.IDLE())
+ val iteration = Reg(UInt(log2Up(divIterationCount max sqrtIterationCount) bits))
+
+ decode.divSqrtToMul.valid := False
+ decode.divSqrtToMul.source := input.source
+ decode.divSqrtToMul.rs1.assignDontCare()
+ decode.divSqrtToMul.rs2.assignDontCare()
+ decode.divSqrtToMul.rs3.assignDontCare()
+ decode.divSqrtToMul.rd := input.rd
+ decode.divSqrtToMul.add := False
+ decode.divSqrtToMul.divSqrt := True
+ decode.divSqrtToMul.msb1 := True
+ decode.divSqrtToMul.msb2 := True
+ decode.divSqrtToMul.rs1.special := False //TODO
+ decode.divSqrtToMul.rs2.special := False
+ decode.divSqrtToMul.roundMode := input.roundMode
+ if(p.withDouble) decode.divSqrtToMul.format := input.format
+
+
+ val aprox = new Area {
+ val rom = Mem(UInt(aproxWidth bits), aproxDepth * 2)
+ val divTable, sqrtTable = ArrayBuffer[Double]()
+ for(i <- 0 until aproxDepth){
+ val value = 1+(i+0.5)/aproxDepth
+ divTable += 1/value
+ }
+ for(i <- 0 until aproxDepth){
+ val scale = if(i < aproxDepth/2) 2 else 1
+ val value = scale+(scale*(i%(aproxDepth/2)+0.5)/aproxDepth*2)
+// println(s"$i => $value" )
+ sqrtTable += 1/Math.sqrt(value)
+ }
+ val romElaboration = (sqrtTable ++ divTable).map(v => BigInt(((v-0.5)*2*(1 << aproxWidth)).round))
+
+ rom.initBigInt(romElaboration)
+ val div = input.rs2.mantissa.takeHigh(log2Up(aproxDepth))
+ val sqrt = U(input.rs1.exponent.lsb ## input.rs1.mantissa).takeHigh(log2Up(aproxDepth))
+ val address = U(input.div ## (input.div ? div | sqrt))
+ val raw = rom.readAsync(address)
+ val result = U"01" @@ (raw << (mulWidth-aproxWidth-2))
+ }
+
+ val divExp = new Area{
+ val value = (1 << p.internalExponentSize) - 3 - input.rs2.exponent
+ }
+ val sqrtExp = new Area{
+ val value = ((1 << p.internalExponentSize-1) + (1 << p.internalExponentSize-2) - 2 -1) - (input.rs1.exponent >> 1) + U(!input.rs1.exponent.lsb)
+ }
+
+ def mulArg(rs1 : UInt, rs2 : UInt): Unit ={
+ decode.divSqrtToMul.rs1.mantissa := rs1.resized
+ decode.divSqrtToMul.rs2.mantissa := rs2.resized
+ decode.divSqrtToMul.msb1 := rs1.msb
+ decode.divSqrtToMul.msb2 := rs2.msb
+ }
+
+ val mulBuffer = mul.result.notMul.output.toStream.stage
+ mulBuffer.ready := False
+
+ val iterationValue = Reg(UInt(mulWidth bits))
+
+ input.ready := False
+ switch(state){
+ is(IDLE){
+ iterationValue := aprox.result
+ iteration := 0
+ when(input.valid) {
+ state := YY
+ }
+ }
+ is(YY){
+ decode.divSqrtToMul.valid := True
+ mulArg(iterationValue, iterationValue)
+ when(decode.divSqrtToMul.ready) {
+ state := XYY
+ }
+ }
+ is(XYY){
+ decode.divSqrtToMul.valid := mulBuffer.valid
+ val sqrtIn = !input.rs1.exponent.lsb ? (U"1" @@ input.rs1.mantissa) | ((U"1" @@ input.rs1.mantissa) |>> 1)
+ val divIn = U"1" @@ input.rs2.mantissa
+ mulArg(input.div ? divIn| sqrtIn, mulBuffer.payload)
+ when(mulBuffer.valid && decode.divSqrtToMul.ready) {
+ state := (input.div ? Y2_XYY | _15_XYY2)
+ mulBuffer.ready := True
+ }
+ }
+ is(Y2_XYY){
+ mulBuffer.ready := True
+ when(mulBuffer.valid) {
+ iterationValue := ((iterationValue << 1) - mulBuffer.payload).resized
+ mulBuffer.ready := True
+ iteration := iteration + 1
+ when(iteration =/= divIterationCount-1){ //TODO
+ state := YY
+ } otherwise {
+ state := DIV
+ }
+ }
+ }
+ is(DIV){
+ decode.divSqrtToMul.valid := True
+ decode.divSqrtToMul.divSqrt := False
+ decode.divSqrtToMul.rs1 := input.rs1
+ decode.divSqrtToMul.rs2.sign := input.rs2.sign
+ decode.divSqrtToMul.rs2.exponent := divExp.value + iterationValue.msb.asUInt
+ decode.divSqrtToMul.rs2.mantissa := (iterationValue << 1).resized
+ val zero = input.rs2.isInfinity
+ val overflow = input.rs2.isZero
+ val nan = input.rs2.isNan || (input.rs1.isZero && input.rs2.isZero)
+
+ when(nan){
+ decode.divSqrtToMul.rs2.setNanQuiet
+ } elsewhen(overflow) {
+ decode.divSqrtToMul.rs2.setInfinity
+ } elsewhen(zero) {
+ decode.divSqrtToMul.rs2.setZero
+ }
+ when(decode.divSqrtToMul.ready) {
+ state := IDLE
+ input.ready := True
+ }
+ }
+ is(_15_XYY2){
+ when(mulBuffer.valid) {
+ state := Y_15_XYY2
+ mulBuffer.payload.getDrivingReg := (U"11" << mulWidth-2) - (mulBuffer.payload)
+ }
+ }
+ is(Y_15_XYY2){
+ decode.divSqrtToMul.valid := True
+ mulArg(iterationValue, mulBuffer.payload)
+ when(decode.divSqrtToMul.ready) {
+ mulBuffer.ready := True
+ state := Y_15_XYY2_RESULT
+ }
+ }
+ is(Y_15_XYY2_RESULT){
+ iterationValue := mulBuffer.payload
+ mulBuffer.ready := True
+ when(mulBuffer.valid) {
+ iteration := iteration + 1
+ when(iteration =/= sqrtIterationCount-1){
+ state := YY
+ } otherwise {
+ state := SQRT
+ }
+ }
+ }
+ is(SQRT){
+ decode.divSqrtToMul.valid := True
+ decode.divSqrtToMul.divSqrt := False
+ decode.divSqrtToMul.rs1 := input.rs1
+ decode.divSqrtToMul.rs2.sign := False
+ decode.divSqrtToMul.rs2.exponent := sqrtExp.value + iterationValue.msb.asUInt
+ decode.divSqrtToMul.rs2.mantissa := (iterationValue << 1).resized
+
+ val nan = input.rs1.sign && !input.rs1.isZero
+
+ when(nan){
+ decode.divSqrtToMul.rs2.setNanQuiet
+ }
+
+ when(decode.divSqrtToMul.ready) {
+ state := IDLE
+ input.ready := True
+ }
+ }
+ }
+ }
+
+ val add = p.withAdd generate new Area{
+
+
+ class PreShifterOutput extends AddInput{
+ val absRs1Bigger = Bool()
+ val rs1ExponentBigger = Bool()
+ }
+
+ val preShifter = new Area{
+ val input = decode.add.combStage()
+ val output = input.swapPayload(new PreShifterOutput)
+
+ val exp21 = input.rs2.exponent -^ input.rs1.exponent
+ val rs1ExponentBigger = (exp21.msb || input.rs2.isZero) && !input.rs1.isZero
+ val rs1ExponentEqual = input.rs1.exponent === input.rs2.exponent
+ val rs1MantissaBigger = input.rs1.mantissa > input.rs2.mantissa
+ val absRs1Bigger = ((rs1ExponentBigger || rs1ExponentEqual && rs1MantissaBigger) && !input.rs1.isZero || input.rs1.isInfinity) && !input.rs2.isInfinity
+
+ output.payload.assignSomeByName(input.payload)
+ output.absRs1Bigger := absRs1Bigger
+ output.rs1ExponentBigger := rs1ExponentBigger
+ }
+
+ class ShifterOutput extends AddInput{
+ val xSign, ySign = Bool()
+ val xMantissa, yMantissa = UInt(p.internalMantissaSize+1+addExtraBits bits)
+ val xyExponent = UInt(p.internalExponentSize bits)
+ val xySign = Bool()
+ val roundingScrap = Bool()
+ }
+
+ val shifter = new Area {
+ val input = preShifter.output.stage()
+ val output = input.swapPayload(new ShifterOutput)
+ output.payload.assignSomeByName(input.payload)
+
+ val exp21 = input.rs2.exponent -^ input.rs1.exponent
+ val shiftBy = exp21.asSInt.abs//rs1ExponentBigger ? (0-exp21) | exp21
+ val shiftOverflow = (shiftBy >= p.internalMantissaSize+1+addExtraBits)
+ val passThrough = shiftOverflow || (input.rs1.isZero) || (input.rs2.isZero)
+
+ def absRs1Bigger = input.absRs1Bigger
+ def rs1ExponentBigger = input.rs1ExponentBigger
+
+ //Note that rs1ExponentBigger can be replaced by absRs1Bigger bellow to avoid xsigned two complement in math block at expense of combinatorial path
+ val xySign = absRs1Bigger ? input.rs1.sign | input.rs2.sign
+ output.xSign := xySign ^ (rs1ExponentBigger ? input.rs1.sign | input.rs2.sign)
+ output.ySign := xySign ^ (rs1ExponentBigger ? input.rs2.sign | input.rs1.sign)
+ val xMantissa = U"1" @@ (rs1ExponentBigger ? input.rs1.mantissa | input.rs2.mantissa)
+ val yMantissaUnshifted = U"1" @@ (rs1ExponentBigger ? input.rs2.mantissa | input.rs1.mantissa)
+ var yMantissa = CombInit(yMantissaUnshifted)
+ val roundingScrap = False
+ for(i <- log2Up(p.internalMantissaSize) - 1 downto 0){
+ roundingScrap setWhen(shiftBy(i) && yMantissa(0, 1 << i bits) =/= 0)
+ yMantissa \= shiftBy(i) ? (yMantissa |>> (BigInt(1) << i)) | yMantissa
+ }
+ when(passThrough) { yMantissa := 0 }
+ when(shiftOverflow) { roundingScrap := True }
+ when(input.rs1.special || input.rs2.special){ roundingScrap := False }
+ output.xyExponent := rs1ExponentBigger ? input.rs1.exponent | input.rs2.exponent
+ output.xMantissa := xMantissa
+ output.yMantissa := yMantissa
+ output.xySign := xySign
+ output.roundingScrap := roundingScrap
+ }
+
+ class MathOutput extends ShifterOutput{
+ val xyMantissa = UInt(p.internalMantissaSize+1+addExtraBits+1 bits)
+ }
+
+ val math = new Area {
+ val input = shifter.output.stage()
+ val output = input.swapPayload(new MathOutput)
+ output.payload.assignSomeByName(input.payload)
+ import input.payload._
+
+ val xSigned = xMantissa.twoComplement(xSign) //TODO Is that necessary ?
+ val ySigned = ((ySign ## Mux(ySign, ~yMantissa, yMantissa)).asUInt + (ySign && !roundingScrap).asUInt).asSInt //rounding here
+ output.xyMantissa := U(xSigned +^ ySigned).trim(1 bits)
+
+ }
+
+ class OhOutput extends MathOutput{
+ val shift = UInt(log2Up(p.internalMantissaSize+1+addExtraBits+1) bits)
+ }
+
+ val oh = new Area {
+ val input = math.output.stage()
+ val isCommited = commitConsume(_.add, input.source, input.fire && input.needCommit)
+ val output = input.haltWhen(input.needCommit && !isCommited).swapPayload(new OhOutput)
+ output.payload.assignSomeByName(input.payload)
+ import input.payload._
+
+ val shiftOh = OHMasking.first(output.xyMantissa.asBools.reverse) //The OhMasking.first can be processed in parallel to the xyMantissa carry chaine
+// output.shiftOh := shiftOh
+
+ val shift = OHToUInt(shiftOh)
+ output.shift := shift
+ }
+
+
+ class NormOutput extends AddInput{
+ val mantissa = UInt(p.internalMantissaSize+1+addExtraBits+1 bits)
+ val exponent = UInt(p.internalExponentSize+1 bits)
+ val infinityNan, forceNan, forceZero, forceInfinity = Bool()
+ val xySign, roundingScrap = Bool()
+ val xyMantissaZero = Bool()
+ }
+
+ val norm = new Area{
+ val input = oh.output.stage()
+ val output = input.swapPayload(new NormOutput)
+ output.payload.assignSomeByName(input.payload)
+ import input.payload._
+
+ output.mantissa := (xyMantissa |<< shift)
+ output.exponent := xyExponent -^ shift + 1
+ output.forceInfinity := (input.rs1.isInfinity || input.rs2.isInfinity)
+ output.forceZero := xyMantissa === 0 || (input.rs1.isZero && input.rs2.isZero)
+ output.infinityNan := (input.rs1.isInfinity && input.rs2.isInfinity && (input.rs1.sign ^ input.rs2.sign))
+ output.forceNan := input.rs1.isNan || input.rs2.isNan || output.infinityNan
+ output.xyMantissaZero := xyMantissa === 0
+ }
+
+ val result = new Area {
+ val input = norm.output.pipelined()
+ val output = input.swapPayload(new MergeInput())
+ import input.payload._
+
+ output.source := input.source
+ output.rd := input.rd
+ output.value.sign := xySign
+ output.value.mantissa := (mantissa >> addExtraBits).resized
+ output.value.exponent := exponent.resized
+ output.value.special := False
+ output.roundMode := input.roundMode
+ if (p.withDouble) output.format := input.format
+ output.scrap := (mantissa(1) | mantissa(0) | roundingScrap)
+
+ output.NV := infinityNan || input.rs1.isNanSignaling || input.rs2.isNanSignaling
+ output.DZ := False
+ when(forceNan) {
+ output.value.setNanQuiet
+ } elsewhen (forceInfinity) {
+ output.value.setInfinity
+ } elsewhen (forceZero) {
+ output.value.setZero
+ when(xyMantissaZero || input.rs1.isZero && input.rs2.isZero) {
+ output.value.sign := input.rs1.sign && input.rs2.sign
+ }
+ when((input.rs1.sign || input.rs2.sign) && input.roundMode === FpuRoundMode.RDN) {
+ output.value.sign := True
+ }
+ }
+ }
+ }
+
+
+ val merge = new Area {
+ val inputs = ArrayBuffer[Stream[MergeInput]]()
+ inputs += load.s1.output.stage()
+ if(p.withSqrt) (inputs += sqrt.output)
+ if(p.withDiv) (inputs += div.output)
+ if(p.withAdd) (inputs += add.result.output)
+ if(p.withMul) (inputs += mul.result.output)
+ if(p.withShortPipMisc) (inputs += shortPip.output.pipelined(m2s = true))
+ val arbitrated = StreamArbiterFactory.lowerFirst.noLock.on(inputs).toFlow
+ }
+
+ class RoundFront extends MergeInput{
+ val mantissaIncrement = Bool()
+ val roundAdjusted = Bits(2 bits)
+ val exactMask = UInt(p.internalMantissaSize + 2 bits)
+ }
+
+ val roundFront = new Area {
+ val input = merge.arbitrated.stage()
+ val output = input.swapPayload(new RoundFront())
+ output.payload.assignSomeByName(input.payload)
+
+ val manAggregate = input.value.mantissa @@ input.scrap
+ val expBase = muxDouble[UInt](input.format)(exponentF64Subnormal + 1)(exponentF32Subnormal + 1)
+ val expDif = expBase -^ input.value.exponent
+ val expSubnormal = !expDif.msb
+ var discardCount = (expSubnormal ? expDif.resize(log2Up(p.internalMantissaSize) bits) | U(0))
+ if (p.withDouble) when(input.format === FpuFormat.FLOAT) {
+ discardCount \= discardCount + 29
+ }
+ val exactMask = (List(True) ++ (0 until p.internalMantissaSize + 1).map(_ < discardCount)).asBits.asUInt
+ val roundAdjusted = (True ## (manAggregate >> 1)) (discardCount) ## ((manAggregate & exactMask) =/= 0)
+
+ val mantissaIncrement = !input.value.special && input.roundMode.mux(
+ FpuRoundMode.RNE -> (roundAdjusted(1) && (roundAdjusted(0) || (U"01" ## (manAggregate >> 2)) (discardCount))),
+ FpuRoundMode.RTZ -> False,
+ FpuRoundMode.RDN -> (roundAdjusted =/= 0 && input.value.sign),
+ FpuRoundMode.RUP -> (roundAdjusted =/= 0 && !input.value.sign),
+ FpuRoundMode.RMM -> (roundAdjusted(1))
+ )
+
+ output.mantissaIncrement := mantissaIncrement
+ output.roundAdjusted := roundAdjusted
+ output.exactMask := exactMask
+ }
+
+ val roundBack = new Area{
+ val input = roundFront.output.stage()
+ val output = input.swapPayload(RoundOutput())
+ import input.payload._
+
+ val math = p.internalFloating()
+ val mantissaRange = p.internalMantissaSize downto 1
+ val adderMantissa = input.value.mantissa(mantissaRange) & (mantissaIncrement ? ~(exactMask.trim(1) >> 1) | input.value.mantissa(mantissaRange).maxValue)
+ val adderRightOp = (mantissaIncrement ? (exactMask >> 1)| U(0)).resize(p.internalMantissaSize bits)
+ val adder = KeepAttribute(KeepAttribute(input.value.exponent @@ adderMantissa) + KeepAttribute(adderRightOp) + KeepAttribute(U(mantissaIncrement)))
+ math.special := input.value.special
+ math.sign := input.value.sign
+ math.exponent := adder(p.internalMantissaSize, p.internalExponentSize bits)
+ math.mantissa := adder(0, p.internalMantissaSize bits)
+
+ val patched = CombInit(math)
+ val nx,of,uf = False
+
+ val ufSubnormalThreshold = muxDouble[UInt](input.format)(exponentF64Subnormal)(exponentF32Subnormal)
+ val ufThreshold = muxDouble[UInt](input.format)(exponentF64Subnormal-52+1)(exponentF32Subnormal-23+1)
+ val ofThreshold = muxDouble[UInt](input.format)(exponentF64Infinity-1)(exponentF32Infinity-1)
+
+ //catch exact 1.17549435E-38 underflow, but, who realy care ?
+// val borringCase = input.value.exponent === ufSubnormalThreshold && roundAdjusted.asUInt < U"11"
+// when(!math.special && (math.exponent <= ufSubnormalThreshold || borringCase) && roundAdjusted.asUInt =/= 0){
+// uf := True
+// }
+ val threshold = input.roundMode.mux(
+ FpuRoundMode.RNE -> U"110",
+ FpuRoundMode.RTZ -> U"110",
+ FpuRoundMode.RDN -> (input.value.sign ? U"101" | U"111"),
+ FpuRoundMode.RUP -> (input.value.sign ? U"111" | U"101"),
+ FpuRoundMode.RMM -> U"110"
+ )
+ val borringRound = (input.value.mantissa(1 downto 0) ## input.scrap)
+ if(p.withDouble) when(input.format === FpuFormat.FLOAT) { borringRound := (input.value.mantissa(30 downto 29) ## input.value.mantissa(28 downto 0).orR)}
+
+ val borringCase = input.value.exponent === ufSubnormalThreshold && borringRound.asUInt < threshold
+ when(!math.special && (math.exponent <= ufSubnormalThreshold || borringCase) && roundAdjusted.asUInt =/= 0){
+ uf := True
+ }
+ when(!math.special && math.exponent > ofThreshold){
+ nx := True
+ of := True
+ val doMax = input.roundMode.mux(
+ FpuRoundMode.RNE -> (False),
+ FpuRoundMode.RTZ -> (True),
+ FpuRoundMode.RDN -> (!math.sign),
+ FpuRoundMode.RUP -> (math.sign),
+ FpuRoundMode.RMM -> (False)
+ )
+ when(doMax){
+ patched.exponent := ofThreshold
+ patched.mantissa.setAll()
+ } otherwise {
+ patched.setInfinity
+ }
+ }
+
+
+ when(!math.special && math.exponent < ufThreshold){
+ nx := True
+ uf := True
+ val doMin = input.roundMode.mux(
+ FpuRoundMode.RNE -> (False),
+ FpuRoundMode.RTZ -> (False),
+ FpuRoundMode.RDN -> (math.sign),
+ FpuRoundMode.RUP -> (!math.sign),
+ FpuRoundMode.RMM -> (False)
+ )
+ when(doMin){
+ patched.exponent := ufThreshold.resized
+ patched.mantissa := 0
+ } otherwise {
+ patched.setZero
+ }
+ }
+
+
+ nx setWhen(!input.value.special && (roundAdjusted =/= 0))
+ val writes = rf.scoreboards.map(_.writes.readAsync(input.rd))
+ val write = writes.toList.read(input.source)
+ output.NX := nx & write
+ output.OF := of & write
+ output.UF := uf & write
+ output.NV := input.NV & write
+ output.DZ := input.DZ & write
+ output.source := input.source
+ output.rd := input.rd
+ output.write := write
+ if(p.withDouble) output.format := input.format
+ output.value := patched
+ }
+
+ val writeback = new Area{
+ val input = roundBack.output.stage()
+
+ for(i <- 0 until portCount){
+ val c = io.port(i).completion
+ c.valid := input.fire && input.source === i
+ c.flags.NX := input.NX
+ c.flags.OF := input.OF
+ c.flags.UF := input.UF
+ c.flags.NV := input.NV
+ c.flags.DZ := input.DZ
+ c.written := input.write
+ }
+
+ when(input.valid){
+ for(i <- 0 until portCount) {
+ val port = rf.scoreboards(i).hitWrite
+ port.valid setWhen(input.source === i)
+ port.address := input.rd
+ port.data := !rf.scoreboards(i).hit(input.rd) //TODO improve
+ }
+ }
+
+ val port = rf.ram.writePort
+ port.valid := input.valid && input.write
+ port.address := input.source @@ input.rd
+ port.data.value := input.value
+ if(p.withDouble) port.data.boxed := input.format === FpuFormat.FLOAT
+
+ val randomSim = p.sim generate (in UInt(p.internalMantissaSize bits))
+ if(p.sim) when(port.data.value.isZero || port.data.value.isInfinity){
+ port.data.value.mantissa := randomSim
+ }
+ if(p.sim) when(input.value.special){
+ port.data.value.exponent(p.internalExponentSize-1 downto 3) := randomSim.resized
+ when(!input.value.isNan){
+ port.data.value.exponent(2 downto 2) := randomSim.resized
+ }
+ }
+
+ when(port.valid){
+ assert(!(port.data.value.exponent === 0 && !port.data.value.special), "Special violation")
+ assert(!(port.data.value.exponent === port.data.value.exponent.maxValue && !port.data.value.special), "Special violation")
+ }
+ }
+}
+
+
+
+
+object FpuSynthesisBench extends App{
+ val payloadType = HardType(Bits(8 bits))
+ class Fpu(name : String, portCount : Int, p : FpuParameter) extends Rtl{
+ override def getName(): String = "Fpu_" + name
+ override def getRtlPath(): String = getName() + ".v"
+ SpinalVerilog(new FpuCore(portCount, p){
+
+ setDefinitionName(Fpu.this.getName())
+ })
+ }
+
+ class Shifter(width : Int) extends Rtl{
+ override def getName(): String = "shifter_" + width
+ override def getRtlPath(): String = getName() + ".v"
+ SpinalVerilog(new Component{
+ val a = in UInt(width bits)
+ val sel = in UInt(log2Up(width) bits)
+ val result = out(a >> sel)
+ setDefinitionName(Shifter.this.getName())
+ })
+ }
+
+ class Rotate(width : Int) extends Rtl{
+ override def getName(): String = "rotate_" + width
+ override def getRtlPath(): String = getName() + ".v"
+ SpinalVerilog(new Component{
+ val a = in UInt(width bits)
+ val sel = in UInt(log2Up(width) bits)
+ val result = out(Delay(Delay(a,3).rotateLeft(Delay(sel,3)),3))
+ setDefinitionName(Rotate.this.getName())
+ })
+ }
+
+// rotate2_24 ->
+// Artix 7 -> 233 Mhz 96 LUT 167 FF
+// Artix 7 -> 420 Mhz 86 LUT 229 FF
+// rotate2_32 ->
+// Artix 7 -> 222 Mhz 108 LUT 238 FF
+// Artix 7 -> 399 Mhz 110 LUT 300 FF
+// rotate2_52 ->
+// Artix 7 -> 195 Mhz 230 LUT 362 FF
+// Artix 7 -> 366 Mhz 225 LUT 486 FF
+// rotate2_64 ->
+// Artix 7 -> 182 Mhz 257 LUT 465 FF
+// Artix 7 -> 359 Mhz 266 LUT 591 FF
+ class Rotate2(width : Int) extends Rtl{
+ override def getName(): String = "rotate2_" + width
+ override def getRtlPath(): String = getName() + ".v"
+ SpinalVerilog(new Component{
+ val a = in UInt(width bits)
+ val sel = in UInt(log2Up(width) bits)
+ val result = out(Delay((U(0, width bits) @@ Delay(a,3)).rotateLeft(Delay(sel,3)),3))
+ setDefinitionName(Rotate2.this.getName())
+ })
+ }
+
+ class Rotate3(width : Int) extends Rtl{
+ override def getName(): String = "rotate3_" + width
+ override def getRtlPath(): String = getName() + ".v"
+ SpinalVerilog(new Component{
+ val a = Delay(in UInt(width bits), 3)
+ val sel = Delay(in UInt(log2Up(width) bits),3)
+ // val result =
+ // val output = Delay(result, 3)
+ setDefinitionName(Rotate3.this.getName())
+ })
+ }
+
+ class Div(width : Int) extends Rtl{
+ override def getName(): String = "div_" + width
+ override def getRtlPath(): String = getName() + ".v"
+ SpinalVerilog(new UnsignedDivider(width,width, false).setDefinitionName(Div.this.getName()))
+ }
+
+ class Add(width : Int) extends Rtl{
+ override def getName(): String = "add_" + width
+ override def getRtlPath(): String = getName() + ".v"
+ SpinalVerilog(new Component{
+ val a, b = in UInt(width bits)
+ val result = out(a + b)
+ setDefinitionName(Add.this.getName())
+ })
+ }
+
+ class DivSqrtRtl(width : Int) extends Rtl{
+ override def getName(): String = "DivSqrt_" + width
+ override def getRtlPath(): String = getName() + ".v"
+ SpinalVerilog(new FpuDiv(width).setDefinitionName(DivSqrtRtl.this.getName()))
+ }
+
+ val rtls = ArrayBuffer[Rtl]()
+ rtls += new Fpu(
+ "32",
+ portCount = 1,
+ FpuParameter(
+// withDivSqrt = false,
+ withDouble = false
+ )
+ )
+ rtls += new Fpu(
+ "64",
+ portCount = 1,
+ FpuParameter(
+// withDivSqrt = false,
+ withDouble = true
+ )
+ )
+
+// rtls += new Div(52)
+// rtls += new Div(23)
+// rtls += new Add(64)
+// rtls += new DivSqrtRtl(52)
+// rtls += new DivSqrtRtl(23)
+
+ // rtls += new Shifter(24)
+// rtls += new Shifter(32)
+// rtls += new Shifter(52)
+// rtls += new Shifter(64)
+// rtls += new Rotate(24)
+// rtls += new Rotate(32)
+// rtls += new Rotate(52)
+// rtls += new Rotate(64)
+// rtls += new Rotate3(24)
+// rtls += new Rotate3(32)
+// rtls += new Rotate3(52)
+// rtls += new Rotate3(64)
+
+ val targets = XilinxStdTargets()// ++ AlteraStdTargets()
+
+
+ Bench(rtls, targets)
+}
+
+//Fpu_32 ->
+//Artix 7 -> 136 Mhz 1471 LUT 1336 FF
+//Artix 7 -> 196 Mhz 1687 LUT 1371 FF
+//Fpu_64 ->
+//Artix 7 -> 105 Mhz 2822 LUT 2132 FF
+//Artix 7 -> 161 Mhz 3114 LUT 2272 FF
+//
+//
+//
+//Fpu_32 ->
+//Artix 7 -> 128 Mhz 1693 LUT 1481 FF
+//Artix 7 -> 203 Mhz 1895 LUT 1481 FF
+//Fpu_64 ->
+//Artix 7 -> 99 Mhz 3073 LUT 2396 FF
+//Artix 7 -> 164 Mhz 3433 LUT 2432 FF
+
+
+//Fpu_32 ->
+//Artix 7 -> 112 Mhz 1790 LUT 1666 FF
+//Artix 7 -> 158 Mhz 1989 LUT 1701 FF
+//Fpu_64 ->
+//Artix 7 -> 100 Mhz 3294 LUT 2763 FF
+//Artix 7 -> 151 Mhz 3708 LUT 2904 FF
+
+//Fpu_32 ->
+//Artix 7 -> 139 Mhz 1879 LUT 1713 FF
+//Artix 7 -> 206 Mhz 2135 LUT 1723 FF
+//Fpu_64 ->
+//Artix 7 -> 106 Mhz 3502 LUT 2811 FF
+//Artix 7 -> 163 Mhz 3905 LUT 2951 FF
+
+//Fpu_32 ->
+//Artix 7 -> 130 Mhz 1889 LUT 1835 FF
+//Artix 7 -> 210 Mhz 2131 LUT 1845 FF
+//Fpu_64 ->
+//Artix 7 -> 106 Mhz 3322 LUT 3023 FF
+//Artix 7 -> 161 Mhz 3675 LUT 3163 FF
+
+//Fpu_32 ->
+//Artix 7 -> 132 Mhz 1891 LUT 1837 FF
+//Artix 7 -> 209 Mhz 2132 LUT 1847 FF
+//Fpu_64 ->
+//Artix 7 -> 105 Mhz 3348 LUT 3024 FF
+//Artix 7 -> 162 Mhz 3712 LUT 3165 FF
+
+//Fpu_32 ->
+//Artix 7 -> 128 Mhz 1796 LUT 1727 FF
+//Artix 7 -> 208 Mhz 2049 LUT 1727 FF
+//Fpu_64 ->
+//Artix 7 -> 109 Mhz 3417 LUT 2913 FF
+//Artix 7 -> 168 Mhz 3844 LUT 3053 FF
+
+/*
+testfloat -tininessafter -all1 > all1.txt
+cat all1.txt | grep "Errors found in"
+
+testfloat -tininessafter -all2 > all2.txt
+cat all2.txt | grep "Errors found in"
+
+testfloat -tininessafter -f32_mulAdd > fma.txt
+
+testfloat -tininessafter -all2 -level 2 -checkall > all2.txt
+
+
+
+all1 =>
+Errors found in f32_to_ui64_rx_minMag:
+Errors found in f32_to_i64_rx_minMag:
+Errors found in f64_to_ui64_rx_minMag:
+Errors found in f64_to_i64_rx_minMag:
+
+all2 =>
+
+
+Errors found in f32_mulAdd, rounding min:
++00.7FFFFF +67.000001 -01.000000
+ => -01.000000 ...ux expected -01.000000 ....x
++67.000001 +00.7FFFFF -01.000000
+ => -01.000000 ...ux expected -01.000000 ....x
+-00.7FFFFF -67.000001 -01.000000
+ => -01.000000 ...ux expected -01.000000 ....x
+-67.000001 -00.7FFFFF -01.000000
+ => -01.000000 ...ux expected -01.000000 ....x
+Errors found in f32_mulAdd, rounding max:
++00.7FFFFF -67.000001 +01.000000
+ => +01.000000 ...ux expected +01.000000 ....x
++67.000001 -00.7FFFFF +01.000000
+ => +01.000000 ...ux expected +01.000000 ....x
++66.7FFFFE -01.000001 +01.000000
+ => +01.000000 ...ux expected +01.000000 ....x
+-00.7FFFFF +67.000001 +01.000000
+ => +01.000000 ...ux expected +01.000000 ....x
+-67.000001 +00.7FFFFF +01.000000
+ => +01.000000 ...ux expected +01.000000 ....x
+
+
+
+ */ \ No newline at end of file
diff --git a/VexRiscv/src/main/scala/vexriscv/ip/fpu/FpuDiv.scala b/VexRiscv/src/main/scala/vexriscv/ip/fpu/FpuDiv.scala
new file mode 100644
index 0000000..7c9e713
--- /dev/null
+++ b/VexRiscv/src/main/scala/vexriscv/ip/fpu/FpuDiv.scala
@@ -0,0 +1,140 @@
+package vexriscv.ip.fpu
+
+
+import spinal.core._
+import spinal.lib.math.{UnsignedDividerCmd, UnsignedDividerRsp}
+import spinal.lib._
+import spinal.lib.sim.{StreamDriver, StreamMonitor, StreamReadyRandomizer}
+
+import scala.collection.mutable
+import scala.util.Random
+
+case class FpuDivCmd(mantissaWidth : Int) extends Bundle{
+ val a,b = UInt(mantissaWidth bits)
+}
+
+case class FpuDivRsp(mantissaWidth : Int) extends Bundle{
+ val result = UInt(mantissaWidth+1 + 2 bits)
+ val remain = UInt(mantissaWidth+1 bits)
+}
+
+case class FpuDiv(val mantissaWidth : Int) extends Component {
+ assert(mantissaWidth % 2 == 0)
+ val io = new Bundle{
+ val input = slave Stream(FpuDivCmd(mantissaWidth))
+ val output = master Stream(FpuDivRsp(mantissaWidth))
+ }
+
+ val iterations = (mantissaWidth+2+2)/2
+ val counter = Reg(UInt(log2Up(iterations) bits))
+ val busy = RegInit(False) clearWhen(io.output.fire)
+ val done = RegInit(False) setWhen(busy && counter === iterations-1) clearWhen(io.output.fire)
+
+ val shifter = Reg(UInt(mantissaWidth + 3 bits))
+ val result = Reg(UInt(mantissaWidth+1+2 bits))
+
+ val div1, div3 = Reg(UInt(mantissaWidth+3 bits))
+ val div2 = div1 |<< 1
+
+ val sub1 = shifter -^ div1
+ val sub2 = shifter -^ div2
+ val sub3 = shifter -^ div3
+
+ io.output.valid := done
+ io.output.result := (result << 0).resized
+ io.output.remain := (shifter >> 2).resized
+ io.input.ready := !busy
+
+ when(!done){
+ counter := counter + 1
+ val sel = CombInit(shifter)
+ result := result |<< 2
+ when(!sub1.msb){
+ sel := sub1.resized
+ result(1 downto 0) := 1
+ }
+ when(!sub2.msb){
+ sel := sub2.resized
+ result(1 downto 0) := 2
+ }
+ when(!sub3.msb){
+ sel := sub3.resized
+ result(1 downto 0) := 3
+ }
+ shifter := sel |<< 2
+ }
+
+ when(!busy){
+ counter := 0
+ shifter := (U"1" @@ io.input.a @@ U"").resized
+ div1 := (U"1" @@ io.input.b).resized
+ div3 := (U"1" @@ io.input.b) +^ (((U"1" @@ io.input.b)) << 1)
+ busy := io.input.valid
+ }
+}
+
+
+object FpuDivTester extends App{
+ import spinal.core.sim._
+
+ for(w <- List(16, 20)) {
+ val config = SimConfig
+ config.withFstWave
+ config.compile(new FpuDiv(w)).doSim(seed=2){dut =>
+ dut.clockDomain.forkStimulus(10)
+
+
+ val (cmdDriver, cmdQueue) = StreamDriver.queue(dut.io.input, dut.clockDomain)
+ val rspQueue = mutable.Queue[FpuDivRsp => Unit]()
+ StreamMonitor(dut.io.output, dut.clockDomain)( rspQueue.dequeue()(_))
+ StreamReadyRandomizer(dut.io.output, dut.clockDomain)
+
+ def test(a : Int, b : Int): Unit ={
+ cmdQueue +={p =>
+ p.a #= a
+ p.b #= b
+ }
+ rspQueue += {p =>
+ val x = (a | (1 << dut.mantissaWidth)).toLong
+ val y = (b | (1 << dut.mantissaWidth)).toLong
+ val result = (x << dut.mantissaWidth+2) / y
+ val remain = (x << dut.mantissaWidth+2) % y
+
+ assert(p.result.toLong == result, f"$x%x/$y%x=${p.result.toLong}%x instead of $result%x")
+ assert(p.remain.toLong == remain, f"$x%x %% $y%x=${p.remain.toLong}%x instead of $remain%x")
+ }
+ }
+
+ val s = dut.mantissaWidth-16
+ val f = (1 << dut.mantissaWidth)-1
+ test(0xE000 << s, 0x8000 << s)
+ test(0xC000 << s, 0x4000 << s)
+ test(0xC835 << s, 0x4742 << s)
+ test(0,0)
+ test(0,f)
+ test(f,0)
+ test(f,f)
+
+ for(i <- 0 until 10000){
+ test(Random.nextInt(1 << dut.mantissaWidth), Random.nextInt(1 << dut.mantissaWidth))
+ }
+
+ waitUntil(rspQueue.isEmpty)
+
+ dut.clockDomain.waitSampling(100)
+
+ }
+ }
+}
+
+object FpuDivTester2 extends App{
+ val mantissaWidth = 52
+ val a = BigInt(0xfffffff810000l)
+ val b = BigInt(0x0000000000FF0l)
+ val x = (a | (1l << mantissaWidth))
+ val y = (b | (1l << mantissaWidth))
+ val result = (x << mantissaWidth+2) / y
+ val remain = (x << mantissaWidth+2) % y
+ println("done")
+
+} \ No newline at end of file
diff --git a/VexRiscv/src/main/scala/vexriscv/ip/fpu/FpuSqrt.scala b/VexRiscv/src/main/scala/vexriscv/ip/fpu/FpuSqrt.scala
new file mode 100644
index 0000000..0f80905
--- /dev/null
+++ b/VexRiscv/src/main/scala/vexriscv/ip/fpu/FpuSqrt.scala
@@ -0,0 +1,116 @@
+package vexriscv.ip.fpu
+
+import spinal.core._
+import spinal.lib._
+import spinal.lib.sim.{StreamDriver, StreamMonitor, StreamReadyRandomizer}
+
+import scala.collection.mutable
+import scala.util.Random
+
+case class FpuSqrtCmd(mantissaWidth : Int) extends Bundle{
+ val a = UInt(mantissaWidth+2 bits)
+}
+
+case class FpuSqrtRsp(mantissaWidth : Int) extends Bundle{
+ val result = UInt(mantissaWidth+1 bits)
+ val remain = UInt(mantissaWidth+5 bits)
+}
+
+case class FpuSqrt(val mantissaWidth : Int) extends Component {
+ val io = new Bundle{
+ val input = slave Stream(FpuSqrtCmd(mantissaWidth))
+ val output = master Stream(FpuSqrtRsp(mantissaWidth))
+ }
+
+ val iterations = mantissaWidth+2
+ val counter = Reg(UInt(log2Up(iterations ) bits))
+ val busy = RegInit(False) clearWhen(io.output.fire)
+ val done = RegInit(False) setWhen(busy && counter === iterations-1) clearWhen(io.output.fire)
+
+ val a = Reg(UInt(mantissaWidth+5 bits))
+ val x = Reg(UInt(mantissaWidth bits))
+ val q = Reg(UInt(mantissaWidth+1 bits))
+ val t = a-(q @@ U"01")
+
+
+ io.output.valid := done
+ io.output.result := (q << 0).resized
+ io.output.remain := a
+ io.input.ready := !busy
+
+ when(!done){
+ counter := counter + 1
+ val sel = CombInit(a)
+ when(!t.msb){
+ sel := t.resized
+ }
+ q := (q @@ !t.msb).resized
+ a := (sel @@ x(widthOf(x)-2,2 bits)).resized
+ x := x |<< 2
+ }
+
+ when(!busy){
+ q := 0
+ a := io.input.a(widthOf(io.input.a)-2,2 bits).resized
+ x := (io.input.a).resized
+ counter := 0
+ when(io.input.valid){
+ busy := True
+ }
+ }
+}
+
+
+object FpuSqrtTester extends App{
+ import spinal.core.sim._
+
+ for(w <- List(16)) {
+ val config = SimConfig
+ config.withFstWave
+ config.compile(new FpuSqrt(w)).doSim(seed=2){dut =>
+ dut.clockDomain.forkStimulus(10)
+
+
+ val (cmdDriver, cmdQueue) = StreamDriver.queue(dut.io.input, dut.clockDomain)
+ val rspQueue = mutable.Queue[FpuSqrtRsp => Unit]()
+ StreamMonitor(dut.io.output, dut.clockDomain)( rspQueue.dequeue()(_))
+ StreamReadyRandomizer(dut.io.output, dut.clockDomain)
+
+ def test(a : Int): Unit ={
+ cmdQueue +={p =>
+ p.a #= a
+ }
+ rspQueue += {p =>
+// val x = (a * (1l << dut.mantissaWidth)).toLong
+// val result = Math.sqrt(x).toLong/(1 << dut.mantissaWidth/2)
+// val remain = a-x*x
+ val x = a.toDouble / (1 << dut.mantissaWidth)
+ val result = (Math.sqrt(x)*(1 << dut.mantissaWidth+1)).toLong
+ val filtred = result % (1 << dut.mantissaWidth+1)
+// val remain = (a-(result*result)).toLong
+ assert(p.result.toLong == filtred, f"$a%x=${p.result.toLong}%x instead of $filtred%x")
+// assert(p.remain.toLong == remain, f"$a%x=${p.remain.toLong}%x instead of $remain%x")
+ }
+ }
+
+ val s = dut.mantissaWidth-16
+ val f = (1 << dut.mantissaWidth)-1
+// test(121)
+ test(0x20000)
+ test(0x18000)
+// test(0,0)
+// test(0,f)
+// test(f,0)
+// test(f,f)
+
+ for(i <- 0 until 10000){
+ test(Random.nextInt(3 << dut.mantissaWidth) + (1 << dut.mantissaWidth))
+ }
+
+ waitUntil(rspQueue.isEmpty)
+
+ dut.clockDomain.waitSampling(100)
+
+ }
+ }
+} \ No newline at end of file
diff --git a/VexRiscv/src/main/scala/vexriscv/ip/fpu/Interface.scala b/VexRiscv/src/main/scala/vexriscv/ip/fpu/Interface.scala
new file mode 100644
index 0000000..9338c35
--- /dev/null
+++ b/VexRiscv/src/main/scala/vexriscv/ip/fpu/Interface.scala
@@ -0,0 +1,186 @@
+package vexriscv.ip.fpu
+
+import spinal.core._
+import spinal.lib._
+
+
+object Fpu{
+
+ object Function{
+ val MUL = 0
+ val ADD = 1
+ }
+
+}
+
+
+case class FpuFloatDecoded() extends Bundle{
+ val isNan = Bool()
+ val isNormal = Bool()
+ val isSubnormal = Bool()
+ val isZero = Bool()
+ val isInfinity = Bool()
+ val isQuiet = Bool()
+}
+
+object FpuFloat{
+ val ZERO = 0
+ val INFINITY = 1
+ val NAN = 2
+ val NAN_CANONICAL_BIT = 2
+}
+
+case class FpuFloat(exponentSize: Int,
+ mantissaSize: Int) extends Bundle {
+ val mantissa = UInt(mantissaSize bits)
+ val exponent = UInt(exponentSize bits)
+ val sign = Bool()
+ val special = Bool()
+
+ def withInvertSign : FpuFloat ={
+ val ret = FpuFloat(exponentSize,mantissaSize)
+ ret.sign := !sign
+ ret.exponent := exponent
+ ret.mantissa := mantissa
+ ret
+ }
+
+ def isNormal = !special
+ def isZero = special && exponent(1 downto 0) === FpuFloat.ZERO
+ def isInfinity = special && exponent(1 downto 0) === FpuFloat.INFINITY
+ def isNan = special && exponent(1 downto 0) === FpuFloat.NAN
+ def isQuiet = mantissa.msb
+ def isNanSignaling = special && exponent(1 downto 0) === FpuFloat.NAN && !isQuiet
+ def isCanonical = exponent(FpuFloat.NAN_CANONICAL_BIT)
+
+ def setNormal = { special := False }
+ def setZero = { special := True; exponent(1 downto 0) := FpuFloat.ZERO }
+ def setInfinity = { special := True; exponent(1 downto 0) := FpuFloat.INFINITY }
+ def setNan = { special := True; exponent(1 downto 0) := FpuFloat.NAN; exponent(FpuFloat.NAN_CANONICAL_BIT) := False}
+ def setNanQuiet = { special := True; exponent(1 downto 0) := FpuFloat.NAN; exponent(FpuFloat.NAN_CANONICAL_BIT) := True; mantissa.msb := True; }
+
+ def decode() = {
+ val ret = FpuFloatDecoded()
+ ret.isZero := isZero
+ ret.isNormal := isNormal
+ ret.isInfinity := isInfinity
+ ret.isNan := isNan
+ ret.isQuiet := mantissa.msb
+ ret
+ }
+
+ def decodeIeee754() = {
+ val ret = FpuFloatDecoded()
+ val expZero = exponent === 0
+ val expOne = exponent === exponent.maxValue
+ val manZero = mantissa === 0
+ ret.isZero := expZero && manZero
+ ret.isSubnormal := expZero && !manZero
+ ret.isNormal := !expOne && !expZero
+ ret.isInfinity := expOne && manZero
+ ret.isNan := expOne && !manZero
+ ret.isQuiet := mantissa.msb
+ ret
+ }
+}
+
+object FpuOpcode extends SpinalEnum{
+ val LOAD, STORE, MUL, ADD, FMA, I2F, F2I, CMP, DIV, SQRT, MIN_MAX, SGNJ, FMV_X_W, FMV_W_X, FCLASS, FCVT_X_X = newElement()
+}
+
+object FpuFormat extends SpinalEnum{
+ val FLOAT, DOUBLE = newElement()
+}
+
+object FpuRoundMode extends SpinalEnum(){
+ val RNE, RTZ, RDN, RUP, RMM = newElement()
+ defaultEncoding = SpinalEnumEncoding("opt")(
+ RNE -> 0,
+ RTZ -> 1,
+ RDN -> 2,
+ RUP -> 3,
+ RMM -> 4
+ )
+}
+object FpuRoundModeInstr extends SpinalEnum(){
+ val RNE, RTZ, RDN, RUP, RMM, DYN = newElement()
+ defaultEncoding = SpinalEnumEncoding("opt")(
+ RNE -> 0,
+ RTZ -> 1,
+ RDN -> 2,
+ RUP -> 3,
+ RMM -> 4,
+ DYN -> 7
+ )
+}
+
+
+case class FpuParameter( withDouble : Boolean,
+ asyncRegFile : Boolean = false,
+ mulWidthA : Int = 18,
+ mulWidthB : Int = 18,
+ schedulerM2sPipe : Boolean = false,
+ sim : Boolean = false,
+ withAdd : Boolean = true,
+ withMul : Boolean = true,
+ withDivSqrt : Boolean = false,
+ withDiv : Boolean = true,
+ withSqrt : Boolean = true,
+ withShortPipMisc : Boolean = true){
+
+ val internalMantissaSize = if(withDouble) 52 else 23
+ val storeLoadType = HardType(Bits(if(withDouble) 64 bits else 32 bits))
+ val internalExponentSize = (if(withDouble) 11 else 8) + 1
+ val internalFloating = HardType(FpuFloat(exponentSize = internalExponentSize, mantissaSize = internalMantissaSize))
+ val writeFloating = HardType(FpuFloat(exponentSize = internalExponentSize, mantissaSize = internalMantissaSize+1))
+
+ val rfAddress = HardType(UInt(5 bits))
+
+ val Opcode = FpuOpcode
+ val Format = FpuFormat
+ val argWidth = 2
+ val Arg = HardType(Bits(2 bits))
+}
+
+case class FpuFlags() extends Bundle{
+ val NX, UF, OF, DZ, NV = Bool()
+}
+
+case class FpuCompletion() extends Bundle{
+ val flags = FpuFlags()
+ val written = Bool() //Used for verification purposes
+}
+
+case class FpuCmd(p : FpuParameter) extends Bundle{
+ val opcode = p.Opcode()
+ val arg = Bits(2 bits)
+ val rs1, rs2, rs3 = p.rfAddress()
+ val rd = p.rfAddress()
+ val format = p.Format()
+ val roundMode = FpuRoundMode()
+}
+
+case class FpuCommit(p : FpuParameter) extends Bundle{
+ val opcode = FpuOpcode()
+ val rd = UInt(5 bits)
+ val write = Bool()
+ val value = p.storeLoadType() // IEEE 754
+}
+
+case class FpuRsp(p : FpuParameter) extends Bundle{
+ val value = p.storeLoadType() // IEEE754 store || Integer
+ val NV, NX = Bool()
+}
+
+case class FpuPort(p : FpuParameter) extends Bundle with IMasterSlave {
+ val cmd = Stream(FpuCmd(p))
+ val commit = Stream(FpuCommit(p))
+ val rsp = Stream(FpuRsp(p))
+ val completion = Flow(FpuCompletion())
+
+ override def asMaster(): Unit = {
+ master(cmd, commit)
+ slave(rsp)
+ in(completion)
+ }
+}