Diffstat (limited to 'VexRiscv/src/main/scala/vexriscv/ip')
-rw-r--r--  VexRiscv/src/main/scala/vexriscv/ip/DataCache.scala         1184
-rw-r--r--  VexRiscv/src/main/scala/vexriscv/ip/InstructionCache.scala   487
-rw-r--r--  VexRiscv/src/main/scala/vexriscv/ip/fpu/FpuCore.scala       1944
-rw-r--r--  VexRiscv/src/main/scala/vexriscv/ip/fpu/FpuDiv.scala         140
-rw-r--r--  VexRiscv/src/main/scala/vexriscv/ip/fpu/FpuSqrt.scala        116
-rw-r--r--  VexRiscv/src/main/scala/vexriscv/ip/fpu/Interface.scala      186
6 files changed, 4057 insertions, 0 deletions
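
The six new files below add the data cache, the instruction cache, and the FPU as parameterizable SpinalHDL components. For orientation, a data cache is built from a DataCacheConfig plus an MMU bus parameter; the sketch below shows a minimal instantiation (the parameter values are illustrative assumptions, only the names come from DataCache.scala):

    // Hypothetical configuration: a direct-mapped 4 KiB cache with 32-byte lines.
    val dCacheConfig = DataCacheConfig(
      cacheSize        = 4096,
      bytePerLine      = 32,
      wayCount         = 1,
      addressWidth     = 32,
      cpuDataWidth     = 32,
      memDataWidth     = 32,
      catchAccessError = true,
      catchIllegal     = true,
      catchUnaligned   = true
    )
    // mmuParameter is assumed to be a MemoryTranslatorBusParameter provided by the pipeline.
    val dCache = new DataCache(dCacheConfig, mmuParameter)
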
diff --git a/VexRiscv/src/main/scala/vexriscv/ip/DataCache.scala b/VexRiscv/src/main/scala/vexriscv/ip/DataCache.scala new file mode 100644 index 0000000..2b70400 --- /dev/null +++ b/VexRiscv/src/main/scala/vexriscv/ip/DataCache.scala @@ -0,0 +1,1184 @@ +package vexriscv.ip + +import vexriscv._ +import spinal.core._ +import spinal.lib._ +import spinal.lib.bus.amba4.axi.{Axi4Config, Axi4Shared} +import spinal.lib.bus.avalon.{AvalonMM, AvalonMMConfig} +import spinal.lib.bus.bmb.{Bmb, BmbAccessParameter, BmbCmd, BmbInvalidationParameter, BmbParameter, BmbSourceParameter} +import spinal.lib.bus.wishbone.{Wishbone, WishboneConfig} +import spinal.lib.bus.simple._ +import vexriscv.plugin.DBusSimpleBus + + +case class DataCacheConfig(cacheSize : Int, + bytePerLine : Int, + wayCount : Int, + addressWidth : Int, + cpuDataWidth : Int, + var rfDataWidth : Int = -1, //-1 mean cpuDataWidth + memDataWidth : Int, + catchAccessError : Boolean, + catchIllegal : Boolean, + catchUnaligned : Boolean, + earlyWaysHits : Boolean = true, + earlyDataMux : Boolean = false, + tagSizeShift : Int = 0, //Used to force infering ram + withLrSc : Boolean = false, + withAmo : Boolean = false, + withExclusive : Boolean = false, + withInvalidate : Boolean = false, + pendingMax : Int = 64, + directTlbHit : Boolean = false, + mergeExecuteMemory : Boolean = false, + asyncTagMemory : Boolean = false, + withWriteAggregation : Boolean = false){ + + if(rfDataWidth == -1) rfDataWidth = cpuDataWidth + assert(!(mergeExecuteMemory && (earlyDataMux || earlyWaysHits))) + assert(!(earlyDataMux && !earlyWaysHits)) + assert(isPow2(pendingMax)) + assert(rfDataWidth <= memDataWidth) + + def lineCount = cacheSize/bytePerLine/wayCount + def sizeMax = log2Up(bytePerLine) + def sizeWidth = log2Up(sizeMax + 1) + val aggregationWidth = if(withWriteAggregation) log2Up(memDataBytes+1) else 0 + def withWriteResponse = withExclusive + def burstSize = bytePerLine*8/memDataWidth + val burstLength = bytePerLine/(cpuDataWidth/8) + def catchSomething = catchUnaligned || catchIllegal || catchAccessError + def withInternalAmo = withAmo && !withExclusive + def withInternalLrSc = withLrSc && !withExclusive + def withExternalLrSc = withLrSc && withExclusive + def withExternalAmo = withAmo && withExclusive + def cpuDataBytes = cpuDataWidth/8 + def rfDataBytes = rfDataWidth/8 + def memDataBytes = memDataWidth/8 + def getAxi4SharedConfig() = Axi4Config( + addressWidth = addressWidth, + dataWidth = memDataWidth, + useId = false, + useRegion = false, + useBurst = false, + useLock = false, + useQos = false + ) + + + def getAvalonConfig() = AvalonMMConfig.bursted( + addressWidth = addressWidth, + dataWidth = memDataWidth, + burstCountWidth = log2Up(burstSize + 1)).copy( + useByteEnable = true, + constantBurstBehavior = true, + burstOnBurstBoundariesOnly = true, + useResponse = true, + maximumPendingReadTransactions = 2 + ) + + def getWishboneConfig() = WishboneConfig( + addressWidth = 32-log2Up(memDataWidth/8), + dataWidth = memDataWidth, + selWidth = memDataBytes, + useSTALL = false, + useLOCK = false, + useERR = true, + useRTY = false, + tgaWidth = 0, + tgcWidth = 0, + tgdWidth = 0, + useBTE = true, + useCTI = true + ) + + def getBmbParameter() = BmbParameter( + BmbAccessParameter( + addressWidth = 32, + dataWidth = memDataWidth + ).addSources(1, BmbSourceParameter( + lengthWidth = log2Up(this.bytePerLine), + contextWidth = (if(!withWriteResponse) 1 else 0) + aggregationWidth, + alignment = BmbParameter.BurstAlignement.LENGTH, + canExclusive = withExclusive, + 
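        // Note: the BMB context field travels with each command and returns with the
        // response; it carries one bit to tag writes when the bus gives no write
        // response (withWriteResponse == false), plus the write-aggregation beat
        // counter when withWriteAggregation is enabled (see the Context bundle below).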
withCachedRead = true, + canInvalidate = withInvalidate, + canSync = withInvalidate + )), + BmbInvalidationParameter( + invalidateLength = log2Up(this.bytePerLine), + invalidateAlignment = BmbParameter.BurstAlignement.LENGTH + ) + ) +} + +object DataCacheCpuExecute{ + implicit def implArgs(that : DataCacheCpuExecute) = that.args +} + +case class DataCacheCpuExecute(p : DataCacheConfig) extends Bundle with IMasterSlave{ + val isValid = Bool + val address = UInt(p.addressWidth bit) + val haltIt = Bool + val args = DataCacheCpuExecuteArgs(p) + val refilling = Bool + + override def asMaster(): Unit = { + out(isValid, args, address) + in(haltIt, refilling) + } +} + +case class DataCacheCpuExecuteArgs(p : DataCacheConfig) extends Bundle{ + val wr = Bool + val size = UInt(log2Up(log2Up(p.cpuDataBytes)+1) bits) + val isLrsc = p.withLrSc generate Bool() + val isAmo = p.withAmo generate Bool() + val amoCtrl = p.withAmo generate new Bundle { + val swap = Bool() + val alu = Bits(3 bits) + } + + val totalyConsistent = Bool() //Only for AMO/LRSC +} + +case class DataCacheCpuMemory(p : DataCacheConfig, mmu : MemoryTranslatorBusParameter) extends Bundle with IMasterSlave{ + val isValid = Bool + val isStuck = Bool + val isWrite = Bool + val address = UInt(p.addressWidth bit) + val mmuRsp = MemoryTranslatorRsp(mmu) + + override def asMaster(): Unit = { + out(isValid, isStuck, address) + in(isWrite) + out(mmuRsp) + } +} + + +case class FenceFlags() extends Bundle { + val SW,SR,SO,SI,PW,PR,PO,PI = Bool() + val FM = Bits(4 bits) + + def SL = SR || SI + def SS = SW || SO + def PL = PR || PI + def PS = PW || PO + def forceAll(): Unit ={ + List(SW,SR,SO,SI,PW,PR,PO,PI).foreach(_ := True) + } + def clearAll(): Unit ={ + List(SW,SR,SO,SI,PW,PR,PO,PI).foreach(_ := False) + } +} + +case class DataCacheCpuWriteBack(p : DataCacheConfig) extends Bundle with IMasterSlave{ + val isValid = Bool() + val isStuck = Bool() + val isFiring = Bool() + val isUser = Bool() + val haltIt = Bool() + val isWrite = Bool() + val storeData = Bits(p.cpuDataWidth bit) + val data = Bits(p.cpuDataWidth bit) + val address = UInt(p.addressWidth bit) + val mmuException, unalignedAccess, accessError = Bool() + val keepMemRspData = Bool() //Used by external AMO to avoid having an internal buffer + val fence = FenceFlags() + val exclusiveOk = Bool() + + override def asMaster(): Unit = { + out(isValid,isStuck,isUser, address, fence, storeData, isFiring) + in(haltIt, data, mmuException, unalignedAccess, accessError, isWrite, keepMemRspData, exclusiveOk) + } +} + +case class DataCacheFlush(lineCount : Int) extends Bundle{ + val singleLine = Bool() + val lineId = UInt(log2Up(lineCount) bits) +} + +case class DataCacheCpuBus(p : DataCacheConfig, mmu : MemoryTranslatorBusParameter) extends Bundle with IMasterSlave{ + val execute = DataCacheCpuExecute(p) + val memory = DataCacheCpuMemory(p, mmu) + val writeBack = DataCacheCpuWriteBack(p) + + val redo = Bool() + val flush = Stream(DataCacheFlush(p.lineCount)) + + override def asMaster(): Unit = { + master(execute) + master(memory) + master(writeBack) + master(flush) + in(redo) + } +} + + +case class DataCacheMemCmd(p : DataCacheConfig) extends Bundle{ + val wr = Bool + val uncached = Bool + val address = UInt(p.addressWidth bit) + val data = Bits(p.cpuDataWidth bits) + val mask = Bits(p.cpuDataWidth/8 bits) + val size = UInt(p.sizeWidth bits) //... 1 => 2 bytes ... 2 => 4 bytes ... 
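  //The size field is log2(byte count): 0 => 1 byte, 1 => 2 bytes, 2 => 4 bytes, up to
  //log2Up(bytePerLine), which denotes a full cache-line burst (see isBurst below).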
+ val exclusive = p.withExclusive generate Bool() + val last = Bool + +// def beatCountMinusOne = size.muxListDc((0 to p.sizeMax).map(i => i -> U((1 << i)/p.memDataBytes))) +// def beatCount = size.muxListDc((0 to p.sizeMax).map(i => i -> U((1 << i)/p.memDataBytes-1))) + + //Utilities which does quite a few assumtions about the bus utilisation + def byteCountMinusOne = size.muxListDc((0 to p.sizeMax).map(i => i -> U((1 << i)-1, log2Up(p.bytePerLine) bits))) + def beatCountMinusOne = (size === log2Up(p.bytePerLine)) ? U(p.burstSize-1) | U(0) + def beatCount = (size === log2Up(p.bytePerLine)) ? U(p.burstSize) | U(1) + def isBurst = size === log2Up(p.bytePerLine) +} +case class DataCacheMemRsp(p : DataCacheConfig) extends Bundle{ + val aggregated = UInt(p.aggregationWidth bits) + val last = Bool() + val data = Bits(p.memDataWidth bit) + val error = Bool + val exclusive = p.withExclusive generate Bool() +} +case class DataCacheInv(p : DataCacheConfig) extends Bundle{ + val enable = Bool() + val address = UInt(p.addressWidth bit) +} +case class DataCacheAck(p : DataCacheConfig) extends Bundle{ + val hit = Bool() +} + +case class DataCacheSync(p : DataCacheConfig) extends Bundle{ + val aggregated = UInt(p.aggregationWidth bits) +} + +case class DataCacheMemBus(p : DataCacheConfig) extends Bundle with IMasterSlave{ + val cmd = Stream (DataCacheMemCmd(p)) + val rsp = Flow (DataCacheMemRsp(p)) + + val inv = p.withInvalidate generate Stream(Fragment(DataCacheInv(p))) + val ack = p.withInvalidate generate Stream(Fragment(DataCacheAck(p))) + val sync = p.withInvalidate generate Stream(DataCacheSync(p)) + + override def asMaster(): Unit = { + master(cmd) + slave(rsp) + + if(p.withInvalidate) { + slave(inv) + master(ack) + slave(sync) + } + } + + def toAxi4Shared(stageCmd : Boolean = false, pendingWritesMax : Int = 7): Axi4Shared = { + val axi = Axi4Shared(p.getAxi4SharedConfig()).setName("dbus_axi") + + val cmdPreFork = if (stageCmd) cmd.stage.stage().s2mPipe() else cmd + + val pendingWrites = CounterUpDown( + stateCount = pendingWritesMax + 1, + incWhen = cmdPreFork.fire && cmdPreFork.wr, + decWhen = axi.writeRsp.fire + ) + + val hazard = (pendingWrites =/= 0 && !cmdPreFork.wr) || pendingWrites === pendingWritesMax + val (cmdFork, dataFork) = StreamFork2(cmdPreFork.haltWhen(hazard)) + val cmdStage = cmdFork.throwWhen(RegNextWhen(!cmdFork.last,cmdFork.fire).init(False)) + val dataStage = dataFork.throwWhen(!dataFork.wr) + + axi.sharedCmd.arbitrationFrom(cmdStage) + axi.sharedCmd.write := cmdStage.wr + axi.sharedCmd.prot := "010" + axi.sharedCmd.cache := "1111" + axi.sharedCmd.size := log2Up(p.memDataBytes) + axi.sharedCmd.addr := cmdStage.address + axi.sharedCmd.len := cmdStage.beatCountMinusOne.resized + + axi.writeData.arbitrationFrom(dataStage) + axi.writeData.data := dataStage.data + axi.writeData.strb := dataStage.mask + axi.writeData.last := dataStage.last + + rsp.valid := axi.r.valid + rsp.error := !axi.r.isOKAY() + rsp.data := axi.r.data + + axi.r.ready := True + axi.b.ready := True + + axi + } + + + def toAvalon(): AvalonMM = { + val avalonConfig = p.getAvalonConfig() + val mm = AvalonMM(avalonConfig) + mm.read := cmd.valid && !cmd.wr + mm.write := cmd.valid && cmd.wr + mm.address := cmd.address(cmd.address.high downto log2Up(p.memDataWidth/8)) @@ U(0,log2Up(p.memDataWidth/8) bits) + mm.burstCount := cmd.beatCount + mm.byteEnable := cmd.mask + mm.writeData := cmd.data + + cmd.ready := mm.waitRequestn + rsp.valid := mm.readDataValid + rsp.data := mm.readData + rsp.error := mm.response =/= 
AvalonMM.Response.OKAY + + mm + } + + def toWishbone(): Wishbone = { + val wishboneConfig = p.getWishboneConfig() + val bus = Wishbone(wishboneConfig) + val counter = Reg(UInt(log2Up(p.burstSize) bits)) init(0) + val addressShift = log2Up(p.memDataWidth/8) + + val cmdBridge = Stream (DataCacheMemCmd(p)) + val isBurst = cmdBridge.isBurst + cmdBridge.valid := cmd.valid + cmdBridge.address := (isBurst ? (cmd.address(31 downto widthOf(counter) + addressShift) @@ counter @@ U(0, addressShift bits)) | (cmd.address(31 downto addressShift) @@ U(0, addressShift bits))) + cmdBridge.wr := cmd.wr + cmdBridge.mask := cmd.mask + cmdBridge.data := cmd.data + cmdBridge.size := cmd.size + cmdBridge.last := !isBurst || counter === p.burstSize-1 + cmd.ready := cmdBridge.ready && (cmdBridge.wr || cmdBridge.last) + + + when(cmdBridge.fire){ + counter := counter + 1 + when(cmdBridge.last){ + counter := 0 + } + } + + + bus.ADR := cmdBridge.address >> addressShift + bus.CTI := Mux(isBurst, cmdBridge.last ? B"111" | B"010", B"000") + bus.BTE := B"00" + bus.SEL := cmdBridge.wr ? cmdBridge.mask | B((1 << p.memDataBytes)-1) + bus.WE := cmdBridge.wr + bus.DAT_MOSI := cmdBridge.data + + cmdBridge.ready := cmdBridge.valid && bus.ACK + bus.CYC := cmdBridge.valid + bus.STB := cmdBridge.valid + + rsp.valid := RegNext(cmdBridge.valid && !bus.WE && bus.ACK) init(False) + rsp.data := RegNext(bus.DAT_MISO) + rsp.error := False //TODO + bus + } + + + + def toPipelinedMemoryBus(): PipelinedMemoryBus = { + val bus = PipelinedMemoryBus(32,32) + + val counter = Reg(UInt(log2Up(p.burstSize) bits)) init(0) + when(bus.cmd.fire){ counter := counter + 1 } + when( cmd.fire && cmd.last){ counter := 0 } + + bus.cmd.valid := cmd.valid + bus.cmd.address := (cmd.address(31 downto 2) | counter.resized) @@ U"00" + bus.cmd.write := cmd.wr + bus.cmd.mask := cmd.mask + bus.cmd.data := cmd.data + cmd.ready := bus.cmd.ready && (cmd.wr || counter === p.burstSize-1) + rsp.valid := bus.rsp.valid + rsp.data := bus.rsp.payload.data + rsp.error := False + bus + } + + + def toBmb(syncPendingMax : Int = 32, + timeoutCycles : Int = 16) : Bmb = new Area{ + setCompositeName(DataCacheMemBus.this, "Bridge", true) + val pipelinedMemoryBusConfig = p.getBmbParameter() + val bus = Bmb(pipelinedMemoryBusConfig).setCompositeName(this,"toBmb", true) + + case class Context() extends Bundle{ + val isWrite = !p.withWriteResponse generate Bool() + val rspCount = (p.aggregationWidth != 0) generate UInt(p.aggregationWidth bits) + } + + + def sizeToLength(size : UInt) = size.muxListDc((0 to log2Up(p.cpuDataBytes)).map(i => U(i) -> U((1 << i)-1, log2Up(p.cpuDataBytes) bits))) + + val withoutWriteBuffer = if(p.aggregationWidth == 0) new Area { + val busCmdContext = Context() + + bus.cmd.valid := cmd.valid + bus.cmd.last := cmd.last + bus.cmd.opcode := (cmd.wr ? 
B(Bmb.Cmd.Opcode.WRITE) | B(Bmb.Cmd.Opcode.READ)) + bus.cmd.address := cmd.address.resized + bus.cmd.data := cmd.data + bus.cmd.length := cmd.byteCountMinusOne + bus.cmd.mask := cmd.mask + if (p.withExclusive) bus.cmd.exclusive := cmd.exclusive + if (!p.withWriteResponse) busCmdContext.isWrite := cmd.wr + bus.cmd.context := B(busCmdContext) + + cmd.ready := bus.cmd.ready + if(p.withInvalidate) sync.arbitrationFrom(bus.sync) + } + + val withWriteBuffer = if(p.aggregationWidth != 0) new Area { + val buffer = new Area { + val stream = cmd.toEvent().m2sPipe() + val address = Reg(UInt(p.addressWidth bits)) + val length = Reg(UInt(pipelinedMemoryBusConfig.access.lengthWidth bits)) + val write = Reg(Bool) + val exclusive = Reg(Bool) + val data = Reg(Bits(p.memDataWidth bits)) + val mask = Reg(Bits(p.memDataWidth/8 bits)) init(0) + } + + val aggregationRange = log2Up(p.memDataWidth/8)-1 downto log2Up(p.cpuDataWidth/8) + val tagRange = p.addressWidth-1 downto aggregationRange.high+1 + val aggregationEnabled = Reg(Bool) + val aggregationCounter = Reg(UInt(p.aggregationWidth bits)) init(0) + val aggregationCounterFull = aggregationCounter === aggregationCounter.maxValue + val timer = Reg(UInt(log2Up(timeoutCycles)+1 bits)) init(0) + val timerFull = timer.msb + val hit = cmd.address(tagRange) === buffer.address(tagRange) + val cmdExclusive = if(p.withExclusive) cmd.exclusive else False + val canAggregate = cmd.valid && cmd.wr && !cmd.uncached && !cmdExclusive && !timerFull && !aggregationCounterFull && (!buffer.stream.valid || aggregationEnabled && hit) + val doFlush = cmd.valid && !canAggregate || timerFull || aggregationCounterFull || !aggregationEnabled +// val canAggregate = False +// val doFlush = True + val busCmdContext = Context() + val halt = False + + when(cmd.fire){ + aggregationCounter := aggregationCounter + 1 + } + when(buffer.stream.valid && !timerFull){ + timer := timer + 1 + } + when(bus.cmd.fire || !buffer.stream.valid){ + buffer.mask := 0 + aggregationCounter := 0 + timer := 0 + } + + buffer.stream.ready := (bus.cmd.ready && doFlush || canAggregate) && !halt + bus.cmd.valid := buffer.stream.valid && doFlush && !halt + bus.cmd.last := True + bus.cmd.opcode := (buffer.write ? 
B(Bmb.Cmd.Opcode.WRITE) | B(Bmb.Cmd.Opcode.READ)) + bus.cmd.address := buffer.address + bus.cmd.length := buffer.length + bus.cmd.data := buffer.data + bus.cmd.mask := buffer.mask + + if (p.withExclusive) bus.cmd.exclusive := buffer.exclusive + bus.cmd.context.removeAssignments() := B(busCmdContext) + if (!p.withWriteResponse) busCmdContext.isWrite := bus.cmd.isWrite + busCmdContext.rspCount := aggregationCounter + + val aggregationSel = cmd.address(aggregationRange) + when(cmd.fire){ + val dIn = cmd.data.subdivideIn(8 bits) + val dReg = buffer.data.subdivideIn(8 bits) + for(byteId <- 0 until p.memDataBytes){ + when(aggregationSel === byteId / p.cpuDataBytes && cmd.mask(byteId % p.cpuDataBytes)){ + dReg.write(byteId, dIn(byteId % p.cpuDataBytes)) + buffer.mask(byteId) := True + } + } + } + + when(cmd.fire){ + buffer.write := cmd.wr + buffer.address := cmd.address.resized + buffer.length := cmd.byteCountMinusOne + if (p.withExclusive) buffer.exclusive := cmd.exclusive + + when(cmd.wr && !cmd.uncached && !cmdExclusive){ + aggregationEnabled := True + buffer.address(aggregationRange.high downto 0) := 0 + buffer.length := p.memDataBytes-1 + } otherwise { + aggregationEnabled := False + } + } + + + val rspCtx = bus.rsp.context.as(Context()) + rsp.aggregated := rspCtx.rspCount + + val syncLogic = p.withInvalidate generate new Area{ + val cmdCtx = Stream(UInt(p.aggregationWidth bits)) + cmdCtx.valid := bus.cmd.fire && bus.cmd.isWrite + cmdCtx.payload := aggregationCounter + halt setWhen(!cmdCtx.ready) + + val syncCtx = cmdCtx.queue(syncPendingMax).s2mPipe().m2sPipe() //Assume latency of sync is at least 3 cycles + syncCtx.ready := bus.sync.fire + + sync.arbitrationFrom(bus.sync) + sync.aggregated := syncCtx.payload + } + } + + + rsp.valid := bus.rsp.valid + if(!p.withWriteResponse) rsp.valid clearWhen(bus.rsp.context(0)) + rsp.data := bus.rsp.data + rsp.error := bus.rsp.isError + rsp.last := bus.rsp.last + if(p.withExclusive) rsp.exclusive := bus.rsp.exclusive + bus.rsp.ready := True + + val invalidateLogic = p.withInvalidate generate new Area{ + val beatCountMinusOne = bus.inv.transferBeatCountMinusOne(p.bytePerLine) + val counter = Reg(UInt(widthOf(beatCountMinusOne) bits)) init(0) + + inv.valid := bus.inv.valid + inv.address := bus.inv.address + (counter << log2Up(p.bytePerLine)) + inv.enable := bus.inv.all + inv.last := counter === beatCountMinusOne + bus.inv.ready := inv.last && inv.ready + + if(widthOf(counter) != 0) when(inv.fire){ + counter := counter + 1 + when(inv.last){ + counter := 0 + } + } + + bus.ack.arbitrationFrom(ack.throwWhen(!ack.last)) + } + }.bus + +} + +object DataCacheExternalAmoStates extends SpinalEnum{ + val LR_CMD, LR_RSP, SC_CMD, SC_RSP = newElement(); +} + +//If external amo, mem rsp should stay +class DataCache(val p : DataCacheConfig, mmuParameter : MemoryTranslatorBusParameter) extends Component{ + import p._ + + val io = new Bundle{ + val cpu = slave(DataCacheCpuBus(p, mmuParameter)) + val mem = master(DataCacheMemBus(p)) + } + + val haltCpu = False + val lineWidth = bytePerLine*8 + val lineCount = cacheSize/bytePerLine + val wordWidth = cpuDataWidth + val wordWidthLog2 = log2Up(wordWidth) + val wordPerLine = lineWidth/wordWidth + val bytePerWord = wordWidth/8 + val wayLineCount = lineCount/wayCount + val wayLineLog2 = log2Up(wayLineCount) + val wayWordCount = wayLineCount * wordPerLine + val memWordPerLine = lineWidth/memDataWidth + val memTransactionPerLine = p.bytePerLine / (p.memDataWidth/8) + val bytePerMemWord = memDataWidth/8 + val wayMemWordCount = 
wayLineCount * memWordPerLine + + val tagRange = addressWidth-1 downto log2Up(wayLineCount*bytePerLine) + val lineRange = tagRange.low-1 downto log2Up(bytePerLine) + val cpuWordRange = log2Up(bytePerLine)-1 downto log2Up(bytePerWord) + val memWordRange = log2Up(bytePerLine)-1 downto log2Up(bytePerMemWord) + val hitRange = tagRange.high downto lineRange.low + val memWordToCpuWordRange = log2Up(bytePerMemWord)-1 downto log2Up(bytePerWord) + val cpuWordToRfWordRange = log2Up(bytePerWord)-1 downto log2Up(p.rfDataBytes) + + + class LineInfo() extends Bundle{ + val valid, error = Bool() + val address = UInt(tagRange.length bit) + } + + val tagsReadCmd = Flow(UInt(log2Up(wayLineCount) bits)) + val tagsInvReadCmd = withInvalidate generate Flow(UInt(log2Up(wayLineCount) bits)) + val tagsWriteCmd = Flow(new Bundle{ + val way = Bits(wayCount bits) + val address = UInt(log2Up(wayLineCount) bits) + val data = new LineInfo() + }) + + val tagsWriteLastCmd = RegNext(tagsWriteCmd) + + val dataReadCmd = Flow(UInt(log2Up(wayMemWordCount) bits)) + val dataWriteCmd = Flow(new Bundle{ + val way = Bits(wayCount bits) + val address = UInt(log2Up(wayMemWordCount) bits) + val data = Bits(memDataWidth bits) + val mask = Bits(memDataWidth/8 bits) + }) + + + val ways = for(i <- 0 until wayCount) yield new Area{ + val tags = Mem(new LineInfo(), wayLineCount) + val data = Mem(Bits(memDataWidth bit), wayMemWordCount) + + //Reads + val tagsReadRsp = asyncTagMemory match { + case false => tags.readSync(tagsReadCmd.payload, tagsReadCmd.valid && !io.cpu.memory.isStuck) + case true => tags.readAsync(RegNextWhen(tagsReadCmd.payload, io.cpu.execute.isValid && !io.cpu.memory.isStuck)) + } + val dataReadRspMem = data.readSync(dataReadCmd.payload, dataReadCmd.valid && !io.cpu.memory.isStuck) + val dataReadRspSel = if(mergeExecuteMemory) io.cpu.writeBack.address else io.cpu.memory.address + val dataReadRsp = dataReadRspMem.subdivideIn(cpuDataWidth bits).read(dataReadRspSel(memWordToCpuWordRange)) + + val tagsInvReadRsp = withInvalidate generate(asyncTagMemory match { + case false => tags.readSync(tagsInvReadCmd.payload, tagsInvReadCmd.valid) + case true => tags.readAsync(RegNextWhen(tagsInvReadCmd.payload, tagsInvReadCmd.valid)) + }) + + //Writes + when(tagsWriteCmd.valid && tagsWriteCmd.way(i)){ + tags.write(tagsWriteCmd.address, tagsWriteCmd.data) + } + when(dataWriteCmd.valid && dataWriteCmd.way(i)){ + data.write( + address = dataWriteCmd.address, + data = dataWriteCmd.data, + mask = dataWriteCmd.mask + ) + } + } + + + tagsReadCmd.valid := False + tagsReadCmd.payload.assignDontCare() + dataReadCmd.valid := False + dataReadCmd.payload.assignDontCare() + tagsWriteCmd.valid := False + tagsWriteCmd.payload.assignDontCare() + dataWriteCmd.valid := False + dataWriteCmd.payload.assignDontCare() + + when(io.cpu.execute.isValid && !io.cpu.memory.isStuck){ + tagsReadCmd.valid := True + dataReadCmd.valid := True + tagsReadCmd.payload := io.cpu.execute.address(lineRange) + dataReadCmd.payload := io.cpu.execute.address(lineRange.high downto memWordRange.low) + } + + def collisionProcess(readAddress : UInt, readMask : Bits): Bits ={ + val ret = Bits(wayCount bits) + val readAddressAligned = (readAddress >> log2Up(memDataWidth/cpuDataWidth)) + val dataWriteMaskAligned = dataWriteCmd.mask.subdivideIn(memDataWidth/cpuDataWidth slices).read(readAddress(log2Up(memDataWidth/cpuDataWidth)-1 downto 0)) + for(i <- 0 until wayCount){ + ret(i) := dataWriteCmd.valid && dataWriteCmd.way(i) && dataWriteCmd.address === readAddressAligned && (readMask & 
dataWriteMaskAligned) =/= 0 + } + ret + } + + + io.cpu.execute.haltIt := False + + val rspSync = True + val rspLast = True + val memCmdSent = RegInit(False) setWhen (io.mem.cmd.fire) clearWhen (!io.cpu.writeBack.isStuck) + val pending = withExclusive generate new Area{ + val counter = Reg(UInt(log2Up(pendingMax) + 1 bits)) init(0) + val counterNext = counter + U(io.mem.cmd.fire && io.mem.cmd.last) - ((io.mem.rsp.valid && io.mem.rsp.last) ? (io.mem.rsp.aggregated +^ 1) | 0) + counter := counterNext + + val done = RegNext(counterNext === 0) + val full = RegNext(counter.msb) //Has margin + val last = RegNext(counterNext === 1) //Equivalent to counter === 1 but pipelined + + if(!withInvalidate) { + io.cpu.execute.haltIt setWhen(full) + } + + rspSync clearWhen (!last || !memCmdSent) + rspLast clearWhen (!last) + } + + val sync = withInvalidate generate new Area{ + io.mem.sync.ready := True + val syncCount = io.mem.sync.aggregated +^ 1 + val syncContext = new Area{ + val history = Mem(Bool, pendingMax) + val wPtr, rPtr = Reg(UInt(log2Up(pendingMax)+1 bits)) init(0) + when(io.mem.cmd.fire && io.mem.cmd.wr){ + history.write(wPtr.resized, io.mem.cmd.uncached) + wPtr := wPtr + 1 + } + + when(io.mem.sync.fire){ + rPtr := rPtr + syncCount + } + val uncached = history.readAsync(rPtr.resized) + val full = RegNext(wPtr - rPtr >= pendingMax-1) + io.cpu.execute.haltIt setWhen(full) + } + + def pending(inc : Bool, dec : Bool) = new Area { + val pendingSync = Reg(UInt(log2Up(pendingMax) + 1 bits)) init(0) + val pendingSyncNext = pendingSync + U(io.mem.cmd.fire && io.mem.cmd.wr && inc) - ((io.mem.sync.fire && dec) ? syncCount | 0) + pendingSync := pendingSyncNext + } + + val writeCached = pending(inc = !io.mem.cmd.uncached, dec = !syncContext.uncached) + val writeUncached = pending(inc = io.mem.cmd.uncached, dec = syncContext.uncached) + + def track(load : Bool, uncached : Boolean) = new Area { + val counter = Reg(UInt(log2Up(pendingMax) + 1 bits)) init(0) + counter := counter - ((io.mem.sync.fire && counter =/= 0 && (if(uncached) syncContext.uncached else !syncContext.uncached)) ? 
syncCount | 0) + when(load){ counter := (if(uncached) writeUncached.pendingSyncNext else writeCached.pendingSyncNext) } + + val busy = counter =/= 0 + } + + val w2w = track(load = io.cpu.writeBack.fence.PW && io.cpu.writeBack.fence.SW, uncached = false) + val w2r = track(load = io.cpu.writeBack.fence.PW && io.cpu.writeBack.fence.SR, uncached = false) + val w2i = track(load = io.cpu.writeBack.fence.PW && io.cpu.writeBack.fence.SI, uncached = false) + val w2o = track(load = io.cpu.writeBack.fence.PW && io.cpu.writeBack.fence.SO, uncached = false) + val o2w = track(load = io.cpu.writeBack.fence.PO && io.cpu.writeBack.fence.SW, uncached = true) + val o2r = track(load = io.cpu.writeBack.fence.PO && io.cpu.writeBack.fence.SR, uncached = true) + //Assume o2i and o2o are ordered by the interconnect + + val notTotalyConsistent = w2w.busy || w2r.busy || w2i.busy || w2o.busy || o2w.busy || o2r.busy + } + + + + + val stage0 = new Area{ +// val mask = io.cpu.execute.size.mux ( +// U(0) -> B"0001", +// U(1) -> B"0011", +// default -> B"1111" +// ) |<< io.cpu.execute.address(1 downto 0) + + val mask = io.cpu.execute.size.muxListDc((0 to log2Up(p.cpuDataBytes)).map(i => U(i) -> B((1 << (1 << i)) -1, p.cpuDataBytes bits))) |<< io.cpu.execute.address(log2Up(p.cpuDataBytes)-1 downto 0) + + + val dataColisions = collisionProcess(io.cpu.execute.address(lineRange.high downto cpuWordRange.low), mask) + val wayInvalidate = B(0, wayCount bits) //Used if invalidate enabled + + val isAmo = if(withAmo) io.cpu.execute.isAmo else False + } + + val stageA = new Area{ + def stagePipe[T <: Data](that : T) = if(mergeExecuteMemory) CombInit(that) else RegNextWhen(that, !io.cpu.memory.isStuck) + val request = stagePipe(io.cpu.execute.args) + val mask = stagePipe(stage0.mask) + io.cpu.memory.isWrite := request.wr + + val isAmo = if(withAmo) request.isAmo else False + val isLrsc = if(withAmo) request.isLrsc else False + val consistancyCheck = (withInvalidate || withWriteResponse) generate new Area { + val hazard = False + val w = sync.w2w.busy || sync.o2w.busy + val r = stagePipe(sync.w2r.busy || sync.o2r.busy) || sync.w2r.busy || sync.o2r.busy // As it use the cache, need to check against the execute stage status too + val o = CombInit(sync.w2o.busy) + val i = CombInit(sync.w2i.busy) + + val s = io.cpu.memory.mmuRsp.isIoAccess ? o | w + val l = io.cpu.memory.mmuRsp.isIoAccess ? i | r + + when(isAmo? (s || l) | (request.wr ? s | l)){ + hazard := True + } + when(request.totalyConsistent && (sync.notTotalyConsistent || io.cpu.writeBack.isValid && io.cpu.writeBack.isWrite)){ + hazard := True + } + } + + val wayHits = earlyWaysHits generate Bits(wayCount bits) + val indirectTlbHitGen = (earlyWaysHits && !directTlbHit) generate new Area { + wayHits := B(ways.map(way => (io.cpu.memory.mmuRsp.physicalAddress(tagRange) === way.tagsReadRsp.address && way.tagsReadRsp.valid))) + } + val directTlbHitGen = (earlyWaysHits && directTlbHit) generate new Area { + val wayTlbHits = for (way <- ways) yield for (tlb <- io.cpu.memory.mmuRsp.ways) yield { + way.tagsReadRsp.address === tlb.physical(tagRange) && tlb.sel + } + val translatedHits = B(wayTlbHits.map(_.orR)) + val bypassHits = B(ways.map(_.tagsReadRsp.address === io.cpu.memory.address(tagRange))) + wayHits := (io.cpu.memory.mmuRsp.bypassTranslation ? bypassHits | translatedHits) & B(ways.map(_.tagsReadRsp.valid)) + } + + val dataMux = earlyDataMux generate MuxOH(wayHits, ways.map(_.dataReadRsp)) + val wayInvalidate = stagePipe(stage0. 
wayInvalidate) + val dataColisions = if(mergeExecuteMemory){ + stagePipe(stage0.dataColisions) + } else { + //Assume the writeback stage will never be unstall memory acces while memory stage is stalled + stagePipe(stage0.dataColisions) | collisionProcess(io.cpu.memory.address(lineRange.high downto cpuWordRange.low), mask) + } + } + + val stageB = new Area { + def stagePipe[T <: Data](that : T) = RegNextWhen(that, !io.cpu.writeBack.isStuck) + def ramPipe[T <: Data](that : T) = if(mergeExecuteMemory) CombInit(that) else RegNextWhen(that, !io.cpu.writeBack.isStuck) + val request = RegNextWhen(stageA.request, !io.cpu.writeBack.isStuck) + val mmuRspFreeze = False + val mmuRsp = RegNextWhen(io.cpu.memory.mmuRsp, !io.cpu.writeBack.isStuck && !mmuRspFreeze) + val tagsReadRsp = ways.map(w => ramPipe(w.tagsReadRsp)) + val dataReadRsp = !earlyDataMux generate ways.map(w => ramPipe(w.dataReadRsp)) + val wayInvalidate = stagePipe(stageA. wayInvalidate) + val consistancyHazard = if(stageA.consistancyCheck != null) stagePipe(stageA.consistancyCheck.hazard) else False + val dataColisions = stagePipe(stageA.dataColisions) +// val unaligned = if(!catchUnaligned) False else stagePipe((stageA.request.size === 2 && io.cpu.memory.address(1 downto 0) =/= 0) || (stageA.request.size === 1 && io.cpu.memory.address(0 downto 0) =/= 0)) + val unaligned = if(!catchUnaligned) False else stagePipe((1 to log2Up(p.cpuDataBytes)).map(i => stageA.request.size === i && io.cpu.memory.address(i-1 downto 0) =/= 0).orR) + val waysHitsBeforeInvalidate = if(earlyWaysHits) stagePipe(B(stageA.wayHits)) else B(tagsReadRsp.map(tag => mmuRsp.physicalAddress(tagRange) === tag.address && tag.valid).asBits()) + val waysHits = waysHitsBeforeInvalidate & ~wayInvalidate + val waysHit = waysHits.orR + val dataMux = if(earlyDataMux) stagePipe(stageA.dataMux) else MuxOH(waysHits, dataReadRsp) + val mask = stagePipe(stageA.mask) + + //Loader interface + val loaderValid = False + + val ioMemRspMuxed = io.mem.rsp.data.subdivideIn(cpuDataWidth bits).read(io.cpu.writeBack.address(memWordToCpuWordRange)) + + io.cpu.writeBack.haltIt := True + + //Evict the cache after reset logics + val flusher = new Area { + val waitDone = RegInit(False) clearWhen(io.cpu.flush.ready) + val hold = False + val counter = Reg(UInt(lineRange.size + 1 bits)) init(0) + when(!counter.msb) { + tagsWriteCmd.valid := True + tagsWriteCmd.address := counter.resized + tagsWriteCmd.way.setAll() + tagsWriteCmd.data.valid := False + io.cpu.execute.haltIt := True + when(!hold) { + counter := counter + 1 + when(io.cpu.flush.singleLine){ + counter.msb := True + } + } + } + + io.cpu.flush.ready := waitDone && counter.msb + + val start = RegInit(True) //Used to relax timings + start := !waitDone && !start && io.cpu.flush.valid && !io.cpu.execute.isValid && !io.cpu.memory.isValid && !io.cpu.writeBack.isValid && !io.cpu.redo + + when(start){ + waitDone := True + counter := 0 + when(io.cpu.flush.singleLine){ + counter := U"0" @@ io.cpu.flush.lineId + } + } + } + + val lrSc = withInternalLrSc generate new Area{ + val reserved = RegInit(False) + when(io.cpu.writeBack.isValid && io.cpu.writeBack.isFiring){ + reserved setWhen(request.isLrsc) + reserved clearWhen(request.wr) + } + } + + val isAmo = if(withAmo) request.isAmo else False + val isAmoCached = if(withInternalAmo) isAmo else False + val isExternalLsrc = if(withExternalLrSc) request.isLrsc else False + val isExternalAmo = if(withExternalAmo) request.isAmo else False + + val requestDataBypass = CombInit(io.cpu.writeBack.storeData) + import 
DataCacheExternalAmoStates._ + val amo = withAmo generate new Area{ + def rf = io.cpu.writeBack.storeData(p.rfDataWidth-1 downto 0) + def memLarger = if(withInternalAmo) dataMux else ioMemRspMuxed + def mem = memLarger.subdivideIn(rfDataWidth bits).read(io.cpu.writeBack.address(cpuWordToRfWordRange)) + val compare = request.amoCtrl.alu.msb + val unsigned = request.amoCtrl.alu(2 downto 1) === B"11" + val addSub = (rf.asSInt + Mux(compare, ~mem, mem).asSInt + Mux(compare, S(1), S(0))).asBits + val less = Mux(rf.msb === mem.msb, addSub.msb, Mux(unsigned, mem.msb, rf.msb)) + val selectRf = request.amoCtrl.swap ? True | (request.amoCtrl.alu.lsb ^ less) + + val result = (request.amoCtrl.alu | (request.amoCtrl.swap ## B"00")).mux( + B"000" -> addSub, + B"001" -> (rf ^ mem), + B"010" -> (rf | mem), + B"011" -> (rf & mem), + default -> (selectRf ? rf | mem) + ) + // val resultRegValid = RegNext(True) clearWhen(!io.cpu.writeBack.isStuck) + // val resultReg = RegNext(result) + val resultReg = Reg(Bits(32 bits)) + + val internal = withInternalAmo generate new Area{ + val resultRegValid = RegNext(io.cpu.writeBack.isStuck) + resultReg := result + } + val external = !withInternalAmo generate new Area{ + val state = RegInit(LR_CMD) + } + } + + + val cpuWriteToCache = False + when(cpuWriteToCache){ + dataWriteCmd.valid setWhen(request.wr && waysHit) + dataWriteCmd.address := mmuRsp.physicalAddress(lineRange.high downto memWordRange.low) + dataWriteCmd.data.subdivideIn(cpuDataWidth bits).foreach(_ := requestDataBypass) + dataWriteCmd.mask := 0 + dataWriteCmd.mask.subdivideIn(cpuDataWidth/8 bits).write(io.cpu.writeBack.address(memWordToCpuWordRange), mask) + dataWriteCmd.way := waysHits + } + + val badPermissions = (!mmuRsp.allowWrite && request.wr) || (!mmuRsp.allowRead && (!request.wr || isAmo)) + val loadStoreFault = io.cpu.writeBack.isValid && (mmuRsp.exception || badPermissions) + + io.cpu.redo := False + io.cpu.writeBack.accessError := False + io.cpu.writeBack.mmuException := loadStoreFault && (if(catchIllegal) mmuRsp.isPaging else False) + io.cpu.writeBack.unalignedAccess := io.cpu.writeBack.isValid && unaligned + io.cpu.writeBack.isWrite := request.wr + + + io.mem.cmd.valid := False + io.mem.cmd.address := mmuRsp.physicalAddress + io.mem.cmd.last := True + io.mem.cmd.wr := request.wr + io.mem.cmd.mask := mask + io.mem.cmd.data := requestDataBypass + io.mem.cmd.uncached := mmuRsp.isIoAccess + io.mem.cmd.size := request.size.resized + if(withExternalLrSc) io.mem.cmd.exclusive := request.isLrsc || isAmo + + + val bypassCache = mmuRsp.isIoAccess || isExternalLsrc || isExternalAmo + + io.cpu.writeBack.keepMemRspData := False + when(io.cpu.writeBack.isValid) { + when(isExternalAmo){ + if(withExternalAmo) switch(amo.external.state){ + is(LR_CMD){ + io.mem.cmd.valid := True + io.mem.cmd.wr := False + when(io.mem.cmd.ready) { + amo.external.state := LR_RSP + } + } + is(LR_RSP){ + when(io.mem.rsp.valid && pending.last) { + amo.external.state := SC_CMD + amo.resultReg := amo.result + } + } + is(SC_CMD){ + io.mem.cmd.valid := True + when(io.mem.cmd.ready) { + amo.external.state := SC_RSP + } + } + is(SC_RSP){ + io.cpu.writeBack.keepMemRspData := True + when(io.mem.rsp.valid) { + amo.external.state := LR_CMD + when(io.mem.rsp.exclusive){ //Success + cpuWriteToCache := True + io.cpu.writeBack.haltIt := False + } + } + } + } + } elsewhen(mmuRsp.isIoAccess || isExternalLsrc) { + val waitResponse = !request.wr + if(withExternalLrSc) waitResponse setWhen(request.isLrsc) + + 
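          //Uncached/IO path: bypass the cache arrays and issue a single memory command.
          //A write releases the CPU once io.mem.cmd is accepted; a read (or an
          //externally handled LR/SC) holds writeBack until the memory response arrives: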
io.cpu.writeBack.haltIt.clearWhen(waitResponse ? (io.mem.rsp.valid && rspSync) | io.mem.cmd.ready) + + io.mem.cmd.valid := !memCmdSent + + if(withInternalLrSc) when(request.isLrsc && !lrSc.reserved){ + io.mem.cmd.valid := False + io.cpu.writeBack.haltIt := False + } + } otherwise { + when(waysHit || request.wr && !isAmoCached) { //Do not require a cache refill ? + cpuWriteToCache := True + + //Write through + io.mem.cmd.valid setWhen(request.wr) + io.cpu.writeBack.haltIt clearWhen(!request.wr || io.mem.cmd.ready) + + if(withInternalAmo) when(isAmo){ + when(!amo.internal.resultRegValid) { + io.mem.cmd.valid := False + dataWriteCmd.valid := False + io.cpu.writeBack.haltIt := True + } + } + + //On write to read dataColisions + when((!request.wr || isAmoCached) && (dataColisions & waysHits) =/= 0){ + io.cpu.redo := True + if(withAmo) io.mem.cmd.valid := False + } + + if(withInternalLrSc) when(request.isLrsc && !lrSc.reserved){ + io.mem.cmd.valid := False + dataWriteCmd.valid := False + io.cpu.writeBack.haltIt := False + } + } otherwise { //Do refill + //Emit cmd + io.mem.cmd.valid setWhen(!memCmdSent) + io.mem.cmd.wr := False + io.mem.cmd.address(0, lineRange.low bits) := 0 + io.mem.cmd.size := log2Up(p.bytePerLine) + + loaderValid setWhen(io.mem.cmd.ready) + } + } + } + + when(bypassCache){ + io.cpu.writeBack.data := ioMemRspMuxed + def isLast = if(pending != null) pending.last else True + if(catchAccessError) io.cpu.writeBack.accessError := !request.wr && isLast && io.mem.rsp.valid && io.mem.rsp.error + } otherwise { + io.cpu.writeBack.data := dataMux + if(catchAccessError) io.cpu.writeBack.accessError := (waysHits & B(tagsReadRsp.map(_.error))) =/= 0 || (loadStoreFault && !mmuRsp.isPaging) + } + + if(withLrSc) { + val success = if(withInternalLrSc)lrSc.reserved else io.mem.rsp.exclusive + io.cpu.writeBack.exclusiveOk := success + when(request.isLrsc && request.wr){ + // io.cpu.writeBack.data := B(!success).resized + if(withExternalLrSc) when(io.cpu.writeBack.isValid && io.mem.rsp.valid && rspSync && success && waysHit){ + cpuWriteToCache := True + } + } + } + if(withAmo) when(request.isAmo){ + requestDataBypass.subdivideIn(p.rfDataWidth bits).foreach(_ := amo.resultReg) + } + + //remove side effects on exceptions + when(consistancyHazard || mmuRsp.refilling || io.cpu.writeBack.accessError || io.cpu.writeBack.mmuException || io.cpu.writeBack.unalignedAccess){ + io.mem.cmd.valid := False + tagsWriteCmd.valid := False + dataWriteCmd.valid := False + loaderValid := False + io.cpu.writeBack.haltIt := False + if(withInternalLrSc) lrSc.reserved := lrSc.reserved + if(withExternalAmo) amo.external.state := LR_CMD + } + io.cpu.redo setWhen(io.cpu.writeBack.isValid && (mmuRsp.refilling || consistancyHazard)) + + assert(!(io.cpu.writeBack.isValid && !io.cpu.writeBack.haltIt && io.cpu.writeBack.isStuck), "writeBack stuck by another plugin is not allowed", ERROR) + } + + val loader = new Area{ + val valid = RegInit(False) setWhen(stageB.loaderValid) + val baseAddress = stageB.mmuRsp.physicalAddress + + val counter = Counter(memTransactionPerLine) + val waysAllocator = Reg(Bits(wayCount bits)) init(1) + val error = RegInit(False) + val kill = False + val killReg = RegInit(False) setWhen(kill) + + when(valid && io.mem.rsp.valid && rspLast){ + dataWriteCmd.valid := True + dataWriteCmd.address := baseAddress(lineRange) @@ counter + dataWriteCmd.data := io.mem.rsp.data + dataWriteCmd.mask.setAll() + dataWriteCmd.way := waysAllocator + error := error | io.mem.rsp.error + counter.increment() + } + + val done = 
CombInit(counter.willOverflow) + if(withInvalidate) done setWhen(valid && pending.counter === 0) //Used to solve invalidate write request at the same time + + when(done){ + valid := False + + //Update tags + tagsWriteCmd.valid := True + tagsWriteCmd.address := baseAddress(lineRange) + tagsWriteCmd.data.valid := !(kill || killReg) + tagsWriteCmd.data.address := baseAddress(tagRange) + tagsWriteCmd.data.error := error || (io.mem.rsp.valid && io.mem.rsp.error) + tagsWriteCmd.way := waysAllocator + + error := False + killReg := False + } + + when(!valid){ + waysAllocator := (waysAllocator ## waysAllocator.msb).resized + } + + io.cpu.redo setWhen(valid.rise()) + io.cpu.execute.refilling := valid + + stageB.mmuRspFreeze setWhen(stageB.loaderValid || valid) + } + + val invalidate = withInvalidate generate new Area{ + val s0 = new Area{ + val input = io.mem.inv + tagsInvReadCmd.valid := input.fire + tagsInvReadCmd.payload := input.address(lineRange) + + val loaderTagHit = input.address(tagRange) === loader.baseAddress(tagRange) + val loaderLineHit = input.address(lineRange) === loader.baseAddress(lineRange) + when(input.valid && input.enable && loader.valid && loaderLineHit && loaderTagHit){ + loader.kill := True + } + } + val s1 = new Area{ + val input = s0.input.stage() + val loaderValid = RegNextWhen(loader.valid, s0.input.ready) + val loaderWay = RegNextWhen(loader.waysAllocator, s0.input.ready) + val loaderTagHit = RegNextWhen(s0.loaderTagHit, s0.input.ready) + val loaderLineHit = RegNextWhen(s0.loaderLineHit, s0.input.ready) + val invalidations = Bits(wayCount bits) + + var wayHits = B(ways.map(way => (input.address(tagRange) === way.tagsInvReadRsp.address && way.tagsInvReadRsp.valid))) & ~invalidations + + //Handle invalider read during loader write hazard + when(loaderValid && loaderLineHit && !loaderTagHit){ + wayHits \= wayHits & ~loaderWay + } + } + val s2 = new Area{ + val input = s1.input.stage() + val wayHits = RegNextWhen(s1.wayHits, s1.input.ready) + val wayHit = wayHits.orR + + when(input.valid && input.enable) { + //Manage invalidate write during cpu read hazard + when(input.address(lineRange) === io.cpu.execute.address(lineRange)) { + stage0.wayInvalidate := wayHits + } + + //Invalidate cache tag + when(wayHit) { + tagsWriteCmd.valid := True + stageB.flusher.hold := True + tagsWriteCmd.address := input.address(lineRange) + tagsWriteCmd.data.valid := False + tagsWriteCmd.way := wayHits + loader.done := False //Hold loader tags write + } + } + io.mem.ack.arbitrationFrom(input) + io.mem.ack.hit := wayHit + io.mem.ack.last := input.last + + //Manage invalidation read during write hazard + s1.invalidations := RegNextWhen((input.valid && input.enable && input.address(lineRange) === s0.input.address(lineRange)) ? 
wayHits | 0, s0.input.ready) + } + } +} diff --git a/VexRiscv/src/main/scala/vexriscv/ip/InstructionCache.scala b/VexRiscv/src/main/scala/vexriscv/ip/InstructionCache.scala new file mode 100644 index 0000000..e09712c --- /dev/null +++ b/VexRiscv/src/main/scala/vexriscv/ip/InstructionCache.scala @@ -0,0 +1,487 @@ +package vexriscv.ip + +import vexriscv._ +import spinal.core._ +import spinal.lib._ +import spinal.lib.bus.amba4.axi.{Axi4Config, Axi4ReadOnly} +import spinal.lib.bus.avalon.{AvalonMM, AvalonMMConfig} +import spinal.lib.bus.bmb.{Bmb, BmbAccessParameter, BmbParameter, BmbSourceParameter} +import spinal.lib.bus.wishbone.{Wishbone, WishboneConfig} +import spinal.lib.bus.simple._ +import vexriscv.plugin.{IBusSimpleBus, IBusSimplePlugin} + + +case class InstructionCacheConfig( cacheSize : Int, + bytePerLine : Int, + wayCount : Int, + addressWidth : Int, + cpuDataWidth : Int, + memDataWidth : Int, + catchIllegalAccess : Boolean, + catchAccessFault : Boolean, + asyncTagMemory : Boolean, + twoCycleCache : Boolean = true, + twoCycleRam : Boolean = false, + twoCycleRamInnerMux : Boolean = false, + preResetFlush : Boolean = false, + bypassGen : Boolean = false, + reducedBankWidth : Boolean = false){ + + assert(!(twoCycleRam && !twoCycleCache)) + + def burstSize = bytePerLine*8/memDataWidth + def catchSomething = catchAccessFault || catchIllegalAccess + + def getAxi4Config() = Axi4Config( + addressWidth = addressWidth, + dataWidth = memDataWidth, + useId = false, + useRegion = false, + useLock = false, + useQos = false, + useSize = false + ) + + def getAvalonConfig() = AvalonMMConfig.bursted( + addressWidth = addressWidth, + dataWidth = memDataWidth, + burstCountWidth = log2Up(burstSize + 1)).getReadOnlyConfig.copy( + useResponse = true, + constantBurstBehavior = true + ) + + def getPipelinedMemoryBusConfig() = PipelinedMemoryBusConfig( + addressWidth = 32, + dataWidth = 32 + ) + + def getWishboneConfig() = WishboneConfig( + addressWidth = 32-log2Up(memDataWidth/8), + dataWidth = memDataWidth, + selWidth = memDataWidth/8, + useSTALL = false, + useLOCK = false, + useERR = true, + useRTY = false, + tgaWidth = 0, + tgcWidth = 0, + tgdWidth = 0, + useBTE = true, + useCTI = true + ) + + def getBmbParameter() = BmbParameter( + BmbAccessParameter( + addressWidth = 32, + dataWidth = memDataWidth + ).addSources(1, BmbSourceParameter( + lengthWidth = log2Up(this.bytePerLine), + contextWidth = 0, + canWrite = false, + alignment = BmbParameter.BurstAlignement.LENGTH, + maximumPendingTransaction = 1 + )) + ) +} + + + +case class InstructionCacheCpuPrefetch(p : InstructionCacheConfig) extends Bundle with IMasterSlave{ + val isValid = Bool + val haltIt = Bool + val pc = UInt(p.addressWidth bit) + + override def asMaster(): Unit = { + out(isValid, pc) + in(haltIt) + } +} + +trait InstructionCacheCommons{ + val isValid : Bool + val isStuck : Bool + val pc : UInt + val physicalAddress : UInt + val data : Bits + val cacheMiss, error, mmuRefilling, mmuException, isUser : Bool +} + +case class InstructionCacheCpuFetch(p : InstructionCacheConfig, mmuParameter : MemoryTranslatorBusParameter) extends Bundle with IMasterSlave with InstructionCacheCommons { + val isValid = Bool() + val isStuck = Bool() + val isRemoved = Bool() + val pc = UInt(p.addressWidth bits) + val data = Bits(p.cpuDataWidth bits) + val dataBypassValid = p.bypassGen generate Bool() + val dataBypass = p.bypassGen generate Bits(p.cpuDataWidth bits) + val mmuRsp = MemoryTranslatorRsp(mmuParameter) + val physicalAddress = UInt(p.addressWidth bits) + 
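  //With a single-cycle cache (twoCycleCache == false) these status flags are resolved
  //here in the fetch stage; otherwise the decode-stage bundle below carries them.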
val cacheMiss, error, mmuRefilling, mmuException, isUser = ifGen(!p.twoCycleCache)(Bool) + + override def asMaster(): Unit = { + out(isValid, isStuck, isRemoved, pc) + inWithNull(error,mmuRefilling,mmuException,data, cacheMiss,physicalAddress) + outWithNull(isUser, dataBypass, dataBypassValid) + out(mmuRsp) + } +} + + +case class InstructionCacheCpuDecode(p : InstructionCacheConfig) extends Bundle with IMasterSlave with InstructionCacheCommons { + val isValid = Bool + val isStuck = Bool + val pc = UInt(p.addressWidth bits) + val physicalAddress = UInt(p.addressWidth bits) + val data = Bits(p.cpuDataWidth bits) + val cacheMiss, error, mmuRefilling, mmuException, isUser = ifGen(p.twoCycleCache)(Bool) + + override def asMaster(): Unit = { + out(isValid, isStuck, pc) + outWithNull(isUser) + inWithNull(error, mmuRefilling, mmuException,data, cacheMiss, physicalAddress) + } +} + +case class InstructionCacheCpuBus(p : InstructionCacheConfig, mmuParameter : MemoryTranslatorBusParameter) extends Bundle with IMasterSlave{ + val prefetch = InstructionCacheCpuPrefetch(p) + val fetch = InstructionCacheCpuFetch(p, mmuParameter) + val decode = InstructionCacheCpuDecode(p) + val fill = Flow(UInt(p.addressWidth bits)) + + override def asMaster(): Unit = { + master(prefetch, fetch, decode, fill) + } +} + +case class InstructionCacheMemCmd(p : InstructionCacheConfig) extends Bundle{ + val address = UInt(p.addressWidth bit) + val size = UInt(log2Up(log2Up(p.bytePerLine) + 1) bits) +} + +case class InstructionCacheMemRsp(p : InstructionCacheConfig) extends Bundle{ + val data = Bits(p.memDataWidth bit) + val error = Bool +} + +case class InstructionCacheMemBus(p : InstructionCacheConfig) extends Bundle with IMasterSlave{ + val cmd = Stream (InstructionCacheMemCmd(p)) + val rsp = Flow (InstructionCacheMemRsp(p)) + + override def asMaster(): Unit = { + master(cmd) + slave(rsp) + } + + def toAxi4ReadOnly(): Axi4ReadOnly = { + val axiConfig = p.getAxi4Config() + val mm = Axi4ReadOnly(axiConfig) + + mm.readCmd.valid := cmd.valid + mm.readCmd.len := p.burstSize-1 + mm.readCmd.addr := cmd.address + mm.readCmd.prot := "110" + mm.readCmd.cache := "1111" + mm.readCmd.setBurstINCR() + cmd.ready := mm.readCmd.ready + rsp.valid := mm.readRsp.valid + rsp.data := mm.readRsp.data + rsp.error := !mm.readRsp.isOKAY() + mm.readRsp.ready := True + mm + } + + def toAvalon(): AvalonMM = { + val avalonConfig = p.getAvalonConfig() + val mm = AvalonMM(avalonConfig) + mm.read := cmd.valid + mm.burstCount := U(p.burstSize) + mm.address := cmd.address + cmd.ready := mm.waitRequestn + rsp.valid := mm.readDataValid + rsp.data := mm.readData + rsp.error := mm.response =/= AvalonMM.Response.OKAY + mm + } + + + def toPipelinedMemoryBus(): PipelinedMemoryBus = { + val pipelinedMemoryBusConfig = p.getPipelinedMemoryBusConfig() + val bus = PipelinedMemoryBus(pipelinedMemoryBusConfig) + val counter = Counter(p.burstSize, bus.cmd.fire) + bus.cmd.valid := cmd.valid + bus.cmd.address := cmd.address(31 downto widthOf(counter.value) + 2) @@ counter @@ U"00" + bus.cmd.write := False + bus.cmd.mask.assignDontCare() + bus.cmd.data.assignDontCare() + cmd.ready := counter.willOverflow + rsp.valid := bus.rsp.valid + rsp.data := bus.rsp.payload.data + rsp.error := False + bus + } + + + def toWishbone(): Wishbone = { + val wishboneConfig = p.getWishboneConfig() + val bus = Wishbone(wishboneConfig) + val counter = Reg(UInt(log2Up(p.burstSize) bits)) init(0) + val pending = counter =/= 0 + val lastCycle = counter === counter.maxValue + + bus.ADR := (cmd.address 
>> widthOf(counter) + log2Up(p.memDataWidth/8)) @@ counter + bus.CTI := lastCycle ? B"111" | B"010" + bus.BTE := "00" + bus.SEL.setAll() + bus.WE := False + bus.DAT_MOSI.assignDontCare() + bus.CYC := False + bus.STB := False + when(cmd.valid || pending){ + bus.CYC := True + bus.STB := True + when(bus.ACK){ + counter := counter + 1 + } + } + + cmd.ready := cmd.valid && bus.ACK + rsp.valid := RegNext(bus.CYC && bus.ACK) init(False) + rsp.data := RegNext(bus.DAT_MISO) + rsp.error := False //TODO + bus + } + + def toBmb() : Bmb = { + val busParameter = p.getBmbParameter + val bus = Bmb(busParameter).setCompositeName(this,"toBmb", true) + bus.cmd.arbitrationFrom(cmd) + bus.cmd.opcode := Bmb.Cmd.Opcode.READ + bus.cmd.address := cmd.address.resized + bus.cmd.length := p.bytePerLine - 1 + bus.cmd.last := True + rsp.valid := bus.rsp.valid + rsp.data := bus.rsp.data + rsp.error := bus.rsp.isError + bus.rsp.ready := True + bus + } +} + + +case class InstructionCacheFlushBus() extends Bundle with IMasterSlave{ + val cmd = Event + val rsp = Bool + + override def asMaster(): Unit = { + master(cmd) + in(rsp) + } +} + +class InstructionCache(p : InstructionCacheConfig, mmuParameter : MemoryTranslatorBusParameter) extends Component{ + import p._ + val io = new Bundle{ + val flush = in Bool() + val cpu = slave(InstructionCacheCpuBus(p, mmuParameter)) + val mem = master(InstructionCacheMemBus(p)) + } + + val lineWidth = bytePerLine*8 + val lineCount = cacheSize/bytePerLine + val cpuWordWidth = cpuDataWidth + val memWordPerLine = lineWidth/memDataWidth + val bytePerCpuWord = cpuWordWidth/8 + val wayLineCount = lineCount/wayCount + + val tagRange = addressWidth-1 downto log2Up(wayLineCount*bytePerLine) + val lineRange = tagRange.low-1 downto log2Up(bytePerLine) + + case class LineTag() extends Bundle{ + val valid = Bool + val error = Bool + val address = UInt(tagRange.length bit) + } + + val bankCount = wayCount + val bankWidth = if(!reducedBankWidth) memDataWidth else Math.max(cpuDataWidth, memDataWidth/wayCount) + val bankByteSize = cacheSize/bankCount + val bankWordCount = bankByteSize*8/bankWidth + val bankWordToCpuWordRange = log2Up(bankWidth/8)-1 downto log2Up(bytePerCpuWord) + val memToBankRatio = bankWidth*bankCount / memDataWidth + + val banks = Seq.fill(bankCount)(Mem(Bits(bankWidth bits), bankWordCount)) + + val ways = Seq.fill(wayCount)(new Area{ + val tags = Mem(LineTag(),wayLineCount) + + if(preResetFlush){ + tags.initBigInt(List.fill(wayLineCount)(BigInt(0))) + } + }) + + + val lineLoader = new Area{ + val fire = False + val valid = RegInit(False) clearWhen(fire) + val address = KeepAttribute(Reg(UInt(addressWidth bits))) + val hadError = RegInit(False) clearWhen(fire) + val flushPending = RegInit(True) + + when(io.cpu.fill.valid){ + valid := True + address := io.cpu.fill.payload + } + + io.cpu.prefetch.haltIt := valid || flushPending + + val flushCounter = Reg(UInt(log2Up(wayLineCount) + 1 bit)) + when(!flushCounter.msb){ + io.cpu.prefetch.haltIt := True + flushCounter := flushCounter + 1 + } + when(!RegNext(flushCounter.msb)){ + io.cpu.prefetch.haltIt := True + } + + when(io.flush){ + io.cpu.prefetch.haltIt := True + flushPending := True + } + + when(flushPending && !(valid || io.cpu.fetch.isValid) ){ + flushCounter := 0 + flushPending := False + } + + + + val cmdSent = RegInit(False) setWhen(io.mem.cmd.fire) clearWhen(fire) + io.mem.cmd.valid := valid && !cmdSent + io.mem.cmd.address := address(tagRange.high downto lineRange.low) @@ U(0,lineRange.low bit) + io.mem.cmd.size := 
log2Up(p.bytePerLine) + + val wayToAllocate = Counter(wayCount, !valid) + val wordIndex = KeepAttribute(Reg(UInt(log2Up(memWordPerLine) bits)) init(0)) + + + val write = new Area{ + val tag = ways.map(_.tags.writePort) + val data = banks.map(_.writePort) + } + + for(wayId <- 0 until wayCount){ + val wayHit = wayToAllocate === wayId + val tag = write.tag(wayId) + tag.valid := ((wayHit && fire) || !flushCounter.msb) + tag.address := (flushCounter.msb ? address(lineRange) | flushCounter(flushCounter.high-1 downto 0)) + tag.data.valid := flushCounter.msb + tag.data.error := hadError || io.mem.rsp.error + tag.data.address := address(tagRange) + } + + for((writeBank, bankId) <- write.data.zipWithIndex){ + if(!reducedBankWidth) { + writeBank.valid := io.mem.rsp.valid && wayToAllocate === bankId + writeBank.address := address(lineRange) @@ wordIndex + writeBank.data := io.mem.rsp.data + } else { + val sel = U(bankId) - wayToAllocate.value + val groupSel = wayToAllocate(log2Up(bankCount)-1 downto log2Up(bankCount/memToBankRatio)) + val subSel = sel(log2Up(bankCount/memToBankRatio) -1 downto 0) + writeBank.valid := io.mem.rsp.valid && groupSel === (bankId >> log2Up(bankCount/memToBankRatio)) + writeBank.address := address(lineRange) @@ wordIndex @@ (subSel) + writeBank.data := io.mem.rsp.data.subdivideIn(bankCount/memToBankRatio slices)(subSel) + } + } + + + when(io.mem.rsp.valid) { + wordIndex := (wordIndex + 1).resized + hadError.setWhen(io.mem.rsp.error) + when(wordIndex === wordIndex.maxValue) { + fire := True + } + } + } + + val fetchStage = new Area{ + val read = new Area{ + val banksValue = for(bank <- banks) yield new Area{ + val dataMem = bank.readSync(io.cpu.prefetch.pc(lineRange.high downto log2Up(bankWidth/8)), !io.cpu.fetch.isStuck) + val data = if(!twoCycleRamInnerMux) dataMem.subdivideIn(cpuDataWidth bits).read(io.cpu.fetch.pc(bankWordToCpuWordRange)) else dataMem + } + + val waysValues = for((way, wayId) <- ways.zipWithIndex) yield new Area{ + val tag = if(asyncTagMemory) { + way.tags.readAsync(io.cpu.fetch.pc(lineRange)) + }else { + way.tags.readSync(io.cpu.prefetch.pc(lineRange), !io.cpu.fetch.isStuck) + } +// val data = CombInit(banksValue(wayId).data) + } + } + + + val hit = (!twoCycleRam) generate new Area{ + val hits = read.waysValues.map(way => way.tag.valid && way.tag.address === io.cpu.fetch.mmuRsp.physicalAddress(tagRange)) + val valid = Cat(hits).orR + val wayId = OHToUInt(hits) + val bankId = if(!reducedBankWidth) wayId else (wayId >> log2Up(bankCount/memToBankRatio)) @@ ((wayId + (io.cpu.fetch.mmuRsp.physicalAddress(log2Up(bankWidth/8), log2Up(bankCount) bits))).resize(log2Up(bankCount/memToBankRatio))) + val error = read.waysValues.map(_.tag.error).read(wayId) + val data = read.banksValue.map(_.data).read(bankId) + val word = if(cpuDataWidth == memDataWidth || !twoCycleRamInnerMux) CombInit(data) else data.subdivideIn(cpuDataWidth bits).read(io.cpu.fetch.pc(bankWordToCpuWordRange)) + io.cpu.fetch.data := (if(p.bypassGen) (io.cpu.fetch.dataBypassValid ? io.cpu.fetch.dataBypass | word) else word) + if(twoCycleCache){ + io.cpu.decode.data := RegNextWhen(io.cpu.fetch.data,!io.cpu.decode.isStuck) + } + } + + if(twoCycleRam && wayCount == 1){ + val cacheData = if(cpuDataWidth == memDataWidth || !twoCycleRamInnerMux) CombInit(read.banksValue.head.data) else read.banksValue.head.data.subdivideIn(cpuDataWidth bits).read(io.cpu.fetch.pc(bankWordToCpuWordRange)) + io.cpu.fetch.data := (if(p.bypassGen) (io.cpu.fetch.dataBypassValid ? 
io.cpu.fetch.dataBypass | cacheData) else cacheData) + } + + io.cpu.fetch.physicalAddress := io.cpu.fetch.mmuRsp.physicalAddress + + val resolution = ifGen(!twoCycleCache)( new Area{ + val mmuRsp = io.cpu.fetch.mmuRsp + + io.cpu.fetch.cacheMiss := !hit.valid + io.cpu.fetch.error := hit.error || (!mmuRsp.isPaging && (mmuRsp.exception || !mmuRsp.allowExecute)) + io.cpu.fetch.mmuRefilling := mmuRsp.refilling + io.cpu.fetch.mmuException := !mmuRsp.refilling && mmuRsp.isPaging && (mmuRsp.exception || !mmuRsp.allowExecute) + }) + } + + + + val decodeStage = ifGen(twoCycleCache) (new Area{ + def stage[T <: Data](that : T) = RegNextWhen(that,!io.cpu.decode.isStuck) + val mmuRsp = stage(io.cpu.fetch.mmuRsp) + + val hit = if(!twoCycleRam) new Area{ + val valid = stage(fetchStage.hit.valid) + val error = stage(fetchStage.hit.error) + } else new Area{ + val tags = fetchStage.read.waysValues.map(way => stage(way.tag)) + val hits = tags.map(tag => tag.valid && tag.address === mmuRsp.physicalAddress(tagRange)) + val valid = Cat(hits).orR + val wayId = OHToUInt(hits) + val bankId = if(!reducedBankWidth) wayId else (wayId >> log2Up(bankCount/memToBankRatio)) @@ ((wayId + (mmuRsp.physicalAddress(log2Up(bankWidth/8), log2Up(bankCount) bits))).resize(log2Up(bankCount/memToBankRatio))) + val error = tags(wayId).error + val data = fetchStage.read.banksValue.map(bank => stage(bank.data)).read(bankId) + val word = if(cpuDataWidth == memDataWidth || !twoCycleRamInnerMux) data else data.subdivideIn(cpuDataWidth bits).read(io.cpu.decode.pc(bankWordToCpuWordRange)) + if(p.bypassGen) when(stage(io.cpu.fetch.dataBypassValid)){ + word := stage(io.cpu.fetch.dataBypass) + } + io.cpu.decode.data := word + } + + io.cpu.decode.cacheMiss := !hit.valid + io.cpu.decode.error := hit.error || (!mmuRsp.isPaging && (mmuRsp.exception || !mmuRsp.allowExecute)) + io.cpu.decode.mmuRefilling := mmuRsp.refilling + io.cpu.decode.mmuException := !mmuRsp.refilling && mmuRsp.isPaging && (mmuRsp.exception || !mmuRsp.allowExecute) + io.cpu.decode.physicalAddress := mmuRsp.physicalAddress + }) +} + diff --git a/VexRiscv/src/main/scala/vexriscv/ip/fpu/FpuCore.scala b/VexRiscv/src/main/scala/vexriscv/ip/fpu/FpuCore.scala new file mode 100644 index 0000000..657b2fb --- /dev/null +++ b/VexRiscv/src/main/scala/vexriscv/ip/fpu/FpuCore.scala @@ -0,0 +1,1944 @@ +package vexriscv.ip.fpu + +import spinal.core._ +import spinal.lib._ +import spinal.lib.eda.bench.{Bench, Rtl, XilinxStdTargets} +import spinal.lib.math.UnsignedDivider + +import scala.collection.mutable.ArrayBuffer + +object FpuDivSqrtIterationState extends SpinalEnum{ + val IDLE, YY, XYY, Y2_XYY, DIV, _15_XYY2, Y_15_XYY2, Y_15_XYY2_RESULT, SQRT = newElement() +} + + +case class FpuCore( portCount : Int, p : FpuParameter) extends Component{ + val io = new Bundle { + val port = Vec(slave(FpuPort(p)), portCount) + } + + val portCountWidth = log2Up(portCount) + val Source = HardType(UInt(portCountWidth bits)) + val exponentOne = (1 << p.internalExponentSize-1) - 1 + val exponentF32Subnormal = exponentOne-127 + val exponentF64Subnormal = exponentOne-1023 + val exponentF32Infinity = exponentOne+127+1 + val exponentF64Infinity = exponentOne+1023+1 + + + + def whenDouble(format : FpuFormat.C)(yes : => Unit)(no : => Unit): Unit ={ + if(p.withDouble) when(format === FpuFormat.DOUBLE) { yes } otherwise{ no } + if(!p.withDouble) no + } + + def muxDouble[T <: Data](format : FpuFormat.C)(yes : => T)(no : => T): T ={ + if(p.withDouble) ((format === FpuFormat.DOUBLE) ? 
{ yes } | { no }) + else no + } + + case class RfReadInput() extends Bundle{ + val source = Source() + val opcode = p.Opcode() + val rs1, rs2, rs3 = p.rfAddress() + val rd = p.rfAddress() + val arg = p.Arg() + val roundMode = FpuRoundMode() + val format = p.withDouble generate FpuFormat() + } + + case class RfReadOutput() extends Bundle{ + val source = Source() + val opcode = p.Opcode() + val rs1, rs2, rs3 = p.internalFloating() + val rd = p.rfAddress() + val arg = p.Arg() + val roundMode = FpuRoundMode() + val format = p.withDouble generate FpuFormat() + val rs1Boxed, rs2Boxed = p.withDouble generate Bool() + } + + + case class LoadInput() extends Bundle{ + val source = Source() + val rd = p.rfAddress() + val i2f = Bool() + val arg = Bits(2 bits) + val roundMode = FpuRoundMode() + val format = p.withDouble generate FpuFormat() + } + + case class ShortPipInput() extends Bundle{ + val source = Source() + val opcode = p.Opcode() + val rs1, rs2 = p.internalFloating() + val rd = p.rfAddress() + val value = Bits(32 bits) + val arg = Bits(2 bits) + val roundMode = FpuRoundMode() + val format = p.withDouble generate FpuFormat() + val rs1Boxed, rs2Boxed = p.withDouble generate Bool() + } + + class MulInput() extends Bundle{ + val source = Source() + val rs1, rs2, rs3 = p.internalFloating() + val rd = p.rfAddress() + val add = Bool() + val divSqrt = Bool() + val msb1, msb2 = Bool() //allow usage of msb bits of mul + val roundMode = FpuRoundMode() + val format = p.withDouble generate FpuFormat() + } + + + case class DivSqrtInput() extends Bundle{ + val source = Source() + val rs1, rs2 = p.internalFloating() + val rd = p.rfAddress() + val div = Bool() + val roundMode = FpuRoundMode() + val format = p.withDouble generate FpuFormat() + } + + case class DivInput() extends Bundle{ + val source = Source() + val rs1, rs2 = p.internalFloating() + val rd = p.rfAddress() + val roundMode = FpuRoundMode() + val format = p.withDouble generate FpuFormat() + } + + + case class SqrtInput() extends Bundle{ + val source = Source() + val rs1 = p.internalFloating() + val rd = p.rfAddress() + val roundMode = FpuRoundMode() + val format = p.withDouble generate FpuFormat() + } + + + val addExtraBits = 2 + case class AddInput() extends Bundle{ + val source = Source() + val rs1, rs2 = FpuFloat(exponentSize = p.internalExponentSize, mantissaSize = p.internalMantissaSize+addExtraBits) + val rd = p.rfAddress() + val roundMode = FpuRoundMode() + val format = p.withDouble generate FpuFormat() + val needCommit = Bool() + } + + + class MergeInput() extends Bundle{ + val source = Source() + val rd = p.rfAddress() + val value = p.writeFloating() + val scrap = Bool() + val roundMode = FpuRoundMode() + val format = p.withDouble generate FpuFormat() + val NV = Bool() + val DZ = Bool() + } + + case class RoundOutput() extends Bundle{ + val source = Source() + val rd = p.rfAddress() + val value = p.internalFloating() + val format = p.withDouble generate FpuFormat() + val NV, NX, OF, UF, DZ = Bool() + val write = Bool() + } + + val rf = new Area{ + case class Entry() extends Bundle{ + val value = p.internalFloating() + val boxed = p.withDouble generate Bool() + } + val ram = Mem(Entry(), 32*portCount) + + val init = new Area{ + val counter = Reg(UInt(6 bits)) init(0) + val done = CombInit(counter.msb) + when(!done){ + counter := counter + 1 + } + def apply(port : Flow[MemWriteCmd[Bool]]) = { + port.valid := !done + port.address := counter.resized + port.data := False + port + } + } + + val scoreboards = Array.fill(portCount)(new Area{ + 
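+      // The two 1-bit memories below form an XOR scoreboard: register rd is
+      // busy while target(rd) differs from hit(rd). The scheduler toggles
+      // target(rd) at issue and the writeback stage toggles hit(rd) on
+      // completion, so the two writers never race on a shared "busy" bit.
+      // A minimal behavioural sketch of the idea (plain Scala, illustrative
+      // names only, never elaborated into hardware):
+      def xorScoreboardModel(): Unit = {
+        val tgt, ht = Array.fill(32)(false)
+        def busy(rd: Int) = tgt(rd) ^ ht(rd)
+        def issue(rd: Int) = tgt(rd) = !tgt(rd)   // producer side: mark rd pending
+        def complete(rd: Int) = ht(rd) = !ht(rd)  // consumer side: release rd
+        issue(5);    require(busy(5))
+        complete(5); require(!busy(5))
+      }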
val target, hit = Mem(Bool, 32) // XOR + val writes = Mem(Bool, 32) + + val targetWrite = init(target.writePort) + val hitWrite = init(hit.writePort) + }) + } + + val commitFork = new Area{ + val load, commit = Vec(Stream(FpuCommit(p)), portCount) + for(i <- 0 until portCount){ + val fork = new StreamFork(FpuCommit(p), 2, synchronous = true) + fork.io.input << io.port(i).commit + fork.io.outputs(0) >> load(i) + fork.io.outputs(1).pipelined(m2s = false, s2m = true) >> commit(i) //Pipelining here is light, as it only use the flags of the payload + } + } + + class Tracker(width : Int) extends Area{ + val counter = Reg(UInt(width bits)) init(0) + val full = counter.andR + val notEmpty = counter.orR + val inc = False + val dec = False + counter := counter + U(inc) - U(dec) + } + + class CommitArea(source : Int) extends Area{ + val pending = new Tracker(4) + val add, mul, div, sqrt, short = new Tracker(4) + val input = commitFork.commit(source).haltWhen(List(add, mul, div, sqrt, short).map(_.full).orR || !pending.notEmpty).toFlow + + when(input.fire){ + add.inc setWhen(List(FpuOpcode.ADD).map(input.opcode === _).orR) + mul.inc setWhen(List(FpuOpcode.MUL, FpuOpcode.FMA).map(input.opcode === _).orR) + div.inc setWhen(List(FpuOpcode.DIV).map(input.opcode === _).orR) + sqrt.inc setWhen(List(FpuOpcode.SQRT).map(input.opcode === _).orR) + short.inc setWhen(List(FpuOpcode.SGNJ, FpuOpcode.MIN_MAX, FpuOpcode.FCVT_X_X).map(input.opcode === _).orR) + rf.scoreboards(source).writes(input.rd) := input.write + pending.dec := True + } + } + + val commitLogic = for(source <- 0 until portCount) yield new CommitArea(source) + + def commitConsume(what : CommitArea => Tracker, source : UInt, fire : Bool) : Bool = { + for(i <- 0 until portCount) what(commitLogic(i)).dec setWhen(fire && source === i) + commitLogic.map(what(_).notEmpty).read(source) + } + + + val scheduler = for(portId <- 0 until portCount; + scoreboard = rf.scoreboards(portId)) yield new Area{ + val input = io.port(portId).cmd.pipelined(s2m = true) + val useRs1, useRs2, useRs3, useRd = False + switch(input.opcode){ + is(p.Opcode.LOAD) { useRd := True } + is(p.Opcode.STORE) { useRs2 := True } + is(p.Opcode.ADD) { useRd := True; useRs1 := True; useRs2 := True } + is(p.Opcode.MUL) { useRd := True; useRs1 := True; useRs2 := True } + is(p.Opcode.DIV) { useRd := True; useRs1 := True; useRs2 := True } + is(p.Opcode.SQRT) { useRd := True; useRs1 := True } + is(p.Opcode.FMA) { useRd := True; useRs1 := True; useRs2 := True; useRs3 := True } + is(p.Opcode.I2F) { useRd := True } + is(p.Opcode.F2I) { useRs1 := True } + is(p.Opcode.MIN_MAX) { useRd := True; useRs1 := True; useRs2 := True } + is(p.Opcode.CMP) { useRs1 := True; useRs2 := True } + is(p.Opcode.SGNJ) { useRd := True; useRs1 := True; useRs2 := True } + is(p.Opcode.FMV_X_W) { useRs1 := True } + is(p.Opcode.FMV_W_X) { useRd := True } + is(p.Opcode.FCLASS ) { useRs1 := True } + is(p.Opcode.FCVT_X_X ) { useRd := True; useRs1 := True } + } + + val uses = List(useRs1, useRs2, useRs3, useRd) + val regs = List(input.rs1, input.rs2, input.rs3, input.rd) + val rfHits = regs.map(scoreboard.hit.readAsync(_)) + val rfTargets = regs.map(scoreboard.target.readAsync(_)) + val rfBusy = (rfHits, rfTargets).zipped.map(_ ^ _) + + val hits = (0 to 3).map(id => uses(id) && rfBusy(id)) + val hazard = hits.orR || !rf.init.done || commitLogic(portId).pending.full + val output = input.haltWhen(hazard) + when(input.opcode === p.Opcode.STORE){ + output.rs1 := input.rs2 //Datapath optimisation to unify rs source in the store 
pipeline + } + when(input.valid && rf.init.done){ + scoreboard.targetWrite.address := input.rd + scoreboard.targetWrite.data := !rfTargets.last + } + when(output.fire && useRd){ + scoreboard.targetWrite.valid := True + commitLogic(portId).pending.inc := True + } + } + + + val cmdArbiter = new Area{ + val arbiter = StreamArbiterFactory.noLock.roundRobin.build(FpuCmd(p), portCount) + arbiter.io.inputs <> Vec(scheduler.map(_.output.pipelined(m2s = p.schedulerM2sPipe))) + + val output = arbiter.io.output.swapPayload(RfReadInput()) + output.source := arbiter.io.chosen + output.payload.assignSomeByName(arbiter.io.output.payload) + } + + val read = new Area{ + val s0 = cmdArbiter.output.pipelined() + val s1 = s0.m2sPipe() + val output = s1.swapPayload(RfReadOutput()) + val rs = if(p.asyncRegFile){ + List(s1.rs1, s1.rs2, s1.rs3).map(a => rf.ram.readAsync(s1.source @@ a)) + } else { + List(s0.rs1, s0.rs2, s0.rs3).map(a => rf.ram.readSync(s0.source @@ a, enable = !output.isStall)) + } + output.source := s1.source + output.opcode := s1.opcode + output.arg := s1.arg + output.roundMode := s1.roundMode + output.rd := s1.rd + output.rs1 := rs(0).value + output.rs2 := rs(1).value + output.rs3 := rs(2).value + if(p.withDouble){ + output.rs1Boxed := rs(0).boxed + output.rs2Boxed := rs(1).boxed + output.format := s1.format + val store = s1.opcode === FpuOpcode.STORE ||s1.opcode === FpuOpcode.FMV_X_W + val sgnjBypass = s1.opcode === FpuOpcode.SGNJ && s1.format === FpuFormat.DOUBLE + when(!sgnjBypass) { + when(store) { //Pass through + output.format := rs(0).boxed ? FpuFormat.FLOAT | FpuFormat.DOUBLE + } elsewhen (s1.format === FpuFormat.FLOAT =/= rs(0).boxed) { + output.rs1.setNanQuiet + output.rs1.sign := False + } + } + when(s1.format === FpuFormat.FLOAT =/= rs(1).boxed) { + output.rs2.setNanQuiet + output.rs2.sign := False + } + when(s1.format === FpuFormat.FLOAT =/= rs(2).boxed) { + output.rs3.setNanQuiet + } + } + } + + val decode = new Area{ + val input = read.output/*.s2mPipe()*/.combStage() + input.ready := False + + val loadHit = List(FpuOpcode.LOAD, FpuOpcode.FMV_W_X, FpuOpcode.I2F).map(input.opcode === _).orR + val load = Stream(LoadInput()) + load.valid := input.valid && loadHit + input.ready setWhen(loadHit && load.ready) + load.payload.assignSomeByName(input.payload) + load.i2f := input.opcode === FpuOpcode.I2F + + val shortPipHit = List(FpuOpcode.STORE, FpuOpcode.F2I, FpuOpcode.CMP, FpuOpcode.MIN_MAX, FpuOpcode.SGNJ, FpuOpcode.FMV_X_W, FpuOpcode.FCLASS, FpuOpcode.FCVT_X_X).map(input.opcode === _).orR + val shortPip = Stream(ShortPipInput()) + input.ready setWhen(shortPipHit && shortPip.ready) + shortPip.valid := input.valid && shortPipHit + shortPip.payload.assignSomeByName(input.payload) + + val divSqrtHit = input.opcode === p.Opcode.DIV || input.opcode === p.Opcode.SQRT + val divSqrt = Stream(DivSqrtInput()) + if(p.withDivSqrt) { + input.ready setWhen (divSqrtHit && divSqrt.ready) + divSqrt.valid := input.valid && divSqrtHit + divSqrt.payload.assignSomeByName(input.payload) + divSqrt.div := input.opcode === p.Opcode.DIV + } + + val divHit = input.opcode === p.Opcode.DIV + val div = Stream(DivInput()) + if(p.withDiv) { + input.ready setWhen (divHit && div.ready) + div.valid := input.valid && divHit + div.payload.assignSomeByName(input.payload) + } + + val sqrtHit = input.opcode === p.Opcode.SQRT + val sqrt = Stream(SqrtInput()) + if(p.withSqrt) { + input.ready setWhen (sqrtHit && sqrt.ready) + sqrt.valid := input.valid && sqrtHit + sqrt.payload.assignSomeByName(input.payload) + } + + + val 
fmaHit = input.opcode === p.Opcode.FMA + val mulHit = input.opcode === p.Opcode.MUL || fmaHit + val mul = Stream(new MulInput()) + val divSqrtToMul = Stream(new MulInput()) + if(!p.withDivSqrt){ + divSqrtToMul.valid := False + divSqrtToMul.payload.assignDontCare() + } + + if(p.withMul) { + input.ready setWhen (mulHit && mul.ready && !divSqrtToMul.valid) + mul.valid := input.valid && mulHit || divSqrtToMul.valid + + divSqrtToMul.ready := mul.ready + mul.payload := divSqrtToMul.payload + when(!divSqrtToMul.valid) { + mul.payload.assignSomeByName(input.payload) + mul.add := fmaHit + mul.divSqrt := False + mul.msb1 := True + mul.msb2 := True + mul.rs2.sign.allowOverride(); + mul.rs2.sign := input.rs2.sign ^ input.arg(0) + mul.rs3.sign.allowOverride(); + mul.rs3.sign := input.rs3.sign ^ input.arg(1) + } + } + + val addHit = input.opcode === p.Opcode.ADD + val add = Stream(AddInput()) + val mulToAdd = Stream(AddInput()) + + + if(p.withAdd) { + input.ready setWhen (addHit && add.ready && !mulToAdd.valid) + add.valid := input.valid && addHit || mulToAdd.valid + + mulToAdd.ready := add.ready + add.payload := mulToAdd.payload + when(!mulToAdd.valid) { + add.source := input.source + add.rd := input.rd + add.roundMode := input.roundMode + if(p.withDouble) add.format := input.format + add.needCommit := True + add.rs1.special := input.rs1.special + add.rs2.special := input.rs2.special + add.rs1.exponent := input.rs1.exponent + add.rs2.exponent := input.rs2.exponent + add.rs1.sign := input.rs1.sign + add.rs2.sign := input.rs2.sign ^ input.arg(0) + add.rs1.mantissa := input.rs1.mantissa << addExtraBits + add.rs2.mantissa := input.rs2.mantissa << addExtraBits + } + } + } + + val load = new Area{ + + case class S0() extends Bundle{ + val source = Source() + val rd = p.rfAddress() + val value = p.storeLoadType() + val i2f = Bool() + val arg = Bits(2 bits) + val roundMode = FpuRoundMode() + val format = p.withDouble generate FpuFormat() + } + + val s0 = new Area{ + val input = decode.load.pipelined(m2s = true, s2m = true).stage() + val filtred = commitFork.load.map(port => port.takeWhen(List(FpuOpcode.LOAD, FpuOpcode.FMV_W_X, FpuOpcode.I2F).map(_ === port.opcode).orR)) + def feed = filtred(input.source) + val hazard = !feed.valid + + + val output = input.haltWhen(hazard).swapPayload(S0()) + filtred.foreach(_.ready := False) + feed.ready := input.valid && output.ready + output.source := input.source + output.rd := input.rd + output.value := feed.value + output.i2f := input.i2f + output.arg := input.arg + output.roundMode := input.roundMode + if(p.withDouble) { + output.format := input.format + when(!input.i2f && input.format === FpuFormat.DOUBLE && output.value(63 downto 32).andR){ //Detect boxing + output.format := FpuFormat.FLOAT + } + } + } + + val s1 = new Area{ + val input = s0.output.stage() + val busy = False + + val f32 = new Area{ + val mantissa = input.value(0, 23 bits).asUInt + val exponent = input.value(23, 8 bits).asUInt + val sign = input.value(31) + } + val f64 = p.withDouble generate new Area{ + val mantissa = input.value(0, 52 bits).asUInt + val exponent = input.value(52, 11 bits).asUInt + val sign = input.value(63) + } + + val recodedExpOffset = UInt(p.internalExponentSize bits) + val passThroughFloat = p.internalFloating() + passThroughFloat.special := False + + whenDouble(input.format){ + passThroughFloat.sign := f64.sign + passThroughFloat.exponent := f64.exponent.resized + passThroughFloat.mantissa := f64.mantissa + recodedExpOffset := exponentF64Subnormal + } { + passThroughFloat.sign := 
f32.sign + passThroughFloat.exponent := f32.exponent.resized + passThroughFloat.mantissa := f32.mantissa << (if (p.withDouble) 29 else 0) + recodedExpOffset := exponentF32Subnormal + } + + + val manZero = passThroughFloat.mantissa === 0 + val expZero = passThroughFloat.exponent === 0 + val expOne = passThroughFloat.exponent(7 downto 0).andR + if(p.withDouble) { + expZero.clearWhen(input.format === FpuFormat.DOUBLE && input.value(62 downto 60) =/= 0) + expOne.clearWhen(input.format === FpuFormat.DOUBLE && input.value(62 downto 60) =/= 7) + } + + val isZero = expZero && manZero + val isSubnormal = expZero && !manZero + val isInfinity = expOne && manZero + val isNan = expOne && !manZero + + + val fsm = new Area{ + val done, boot, patched = Reg(Bool()) + val ohInputWidth = 32 max p.internalMantissaSize + val ohInput = Bits(ohInputWidth bits).assignDontCare() + when(!input.i2f) { + if(!p.withDouble) ohInput := input.value(0, 23 bits) << 9 + if( p.withDouble) ohInput := passThroughFloat.mantissa.asBits + } otherwise { + ohInput(ohInputWidth-32-1 downto 0) := 0 + ohInput(ohInputWidth-32, 32 bits) := input.value(31 downto 0) + } + + val i2fZero = Reg(Bool) + + val shift = new Area{ + val by = Reg(UInt(log2Up(ohInputWidth) bits)) + val input = UInt(ohInputWidth bits).assignDontCare() + var logic = input + for(i <- by.range){ + logic \= by(i) ? (logic |<< (BigInt(1) << i)) | logic + } + val output = RegNextWhen(logic, !done) + } + shift.input := (ohInput.asUInt |<< 1).resized + + when(input.valid && (input.i2f || isSubnormal) && !done){ + busy := True + when(boot){ + when(input.i2f && !patched && input.value(31) && input.arg(0)){ + input.value.getDrivingReg(0, 32 bits) := B(input.value.asUInt.twoComplement(True).resize(32 bits)) + patched := True + } otherwise { + shift.by := OHToUInt(OHMasking.first((ohInput).reversed)) + boot := False + i2fZero := input.value(31 downto 0) === 0 + } + } otherwise { + done := True + } + } + + val expOffset = (UInt(p.internalExponentSize bits)) + expOffset := 0 + when(isSubnormal){ + expOffset := shift.by.resized + } + + when(!input.isStall){ + done := False + boot := True + patched := False + } + } + + + val i2fSign = fsm.patched + val (i2fHigh, i2fLow) = fsm.shift.output.splitAt(if(p.withDouble) 0 else widthOf(input.value)-24) + val scrap = i2fLow =/= 0 + + val recoded = p.internalFloating() + recoded.mantissa := passThroughFloat.mantissa + recoded.exponent := (passThroughFloat.exponent -^ fsm.expOffset + recodedExpOffset).resized + recoded.sign := passThroughFloat.sign + recoded.setNormal + when(isZero){recoded.setZero} + when(isInfinity){recoded.setInfinity} + when(isNan){recoded.setNan} + + val output = input.haltWhen(busy).swapPayload(new MergeInput()) + output.source := input.source + output.roundMode := input.roundMode + if(p.withDouble) { + output.format := input.format + } + output.rd := input.rd + output.value.sign := recoded.sign + output.value.exponent := recoded.exponent + output.value.mantissa := recoded.mantissa @@ U"0" + output.value.special := recoded.special + output.scrap := False + output.NV := False + output.DZ := False + when(input.i2f){ + output.value.sign := i2fSign + output.value.exponent := (U(exponentOne+31) - fsm.shift.by).resized + output.value.setNormal + output.scrap := scrap + when(fsm.i2fZero) { output.value.setZero } + } + + when(input.i2f || isSubnormal){ + output.value.mantissa := U(i2fHigh) @@ (if(p.withDouble) U"0" else U"") + } + } + + } + + val shortPip = new Area{ + val input = decode.shortPip.stage() + + val toFpuRf = 
List(FpuOpcode.MIN_MAX, FpuOpcode.SGNJ, FpuOpcode.FCVT_X_X).map(input.opcode === _).orR + val rfOutput = Stream(new MergeInput()) + + val isCommited = commitConsume(_.short, input.source, input.fire && toFpuRf) + val output = rfOutput.haltWhen(!isCommited) + + val result = p.storeLoadType().assignDontCare() + + val halt = False + val recodedResult = p.storeLoadType() + val f32 = new Area{ + val exp = (input.rs1.exponent - (exponentOne-127)).resize(8 bits) + val man = CombInit(input.rs1.mantissa(if(p.withDouble) 51 downto 29 else 22 downto 0)) + } + val f64 = p.withDouble generate new Area{ + val exp = (input.rs1.exponent - (exponentOne-1023)).resize(11 bits) + val man = CombInit(input.rs1.mantissa) + } + + whenDouble(input.format){ + recodedResult := input.rs1.sign ## f64.exp ## f64.man + } { + recodedResult := (if(p.withDouble) B"xFFFFFFFF" else B"") ## input.rs1.sign ## f32.exp ## f32.man + } + + val expSubnormalThreshold = muxDouble[UInt](input.format)(exponentF64Subnormal)(exponentF32Subnormal) + val expInSubnormalRange = input.rs1.exponent <= expSubnormalThreshold + val isSubnormal = !input.rs1.special && expInSubnormalRange + val isNormal = !input.rs1.special && !expInSubnormalRange + val fsm = new Area{ + val f2iShift = input.rs1.exponent - U(exponentOne) + val isF2i = input.opcode === FpuOpcode.F2I + val needRecoding = List(FpuOpcode.FMV_X_W, FpuOpcode.STORE).map(_ === input.opcode).orR && isSubnormal + val done, boot = Reg(Bool()) + val isZero = input.rs1.isZero// || input.rs1.exponent < exponentOne-1 + + val shift = new Area{ + val by = Reg(UInt(log2Up(p.internalMantissaSize+1 max 33) bits)) + val input = UInt(p.internalMantissaSize+1 max 33 bits).assignDontCare() + var logic = input + val scrap = Reg(Bool) + for(i <- by.range.reverse){ + scrap setWhen(by(i) && logic(0, 1 << i bits) =/= 0) + logic \= by(i) ? 
(logic |>> (BigInt(1) << i)) | logic + } + when(boot){ + scrap := False + } + val output = RegNextWhen(logic, !done) + } + + shift.input := (U(!isZero) @@ input.rs1.mantissa) << (if(p.withDouble) 0 else 9) + + val formatShiftOffset = muxDouble[UInt](input.format)(exponentOne-1023+1)(exponentOne - (if(p.withDouble) (127+34) else (127-10))) + when(input.valid && (needRecoding || isF2i) && !done){ + halt := True + when(boot){ + when(isF2i){ + shift.by := ((U(exponentOne + 31) - input.rs1.exponent).min(U(33)) + (if(p.withDouble) 20 else 0)).resized //TODO merge + } otherwise { + shift.by := (formatShiftOffset - input.rs1.exponent).resized + } + boot := False + } otherwise { + done := True + } + } + + when(!input.isStall){ + done := False + boot := True + } + } + + val mantissaForced = False + val exponentForced = False + val mantissaForcedValue = Bool().assignDontCare() + val exponentForcedValue = Bool().assignDontCare() + val cononicalForced = False + + + when(input.rs1.special){ + switch(input.rs1.exponent(1 downto 0)){ + is(FpuFloat.ZERO){ + mantissaForced := True + exponentForced := True + mantissaForcedValue := False + exponentForcedValue := False + } + is(FpuFloat.INFINITY){ + mantissaForced := True + exponentForced := True + mantissaForcedValue := False + exponentForcedValue := True + } + is(FpuFloat.NAN){ + exponentForced := True + exponentForcedValue := True + when(input.rs1.isCanonical){ + cononicalForced := True + mantissaForced := True + mantissaForcedValue := False + } + } + } + } + + + + when(isSubnormal){ + exponentForced := True + exponentForcedValue := False + recodedResult(0,23 bits) := fsm.shift.output(22 downto 0).asBits + whenDouble(input.format){ + recodedResult(51 downto 23) := fsm.shift.output(51 downto 23).asBits + }{} + } + when(mantissaForced){ + recodedResult(0,23 bits) := (default -> mantissaForcedValue) + whenDouble(input.format){ + recodedResult(23, 52-23 bits) := (default -> mantissaForcedValue) + }{} + } + when(exponentForced){ + whenDouble(input.format){ + recodedResult(52, 11 bits) := (default -> exponentForcedValue) + } { + recodedResult(23, 8 bits) := (default -> exponentForcedValue) + } + } + when(cononicalForced){ + whenDouble(input.format){ + recodedResult(63) := False + recodedResult(51) := True + } { + recodedResult(31) := False + recodedResult(22) := True + } + } + + val rspNv = False + val rspNx = False + + val f2i = new Area{ //Will not work for 64 bits float max value rounding + val unsigned = fsm.shift.output(32 downto 0) >> 1 + val resign = input.arg(0) && input.rs1.sign + val round = fsm.shift.output(0) ## fsm.shift.scrap + val increment = input.roundMode.mux( + FpuRoundMode.RNE -> (round(1) && (round(0) || unsigned(0))), + FpuRoundMode.RTZ -> False, + FpuRoundMode.RDN -> (round =/= 0 && input.rs1.sign), + FpuRoundMode.RUP -> (round =/= 0 && !input.rs1.sign), + FpuRoundMode.RMM -> (round(1)) + ) + val result = (Mux(resign, ~unsigned, unsigned) + (resign ^ increment).asUInt) + val overflow = (input.rs1.exponent > (input.arg(0) ? 
U(exponentOne+30) | U(exponentOne+31)) || input.rs1.isInfinity) && !input.rs1.sign || input.rs1.isNan + val underflow = (input.rs1.exponent > U(exponentOne+31) || input.arg(0) && unsigned.msb && (unsigned(30 downto 0) =/= 0 || increment) || !input.arg(0) && (unsigned =/= 0 || increment) || input.rs1.isInfinity) && input.rs1.sign + val isZero = input.rs1.isZero + if(p.withDouble){ + overflow setWhen(!input.rs1.sign && increment && unsigned(30 downto 0).andR && (input.arg(0) || unsigned(31))) + } + when(isZero){ + result := 0 + } elsewhen(underflow || overflow) { + val low = overflow + val high = input.arg(0) ^ overflow + result := (31 -> high, default -> low) + rspNv := input.valid && input.opcode === FpuOpcode.F2I && fsm.done && !isZero + } otherwise { + rspNx := input.valid && input.opcode === FpuOpcode.F2I && fsm.done && round =/= 0 + } + } + + val bothZero = input.rs1.isZero && input.rs2.isZero + val rs1Equal = input.rs1 === input.rs2 + val rs1AbsSmaller = (input.rs1.exponent @@ input.rs1.mantissa) < (input.rs2.exponent @@ input.rs2.mantissa) + rs1AbsSmaller.setWhen(input.rs2.isInfinity) + rs1AbsSmaller.setWhen(input.rs1.isZero) + rs1AbsSmaller.clearWhen(input.rs2.isZero) + rs1AbsSmaller.clearWhen(input.rs1.isInfinity) + rs1Equal setWhen(input.rs1.sign === input.rs2.sign && input.rs1.isInfinity && input.rs2.isInfinity) + val rs1Smaller = (input.rs1.sign ## input.rs2.sign).mux( + 0 -> rs1AbsSmaller, + 1 -> False, + 2 -> True, + 3 -> (!rs1AbsSmaller && !rs1Equal) + ) + + + val minMaxSelectRs2 = !(((rs1Smaller ^ input.arg(0)) && !input.rs1.isNan || input.rs2.isNan)) + val minMaxSelectNanQuiet = input.rs1.isNan && input.rs2.isNan + val cmpResult = B(rs1Smaller && !bothZero && !input.arg(1) || (rs1Equal || bothZero) && !input.arg(0)) + when(input.rs1.isNan || input.rs2.isNan) { cmpResult := 0 } + val sgnjRs1Sign = CombInit(input.rs1.sign) + val sgnjRs2Sign = CombInit(input.rs2.sign) + if(p.withDouble){ + sgnjRs2Sign setWhen(input.rs2Boxed && input.format === FpuFormat.DOUBLE) + } + val sgnjResult = (sgnjRs1Sign && input.arg(1)) ^ sgnjRs2Sign ^ input.arg(0) + val fclassResult = B(0, 32 bits) + val decoded = input.rs1.decode() + fclassResult(0) := input.rs1.sign && decoded.isInfinity + fclassResult(1) := input.rs1.sign && isNormal + fclassResult(2) := input.rs1.sign && isSubnormal + fclassResult(3) := input.rs1.sign && decoded.isZero + fclassResult(4) := !input.rs1.sign && decoded.isZero + fclassResult(5) := !input.rs1.sign && isSubnormal + fclassResult(6) := !input.rs1.sign && isNormal + fclassResult(7) := !input.rs1.sign && decoded.isInfinity + fclassResult(8) := decoded.isNan && !decoded.isQuiet + fclassResult(9) := decoded.isNan && decoded.isQuiet + + + switch(input.opcode){ + is(FpuOpcode.STORE) { result := recodedResult } + is(FpuOpcode.FMV_X_W) { result := recodedResult } + is(FpuOpcode.F2I) { result(31 downto 0) := f2i.result.asBits } + is(FpuOpcode.CMP) { result(31 downto 0) := cmpResult.resized } + is(FpuOpcode.FCLASS) { result(31 downto 0) := fclassResult.resized } + } + + + rfOutput.valid := input.valid && toFpuRf && !halt + rfOutput.source := input.source + rfOutput.rd := input.rd + rfOutput.roundMode := input.roundMode + if(p.withDouble) rfOutput.format := input.format + rfOutput.scrap := False + rfOutput.value.sign := input.rs1.sign + rfOutput.value.exponent := input.rs1.exponent + rfOutput.value.mantissa := input.rs1.mantissa @@ U"0" + rfOutput.value.special := input.rs1.special + + switch(input.opcode){ + is(FpuOpcode.MIN_MAX){ + when(minMaxSelectRs2) { + rfOutput.value.sign 
:= input.rs2.sign + rfOutput.value.exponent := input.rs2.exponent + rfOutput.value.mantissa := input.rs2.mantissa @@ U"0" + rfOutput.value.special := input.rs2.special + } + when(minMaxSelectNanQuiet){ + rfOutput.value.setNanQuiet + } + } + is(FpuOpcode.SGNJ){ + when(!input.rs1.isNan) { + rfOutput.value.sign := sgnjResult + } + if(p.withDouble) when(input.rs1Boxed && input.format === FpuFormat.DOUBLE){ + rfOutput.value.sign := input.rs1.sign + rfOutput.format := FpuFormat.FLOAT + } + } + if(p.withDouble) is(FpuOpcode.FCVT_X_X){ + rfOutput.format := ((input.format === FpuFormat.FLOAT) ? FpuFormat.DOUBLE | FpuFormat.FLOAT) + when(input.rs1.isNan){ + rfOutput.value.setNanQuiet + } + } + } + + val signalQuiet = input.opcode === FpuOpcode.CMP && input.arg =/= 2 + val rs1Nan = input.rs1.isNan + val rs2Nan = input.rs2.isNan + val rs1NanNv = input.rs1.isNan && (!input.rs1.isQuiet || signalQuiet) + val rs2NanNv = input.rs2.isNan && (!input.rs2.isQuiet || signalQuiet) + val NV = List(FpuOpcode.CMP, FpuOpcode.MIN_MAX, FpuOpcode.FCVT_X_X).map(input.opcode === _).orR && rs1NanNv || + List(FpuOpcode.CMP, FpuOpcode.MIN_MAX).map(input.opcode === _).orR && rs2NanNv + rspNv setWhen(NV) + + val rspStreams = Vec(Stream(FpuRsp(p)), portCount) + input.ready := !halt && (toFpuRf ? rfOutput.ready | rspStreams.map(_.ready).read(input.source)) + for(i <- 0 until portCount){ + def rsp = rspStreams(i) + rsp.valid := input.valid && input.source === i && !toFpuRf && !halt + rsp.value := result + rsp.NV := rspNv + rsp.NX := rspNx + io.port(i).rsp << rsp.stage() + } + + + rfOutput.NV := NV + rfOutput.DZ := False + } + + val mul = p.withMul generate new Area{ + val inWidthA = p.internalMantissaSize+1 + val inWidthB = p.internalMantissaSize+1 + val outWidth = p.internalMantissaSize*2+2 + + case class MulSplit(offsetA : Int, offsetB : Int, widthA : Int, widthB : Int, id : Int){ + val offsetC = offsetA+offsetB + val widthC = widthA + widthB + val endC = offsetC+widthC + } + val splitsUnordered = for(offsetA <- 0 until inWidthA by p.mulWidthA; + offsetB <- 0 until inWidthB by p.mulWidthB; + widthA = (inWidthA - offsetA) min p.mulWidthA; + widthB = (inWidthB - offsetB) min p.mulWidthB) yield { + MulSplit(offsetA, offsetB, widthA, widthB, -1) + } + val splits = splitsUnordered.sortWith(_.endC < _.endC).zipWithIndex.map(e => e._1.copy(id=e._2)) + + class MathWithExp extends MulInput{ + val exp = UInt(p.internalExponentSize+1 bits) + } + val preMul = new Area{ + val input = decode.mul.stage() + val output = input.swapPayload(new MathWithExp()) + output.payload.assignSomeByName(input.payload) + output.exp := input.rs1.exponent +^ input.rs2.exponent + } + class MathWithMul extends MathWithExp{ + val muls = Vec(splits.map(e => UInt(e.widthA + e.widthB bits))) + } + val mul = new Area{ + val input = preMul.output.stage() + val output = input.swapPayload(new MathWithMul()) + val mulA = U(input.msb1) @@ input.rs1.mantissa + val mulB = U(input.msb2) @@ input.rs2.mantissa + output.payload.assignSomeByName(input.payload) + splits.foreach(e => output.muls(e.id) := mulA(e.offsetA, e.widthA bits) * mulB(e.offsetB, e.widthB bits)) + } + + val sumSplitAt = splits.size/2//splits.filter(e => e.endC <= p.internalMantissaSize).size + + class Sum1Output extends MathWithExp{ + val muls2 = Vec(splits.drop(sumSplitAt).map(e => UInt(e.widthA + e.widthB bits))) + val mulC2 = UInt(p.internalMantissaSize*2+2 bits) + } + class Sum2Output extends MathWithExp{ + val mulC = UInt(p.internalMantissaSize*2+2 bits) + } + + val sum1 = new Area { + val input = 
mul.output.stage() + val sum = splits.take(sumSplitAt).map(e => (input.muls(e.id) << e.offsetC).resize(outWidth)).reduceBalancedTree(_ + _) + + val output = input.swapPayload(new Sum1Output()) + output.payload.assignSomeByName(input.payload) + output.mulC2 := sum.resized + output.muls2 := Vec(input.muls.drop(sumSplitAt)) + } + + val sum2 = new Area { + val input = sum1.output.stage() + val sum = input.mulC2 + splits.drop(sumSplitAt).map(e => (input.muls2(e.id-sumSplitAt) << e.offsetC).resize(outWidth)).reduceBalancedTree(_ + _) + + val isCommited = commitConsume(_.mul, input.source, input.fire) + val output = input.haltWhen(!isCommited).swapPayload(new Sum2Output()) + output.payload.assignSomeByName(input.payload) + output.mulC := sum + } + + val norm = new Area{ + val input = sum2.output.stage() + val (mulHigh, mulLow) = input.mulC.splitAt(p.internalMantissaSize-1) + val scrap = mulLow =/= 0 + val needShift = mulHigh.msb + val exp = input.exp + U(needShift) + val man = needShift ? mulHigh(1, p.internalMantissaSize+1 bits) | mulHigh(0, p.internalMantissaSize+1 bits) + scrap setWhen(needShift && mulHigh(0)) + val forceZero = input.rs1.isZero || input.rs2.isZero + val underflowThreshold = muxDouble[UInt](input.format)(exponentOne + exponentOne - 1023 - 53) (exponentOne + exponentOne - 127 - 24) + val underflowExp = muxDouble[UInt](input.format)(exponentOne - 1023 - 54) (exponentOne - 127 - 25) + val forceUnderflow = exp < underflowThreshold + val forceOverflow = input.rs1.isInfinity || input.rs2.isInfinity + val infinitynan = ((input.rs1.isInfinity || input.rs2.isInfinity) && (input.rs1.isZero || input.rs2.isZero)) + val forceNan = input.rs1.isNan || input.rs2.isNan || infinitynan + + val output = p.writeFloating() + output.sign := input.rs1.sign ^ input.rs2.sign + output.exponent := (exp - exponentOne).resized + output.mantissa := man.asUInt + output.setNormal + val NV = False + + when(exp(exp.getWidth-3, 3 bits) >= 5) { output.exponent(p.internalExponentSize-2, 2 bits) := 3 } + + when(forceNan) { + output.setNanQuiet + NV setWhen(infinitynan || input.rs1.isNanSignaling || input.rs2.isNanSignaling) + } elsewhen(forceOverflow) { + output.setInfinity + } elsewhen(forceZero) { + output.setZero + } elsewhen(forceUnderflow) { + output.exponent := underflowExp.resized + } + } + + val result = new Area { + def input = norm.input + def NV = norm.NV + + val notMul = new Area { + val output = Flow(UInt(p.internalMantissaSize + 1 bits)) + output.valid := input.valid && input.divSqrt + output.payload := input.mulC(p.internalMantissaSize, p.internalMantissaSize + 1 bits) + } + + val output = Stream(new MergeInput()) + output.valid := input.valid && !input.add && !input.divSqrt + output.source := input.source + output.rd := input.rd + if (p.withDouble) output.format := input.format + output.roundMode := input.roundMode + output.scrap := norm.scrap + output.value := norm.output + output.NV := NV + output.DZ := False + + val mulToAdd = Stream(AddInput()) + decode.mulToAdd << mulToAdd.stage() + + mulToAdd.valid := input.valid && input.add + mulToAdd.source := input.source + mulToAdd.rs1.mantissa := norm.output.mantissa @@ norm.scrap //FMA Precision lost + mulToAdd.rs1.exponent := norm.output.exponent + mulToAdd.rs1.sign := norm.output.sign + mulToAdd.rs1.special := norm.output.special + mulToAdd.rs2 := input.rs3 + mulToAdd.rs2.mantissa.removeAssignments() := input.rs3.mantissa << addExtraBits + mulToAdd.rd := input.rd + mulToAdd.roundMode := input.roundMode + mulToAdd.needCommit := False + if (p.withDouble) 
mulToAdd.format := input.format + + when(NV){ + mulToAdd.rs1.mantissa.msb := False + } + + input.ready := (input.add ? mulToAdd.ready | output.ready) || input.divSqrt + } + } + + + val div = p.withDiv generate new Area{ + val input = decode.div.halfPipe() + val haltIt = True + val isCommited = RegNext(commitConsume(_.div, input.source, input.fire)) + val output = input.haltWhen(haltIt || !isCommited).swapPayload(new MergeInput()) + + val dividerShift = if(p.withDouble) 0 else 1 + val divider = FpuDiv(p.internalMantissaSize + dividerShift) + divider.io.input.a := input.rs1.mantissa << dividerShift + divider.io.input.b := input.rs2.mantissa << dividerShift + val dividerResult = divider.io.output.result >> dividerShift + val dividerScrap = divider.io.output.remain =/= 0 || divider.io.output.result(0, dividerShift bits) =/= 0 + + val cmdSent = RegInit(False) setWhen(divider.io.input.fire) clearWhen(!haltIt) + divider.io.input.valid := input.valid && !cmdSent + divider.io.output.ready := input.ready + output.payload.assignSomeByName(input.payload) + + val needShift = !dividerResult.msb + val mantissa = needShift ? dividerResult(0, p.internalMantissaSize + 1 bits) | dividerResult(1, p.internalMantissaSize + 1 bits) + val scrap = dividerScrap || !needShift && dividerResult(0) + val exponentOffset = 1 << (p.internalExponentSize + 1) + val exponent = input.rs1.exponent + U(exponentOffset | exponentOne) - input.rs2.exponent - U(needShift) + + output.value.setNormal + output.value.sign := input.rs1.sign ^ input.rs2.sign + output.value.exponent := exponent.resized + output.value.mantissa := mantissa + output.scrap := scrap + when(exponent.takeHigh(2) === 3){ output.value.exponent(p.internalExponentSize-3, 3 bits) := 7} //Handle overflow + + + + val underflowThreshold = muxDouble[UInt](input.format)(exponentOne + exponentOffset - 1023 - 53) (exponentOne + exponentOffset - 127 - 24) + val underflowExp = muxDouble[UInt](input.format)(exponentOne + exponentOffset - 1023 - 54) (exponentOne + exponentOffset - 127 - 25) + val forceUnderflow = exponent < underflowThreshold + val forceOverflow = input.rs1.isInfinity || input.rs2.isZero + val infinitynan = input.rs1.isZero && input.rs2.isZero || input.rs1.isInfinity && input.rs2.isInfinity + val forceNan = input.rs1.isNan || input.rs2.isNan || infinitynan + val forceZero = input.rs1.isZero || input.rs2.isInfinity + + + + output.NV := False + output.DZ := !forceNan && !input.rs1.isInfinity && input.rs2.isZero + + when(exponent(exponent.getWidth-3, 3 bits) === 7) { output.value.exponent(p.internalExponentSize-2, 2 bits) := 3 } + + when(forceNan) { + output.value.setNanQuiet + output.NV setWhen((infinitynan || input.rs1.isNanSignaling || input.rs2.isNanSignaling)) + } elsewhen(forceOverflow) { + output.value.setInfinity + } elsewhen(forceZero) { + output.value.setZero + } elsewhen(forceUnderflow) { + output.value.exponent := underflowExp.resized + } + + + haltIt clearWhen(divider.io.output.valid) + } + + + + val sqrt = p.withSqrt generate new Area{ + val input = decode.sqrt.halfPipe() + val haltIt = True + val isCommited = RegNext(commitConsume(_.sqrt, input.source, input.fire)) + val output = input.haltWhen(haltIt || !isCommited).swapPayload(new MergeInput()) + + val needShift = !input.rs1.exponent.lsb + val sqrt = FpuSqrt(p.internalMantissaSize) + sqrt.io.input.a := (needShift ? 
(U"1" @@ input.rs1.mantissa @@ U"0") | (U"01" @@ input.rs1.mantissa)) + + val cmdSent = RegInit(False) setWhen(sqrt.io.input.fire) clearWhen(!haltIt) + sqrt.io.input.valid := input.valid && !cmdSent + sqrt.io.output.ready := input.ready + output.payload.assignSomeByName(input.payload) + + + val scrap = sqrt.io.output.remain =/= 0 + val exponent = RegNext(exponentOne-exponentOne/2 -1 +^ (input.rs1.exponent >> 1) + U(input.rs1.exponent.lsb)) + + output.value.setNormal + output.value.sign := input.rs1.sign + output.value.exponent := exponent + output.value.mantissa := sqrt.io.output.result + output.scrap := scrap + output.NV := False + output.DZ := False + + val negative = !input.rs1.isNan && !input.rs1.isZero && input.rs1.sign + + when(input.rs1.isInfinity){ + output.value.setInfinity + } + when(negative){ + output.value.setNanQuiet + output.NV := True + } + when(input.rs1.isNan){ + output.value.setNanQuiet + output.NV := !input.rs1.isQuiet + } + when(input.rs1.isZero){ + output.value.setZero + } + + +// val underflowThreshold = muxDouble[UInt](input.format)(exponentOne + exponentOffset - 1023 - 53) (exponentOne + exponentOffset - 127 - 24) +// val underflowExp = muxDouble[UInt](input.format)(exponentOne + exponentOffset - 1023 - 54) (exponentOne + exponentOffset - 127 - 25) +// val forceUnderflow = exponent < underflowThreshold +// val forceOverflow = input.rs1.isInfinity// || input.rs2.isInfinity +// val infinitynan = input.rs1.isZero && input.rs2.isZero +// val forceNan = input.rs1.isNan || input.rs2.isNan || infinitynan +// val forceZero = input.rs1.isZero +// +// +// +// output.NV := False +// output.DZ := !forceNan && input.rs2.isZero +// +// when(exponent(exponent.getWidth-3, 3 bits) === 7) { output.value.exponent(p.internalExponentSize-2, 2 bits) := 3 } +// +// when(forceNan) { +// output.value.setNanQuiet +// output.NV setWhen((infinitynan || input.rs1.isNanSignaling || input.rs2.isNanSignaling)) +// } elsewhen(forceOverflow) { +// output.value.setInfinity +// } elsewhen(forceZero) { +// output.value.setZero +// } elsewhen(forceUnderflow) { +// output.value.exponent := underflowExp.resized +// } + + + haltIt clearWhen(sqrt.io.output.valid) + } + + //divSqrt isn't realy used anymore + val divSqrt = p.withDivSqrt generate new Area { + val input = decode.divSqrt.halfPipe() + assert(false, "Need to implement commit tracking") + val aproxWidth = 8 + val aproxDepth = 64 + val divIterationCount = 3 + val sqrtIterationCount = 3 + + val mulWidth = p.internalMantissaSize + 1 + + import FpuDivSqrtIterationState._ + val state = RegInit(FpuDivSqrtIterationState.IDLE()) + val iteration = Reg(UInt(log2Up(divIterationCount max sqrtIterationCount) bits)) + + decode.divSqrtToMul.valid := False + decode.divSqrtToMul.source := input.source + decode.divSqrtToMul.rs1.assignDontCare() + decode.divSqrtToMul.rs2.assignDontCare() + decode.divSqrtToMul.rs3.assignDontCare() + decode.divSqrtToMul.rd := input.rd + decode.divSqrtToMul.add := False + decode.divSqrtToMul.divSqrt := True + decode.divSqrtToMul.msb1 := True + decode.divSqrtToMul.msb2 := True + decode.divSqrtToMul.rs1.special := False //TODO + decode.divSqrtToMul.rs2.special := False + decode.divSqrtToMul.roundMode := input.roundMode + if(p.withDouble) decode.divSqrtToMul.format := input.format + + + val aprox = new Area { + val rom = Mem(UInt(aproxWidth bits), aproxDepth * 2) + val divTable, sqrtTable = ArrayBuffer[Double]() + for(i <- 0 until aproxDepth){ + val value = 1+(i+0.5)/aproxDepth + divTable += 1/value + } + for(i <- 0 until aproxDepth){ + 
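+        // This loop fills the 1/sqrt(x) seed table used by the Newton-Raphson
+        // iterations: entries with i < aproxDepth/2 sample x across [2, 4)
+        // (scale = 2) and the remaining entries sample x across [1, 2), so the
+        // top bit of the sqrt index (the operand exponent's LSB) picks the
+        // matching half. With aproxDepth = 64, i = 0 samples x of about 2.031
+        // and i = 32 samples x of about 1.016. Every seed (and every 1/x seed
+        // above) lies in (0.5, 1], which is why romElaboration stores
+        // (v - 0.5) * 2 on aproxWidth bits and aprox.result prepends the
+        // implicit leading "01" when reading the table back.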
val scale = if(i < aproxDepth/2) 2 else 1 + val value = scale+(scale*(i%(aproxDepth/2)+0.5)/aproxDepth*2) +// println(s"$i => $value" ) + sqrtTable += 1/Math.sqrt(value) + } + val romElaboration = (sqrtTable ++ divTable).map(v => BigInt(((v-0.5)*2*(1 << aproxWidth)).round)) + + rom.initBigInt(romElaboration) + val div = input.rs2.mantissa.takeHigh(log2Up(aproxDepth)) + val sqrt = U(input.rs1.exponent.lsb ## input.rs1.mantissa).takeHigh(log2Up(aproxDepth)) + val address = U(input.div ## (input.div ? div | sqrt)) + val raw = rom.readAsync(address) + val result = U"01" @@ (raw << (mulWidth-aproxWidth-2)) + } + + val divExp = new Area{ + val value = (1 << p.internalExponentSize) - 3 - input.rs2.exponent + } + val sqrtExp = new Area{ + val value = ((1 << p.internalExponentSize-1) + (1 << p.internalExponentSize-2) - 2 -1) - (input.rs1.exponent >> 1) + U(!input.rs1.exponent.lsb) + } + + def mulArg(rs1 : UInt, rs2 : UInt): Unit ={ + decode.divSqrtToMul.rs1.mantissa := rs1.resized + decode.divSqrtToMul.rs2.mantissa := rs2.resized + decode.divSqrtToMul.msb1 := rs1.msb + decode.divSqrtToMul.msb2 := rs2.msb + } + + val mulBuffer = mul.result.notMul.output.toStream.stage + mulBuffer.ready := False + + val iterationValue = Reg(UInt(mulWidth bits)) + + input.ready := False + switch(state){ + is(IDLE){ + iterationValue := aprox.result + iteration := 0 + when(input.valid) { + state := YY + } + } + is(YY){ + decode.divSqrtToMul.valid := True + mulArg(iterationValue, iterationValue) + when(decode.divSqrtToMul.ready) { + state := XYY + } + } + is(XYY){ + decode.divSqrtToMul.valid := mulBuffer.valid + val sqrtIn = !input.rs1.exponent.lsb ? (U"1" @@ input.rs1.mantissa) | ((U"1" @@ input.rs1.mantissa) |>> 1) + val divIn = U"1" @@ input.rs2.mantissa + mulArg(input.div ? divIn| sqrtIn, mulBuffer.payload) + when(mulBuffer.valid && decode.divSqrtToMul.ready) { + state := (input.div ? 
Y2_XYY | _15_XYY2) + mulBuffer.ready := True + } + } + is(Y2_XYY){ + mulBuffer.ready := True + when(mulBuffer.valid) { + iterationValue := ((iterationValue << 1) - mulBuffer.payload).resized + mulBuffer.ready := True + iteration := iteration + 1 + when(iteration =/= divIterationCount-1){ //TODO + state := YY + } otherwise { + state := DIV + } + } + } + is(DIV){ + decode.divSqrtToMul.valid := True + decode.divSqrtToMul.divSqrt := False + decode.divSqrtToMul.rs1 := input.rs1 + decode.divSqrtToMul.rs2.sign := input.rs2.sign + decode.divSqrtToMul.rs2.exponent := divExp.value + iterationValue.msb.asUInt + decode.divSqrtToMul.rs2.mantissa := (iterationValue << 1).resized + val zero = input.rs2.isInfinity + val overflow = input.rs2.isZero + val nan = input.rs2.isNan || (input.rs1.isZero && input.rs2.isZero) + + when(nan){ + decode.divSqrtToMul.rs2.setNanQuiet + } elsewhen(overflow) { + decode.divSqrtToMul.rs2.setInfinity + } elsewhen(zero) { + decode.divSqrtToMul.rs2.setZero + } + when(decode.divSqrtToMul.ready) { + state := IDLE + input.ready := True + } + } + is(_15_XYY2){ + when(mulBuffer.valid) { + state := Y_15_XYY2 + mulBuffer.payload.getDrivingReg := (U"11" << mulWidth-2) - (mulBuffer.payload) + } + } + is(Y_15_XYY2){ + decode.divSqrtToMul.valid := True + mulArg(iterationValue, mulBuffer.payload) + when(decode.divSqrtToMul.ready) { + mulBuffer.ready := True + state := Y_15_XYY2_RESULT + } + } + is(Y_15_XYY2_RESULT){ + iterationValue := mulBuffer.payload + mulBuffer.ready := True + when(mulBuffer.valid) { + iteration := iteration + 1 + when(iteration =/= sqrtIterationCount-1){ + state := YY + } otherwise { + state := SQRT + } + } + } + is(SQRT){ + decode.divSqrtToMul.valid := True + decode.divSqrtToMul.divSqrt := False + decode.divSqrtToMul.rs1 := input.rs1 + decode.divSqrtToMul.rs2.sign := False + decode.divSqrtToMul.rs2.exponent := sqrtExp.value + iterationValue.msb.asUInt + decode.divSqrtToMul.rs2.mantissa := (iterationValue << 1).resized + + val nan = input.rs1.sign && !input.rs1.isZero + + when(nan){ + decode.divSqrtToMul.rs2.setNanQuiet + } + + when(decode.divSqrtToMul.ready) { + state := IDLE + input.ready := True + } + } + } + } + + val add = p.withAdd generate new Area{ + + + class PreShifterOutput extends AddInput{ + val absRs1Bigger = Bool() + val rs1ExponentBigger = Bool() + } + + val preShifter = new Area{ + val input = decode.add.combStage() + val output = input.swapPayload(new PreShifterOutput) + + val exp21 = input.rs2.exponent -^ input.rs1.exponent + val rs1ExponentBigger = (exp21.msb || input.rs2.isZero) && !input.rs1.isZero + val rs1ExponentEqual = input.rs1.exponent === input.rs2.exponent + val rs1MantissaBigger = input.rs1.mantissa > input.rs2.mantissa + val absRs1Bigger = ((rs1ExponentBigger || rs1ExponentEqual && rs1MantissaBigger) && !input.rs1.isZero || input.rs1.isInfinity) && !input.rs2.isInfinity + + output.payload.assignSomeByName(input.payload) + output.absRs1Bigger := absRs1Bigger + output.rs1ExponentBigger := rs1ExponentBigger + } + + class ShifterOutput extends AddInput{ + val xSign, ySign = Bool() + val xMantissa, yMantissa = UInt(p.internalMantissaSize+1+addExtraBits bits) + val xyExponent = UInt(p.internalExponentSize bits) + val xySign = Bool() + val roundingScrap = Bool() + } + + val shifter = new Area { + val input = preShifter.output.stage() + val output = input.swapPayload(new ShifterOutput) + output.payload.assignSomeByName(input.payload) + + val exp21 = input.rs2.exponent -^ input.rs1.exponent + val shiftBy = exp21.asSInt.abs//rs1ExponentBigger ? 
(0-exp21) | exp21 + val shiftOverflow = (shiftBy >= p.internalMantissaSize+1+addExtraBits) + val passThrough = shiftOverflow || (input.rs1.isZero) || (input.rs2.isZero) + + def absRs1Bigger = input.absRs1Bigger + def rs1ExponentBigger = input.rs1ExponentBigger + + //Note that rs1ExponentBigger can be replaced by absRs1Bigger bellow to avoid xsigned two complement in math block at expense of combinatorial path + val xySign = absRs1Bigger ? input.rs1.sign | input.rs2.sign + output.xSign := xySign ^ (rs1ExponentBigger ? input.rs1.sign | input.rs2.sign) + output.ySign := xySign ^ (rs1ExponentBigger ? input.rs2.sign | input.rs1.sign) + val xMantissa = U"1" @@ (rs1ExponentBigger ? input.rs1.mantissa | input.rs2.mantissa) + val yMantissaUnshifted = U"1" @@ (rs1ExponentBigger ? input.rs2.mantissa | input.rs1.mantissa) + var yMantissa = CombInit(yMantissaUnshifted) + val roundingScrap = False + for(i <- log2Up(p.internalMantissaSize) - 1 downto 0){ + roundingScrap setWhen(shiftBy(i) && yMantissa(0, 1 << i bits) =/= 0) + yMantissa \= shiftBy(i) ? (yMantissa |>> (BigInt(1) << i)) | yMantissa + } + when(passThrough) { yMantissa := 0 } + when(shiftOverflow) { roundingScrap := True } + when(input.rs1.special || input.rs2.special){ roundingScrap := False } + output.xyExponent := rs1ExponentBigger ? input.rs1.exponent | input.rs2.exponent + output.xMantissa := xMantissa + output.yMantissa := yMantissa + output.xySign := xySign + output.roundingScrap := roundingScrap + } + + class MathOutput extends ShifterOutput{ + val xyMantissa = UInt(p.internalMantissaSize+1+addExtraBits+1 bits) + } + + val math = new Area { + val input = shifter.output.stage() + val output = input.swapPayload(new MathOutput) + output.payload.assignSomeByName(input.payload) + import input.payload._ + + val xSigned = xMantissa.twoComplement(xSign) //TODO Is that necessary ? 
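+        // ySigned below folds negation and rounding into a single add: when y is
+        // negative the RTL uses ~yMantissa (= -y - 1) and only adds the +1 that
+        // completes the two's complement when the alignment shift lost no bits
+        // (roundingScrap clear). That keeps the sum a floor of the exact result,
+        // leaving the sticky bit to the rounding stage. A small integer model of
+        // this step (illustrative names, not part of the pipeline):
+        def signedMantissaAddModel(xMan: BigInt, xNeg: Boolean,
+                                   yMan: BigInt, yNeg: Boolean,
+                                   sticky: Boolean): BigInt = {
+          val x = if (xNeg) -xMan else xMan
+          // When sticky bits were shifted out, -yMan - 1 under-approximates the
+          // true -(yMan + epsilon) contribution by strictly less than one ULP.
+          val y = if (yNeg) (if (sticky) -yMan - 1 else -yMan) else yMan
+          x + y
+        }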
+ val ySigned = ((ySign ## Mux(ySign, ~yMantissa, yMantissa)).asUInt + (ySign && !roundingScrap).asUInt).asSInt //rounding here + output.xyMantissa := U(xSigned +^ ySigned).trim(1 bits) + + } + + class OhOutput extends MathOutput{ + val shift = UInt(log2Up(p.internalMantissaSize+1+addExtraBits+1) bits) + } + + val oh = new Area { + val input = math.output.stage() + val isCommited = commitConsume(_.add, input.source, input.fire && input.needCommit) + val output = input.haltWhen(input.needCommit && !isCommited).swapPayload(new OhOutput) + output.payload.assignSomeByName(input.payload) + import input.payload._ + + val shiftOh = OHMasking.first(output.xyMantissa.asBools.reverse) //The OhMasking.first can be processed in parallel to the xyMantissa carry chaine +// output.shiftOh := shiftOh + + val shift = OHToUInt(shiftOh) + output.shift := shift + } + + + class NormOutput extends AddInput{ + val mantissa = UInt(p.internalMantissaSize+1+addExtraBits+1 bits) + val exponent = UInt(p.internalExponentSize+1 bits) + val infinityNan, forceNan, forceZero, forceInfinity = Bool() + val xySign, roundingScrap = Bool() + val xyMantissaZero = Bool() + } + + val norm = new Area{ + val input = oh.output.stage() + val output = input.swapPayload(new NormOutput) + output.payload.assignSomeByName(input.payload) + import input.payload._ + + output.mantissa := (xyMantissa |<< shift) + output.exponent := xyExponent -^ shift + 1 + output.forceInfinity := (input.rs1.isInfinity || input.rs2.isInfinity) + output.forceZero := xyMantissa === 0 || (input.rs1.isZero && input.rs2.isZero) + output.infinityNan := (input.rs1.isInfinity && input.rs2.isInfinity && (input.rs1.sign ^ input.rs2.sign)) + output.forceNan := input.rs1.isNan || input.rs2.isNan || output.infinityNan + output.xyMantissaZero := xyMantissa === 0 + } + + val result = new Area { + val input = norm.output.pipelined() + val output = input.swapPayload(new MergeInput()) + import input.payload._ + + output.source := input.source + output.rd := input.rd + output.value.sign := xySign + output.value.mantissa := (mantissa >> addExtraBits).resized + output.value.exponent := exponent.resized + output.value.special := False + output.roundMode := input.roundMode + if (p.withDouble) output.format := input.format + output.scrap := (mantissa(1) | mantissa(0) | roundingScrap) + + output.NV := infinityNan || input.rs1.isNanSignaling || input.rs2.isNanSignaling + output.DZ := False + when(forceNan) { + output.value.setNanQuiet + } elsewhen (forceInfinity) { + output.value.setInfinity + } elsewhen (forceZero) { + output.value.setZero + when(xyMantissaZero || input.rs1.isZero && input.rs2.isZero) { + output.value.sign := input.rs1.sign && input.rs2.sign + } + when((input.rs1.sign || input.rs2.sign) && input.roundMode === FpuRoundMode.RDN) { + output.value.sign := True + } + } + } + } + + + val merge = new Area { + val inputs = ArrayBuffer[Stream[MergeInput]]() + inputs += load.s1.output.stage() + if(p.withSqrt) (inputs += sqrt.output) + if(p.withDiv) (inputs += div.output) + if(p.withAdd) (inputs += add.result.output) + if(p.withMul) (inputs += mul.result.output) + if(p.withShortPipMisc) (inputs += shortPip.output.pipelined(m2s = true)) + val arbitrated = StreamArbiterFactory.lowerFirst.noLock.on(inputs).toFlow + } + + class RoundFront extends MergeInput{ + val mantissaIncrement = Bool() + val roundAdjusted = Bits(2 bits) + val exactMask = UInt(p.internalMantissaSize + 2 bits) + } + + val roundFront = new Area { + val input = merge.arbitrated.stage() + val output = 
input.swapPayload(new RoundFront()) + output.payload.assignSomeByName(input.payload) + + val manAggregate = input.value.mantissa @@ input.scrap + val expBase = muxDouble[UInt](input.format)(exponentF64Subnormal + 1)(exponentF32Subnormal + 1) + val expDif = expBase -^ input.value.exponent + val expSubnormal = !expDif.msb + var discardCount = (expSubnormal ? expDif.resize(log2Up(p.internalMantissaSize) bits) | U(0)) + if (p.withDouble) when(input.format === FpuFormat.FLOAT) { + discardCount \= discardCount + 29 + } + val exactMask = (List(True) ++ (0 until p.internalMantissaSize + 1).map(_ < discardCount)).asBits.asUInt + val roundAdjusted = (True ## (manAggregate >> 1)) (discardCount) ## ((manAggregate & exactMask) =/= 0) + + val mantissaIncrement = !input.value.special && input.roundMode.mux( + FpuRoundMode.RNE -> (roundAdjusted(1) && (roundAdjusted(0) || (U"01" ## (manAggregate >> 2)) (discardCount))), + FpuRoundMode.RTZ -> False, + FpuRoundMode.RDN -> (roundAdjusted =/= 0 && input.value.sign), + FpuRoundMode.RUP -> (roundAdjusted =/= 0 && !input.value.sign), + FpuRoundMode.RMM -> (roundAdjusted(1)) + ) + + output.mantissaIncrement := mantissaIncrement + output.roundAdjusted := roundAdjusted + output.exactMask := exactMask + } + + val roundBack = new Area{ + val input = roundFront.output.stage() + val output = input.swapPayload(RoundOutput()) + import input.payload._ + + val math = p.internalFloating() + val mantissaRange = p.internalMantissaSize downto 1 + val adderMantissa = input.value.mantissa(mantissaRange) & (mantissaIncrement ? ~(exactMask.trim(1) >> 1) | input.value.mantissa(mantissaRange).maxValue) + val adderRightOp = (mantissaIncrement ? (exactMask >> 1)| U(0)).resize(p.internalMantissaSize bits) + val adder = KeepAttribute(KeepAttribute(input.value.exponent @@ adderMantissa) + KeepAttribute(adderRightOp) + KeepAttribute(U(mantissaIncrement))) + math.special := input.value.special + math.sign := input.value.sign + math.exponent := adder(p.internalMantissaSize, p.internalExponentSize bits) + math.mantissa := adder(0, p.internalMantissaSize bits) + + val patched = CombInit(math) + val nx,of,uf = False + + val ufSubnormalThreshold = muxDouble[UInt](input.format)(exponentF64Subnormal)(exponentF32Subnormal) + val ufThreshold = muxDouble[UInt](input.format)(exponentF64Subnormal-52+1)(exponentF32Subnormal-23+1) + val ofThreshold = muxDouble[UInt](input.format)(exponentF64Infinity-1)(exponentF32Infinity-1) + + //catch exact 1.17549435E-38 underflow, but, who realy care ? +// val borringCase = input.value.exponent === ufSubnormalThreshold && roundAdjusted.asUInt < U"11" +// when(!math.special && (math.exponent <= ufSubnormalThreshold || borringCase) && roundAdjusted.asUInt =/= 0){ +// uf := True +// } + val threshold = input.roundMode.mux( + FpuRoundMode.RNE -> U"110", + FpuRoundMode.RTZ -> U"110", + FpuRoundMode.RDN -> (input.value.sign ? U"101" | U"111"), + FpuRoundMode.RUP -> (input.value.sign ? 
U"111" | U"101"), + FpuRoundMode.RMM -> U"110" + ) + val borringRound = (input.value.mantissa(1 downto 0) ## input.scrap) + if(p.withDouble) when(input.format === FpuFormat.FLOAT) { borringRound := (input.value.mantissa(30 downto 29) ## input.value.mantissa(28 downto 0).orR)} + + val borringCase = input.value.exponent === ufSubnormalThreshold && borringRound.asUInt < threshold + when(!math.special && (math.exponent <= ufSubnormalThreshold || borringCase) && roundAdjusted.asUInt =/= 0){ + uf := True + } + when(!math.special && math.exponent > ofThreshold){ + nx := True + of := True + val doMax = input.roundMode.mux( + FpuRoundMode.RNE -> (False), + FpuRoundMode.RTZ -> (True), + FpuRoundMode.RDN -> (!math.sign), + FpuRoundMode.RUP -> (math.sign), + FpuRoundMode.RMM -> (False) + ) + when(doMax){ + patched.exponent := ofThreshold + patched.mantissa.setAll() + } otherwise { + patched.setInfinity + } + } + + + when(!math.special && math.exponent < ufThreshold){ + nx := True + uf := True + val doMin = input.roundMode.mux( + FpuRoundMode.RNE -> (False), + FpuRoundMode.RTZ -> (False), + FpuRoundMode.RDN -> (math.sign), + FpuRoundMode.RUP -> (!math.sign), + FpuRoundMode.RMM -> (False) + ) + when(doMin){ + patched.exponent := ufThreshold.resized + patched.mantissa := 0 + } otherwise { + patched.setZero + } + } + + + nx setWhen(!input.value.special && (roundAdjusted =/= 0)) + val writes = rf.scoreboards.map(_.writes.readAsync(input.rd)) + val write = writes.toList.read(input.source) + output.NX := nx & write + output.OF := of & write + output.UF := uf & write + output.NV := input.NV & write + output.DZ := input.DZ & write + output.source := input.source + output.rd := input.rd + output.write := write + if(p.withDouble) output.format := input.format + output.value := patched + } + + val writeback = new Area{ + val input = roundBack.output.stage() + + for(i <- 0 until portCount){ + val c = io.port(i).completion + c.valid := input.fire && input.source === i + c.flags.NX := input.NX + c.flags.OF := input.OF + c.flags.UF := input.UF + c.flags.NV := input.NV + c.flags.DZ := input.DZ + c.written := input.write + } + + when(input.valid){ + for(i <- 0 until portCount) { + val port = rf.scoreboards(i).hitWrite + port.valid setWhen(input.source === i) + port.address := input.rd + port.data := !rf.scoreboards(i).hit(input.rd) //TODO improve + } + } + + val port = rf.ram.writePort + port.valid := input.valid && input.write + port.address := input.source @@ input.rd + port.data.value := input.value + if(p.withDouble) port.data.boxed := input.format === FpuFormat.FLOAT + + val randomSim = p.sim generate (in UInt(p.internalMantissaSize bits)) + if(p.sim) when(port.data.value.isZero || port.data.value.isInfinity){ + port.data.value.mantissa := randomSim + } + if(p.sim) when(input.value.special){ + port.data.value.exponent(p.internalExponentSize-1 downto 3) := randomSim.resized + when(!input.value.isNan){ + port.data.value.exponent(2 downto 2) := randomSim.resized + } + } + + when(port.valid){ + assert(!(port.data.value.exponent === 0 && !port.data.value.special), "Special violation") + assert(!(port.data.value.exponent === port.data.value.exponent.maxValue && !port.data.value.special), "Special violation") + } + } +} + + + + +object FpuSynthesisBench extends App{ + val payloadType = HardType(Bits(8 bits)) + class Fpu(name : String, portCount : Int, p : FpuParameter) extends Rtl{ + override def getName(): String = "Fpu_" + name + override def getRtlPath(): String = getName() + ".v" + SpinalVerilog(new 
FpuCore(portCount, p){ + + setDefinitionName(Fpu.this.getName()) + }) + } + + class Shifter(width : Int) extends Rtl{ + override def getName(): String = "shifter_" + width + override def getRtlPath(): String = getName() + ".v" + SpinalVerilog(new Component{ + val a = in UInt(width bits) + val sel = in UInt(log2Up(width) bits) + val result = out(a >> sel) + setDefinitionName(Shifter.this.getName()) + }) + } + + class Rotate(width : Int) extends Rtl{ + override def getName(): String = "rotate_" + width + override def getRtlPath(): String = getName() + ".v" + SpinalVerilog(new Component{ + val a = in UInt(width bits) + val sel = in UInt(log2Up(width) bits) + val result = out(Delay(Delay(a,3).rotateLeft(Delay(sel,3)),3)) + setDefinitionName(Rotate.this.getName()) + }) + } + +// rotate2_24 -> +// Artix 7 -> 233 Mhz 96 LUT 167 FF +// Artix 7 -> 420 Mhz 86 LUT 229 FF +// rotate2_32 -> +// Artix 7 -> 222 Mhz 108 LUT 238 FF +// Artix 7 -> 399 Mhz 110 LUT 300 FF +// rotate2_52 -> +// Artix 7 -> 195 Mhz 230 LUT 362 FF +// Artix 7 -> 366 Mhz 225 LUT 486 FF +// rotate2_64 -> +// Artix 7 -> 182 Mhz 257 LUT 465 FF +// Artix 7 -> 359 Mhz 266 LUT 591 FF + class Rotate2(width : Int) extends Rtl{ + override def getName(): String = "rotate2_" + width + override def getRtlPath(): String = getName() + ".v" + SpinalVerilog(new Component{ + val a = in UInt(width bits) + val sel = in UInt(log2Up(width) bits) + val result = out(Delay((U(0, width bits) @@ Delay(a,3)).rotateLeft(Delay(sel,3)),3)) + setDefinitionName(Rotate2.this.getName()) + }) + } + + class Rotate3(width : Int) extends Rtl{ + override def getName(): String = "rotate3_" + width + override def getRtlPath(): String = getName() + ".v" + SpinalVerilog(new Component{ + val a = Delay(in UInt(width bits), 3) + val sel = Delay(in UInt(log2Up(width) bits),3) + // val result = + // val output = Delay(result, 3) + setDefinitionName(Rotate3.this.getName()) + }) + } + + class Div(width : Int) extends Rtl{ + override def getName(): String = "div_" + width + override def getRtlPath(): String = getName() + ".v" + SpinalVerilog(new UnsignedDivider(width,width, false).setDefinitionName(Div.this.getName())) + } + + class Add(width : Int) extends Rtl{ + override def getName(): String = "add_" + width + override def getRtlPath(): String = getName() + ".v" + SpinalVerilog(new Component{ + val a, b = in UInt(width bits) + val result = out(a + b) + setDefinitionName(Add.this.getName()) + }) + } + + class DivSqrtRtl(width : Int) extends Rtl{ + override def getName(): String = "DivSqrt_" + width + override def getRtlPath(): String = getName() + ".v" + SpinalVerilog(new FpuDiv(width).setDefinitionName(DivSqrtRtl.this.getName())) + } + + val rtls = ArrayBuffer[Rtl]() + rtls += new Fpu( + "32", + portCount = 1, + FpuParameter( +// withDivSqrt = false, + withDouble = false + ) + ) + rtls += new Fpu( + "64", + portCount = 1, + FpuParameter( +// withDivSqrt = false, + withDouble = true + ) + ) + +// rtls += new Div(52) +// rtls += new Div(23) +// rtls += new Add(64) +// rtls += new DivSqrtRtl(52) +// rtls += new DivSqrtRtl(23) + + // rtls += new Shifter(24) +// rtls += new Shifter(32) +// rtls += new Shifter(52) +// rtls += new Shifter(64) +// rtls += new Rotate(24) +// rtls += new Rotate(32) +// rtls += new Rotate(52) +// rtls += new Rotate(64) +// rtls += new Rotate3(24) +// rtls += new Rotate3(32) +// rtls += new Rotate3(52) +// rtls += new Rotate3(64) + + val targets = XilinxStdTargets()// ++ AlteraStdTargets() + + + Bench(rtls, targets) +} + +//Fpu_32 -> +//Artix 7 -> 136 Mhz 
1471 LUT 1336 FF +//Artix 7 -> 196 Mhz 1687 LUT 1371 FF +//Fpu_64 -> +//Artix 7 -> 105 Mhz 2822 LUT 2132 FF +//Artix 7 -> 161 Mhz 3114 LUT 2272 FF +// +// +// +//Fpu_32 -> +//Artix 7 -> 128 Mhz 1693 LUT 1481 FF +//Artix 7 -> 203 Mhz 1895 LUT 1481 FF +//Fpu_64 -> +//Artix 7 -> 99 Mhz 3073 LUT 2396 FF +//Artix 7 -> 164 Mhz 3433 LUT 2432 FF + + +//Fpu_32 -> +//Artix 7 -> 112 Mhz 1790 LUT 1666 FF +//Artix 7 -> 158 Mhz 1989 LUT 1701 FF +//Fpu_64 -> +//Artix 7 -> 100 Mhz 3294 LUT 2763 FF +//Artix 7 -> 151 Mhz 3708 LUT 2904 FF + +//Fpu_32 -> +//Artix 7 -> 139 Mhz 1879 LUT 1713 FF +//Artix 7 -> 206 Mhz 2135 LUT 1723 FF +//Fpu_64 -> +//Artix 7 -> 106 Mhz 3502 LUT 2811 FF +//Artix 7 -> 163 Mhz 3905 LUT 2951 FF + +//Fpu_32 -> +//Artix 7 -> 130 Mhz 1889 LUT 1835 FF +//Artix 7 -> 210 Mhz 2131 LUT 1845 FF +//Fpu_64 -> +//Artix 7 -> 106 Mhz 3322 LUT 3023 FF +//Artix 7 -> 161 Mhz 3675 LUT 3163 FF + +//Fpu_32 -> +//Artix 7 -> 132 Mhz 1891 LUT 1837 FF +//Artix 7 -> 209 Mhz 2132 LUT 1847 FF +//Fpu_64 -> +//Artix 7 -> 105 Mhz 3348 LUT 3024 FF +//Artix 7 -> 162 Mhz 3712 LUT 3165 FF + +//Fpu_32 -> +//Artix 7 -> 128 Mhz 1796 LUT 1727 FF +//Artix 7 -> 208 Mhz 2049 LUT 1727 FF +//Fpu_64 -> +//Artix 7 -> 109 Mhz 3417 LUT 2913 FF +//Artix 7 -> 168 Mhz 3844 LUT 3053 FF + +/* +testfloat -tininessafter -all1 > all1.txt +cat all1.txt | grep "Errors found in" + +testfloat -tininessafter -all2 > all2.txt +cat all2.txt | grep "Errors found in" + +testfloat -tininessafter -f32_mulAdd > fma.txt + +testfloat -tininessafter -all2 -level 2 -checkall > all2.txt + + + +all1 => +Errors found in f32_to_ui64_rx_minMag: +Errors found in f32_to_i64_rx_minMag: +Errors found in f64_to_ui64_rx_minMag: +Errors found in f64_to_i64_rx_minMag: + +all2 => + + +Errors found in f32_mulAdd, rounding min: ++00.7FFFFF +67.000001 -01.000000 + => -01.000000 ...ux expected -01.000000 ....x ++67.000001 +00.7FFFFF -01.000000 + => -01.000000 ...ux expected -01.000000 ....x +-00.7FFFFF -67.000001 -01.000000 + => -01.000000 ...ux expected -01.000000 ....x +-67.000001 -00.7FFFFF -01.000000 + => -01.000000 ...ux expected -01.000000 ....x +Errors found in f32_mulAdd, rounding max: ++00.7FFFFF -67.000001 +01.000000 + => +01.000000 ...ux expected +01.000000 ....x ++67.000001 -00.7FFFFF +01.000000 + => +01.000000 ...ux expected +01.000000 ....x ++66.7FFFFE -01.000001 +01.000000 + => +01.000000 ...ux expected +01.000000 ....x +-00.7FFFFF +67.000001 +01.000000 + => +01.000000 ...ux expected +01.000000 ....x +-67.000001 +00.7FFFFF +01.000000 + => +01.000000 ...ux expected +01.000000 ....x + + + + */
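The rounding path above reduces to a small decision table: roundFront collapses the discarded mantissa bits into a guard bit and a sticky bit (roundAdjusted), and the round mode plus the sign select whether the kept mantissa is incremented; roundBack then applies that increment with a single carry-propagate adder and patches the result against the of/uf thresholds. The plain-Scala sketch below restates the increment decision; RoundModel and its parameter names are illustrative, not part of the source.

object RoundModel {
  sealed trait Mode
  case object RNE extends Mode
  case object RTZ extends Mode
  case object RDN extends Mode
  case object RUP extends Mode
  case object RMM extends Mode

  // lsb    : lowest mantissa bit that is kept
  // guard  : first discarded bit (roundAdjusted(1) in the hardware)
  // sticky : OR of the remaining discarded bits (roundAdjusted(0))
  def increment(mode: Mode, sign: Boolean, lsb: Boolean, guard: Boolean, sticky: Boolean): Boolean = {
    val discarded = guard || sticky
    mode match {
      case RNE => guard && (sticky || lsb)   // nearest, ties to even
      case RTZ => false                      // truncate
      case RDN => discarded && sign          // toward minus infinity
      case RUP => discarded && !sign         // toward plus infinity
      case RMM => guard                      // nearest, ties away from zero
    }
  }
}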
\ No newline at end of file diff --git a/VexRiscv/src/main/scala/vexriscv/ip/fpu/FpuDiv.scala b/VexRiscv/src/main/scala/vexriscv/ip/fpu/FpuDiv.scala new file mode 100644 index 0000000..7c9e713 --- /dev/null +++ b/VexRiscv/src/main/scala/vexriscv/ip/fpu/FpuDiv.scala @@ -0,0 +1,140 @@ +package vexriscv.ip.fpu + + +import spinal.core._ +import spinal.lib.math.{UnsignedDividerCmd, UnsignedDividerRsp} +import spinal.lib._ +import spinal.lib.sim.{StreamDriver, StreamMonitor, StreamReadyRandomizer} + +import scala.collection.mutable +import scala.util.Random + +case class FpuDivCmd(mantissaWidth : Int) extends Bundle{ + val a,b = UInt(mantissaWidth bits) +} + +case class FpuDivRsp(mantissaWidth : Int) extends Bundle{ + val result = UInt(mantissaWidth+1 + 2 bits) + val remain = UInt(mantissaWidth+1 bits) +} + +case class FpuDiv(val mantissaWidth : Int) extends Component { + assert(mantissaWidth % 2 == 0) + val io = new Bundle{ + val input = slave Stream(FpuDivCmd(mantissaWidth)) + val output = master Stream(FpuDivRsp(mantissaWidth)) + } + + val iterations = (mantissaWidth+2+2)/2 + val counter = Reg(UInt(log2Up(iterations) bits)) + val busy = RegInit(False) clearWhen(io.output.fire) + val done = RegInit(False) setWhen(busy && counter === iterations-1) clearWhen(io.output.fire) + + val shifter = Reg(UInt(mantissaWidth + 3 bits)) + val result = Reg(UInt(mantissaWidth+1+2 bits)) + + val div1, div3 = Reg(UInt(mantissaWidth+3 bits)) + val div2 = div1 |<< 1 + + val sub1 = shifter -^ div1 + val sub2 = shifter -^ div2 + val sub3 = shifter -^ div3 + + io.output.valid := done + io.output.result := (result << 0).resized + io.output.remain := (shifter >> 2).resized + io.input.ready := !busy + + when(!done){ + counter := counter + 1 + val sel = CombInit(shifter) + result := result |<< 2 + when(!sub1.msb){ + sel := sub1.resized + result(1 downto 0) := 1 + } + when(!sub2.msb){ + sel := sub2.resized + result(1 downto 0) := 2 + } + when(!sub3.msb){ + sel := sub3.resized + result(1 downto 0) := 3 + } + shifter := sel |<< 2 + } + + when(!busy){ + counter := 0 + shifter := (U"1" @@ io.input.a @@ U"").resized + div1 := (U"1" @@ io.input.b).resized + div3 := (U"1" @@ io.input.b) +^ (((U"1" @@ io.input.b)) << 1) + busy := io.input.valid + } +} + + +object FpuDivTester extends App{ + import spinal.core.sim._ + + for(w <- List(16, 20)) { + val config = SimConfig + config.withFstWave + config.compile(new FpuDiv(w)).doSim(seed=2){dut => + dut.clockDomain.forkStimulus(10) + + + val (cmdDriver, cmdQueue) = StreamDriver.queue(dut.io.input, dut.clockDomain) + val rspQueue = mutable.Queue[FpuDivRsp => Unit]() + StreamMonitor(dut.io.output, dut.clockDomain)( rspQueue.dequeue()(_)) + StreamReadyRandomizer(dut.io.output, dut.clockDomain) + + def test(a : Int, b : Int): Unit ={ + cmdQueue +={p => + p.a #= a + p.b #= b + } + rspQueue += {p => + val x = (a | (1 << dut.mantissaWidth)).toLong + val y = (b | (1 << dut.mantissaWidth)).toLong + val result = (x << dut.mantissaWidth+2) / y + val remain = (x << dut.mantissaWidth+2) % y + + assert(p.result.toLong == result, f"$x%x/$y%x=${p.result.toLong}%x instead of $result%x") + assert(p.remain.toLong == remain, f"$x%x %% $y%x=${p.remain.toLong}%x instead of $remain%x") + } + } + + val s = dut.mantissaWidth-16 + val f = (1 << dut.mantissaWidth)-1 + test(0xE000 << s, 0x8000 << s) + test(0xC000 << s, 0x4000 << s) + test(0xC835 << s, 0x4742 << s) + test(0,0) + test(0,f) + test(f,0) + test(f,f) + + for(i <- 0 until 10000){ + test(Random.nextInt(1 << dut.mantissaWidth), Random.nextInt(1 << 
dut.mantissaWidth)) + } + + waitUntil(rspQueue.isEmpty) + + dut.clockDomain.waitSampling(100) + + } + } +} + +//Scratchpad: manually replays one 52 bit division through the same golden model as the tester above. +object FpuDivTester2 extends App{ + val mantissaWidth = 52 + val a = BigInt(0xfffffff810000l) + val b = BigInt(0x0000000000FF0l) + val x = (a | (1l << mantissaWidth)) + val y = (b | (1l << mantissaWidth)) + val result = (x << mantissaWidth+2) / y + val remain = (x << mantissaWidth+2) % y + println(s"result=${result.toString(16)} remain=${remain.toString(16)}") + +}
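FpuDiv above is a radix-4 restoring divider: each cycle it compares the partial remainder against 1x, 2x and 3x the divisor (sub1/sub2/sub3), keeps the largest subtraction that does not go negative, and retires two quotient bits. A plain-Scala golden model of the same recurrence, matching the (x << mantissaWidth+2) / y reference checked by FpuDivTester, could look as follows; radix4DivModel is an illustrative name, not part of the source.

// a, b are raw mantissas; the implicit leading one is OR-ed in, as in the tester.
def radix4DivModel(a: BigInt, b: BigInt, width: Int): (BigInt, BigInt) = {
  val x = a | (BigInt(1) << width)
  val y = b | (BigInt(1) << width)
  val dividend = x << (width + 2)          // same pre-shift as the DUT
  val steps = width + 2                    // enough 2-bit steps to consume every
                                           // dividend bit (the DUT instead preloads
                                           // the dividend's top bits into `shifter`)
  var rem  = BigInt(0)
  var quot = BigInt(0)
  for (i <- (steps - 1) to 0 by -1) {
    rem = (rem << 2) | ((dividend >> (2 * i)) & 3)  // bring down two bits
    val digit = rem / y                    // 0..3, the hardware's sub1/sub2/sub3 pick
    rem -= digit * y
    quot = (quot << 2) | digit
  }
  (quot, rem)                              // == (dividend / y, dividend % y)
}

For example, radix4DivModel(0xC835, 0x4742, 16) returns exactly the result/remain pair the tester asserts for that vector.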
\ No newline at end of file diff --git a/VexRiscv/src/main/scala/vexriscv/ip/fpu/FpuSqrt.scala b/VexRiscv/src/main/scala/vexriscv/ip/fpu/FpuSqrt.scala new file mode 100644 index 0000000..0f80905 --- /dev/null +++ b/VexRiscv/src/main/scala/vexriscv/ip/fpu/FpuSqrt.scala @@ -0,0 +1,116 @@ +package vexriscv.ip.fpu + +import spinal.core._ +import spinal.lib._ +import spinal.lib.sim.{StreamDriver, StreamMonitor, StreamReadyRandomizer} + +import scala.collection.mutable +import scala.util.Random + +case class FpuSqrtCmd(mantissaWidth : Int) extends Bundle{ + val a = UInt(mantissaWidth+2 bits) +} + +case class FpuSqrtRsp(mantissaWidth : Int) extends Bundle{ + val result = UInt(mantissaWidth+1 bits) + val remain = UInt(mantissaWidth+5 bits) +} + +case class FpuSqrt(val mantissaWidth : Int) extends Component { + val io = new Bundle{ + val input = slave Stream(FpuSqrtCmd(mantissaWidth)) + val output = master Stream(FpuSqrtRsp(mantissaWidth)) + } + + val iterations = mantissaWidth+2 + val counter = Reg(UInt(log2Up(iterations ) bits)) + val busy = RegInit(False) clearWhen(io.output.fire) + val done = RegInit(False) setWhen(busy && counter === iterations-1) clearWhen(io.output.fire) + + val a = Reg(UInt(mantissaWidth+5 bits)) + val x = Reg(UInt(mantissaWidth bits)) + val q = Reg(UInt(mantissaWidth+1 bits)) + val t = a-(q @@ U"01") + + + io.output.valid := done + io.output.result := (q << 0).resized + io.output.remain := a + io.input.ready := !busy + + when(!done){ + counter := counter + 1 + val sel = CombInit(a) + when(!t.msb){ + sel := t.resized + } + q := (q @@ !t.msb).resized + a := (sel @@ x(widthOf(x)-2,2 bits)).resized + x := x |<< 2 + } + + when(!busy){ + q := 0 + a := io.input.a(widthOf(io.input.a)-2,2 bits).resized + x := (io.input.a).resized + counter := 0 + when(io.input.valid){ + busy := True + } + } +} + + +object FpuSqrtTester extends App{ + import spinal.core.sim._ + + for(w <- List(16)) { + val config = SimConfig + config.withFstWave + config.compile(new FpuSqrt(w)).doSim(seed=2){dut => + dut.clockDomain.forkStimulus(10) + + + val (cmdDriver, cmdQueue) = StreamDriver.queue(dut.io.input, dut.clockDomain) + val rspQueue = mutable.Queue[FpuSqrtRsp => Unit]() + StreamMonitor(dut.io.output, dut.clockDomain)( rspQueue.dequeue()(_)) + StreamReadyRandomizer(dut.io.output, dut.clockDomain) + + def test(a : Int): Unit ={ + cmdQueue +={p => + p.a #= a + } + rspQueue += {p => +// val x = (a * (1l << dut.mantissaWidth)).toLong +// val result = Math.sqrt(x).toLong/(1 << dut.mantissaWidth/2) +// val remain = a-x*x + val x = a.toDouble / (1 << dut.mantissaWidth) + val result = (Math.sqrt(x)*(1 << dut.mantissaWidth+1)).toLong + val filtred = result % (1 << dut.mantissaWidth+1) +// val remain = (a-(result*result)).toLong + assert(p.result.toLong == filtred, f"$a%x=${p.result.toLong}%x instead of $filtred%x") +// assert(p.remain.toLong == remain, f"$a%x=${p.remain.toLong}%x instead of $remain%x") + } + } + + val s = dut.mantissaWidth-16 + val f = (1 << dut.mantissaWidth)-1 +// test(121) + test(0x20000) + test(0x18000) +// test(0,0) +// test(0,f) +// test(f,0) +// test(f,f) + + for(i <- 0 until 10000){ + test(Random.nextInt(3 << dut.mantissaWidth) + (1 << dut.mantissaWidth)) + } + + waitUntil(rspQueue.isEmpty) + + dut.clockDomain.waitSampling(100) + + } + } +}
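FpuSqrt above implements the textbook radix-2 digit-recurrence square root: t = a - (q @@ U"01") tentatively subtracts the trial term, and the sign of t decides whether the next result bit is 1 (subtraction accepted) or 0 (remainder restored). The same recurrence in plain Scala, as a hedged reference model (sqrtModel is an illustrative name; resultBits plays the role of the hardware's iterations):

// For value < 4^resultBits this returns (q, rem) with q*q + rem == value,
// i.e. the integer square root and its exact remainder.
def sqrtModel(value: BigInt, resultBits: Int): (BigInt, BigInt) = {
  var rem = BigInt(0)
  var q   = BigInt(0)
  for (i <- (resultBits - 1) to 0 by -1) {
    rem = (rem << 2) | ((value >> (2 * i)) & 3)  // bring down two operand bits
    val trial = (q << 2) | 1                     // the hardware's (q @@ U"01")
    if (rem >= trial) {                          // t non-negative: accept the bit
      rem -= trial
      q = (q << 1) | 1
    } else {                                     // t negative: keep remainder, bit is 0
      q = q << 1
    }
  }
  (q, rem)
}

The hardware version interleaves exactly these steps with the Stream handshake, one result bit per cycle.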
\ No newline at end of file diff --git a/VexRiscv/src/main/scala/vexriscv/ip/fpu/Interface.scala b/VexRiscv/src/main/scala/vexriscv/ip/fpu/Interface.scala new file mode 100644 index 0000000..9338c35 --- /dev/null +++ b/VexRiscv/src/main/scala/vexriscv/ip/fpu/Interface.scala @@ -0,0 +1,186 @@ +package vexriscv.ip.fpu + +import spinal.core._ +import spinal.lib._ + + +object Fpu{ + + object Function{ + val MUL = 0 + val ADD = 1 + } + +} + + +case class FpuFloatDecoded() extends Bundle{ + val isNan = Bool() + val isNormal = Bool() + val isSubnormal = Bool() + val isZero = Bool() + val isInfinity = Bool() + val isQuiet = Bool() +} + +object FpuFloat{ + val ZERO = 0 + val INFINITY = 1 + val NAN = 2 + val NAN_CANONICAL_BIT = 2 +} + +case class FpuFloat(exponentSize: Int, + mantissaSize: Int) extends Bundle { + val mantissa = UInt(mantissaSize bits) + val exponent = UInt(exponentSize bits) + val sign = Bool() + val special = Bool() + + def withInvertSign : FpuFloat ={ + val ret = FpuFloat(exponentSize,mantissaSize) + ret.sign := !sign + ret.exponent := exponent + ret.mantissa := mantissa + ret + } + + def isNormal = !special + def isZero = special && exponent(1 downto 0) === FpuFloat.ZERO + def isInfinity = special && exponent(1 downto 0) === FpuFloat.INFINITY + def isNan = special && exponent(1 downto 0) === FpuFloat.NAN + def isQuiet = mantissa.msb + def isNanSignaling = special && exponent(1 downto 0) === FpuFloat.NAN && !isQuiet + def isCanonical = exponent(FpuFloat.NAN_CANONICAL_BIT) + + def setNormal = { special := False } + def setZero = { special := True; exponent(1 downto 0) := FpuFloat.ZERO } + def setInfinity = { special := True; exponent(1 downto 0) := FpuFloat.INFINITY } + def setNan = { special := True; exponent(1 downto 0) := FpuFloat.NAN; exponent(FpuFloat.NAN_CANONICAL_BIT) := False} + def setNanQuiet = { special := True; exponent(1 downto 0) := FpuFloat.NAN; exponent(FpuFloat.NAN_CANONICAL_BIT) := True; mantissa.msb := True; } + + def decode() = { + val ret = FpuFloatDecoded() + ret.isZero := isZero + ret.isNormal := isNormal + ret.isInfinity := isInfinity + ret.isNan := isNan + ret.isQuiet := mantissa.msb + ret + } + + def decodeIeee754() = { + val ret = FpuFloatDecoded() + val expZero = exponent === 0 + val expOne = exponent === exponent.maxValue + val manZero = mantissa === 0 + ret.isZero := expZero && manZero + ret.isSubnormal := expZero && !manZero + ret.isNormal := !expOne && !expZero + ret.isInfinity := expOne && manZero + ret.isNan := expOne && !manZero + ret.isQuiet := mantissa.msb + ret + } +} + +object FpuOpcode extends SpinalEnum{ + val LOAD, STORE, MUL, ADD, FMA, I2F, F2I, CMP, DIV, SQRT, MIN_MAX, SGNJ, FMV_X_W, FMV_W_X, FCLASS, FCVT_X_X = newElement() +} + +object FpuFormat extends SpinalEnum{ + val FLOAT, DOUBLE = newElement() +} + +object FpuRoundMode extends SpinalEnum(){ + val RNE, RTZ, RDN, RUP, RMM = newElement() + defaultEncoding = SpinalEnumEncoding("opt")( + RNE -> 0, + RTZ -> 1, + RDN -> 2, + RUP -> 3, + RMM -> 4 + ) +} +object FpuRoundModeInstr extends SpinalEnum(){ + val RNE, RTZ, RDN, RUP, RMM, DYN = newElement() + defaultEncoding = SpinalEnumEncoding("opt")( + RNE -> 0, + RTZ -> 1, + RDN -> 2, + RUP -> 3, + RMM -> 4, + DYN -> 7 + ) +} + + +case class FpuParameter( withDouble : Boolean, + asyncRegFile : Boolean = false, + mulWidthA : Int = 18, + mulWidthB : Int = 18, + schedulerM2sPipe : Boolean = false, + sim : Boolean = false, + withAdd : Boolean = true, + withMul : Boolean = true, + withDivSqrt : Boolean = false, + withDiv : Boolean = true, + 
withSqrt : Boolean = true, + withShortPipMisc : Boolean = true){ + + val internalMantissaSize = if(withDouble) 52 else 23 + val storeLoadType = HardType(Bits(if(withDouble) 64 bits else 32 bits)) + val internalExponentSize = (if(withDouble) 11 else 8) + 1 + val internalFloating = HardType(FpuFloat(exponentSize = internalExponentSize, mantissaSize = internalMantissaSize)) + val writeFloating = HardType(FpuFloat(exponentSize = internalExponentSize, mantissaSize = internalMantissaSize+1)) + + val rfAddress = HardType(UInt(5 bits)) + + val Opcode = FpuOpcode + val Format = FpuFormat + val argWidth = 2 + val Arg = HardType(Bits(2 bits)) +} + +case class FpuFlags() extends Bundle{ + val NX, UF, OF, DZ, NV = Bool() +} + +case class FpuCompletion() extends Bundle{ + val flags = FpuFlags() + val written = Bool() //Used for verification purposes +} + +case class FpuCmd(p : FpuParameter) extends Bundle{ + val opcode = p.Opcode() + val arg = Bits(2 bits) + val rs1, rs2, rs3 = p.rfAddress() + val rd = p.rfAddress() + val format = p.Format() + val roundMode = FpuRoundMode() +} + +case class FpuCommit(p : FpuParameter) extends Bundle{ + val opcode = FpuOpcode() + val rd = UInt(5 bits) + val write = Bool() + val value = p.storeLoadType() // IEEE 754 +} + +case class FpuRsp(p : FpuParameter) extends Bundle{ + val value = p.storeLoadType() // IEEE754 store || Integer + val NV, NX = Bool() +} + +case class FpuPort(p : FpuParameter) extends Bundle with IMasterSlave { + val cmd = Stream(FpuCmd(p)) + val commit = Stream(FpuCommit(p)) + val rsp = Stream(FpuRsp(p)) + val completion = Flow(FpuCompletion()) + + override def asMaster(): Unit = { + master(cmd, commit) + slave(rsp) + in(completion) + } +} |
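Interface.scala above only declares bundles; wiring them is the integrator's job. Below is a minimal, hypothetical sketch of a component that exposes an FpuPort as a master and statically issues one single-precision ADD. FpuPortDemo and its register indices are invented for illustration; the real handshake lives in VexRiscv's FpuPlugin.

import spinal.core._
import spinal.lib._
import vexriscv.ip.fpu._

// Hypothetical integration sketch: drive one FADD.S (f3 := f1 + f2)
// through a master FpuPort, with the other channels tied off.
class FpuPortDemo extends Component {
  val p = FpuParameter(withDouble = false)
  val io = new Bundle {
    val fpu = master(FpuPort(p))
  }

  io.fpu.cmd.valid     := True              // demo only: issue unconditionally
  io.fpu.cmd.opcode    := FpuOpcode.ADD
  io.fpu.cmd.arg       := 0
  io.fpu.cmd.rs1       := 1
  io.fpu.cmd.rs2       := 2
  io.fpu.cmd.rs3       := 0
  io.fpu.cmd.rd        := 3
  io.fpu.cmd.format    := FpuFormat.FLOAT
  io.fpu.cmd.roundMode := FpuRoundMode.RNE

  io.fpu.commit.valid := False              // no load/store commit traffic here
  io.fpu.commit.payload.assignDontCare()
  io.fpu.rsp.ready := True                  // always accept responses
}

object FpuPortDemo extends App {
  SpinalVerilog(new FpuPortDemo)
}

Note how asMaster() above makes cmd and commit outputs of the master, rsp a stream whose ready the master drives, and completion a plain input Flow.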