diff options
| author | Friedrich Beckmann <friedrich.beckmann@hs-augsburg.de> | 2022-07-25 17:55:39 +0200 |
|---|---|---|
| committer | Friedrich Beckmann <friedrich.beckmann@hs-augsburg.de> | 2022-07-25 17:55:39 +0200 |
| commit | 3fff6023602822531efdae30bc8ebf862967f1ef (patch) | |
| tree | 16028102b8d850f8ab3115d28a8539ca6bc5f51d /VexRiscv/src/main/scala/vexriscv/ip/fpu | |
Initial Commit
Diffstat (limited to 'VexRiscv/src/main/scala/vexriscv/ip/fpu')
| -rw-r--r-- | VexRiscv/src/main/scala/vexriscv/ip/fpu/FpuCore.scala | 1944 | ||||
| -rw-r--r-- | VexRiscv/src/main/scala/vexriscv/ip/fpu/FpuDiv.scala | 140 | ||||
| -rw-r--r-- | VexRiscv/src/main/scala/vexriscv/ip/fpu/FpuSqrt.scala | 116 | ||||
| -rw-r--r-- | VexRiscv/src/main/scala/vexriscv/ip/fpu/Interface.scala | 186 |
4 files changed, 2386 insertions, 0 deletions
diff --git a/VexRiscv/src/main/scala/vexriscv/ip/fpu/FpuCore.scala b/VexRiscv/src/main/scala/vexriscv/ip/fpu/FpuCore.scala new file mode 100644 index 0000000..657b2fb --- /dev/null +++ b/VexRiscv/src/main/scala/vexriscv/ip/fpu/FpuCore.scala @@ -0,0 +1,1944 @@ +package vexriscv.ip.fpu + +import spinal.core._ +import spinal.lib._ +import spinal.lib.eda.bench.{Bench, Rtl, XilinxStdTargets} +import spinal.lib.math.UnsignedDivider + +import scala.collection.mutable.ArrayBuffer + +object FpuDivSqrtIterationState extends SpinalEnum{ + val IDLE, YY, XYY, Y2_XYY, DIV, _15_XYY2, Y_15_XYY2, Y_15_XYY2_RESULT, SQRT = newElement() +} + + +case class FpuCore( portCount : Int, p : FpuParameter) extends Component{ + val io = new Bundle { + val port = Vec(slave(FpuPort(p)), portCount) + } + + val portCountWidth = log2Up(portCount) + val Source = HardType(UInt(portCountWidth bits)) + val exponentOne = (1 << p.internalExponentSize-1) - 1 + val exponentF32Subnormal = exponentOne-127 + val exponentF64Subnormal = exponentOne-1023 + val exponentF32Infinity = exponentOne+127+1 + val exponentF64Infinity = exponentOne+1023+1 + + + + def whenDouble(format : FpuFormat.C)(yes : => Unit)(no : => Unit): Unit ={ + if(p.withDouble) when(format === FpuFormat.DOUBLE) { yes } otherwise{ no } + if(!p.withDouble) no + } + + def muxDouble[T <: Data](format : FpuFormat.C)(yes : => T)(no : => T): T ={ + if(p.withDouble) ((format === FpuFormat.DOUBLE) ? { yes } | { no }) + else no + } + + case class RfReadInput() extends Bundle{ + val source = Source() + val opcode = p.Opcode() + val rs1, rs2, rs3 = p.rfAddress() + val rd = p.rfAddress() + val arg = p.Arg() + val roundMode = FpuRoundMode() + val format = p.withDouble generate FpuFormat() + } + + case class RfReadOutput() extends Bundle{ + val source = Source() + val opcode = p.Opcode() + val rs1, rs2, rs3 = p.internalFloating() + val rd = p.rfAddress() + val arg = p.Arg() + val roundMode = FpuRoundMode() + val format = p.withDouble generate FpuFormat() + val rs1Boxed, rs2Boxed = p.withDouble generate Bool() + } + + + case class LoadInput() extends Bundle{ + val source = Source() + val rd = p.rfAddress() + val i2f = Bool() + val arg = Bits(2 bits) + val roundMode = FpuRoundMode() + val format = p.withDouble generate FpuFormat() + } + + case class ShortPipInput() extends Bundle{ + val source = Source() + val opcode = p.Opcode() + val rs1, rs2 = p.internalFloating() + val rd = p.rfAddress() + val value = Bits(32 bits) + val arg = Bits(2 bits) + val roundMode = FpuRoundMode() + val format = p.withDouble generate FpuFormat() + val rs1Boxed, rs2Boxed = p.withDouble generate Bool() + } + + class MulInput() extends Bundle{ + val source = Source() + val rs1, rs2, rs3 = p.internalFloating() + val rd = p.rfAddress() + val add = Bool() + val divSqrt = Bool() + val msb1, msb2 = Bool() //allow usage of msb bits of mul + val roundMode = FpuRoundMode() + val format = p.withDouble generate FpuFormat() + } + + + case class DivSqrtInput() extends Bundle{ + val source = Source() + val rs1, rs2 = p.internalFloating() + val rd = p.rfAddress() + val div = Bool() + val roundMode = FpuRoundMode() + val format = p.withDouble generate FpuFormat() + } + + case class DivInput() extends Bundle{ + val source = Source() + val rs1, rs2 = p.internalFloating() + val rd = p.rfAddress() + val roundMode = FpuRoundMode() + val format = p.withDouble generate FpuFormat() + } + + + case class SqrtInput() extends Bundle{ + val source = Source() + val rs1 = p.internalFloating() + val rd = p.rfAddress() + val roundMode = FpuRoundMode() + val format = p.withDouble generate FpuFormat() + } + + + val addExtraBits = 2 + case class AddInput() extends Bundle{ + val source = Source() + val rs1, rs2 = FpuFloat(exponentSize = p.internalExponentSize, mantissaSize = p.internalMantissaSize+addExtraBits) + val rd = p.rfAddress() + val roundMode = FpuRoundMode() + val format = p.withDouble generate FpuFormat() + val needCommit = Bool() + } + + + class MergeInput() extends Bundle{ + val source = Source() + val rd = p.rfAddress() + val value = p.writeFloating() + val scrap = Bool() + val roundMode = FpuRoundMode() + val format = p.withDouble generate FpuFormat() + val NV = Bool() + val DZ = Bool() + } + + case class RoundOutput() extends Bundle{ + val source = Source() + val rd = p.rfAddress() + val value = p.internalFloating() + val format = p.withDouble generate FpuFormat() + val NV, NX, OF, UF, DZ = Bool() + val write = Bool() + } + + val rf = new Area{ + case class Entry() extends Bundle{ + val value = p.internalFloating() + val boxed = p.withDouble generate Bool() + } + val ram = Mem(Entry(), 32*portCount) + + val init = new Area{ + val counter = Reg(UInt(6 bits)) init(0) + val done = CombInit(counter.msb) + when(!done){ + counter := counter + 1 + } + def apply(port : Flow[MemWriteCmd[Bool]]) = { + port.valid := !done + port.address := counter.resized + port.data := False + port + } + } + + val scoreboards = Array.fill(portCount)(new Area{ + val target, hit = Mem(Bool, 32) // XOR + val writes = Mem(Bool, 32) + + val targetWrite = init(target.writePort) + val hitWrite = init(hit.writePort) + }) + } + + val commitFork = new Area{ + val load, commit = Vec(Stream(FpuCommit(p)), portCount) + for(i <- 0 until portCount){ + val fork = new StreamFork(FpuCommit(p), 2, synchronous = true) + fork.io.input << io.port(i).commit + fork.io.outputs(0) >> load(i) + fork.io.outputs(1).pipelined(m2s = false, s2m = true) >> commit(i) //Pipelining here is light, as it only use the flags of the payload + } + } + + class Tracker(width : Int) extends Area{ + val counter = Reg(UInt(width bits)) init(0) + val full = counter.andR + val notEmpty = counter.orR + val inc = False + val dec = False + counter := counter + U(inc) - U(dec) + } + + class CommitArea(source : Int) extends Area{ + val pending = new Tracker(4) + val add, mul, div, sqrt, short = new Tracker(4) + val input = commitFork.commit(source).haltWhen(List(add, mul, div, sqrt, short).map(_.full).orR || !pending.notEmpty).toFlow + + when(input.fire){ + add.inc setWhen(List(FpuOpcode.ADD).map(input.opcode === _).orR) + mul.inc setWhen(List(FpuOpcode.MUL, FpuOpcode.FMA).map(input.opcode === _).orR) + div.inc setWhen(List(FpuOpcode.DIV).map(input.opcode === _).orR) + sqrt.inc setWhen(List(FpuOpcode.SQRT).map(input.opcode === _).orR) + short.inc setWhen(List(FpuOpcode.SGNJ, FpuOpcode.MIN_MAX, FpuOpcode.FCVT_X_X).map(input.opcode === _).orR) + rf.scoreboards(source).writes(input.rd) := input.write + pending.dec := True + } + } + + val commitLogic = for(source <- 0 until portCount) yield new CommitArea(source) + + def commitConsume(what : CommitArea => Tracker, source : UInt, fire : Bool) : Bool = { + for(i <- 0 until portCount) what(commitLogic(i)).dec setWhen(fire && source === i) + commitLogic.map(what(_).notEmpty).read(source) + } + + + val scheduler = for(portId <- 0 until portCount; + scoreboard = rf.scoreboards(portId)) yield new Area{ + val input = io.port(portId).cmd.pipelined(s2m = true) + val useRs1, useRs2, useRs3, useRd = False + switch(input.opcode){ + is(p.Opcode.LOAD) { useRd := True } + is(p.Opcode.STORE) { useRs2 := True } + is(p.Opcode.ADD) { useRd := True; useRs1 := True; useRs2 := True } + is(p.Opcode.MUL) { useRd := True; useRs1 := True; useRs2 := True } + is(p.Opcode.DIV) { useRd := True; useRs1 := True; useRs2 := True } + is(p.Opcode.SQRT) { useRd := True; useRs1 := True } + is(p.Opcode.FMA) { useRd := True; useRs1 := True; useRs2 := True; useRs3 := True } + is(p.Opcode.I2F) { useRd := True } + is(p.Opcode.F2I) { useRs1 := True } + is(p.Opcode.MIN_MAX) { useRd := True; useRs1 := True; useRs2 := True } + is(p.Opcode.CMP) { useRs1 := True; useRs2 := True } + is(p.Opcode.SGNJ) { useRd := True; useRs1 := True; useRs2 := True } + is(p.Opcode.FMV_X_W) { useRs1 := True } + is(p.Opcode.FMV_W_X) { useRd := True } + is(p.Opcode.FCLASS ) { useRs1 := True } + is(p.Opcode.FCVT_X_X ) { useRd := True; useRs1 := True } + } + + val uses = List(useRs1, useRs2, useRs3, useRd) + val regs = List(input.rs1, input.rs2, input.rs3, input.rd) + val rfHits = regs.map(scoreboard.hit.readAsync(_)) + val rfTargets = regs.map(scoreboard.target.readAsync(_)) + val rfBusy = (rfHits, rfTargets).zipped.map(_ ^ _) + + val hits = (0 to 3).map(id => uses(id) && rfBusy(id)) + val hazard = hits.orR || !rf.init.done || commitLogic(portId).pending.full + val output = input.haltWhen(hazard) + when(input.opcode === p.Opcode.STORE){ + output.rs1 := input.rs2 //Datapath optimisation to unify rs source in the store pipeline + } + when(input.valid && rf.init.done){ + scoreboard.targetWrite.address := input.rd + scoreboard.targetWrite.data := !rfTargets.last + } + when(output.fire && useRd){ + scoreboard.targetWrite.valid := True + commitLogic(portId).pending.inc := True + } + } + + + val cmdArbiter = new Area{ + val arbiter = StreamArbiterFactory.noLock.roundRobin.build(FpuCmd(p), portCount) + arbiter.io.inputs <> Vec(scheduler.map(_.output.pipelined(m2s = p.schedulerM2sPipe))) + + val output = arbiter.io.output.swapPayload(RfReadInput()) + output.source := arbiter.io.chosen + output.payload.assignSomeByName(arbiter.io.output.payload) + } + + val read = new Area{ + val s0 = cmdArbiter.output.pipelined() + val s1 = s0.m2sPipe() + val output = s1.swapPayload(RfReadOutput()) + val rs = if(p.asyncRegFile){ + List(s1.rs1, s1.rs2, s1.rs3).map(a => rf.ram.readAsync(s1.source @@ a)) + } else { + List(s0.rs1, s0.rs2, s0.rs3).map(a => rf.ram.readSync(s0.source @@ a, enable = !output.isStall)) + } + output.source := s1.source + output.opcode := s1.opcode + output.arg := s1.arg + output.roundMode := s1.roundMode + output.rd := s1.rd + output.rs1 := rs(0).value + output.rs2 := rs(1).value + output.rs3 := rs(2).value + if(p.withDouble){ + output.rs1Boxed := rs(0).boxed + output.rs2Boxed := rs(1).boxed + output.format := s1.format + val store = s1.opcode === FpuOpcode.STORE ||s1.opcode === FpuOpcode.FMV_X_W + val sgnjBypass = s1.opcode === FpuOpcode.SGNJ && s1.format === FpuFormat.DOUBLE + when(!sgnjBypass) { + when(store) { //Pass through + output.format := rs(0).boxed ? FpuFormat.FLOAT | FpuFormat.DOUBLE + } elsewhen (s1.format === FpuFormat.FLOAT =/= rs(0).boxed) { + output.rs1.setNanQuiet + output.rs1.sign := False + } + } + when(s1.format === FpuFormat.FLOAT =/= rs(1).boxed) { + output.rs2.setNanQuiet + output.rs2.sign := False + } + when(s1.format === FpuFormat.FLOAT =/= rs(2).boxed) { + output.rs3.setNanQuiet + } + } + } + + val decode = new Area{ + val input = read.output/*.s2mPipe()*/.combStage() + input.ready := False + + val loadHit = List(FpuOpcode.LOAD, FpuOpcode.FMV_W_X, FpuOpcode.I2F).map(input.opcode === _).orR + val load = Stream(LoadInput()) + load.valid := input.valid && loadHit + input.ready setWhen(loadHit && load.ready) + load.payload.assignSomeByName(input.payload) + load.i2f := input.opcode === FpuOpcode.I2F + + val shortPipHit = List(FpuOpcode.STORE, FpuOpcode.F2I, FpuOpcode.CMP, FpuOpcode.MIN_MAX, FpuOpcode.SGNJ, FpuOpcode.FMV_X_W, FpuOpcode.FCLASS, FpuOpcode.FCVT_X_X).map(input.opcode === _).orR + val shortPip = Stream(ShortPipInput()) + input.ready setWhen(shortPipHit && shortPip.ready) + shortPip.valid := input.valid && shortPipHit + shortPip.payload.assignSomeByName(input.payload) + + val divSqrtHit = input.opcode === p.Opcode.DIV || input.opcode === p.Opcode.SQRT + val divSqrt = Stream(DivSqrtInput()) + if(p.withDivSqrt) { + input.ready setWhen (divSqrtHit && divSqrt.ready) + divSqrt.valid := input.valid && divSqrtHit + divSqrt.payload.assignSomeByName(input.payload) + divSqrt.div := input.opcode === p.Opcode.DIV + } + + val divHit = input.opcode === p.Opcode.DIV + val div = Stream(DivInput()) + if(p.withDiv) { + input.ready setWhen (divHit && div.ready) + div.valid := input.valid && divHit + div.payload.assignSomeByName(input.payload) + } + + val sqrtHit = input.opcode === p.Opcode.SQRT + val sqrt = Stream(SqrtInput()) + if(p.withSqrt) { + input.ready setWhen (sqrtHit && sqrt.ready) + sqrt.valid := input.valid && sqrtHit + sqrt.payload.assignSomeByName(input.payload) + } + + + val fmaHit = input.opcode === p.Opcode.FMA + val mulHit = input.opcode === p.Opcode.MUL || fmaHit + val mul = Stream(new MulInput()) + val divSqrtToMul = Stream(new MulInput()) + if(!p.withDivSqrt){ + divSqrtToMul.valid := False + divSqrtToMul.payload.assignDontCare() + } + + if(p.withMul) { + input.ready setWhen (mulHit && mul.ready && !divSqrtToMul.valid) + mul.valid := input.valid && mulHit || divSqrtToMul.valid + + divSqrtToMul.ready := mul.ready + mul.payload := divSqrtToMul.payload + when(!divSqrtToMul.valid) { + mul.payload.assignSomeByName(input.payload) + mul.add := fmaHit + mul.divSqrt := False + mul.msb1 := True + mul.msb2 := True + mul.rs2.sign.allowOverride(); + mul.rs2.sign := input.rs2.sign ^ input.arg(0) + mul.rs3.sign.allowOverride(); + mul.rs3.sign := input.rs3.sign ^ input.arg(1) + } + } + + val addHit = input.opcode === p.Opcode.ADD + val add = Stream(AddInput()) + val mulToAdd = Stream(AddInput()) + + + if(p.withAdd) { + input.ready setWhen (addHit && add.ready && !mulToAdd.valid) + add.valid := input.valid && addHit || mulToAdd.valid + + mulToAdd.ready := add.ready + add.payload := mulToAdd.payload + when(!mulToAdd.valid) { + add.source := input.source + add.rd := input.rd + add.roundMode := input.roundMode + if(p.withDouble) add.format := input.format + add.needCommit := True + add.rs1.special := input.rs1.special + add.rs2.special := input.rs2.special + add.rs1.exponent := input.rs1.exponent + add.rs2.exponent := input.rs2.exponent + add.rs1.sign := input.rs1.sign + add.rs2.sign := input.rs2.sign ^ input.arg(0) + add.rs1.mantissa := input.rs1.mantissa << addExtraBits + add.rs2.mantissa := input.rs2.mantissa << addExtraBits + } + } + } + + val load = new Area{ + + case class S0() extends Bundle{ + val source = Source() + val rd = p.rfAddress() + val value = p.storeLoadType() + val i2f = Bool() + val arg = Bits(2 bits) + val roundMode = FpuRoundMode() + val format = p.withDouble generate FpuFormat() + } + + val s0 = new Area{ + val input = decode.load.pipelined(m2s = true, s2m = true).stage() + val filtred = commitFork.load.map(port => port.takeWhen(List(FpuOpcode.LOAD, FpuOpcode.FMV_W_X, FpuOpcode.I2F).map(_ === port.opcode).orR)) + def feed = filtred(input.source) + val hazard = !feed.valid + + + val output = input.haltWhen(hazard).swapPayload(S0()) + filtred.foreach(_.ready := False) + feed.ready := input.valid && output.ready + output.source := input.source + output.rd := input.rd + output.value := feed.value + output.i2f := input.i2f + output.arg := input.arg + output.roundMode := input.roundMode + if(p.withDouble) { + output.format := input.format + when(!input.i2f && input.format === FpuFormat.DOUBLE && output.value(63 downto 32).andR){ //Detect boxing + output.format := FpuFormat.FLOAT + } + } + } + + val s1 = new Area{ + val input = s0.output.stage() + val busy = False + + val f32 = new Area{ + val mantissa = input.value(0, 23 bits).asUInt + val exponent = input.value(23, 8 bits).asUInt + val sign = input.value(31) + } + val f64 = p.withDouble generate new Area{ + val mantissa = input.value(0, 52 bits).asUInt + val exponent = input.value(52, 11 bits).asUInt + val sign = input.value(63) + } + + val recodedExpOffset = UInt(p.internalExponentSize bits) + val passThroughFloat = p.internalFloating() + passThroughFloat.special := False + + whenDouble(input.format){ + passThroughFloat.sign := f64.sign + passThroughFloat.exponent := f64.exponent.resized + passThroughFloat.mantissa := f64.mantissa + recodedExpOffset := exponentF64Subnormal + } { + passThroughFloat.sign := f32.sign + passThroughFloat.exponent := f32.exponent.resized + passThroughFloat.mantissa := f32.mantissa << (if (p.withDouble) 29 else 0) + recodedExpOffset := exponentF32Subnormal + } + + + val manZero = passThroughFloat.mantissa === 0 + val expZero = passThroughFloat.exponent === 0 + val expOne = passThroughFloat.exponent(7 downto 0).andR + if(p.withDouble) { + expZero.clearWhen(input.format === FpuFormat.DOUBLE && input.value(62 downto 60) =/= 0) + expOne.clearWhen(input.format === FpuFormat.DOUBLE && input.value(62 downto 60) =/= 7) + } + + val isZero = expZero && manZero + val isSubnormal = expZero && !manZero + val isInfinity = expOne && manZero + val isNan = expOne && !manZero + + + val fsm = new Area{ + val done, boot, patched = Reg(Bool()) + val ohInputWidth = 32 max p.internalMantissaSize + val ohInput = Bits(ohInputWidth bits).assignDontCare() + when(!input.i2f) { + if(!p.withDouble) ohInput := input.value(0, 23 bits) << 9 + if( p.withDouble) ohInput := passThroughFloat.mantissa.asBits + } otherwise { + ohInput(ohInputWidth-32-1 downto 0) := 0 + ohInput(ohInputWidth-32, 32 bits) := input.value(31 downto 0) + } + + val i2fZero = Reg(Bool) + + val shift = new Area{ + val by = Reg(UInt(log2Up(ohInputWidth) bits)) + val input = UInt(ohInputWidth bits).assignDontCare() + var logic = input + for(i <- by.range){ + logic \= by(i) ? (logic |<< (BigInt(1) << i)) | logic + } + val output = RegNextWhen(logic, !done) + } + shift.input := (ohInput.asUInt |<< 1).resized + + when(input.valid && (input.i2f || isSubnormal) && !done){ + busy := True + when(boot){ + when(input.i2f && !patched && input.value(31) && input.arg(0)){ + input.value.getDrivingReg(0, 32 bits) := B(input.value.asUInt.twoComplement(True).resize(32 bits)) + patched := True + } otherwise { + shift.by := OHToUInt(OHMasking.first((ohInput).reversed)) + boot := False + i2fZero := input.value(31 downto 0) === 0 + } + } otherwise { + done := True + } + } + + val expOffset = (UInt(p.internalExponentSize bits)) + expOffset := 0 + when(isSubnormal){ + expOffset := shift.by.resized + } + + when(!input.isStall){ + done := False + boot := True + patched := False + } + } + + + val i2fSign = fsm.patched + val (i2fHigh, i2fLow) = fsm.shift.output.splitAt(if(p.withDouble) 0 else widthOf(input.value)-24) + val scrap = i2fLow =/= 0 + + val recoded = p.internalFloating() + recoded.mantissa := passThroughFloat.mantissa + recoded.exponent := (passThroughFloat.exponent -^ fsm.expOffset + recodedExpOffset).resized + recoded.sign := passThroughFloat.sign + recoded.setNormal + when(isZero){recoded.setZero} + when(isInfinity){recoded.setInfinity} + when(isNan){recoded.setNan} + + val output = input.haltWhen(busy).swapPayload(new MergeInput()) + output.source := input.source + output.roundMode := input.roundMode + if(p.withDouble) { + output.format := input.format + } + output.rd := input.rd + output.value.sign := recoded.sign + output.value.exponent := recoded.exponent + output.value.mantissa := recoded.mantissa @@ U"0" + output.value.special := recoded.special + output.scrap := False + output.NV := False + output.DZ := False + when(input.i2f){ + output.value.sign := i2fSign + output.value.exponent := (U(exponentOne+31) - fsm.shift.by).resized + output.value.setNormal + output.scrap := scrap + when(fsm.i2fZero) { output.value.setZero } + } + + when(input.i2f || isSubnormal){ + output.value.mantissa := U(i2fHigh) @@ (if(p.withDouble) U"0" else U"") + } + } + + } + + val shortPip = new Area{ + val input = decode.shortPip.stage() + + val toFpuRf = List(FpuOpcode.MIN_MAX, FpuOpcode.SGNJ, FpuOpcode.FCVT_X_X).map(input.opcode === _).orR + val rfOutput = Stream(new MergeInput()) + + val isCommited = commitConsume(_.short, input.source, input.fire && toFpuRf) + val output = rfOutput.haltWhen(!isCommited) + + val result = p.storeLoadType().assignDontCare() + + val halt = False + val recodedResult = p.storeLoadType() + val f32 = new Area{ + val exp = (input.rs1.exponent - (exponentOne-127)).resize(8 bits) + val man = CombInit(input.rs1.mantissa(if(p.withDouble) 51 downto 29 else 22 downto 0)) + } + val f64 = p.withDouble generate new Area{ + val exp = (input.rs1.exponent - (exponentOne-1023)).resize(11 bits) + val man = CombInit(input.rs1.mantissa) + } + + whenDouble(input.format){ + recodedResult := input.rs1.sign ## f64.exp ## f64.man + } { + recodedResult := (if(p.withDouble) B"xFFFFFFFF" else B"") ## input.rs1.sign ## f32.exp ## f32.man + } + + val expSubnormalThreshold = muxDouble[UInt](input.format)(exponentF64Subnormal)(exponentF32Subnormal) + val expInSubnormalRange = input.rs1.exponent <= expSubnormalThreshold + val isSubnormal = !input.rs1.special && expInSubnormalRange + val isNormal = !input.rs1.special && !expInSubnormalRange + val fsm = new Area{ + val f2iShift = input.rs1.exponent - U(exponentOne) + val isF2i = input.opcode === FpuOpcode.F2I + val needRecoding = List(FpuOpcode.FMV_X_W, FpuOpcode.STORE).map(_ === input.opcode).orR && isSubnormal + val done, boot = Reg(Bool()) + val isZero = input.rs1.isZero// || input.rs1.exponent < exponentOne-1 + + val shift = new Area{ + val by = Reg(UInt(log2Up(p.internalMantissaSize+1 max 33) bits)) + val input = UInt(p.internalMantissaSize+1 max 33 bits).assignDontCare() + var logic = input + val scrap = Reg(Bool) + for(i <- by.range.reverse){ + scrap setWhen(by(i) && logic(0, 1 << i bits) =/= 0) + logic \= by(i) ? (logic |>> (BigInt(1) << i)) | logic + } + when(boot){ + scrap := False + } + val output = RegNextWhen(logic, !done) + } + + shift.input := (U(!isZero) @@ input.rs1.mantissa) << (if(p.withDouble) 0 else 9) + + val formatShiftOffset = muxDouble[UInt](input.format)(exponentOne-1023+1)(exponentOne - (if(p.withDouble) (127+34) else (127-10))) + when(input.valid && (needRecoding || isF2i) && !done){ + halt := True + when(boot){ + when(isF2i){ + shift.by := ((U(exponentOne + 31) - input.rs1.exponent).min(U(33)) + (if(p.withDouble) 20 else 0)).resized //TODO merge + } otherwise { + shift.by := (formatShiftOffset - input.rs1.exponent).resized + } + boot := False + } otherwise { + done := True + } + } + + when(!input.isStall){ + done := False + boot := True + } + } + + val mantissaForced = False + val exponentForced = False + val mantissaForcedValue = Bool().assignDontCare() + val exponentForcedValue = Bool().assignDontCare() + val cononicalForced = False + + + when(input.rs1.special){ + switch(input.rs1.exponent(1 downto 0)){ + is(FpuFloat.ZERO){ + mantissaForced := True + exponentForced := True + mantissaForcedValue := False + exponentForcedValue := False + } + is(FpuFloat.INFINITY){ + mantissaForced := True + exponentForced := True + mantissaForcedValue := False + exponentForcedValue := True + } + is(FpuFloat.NAN){ + exponentForced := True + exponentForcedValue := True + when(input.rs1.isCanonical){ + cononicalForced := True + mantissaForced := True + mantissaForcedValue := False + } + } + } + } + + + + when(isSubnormal){ + exponentForced := True + exponentForcedValue := False + recodedResult(0,23 bits) := fsm.shift.output(22 downto 0).asBits + whenDouble(input.format){ + recodedResult(51 downto 23) := fsm.shift.output(51 downto 23).asBits + }{} + } + when(mantissaForced){ + recodedResult(0,23 bits) := (default -> mantissaForcedValue) + whenDouble(input.format){ + recodedResult(23, 52-23 bits) := (default -> mantissaForcedValue) + }{} + } + when(exponentForced){ + whenDouble(input.format){ + recodedResult(52, 11 bits) := (default -> exponentForcedValue) + } { + recodedResult(23, 8 bits) := (default -> exponentForcedValue) + } + } + when(cononicalForced){ + whenDouble(input.format){ + recodedResult(63) := False + recodedResult(51) := True + } { + recodedResult(31) := False + recodedResult(22) := True + } + } + + val rspNv = False + val rspNx = False + + val f2i = new Area{ //Will not work for 64 bits float max value rounding + val unsigned = fsm.shift.output(32 downto 0) >> 1 + val resign = input.arg(0) && input.rs1.sign + val round = fsm.shift.output(0) ## fsm.shift.scrap + val increment = input.roundMode.mux( + FpuRoundMode.RNE -> (round(1) && (round(0) || unsigned(0))), + FpuRoundMode.RTZ -> False, + FpuRoundMode.RDN -> (round =/= 0 && input.rs1.sign), + FpuRoundMode.RUP -> (round =/= 0 && !input.rs1.sign), + FpuRoundMode.RMM -> (round(1)) + ) + val result = (Mux(resign, ~unsigned, unsigned) + (resign ^ increment).asUInt) + val overflow = (input.rs1.exponent > (input.arg(0) ? U(exponentOne+30) | U(exponentOne+31)) || input.rs1.isInfinity) && !input.rs1.sign || input.rs1.isNan + val underflow = (input.rs1.exponent > U(exponentOne+31) || input.arg(0) && unsigned.msb && (unsigned(30 downto 0) =/= 0 || increment) || !input.arg(0) && (unsigned =/= 0 || increment) || input.rs1.isInfinity) && input.rs1.sign + val isZero = input.rs1.isZero + if(p.withDouble){ + overflow setWhen(!input.rs1.sign && increment && unsigned(30 downto 0).andR && (input.arg(0) || unsigned(31))) + } + when(isZero){ + result := 0 + } elsewhen(underflow || overflow) { + val low = overflow + val high = input.arg(0) ^ overflow + result := (31 -> high, default -> low) + rspNv := input.valid && input.opcode === FpuOpcode.F2I && fsm.done && !isZero + } otherwise { + rspNx := input.valid && input.opcode === FpuOpcode.F2I && fsm.done && round =/= 0 + } + } + + val bothZero = input.rs1.isZero && input.rs2.isZero + val rs1Equal = input.rs1 === input.rs2 + val rs1AbsSmaller = (input.rs1.exponent @@ input.rs1.mantissa) < (input.rs2.exponent @@ input.rs2.mantissa) + rs1AbsSmaller.setWhen(input.rs2.isInfinity) + rs1AbsSmaller.setWhen(input.rs1.isZero) + rs1AbsSmaller.clearWhen(input.rs2.isZero) + rs1AbsSmaller.clearWhen(input.rs1.isInfinity) + rs1Equal setWhen(input.rs1.sign === input.rs2.sign && input.rs1.isInfinity && input.rs2.isInfinity) + val rs1Smaller = (input.rs1.sign ## input.rs2.sign).mux( + 0 -> rs1AbsSmaller, + 1 -> False, + 2 -> True, + 3 -> (!rs1AbsSmaller && !rs1Equal) + ) + + + val minMaxSelectRs2 = !(((rs1Smaller ^ input.arg(0)) && !input.rs1.isNan || input.rs2.isNan)) + val minMaxSelectNanQuiet = input.rs1.isNan && input.rs2.isNan + val cmpResult = B(rs1Smaller && !bothZero && !input.arg(1) || (rs1Equal || bothZero) && !input.arg(0)) + when(input.rs1.isNan || input.rs2.isNan) { cmpResult := 0 } + val sgnjRs1Sign = CombInit(input.rs1.sign) + val sgnjRs2Sign = CombInit(input.rs2.sign) + if(p.withDouble){ + sgnjRs2Sign setWhen(input.rs2Boxed && input.format === FpuFormat.DOUBLE) + } + val sgnjResult = (sgnjRs1Sign && input.arg(1)) ^ sgnjRs2Sign ^ input.arg(0) + val fclassResult = B(0, 32 bits) + val decoded = input.rs1.decode() + fclassResult(0) := input.rs1.sign && decoded.isInfinity + fclassResult(1) := input.rs1.sign && isNormal + fclassResult(2) := input.rs1.sign && isSubnormal + fclassResult(3) := input.rs1.sign && decoded.isZero + fclassResult(4) := !input.rs1.sign && decoded.isZero + fclassResult(5) := !input.rs1.sign && isSubnormal + fclassResult(6) := !input.rs1.sign && isNormal + fclassResult(7) := !input.rs1.sign && decoded.isInfinity + fclassResult(8) := decoded.isNan && !decoded.isQuiet + fclassResult(9) := decoded.isNan && decoded.isQuiet + + + switch(input.opcode){ + is(FpuOpcode.STORE) { result := recodedResult } + is(FpuOpcode.FMV_X_W) { result := recodedResult } + is(FpuOpcode.F2I) { result(31 downto 0) := f2i.result.asBits } + is(FpuOpcode.CMP) { result(31 downto 0) := cmpResult.resized } + is(FpuOpcode.FCLASS) { result(31 downto 0) := fclassResult.resized } + } + + + rfOutput.valid := input.valid && toFpuRf && !halt + rfOutput.source := input.source + rfOutput.rd := input.rd + rfOutput.roundMode := input.roundMode + if(p.withDouble) rfOutput.format := input.format + rfOutput.scrap := False + rfOutput.value.sign := input.rs1.sign + rfOutput.value.exponent := input.rs1.exponent + rfOutput.value.mantissa := input.rs1.mantissa @@ U"0" + rfOutput.value.special := input.rs1.special + + switch(input.opcode){ + is(FpuOpcode.MIN_MAX){ + when(minMaxSelectRs2) { + rfOutput.value.sign := input.rs2.sign + rfOutput.value.exponent := input.rs2.exponent + rfOutput.value.mantissa := input.rs2.mantissa @@ U"0" + rfOutput.value.special := input.rs2.special + } + when(minMaxSelectNanQuiet){ + rfOutput.value.setNanQuiet + } + } + is(FpuOpcode.SGNJ){ + when(!input.rs1.isNan) { + rfOutput.value.sign := sgnjResult + } + if(p.withDouble) when(input.rs1Boxed && input.format === FpuFormat.DOUBLE){ + rfOutput.value.sign := input.rs1.sign + rfOutput.format := FpuFormat.FLOAT + } + } + if(p.withDouble) is(FpuOpcode.FCVT_X_X){ + rfOutput.format := ((input.format === FpuFormat.FLOAT) ? FpuFormat.DOUBLE | FpuFormat.FLOAT) + when(input.rs1.isNan){ + rfOutput.value.setNanQuiet + } + } + } + + val signalQuiet = input.opcode === FpuOpcode.CMP && input.arg =/= 2 + val rs1Nan = input.rs1.isNan + val rs2Nan = input.rs2.isNan + val rs1NanNv = input.rs1.isNan && (!input.rs1.isQuiet || signalQuiet) + val rs2NanNv = input.rs2.isNan && (!input.rs2.isQuiet || signalQuiet) + val NV = List(FpuOpcode.CMP, FpuOpcode.MIN_MAX, FpuOpcode.FCVT_X_X).map(input.opcode === _).orR && rs1NanNv || + List(FpuOpcode.CMP, FpuOpcode.MIN_MAX).map(input.opcode === _).orR && rs2NanNv + rspNv setWhen(NV) + + val rspStreams = Vec(Stream(FpuRsp(p)), portCount) + input.ready := !halt && (toFpuRf ? rfOutput.ready | rspStreams.map(_.ready).read(input.source)) + for(i <- 0 until portCount){ + def rsp = rspStreams(i) + rsp.valid := input.valid && input.source === i && !toFpuRf && !halt + rsp.value := result + rsp.NV := rspNv + rsp.NX := rspNx + io.port(i).rsp << rsp.stage() + } + + + rfOutput.NV := NV + rfOutput.DZ := False + } + + val mul = p.withMul generate new Area{ + val inWidthA = p.internalMantissaSize+1 + val inWidthB = p.internalMantissaSize+1 + val outWidth = p.internalMantissaSize*2+2 + + case class MulSplit(offsetA : Int, offsetB : Int, widthA : Int, widthB : Int, id : Int){ + val offsetC = offsetA+offsetB + val widthC = widthA + widthB + val endC = offsetC+widthC + } + val splitsUnordered = for(offsetA <- 0 until inWidthA by p.mulWidthA; + offsetB <- 0 until inWidthB by p.mulWidthB; + widthA = (inWidthA - offsetA) min p.mulWidthA; + widthB = (inWidthB - offsetB) min p.mulWidthB) yield { + MulSplit(offsetA, offsetB, widthA, widthB, -1) + } + val splits = splitsUnordered.sortWith(_.endC < _.endC).zipWithIndex.map(e => e._1.copy(id=e._2)) + + class MathWithExp extends MulInput{ + val exp = UInt(p.internalExponentSize+1 bits) + } + val preMul = new Area{ + val input = decode.mul.stage() + val output = input.swapPayload(new MathWithExp()) + output.payload.assignSomeByName(input.payload) + output.exp := input.rs1.exponent +^ input.rs2.exponent + } + class MathWithMul extends MathWithExp{ + val muls = Vec(splits.map(e => UInt(e.widthA + e.widthB bits))) + } + val mul = new Area{ + val input = preMul.output.stage() + val output = input.swapPayload(new MathWithMul()) + val mulA = U(input.msb1) @@ input.rs1.mantissa + val mulB = U(input.msb2) @@ input.rs2.mantissa + output.payload.assignSomeByName(input.payload) + splits.foreach(e => output.muls(e.id) := mulA(e.offsetA, e.widthA bits) * mulB(e.offsetB, e.widthB bits)) + } + + val sumSplitAt = splits.size/2//splits.filter(e => e.endC <= p.internalMantissaSize).size + + class Sum1Output extends MathWithExp{ + val muls2 = Vec(splits.drop(sumSplitAt).map(e => UInt(e.widthA + e.widthB bits))) + val mulC2 = UInt(p.internalMantissaSize*2+2 bits) + } + class Sum2Output extends MathWithExp{ + val mulC = UInt(p.internalMantissaSize*2+2 bits) + } + + val sum1 = new Area { + val input = mul.output.stage() + val sum = splits.take(sumSplitAt).map(e => (input.muls(e.id) << e.offsetC).resize(outWidth)).reduceBalancedTree(_ + _) + + val output = input.swapPayload(new Sum1Output()) + output.payload.assignSomeByName(input.payload) + output.mulC2 := sum.resized + output.muls2 := Vec(input.muls.drop(sumSplitAt)) + } + + val sum2 = new Area { + val input = sum1.output.stage() + val sum = input.mulC2 + splits.drop(sumSplitAt).map(e => (input.muls2(e.id-sumSplitAt) << e.offsetC).resize(outWidth)).reduceBalancedTree(_ + _) + + val isCommited = commitConsume(_.mul, input.source, input.fire) + val output = input.haltWhen(!isCommited).swapPayload(new Sum2Output()) + output.payload.assignSomeByName(input.payload) + output.mulC := sum + } + + val norm = new Area{ + val input = sum2.output.stage() + val (mulHigh, mulLow) = input.mulC.splitAt(p.internalMantissaSize-1) + val scrap = mulLow =/= 0 + val needShift = mulHigh.msb + val exp = input.exp + U(needShift) + val man = needShift ? mulHigh(1, p.internalMantissaSize+1 bits) | mulHigh(0, p.internalMantissaSize+1 bits) + scrap setWhen(needShift && mulHigh(0)) + val forceZero = input.rs1.isZero || input.rs2.isZero + val underflowThreshold = muxDouble[UInt](input.format)(exponentOne + exponentOne - 1023 - 53) (exponentOne + exponentOne - 127 - 24) + val underflowExp = muxDouble[UInt](input.format)(exponentOne - 1023 - 54) (exponentOne - 127 - 25) + val forceUnderflow = exp < underflowThreshold + val forceOverflow = input.rs1.isInfinity || input.rs2.isInfinity + val infinitynan = ((input.rs1.isInfinity || input.rs2.isInfinity) && (input.rs1.isZero || input.rs2.isZero)) + val forceNan = input.rs1.isNan || input.rs2.isNan || infinitynan + + val output = p.writeFloating() + output.sign := input.rs1.sign ^ input.rs2.sign + output.exponent := (exp - exponentOne).resized + output.mantissa := man.asUInt + output.setNormal + val NV = False + + when(exp(exp.getWidth-3, 3 bits) >= 5) { output.exponent(p.internalExponentSize-2, 2 bits) := 3 } + + when(forceNan) { + output.setNanQuiet + NV setWhen(infinitynan || input.rs1.isNanSignaling || input.rs2.isNanSignaling) + } elsewhen(forceOverflow) { + output.setInfinity + } elsewhen(forceZero) { + output.setZero + } elsewhen(forceUnderflow) { + output.exponent := underflowExp.resized + } + } + + val result = new Area { + def input = norm.input + def NV = norm.NV + + val notMul = new Area { + val output = Flow(UInt(p.internalMantissaSize + 1 bits)) + output.valid := input.valid && input.divSqrt + output.payload := input.mulC(p.internalMantissaSize, p.internalMantissaSize + 1 bits) + } + + val output = Stream(new MergeInput()) + output.valid := input.valid && !input.add && !input.divSqrt + output.source := input.source + output.rd := input.rd + if (p.withDouble) output.format := input.format + output.roundMode := input.roundMode + output.scrap := norm.scrap + output.value := norm.output + output.NV := NV + output.DZ := False + + val mulToAdd = Stream(AddInput()) + decode.mulToAdd << mulToAdd.stage() + + mulToAdd.valid := input.valid && input.add + mulToAdd.source := input.source + mulToAdd.rs1.mantissa := norm.output.mantissa @@ norm.scrap //FMA Precision lost + mulToAdd.rs1.exponent := norm.output.exponent + mulToAdd.rs1.sign := norm.output.sign + mulToAdd.rs1.special := norm.output.special + mulToAdd.rs2 := input.rs3 + mulToAdd.rs2.mantissa.removeAssignments() := input.rs3.mantissa << addExtraBits + mulToAdd.rd := input.rd + mulToAdd.roundMode := input.roundMode + mulToAdd.needCommit := False + if (p.withDouble) mulToAdd.format := input.format + + when(NV){ + mulToAdd.rs1.mantissa.msb := False + } + + input.ready := (input.add ? mulToAdd.ready | output.ready) || input.divSqrt + } + } + + + val div = p.withDiv generate new Area{ + val input = decode.div.halfPipe() + val haltIt = True + val isCommited = RegNext(commitConsume(_.div, input.source, input.fire)) + val output = input.haltWhen(haltIt || !isCommited).swapPayload(new MergeInput()) + + val dividerShift = if(p.withDouble) 0 else 1 + val divider = FpuDiv(p.internalMantissaSize + dividerShift) + divider.io.input.a := input.rs1.mantissa << dividerShift + divider.io.input.b := input.rs2.mantissa << dividerShift + val dividerResult = divider.io.output.result >> dividerShift + val dividerScrap = divider.io.output.remain =/= 0 || divider.io.output.result(0, dividerShift bits) =/= 0 + + val cmdSent = RegInit(False) setWhen(divider.io.input.fire) clearWhen(!haltIt) + divider.io.input.valid := input.valid && !cmdSent + divider.io.output.ready := input.ready + output.payload.assignSomeByName(input.payload) + + val needShift = !dividerResult.msb + val mantissa = needShift ? dividerResult(0, p.internalMantissaSize + 1 bits) | dividerResult(1, p.internalMantissaSize + 1 bits) + val scrap = dividerScrap || !needShift && dividerResult(0) + val exponentOffset = 1 << (p.internalExponentSize + 1) + val exponent = input.rs1.exponent + U(exponentOffset | exponentOne) - input.rs2.exponent - U(needShift) + + output.value.setNormal + output.value.sign := input.rs1.sign ^ input.rs2.sign + output.value.exponent := exponent.resized + output.value.mantissa := mantissa + output.scrap := scrap + when(exponent.takeHigh(2) === 3){ output.value.exponent(p.internalExponentSize-3, 3 bits) := 7} //Handle overflow + + + + val underflowThreshold = muxDouble[UInt](input.format)(exponentOne + exponentOffset - 1023 - 53) (exponentOne + exponentOffset - 127 - 24) + val underflowExp = muxDouble[UInt](input.format)(exponentOne + exponentOffset - 1023 - 54) (exponentOne + exponentOffset - 127 - 25) + val forceUnderflow = exponent < underflowThreshold + val forceOverflow = input.rs1.isInfinity || input.rs2.isZero + val infinitynan = input.rs1.isZero && input.rs2.isZero || input.rs1.isInfinity && input.rs2.isInfinity + val forceNan = input.rs1.isNan || input.rs2.isNan || infinitynan + val forceZero = input.rs1.isZero || input.rs2.isInfinity + + + + output.NV := False + output.DZ := !forceNan && !input.rs1.isInfinity && input.rs2.isZero + + when(exponent(exponent.getWidth-3, 3 bits) === 7) { output.value.exponent(p.internalExponentSize-2, 2 bits) := 3 } + + when(forceNan) { + output.value.setNanQuiet + output.NV setWhen((infinitynan || input.rs1.isNanSignaling || input.rs2.isNanSignaling)) + } elsewhen(forceOverflow) { + output.value.setInfinity + } elsewhen(forceZero) { + output.value.setZero + } elsewhen(forceUnderflow) { + output.value.exponent := underflowExp.resized + } + + + haltIt clearWhen(divider.io.output.valid) + } + + + + val sqrt = p.withSqrt generate new Area{ + val input = decode.sqrt.halfPipe() + val haltIt = True + val isCommited = RegNext(commitConsume(_.sqrt, input.source, input.fire)) + val output = input.haltWhen(haltIt || !isCommited).swapPayload(new MergeInput()) + + val needShift = !input.rs1.exponent.lsb + val sqrt = FpuSqrt(p.internalMantissaSize) + sqrt.io.input.a := (needShift ? (U"1" @@ input.rs1.mantissa @@ U"0") | (U"01" @@ input.rs1.mantissa)) + + val cmdSent = RegInit(False) setWhen(sqrt.io.input.fire) clearWhen(!haltIt) + sqrt.io.input.valid := input.valid && !cmdSent + sqrt.io.output.ready := input.ready + output.payload.assignSomeByName(input.payload) + + + val scrap = sqrt.io.output.remain =/= 0 + val exponent = RegNext(exponentOne-exponentOne/2 -1 +^ (input.rs1.exponent >> 1) + U(input.rs1.exponent.lsb)) + + output.value.setNormal + output.value.sign := input.rs1.sign + output.value.exponent := exponent + output.value.mantissa := sqrt.io.output.result + output.scrap := scrap + output.NV := False + output.DZ := False + + val negative = !input.rs1.isNan && !input.rs1.isZero && input.rs1.sign + + when(input.rs1.isInfinity){ + output.value.setInfinity + } + when(negative){ + output.value.setNanQuiet + output.NV := True + } + when(input.rs1.isNan){ + output.value.setNanQuiet + output.NV := !input.rs1.isQuiet + } + when(input.rs1.isZero){ + output.value.setZero + } + + +// val underflowThreshold = muxDouble[UInt](input.format)(exponentOne + exponentOffset - 1023 - 53) (exponentOne + exponentOffset - 127 - 24) +// val underflowExp = muxDouble[UInt](input.format)(exponentOne + exponentOffset - 1023 - 54) (exponentOne + exponentOffset - 127 - 25) +// val forceUnderflow = exponent < underflowThreshold +// val forceOverflow = input.rs1.isInfinity// || input.rs2.isInfinity +// val infinitynan = input.rs1.isZero && input.rs2.isZero +// val forceNan = input.rs1.isNan || input.rs2.isNan || infinitynan +// val forceZero = input.rs1.isZero +// +// +// +// output.NV := False +// output.DZ := !forceNan && input.rs2.isZero +// +// when(exponent(exponent.getWidth-3, 3 bits) === 7) { output.value.exponent(p.internalExponentSize-2, 2 bits) := 3 } +// +// when(forceNan) { +// output.value.setNanQuiet +// output.NV setWhen((infinitynan || input.rs1.isNanSignaling || input.rs2.isNanSignaling)) +// } elsewhen(forceOverflow) { +// output.value.setInfinity +// } elsewhen(forceZero) { +// output.value.setZero +// } elsewhen(forceUnderflow) { +// output.value.exponent := underflowExp.resized +// } + + + haltIt clearWhen(sqrt.io.output.valid) + } + + //divSqrt isn't realy used anymore + val divSqrt = p.withDivSqrt generate new Area { + val input = decode.divSqrt.halfPipe() + assert(false, "Need to implement commit tracking") + val aproxWidth = 8 + val aproxDepth = 64 + val divIterationCount = 3 + val sqrtIterationCount = 3 + + val mulWidth = p.internalMantissaSize + 1 + + import FpuDivSqrtIterationState._ + val state = RegInit(FpuDivSqrtIterationState.IDLE()) + val iteration = Reg(UInt(log2Up(divIterationCount max sqrtIterationCount) bits)) + + decode.divSqrtToMul.valid := False + decode.divSqrtToMul.source := input.source + decode.divSqrtToMul.rs1.assignDontCare() + decode.divSqrtToMul.rs2.assignDontCare() + decode.divSqrtToMul.rs3.assignDontCare() + decode.divSqrtToMul.rd := input.rd + decode.divSqrtToMul.add := False + decode.divSqrtToMul.divSqrt := True + decode.divSqrtToMul.msb1 := True + decode.divSqrtToMul.msb2 := True + decode.divSqrtToMul.rs1.special := False //TODO + decode.divSqrtToMul.rs2.special := False + decode.divSqrtToMul.roundMode := input.roundMode + if(p.withDouble) decode.divSqrtToMul.format := input.format + + + val aprox = new Area { + val rom = Mem(UInt(aproxWidth bits), aproxDepth * 2) + val divTable, sqrtTable = ArrayBuffer[Double]() + for(i <- 0 until aproxDepth){ + val value = 1+(i+0.5)/aproxDepth + divTable += 1/value + } + for(i <- 0 until aproxDepth){ + val scale = if(i < aproxDepth/2) 2 else 1 + val value = scale+(scale*(i%(aproxDepth/2)+0.5)/aproxDepth*2) +// println(s"$i => $value" ) + sqrtTable += 1/Math.sqrt(value) + } + val romElaboration = (sqrtTable ++ divTable).map(v => BigInt(((v-0.5)*2*(1 << aproxWidth)).round)) + + rom.initBigInt(romElaboration) + val div = input.rs2.mantissa.takeHigh(log2Up(aproxDepth)) + val sqrt = U(input.rs1.exponent.lsb ## input.rs1.mantissa).takeHigh(log2Up(aproxDepth)) + val address = U(input.div ## (input.div ? div | sqrt)) + val raw = rom.readAsync(address) + val result = U"01" @@ (raw << (mulWidth-aproxWidth-2)) + } + + val divExp = new Area{ + val value = (1 << p.internalExponentSize) - 3 - input.rs2.exponent + } + val sqrtExp = new Area{ + val value = ((1 << p.internalExponentSize-1) + (1 << p.internalExponentSize-2) - 2 -1) - (input.rs1.exponent >> 1) + U(!input.rs1.exponent.lsb) + } + + def mulArg(rs1 : UInt, rs2 : UInt): Unit ={ + decode.divSqrtToMul.rs1.mantissa := rs1.resized + decode.divSqrtToMul.rs2.mantissa := rs2.resized + decode.divSqrtToMul.msb1 := rs1.msb + decode.divSqrtToMul.msb2 := rs2.msb + } + + val mulBuffer = mul.result.notMul.output.toStream.stage + mulBuffer.ready := False + + val iterationValue = Reg(UInt(mulWidth bits)) + + input.ready := False + switch(state){ + is(IDLE){ + iterationValue := aprox.result + iteration := 0 + when(input.valid) { + state := YY + } + } + is(YY){ + decode.divSqrtToMul.valid := True + mulArg(iterationValue, iterationValue) + when(decode.divSqrtToMul.ready) { + state := XYY + } + } + is(XYY){ + decode.divSqrtToMul.valid := mulBuffer.valid + val sqrtIn = !input.rs1.exponent.lsb ? (U"1" @@ input.rs1.mantissa) | ((U"1" @@ input.rs1.mantissa) |>> 1) + val divIn = U"1" @@ input.rs2.mantissa + mulArg(input.div ? divIn| sqrtIn, mulBuffer.payload) + when(mulBuffer.valid && decode.divSqrtToMul.ready) { + state := (input.div ? Y2_XYY | _15_XYY2) + mulBuffer.ready := True + } + } + is(Y2_XYY){ + mulBuffer.ready := True + when(mulBuffer.valid) { + iterationValue := ((iterationValue << 1) - mulBuffer.payload).resized + mulBuffer.ready := True + iteration := iteration + 1 + when(iteration =/= divIterationCount-1){ //TODO + state := YY + } otherwise { + state := DIV + } + } + } + is(DIV){ + decode.divSqrtToMul.valid := True + decode.divSqrtToMul.divSqrt := False + decode.divSqrtToMul.rs1 := input.rs1 + decode.divSqrtToMul.rs2.sign := input.rs2.sign + decode.divSqrtToMul.rs2.exponent := divExp.value + iterationValue.msb.asUInt + decode.divSqrtToMul.rs2.mantissa := (iterationValue << 1).resized + val zero = input.rs2.isInfinity + val overflow = input.rs2.isZero + val nan = input.rs2.isNan || (input.rs1.isZero && input.rs2.isZero) + + when(nan){ + decode.divSqrtToMul.rs2.setNanQuiet + } elsewhen(overflow) { + decode.divSqrtToMul.rs2.setInfinity + } elsewhen(zero) { + decode.divSqrtToMul.rs2.setZero + } + when(decode.divSqrtToMul.ready) { + state := IDLE + input.ready := True + } + } + is(_15_XYY2){ + when(mulBuffer.valid) { + state := Y_15_XYY2 + mulBuffer.payload.getDrivingReg := (U"11" << mulWidth-2) - (mulBuffer.payload) + } + } + is(Y_15_XYY2){ + decode.divSqrtToMul.valid := True + mulArg(iterationValue, mulBuffer.payload) + when(decode.divSqrtToMul.ready) { + mulBuffer.ready := True + state := Y_15_XYY2_RESULT + } + } + is(Y_15_XYY2_RESULT){ + iterationValue := mulBuffer.payload + mulBuffer.ready := True + when(mulBuffer.valid) { + iteration := iteration + 1 + when(iteration =/= sqrtIterationCount-1){ + state := YY + } otherwise { + state := SQRT + } + } + } + is(SQRT){ + decode.divSqrtToMul.valid := True + decode.divSqrtToMul.divSqrt := False + decode.divSqrtToMul.rs1 := input.rs1 + decode.divSqrtToMul.rs2.sign := False + decode.divSqrtToMul.rs2.exponent := sqrtExp.value + iterationValue.msb.asUInt + decode.divSqrtToMul.rs2.mantissa := (iterationValue << 1).resized + + val nan = input.rs1.sign && !input.rs1.isZero + + when(nan){ + decode.divSqrtToMul.rs2.setNanQuiet + } + + when(decode.divSqrtToMul.ready) { + state := IDLE + input.ready := True + } + } + } + } + + val add = p.withAdd generate new Area{ + + + class PreShifterOutput extends AddInput{ + val absRs1Bigger = Bool() + val rs1ExponentBigger = Bool() + } + + val preShifter = new Area{ + val input = decode.add.combStage() + val output = input.swapPayload(new PreShifterOutput) + + val exp21 = input.rs2.exponent -^ input.rs1.exponent + val rs1ExponentBigger = (exp21.msb || input.rs2.isZero) && !input.rs1.isZero + val rs1ExponentEqual = input.rs1.exponent === input.rs2.exponent + val rs1MantissaBigger = input.rs1.mantissa > input.rs2.mantissa + val absRs1Bigger = ((rs1ExponentBigger || rs1ExponentEqual && rs1MantissaBigger) && !input.rs1.isZero || input.rs1.isInfinity) && !input.rs2.isInfinity + + output.payload.assignSomeByName(input.payload) + output.absRs1Bigger := absRs1Bigger + output.rs1ExponentBigger := rs1ExponentBigger + } + + class ShifterOutput extends AddInput{ + val xSign, ySign = Bool() + val xMantissa, yMantissa = UInt(p.internalMantissaSize+1+addExtraBits bits) + val xyExponent = UInt(p.internalExponentSize bits) + val xySign = Bool() + val roundingScrap = Bool() + } + + val shifter = new Area { + val input = preShifter.output.stage() + val output = input.swapPayload(new ShifterOutput) + output.payload.assignSomeByName(input.payload) + + val exp21 = input.rs2.exponent -^ input.rs1.exponent + val shiftBy = exp21.asSInt.abs//rs1ExponentBigger ? (0-exp21) | exp21 + val shiftOverflow = (shiftBy >= p.internalMantissaSize+1+addExtraBits) + val passThrough = shiftOverflow || (input.rs1.isZero) || (input.rs2.isZero) + + def absRs1Bigger = input.absRs1Bigger + def rs1ExponentBigger = input.rs1ExponentBigger + + //Note that rs1ExponentBigger can be replaced by absRs1Bigger bellow to avoid xsigned two complement in math block at expense of combinatorial path + val xySign = absRs1Bigger ? input.rs1.sign | input.rs2.sign + output.xSign := xySign ^ (rs1ExponentBigger ? input.rs1.sign | input.rs2.sign) + output.ySign := xySign ^ (rs1ExponentBigger ? input.rs2.sign | input.rs1.sign) + val xMantissa = U"1" @@ (rs1ExponentBigger ? input.rs1.mantissa | input.rs2.mantissa) + val yMantissaUnshifted = U"1" @@ (rs1ExponentBigger ? input.rs2.mantissa | input.rs1.mantissa) + var yMantissa = CombInit(yMantissaUnshifted) + val roundingScrap = False + for(i <- log2Up(p.internalMantissaSize) - 1 downto 0){ + roundingScrap setWhen(shiftBy(i) && yMantissa(0, 1 << i bits) =/= 0) + yMantissa \= shiftBy(i) ? (yMantissa |>> (BigInt(1) << i)) | yMantissa + } + when(passThrough) { yMantissa := 0 } + when(shiftOverflow) { roundingScrap := True } + when(input.rs1.special || input.rs2.special){ roundingScrap := False } + output.xyExponent := rs1ExponentBigger ? input.rs1.exponent | input.rs2.exponent + output.xMantissa := xMantissa + output.yMantissa := yMantissa + output.xySign := xySign + output.roundingScrap := roundingScrap + } + + class MathOutput extends ShifterOutput{ + val xyMantissa = UInt(p.internalMantissaSize+1+addExtraBits+1 bits) + } + + val math = new Area { + val input = shifter.output.stage() + val output = input.swapPayload(new MathOutput) + output.payload.assignSomeByName(input.payload) + import input.payload._ + + val xSigned = xMantissa.twoComplement(xSign) //TODO Is that necessary ? + val ySigned = ((ySign ## Mux(ySign, ~yMantissa, yMantissa)).asUInt + (ySign && !roundingScrap).asUInt).asSInt //rounding here + output.xyMantissa := U(xSigned +^ ySigned).trim(1 bits) + + } + + class OhOutput extends MathOutput{ + val shift = UInt(log2Up(p.internalMantissaSize+1+addExtraBits+1) bits) + } + + val oh = new Area { + val input = math.output.stage() + val isCommited = commitConsume(_.add, input.source, input.fire && input.needCommit) + val output = input.haltWhen(input.needCommit && !isCommited).swapPayload(new OhOutput) + output.payload.assignSomeByName(input.payload) + import input.payload._ + + val shiftOh = OHMasking.first(output.xyMantissa.asBools.reverse) //The OhMasking.first can be processed in parallel to the xyMantissa carry chaine +// output.shiftOh := shiftOh + + val shift = OHToUInt(shiftOh) + output.shift := shift + } + + + class NormOutput extends AddInput{ + val mantissa = UInt(p.internalMantissaSize+1+addExtraBits+1 bits) + val exponent = UInt(p.internalExponentSize+1 bits) + val infinityNan, forceNan, forceZero, forceInfinity = Bool() + val xySign, roundingScrap = Bool() + val xyMantissaZero = Bool() + } + + val norm = new Area{ + val input = oh.output.stage() + val output = input.swapPayload(new NormOutput) + output.payload.assignSomeByName(input.payload) + import input.payload._ + + output.mantissa := (xyMantissa |<< shift) + output.exponent := xyExponent -^ shift + 1 + output.forceInfinity := (input.rs1.isInfinity || input.rs2.isInfinity) + output.forceZero := xyMantissa === 0 || (input.rs1.isZero && input.rs2.isZero) + output.infinityNan := (input.rs1.isInfinity && input.rs2.isInfinity && (input.rs1.sign ^ input.rs2.sign)) + output.forceNan := input.rs1.isNan || input.rs2.isNan || output.infinityNan + output.xyMantissaZero := xyMantissa === 0 + } + + val result = new Area { + val input = norm.output.pipelined() + val output = input.swapPayload(new MergeInput()) + import input.payload._ + + output.source := input.source + output.rd := input.rd + output.value.sign := xySign + output.value.mantissa := (mantissa >> addExtraBits).resized + output.value.exponent := exponent.resized + output.value.special := False + output.roundMode := input.roundMode + if (p.withDouble) output.format := input.format + output.scrap := (mantissa(1) | mantissa(0) | roundingScrap) + + output.NV := infinityNan || input.rs1.isNanSignaling || input.rs2.isNanSignaling + output.DZ := False + when(forceNan) { + output.value.setNanQuiet + } elsewhen (forceInfinity) { + output.value.setInfinity + } elsewhen (forceZero) { + output.value.setZero + when(xyMantissaZero || input.rs1.isZero && input.rs2.isZero) { + output.value.sign := input.rs1.sign && input.rs2.sign + } + when((input.rs1.sign || input.rs2.sign) && input.roundMode === FpuRoundMode.RDN) { + output.value.sign := True + } + } + } + } + + + val merge = new Area { + val inputs = ArrayBuffer[Stream[MergeInput]]() + inputs += load.s1.output.stage() + if(p.withSqrt) (inputs += sqrt.output) + if(p.withDiv) (inputs += div.output) + if(p.withAdd) (inputs += add.result.output) + if(p.withMul) (inputs += mul.result.output) + if(p.withShortPipMisc) (inputs += shortPip.output.pipelined(m2s = true)) + val arbitrated = StreamArbiterFactory.lowerFirst.noLock.on(inputs).toFlow + } + + class RoundFront extends MergeInput{ + val mantissaIncrement = Bool() + val roundAdjusted = Bits(2 bits) + val exactMask = UInt(p.internalMantissaSize + 2 bits) + } + + val roundFront = new Area { + val input = merge.arbitrated.stage() + val output = input.swapPayload(new RoundFront()) + output.payload.assignSomeByName(input.payload) + + val manAggregate = input.value.mantissa @@ input.scrap + val expBase = muxDouble[UInt](input.format)(exponentF64Subnormal + 1)(exponentF32Subnormal + 1) + val expDif = expBase -^ input.value.exponent + val expSubnormal = !expDif.msb + var discardCount = (expSubnormal ? expDif.resize(log2Up(p.internalMantissaSize) bits) | U(0)) + if (p.withDouble) when(input.format === FpuFormat.FLOAT) { + discardCount \= discardCount + 29 + } + val exactMask = (List(True) ++ (0 until p.internalMantissaSize + 1).map(_ < discardCount)).asBits.asUInt + val roundAdjusted = (True ## (manAggregate >> 1)) (discardCount) ## ((manAggregate & exactMask) =/= 0) + + val mantissaIncrement = !input.value.special && input.roundMode.mux( + FpuRoundMode.RNE -> (roundAdjusted(1) && (roundAdjusted(0) || (U"01" ## (manAggregate >> 2)) (discardCount))), + FpuRoundMode.RTZ -> False, + FpuRoundMode.RDN -> (roundAdjusted =/= 0 && input.value.sign), + FpuRoundMode.RUP -> (roundAdjusted =/= 0 && !input.value.sign), + FpuRoundMode.RMM -> (roundAdjusted(1)) + ) + + output.mantissaIncrement := mantissaIncrement + output.roundAdjusted := roundAdjusted + output.exactMask := exactMask + } + + val roundBack = new Area{ + val input = roundFront.output.stage() + val output = input.swapPayload(RoundOutput()) + import input.payload._ + + val math = p.internalFloating() + val mantissaRange = p.internalMantissaSize downto 1 + val adderMantissa = input.value.mantissa(mantissaRange) & (mantissaIncrement ? ~(exactMask.trim(1) >> 1) | input.value.mantissa(mantissaRange).maxValue) + val adderRightOp = (mantissaIncrement ? (exactMask >> 1)| U(0)).resize(p.internalMantissaSize bits) + val adder = KeepAttribute(KeepAttribute(input.value.exponent @@ adderMantissa) + KeepAttribute(adderRightOp) + KeepAttribute(U(mantissaIncrement))) + math.special := input.value.special + math.sign := input.value.sign + math.exponent := adder(p.internalMantissaSize, p.internalExponentSize bits) + math.mantissa := adder(0, p.internalMantissaSize bits) + + val patched = CombInit(math) + val nx,of,uf = False + + val ufSubnormalThreshold = muxDouble[UInt](input.format)(exponentF64Subnormal)(exponentF32Subnormal) + val ufThreshold = muxDouble[UInt](input.format)(exponentF64Subnormal-52+1)(exponentF32Subnormal-23+1) + val ofThreshold = muxDouble[UInt](input.format)(exponentF64Infinity-1)(exponentF32Infinity-1) + + //catch exact 1.17549435E-38 underflow, but, who realy care ? +// val borringCase = input.value.exponent === ufSubnormalThreshold && roundAdjusted.asUInt < U"11" +// when(!math.special && (math.exponent <= ufSubnormalThreshold || borringCase) && roundAdjusted.asUInt =/= 0){ +// uf := True +// } + val threshold = input.roundMode.mux( + FpuRoundMode.RNE -> U"110", + FpuRoundMode.RTZ -> U"110", + FpuRoundMode.RDN -> (input.value.sign ? U"101" | U"111"), + FpuRoundMode.RUP -> (input.value.sign ? U"111" | U"101"), + FpuRoundMode.RMM -> U"110" + ) + val borringRound = (input.value.mantissa(1 downto 0) ## input.scrap) + if(p.withDouble) when(input.format === FpuFormat.FLOAT) { borringRound := (input.value.mantissa(30 downto 29) ## input.value.mantissa(28 downto 0).orR)} + + val borringCase = input.value.exponent === ufSubnormalThreshold && borringRound.asUInt < threshold + when(!math.special && (math.exponent <= ufSubnormalThreshold || borringCase) && roundAdjusted.asUInt =/= 0){ + uf := True + } + when(!math.special && math.exponent > ofThreshold){ + nx := True + of := True + val doMax = input.roundMode.mux( + FpuRoundMode.RNE -> (False), + FpuRoundMode.RTZ -> (True), + FpuRoundMode.RDN -> (!math.sign), + FpuRoundMode.RUP -> (math.sign), + FpuRoundMode.RMM -> (False) + ) + when(doMax){ + patched.exponent := ofThreshold + patched.mantissa.setAll() + } otherwise { + patched.setInfinity + } + } + + + when(!math.special && math.exponent < ufThreshold){ + nx := True + uf := True + val doMin = input.roundMode.mux( + FpuRoundMode.RNE -> (False), + FpuRoundMode.RTZ -> (False), + FpuRoundMode.RDN -> (math.sign), + FpuRoundMode.RUP -> (!math.sign), + FpuRoundMode.RMM -> (False) + ) + when(doMin){ + patched.exponent := ufThreshold.resized + patched.mantissa := 0 + } otherwise { + patched.setZero + } + } + + + nx setWhen(!input.value.special && (roundAdjusted =/= 0)) + val writes = rf.scoreboards.map(_.writes.readAsync(input.rd)) + val write = writes.toList.read(input.source) + output.NX := nx & write + output.OF := of & write + output.UF := uf & write + output.NV := input.NV & write + output.DZ := input.DZ & write + output.source := input.source + output.rd := input.rd + output.write := write + if(p.withDouble) output.format := input.format + output.value := patched + } + + val writeback = new Area{ + val input = roundBack.output.stage() + + for(i <- 0 until portCount){ + val c = io.port(i).completion + c.valid := input.fire && input.source === i + c.flags.NX := input.NX + c.flags.OF := input.OF + c.flags.UF := input.UF + c.flags.NV := input.NV + c.flags.DZ := input.DZ + c.written := input.write + } + + when(input.valid){ + for(i <- 0 until portCount) { + val port = rf.scoreboards(i).hitWrite + port.valid setWhen(input.source === i) + port.address := input.rd + port.data := !rf.scoreboards(i).hit(input.rd) //TODO improve + } + } + + val port = rf.ram.writePort + port.valid := input.valid && input.write + port.address := input.source @@ input.rd + port.data.value := input.value + if(p.withDouble) port.data.boxed := input.format === FpuFormat.FLOAT + + val randomSim = p.sim generate (in UInt(p.internalMantissaSize bits)) + if(p.sim) when(port.data.value.isZero || port.data.value.isInfinity){ + port.data.value.mantissa := randomSim + } + if(p.sim) when(input.value.special){ + port.data.value.exponent(p.internalExponentSize-1 downto 3) := randomSim.resized + when(!input.value.isNan){ + port.data.value.exponent(2 downto 2) := randomSim.resized + } + } + + when(port.valid){ + assert(!(port.data.value.exponent === 0 && !port.data.value.special), "Special violation") + assert(!(port.data.value.exponent === port.data.value.exponent.maxValue && !port.data.value.special), "Special violation") + } + } +} + + + + +object FpuSynthesisBench extends App{ + val payloadType = HardType(Bits(8 bits)) + class Fpu(name : String, portCount : Int, p : FpuParameter) extends Rtl{ + override def getName(): String = "Fpu_" + name + override def getRtlPath(): String = getName() + ".v" + SpinalVerilog(new FpuCore(portCount, p){ + + setDefinitionName(Fpu.this.getName()) + }) + } + + class Shifter(width : Int) extends Rtl{ + override def getName(): String = "shifter_" + width + override def getRtlPath(): String = getName() + ".v" + SpinalVerilog(new Component{ + val a = in UInt(width bits) + val sel = in UInt(log2Up(width) bits) + val result = out(a >> sel) + setDefinitionName(Shifter.this.getName()) + }) + } + + class Rotate(width : Int) extends Rtl{ + override def getName(): String = "rotate_" + width + override def getRtlPath(): String = getName() + ".v" + SpinalVerilog(new Component{ + val a = in UInt(width bits) + val sel = in UInt(log2Up(width) bits) + val result = out(Delay(Delay(a,3).rotateLeft(Delay(sel,3)),3)) + setDefinitionName(Rotate.this.getName()) + }) + } + +// rotate2_24 -> +// Artix 7 -> 233 Mhz 96 LUT 167 FF +// Artix 7 -> 420 Mhz 86 LUT 229 FF +// rotate2_32 -> +// Artix 7 -> 222 Mhz 108 LUT 238 FF +// Artix 7 -> 399 Mhz 110 LUT 300 FF +// rotate2_52 -> +// Artix 7 -> 195 Mhz 230 LUT 362 FF +// Artix 7 -> 366 Mhz 225 LUT 486 FF +// rotate2_64 -> +// Artix 7 -> 182 Mhz 257 LUT 465 FF +// Artix 7 -> 359 Mhz 266 LUT 591 FF + class Rotate2(width : Int) extends Rtl{ + override def getName(): String = "rotate2_" + width + override def getRtlPath(): String = getName() + ".v" + SpinalVerilog(new Component{ + val a = in UInt(width bits) + val sel = in UInt(log2Up(width) bits) + val result = out(Delay((U(0, width bits) @@ Delay(a,3)).rotateLeft(Delay(sel,3)),3)) + setDefinitionName(Rotate2.this.getName()) + }) + } + + class Rotate3(width : Int) extends Rtl{ + override def getName(): String = "rotate3_" + width + override def getRtlPath(): String = getName() + ".v" + SpinalVerilog(new Component{ + val a = Delay(in UInt(width bits), 3) + val sel = Delay(in UInt(log2Up(width) bits),3) + // val result = + // val output = Delay(result, 3) + setDefinitionName(Rotate3.this.getName()) + }) + } + + class Div(width : Int) extends Rtl{ + override def getName(): String = "div_" + width + override def getRtlPath(): String = getName() + ".v" + SpinalVerilog(new UnsignedDivider(width,width, false).setDefinitionName(Div.this.getName())) + } + + class Add(width : Int) extends Rtl{ + override def getName(): String = "add_" + width + override def getRtlPath(): String = getName() + ".v" + SpinalVerilog(new Component{ + val a, b = in UInt(width bits) + val result = out(a + b) + setDefinitionName(Add.this.getName()) + }) + } + + class DivSqrtRtl(width : Int) extends Rtl{ + override def getName(): String = "DivSqrt_" + width + override def getRtlPath(): String = getName() + ".v" + SpinalVerilog(new FpuDiv(width).setDefinitionName(DivSqrtRtl.this.getName())) + } + + val rtls = ArrayBuffer[Rtl]() + rtls += new Fpu( + "32", + portCount = 1, + FpuParameter( +// withDivSqrt = false, + withDouble = false + ) + ) + rtls += new Fpu( + "64", + portCount = 1, + FpuParameter( +// withDivSqrt = false, + withDouble = true + ) + ) + +// rtls += new Div(52) +// rtls += new Div(23) +// rtls += new Add(64) +// rtls += new DivSqrtRtl(52) +// rtls += new DivSqrtRtl(23) + + // rtls += new Shifter(24) +// rtls += new Shifter(32) +// rtls += new Shifter(52) +// rtls += new Shifter(64) +// rtls += new Rotate(24) +// rtls += new Rotate(32) +// rtls += new Rotate(52) +// rtls += new Rotate(64) +// rtls += new Rotate3(24) +// rtls += new Rotate3(32) +// rtls += new Rotate3(52) +// rtls += new Rotate3(64) + + val targets = XilinxStdTargets()// ++ AlteraStdTargets() + + + Bench(rtls, targets) +} + +//Fpu_32 -> +//Artix 7 -> 136 Mhz 1471 LUT 1336 FF +//Artix 7 -> 196 Mhz 1687 LUT 1371 FF +//Fpu_64 -> +//Artix 7 -> 105 Mhz 2822 LUT 2132 FF +//Artix 7 -> 161 Mhz 3114 LUT 2272 FF +// +// +// +//Fpu_32 -> +//Artix 7 -> 128 Mhz 1693 LUT 1481 FF +//Artix 7 -> 203 Mhz 1895 LUT 1481 FF +//Fpu_64 -> +//Artix 7 -> 99 Mhz 3073 LUT 2396 FF +//Artix 7 -> 164 Mhz 3433 LUT 2432 FF + + +//Fpu_32 -> +//Artix 7 -> 112 Mhz 1790 LUT 1666 FF +//Artix 7 -> 158 Mhz 1989 LUT 1701 FF +//Fpu_64 -> +//Artix 7 -> 100 Mhz 3294 LUT 2763 FF +//Artix 7 -> 151 Mhz 3708 LUT 2904 FF + +//Fpu_32 -> +//Artix 7 -> 139 Mhz 1879 LUT 1713 FF +//Artix 7 -> 206 Mhz 2135 LUT 1723 FF +//Fpu_64 -> +//Artix 7 -> 106 Mhz 3502 LUT 2811 FF +//Artix 7 -> 163 Mhz 3905 LUT 2951 FF + +//Fpu_32 -> +//Artix 7 -> 130 Mhz 1889 LUT 1835 FF +//Artix 7 -> 210 Mhz 2131 LUT 1845 FF +//Fpu_64 -> +//Artix 7 -> 106 Mhz 3322 LUT 3023 FF +//Artix 7 -> 161 Mhz 3675 LUT 3163 FF + +//Fpu_32 -> +//Artix 7 -> 132 Mhz 1891 LUT 1837 FF +//Artix 7 -> 209 Mhz 2132 LUT 1847 FF +//Fpu_64 -> +//Artix 7 -> 105 Mhz 3348 LUT 3024 FF +//Artix 7 -> 162 Mhz 3712 LUT 3165 FF + +//Fpu_32 -> +//Artix 7 -> 128 Mhz 1796 LUT 1727 FF +//Artix 7 -> 208 Mhz 2049 LUT 1727 FF +//Fpu_64 -> +//Artix 7 -> 109 Mhz 3417 LUT 2913 FF +//Artix 7 -> 168 Mhz 3844 LUT 3053 FF + +/* +testfloat -tininessafter -all1 > all1.txt +cat all1.txt | grep "Errors found in" + +testfloat -tininessafter -all2 > all2.txt +cat all2.txt | grep "Errors found in" + +testfloat -tininessafter -f32_mulAdd > fma.txt + +testfloat -tininessafter -all2 -level 2 -checkall > all2.txt + + + +all1 => +Errors found in f32_to_ui64_rx_minMag: +Errors found in f32_to_i64_rx_minMag: +Errors found in f64_to_ui64_rx_minMag: +Errors found in f64_to_i64_rx_minMag: + +all2 => + + +Errors found in f32_mulAdd, rounding min: ++00.7FFFFF +67.000001 -01.000000 + => -01.000000 ...ux expected -01.000000 ....x ++67.000001 +00.7FFFFF -01.000000 + => -01.000000 ...ux expected -01.000000 ....x +-00.7FFFFF -67.000001 -01.000000 + => -01.000000 ...ux expected -01.000000 ....x +-67.000001 -00.7FFFFF -01.000000 + => -01.000000 ...ux expected -01.000000 ....x +Errors found in f32_mulAdd, rounding max: ++00.7FFFFF -67.000001 +01.000000 + => +01.000000 ...ux expected +01.000000 ....x ++67.000001 -00.7FFFFF +01.000000 + => +01.000000 ...ux expected +01.000000 ....x ++66.7FFFFE -01.000001 +01.000000 + => +01.000000 ...ux expected +01.000000 ....x +-00.7FFFFF +67.000001 +01.000000 + => +01.000000 ...ux expected +01.000000 ....x +-67.000001 +00.7FFFFF +01.000000 + => +01.000000 ...ux expected +01.000000 ....x + + + + */
\ No newline at end of file diff --git a/VexRiscv/src/main/scala/vexriscv/ip/fpu/FpuDiv.scala b/VexRiscv/src/main/scala/vexriscv/ip/fpu/FpuDiv.scala new file mode 100644 index 0000000..7c9e713 --- /dev/null +++ b/VexRiscv/src/main/scala/vexriscv/ip/fpu/FpuDiv.scala @@ -0,0 +1,140 @@ +package vexriscv.ip.fpu + + +import spinal.core._ +import spinal.lib.math.{UnsignedDividerCmd, UnsignedDividerRsp} +import spinal.lib._ +import spinal.lib.sim.{StreamDriver, StreamMonitor, StreamReadyRandomizer} + +import scala.collection.mutable +import scala.util.Random + +case class FpuDivCmd(mantissaWidth : Int) extends Bundle{ + val a,b = UInt(mantissaWidth bits) +} + +case class FpuDivRsp(mantissaWidth : Int) extends Bundle{ + val result = UInt(mantissaWidth+1 + 2 bits) + val remain = UInt(mantissaWidth+1 bits) +} + +case class FpuDiv(val mantissaWidth : Int) extends Component { + assert(mantissaWidth % 2 == 0) + val io = new Bundle{ + val input = slave Stream(FpuDivCmd(mantissaWidth)) + val output = master Stream(FpuDivRsp(mantissaWidth)) + } + + val iterations = (mantissaWidth+2+2)/2 + val counter = Reg(UInt(log2Up(iterations) bits)) + val busy = RegInit(False) clearWhen(io.output.fire) + val done = RegInit(False) setWhen(busy && counter === iterations-1) clearWhen(io.output.fire) + + val shifter = Reg(UInt(mantissaWidth + 3 bits)) + val result = Reg(UInt(mantissaWidth+1+2 bits)) + + val div1, div3 = Reg(UInt(mantissaWidth+3 bits)) + val div2 = div1 |<< 1 + + val sub1 = shifter -^ div1 + val sub2 = shifter -^ div2 + val sub3 = shifter -^ div3 + + io.output.valid := done + io.output.result := (result << 0).resized + io.output.remain := (shifter >> 2).resized + io.input.ready := !busy + + when(!done){ + counter := counter + 1 + val sel = CombInit(shifter) + result := result |<< 2 + when(!sub1.msb){ + sel := sub1.resized + result(1 downto 0) := 1 + } + when(!sub2.msb){ + sel := sub2.resized + result(1 downto 0) := 2 + } + when(!sub3.msb){ + sel := sub3.resized + result(1 downto 0) := 3 + } + shifter := sel |<< 2 + } + + when(!busy){ + counter := 0 + shifter := (U"1" @@ io.input.a @@ U"").resized + div1 := (U"1" @@ io.input.b).resized + div3 := (U"1" @@ io.input.b) +^ (((U"1" @@ io.input.b)) << 1) + busy := io.input.valid + } +} + + +object FpuDivTester extends App{ + import spinal.core.sim._ + + for(w <- List(16, 20)) { + val config = SimConfig + config.withFstWave + config.compile(new FpuDiv(w)).doSim(seed=2){dut => + dut.clockDomain.forkStimulus(10) + + + val (cmdDriver, cmdQueue) = StreamDriver.queue(dut.io.input, dut.clockDomain) + val rspQueue = mutable.Queue[FpuDivRsp => Unit]() + StreamMonitor(dut.io.output, dut.clockDomain)( rspQueue.dequeue()(_)) + StreamReadyRandomizer(dut.io.output, dut.clockDomain) + + def test(a : Int, b : Int): Unit ={ + cmdQueue +={p => + p.a #= a + p.b #= b + } + rspQueue += {p => + val x = (a | (1 << dut.mantissaWidth)).toLong + val y = (b | (1 << dut.mantissaWidth)).toLong + val result = (x << dut.mantissaWidth+2) / y + val remain = (x << dut.mantissaWidth+2) % y + + assert(p.result.toLong == result, f"$x%x/$y%x=${p.result.toLong}%x instead of $result%x") + assert(p.remain.toLong == remain, f"$x%x %% $y%x=${p.remain.toLong}%x instead of $remain%x") + } + } + + val s = dut.mantissaWidth-16 + val f = (1 << dut.mantissaWidth)-1 + test(0xE000 << s, 0x8000 << s) + test(0xC000 << s, 0x4000 << s) + test(0xC835 << s, 0x4742 << s) + test(0,0) + test(0,f) + test(f,0) + test(f,f) + + for(i <- 0 until 10000){ + test(Random.nextInt(1 << dut.mantissaWidth), Random.nextInt(1 << dut.mantissaWidth)) + } + + waitUntil(rspQueue.isEmpty) + + dut.clockDomain.waitSampling(100) + + } + } +} + +object FpuDivTester2 extends App{ + val mantissaWidth = 52 + val a = BigInt(0xfffffff810000l) + val b = BigInt(0x0000000000FF0l) + val x = (a | (1l << mantissaWidth)) + val y = (b | (1l << mantissaWidth)) + val result = (x << mantissaWidth+2) / y + val remain = (x << mantissaWidth+2) % y + println("done") + +}
\ No newline at end of file diff --git a/VexRiscv/src/main/scala/vexriscv/ip/fpu/FpuSqrt.scala b/VexRiscv/src/main/scala/vexriscv/ip/fpu/FpuSqrt.scala new file mode 100644 index 0000000..0f80905 --- /dev/null +++ b/VexRiscv/src/main/scala/vexriscv/ip/fpu/FpuSqrt.scala @@ -0,0 +1,116 @@ +package vexriscv.ip.fpu + +import spinal.core._ +import spinal.lib._ +import spinal.lib.sim.{StreamDriver, StreamMonitor, StreamReadyRandomizer} + +import scala.collection.mutable +import scala.util.Random + +case class FpuSqrtCmd(mantissaWidth : Int) extends Bundle{ + val a = UInt(mantissaWidth+2 bits) +} + +case class FpuSqrtRsp(mantissaWidth : Int) extends Bundle{ + val result = UInt(mantissaWidth+1 bits) + val remain = UInt(mantissaWidth+5 bits) +} + +case class FpuSqrt(val mantissaWidth : Int) extends Component { + val io = new Bundle{ + val input = slave Stream(FpuSqrtCmd(mantissaWidth)) + val output = master Stream(FpuSqrtRsp(mantissaWidth)) + } + + val iterations = mantissaWidth+2 + val counter = Reg(UInt(log2Up(iterations ) bits)) + val busy = RegInit(False) clearWhen(io.output.fire) + val done = RegInit(False) setWhen(busy && counter === iterations-1) clearWhen(io.output.fire) + + val a = Reg(UInt(mantissaWidth+5 bits)) + val x = Reg(UInt(mantissaWidth bits)) + val q = Reg(UInt(mantissaWidth+1 bits)) + val t = a-(q @@ U"01") + + + io.output.valid := done + io.output.result := (q << 0).resized + io.output.remain := a + io.input.ready := !busy + + when(!done){ + counter := counter + 1 + val sel = CombInit(a) + when(!t.msb){ + sel := t.resized + } + q := (q @@ !t.msb).resized + a := (sel @@ x(widthOf(x)-2,2 bits)).resized + x := x |<< 2 + } + + when(!busy){ + q := 0 + a := io.input.a(widthOf(io.input.a)-2,2 bits).resized + x := (io.input.a).resized + counter := 0 + when(io.input.valid){ + busy := True + } + } +} + + +object FpuSqrtTester extends App{ + import spinal.core.sim._ + + for(w <- List(16)) { + val config = SimConfig + config.withFstWave + config.compile(new FpuSqrt(w)).doSim(seed=2){dut => + dut.clockDomain.forkStimulus(10) + + + val (cmdDriver, cmdQueue) = StreamDriver.queue(dut.io.input, dut.clockDomain) + val rspQueue = mutable.Queue[FpuSqrtRsp => Unit]() + StreamMonitor(dut.io.output, dut.clockDomain)( rspQueue.dequeue()(_)) + StreamReadyRandomizer(dut.io.output, dut.clockDomain) + + def test(a : Int): Unit ={ + cmdQueue +={p => + p.a #= a + } + rspQueue += {p => +// val x = (a * (1l << dut.mantissaWidth)).toLong +// val result = Math.sqrt(x).toLong/(1 << dut.mantissaWidth/2) +// val remain = a-x*x + val x = a.toDouble / (1 << dut.mantissaWidth) + val result = (Math.sqrt(x)*(1 << dut.mantissaWidth+1)).toLong + val filtred = result % (1 << dut.mantissaWidth+1) +// val remain = (a-(result*result)).toLong + assert(p.result.toLong == filtred, f"$a%x=${p.result.toLong}%x instead of $filtred%x") +// assert(p.remain.toLong == remain, f"$a%x=${p.remain.toLong}%x instead of $remain%x") + } + } + + val s = dut.mantissaWidth-16 + val f = (1 << dut.mantissaWidth)-1 +// test(121) + test(0x20000) + test(0x18000) +// test(0,0) +// test(0,f) +// test(f,0) +// test(f,f) + + for(i <- 0 until 10000){ + test(Random.nextInt(3 << dut.mantissaWidth) + (1 << dut.mantissaWidth)) + } + + waitUntil(rspQueue.isEmpty) + + dut.clockDomain.waitSampling(100) + + } + } +}
\ No newline at end of file diff --git a/VexRiscv/src/main/scala/vexriscv/ip/fpu/Interface.scala b/VexRiscv/src/main/scala/vexriscv/ip/fpu/Interface.scala new file mode 100644 index 0000000..9338c35 --- /dev/null +++ b/VexRiscv/src/main/scala/vexriscv/ip/fpu/Interface.scala @@ -0,0 +1,186 @@ +package vexriscv.ip.fpu + +import spinal.core._ +import spinal.lib._ + + +object Fpu{ + + object Function{ + val MUL = 0 + val ADD = 1 + } + +} + + +case class FpuFloatDecoded() extends Bundle{ + val isNan = Bool() + val isNormal = Bool() + val isSubnormal = Bool() + val isZero = Bool() + val isInfinity = Bool() + val isQuiet = Bool() +} + +object FpuFloat{ + val ZERO = 0 + val INFINITY = 1 + val NAN = 2 + val NAN_CANONICAL_BIT = 2 +} + +case class FpuFloat(exponentSize: Int, + mantissaSize: Int) extends Bundle { + val mantissa = UInt(mantissaSize bits) + val exponent = UInt(exponentSize bits) + val sign = Bool() + val special = Bool() + + def withInvertSign : FpuFloat ={ + val ret = FpuFloat(exponentSize,mantissaSize) + ret.sign := !sign + ret.exponent := exponent + ret.mantissa := mantissa + ret + } + + def isNormal = !special + def isZero = special && exponent(1 downto 0) === FpuFloat.ZERO + def isInfinity = special && exponent(1 downto 0) === FpuFloat.INFINITY + def isNan = special && exponent(1 downto 0) === FpuFloat.NAN + def isQuiet = mantissa.msb + def isNanSignaling = special && exponent(1 downto 0) === FpuFloat.NAN && !isQuiet + def isCanonical = exponent(FpuFloat.NAN_CANONICAL_BIT) + + def setNormal = { special := False } + def setZero = { special := True; exponent(1 downto 0) := FpuFloat.ZERO } + def setInfinity = { special := True; exponent(1 downto 0) := FpuFloat.INFINITY } + def setNan = { special := True; exponent(1 downto 0) := FpuFloat.NAN; exponent(FpuFloat.NAN_CANONICAL_BIT) := False} + def setNanQuiet = { special := True; exponent(1 downto 0) := FpuFloat.NAN; exponent(FpuFloat.NAN_CANONICAL_BIT) := True; mantissa.msb := True; } + + def decode() = { + val ret = FpuFloatDecoded() + ret.isZero := isZero + ret.isNormal := isNormal + ret.isInfinity := isInfinity + ret.isNan := isNan + ret.isQuiet := mantissa.msb + ret + } + + def decodeIeee754() = { + val ret = FpuFloatDecoded() + val expZero = exponent === 0 + val expOne = exponent === exponent.maxValue + val manZero = mantissa === 0 + ret.isZero := expZero && manZero + ret.isSubnormal := expZero && !manZero + ret.isNormal := !expOne && !expZero + ret.isInfinity := expOne && manZero + ret.isNan := expOne && !manZero + ret.isQuiet := mantissa.msb + ret + } +} + +object FpuOpcode extends SpinalEnum{ + val LOAD, STORE, MUL, ADD, FMA, I2F, F2I, CMP, DIV, SQRT, MIN_MAX, SGNJ, FMV_X_W, FMV_W_X, FCLASS, FCVT_X_X = newElement() +} + +object FpuFormat extends SpinalEnum{ + val FLOAT, DOUBLE = newElement() +} + +object FpuRoundMode extends SpinalEnum(){ + val RNE, RTZ, RDN, RUP, RMM = newElement() + defaultEncoding = SpinalEnumEncoding("opt")( + RNE -> 0, + RTZ -> 1, + RDN -> 2, + RUP -> 3, + RMM -> 4 + ) +} +object FpuRoundModeInstr extends SpinalEnum(){ + val RNE, RTZ, RDN, RUP, RMM, DYN = newElement() + defaultEncoding = SpinalEnumEncoding("opt")( + RNE -> 0, + RTZ -> 1, + RDN -> 2, + RUP -> 3, + RMM -> 4, + DYN -> 7 + ) +} + + +case class FpuParameter( withDouble : Boolean, + asyncRegFile : Boolean = false, + mulWidthA : Int = 18, + mulWidthB : Int = 18, + schedulerM2sPipe : Boolean = false, + sim : Boolean = false, + withAdd : Boolean = true, + withMul : Boolean = true, + withDivSqrt : Boolean = false, + withDiv : Boolean = true, + withSqrt : Boolean = true, + withShortPipMisc : Boolean = true){ + + val internalMantissaSize = if(withDouble) 52 else 23 + val storeLoadType = HardType(Bits(if(withDouble) 64 bits else 32 bits)) + val internalExponentSize = (if(withDouble) 11 else 8) + 1 + val internalFloating = HardType(FpuFloat(exponentSize = internalExponentSize, mantissaSize = internalMantissaSize)) + val writeFloating = HardType(FpuFloat(exponentSize = internalExponentSize, mantissaSize = internalMantissaSize+1)) + + val rfAddress = HardType(UInt(5 bits)) + + val Opcode = FpuOpcode + val Format = FpuFormat + val argWidth = 2 + val Arg = HardType(Bits(2 bits)) +} + +case class FpuFlags() extends Bundle{ + val NX, UF, OF, DZ, NV = Bool() +} + +case class FpuCompletion() extends Bundle{ + val flags = FpuFlags() + val written = Bool() //Used for verification purposes +} + +case class FpuCmd(p : FpuParameter) extends Bundle{ + val opcode = p.Opcode() + val arg = Bits(2 bits) + val rs1, rs2, rs3 = p.rfAddress() + val rd = p.rfAddress() + val format = p.Format() + val roundMode = FpuRoundMode() +} + +case class FpuCommit(p : FpuParameter) extends Bundle{ + val opcode = FpuOpcode() + val rd = UInt(5 bits) + val write = Bool() + val value = p.storeLoadType() // IEEE 754 +} + +case class FpuRsp(p : FpuParameter) extends Bundle{ + val value = p.storeLoadType() // IEEE754 store || Integer + val NV, NX = Bool() +} + +case class FpuPort(p : FpuParameter) extends Bundle with IMasterSlave { + val cmd = Stream(FpuCmd(p)) + val commit = Stream(FpuCommit(p)) + val rsp = Stream(FpuRsp(p)) + val completion = Flow(FpuCompletion()) + + override def asMaster(): Unit = { + master(cmd, commit) + slave(rsp) + in(completion) + } +} |
