The base RV32I ISA has no multiplication or division instructions — those belong to the optional M extension. A core that implements these 8 instructions on top of the base ISA is designated RV32IM. Today we build a pipelined multiplier and an iterative divider, and show how to hook them into the datapath.
| Instruction | funct3 | Result in rd | Types |
|---|---|---|---|
| MUL | 000 | Low 32 bits of rs1 × rs2 | Signed × Signed |
| MULH | 001 | High 32 bits of rs1 × rs2 | Signed × Signed |
| MULHSU | 010 | High 32 bits of rs1 × rs2 | Signed × Unsigned |
| MULHU | 011 | High 32 bits of rs1 × rs2 | Unsigned × Unsigned |
| DIV | 100 | Signed quotient | Signed ÷ Signed |
| DIVU | 101 | Unsigned quotient | Unsigned ÷ Unsigned |
| REM | 110 | Signed remainder | Signed rem Signed |
| REMU | 111 | Unsigned remainder | Unsigned rem Unsigned |
All 8 share opcode 0110011 (same as R-type) with funct7 = 0000001. The control unit detects funct7=0000001 and routes to the multiply/divide units instead of the base ALU.
// mul_unit.v — 2-stage pipelined 32x32 multiplier (RV32M)
// Latency: 2 cycles from `valid` to `done`. funct3 selects which 32-bit
// slice of the 64-bit product is returned.
//
// Ports:
//   clk, rst : clock and asynchronous active-high reset. Reset clears only
//              the valid/done pipeline; the data registers are not reset.
//   valid    : 1 = multiply the operands presented on a/b/funct3 this cycle
//   a, b     : 32-bit operands (rs1, rs2)
//   funct3   : 000=MUL, 001=MULH, 010=MULHSU, 011=MULHU.
//              Any other encoding produces a result of 0.
//   result   : selected product half; registered on the same edge as `done`
//   done     : `valid` delayed by two clock edges; `result` is valid while
//              `done` is 1
//
// The data path is not gated by `valid`: both stages update every cycle, so
// `result` is only meaningful in cycles where `done` is high.
module mul_unit (
    input             clk, rst,
    input             valid,        // 1 = compute this cycle
    input      [31:0] a, b,
    input      [ 2:0] funct3,       // 000=MUL, 001=MULH, 010=MULHSU, 011=MULHU
    output reg [31:0] result,
    output reg        done          // high in the cycle where result is valid
);
    // Stage 1 registers: full 64-bit product plus the operation type
    reg [63:0] stage1_prod;
    reg [ 2:0] stage1_funct3;
    reg        stage1_valid;

    // All three products are evaluated in a 64-bit context, so the operands
    // are extended to 64 bits before multiplying.
    wire signed [63:0] signed_prod   = $signed(a) * $signed(b);
    wire        [63:0] unsigned_prod = a * b;
    // Signed x unsigned: b is widened to 33 bits with a zero MSB and then cast
    // to signed. The cast is required — a concatenation is unsigned, and a
    // single unsigned operand would make the whole expression unsigned,
    // zero-extending `a` instead of sign-extending it.
    wire signed [63:0] signedU_prod  = $signed(a) * $signed({1'b0, b});

    // Stage 1: compute the product matching the requested signedness
    always @(posedge clk or posedge rst) begin
        if (rst) begin
            stage1_valid <= 1'b0;
        end else begin
            stage1_valid  <= valid;
            stage1_funct3 <= funct3;
            case (funct3)
                3'b000, 3'b001: stage1_prod <= signed_prod;    // MUL, MULH
                3'b010:         stage1_prod <= signedU_prod;   // MULHSU
                3'b011:         stage1_prod <= unsigned_prod;  // MULHU
                default:        stage1_prod <= 64'd0;          // not a multiply
            endcase
        end
    end

    // Stage 2: select the correct 32-bit slice
    always @(posedge clk or posedge rst) begin
        if (rst) begin
            done <= 1'b0;
        end else begin
            done <= stage1_valid;
            if (stage1_funct3 == 3'b000)
                result <= stage1_prod[31:0];    // MUL — low half
            else
                result <= stage1_prod[63:32];   // MULH/MULHSU/MULHU — high half
        end
    end
endmodule
// div_unit.v — 32-cycle restoring divider (RV32M DIV/DIVU/REM/REMU)
// Iterates bit-by-bit for 32 cycles then asserts done for 1 cycle.
//
// Ports:
//   clk, rst  : clock and asynchronous active-high reset (clears busy/done)
//   start     : pulse for 1 cycle to begin; ignored while a division is running
//   dividend, divisor, is_signed, rem_sel : sampled in the cycle `start` is
//               accepted and need not be held afterwards
//   result    : quotient (rem_sel=0) or remainder (rem_sel=1), written on the
//               same edge that sets `done`
//   done      : high for exactly one cycle when `result` is valid
//
// Signed operations divide the magnitudes and fix the signs at the end:
// the quotient is negative when the operand signs differ, the remainder takes
// the sign of the dividend.
//
// RISC-V special cases:
//   divide by zero : quotient = all ones (-1), remainder = dividend
//   signed overflow (-2^31 / -1) : quotient = -2^31, remainder = 0
module div_unit (
    input             clk, rst,
    input             start,        // assert for 1 cycle to begin
    input      [31:0] dividend,
    input      [31:0] divisor,
    input             is_signed,    // 1=DIV/REM, 0=DIVU/REMU
    input             rem_sel,      // 1=return remainder, 0=return quotient
    output reg [31:0] result,
    output reg        done
);
    reg [31:0] D, d;          // dividend magnitude, divisor magnitude
    reg [31:0] Q;             // quotient accumulator
    reg [32:0] R;             // partial remainder (one extra bit for the shift)
    reg [5:0]  cnt;           // iteration counter 0..31
    reg        busy;
    reg        neg_q, neg_r;  // negate quotient / remainder at the end
    reg        rem_sel_q;     // rem_sel captured at start

    // One restoring-division step, evaluated combinationally so that the
    // final iteration's outcome is available to `result` in the same cycle.
    wire [32:0] R_shifted = {R[31:0], D[31 - cnt[4:0]]};  // bring in next dividend bit, MSB first
    wire [32:0] R_sub     = R_shifted - {1'b0, d};
    wire        q_bit     = ~R_sub[32];                   // no borrow => divisor fits
    wire [32:0] R_next    = q_bit ? R_sub : R_shifted;    // keep shifted value on borrow ("restore")
    wire [31:0] Q_next    = {Q[30:0], q_bit};

    always @(posedge clk or posedge rst) begin
        if (rst) begin
            busy <= 1'b0;
            done <= 1'b0;
        end else if (start && !busy) begin
            // Determine magnitudes and expected signs.
            // With a zero divisor the loop yields an all-ones quotient, which
            // is already the required -1; it must not be negated.
            neg_q     <= is_signed && (dividend[31] ^ divisor[31]) && (divisor != 32'd0);
            neg_r     <= is_signed && dividend[31];
            rem_sel_q <= rem_sel;
            D         <= (is_signed && dividend[31]) ? -dividend : dividend;
            d         <= (is_signed && divisor[31])  ? -divisor  : divisor;
            Q         <= 32'd0;
            R         <= 33'd0;
            cnt       <= 6'd0;
            busy      <= 1'b1;
            done      <= 1'b0;
        end else if (busy) begin
            R <= R_next;
            Q <= Q_next;
            if (cnt == 6'd31) begin
                busy <= 1'b0;
                done <= 1'b1;
                // Apply sign correction to the values produced by this last
                // step (R_next/Q_next); the R/Q registers still hold the
                // state from before the final iteration.
                if (rem_sel_q)
                    result <= neg_r ? -R_next[31:0] : R_next[31:0];
                else
                    result <= neg_q ? -Q_next : Q_next;
            end else begin
                cnt  <= cnt + 6'd1;
                done <= 1'b0;
            end
        end else
            done <= 1'b0;
    end
endmodule
// tb_m_ext.v — Test mul_unit and div_unit
// Stimulus is changed on the falling clock edge so that every input is stable
// when the units sample it on the rising edge (no race between this initial
// block and the DUT's always blocks). Completion is detected by polling the
// done outputs, with a bounded wait so a stuck unit cannot hang the run.
`timescale 1ns/1ps
module tb_m_ext;
    reg clk = 0, rst = 1;
    always #5 clk = ~clk;

    // MUL test
    reg         mul_valid;
    reg  [31:0] mul_a, mul_b;
    reg  [2:0]  mul_f3;
    wire [31:0] mul_result;
    wire        mul_done;
    mul_unit mu(.clk(clk), .rst(rst), .valid(mul_valid),
                .a(mul_a), .b(mul_b), .funct3(mul_f3),
                .result(mul_result), .done(mul_done));

    // DIV test (signed quotient only: is_signed=1, rem_sel=0)
    reg         div_start;
    reg  [31:0] div_dvd, div_dvs;
    wire [31:0] div_result;
    wire        div_done;
    div_unit du(.clk(clk), .rst(rst), .start(div_start),
                .dividend(div_dvd), .divisor(div_dvs),
                .is_signed(1'b1), .rem_sel(1'b0),
                .result(div_result), .done(div_done));

    integer wait_cycles;    // bounded-wait counter for the divider

    initial begin
        $dumpfile("tb_m_ext.vcd"); $dumpvars(0, tb_m_ext);
        mul_valid = 0; div_start = 0;
        mul_a = 0; mul_b = 0; mul_f3 = 0;
        div_dvd = 0; div_dvs = 0;

        // Hold reset across two rising edges, release it between edges
        @(posedge clk); @(posedge clk);
        @(negedge clk); rst = 0;

        // MUL: 7 * 6 = 42
        mul_a = 32'd7; mul_b = 32'd6; mul_f3 = 3'b000; mul_valid = 1;
        @(negedge clk); mul_valid = 0;   // first rising edge has captured valid=1
        @(negedge clk);                  // second rising edge: result/done registered
        if (mul_done === 1'b1 && mul_result === 32'd42) $display("PASS: MUL 7*6=42");
        else $display("FAIL: MUL result=%0d", mul_result);

        // DIV: 100 / 7 = 14
        div_dvd = 32'd100; div_dvs = 32'd7; div_start = 1;
        @(negedge clk); div_start = 0;
        // The divider needs 32 iteration cycles; allow twice that before giving up
        wait_cycles = 0;
        while (div_done !== 1'b1 && wait_cycles < 64) begin
            @(negedge clk);
            wait_cycles = wait_cycles + 1;
        end
        if (div_done !== 1'b1) $display("FAIL: DIV timed out waiting for done");
        else if (div_result === 32'd14) $display("PASS: DIV 100/7=14");
        else $display("FAIL: DIV result=%0d", div_result);

        $finish;
    end
endmodule
RV32M adds 8 integer multiply/divide instructions: MUL, MULH, MULHSU, MULHU, DIV, DIVU, REM, REMU. All use opcode 0110011 with funct7=0000001 to distinguish them from base R-type instructions.
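The restoring divider iterates 32 times, once per quotient bit. Each cycle it shifts the partial remainder left by one, brings in the next dividend bit (most significant first), and subtracts the divisor. If the difference is non-negative, the quotient bit is 1 and the difference becomes the new partial remainder; otherwise the quotient bit is 0 and the shifted remainder is kept unchanged (the "restore"). After 32 steps both quotient and remainder are ready.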
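A 32x32 combinational multiplier has a long delay through 32 partial-product adder levels. Pipelining it into 2 stages can roughly halve the critical path depth, allowing the overall CPU to run at a higher clock frequency. Note that in the `mul_unit` shown here the whole product is computed before the first pipeline register and the second stage only selects a 32-bit slice, so the timing benefit depends on the synthesis tool retiming the registers into the multiplier array.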