/* * par2serial-cc: optimize.c - Optimization passes implementation * * Pass 1: Parallel pattern detection * Pass 2: Scalar optimizations (constant fold, DCE, strength reduce, CSE) * Pass 3: Loop optimizations (tiling, unrolling, interchange, fusion) * Pass 4: SIMD vectorization * Pass 5: Memory optimizations (prefetch, alignment) */ #include "optimize.h" /* ══════════════════════════════════════════════════════════ * SIMD utilities * ══════════════════════════════════════════════════════════ */ int simd_width(SIMDTarget target, TypeKind elem_type) { int elem_bytes; switch (elem_type) { case TYPE_FLOAT: elem_bytes = 4; break; case TYPE_DOUBLE: elem_bytes = 8; break; case TYPE_INT: elem_bytes = 4; break; case TYPE_CHAR: elem_bytes = 1; break; case TYPE_SHORT: elem_bytes = 2; break; default: elem_bytes = 4; break; } int vec_bytes; switch (target) { case SIMD_SSE: vec_bytes = 16; break; case SIMD_AVX: case SIMD_AVX2: vec_bytes = 32; break; case SIMD_AVX512: vec_bytes = 64; break; case SIMD_NEON: vec_bytes = 16; break; default: return 1; } return vec_bytes / elem_bytes; } const char *simd_target_str(SIMDTarget t) { switch (t) { case SIMD_SSE: return "SSE4.2"; case SIMD_AVX: return "AVX"; case SIMD_AVX2: return "AVX2+FMA"; case SIMD_AVX512: return "AVX-512"; case SIMD_NEON: return "NEON"; default: return "scalar"; } } /* ══════════════════════════════════════════════════════════ * Pass 1: Parallel Pattern Detection * ══════════════════════════════════════════════════════════ */ ParLoopVec detect_parallel_patterns(IRBlock *bb) { ParLoopVec loops; vec_init(&loops); for (size_t i = 0; i < bb->insts.len; i++) { IRInst *inst = bb->insts.data[i]; if (inst->op == IR_PAR_FOR_BEGIN) { ParLoopInfo info = {0}; info.start_inst = (int)i; info.iter_var = inst->par.iter_var; info.iter_reg = inst->dst.reg; info.lo_reg = inst->par.lo_reg; info.hi_reg = inst->par.hi_reg; info.is_reduce = false; info.can_vectorize = true; info.has_dependency = false; info.trip_count = -1; info.elem_type = TYPE_FLOAT; /* Find matching end */ int depth = 1; for (size_t j = i + 1; j < bb->insts.len; j++) { if (bb->insts.data[j]->op == IR_PAR_FOR_BEGIN) depth++; if (bb->insts.data[j]->op == IR_PAR_FOR_END) { depth--; if (depth == 0) { info.end_inst = (int)j; break; } } } /* Analyze loop body for dependencies */ bool has_store = false; bool has_call = false; for (int j = info.start_inst; j <= info.end_inst; j++) { IRInst *bi = bb->insts.data[j]; if (bi->op == IR_STORE) has_store = true; if (bi->op == IR_CALL) has_call = true; } if (has_call) info.can_vectorize = false; vec_push(&loops, info); } if (inst->op == IR_PAR_REDUCE_BEGIN) { ParLoopInfo info = {0}; info.start_inst = (int)i; info.iter_var = inst->par.iter_var; info.accum_reg = inst->par.accum_reg; info.reduce_op = inst->par.reduce_op; info.is_reduce = true; info.can_vectorize = true; info.elem_type = TYPE_FLOAT; int depth = 1; for (size_t j = i + 1; j < bb->insts.len; j++) { if (bb->insts.data[j]->op == IR_PAR_REDUCE_BEGIN) depth++; if (bb->insts.data[j]->op == IR_PAR_REDUCE_END) { depth--; if (depth == 0) { info.end_inst = (int)j; break; } } } vec_push(&loops, info); } if (inst->op == IR_PAR_SCAN_BEGIN) { ParLoopInfo info = {0}; info.start_inst = (int)i; info.is_scan = true; info.can_vectorize = false; /* scans are harder to vectorize */ int depth = 1; for (size_t j = i + 1; j < bb->insts.len; j++) { if (bb->insts.data[j]->op == IR_PAR_SCAN_BEGIN) depth++; if (bb->insts.data[j]->op == IR_PAR_SCAN_END) { depth--; if (depth == 0) { info.end_inst = (int)j; break; } } } vec_push(&loops, info); } } return loops; } /* ══════════════════════════════════════════════════════════ * Pass 2: Scalar Optimizations * ══════════════════════════════════════════════════════════ */ /* ── Constant folding ────────────────────────────────────── */ void opt_constant_fold(IRBlock *bb, Arena *arena) { /* Track known constants: reg -> value */ int64_t *known_int = (int64_t *)calloc(4096, sizeof(int64_t)); bool *is_known = (bool *)calloc(4096, sizeof(bool)); for (size_t i = 0; i < bb->insts.len; i++) { IRInst *inst = bb->insts.data[i]; if (inst->op == IR_LOAD_IMM && inst->dst.kind == VAL_REG) { int r = inst->dst.reg; if (r < 4096) { known_int[r] = inst->src1.int_val; is_known[r] = true; } } /* Try to fold binary ops on known constants */ if (inst->op >= IR_ADD && inst->op <= IR_MOD) { int64_t a = 0, b_val = 0; bool a_known = false, b_known = false; if (inst->src1.kind == VAL_INT) { a = inst->src1.int_val; a_known = true; } else if (inst->src1.kind == VAL_REG && inst->src1.reg < 4096 && is_known[inst->src1.reg]) { a = known_int[inst->src1.reg]; a_known = true; } if (inst->src2.kind == VAL_INT) { b_val = inst->src2.int_val; b_known = true; } else if (inst->src2.kind == VAL_REG && inst->src2.reg < 4096 && is_known[inst->src2.reg]) { b_val = known_int[inst->src2.reg]; b_known = true; } if (a_known && b_known) { int64_t result = 0; switch (inst->op) { case IR_ADD: result = a + b_val; break; case IR_SUB: result = a - b_val; break; case IR_MUL: result = a * b_val; break; case IR_DIV: result = b_val != 0 ? a / b_val : 0; break; case IR_MOD: result = b_val != 0 ? a % b_val : 0; break; default: continue; } /* Replace with constant load */ inst->op = IR_LOAD_IMM; inst->src1 = ir_int(result); inst->src2 = ir_none(); if (inst->dst.reg < 4096) { known_int[inst->dst.reg] = result; is_known[inst->dst.reg] = true; } } } /* Algebraic simplifications */ if (inst->op == IR_ADD || inst->op == IR_FADD) { /* x + 0 = x */ if (inst->src2.kind == VAL_INT && inst->src2.int_val == 0) { inst->op = IR_MOVE; inst->src2 = ir_none(); } } if (inst->op == IR_MUL || inst->op == IR_FMUL) { /* x * 1 = x */ if (inst->src2.kind == VAL_INT && inst->src2.int_val == 1) { inst->op = IR_MOVE; inst->src2 = ir_none(); } /* x * 0 = 0 */ if (inst->src2.kind == VAL_INT && inst->src2.int_val == 0) { inst->op = IR_LOAD_IMM; inst->src1 = ir_int(0); inst->src2 = ir_none(); } } if (inst->op == IR_MUL) { /* x * 2 → x << 1 (strength reduction) */ if (inst->src2.kind == VAL_INT && inst->src2.int_val == 2) { inst->op = IR_SHL; inst->src2 = ir_int(1); } /* x * power-of-2 → x << log2 */ if (inst->src2.kind == VAL_INT && inst->src2.int_val > 0) { int64_t v = inst->src2.int_val; if ((v & (v - 1)) == 0) { int shift = 0; while ((1LL << shift) < v) shift++; inst->op = IR_SHL; inst->src2 = ir_int(shift); } } } } free(known_int); free(is_known); } /* ── Dead code elimination ───────────────────────────────── */ void opt_dead_code_eliminate(IRBlock *bb) { /* Simple: mark registers that are used, remove unused defs */ bool *used = (bool *)calloc(4096, sizeof(bool)); /* Pass 1: mark used registers */ for (size_t i = 0; i < bb->insts.len; i++) { IRInst *inst = bb->insts.data[i]; if (inst->src1.kind == VAL_REG && inst->src1.reg < 4096) used[inst->src1.reg] = true; if (inst->src2.kind == VAL_REG && inst->src2.reg < 4096) used[inst->src2.reg] = true; if (inst->src3.kind == VAL_REG && inst->src3.reg < 4096) used[inst->src3.reg] = true; } /* Pass 2: remove dead definitions (but keep side-effectful ops) */ for (size_t i = 0; i < bb->insts.len; i++) { IRInst *inst = bb->insts.data[i]; if (inst->dst.kind == VAL_REG && inst->dst.reg < 4096 && !used[inst->dst.reg]) { /* Safe to remove if no side effects */ if (inst->op == IR_ADD || inst->op == IR_SUB || inst->op == IR_MUL || inst->op == IR_DIV || inst->op == IR_MOD || inst->op == IR_LOAD_IMM || inst->op == IR_LOAD_FIMM || inst->op == IR_MOVE || inst->op == IR_FADD || inst->op == IR_FSUB || inst->op == IR_FMUL || inst->op == IR_FDIV) { inst->op = IR_NOP; } } } free(used); } /* ── Strength reduction ──────────────────────────────────── */ void opt_strength_reduce(IRBlock *bb, Arena *arena) { for (size_t i = 0; i < bb->insts.len; i++) { IRInst *inst = bb->insts.data[i]; /* Division by power of 2 → shift */ if (inst->op == IR_DIV && inst->src2.kind == VAL_INT) { int64_t v = inst->src2.int_val; if (v > 0 && (v & (v - 1)) == 0) { int shift = 0; while ((1LL << shift) < v) shift++; inst->op = IR_SHR; inst->src2 = ir_int(shift); } } /* Modulo by power of 2 → bitwise AND */ if (inst->op == IR_MOD && inst->src2.kind == VAL_INT) { int64_t v = inst->src2.int_val; if (v > 0 && (v & (v - 1)) == 0) { inst->op = IR_AND; inst->src2 = ir_int(v - 1); } } } } /* ── Common subexpression elimination (basic) ────────────── */ void opt_cse(IRBlock *bb, Arena *arena) { /* Simple hash-based CSE within a basic block */ /* For each arithmetic op, check if we've seen the same (op, src1, src2) before */ typedef struct { IROp op; int s1; int s2; int result; } CSEEntry; CSEEntry entries[1024]; int entry_count = 0; for (size_t i = 0; i < bb->insts.len; i++) { IRInst *inst = bb->insts.data[i]; if (inst->op >= IR_ADD && inst->op <= IR_MOD && inst->src1.kind == VAL_REG && inst->src2.kind == VAL_REG) { /* Look for matching previous computation */ for (int j = 0; j < entry_count; j++) { if (entries[j].op == inst->op && entries[j].s1 == inst->src1.reg && entries[j].s2 == inst->src2.reg) { /* Replace with move from previous result */ inst->op = IR_MOVE; inst->src1 = ir_reg(entries[j].result, inst->dst.type); inst->src2 = ir_none(); goto next_inst; } } if (entry_count < 1024 && inst->dst.kind == VAL_REG) { entries[entry_count].op = inst->op; entries[entry_count].s1 = inst->src1.reg; entries[entry_count].s2 = inst->src2.reg; entries[entry_count].result = inst->dst.reg; entry_count++; } } next_inst:; /* Invalidate on stores and calls */ if (inst->op == IR_STORE || inst->op == IR_CALL) entry_count = 0; } } /* ══════════════════════════════════════════════════════════ * Pass 3: Loop Optimizations * ══════════════════════════════════════════════════════════ */ void opt_loop_tile(IRBlock *bb, ParLoopInfo *loop, int tile_size, Arena *arena) { if (tile_size <= 0) tile_size = 32; /* default */ /* Insert tiling markers before the parallel loop */ /* The code generator will use these to emit tiled loops */ IRInst *marker = ir_emit(arena, bb, IR_COMMENT, ir_none(), ir_none(), ir_none()); marker->comment = arena_strdup(arena, "TILED"); /* Mark the loop info */ loop->trip_count = tile_size; /* will be used by codegen */ if (loop->start_inst >= 0 && loop->start_inst < (int)bb->insts.len) { IRInst *begin = bb->insts.data[loop->start_inst]; begin->comment = "TILED_LOOP"; } } void opt_loop_unroll(IRBlock *bb, ParLoopInfo *loop, int factor, Arena *arena) { if (factor <= 0) factor = 4; /* default unroll factor */ /* Mark for unrolling - codegen will handle the actual unrolling */ if (loop->start_inst >= 0 && loop->start_inst < (int)bb->insts.len) { IRInst *begin = bb->insts.data[loop->start_inst]; char buf[64]; snprintf(buf, sizeof(buf), "UNROLL_%d", factor); begin->comment = arena_strdup(arena, buf); } } void opt_loop_interchange(IRBlock *bb, Arena *arena) { /* Look for nested parallel loops where interchange improves locality */ /* For now, mark candidates for the codegen */ for (size_t i = 0; i < bb->insts.len; i++) { if (bb->insts.data[i]->op == IR_PAR_FOR_BEGIN) { /* Check for nested parallel for */ for (size_t j = i + 1; j < bb->insts.len; j++) { if (bb->insts.data[j]->op == IR_PAR_FOR_BEGIN) { /* Found nested loop - check if interchange is beneficial */ /* Heuristic: if inner loop has stride-1 access, don't interchange */ /* If outer loop has stride-1, interchange */ ir_emit_comment(arena, bb, "INTERCHANGE_CANDIDATE"); break; } if (bb->insts.data[j]->op == IR_PAR_FOR_END) break; } } } } void opt_loop_fuse(IRBlock *bb, Arena *arena) { /* Fuse adjacent parallel_for loops with same bounds */ for (size_t i = 0; i < bb->insts.len; i++) { if (bb->insts.data[i]->op == IR_PAR_FOR_END) { /* Look for next PAR_FOR_BEGIN */ for (size_t j = i + 1; j < bb->insts.len; j++) { IRInst *next = bb->insts.data[j]; if (next->op == IR_NOP || next->op == IR_COMMENT) continue; if (next->op == IR_PAR_FOR_BEGIN) { /* Check if bounds match (same lo/hi registers) */ ir_emit_comment(arena, bb, "FUSE_CANDIDATE"); } break; } } } } /* ══════════════════════════════════════════════════════════ * Pass 4: SIMD Vectorization * ══════════════════════════════════════════════════════════ */ void opt_vectorize(IRBlock *bb, ParLoopInfo *loop, SIMDTarget target, Arena *arena) { if (!loop->can_vectorize || target == SIMD_NONE) return; int width = simd_width(target, loop->elem_type); if (width <= 1) return; /* Mark vectorization on the loop */ char buf[128]; snprintf(buf, sizeof(buf), "VECTORIZE_%s_WIDTH_%d", simd_target_str(target), width); if (loop->start_inst >= 0 && loop->start_inst < (int)bb->insts.len) { bb->insts.data[loop->start_inst]->comment = arena_strdup(arena, buf); } /* Transform loads/stores in loop body to SIMD operations */ for (int i = loop->start_inst; i <= loop->end_inst && i < (int)bb->insts.len; i++) { IRInst *inst = bb->insts.data[i]; switch (inst->op) { case IR_LOAD: inst->op = IR_SIMD_LOAD; inst->dst.simd_width = width; break; case IR_STORE: inst->op = IR_SIMD_STORE; inst->src1.simd_width = width; break; case IR_FADD: inst->op = IR_SIMD_ADD; inst->dst.simd_width = width; inst->src1.simd_width = width; inst->src2.simd_width = width; break; case IR_FSUB: inst->op = IR_SIMD_SUB; inst->dst.simd_width = width; inst->src1.simd_width = width; inst->src2.simd_width = width; break; case IR_FMUL: inst->op = IR_SIMD_MUL; inst->dst.simd_width = width; inst->src1.simd_width = width; inst->src2.simd_width = width; break; default: break; } } } void opt_vectorize_reduce(IRBlock *bb, ParLoopInfo *loop, SIMDTarget target, Arena *arena) { if (!loop->is_reduce || target == SIMD_NONE) return; int width = simd_width(target, loop->elem_type); if (width <= 1) return; /* Transform reduction: * scalar: sum += a[i] * simd: vsum = simd_add(vsum, simd_load(&a[i])) * ... after loop: sum = horizontal_add(vsum) */ char buf[128]; snprintf(buf, sizeof(buf), "VECTORIZE_REDUCE_%s_WIDTH_%d", simd_target_str(target), width); if (loop->start_inst >= 0 && loop->start_inst < (int)bb->insts.len) { bb->insts.data[loop->start_inst]->comment = arena_strdup(arena, buf); } /* Transform the accumulation in loop body */ for (int i = loop->start_inst; i <= loop->end_inst && i < (int)bb->insts.len; i++) { IRInst *inst = bb->insts.data[i]; /* Look for add to accumulator */ if ((inst->op == IR_FADD || inst->op == IR_ADD) && inst->dst.kind == VAL_REG && inst->dst.reg == loop->accum_reg) { inst->op = IR_SIMD_ADD; inst->dst.simd_width = width; } if (inst->op == IR_LOAD && inst->dst.simd_width == 0) { /* Promote to SIMD load inside reduction */ inst->op = IR_SIMD_LOAD; inst->dst.simd_width = width; } } /* Insert horizontal reduction after loop */ if (loop->end_inst < (int)bb->insts.len) { int hadd_reg = loop->accum_reg; IRInst *hadd = ir_emit(arena, bb, IR_SIMD_HADD, ir_reg(hadd_reg, loop->elem_type), ir_simd_reg(hadd_reg, loop->elem_type, width), ir_none()); hadd->comment = "horizontal_reduce"; } } /* ══════════════════════════════════════════════════════════ * Pass 5: Memory Optimizations * ══════════════════════════════════════════════════════════ */ void opt_insert_prefetch(IRBlock *bb, Arena *arena) { /* Insert prefetch hints before loads in loops */ /* Look for load instructions that are likely to benefit from prefetching */ size_t orig_len = bb->insts.len; for (size_t i = 0; i < orig_len; i++) { IRInst *inst = bb->insts.data[i]; if ((inst->op == IR_LOAD || inst->op == IR_SIMD_LOAD) && inst->src1.kind == VAL_REG) { /* Insert prefetch for next cache line */ ir_emit_comment(arena, bb, "PREFETCH_NEXT_LINE"); } } } void opt_align_data(IRBlock *bb, Arena *arena) { /* Mark SIMD loads/stores that could use aligned variants */ for (size_t i = 0; i < bb->insts.len; i++) { IRInst *inst = bb->insts.data[i]; if (inst->op == IR_SIMD_LOAD || inst->op == IR_SIMD_STORE) { inst->comment = "ALIGN_CANDIDATE"; } } } /* ══════════════════════════════════════════════════════════ * Main Optimization Pipeline * ══════════════════════════════════════════════════════════ */ void optimize_module(IRModule *mod, OptContext *ctx) { if (ctx->level == OPT_O0) { if (ctx->report) p2s_note("optimization level O0: no optimizations applied"); return; } for (IRFunc *f = mod->functions; f; f = f->next) { for (size_t bi = 0; bi < f->blocks.len; bi++) { IRBlock *bb = f->blocks.data[bi]; /* ── O1: Scalar optimizations ────────────── */ if (ctx->level >= OPT_O1) { opt_constant_fold(bb, ctx->arena); opt_strength_reduce(bb, ctx->arena); opt_cse(bb, ctx->arena); opt_dead_code_eliminate(bb); if (ctx->report) p2s_note("O1: scalar optimizations applied to %s/%s", f->name, bb->name); } /* ── O2: Loop + SIMD ─────────────────────── */ if (ctx->level >= OPT_O2) { /* Detect parallel patterns */ ParLoopVec loops = detect_parallel_patterns(bb); if (ctx->report) p2s_note("O2: detected %zu parallel loop(s) in %s/%s", loops.len, f->name, bb->name); for (size_t li = 0; li < loops.len; li++) { ParLoopInfo *loop = &loops.data[li]; /* Loop optimizations */ opt_loop_unroll(bb, loop, ctx->unroll_factor, ctx->arena); /* SIMD vectorization */ if (loop->is_reduce) { opt_vectorize_reduce(bb, loop, ctx->simd, ctx->arena); if (ctx->report) p2s_note(" vectorized reduction (width=%d)", simd_width(ctx->simd, loop->elem_type)); } else if (loop->can_vectorize) { opt_vectorize(bb, loop, ctx->simd, ctx->arena); if (ctx->report) p2s_note(" vectorized parallel_for (width=%d)", simd_width(ctx->simd, loop->elem_type)); } } /* Inter-loop optimizations */ opt_loop_interchange(bb, ctx->arena); opt_loop_fuse(bb, ctx->arena); vec_free(&loops); } /* ── O3: Memory + aggressive ─────────────── */ if (ctx->level >= OPT_O3) { ParLoopVec loops = detect_parallel_patterns(bb); for (size_t li = 0; li < loops.len; li++) { opt_loop_tile(bb, &loops.data[li], ctx->tile_size > 0 ? ctx->tile_size : 32, ctx->arena); } opt_insert_prefetch(bb, ctx->arena); opt_align_data(bb, ctx->arena); if (ctx->report) p2s_note("O3: memory optimizations applied to %s/%s", f->name, bb->name); vec_free(&loops); } /* Remove barriers (serial execution is naturally ordered) */ for (size_t i = 0; i < bb->insts.len; i++) { if (bb->insts.data[i]->op == IR_BARRIER) { bb->insts.data[i]->op = IR_NOP; bb->insts.data[i]->comment = "barrier removed (serial)"; if (ctx->report) p2s_note(" removed barrier (serial execution)"); } } } } }