par2serial-cc / src /optimize.c
clarenceleo's picture
Add optimize.c - complete optimization pipeline with all passes
c8426e4 verified
Raw
History Blame Contribute Delete
26 kB
/*
* par2serial-cc: optimize.c - Optimization passes implementation
*
* Pass 1: Parallel pattern detection
* Pass 2: Scalar optimizations (constant fold, DCE, strength reduce, CSE)
* Pass 3: Loop optimizations (tiling, unrolling, interchange, fusion)
* Pass 4: SIMD vectorization
* Pass 5: Memory optimizations (prefetch, alignment)
*/
#include "optimize.h"
/* ══════════════════════════════════════════════════════════
* SIMD utilities
* ══════════════════════════════════════════════════════════ */
int simd_width(SIMDTarget target, TypeKind elem_type) {
int elem_bytes;
switch (elem_type) {
case TYPE_FLOAT: elem_bytes = 4; break;
case TYPE_DOUBLE: elem_bytes = 8; break;
case TYPE_INT: elem_bytes = 4; break;
case TYPE_CHAR: elem_bytes = 1; break;
case TYPE_SHORT: elem_bytes = 2; break;
default: elem_bytes = 4; break;
}
int vec_bytes;
switch (target) {
case SIMD_SSE: vec_bytes = 16; break;
case SIMD_AVX:
case SIMD_AVX2: vec_bytes = 32; break;
case SIMD_AVX512: vec_bytes = 64; break;
case SIMD_NEON: vec_bytes = 16; break;
default: return 1;
}
return vec_bytes / elem_bytes;
}
const char *simd_target_str(SIMDTarget t) {
switch (t) {
case SIMD_SSE: return "SSE4.2";
case SIMD_AVX: return "AVX";
case SIMD_AVX2: return "AVX2+FMA";
case SIMD_AVX512: return "AVX-512";
case SIMD_NEON: return "NEON";
default: return "scalar";
}
}
/* ══════════════════════════════════════════════════════════
* Pass 1: Parallel Pattern Detection
* ══════════════════════════════════════════════════════════ */
ParLoopVec detect_parallel_patterns(IRBlock *bb) {
ParLoopVec loops;
vec_init(&loops);
for (size_t i = 0; i < bb->insts.len; i++) {
IRInst *inst = bb->insts.data[i];
if (inst->op == IR_PAR_FOR_BEGIN) {
ParLoopInfo info = {0};
info.start_inst = (int)i;
info.iter_var = inst->par.iter_var;
info.iter_reg = inst->dst.reg;
info.lo_reg = inst->par.lo_reg;
info.hi_reg = inst->par.hi_reg;
info.is_reduce = false;
info.can_vectorize = true;
info.has_dependency = false;
info.trip_count = -1;
info.elem_type = TYPE_FLOAT;
/* Find matching end */
int depth = 1;
for (size_t j = i + 1; j < bb->insts.len; j++) {
if (bb->insts.data[j]->op == IR_PAR_FOR_BEGIN) depth++;
if (bb->insts.data[j]->op == IR_PAR_FOR_END) {
depth--;
if (depth == 0) { info.end_inst = (int)j; break; }
}
}
/* Analyze loop body for dependencies */
bool has_store = false;
bool has_call = false;
for (int j = info.start_inst; j <= info.end_inst; j++) {
IRInst *bi = bb->insts.data[j];
if (bi->op == IR_STORE) has_store = true;
if (bi->op == IR_CALL) has_call = true;
}
if (has_call) info.can_vectorize = false;
vec_push(&loops, info);
}
if (inst->op == IR_PAR_REDUCE_BEGIN) {
ParLoopInfo info = {0};
info.start_inst = (int)i;
info.iter_var = inst->par.iter_var;
info.accum_reg = inst->par.accum_reg;
info.reduce_op = inst->par.reduce_op;
info.is_reduce = true;
info.can_vectorize = true;
info.elem_type = TYPE_FLOAT;
int depth = 1;
for (size_t j = i + 1; j < bb->insts.len; j++) {
if (bb->insts.data[j]->op == IR_PAR_REDUCE_BEGIN) depth++;
if (bb->insts.data[j]->op == IR_PAR_REDUCE_END) {
depth--;
if (depth == 0) { info.end_inst = (int)j; break; }
}
}
vec_push(&loops, info);
}
if (inst->op == IR_PAR_SCAN_BEGIN) {
ParLoopInfo info = {0};
info.start_inst = (int)i;
info.is_scan = true;
info.can_vectorize = false; /* scans are harder to vectorize */
int depth = 1;
for (size_t j = i + 1; j < bb->insts.len; j++) {
if (bb->insts.data[j]->op == IR_PAR_SCAN_BEGIN) depth++;
if (bb->insts.data[j]->op == IR_PAR_SCAN_END) {
depth--;
if (depth == 0) { info.end_inst = (int)j; break; }
}
}
vec_push(&loops, info);
}
}
return loops;
}
/* ══════════════════════════════════════════════════════════
* Pass 2: Scalar Optimizations
* ══════════════════════════════════════════════════════════ */
/* ── Constant folding ────────────────────────────────────── */
void opt_constant_fold(IRBlock *bb, Arena *arena) {
/* Track known constants: reg -> value */
int64_t *known_int = (int64_t *)calloc(4096, sizeof(int64_t));
bool *is_known = (bool *)calloc(4096, sizeof(bool));
for (size_t i = 0; i < bb->insts.len; i++) {
IRInst *inst = bb->insts.data[i];
if (inst->op == IR_LOAD_IMM && inst->dst.kind == VAL_REG) {
int r = inst->dst.reg;
if (r < 4096) {
known_int[r] = inst->src1.int_val;
is_known[r] = true;
}
}
/* Try to fold binary ops on known constants */
if (inst->op >= IR_ADD && inst->op <= IR_MOD) {
int64_t a = 0, b_val = 0;
bool a_known = false, b_known = false;
if (inst->src1.kind == VAL_INT) { a = inst->src1.int_val; a_known = true; }
else if (inst->src1.kind == VAL_REG && inst->src1.reg < 4096 && is_known[inst->src1.reg]) {
a = known_int[inst->src1.reg]; a_known = true;
}
if (inst->src2.kind == VAL_INT) { b_val = inst->src2.int_val; b_known = true; }
else if (inst->src2.kind == VAL_REG && inst->src2.reg < 4096 && is_known[inst->src2.reg]) {
b_val = known_int[inst->src2.reg]; b_known = true;
}
if (a_known && b_known) {
int64_t result = 0;
switch (inst->op) {
case IR_ADD: result = a + b_val; break;
case IR_SUB: result = a - b_val; break;
case IR_MUL: result = a * b_val; break;
case IR_DIV: result = b_val != 0 ? a / b_val : 0; break;
case IR_MOD: result = b_val != 0 ? a % b_val : 0; break;
default: continue;
}
/* Replace with constant load */
inst->op = IR_LOAD_IMM;
inst->src1 = ir_int(result);
inst->src2 = ir_none();
if (inst->dst.reg < 4096) {
known_int[inst->dst.reg] = result;
is_known[inst->dst.reg] = true;
}
}
}
/* Algebraic simplifications */
if (inst->op == IR_ADD || inst->op == IR_FADD) {
/* x + 0 = x */
if (inst->src2.kind == VAL_INT && inst->src2.int_val == 0) {
inst->op = IR_MOVE;
inst->src2 = ir_none();
}
}
if (inst->op == IR_MUL || inst->op == IR_FMUL) {
/* x * 1 = x */
if (inst->src2.kind == VAL_INT && inst->src2.int_val == 1) {
inst->op = IR_MOVE;
inst->src2 = ir_none();
}
/* x * 0 = 0 */
if (inst->src2.kind == VAL_INT && inst->src2.int_val == 0) {
inst->op = IR_LOAD_IMM;
inst->src1 = ir_int(0);
inst->src2 = ir_none();
}
}
if (inst->op == IR_MUL) {
/* x * 2 → x << 1 (strength reduction) */
if (inst->src2.kind == VAL_INT && inst->src2.int_val == 2) {
inst->op = IR_SHL;
inst->src2 = ir_int(1);
}
/* x * power-of-2 → x << log2 */
if (inst->src2.kind == VAL_INT && inst->src2.int_val > 0) {
int64_t v = inst->src2.int_val;
if ((v & (v - 1)) == 0) {
int shift = 0;
while ((1LL << shift) < v) shift++;
inst->op = IR_SHL;
inst->src2 = ir_int(shift);
}
}
}
}
free(known_int);
free(is_known);
}
/* ── Dead code elimination ───────────────────────────────── */
void opt_dead_code_eliminate(IRBlock *bb) {
/* Simple: mark registers that are used, remove unused defs */
bool *used = (bool *)calloc(4096, sizeof(bool));
/* Pass 1: mark used registers */
for (size_t i = 0; i < bb->insts.len; i++) {
IRInst *inst = bb->insts.data[i];
if (inst->src1.kind == VAL_REG && inst->src1.reg < 4096)
used[inst->src1.reg] = true;
if (inst->src2.kind == VAL_REG && inst->src2.reg < 4096)
used[inst->src2.reg] = true;
if (inst->src3.kind == VAL_REG && inst->src3.reg < 4096)
used[inst->src3.reg] = true;
}
/* Pass 2: remove dead definitions (but keep side-effectful ops) */
for (size_t i = 0; i < bb->insts.len; i++) {
IRInst *inst = bb->insts.data[i];
if (inst->dst.kind == VAL_REG && inst->dst.reg < 4096 && !used[inst->dst.reg]) {
/* Safe to remove if no side effects */
if (inst->op == IR_ADD || inst->op == IR_SUB || inst->op == IR_MUL ||
inst->op == IR_DIV || inst->op == IR_MOD || inst->op == IR_LOAD_IMM ||
inst->op == IR_LOAD_FIMM || inst->op == IR_MOVE ||
inst->op == IR_FADD || inst->op == IR_FSUB ||
inst->op == IR_FMUL || inst->op == IR_FDIV) {
inst->op = IR_NOP;
}
}
}
free(used);
}
/* ── Strength reduction ──────────────────────────────────── */
void opt_strength_reduce(IRBlock *bb, Arena *arena) {
for (size_t i = 0; i < bb->insts.len; i++) {
IRInst *inst = bb->insts.data[i];
/* Division by power of 2 → shift */
if (inst->op == IR_DIV && inst->src2.kind == VAL_INT) {
int64_t v = inst->src2.int_val;
if (v > 0 && (v & (v - 1)) == 0) {
int shift = 0;
while ((1LL << shift) < v) shift++;
inst->op = IR_SHR;
inst->src2 = ir_int(shift);
}
}
/* Modulo by power of 2 → bitwise AND */
if (inst->op == IR_MOD && inst->src2.kind == VAL_INT) {
int64_t v = inst->src2.int_val;
if (v > 0 && (v & (v - 1)) == 0) {
inst->op = IR_AND;
inst->src2 = ir_int(v - 1);
}
}
}
}
/* ── Common subexpression elimination (basic) ────────────── */
void opt_cse(IRBlock *bb, Arena *arena) {
/* Simple hash-based CSE within a basic block */
/* For each arithmetic op, check if we've seen the same (op, src1, src2) before */
typedef struct { IROp op; int s1; int s2; int result; } CSEEntry;
CSEEntry entries[1024];
int entry_count = 0;
for (size_t i = 0; i < bb->insts.len; i++) {
IRInst *inst = bb->insts.data[i];
if (inst->op >= IR_ADD && inst->op <= IR_MOD &&
inst->src1.kind == VAL_REG && inst->src2.kind == VAL_REG) {
/* Look for matching previous computation */
for (int j = 0; j < entry_count; j++) {
if (entries[j].op == inst->op &&
entries[j].s1 == inst->src1.reg &&
entries[j].s2 == inst->src2.reg) {
/* Replace with move from previous result */
inst->op = IR_MOVE;
inst->src1 = ir_reg(entries[j].result, inst->dst.type);
inst->src2 = ir_none();
goto next_inst;
}
}
if (entry_count < 1024 && inst->dst.kind == VAL_REG) {
entries[entry_count].op = inst->op;
entries[entry_count].s1 = inst->src1.reg;
entries[entry_count].s2 = inst->src2.reg;
entries[entry_count].result = inst->dst.reg;
entry_count++;
}
}
next_inst:;
/* Invalidate on stores and calls */
if (inst->op == IR_STORE || inst->op == IR_CALL)
entry_count = 0;
}
}
/* ══════════════════════════════════════════════════════════
* Pass 3: Loop Optimizations
* ══════════════════════════════════════════════════════════ */
void opt_loop_tile(IRBlock *bb, ParLoopInfo *loop, int tile_size, Arena *arena) {
if (tile_size <= 0) tile_size = 32; /* default */
/* Insert tiling markers before the parallel loop */
/* The code generator will use these to emit tiled loops */
IRInst *marker = ir_emit(arena, bb, IR_COMMENT, ir_none(), ir_none(), ir_none());
marker->comment = arena_strdup(arena, "TILED");
/* Mark the loop info */
loop->trip_count = tile_size; /* will be used by codegen */
if (loop->start_inst >= 0 && loop->start_inst < (int)bb->insts.len) {
IRInst *begin = bb->insts.data[loop->start_inst];
begin->comment = "TILED_LOOP";
}
}
void opt_loop_unroll(IRBlock *bb, ParLoopInfo *loop, int factor, Arena *arena) {
if (factor <= 0) factor = 4; /* default unroll factor */
/* Mark for unrolling - codegen will handle the actual unrolling */
if (loop->start_inst >= 0 && loop->start_inst < (int)bb->insts.len) {
IRInst *begin = bb->insts.data[loop->start_inst];
char buf[64];
snprintf(buf, sizeof(buf), "UNROLL_%d", factor);
begin->comment = arena_strdup(arena, buf);
}
}
void opt_loop_interchange(IRBlock *bb, Arena *arena) {
/* Look for nested parallel loops where interchange improves locality */
/* For now, mark candidates for the codegen */
for (size_t i = 0; i < bb->insts.len; i++) {
if (bb->insts.data[i]->op == IR_PAR_FOR_BEGIN) {
/* Check for nested parallel for */
for (size_t j = i + 1; j < bb->insts.len; j++) {
if (bb->insts.data[j]->op == IR_PAR_FOR_BEGIN) {
/* Found nested loop - check if interchange is beneficial */
/* Heuristic: if inner loop has stride-1 access, don't interchange */
/* If outer loop has stride-1, interchange */
ir_emit_comment(arena, bb, "INTERCHANGE_CANDIDATE");
break;
}
if (bb->insts.data[j]->op == IR_PAR_FOR_END) break;
}
}
}
}
void opt_loop_fuse(IRBlock *bb, Arena *arena) {
/* Fuse adjacent parallel_for loops with same bounds */
for (size_t i = 0; i < bb->insts.len; i++) {
if (bb->insts.data[i]->op == IR_PAR_FOR_END) {
/* Look for next PAR_FOR_BEGIN */
for (size_t j = i + 1; j < bb->insts.len; j++) {
IRInst *next = bb->insts.data[j];
if (next->op == IR_NOP || next->op == IR_COMMENT) continue;
if (next->op == IR_PAR_FOR_BEGIN) {
/* Check if bounds match (same lo/hi registers) */
ir_emit_comment(arena, bb, "FUSE_CANDIDATE");
}
break;
}
}
}
}
/* ══════════════════════════════════════════════════════════
* Pass 4: SIMD Vectorization
* ══════════════════════════════════════════════════════════ */
void opt_vectorize(IRBlock *bb, ParLoopInfo *loop, SIMDTarget target, Arena *arena) {
if (!loop->can_vectorize || target == SIMD_NONE) return;
int width = simd_width(target, loop->elem_type);
if (width <= 1) return;
/* Mark vectorization on the loop */
char buf[128];
snprintf(buf, sizeof(buf), "VECTORIZE_%s_WIDTH_%d",
simd_target_str(target), width);
if (loop->start_inst >= 0 && loop->start_inst < (int)bb->insts.len) {
bb->insts.data[loop->start_inst]->comment = arena_strdup(arena, buf);
}
/* Transform loads/stores in loop body to SIMD operations */
for (int i = loop->start_inst; i <= loop->end_inst && i < (int)bb->insts.len; i++) {
IRInst *inst = bb->insts.data[i];
switch (inst->op) {
case IR_LOAD:
inst->op = IR_SIMD_LOAD;
inst->dst.simd_width = width;
break;
case IR_STORE:
inst->op = IR_SIMD_STORE;
inst->src1.simd_width = width;
break;
case IR_FADD:
inst->op = IR_SIMD_ADD;
inst->dst.simd_width = width;
inst->src1.simd_width = width;
inst->src2.simd_width = width;
break;
case IR_FSUB:
inst->op = IR_SIMD_SUB;
inst->dst.simd_width = width;
inst->src1.simd_width = width;
inst->src2.simd_width = width;
break;
case IR_FMUL:
inst->op = IR_SIMD_MUL;
inst->dst.simd_width = width;
inst->src1.simd_width = width;
inst->src2.simd_width = width;
break;
default:
break;
}
}
}
void opt_vectorize_reduce(IRBlock *bb, ParLoopInfo *loop, SIMDTarget target, Arena *arena) {
if (!loop->is_reduce || target == SIMD_NONE) return;
int width = simd_width(target, loop->elem_type);
if (width <= 1) return;
/* Transform reduction:
* scalar: sum += a[i]
* simd: vsum = simd_add(vsum, simd_load(&a[i]))
* ... after loop: sum = horizontal_add(vsum)
*/
char buf[128];
snprintf(buf, sizeof(buf), "VECTORIZE_REDUCE_%s_WIDTH_%d",
simd_target_str(target), width);
if (loop->start_inst >= 0 && loop->start_inst < (int)bb->insts.len) {
bb->insts.data[loop->start_inst]->comment = arena_strdup(arena, buf);
}
/* Transform the accumulation in loop body */
for (int i = loop->start_inst; i <= loop->end_inst && i < (int)bb->insts.len; i++) {
IRInst *inst = bb->insts.data[i];
/* Look for add to accumulator */
if ((inst->op == IR_FADD || inst->op == IR_ADD) &&
inst->dst.kind == VAL_REG && inst->dst.reg == loop->accum_reg) {
inst->op = IR_SIMD_ADD;
inst->dst.simd_width = width;
}
if (inst->op == IR_LOAD && inst->dst.simd_width == 0) {
/* Promote to SIMD load inside reduction */
inst->op = IR_SIMD_LOAD;
inst->dst.simd_width = width;
}
}
/* Insert horizontal reduction after loop */
if (loop->end_inst < (int)bb->insts.len) {
int hadd_reg = loop->accum_reg;
IRInst *hadd = ir_emit(arena, bb, IR_SIMD_HADD,
ir_reg(hadd_reg, loop->elem_type),
ir_simd_reg(hadd_reg, loop->elem_type, width),
ir_none());
hadd->comment = "horizontal_reduce";
}
}
/* ══════════════════════════════════════════════════════════
* Pass 5: Memory Optimizations
* ══════════════════════════════════════════════════════════ */
void opt_insert_prefetch(IRBlock *bb, Arena *arena) {
/* Insert prefetch hints before loads in loops */
/* Look for load instructions that are likely to benefit from prefetching */
size_t orig_len = bb->insts.len;
for (size_t i = 0; i < orig_len; i++) {
IRInst *inst = bb->insts.data[i];
if ((inst->op == IR_LOAD || inst->op == IR_SIMD_LOAD) &&
inst->src1.kind == VAL_REG) {
/* Insert prefetch for next cache line */
ir_emit_comment(arena, bb, "PREFETCH_NEXT_LINE");
}
}
}
void opt_align_data(IRBlock *bb, Arena *arena) {
/* Mark SIMD loads/stores that could use aligned variants */
for (size_t i = 0; i < bb->insts.len; i++) {
IRInst *inst = bb->insts.data[i];
if (inst->op == IR_SIMD_LOAD || inst->op == IR_SIMD_STORE) {
inst->comment = "ALIGN_CANDIDATE";
}
}
}
/* ══════════════════════════════════════════════════════════
* Main Optimization Pipeline
* ══════════════════════════════════════════════════════════ */
void optimize_module(IRModule *mod, OptContext *ctx) {
if (ctx->level == OPT_O0) {
if (ctx->report) p2s_note("optimization level O0: no optimizations applied");
return;
}
for (IRFunc *f = mod->functions; f; f = f->next) {
for (size_t bi = 0; bi < f->blocks.len; bi++) {
IRBlock *bb = f->blocks.data[bi];
/* ── O1: Scalar optimizations ────────────── */
if (ctx->level >= OPT_O1) {
opt_constant_fold(bb, ctx->arena);
opt_strength_reduce(bb, ctx->arena);
opt_cse(bb, ctx->arena);
opt_dead_code_eliminate(bb);
if (ctx->report)
p2s_note("O1: scalar optimizations applied to %s/%s",
f->name, bb->name);
}
/* ── O2: Loop + SIMD ─────────────────────── */
if (ctx->level >= OPT_O2) {
/* Detect parallel patterns */
ParLoopVec loops = detect_parallel_patterns(bb);
if (ctx->report)
p2s_note("O2: detected %zu parallel loop(s) in %s/%s",
loops.len, f->name, bb->name);
for (size_t li = 0; li < loops.len; li++) {
ParLoopInfo *loop = &loops.data[li];
/* Loop optimizations */
opt_loop_unroll(bb, loop, ctx->unroll_factor, ctx->arena);
/* SIMD vectorization */
if (loop->is_reduce) {
opt_vectorize_reduce(bb, loop, ctx->simd, ctx->arena);
if (ctx->report)
p2s_note(" vectorized reduction (width=%d)",
simd_width(ctx->simd, loop->elem_type));
} else if (loop->can_vectorize) {
opt_vectorize(bb, loop, ctx->simd, ctx->arena);
if (ctx->report)
p2s_note(" vectorized parallel_for (width=%d)",
simd_width(ctx->simd, loop->elem_type));
}
}
/* Inter-loop optimizations */
opt_loop_interchange(bb, ctx->arena);
opt_loop_fuse(bb, ctx->arena);
vec_free(&loops);
}
/* ── O3: Memory + aggressive ─────────────── */
if (ctx->level >= OPT_O3) {
ParLoopVec loops = detect_parallel_patterns(bb);
for (size_t li = 0; li < loops.len; li++) {
opt_loop_tile(bb, &loops.data[li],
ctx->tile_size > 0 ? ctx->tile_size : 32,
ctx->arena);
}
opt_insert_prefetch(bb, ctx->arena);
opt_align_data(bb, ctx->arena);
if (ctx->report)
p2s_note("O3: memory optimizations applied to %s/%s",
f->name, bb->name);
vec_free(&loops);
}
/* Remove barriers (serial execution is naturally ordered) */
for (size_t i = 0; i < bb->insts.len; i++) {
if (bb->insts.data[i]->op == IR_BARRIER) {
bb->insts.data[i]->op = IR_NOP;
bb->insts.data[i]->comment = "barrier removed (serial)";
if (ctx->report)
p2s_note(" removed barrier (serial execution)");
}
}
}
}
}