Add optimize.c - complete optimization pipeline with all passes

c8426e4 verified 2 months ago

26 kB

	/*
	* par2serial-cc: optimize.c - Optimization passes implementation
	*
	* Pass 1: Parallel pattern detection
	* Pass 2: Scalar optimizations (constant fold, DCE, strength reduce, CSE)
	* Pass 3: Loop optimizations (tiling, unrolling, interchange, fusion)
	* Pass 4: SIMD vectorization
	* Pass 5: Memory optimizations (prefetch, alignment)
	*/
	#include "optimize.h"

	/* ══════════════════════════════════════════════════════════
	* SIMD utilities
	* ══════════════════════════════════════════════════════════ */

	/*
	 * Number of `elem_type` lanes that fit in one vector register of `target`.
	 *
	 * SSE and NEON registers are 16 bytes, AVX/AVX2 are 32 and AVX-512 is 64.
	 * Any other target yields 1. Element types other than float, double, int,
	 * char and short are sized as 4 bytes.
	 */
	int simd_width(SIMDTarget target, TypeKind elem_type) {
	    int reg_bytes;
	    if (target == SIMD_SSE || target == SIMD_NEON) {
	        reg_bytes = 16;
	    } else if (target == SIMD_AVX || target == SIMD_AVX2) {
	        reg_bytes = 32;
	    } else if (target == SIMD_AVX512) {
	        reg_bytes = 64;
	    } else {
	        return 1;
	    }

	    /* float, int and every unlisted type share the 4-byte default. */
	    int lane_bytes = 4;
	    if (elem_type == TYPE_DOUBLE) {
	        lane_bytes = 8;
	    } else if (elem_type == TYPE_SHORT) {
	        lane_bytes = 2;
	    } else if (elem_type == TYPE_CHAR) {
	        lane_bytes = 1;
	    }

	    return reg_bytes / lane_bytes;
	}

	/*
	 * Display name for a SIMD target. Targets without an explicit name here
	 * are reported as "scalar". The returned string is a literal and must not
	 * be freed or modified.
	 */
	const char *simd_target_str(SIMDTarget t) {
	    if (t == SIMD_SSE) {
	        return "SSE4.2";
	    }
	    if (t == SIMD_AVX) {
	        return "AVX";
	    }
	    if (t == SIMD_AVX2) {
	        return "AVX2+FMA";
	    }
	    if (t == SIMD_AVX512) {
	        return "AVX-512";
	    }
	    if (t == SIMD_NEON) {
	        return "NEON";
	    }
	    return "scalar";
	}

	/* ══════════════════════════════════════════════════════════
	* Pass 1: Parallel Pattern Detection
	* ══════════════════════════════════════════════════════════ */

	ParLoopVec detect_parallel_patterns(IRBlock *bb) {
	ParLoopVec loops;
	vec_init(&loops);

	for (size_t i = 0; i < bb->insts.len; i++) {
	IRInst *inst = bb->insts.data[i];

	if (inst->op == IR_PAR_FOR_BEGIN) {
	ParLoopInfo info = {0};
	info.start_inst = (int)i;
	info.iter_var = inst->par.iter_var;
	info.iter_reg = inst->dst.reg;
	info.lo_reg = inst->par.lo_reg;
	info.hi_reg = inst->par.hi_reg;
	info.is_reduce = false;
	info.can_vectorize = true;
	info.has_dependency = false;
	info.trip_count = -1;
	info.elem_type = TYPE_FLOAT;

	/* Find matching end */
	int depth = 1;
	for (size_t j = i + 1; j < bb->insts.len; j++) {
	if (bb->insts.data[j]->op == IR_PAR_FOR_BEGIN) depth++;
	if (bb->insts.data[j]->op == IR_PAR_FOR_END) {
	depth--;
	if (depth == 0) { info.end_inst = (int)j; break; }
	}
	}

	/* Analyze loop body for dependencies */
	bool has_store = false;
	bool has_call = false;
	for (int j = info.start_inst; j <= info.end_inst; j++) {
	IRInst *bi = bb->insts.data[j];
	if (bi->op == IR_STORE) has_store = true;
	if (bi->op == IR_CALL) has_call = true;
	}
	if (has_call) info.can_vectorize = false;

	vec_push(&loops, info);
	}

	if (inst->op == IR_PAR_REDUCE_BEGIN) {
	ParLoopInfo info = {0};
	info.start_inst = (int)i;
	info.iter_var = inst->par.iter_var;
	info.accum_reg = inst->par.accum_reg;
	info.reduce_op = inst->par.reduce_op;
	info.is_reduce = true;
	info.can_vectorize = true;
	info.elem_type = TYPE_FLOAT;

	int depth = 1;
	for (size_t j = i + 1; j < bb->insts.len; j++) {
	if (bb->insts.data[j]->op == IR_PAR_REDUCE_BEGIN) depth++;
	if (bb->insts.data[j]->op == IR_PAR_REDUCE_END) {
	depth--;
	if (depth == 0) { info.end_inst = (int)j; break; }
	}
	}
	vec_push(&loops, info);
	}

	if (inst->op == IR_PAR_SCAN_BEGIN) {
	ParLoopInfo info = {0};
	info.start_inst = (int)i;
	info.is_scan = true;
	info.can_vectorize = false; /* scans are harder to vectorize */

	int depth = 1;
	for (size_t j = i + 1; j < bb->insts.len; j++) {
	if (bb->insts.data[j]->op == IR_PAR_SCAN_BEGIN) depth++;
	if (bb->insts.data[j]->op == IR_PAR_SCAN_END) {
	depth--;
	if (depth == 0) { info.end_inst = (int)j; break; }
	}
	}
	vec_push(&loops, info);
	}
	}

	return loops;
	}

	/* ══════════════════════════════════════════════════════════
	* Pass 2: Scalar Optimizations
	* ══════════════════════════════════════════════════════════ */

	/* ── Constant folding ────────────────────────────────────── */
	void opt_constant_fold(IRBlock bb, Arena arena) {
	/* Track known constants: reg -> value */
	int64_t known_int = (int64_t )calloc(4096, sizeof(int64_t));
	bool is_known = (bool )calloc(4096, sizeof(bool));

	for (size_t i = 0; i < bb->insts.len; i++) {
	IRInst *inst = bb->insts.data[i];

	if (inst->op == IR_LOAD_IMM && inst->dst.kind == VAL_REG) {
	int r = inst->dst.reg;
	if (r < 4096) {
	known_int[r] = inst->src1.int_val;
	is_known[r] = true;
	}
	}

	/* Try to fold binary ops on known constants */
	if (inst->op >= IR_ADD && inst->op <= IR_MOD) {
	int64_t a = 0, b_val = 0;
	bool a_known = false, b_known = false;

	if (inst->src1.kind == VAL_INT) { a = inst->src1.int_val; a_known = true; }
	else if (inst->src1.kind == VAL_REG && inst->src1.reg < 4096 && is_known[inst->src1.reg]) {
	a = known_int[inst->src1.reg]; a_known = true;
	}

	if (inst->src2.kind == VAL_INT) { b_val = inst->src2.int_val; b_known = true; }
	else if (inst->src2.kind == VAL_REG && inst->src2.reg < 4096 && is_known[inst->src2.reg]) {
	b_val = known_int[inst->src2.reg]; b_known = true;
	}

	if (a_known && b_known) {
	int64_t result = 0;
	switch (inst->op) {
	case IR_ADD: result = a + b_val; break;
	case IR_SUB: result = a - b_val; break;
	case IR_MUL: result = a * b_val; break;
	case IR_DIV: result = b_val != 0 ? a / b_val : 0; break;
	case IR_MOD: result = b_val != 0 ? a % b_val : 0; break;
	default: continue;
	}
	/* Replace with constant load */
	inst->op = IR_LOAD_IMM;
	inst->src1 = ir_int(result);
	inst->src2 = ir_none();
	if (inst->dst.reg < 4096) {
	known_int[inst->dst.reg] = result;
	is_known[inst->dst.reg] = true;
	}
	}
	}

	/* Algebraic simplifications */
	if (inst->op == IR_ADD \|\| inst->op == IR_FADD) {
	/* x + 0 = x */
	if (inst->src2.kind == VAL_INT && inst->src2.int_val == 0) {
	inst->op = IR_MOVE;
	inst->src2 = ir_none();
	}
	}
	if (inst->op == IR_MUL \|\| inst->op == IR_FMUL) {
	/* x * 1 = x */
	if (inst->src2.kind == VAL_INT && inst->src2.int_val == 1) {
	inst->op = IR_MOVE;
	inst->src2 = ir_none();
	}
	/* x * 0 = 0 */
	if (inst->src2.kind == VAL_INT && inst->src2.int_val == 0) {
	inst->op = IR_LOAD_IMM;
	inst->src1 = ir_int(0);
	inst->src2 = ir_none();
	}
	}
	if (inst->op == IR_MUL) {
	/* x * 2 → x << 1 (strength reduction) */
	if (inst->src2.kind == VAL_INT && inst->src2.int_val == 2) {
	inst->op = IR_SHL;
	inst->src2 = ir_int(1);
	}
	/* x * power-of-2 → x << log2 */
	if (inst->src2.kind == VAL_INT && inst->src2.int_val > 0) {
	int64_t v = inst->src2.int_val;
	if ((v & (v - 1)) == 0) {
	int shift = 0;
	while ((1LL << shift) < v) shift++;
	inst->op = IR_SHL;
	inst->src2 = ir_int(shift);
	}
	}
	}
	}
	free(known_int);
	free(is_known);
	}

	/* ── Dead code elimination ───────────────────────────────── */
	void opt_dead_code_eliminate(IRBlock *bb) {
	/* Simple: mark registers that are used, remove unused defs */
	bool used = (bool )calloc(4096, sizeof(bool));

	/* Pass 1: mark used registers */
	for (size_t i = 0; i < bb->insts.len; i++) {
	IRInst *inst = bb->insts.data[i];
	if (inst->src1.kind == VAL_REG && inst->src1.reg < 4096)
	used[inst->src1.reg] = true;
	if (inst->src2.kind == VAL_REG && inst->src2.reg < 4096)
	used[inst->src2.reg] = true;
	if (inst->src3.kind == VAL_REG && inst->src3.reg < 4096)
	used[inst->src3.reg] = true;
	}

	/* Pass 2: remove dead definitions (but keep side-effectful ops) */
	for (size_t i = 0; i < bb->insts.len; i++) {
	IRInst *inst = bb->insts.data[i];
	if (inst->dst.kind == VAL_REG && inst->dst.reg < 4096 && !used[inst->dst.reg]) {
	/* Safe to remove if no side effects */
	if (inst->op == IR_ADD \|\| inst->op == IR_SUB \|\| inst->op == IR_MUL \|\|
	inst->op == IR_DIV \|\| inst->op == IR_MOD \|\| inst->op == IR_LOAD_IMM \|\|
	inst->op == IR_LOAD_FIMM \|\| inst->op == IR_MOVE \|\|
	inst->op == IR_FADD \|\| inst->op == IR_FSUB \|\|
	inst->op == IR_FMUL \|\| inst->op == IR_FDIV) {
	inst->op = IR_NOP;
	}
	}
	}
	free(used);
	}

	/* ── Strength reduction ──────────────────────────────────── */
	/* log2 of a positive power of two. */
	static int sr_log2(int64_t v) {
	    int shift = 0;
	    while ((v >> shift) > 1) {
	        shift++;
	    }
	    return shift;
	}

	/*
	 * Strength reduction over one block, in place:
	 *
	 *   x / 2^k  →  x >> k      (IR_DIV → IR_SHR)
	 *   x % 2^k  →  x & (2^k-1) (IR_MOD → IR_AND)
	 *
	 * Only instructions whose second operand is a positive power-of-two
	 * immediate are rewritten. `arena` is unused: nothing is emitted.
	 *
	 * NOTE(review): both rewrites are exact only when x is non-negative.
	 * C-style division truncates toward zero, so -7 / 2 is -3 while an
	 * arithmetic -7 >> 1 is -4, and -7 % 4 is -3 while -7 & 3 is 1. Neither
	 * the signedness of the operand nor the semantics of IR_SHR is visible
	 * from this file — confirm before relying on this pass for signed values.
	 */
	void opt_strength_reduce(IRBlock *bb, Arena *arena) {
	    (void)arena;

	    for (size_t i = 0; i < bb->insts.len; i++) {
	        IRInst *inst = bb->insts.data[i];

	        if (inst->src2.kind != VAL_INT) {
	            continue;
	        }
	        int64_t divisor = inst->src2.int_val;
	        bool is_pow2 = divisor > 0 && (divisor & (divisor - 1)) == 0;
	        if (!is_pow2) {
	            continue;
	        }

	        if (inst->op == IR_DIV) {
	            /* Division by power of 2 → shift */
	            inst->op = IR_SHR;
	            inst->src2 = ir_int(sr_log2(divisor));
	        } else if (inst->op == IR_MOD) {
	            /* Modulo by power of 2 → bitwise AND with the low-bit mask */
	            inst->op = IR_AND;
	            inst->src2 = ir_int(divisor - 1);
	        }
	    }
	}

	/* ── Common subexpression elimination (basic) ────────────── */
	void opt_cse(IRBlock bb, Arena arena) {
	/* Simple hash-based CSE within a basic block */
	/* For each arithmetic op, check if we've seen the same (op, src1, src2) before */
	typedef struct { IROp op; int s1; int s2; int result; } CSEEntry;
	CSEEntry entries[1024];
	int entry_count = 0;

	for (size_t i = 0; i < bb->insts.len; i++) {
	IRInst *inst = bb->insts.data[i];
	if (inst->op >= IR_ADD && inst->op <= IR_MOD &&
	inst->src1.kind == VAL_REG && inst->src2.kind == VAL_REG) {

	/* Look for matching previous computation */
	for (int j = 0; j < entry_count; j++) {
	if (entries[j].op == inst->op &&
	entries[j].s1 == inst->src1.reg &&
	entries[j].s2 == inst->src2.reg) {
	/* Replace with move from previous result */
	inst->op = IR_MOVE;
	inst->src1 = ir_reg(entries[j].result, inst->dst.type);
	inst->src2 = ir_none();
	goto next_inst;
	}
	}
	if (entry_count < 1024 && inst->dst.kind == VAL_REG) {
	entries[entry_count].op = inst->op;
	entries[entry_count].s1 = inst->src1.reg;
	entries[entry_count].s2 = inst->src2.reg;
	entries[entry_count].result = inst->dst.reg;
	entry_count++;
	}
	}
	next_inst:;
	/* Invalidate on stores and calls */
	if (inst->op == IR_STORE \|\| inst->op == IR_CALL)
	entry_count = 0;
	}
	}

	/* ══════════════════════════════════════════════════════════
	* Pass 3: Loop Optimizations
	* ══════════════════════════════════════════════════════════ */

	/*
	 * Mark `loop` for tiling; the transformation itself is left to the code
	 * generator.
	 *
	 *  - Emits an IR_COMMENT instruction with the text "TILED" into `bb`.
	 *  - Overwrites loop->trip_count with the tile size (a non-positive
	 *    `tile_size` selects the default of 32).
	 *  - Sets the comment of the loop's BEGIN instruction to "TILED_LOOP".
	 *
	 * NOTE(review): the BEGIN instruction has a single comment slot, which
	 * opt_loop_unroll and opt_vectorize also write; whichever runs last wins,
	 * so tiling replaces an earlier UNROLL_/VECTORIZE_ annotation.
	 * NOTE(review): where ir_emit places the "TILED" marker relative to the
	 * loop is not visible from this file.
	 */
	void opt_loop_tile(IRBlock *bb, ParLoopInfo *loop, int tile_size, Arena *arena) {
	    if (tile_size <= 0) {
	        tile_size = 32; /* default */
	    }

	    /* Tiling marker for the code generator. */
	    IRInst *marker = ir_emit(arena, bb, IR_COMMENT, ir_none(), ir_none(), ir_none());
	    marker->comment = arena_strdup(arena, "TILED");

	    /* Carry the tile size to codegen through the loop descriptor. */
	    loop->trip_count = tile_size;

	    bool start_valid = loop->start_inst >= 0 &&
	                       loop->start_inst < (int)bb->insts.len;
	    if (start_valid) {
	        bb->insts.data[loop->start_inst]->comment = "TILED_LOOP";
	    }
	}

	/*
	 * Mark `loop` for unrolling by `factor` (non-positive selects the default
	 * of 4). The BEGIN instruction's comment is set to "UNROLL_<factor>",
	 * allocated from `arena`; the unrolling itself is left to the code
	 * generator. Nothing happens when loop->start_inst is not a valid index
	 * into the block.
	 */
	void opt_loop_unroll(IRBlock *bb, ParLoopInfo *loop, int factor, Arena *arena) {
	    if (factor <= 0) {
	        factor = 4; /* default unroll factor */
	    }

	    if (loop->start_inst < 0 || loop->start_inst >= (int)bb->insts.len) {
	        return;
	    }

	    char tag[64];
	    snprintf(tag, sizeof(tag), "UNROLL_%d", factor);
	    bb->insts.data[loop->start_inst]->comment = arena_strdup(arena, tag);
	}

	/*
	 * Flag nested parallel-for loops as interchange candidates.
	 *
	 * For every IR_PAR_FOR_BEGIN, the instructions that follow are scanned up
	 * to the first IR_PAR_FOR_END; if another IR_PAR_FOR_BEGIN is met first,
	 * one "INTERCHANGE_CANDIDATE" comment is emitted. No access-stride
	 * analysis is done here — the decision whether interchanging actually
	 * helps is left to whoever consumes the marker.
	 *
	 * The block length is captured up front so the scan covers only the
	 * instructions that existed on entry, not the comments emitted meanwhile.
	 */
	void opt_loop_interchange(IRBlock *bb, Arena *arena) {
	    size_t orig_len = bb->insts.len;

	    for (size_t outer = 0; outer < orig_len; outer++) {
	        if (bb->insts.data[outer]->op != IR_PAR_FOR_BEGIN) {
	            continue;
	        }

	        for (size_t inner = outer + 1; inner < orig_len; inner++) {
	            IROp op = bb->insts.data[inner]->op;
	            if (op == IR_PAR_FOR_BEGIN) {
	                /* A second BEGIN before any END: nested loop. */
	                ir_emit_comment(arena, bb, "INTERCHANGE_CANDIDATE");
	                break;
	            }
	            if (op == IR_PAR_FOR_END) {
	                break;
	            }
	        }
	    }
	}

	void opt_loop_fuse(IRBlock bb, Arena arena) {
	/* Fuse adjacent parallel_for loops with same bounds */
	for (size_t i = 0; i < bb->insts.len; i++) {
	if (bb->insts.data[i]->op == IR_PAR_FOR_END) {
	/* Look for next PAR_FOR_BEGIN */
	for (size_t j = i + 1; j < bb->insts.len; j++) {
	IRInst *next = bb->insts.data[j];
	if (next->op == IR_NOP \|\| next->op == IR_COMMENT) continue;
	if (next->op == IR_PAR_FOR_BEGIN) {
	/* Check if bounds match (same lo/hi registers) */
	ir_emit_comment(arena, bb, "FUSE_CANDIDATE");
	}
	break;
	}
	}
	}
	}

	/* ══════════════════════════════════════════════════════════
	* Pass 4: SIMD Vectorization
	* ══════════════════════════════════════════════════════════ */

	void opt_vectorize(IRBlock bb, ParLoopInfo loop, SIMDTarget target, Arena *arena) {
	if (!loop->can_vectorize \|\| target == SIMD_NONE) return;

	int width = simd_width(target, loop->elem_type);
	if (width <= 1) return;

	/* Mark vectorization on the loop */
	char buf[128];
	snprintf(buf, sizeof(buf), "VECTORIZE_%s_WIDTH_%d",
	simd_target_str(target), width);

	if (loop->start_inst >= 0 && loop->start_inst < (int)bb->insts.len) {
	bb->insts.data[loop->start_inst]->comment = arena_strdup(arena, buf);
	}

	/* Transform loads/stores in loop body to SIMD operations */
	for (int i = loop->start_inst; i <= loop->end_inst && i < (int)bb->insts.len; i++) {
	IRInst *inst = bb->insts.data[i];

	switch (inst->op) {
	case IR_LOAD:
	inst->op = IR_SIMD_LOAD;
	inst->dst.simd_width = width;
	break;
	case IR_STORE:
	inst->op = IR_SIMD_STORE;
	inst->src1.simd_width = width;
	break;
	case IR_FADD:
	inst->op = IR_SIMD_ADD;
	inst->dst.simd_width = width;
	inst->src1.simd_width = width;
	inst->src2.simd_width = width;
	break;
	case IR_FSUB:
	inst->op = IR_SIMD_SUB;
	inst->dst.simd_width = width;
	inst->src1.simd_width = width;
	inst->src2.simd_width = width;
	break;
	case IR_FMUL:
	inst->op = IR_SIMD_MUL;
	inst->dst.simd_width = width;
	inst->src1.simd_width = width;
	inst->src2.simd_width = width;
	break;
	default:
	break;
	}
	}
	}

	/*
	 * Vectorize a parallel-reduce region for `target`.
	 *
	 *   scalar: sum += a[i]
	 *   simd:   vsum = simd_add(vsum, simd_load(&a[i]))
	 *           ... then: sum = horizontal_add(vsum)
	 *
	 * Does nothing when the loop is not a reduction, the target is SIMD_NONE,
	 * the lane count for the loop's element type is 1 or less, or
	 * loop->start_inst is not a valid index into the block.
	 *
	 * Otherwise:
	 *  - the BEGIN instruction's comment is set to
	 *    "VECTORIZE_REDUCE_<target>_WIDTH_<n>";
	 *  - within [start_inst, end_inst], every IR_ADD / IR_FADD whose
	 *    destination is the accumulator register becomes IR_SIMD_ADD, and
	 *    every IR_LOAD not yet given a SIMD width becomes IR_SIMD_LOAD;
	 *  - an IR_SIMD_HADD from the vector accumulator to the scalar accumulator
	 *    is emitted.
	 *
	 * NOTE(review): loop->reduce_op is not consulted. Only additions are
	 * widened, yet the horizontal add is emitted regardless — verify this is
	 * never reached for a non-sum reduction.
	 * NOTE(review): the HADD is emitted through ir_emit; whether that places
	 * it directly after the loop is not visible from this file.
	 */
	void opt_vectorize_reduce(IRBlock *bb, ParLoopInfo *loop, SIMDTarget target, Arena *arena) {
	    if (!loop->is_reduce || target == SIMD_NONE) {
	        return;
	    }

	    int width = simd_width(target, loop->elem_type);
	    if (width <= 1) {
	        return;
	    }

	    /* A start index outside the block would otherwise be used to index
	     * the instruction array below. */
	    int inst_count = (int)bb->insts.len;
	    if (loop->start_inst < 0 || loop->start_inst >= inst_count) {
	        return;
	    }

	    char tag[128];
	    snprintf(tag, sizeof(tag), "VECTORIZE_REDUCE_%s_WIDTH_%d",
	             simd_target_str(target), width);
	    bb->insts.data[loop->start_inst]->comment = arena_strdup(arena, tag);

	    /* Transform the accumulation in loop body */
	    for (int i = loop->start_inst; i <= loop->end_inst && i < inst_count; i++) {
	        IRInst *inst = bb->insts.data[i];

	        bool is_add = inst->op == IR_FADD || inst->op == IR_ADD;
	        if (is_add && inst->dst.kind == VAL_REG &&
	            inst->dst.reg == loop->accum_reg) {
	            inst->op = IR_SIMD_ADD;
	            inst->dst.simd_width = width;
	        }
	        if (inst->op == IR_LOAD && inst->dst.simd_width == 0) {
	            /* Promote to SIMD load inside reduction */
	            inst->op = IR_SIMD_LOAD;
	            inst->dst.simd_width = width;
	        }
	    }

	    /* Collapse the vector accumulator back into the scalar one. */
	    if (loop->end_inst < inst_count) {
	        int acc = loop->accum_reg;
	        IRInst *hadd = ir_emit(arena, bb, IR_SIMD_HADD,
	                               ir_reg(acc, loop->elem_type),
	                               ir_simd_reg(acc, loop->elem_type, width),
	                               ir_none());
	        hadd->comment = "horizontal_reduce";
	    }
	}

	/* ══════════════════════════════════════════════════════════
	* Pass 5: Memory Optimizations
	* ══════════════════════════════════════════════════════════ */

	/*
	 * Emit one "PREFETCH_NEXT_LINE" comment for every IR_LOAD / IR_SIMD_LOAD
	 * in `bb` whose first source operand is a register.
	 *
	 * The block length is captured before the scan so that only instructions
	 * present on entry are examined, not the comments emitted meanwhile.
	 *
	 * NOTE(review): the marker does not name the load it belongs to, and where
	 * ir_emit_comment places it relative to that load is not visible from this
	 * file.
	 */
	void opt_insert_prefetch(IRBlock *bb, Arena *arena) {
	    size_t orig_len = bb->insts.len;

	    for (size_t i = 0; i < orig_len; i++) {
	        const IRInst *inst = bb->insts.data[i];
	        bool is_load = inst->op == IR_LOAD || inst->op == IR_SIMD_LOAD;

	        if (is_load && inst->src1.kind == VAL_REG) {
	            /* Request a prefetch of the next cache line */
	            ir_emit_comment(arena, bb, "PREFETCH_NEXT_LINE");
	        }
	    }
	}

	/*
	 * Tag every IR_SIMD_LOAD / IR_SIMD_STORE in `bb` with the comment
	 * "ALIGN_CANDIDATE", replacing any comment the instruction already had.
	 * `arena` is unused: the comment is a string literal.
	 */
	void opt_align_data(IRBlock *bb, Arena *arena) {
	    (void)arena;

	    for (size_t i = 0; i < bb->insts.len; i++) {
	        IRInst *inst = bb->insts.data[i];
	        if (inst->op == IR_SIMD_LOAD || inst->op == IR_SIMD_STORE) {
	            inst->comment = "ALIGN_CANDIDATE";
	        }
	    }
	}

	/* ══════════════════════════════════════════════════════════
	* Main Optimization Pipeline
	* ══════════════════════════════════════════════════════════ */

	void optimize_module(IRModule mod, OptContext ctx) {
	if (ctx->level == OPT_O0) {
	if (ctx->report) p2s_note("optimization level O0: no optimizations applied");
	return;
	}

	for (IRFunc *f = mod->functions; f; f = f->next) {
	for (size_t bi = 0; bi < f->blocks.len; bi++) {
	IRBlock *bb = f->blocks.data[bi];

	/* ── O1: Scalar optimizations ────────────── */
	if (ctx->level >= OPT_O1) {
	opt_constant_fold(bb, ctx->arena);
	opt_strength_reduce(bb, ctx->arena);
	opt_cse(bb, ctx->arena);
	opt_dead_code_eliminate(bb);
	if (ctx->report)
	p2s_note("O1: scalar optimizations applied to %s/%s",
	f->name, bb->name);
	}

	/* ── O2: Loop + SIMD ─────────────────────── */
	if (ctx->level >= OPT_O2) {
	/* Detect parallel patterns */
	ParLoopVec loops = detect_parallel_patterns(bb);

	if (ctx->report)
	p2s_note("O2: detected %zu parallel loop(s) in %s/%s",
	loops.len, f->name, bb->name);

	for (size_t li = 0; li < loops.len; li++) {
	ParLoopInfo *loop = &loops.data[li];

	/* Loop optimizations */
	opt_loop_unroll(bb, loop, ctx->unroll_factor, ctx->arena);

	/* SIMD vectorization */
	if (loop->is_reduce) {
	opt_vectorize_reduce(bb, loop, ctx->simd, ctx->arena);
	if (ctx->report)
	p2s_note(" vectorized reduction (width=%d)",
	simd_width(ctx->simd, loop->elem_type));
	} else if (loop->can_vectorize) {
	opt_vectorize(bb, loop, ctx->simd, ctx->arena);
	if (ctx->report)
	p2s_note(" vectorized parallel_for (width=%d)",
	simd_width(ctx->simd, loop->elem_type));
	}
	}

	/* Inter-loop optimizations */
	opt_loop_interchange(bb, ctx->arena);
	opt_loop_fuse(bb, ctx->arena);

	vec_free(&loops);
	}

	/* ── O3: Memory + aggressive ─────────────── */
	if (ctx->level >= OPT_O3) {
	ParLoopVec loops = detect_parallel_patterns(bb);

	for (size_t li = 0; li < loops.len; li++) {
	opt_loop_tile(bb, &loops.data[li],
	ctx->tile_size > 0 ? ctx->tile_size : 32,
	ctx->arena);
	}

	opt_insert_prefetch(bb, ctx->arena);
	opt_align_data(bb, ctx->arena);

	if (ctx->report)
	p2s_note("O3: memory optimizations applied to %s/%s",
	f->name, bb->name);

	vec_free(&loops);
	}

	/* Remove barriers (serial execution is naturally ordered) */
	for (size_t i = 0; i < bb->insts.len; i++) {
	if (bb->insts.data[i]->op == IR_BARRIER) {
	bb->insts.data[i]->op = IR_NOP;
	bb->insts.data[i]->comment = "barrier removed (serial)";
	if (ctx->report)
	p2s_note(" removed barrier (serial execution)");
	}
	}
	}
	}
	}