| |
| |
| |
| |
| |
| |
| |
| |
| |
| #include "optimize.h" |
|
|
| |
| |
| |
|
|
| int simd_width(SIMDTarget target, TypeKind elem_type) { |
| int elem_bytes; |
| switch (elem_type) { |
| case TYPE_FLOAT: elem_bytes = 4; break; |
| case TYPE_DOUBLE: elem_bytes = 8; break; |
| case TYPE_INT: elem_bytes = 4; break; |
| case TYPE_CHAR: elem_bytes = 1; break; |
| case TYPE_SHORT: elem_bytes = 2; break; |
| default: elem_bytes = 4; break; |
| } |
| int vec_bytes; |
| switch (target) { |
| case SIMD_SSE: vec_bytes = 16; break; |
| case SIMD_AVX: |
| case SIMD_AVX2: vec_bytes = 32; break; |
| case SIMD_AVX512: vec_bytes = 64; break; |
| case SIMD_NEON: vec_bytes = 16; break; |
| default: return 1; |
| } |
| return vec_bytes / elem_bytes; |
| } |
|
|
| const char *simd_target_str(SIMDTarget t) { |
| switch (t) { |
| case SIMD_SSE: return "SSE4.2"; |
| case SIMD_AVX: return "AVX"; |
| case SIMD_AVX2: return "AVX2+FMA"; |
| case SIMD_AVX512: return "AVX-512"; |
| case SIMD_NEON: return "NEON"; |
| default: return "scalar"; |
| } |
| } |
|
|
| |
| |
| |
|
|
| ParLoopVec detect_parallel_patterns(IRBlock *bb) { |
| ParLoopVec loops; |
| vec_init(&loops); |
|
|
| for (size_t i = 0; i < bb->insts.len; i++) { |
| IRInst *inst = bb->insts.data[i]; |
|
|
| if (inst->op == IR_PAR_FOR_BEGIN) { |
| ParLoopInfo info = {0}; |
| info.start_inst = (int)i; |
| info.iter_var = inst->par.iter_var; |
| info.iter_reg = inst->dst.reg; |
| info.lo_reg = inst->par.lo_reg; |
| info.hi_reg = inst->par.hi_reg; |
| info.is_reduce = false; |
| info.can_vectorize = true; |
| info.has_dependency = false; |
| info.trip_count = -1; |
| info.elem_type = TYPE_FLOAT; |
|
|
| |
| int depth = 1; |
| for (size_t j = i + 1; j < bb->insts.len; j++) { |
| if (bb->insts.data[j]->op == IR_PAR_FOR_BEGIN) depth++; |
| if (bb->insts.data[j]->op == IR_PAR_FOR_END) { |
| depth--; |
| if (depth == 0) { info.end_inst = (int)j; break; } |
| } |
| } |
|
|
| |
| bool has_store = false; |
| bool has_call = false; |
| for (int j = info.start_inst; j <= info.end_inst; j++) { |
| IRInst *bi = bb->insts.data[j]; |
| if (bi->op == IR_STORE) has_store = true; |
| if (bi->op == IR_CALL) has_call = true; |
| } |
| if (has_call) info.can_vectorize = false; |
|
|
| vec_push(&loops, info); |
| } |
|
|
| if (inst->op == IR_PAR_REDUCE_BEGIN) { |
| ParLoopInfo info = {0}; |
| info.start_inst = (int)i; |
| info.iter_var = inst->par.iter_var; |
| info.accum_reg = inst->par.accum_reg; |
| info.reduce_op = inst->par.reduce_op; |
| info.is_reduce = true; |
| info.can_vectorize = true; |
| info.elem_type = TYPE_FLOAT; |
|
|
| int depth = 1; |
| for (size_t j = i + 1; j < bb->insts.len; j++) { |
| if (bb->insts.data[j]->op == IR_PAR_REDUCE_BEGIN) depth++; |
| if (bb->insts.data[j]->op == IR_PAR_REDUCE_END) { |
| depth--; |
| if (depth == 0) { info.end_inst = (int)j; break; } |
| } |
| } |
| vec_push(&loops, info); |
| } |
|
|
| if (inst->op == IR_PAR_SCAN_BEGIN) { |
| ParLoopInfo info = {0}; |
| info.start_inst = (int)i; |
| info.is_scan = true; |
| info.can_vectorize = false; |
|
|
| int depth = 1; |
| for (size_t j = i + 1; j < bb->insts.len; j++) { |
| if (bb->insts.data[j]->op == IR_PAR_SCAN_BEGIN) depth++; |
| if (bb->insts.data[j]->op == IR_PAR_SCAN_END) { |
| depth--; |
| if (depth == 0) { info.end_inst = (int)j; break; } |
| } |
| } |
| vec_push(&loops, info); |
| } |
| } |
|
|
| return loops; |
| } |
|
|
| |
| |
| |
|
|
| |
| void opt_constant_fold(IRBlock *bb, Arena *arena) { |
| |
| int64_t *known_int = (int64_t *)calloc(4096, sizeof(int64_t)); |
| bool *is_known = (bool *)calloc(4096, sizeof(bool)); |
|
|
| for (size_t i = 0; i < bb->insts.len; i++) { |
| IRInst *inst = bb->insts.data[i]; |
|
|
| if (inst->op == IR_LOAD_IMM && inst->dst.kind == VAL_REG) { |
| int r = inst->dst.reg; |
| if (r < 4096) { |
| known_int[r] = inst->src1.int_val; |
| is_known[r] = true; |
| } |
| } |
|
|
| |
| if (inst->op >= IR_ADD && inst->op <= IR_MOD) { |
| int64_t a = 0, b_val = 0; |
| bool a_known = false, b_known = false; |
|
|
| if (inst->src1.kind == VAL_INT) { a = inst->src1.int_val; a_known = true; } |
| else if (inst->src1.kind == VAL_REG && inst->src1.reg < 4096 && is_known[inst->src1.reg]) { |
| a = known_int[inst->src1.reg]; a_known = true; |
| } |
|
|
| if (inst->src2.kind == VAL_INT) { b_val = inst->src2.int_val; b_known = true; } |
| else if (inst->src2.kind == VAL_REG && inst->src2.reg < 4096 && is_known[inst->src2.reg]) { |
| b_val = known_int[inst->src2.reg]; b_known = true; |
| } |
|
|
| if (a_known && b_known) { |
| int64_t result = 0; |
| switch (inst->op) { |
| case IR_ADD: result = a + b_val; break; |
| case IR_SUB: result = a - b_val; break; |
| case IR_MUL: result = a * b_val; break; |
| case IR_DIV: result = b_val != 0 ? a / b_val : 0; break; |
| case IR_MOD: result = b_val != 0 ? a % b_val : 0; break; |
| default: continue; |
| } |
| |
| inst->op = IR_LOAD_IMM; |
| inst->src1 = ir_int(result); |
| inst->src2 = ir_none(); |
| if (inst->dst.reg < 4096) { |
| known_int[inst->dst.reg] = result; |
| is_known[inst->dst.reg] = true; |
| } |
| } |
| } |
|
|
| |
| if (inst->op == IR_ADD || inst->op == IR_FADD) { |
| |
| if (inst->src2.kind == VAL_INT && inst->src2.int_val == 0) { |
| inst->op = IR_MOVE; |
| inst->src2 = ir_none(); |
| } |
| } |
| if (inst->op == IR_MUL || inst->op == IR_FMUL) { |
| |
| if (inst->src2.kind == VAL_INT && inst->src2.int_val == 1) { |
| inst->op = IR_MOVE; |
| inst->src2 = ir_none(); |
| } |
| |
| if (inst->src2.kind == VAL_INT && inst->src2.int_val == 0) { |
| inst->op = IR_LOAD_IMM; |
| inst->src1 = ir_int(0); |
| inst->src2 = ir_none(); |
| } |
| } |
| if (inst->op == IR_MUL) { |
| |
| if (inst->src2.kind == VAL_INT && inst->src2.int_val == 2) { |
| inst->op = IR_SHL; |
| inst->src2 = ir_int(1); |
| } |
| |
| if (inst->src2.kind == VAL_INT && inst->src2.int_val > 0) { |
| int64_t v = inst->src2.int_val; |
| if ((v & (v - 1)) == 0) { |
| int shift = 0; |
| while ((1LL << shift) < v) shift++; |
| inst->op = IR_SHL; |
| inst->src2 = ir_int(shift); |
| } |
| } |
| } |
| } |
| free(known_int); |
| free(is_known); |
| } |
|
|
| |
| void opt_dead_code_eliminate(IRBlock *bb) { |
| |
| bool *used = (bool *)calloc(4096, sizeof(bool)); |
|
|
| |
| for (size_t i = 0; i < bb->insts.len; i++) { |
| IRInst *inst = bb->insts.data[i]; |
| if (inst->src1.kind == VAL_REG && inst->src1.reg < 4096) |
| used[inst->src1.reg] = true; |
| if (inst->src2.kind == VAL_REG && inst->src2.reg < 4096) |
| used[inst->src2.reg] = true; |
| if (inst->src3.kind == VAL_REG && inst->src3.reg < 4096) |
| used[inst->src3.reg] = true; |
| } |
|
|
| |
| for (size_t i = 0; i < bb->insts.len; i++) { |
| IRInst *inst = bb->insts.data[i]; |
| if (inst->dst.kind == VAL_REG && inst->dst.reg < 4096 && !used[inst->dst.reg]) { |
| |
| if (inst->op == IR_ADD || inst->op == IR_SUB || inst->op == IR_MUL || |
| inst->op == IR_DIV || inst->op == IR_MOD || inst->op == IR_LOAD_IMM || |
| inst->op == IR_LOAD_FIMM || inst->op == IR_MOVE || |
| inst->op == IR_FADD || inst->op == IR_FSUB || |
| inst->op == IR_FMUL || inst->op == IR_FDIV) { |
| inst->op = IR_NOP; |
| } |
| } |
| } |
| free(used); |
| } |
|
|
| |
| void opt_strength_reduce(IRBlock *bb, Arena *arena) { |
| for (size_t i = 0; i < bb->insts.len; i++) { |
| IRInst *inst = bb->insts.data[i]; |
|
|
| |
| if (inst->op == IR_DIV && inst->src2.kind == VAL_INT) { |
| int64_t v = inst->src2.int_val; |
| if (v > 0 && (v & (v - 1)) == 0) { |
| int shift = 0; |
| while ((1LL << shift) < v) shift++; |
| inst->op = IR_SHR; |
| inst->src2 = ir_int(shift); |
| } |
| } |
|
|
| |
| if (inst->op == IR_MOD && inst->src2.kind == VAL_INT) { |
| int64_t v = inst->src2.int_val; |
| if (v > 0 && (v & (v - 1)) == 0) { |
| inst->op = IR_AND; |
| inst->src2 = ir_int(v - 1); |
| } |
| } |
| } |
| } |
|
|
| |
| void opt_cse(IRBlock *bb, Arena *arena) { |
| |
| |
| typedef struct { IROp op; int s1; int s2; int result; } CSEEntry; |
| CSEEntry entries[1024]; |
| int entry_count = 0; |
|
|
| for (size_t i = 0; i < bb->insts.len; i++) { |
| IRInst *inst = bb->insts.data[i]; |
| if (inst->op >= IR_ADD && inst->op <= IR_MOD && |
| inst->src1.kind == VAL_REG && inst->src2.kind == VAL_REG) { |
|
|
| |
| for (int j = 0; j < entry_count; j++) { |
| if (entries[j].op == inst->op && |
| entries[j].s1 == inst->src1.reg && |
| entries[j].s2 == inst->src2.reg) { |
| |
| inst->op = IR_MOVE; |
| inst->src1 = ir_reg(entries[j].result, inst->dst.type); |
| inst->src2 = ir_none(); |
| goto next_inst; |
| } |
| } |
| if (entry_count < 1024 && inst->dst.kind == VAL_REG) { |
| entries[entry_count].op = inst->op; |
| entries[entry_count].s1 = inst->src1.reg; |
| entries[entry_count].s2 = inst->src2.reg; |
| entries[entry_count].result = inst->dst.reg; |
| entry_count++; |
| } |
| } |
| next_inst:; |
| |
| if (inst->op == IR_STORE || inst->op == IR_CALL) |
| entry_count = 0; |
| } |
| } |
|
|
| |
| |
| |
|
|
| void opt_loop_tile(IRBlock *bb, ParLoopInfo *loop, int tile_size, Arena *arena) { |
| if (tile_size <= 0) tile_size = 32; |
|
|
| |
| |
| IRInst *marker = ir_emit(arena, bb, IR_COMMENT, ir_none(), ir_none(), ir_none()); |
| marker->comment = arena_strdup(arena, "TILED"); |
|
|
| |
| loop->trip_count = tile_size; |
|
|
| if (loop->start_inst >= 0 && loop->start_inst < (int)bb->insts.len) { |
| IRInst *begin = bb->insts.data[loop->start_inst]; |
| begin->comment = "TILED_LOOP"; |
| } |
| } |
|
|
| void opt_loop_unroll(IRBlock *bb, ParLoopInfo *loop, int factor, Arena *arena) { |
| if (factor <= 0) factor = 4; |
|
|
| |
| if (loop->start_inst >= 0 && loop->start_inst < (int)bb->insts.len) { |
| IRInst *begin = bb->insts.data[loop->start_inst]; |
| char buf[64]; |
| snprintf(buf, sizeof(buf), "UNROLL_%d", factor); |
| begin->comment = arena_strdup(arena, buf); |
| } |
| } |
|
|
| void opt_loop_interchange(IRBlock *bb, Arena *arena) { |
| |
| |
| for (size_t i = 0; i < bb->insts.len; i++) { |
| if (bb->insts.data[i]->op == IR_PAR_FOR_BEGIN) { |
| |
| for (size_t j = i + 1; j < bb->insts.len; j++) { |
| if (bb->insts.data[j]->op == IR_PAR_FOR_BEGIN) { |
| |
| |
| |
| ir_emit_comment(arena, bb, "INTERCHANGE_CANDIDATE"); |
| break; |
| } |
| if (bb->insts.data[j]->op == IR_PAR_FOR_END) break; |
| } |
| } |
| } |
| } |
|
|
| void opt_loop_fuse(IRBlock *bb, Arena *arena) { |
| |
| for (size_t i = 0; i < bb->insts.len; i++) { |
| if (bb->insts.data[i]->op == IR_PAR_FOR_END) { |
| |
| for (size_t j = i + 1; j < bb->insts.len; j++) { |
| IRInst *next = bb->insts.data[j]; |
| if (next->op == IR_NOP || next->op == IR_COMMENT) continue; |
| if (next->op == IR_PAR_FOR_BEGIN) { |
| |
| ir_emit_comment(arena, bb, "FUSE_CANDIDATE"); |
| } |
| break; |
| } |
| } |
| } |
| } |
|
|
| |
| |
| |
|
|
| void opt_vectorize(IRBlock *bb, ParLoopInfo *loop, SIMDTarget target, Arena *arena) { |
| if (!loop->can_vectorize || target == SIMD_NONE) return; |
|
|
| int width = simd_width(target, loop->elem_type); |
| if (width <= 1) return; |
|
|
| |
| char buf[128]; |
| snprintf(buf, sizeof(buf), "VECTORIZE_%s_WIDTH_%d", |
| simd_target_str(target), width); |
|
|
| if (loop->start_inst >= 0 && loop->start_inst < (int)bb->insts.len) { |
| bb->insts.data[loop->start_inst]->comment = arena_strdup(arena, buf); |
| } |
|
|
| |
| for (int i = loop->start_inst; i <= loop->end_inst && i < (int)bb->insts.len; i++) { |
| IRInst *inst = bb->insts.data[i]; |
|
|
| switch (inst->op) { |
| case IR_LOAD: |
| inst->op = IR_SIMD_LOAD; |
| inst->dst.simd_width = width; |
| break; |
| case IR_STORE: |
| inst->op = IR_SIMD_STORE; |
| inst->src1.simd_width = width; |
| break; |
| case IR_FADD: |
| inst->op = IR_SIMD_ADD; |
| inst->dst.simd_width = width; |
| inst->src1.simd_width = width; |
| inst->src2.simd_width = width; |
| break; |
| case IR_FSUB: |
| inst->op = IR_SIMD_SUB; |
| inst->dst.simd_width = width; |
| inst->src1.simd_width = width; |
| inst->src2.simd_width = width; |
| break; |
| case IR_FMUL: |
| inst->op = IR_SIMD_MUL; |
| inst->dst.simd_width = width; |
| inst->src1.simd_width = width; |
| inst->src2.simd_width = width; |
| break; |
| default: |
| break; |
| } |
| } |
| } |
|
|
| void opt_vectorize_reduce(IRBlock *bb, ParLoopInfo *loop, SIMDTarget target, Arena *arena) { |
| if (!loop->is_reduce || target == SIMD_NONE) return; |
|
|
| int width = simd_width(target, loop->elem_type); |
| if (width <= 1) return; |
|
|
| |
| |
| |
| |
| |
| char buf[128]; |
| snprintf(buf, sizeof(buf), "VECTORIZE_REDUCE_%s_WIDTH_%d", |
| simd_target_str(target), width); |
|
|
| if (loop->start_inst >= 0 && loop->start_inst < (int)bb->insts.len) { |
| bb->insts.data[loop->start_inst]->comment = arena_strdup(arena, buf); |
| } |
|
|
| |
| for (int i = loop->start_inst; i <= loop->end_inst && i < (int)bb->insts.len; i++) { |
| IRInst *inst = bb->insts.data[i]; |
|
|
| |
| if ((inst->op == IR_FADD || inst->op == IR_ADD) && |
| inst->dst.kind == VAL_REG && inst->dst.reg == loop->accum_reg) { |
| inst->op = IR_SIMD_ADD; |
| inst->dst.simd_width = width; |
| } |
| if (inst->op == IR_LOAD && inst->dst.simd_width == 0) { |
| |
| inst->op = IR_SIMD_LOAD; |
| inst->dst.simd_width = width; |
| } |
| } |
|
|
| |
| if (loop->end_inst < (int)bb->insts.len) { |
| int hadd_reg = loop->accum_reg; |
| IRInst *hadd = ir_emit(arena, bb, IR_SIMD_HADD, |
| ir_reg(hadd_reg, loop->elem_type), |
| ir_simd_reg(hadd_reg, loop->elem_type, width), |
| ir_none()); |
| hadd->comment = "horizontal_reduce"; |
| } |
| } |
|
|
| |
| |
| |
|
|
| void opt_insert_prefetch(IRBlock *bb, Arena *arena) { |
| |
| |
| size_t orig_len = bb->insts.len; |
| for (size_t i = 0; i < orig_len; i++) { |
| IRInst *inst = bb->insts.data[i]; |
| if ((inst->op == IR_LOAD || inst->op == IR_SIMD_LOAD) && |
| inst->src1.kind == VAL_REG) { |
| |
| ir_emit_comment(arena, bb, "PREFETCH_NEXT_LINE"); |
| } |
| } |
| } |
|
|
| void opt_align_data(IRBlock *bb, Arena *arena) { |
| |
| for (size_t i = 0; i < bb->insts.len; i++) { |
| IRInst *inst = bb->insts.data[i]; |
| if (inst->op == IR_SIMD_LOAD || inst->op == IR_SIMD_STORE) { |
| inst->comment = "ALIGN_CANDIDATE"; |
| } |
| } |
| } |
|
|
| |
| |
| |
|
|
| void optimize_module(IRModule *mod, OptContext *ctx) { |
| if (ctx->level == OPT_O0) { |
| if (ctx->report) p2s_note("optimization level O0: no optimizations applied"); |
| return; |
| } |
|
|
| for (IRFunc *f = mod->functions; f; f = f->next) { |
| for (size_t bi = 0; bi < f->blocks.len; bi++) { |
| IRBlock *bb = f->blocks.data[bi]; |
|
|
| |
| if (ctx->level >= OPT_O1) { |
| opt_constant_fold(bb, ctx->arena); |
| opt_strength_reduce(bb, ctx->arena); |
| opt_cse(bb, ctx->arena); |
| opt_dead_code_eliminate(bb); |
| if (ctx->report) |
| p2s_note("O1: scalar optimizations applied to %s/%s", |
| f->name, bb->name); |
| } |
|
|
| |
| if (ctx->level >= OPT_O2) { |
| |
| ParLoopVec loops = detect_parallel_patterns(bb); |
|
|
| if (ctx->report) |
| p2s_note("O2: detected %zu parallel loop(s) in %s/%s", |
| loops.len, f->name, bb->name); |
|
|
| for (size_t li = 0; li < loops.len; li++) { |
| ParLoopInfo *loop = &loops.data[li]; |
|
|
| |
| opt_loop_unroll(bb, loop, ctx->unroll_factor, ctx->arena); |
|
|
| |
| if (loop->is_reduce) { |
| opt_vectorize_reduce(bb, loop, ctx->simd, ctx->arena); |
| if (ctx->report) |
| p2s_note(" vectorized reduction (width=%d)", |
| simd_width(ctx->simd, loop->elem_type)); |
| } else if (loop->can_vectorize) { |
| opt_vectorize(bb, loop, ctx->simd, ctx->arena); |
| if (ctx->report) |
| p2s_note(" vectorized parallel_for (width=%d)", |
| simd_width(ctx->simd, loop->elem_type)); |
| } |
| } |
|
|
| |
| opt_loop_interchange(bb, ctx->arena); |
| opt_loop_fuse(bb, ctx->arena); |
|
|
| vec_free(&loops); |
| } |
|
|
| |
| if (ctx->level >= OPT_O3) { |
| ParLoopVec loops = detect_parallel_patterns(bb); |
|
|
| for (size_t li = 0; li < loops.len; li++) { |
| opt_loop_tile(bb, &loops.data[li], |
| ctx->tile_size > 0 ? ctx->tile_size : 32, |
| ctx->arena); |
| } |
|
|
| opt_insert_prefetch(bb, ctx->arena); |
| opt_align_data(bb, ctx->arena); |
|
|
| if (ctx->report) |
| p2s_note("O3: memory optimizations applied to %s/%s", |
| f->name, bb->name); |
|
|
| vec_free(&loops); |
| } |
|
|
| |
| for (size_t i = 0; i < bb->insts.len; i++) { |
| if (bb->insts.data[i]->op == IR_BARRIER) { |
| bb->insts.data[i]->op = IR_NOP; |
| bb->insts.data[i]->comment = "barrier removed (serial)"; |
| if (ctx->report) |
| p2s_note(" removed barrier (serial execution)"); |
| } |
| } |
| } |
| } |
| } |
|
|