/*
 * par2serial-cc: optimize.c - Optimization passes implementation
 *
 * Pass 1: Parallel pattern detection
 * Pass 2: Scalar optimizations (constant fold, DCE, strength reduce, CSE)
 * Pass 3: Loop optimizations (tiling, unrolling, interchange, fusion)
 * Pass 4: SIMD vectorization
 * Pass 5: Memory optimizations (prefetch, alignment)
 */
#include "optimize.h"

/* ══════════════════════════════════════════════════════════
 *  SIMD utilities
 * ══════════════════════════════════════════════════════════ */

int simd_width(SIMDTarget target, TypeKind elem_type) {
    int elem_bytes;
    switch (elem_type) {
    case TYPE_FLOAT:  elem_bytes = 4; break;
    case TYPE_DOUBLE: elem_bytes = 8; break;
    case TYPE_INT:    elem_bytes = 4; break;
    case TYPE_CHAR:   elem_bytes = 1; break;
    case TYPE_SHORT:  elem_bytes = 2; break;
    default:          elem_bytes = 4; break;
    }
    int vec_bytes;
    switch (target) {
    case SIMD_SSE:    vec_bytes = 16; break;
    case SIMD_AVX:
    case SIMD_AVX2:   vec_bytes = 32; break;
    case SIMD_AVX512: vec_bytes = 64; break;
    case SIMD_NEON:   vec_bytes = 16; break;
    default:          return 1;
    }
    return vec_bytes / elem_bytes;
}

const char *simd_target_str(SIMDTarget t) {
    switch (t) {
    case SIMD_SSE:    return "SSE4.2";
    case SIMD_AVX:    return "AVX";
    case SIMD_AVX2:   return "AVX2+FMA";
    case SIMD_AVX512: return "AVX-512";
    case SIMD_NEON:   return "NEON";
    default:          return "scalar";
    }
}

/* ══════════════════════════════════════════════════════════
 *  Pass 1: Parallel Pattern Detection
 * ══════════════════════════════════════════════════════════ */

ParLoopVec detect_parallel_patterns(IRBlock *bb) {
    ParLoopVec loops;
    vec_init(&loops);

    for (size_t i = 0; i < bb->insts.len; i++) {
        IRInst *inst = bb->insts.data[i];

        if (inst->op == IR_PAR_FOR_BEGIN) {
            ParLoopInfo info = {0};
            info.start_inst = (int)i;
            info.iter_var = inst->par.iter_var;
            info.iter_reg = inst->dst.reg;
            info.lo_reg = inst->par.lo_reg;
            info.hi_reg = inst->par.hi_reg;
            info.is_reduce = false;
            info.can_vectorize = true;
            info.has_dependency = false;
            info.trip_count = -1;
            info.elem_type = TYPE_FLOAT;

            /* Find matching end */
            int depth = 1;
            for (size_t j = i + 1; j < bb->insts.len; j++) {
                if (bb->insts.data[j]->op == IR_PAR_FOR_BEGIN) depth++;
                if (bb->insts.data[j]->op == IR_PAR_FOR_END) {
                    depth--;
                    if (depth == 0) { info.end_inst = (int)j; break; }
                }
            }

            /* Analyze loop body for dependencies */
            bool has_store = false;
            bool has_call = false;
            for (int j = info.start_inst; j <= info.end_inst; j++) {
                IRInst *bi = bb->insts.data[j];
                if (bi->op == IR_STORE) has_store = true;
                if (bi->op == IR_CALL) has_call = true;
            }
            if (has_call) info.can_vectorize = false;

            vec_push(&loops, info);
        }

        if (inst->op == IR_PAR_REDUCE_BEGIN) {
            ParLoopInfo info = {0};
            info.start_inst = (int)i;
            info.iter_var = inst->par.iter_var;
            info.accum_reg = inst->par.accum_reg;
            info.reduce_op = inst->par.reduce_op;
            info.is_reduce = true;
            info.can_vectorize = true;
            info.elem_type = TYPE_FLOAT;

            int depth = 1;
            for (size_t j = i + 1; j < bb->insts.len; j++) {
                if (bb->insts.data[j]->op == IR_PAR_REDUCE_BEGIN) depth++;
                if (bb->insts.data[j]->op == IR_PAR_REDUCE_END) {
                    depth--;
                    if (depth == 0) { info.end_inst = (int)j; break; }
                }
            }
            vec_push(&loops, info);
        }

        if (inst->op == IR_PAR_SCAN_BEGIN) {
            ParLoopInfo info = {0};
            info.start_inst = (int)i;
            info.is_scan = true;
            info.can_vectorize = false; /* scans are harder to vectorize */

            int depth = 1;
            for (size_t j = i + 1; j < bb->insts.len; j++) {
                if (bb->insts.data[j]->op == IR_PAR_SCAN_BEGIN) depth++;
                if (bb->insts.data[j]->op == IR_PAR_SCAN_END) {
                    depth--;
                    if (depth == 0) { info.end_inst = (int)j; break; }
                }
            }
            vec_push(&loops, info);
        }
    }

    return loops;
}

/* ══════════════════════════════════════════════════════════
 *  Pass 2: Scalar Optimizations
 * ══════════════════════════════════════════════════════════ */

/* ── Constant folding ────────────────────────────────────── */
void opt_constant_fold(IRBlock *bb, Arena *arena) {
    /* Track known constants: reg -> value */
    int64_t *known_int = (int64_t *)calloc(4096, sizeof(int64_t));
    bool *is_known = (bool *)calloc(4096, sizeof(bool));

    for (size_t i = 0; i < bb->insts.len; i++) {
        IRInst *inst = bb->insts.data[i];

        if (inst->op == IR_LOAD_IMM && inst->dst.kind == VAL_REG) {
            int r = inst->dst.reg;
            if (r < 4096) {
                known_int[r] = inst->src1.int_val;
                is_known[r] = true;
            }
        }

        /* Try to fold binary ops on known constants */
        if (inst->op >= IR_ADD && inst->op <= IR_MOD) {
            int64_t a = 0, b_val = 0;
            bool a_known = false, b_known = false;

            if (inst->src1.kind == VAL_INT) { a = inst->src1.int_val; a_known = true; }
            else if (inst->src1.kind == VAL_REG && inst->src1.reg < 4096 && is_known[inst->src1.reg]) {
                a = known_int[inst->src1.reg]; a_known = true;
            }

            if (inst->src2.kind == VAL_INT) { b_val = inst->src2.int_val; b_known = true; }
            else if (inst->src2.kind == VAL_REG && inst->src2.reg < 4096 && is_known[inst->src2.reg]) {
                b_val = known_int[inst->src2.reg]; b_known = true;
            }

            if (a_known && b_known) {
                int64_t result = 0;
                switch (inst->op) {
                case IR_ADD: result = a + b_val; break;
                case IR_SUB: result = a - b_val; break;
                case IR_MUL: result = a * b_val; break;
                case IR_DIV: result = b_val != 0 ? a / b_val : 0; break;
                case IR_MOD: result = b_val != 0 ? a % b_val : 0; break;
                default: continue;
                }
                /* Replace with constant load */
                inst->op = IR_LOAD_IMM;
                inst->src1 = ir_int(result);
                inst->src2 = ir_none();
                if (inst->dst.reg < 4096) {
                    known_int[inst->dst.reg] = result;
                    is_known[inst->dst.reg] = true;
                }
            }
        }

        /* Algebraic simplifications */
        if (inst->op == IR_ADD || inst->op == IR_FADD) {
            /* x + 0 = x */
            if (inst->src2.kind == VAL_INT && inst->src2.int_val == 0) {
                inst->op = IR_MOVE;
                inst->src2 = ir_none();
            }
        }
        if (inst->op == IR_MUL || inst->op == IR_FMUL) {
            /* x * 1 = x */
            if (inst->src2.kind == VAL_INT && inst->src2.int_val == 1) {
                inst->op = IR_MOVE;
                inst->src2 = ir_none();
            }
            /* x * 0 = 0 */
            if (inst->src2.kind == VAL_INT && inst->src2.int_val == 0) {
                inst->op = IR_LOAD_IMM;
                inst->src1 = ir_int(0);
                inst->src2 = ir_none();
            }
        }
        if (inst->op == IR_MUL) {
            /* x * 2 → x << 1 (strength reduction) */
            if (inst->src2.kind == VAL_INT && inst->src2.int_val == 2) {
                inst->op = IR_SHL;
                inst->src2 = ir_int(1);
            }
            /* x * power-of-2 → x << log2 */
            if (inst->src2.kind == VAL_INT && inst->src2.int_val > 0) {
                int64_t v = inst->src2.int_val;
                if ((v & (v - 1)) == 0) {
                    int shift = 0;
                    while ((1LL << shift) < v) shift++;
                    inst->op = IR_SHL;
                    inst->src2 = ir_int(shift);
                }
            }
        }
    }
    free(known_int);
    free(is_known);
}

/* ── Dead code elimination ───────────────────────────────── */
void opt_dead_code_eliminate(IRBlock *bb) {
    /* Simple: mark registers that are used, remove unused defs */
    bool *used = (bool *)calloc(4096, sizeof(bool));

    /* Pass 1: mark used registers */
    for (size_t i = 0; i < bb->insts.len; i++) {
        IRInst *inst = bb->insts.data[i];
        if (inst->src1.kind == VAL_REG && inst->src1.reg < 4096)
            used[inst->src1.reg] = true;
        if (inst->src2.kind == VAL_REG && inst->src2.reg < 4096)
            used[inst->src2.reg] = true;
        if (inst->src3.kind == VAL_REG && inst->src3.reg < 4096)
            used[inst->src3.reg] = true;
    }

    /* Pass 2: remove dead definitions (but keep side-effectful ops) */
    for (size_t i = 0; i < bb->insts.len; i++) {
        IRInst *inst = bb->insts.data[i];
        if (inst->dst.kind == VAL_REG && inst->dst.reg < 4096 && !used[inst->dst.reg]) {
            /* Safe to remove if no side effects */
            if (inst->op == IR_ADD || inst->op == IR_SUB || inst->op == IR_MUL ||
                inst->op == IR_DIV || inst->op == IR_MOD || inst->op == IR_LOAD_IMM ||
                inst->op == IR_LOAD_FIMM || inst->op == IR_MOVE ||
                inst->op == IR_FADD || inst->op == IR_FSUB ||
                inst->op == IR_FMUL || inst->op == IR_FDIV) {
                inst->op = IR_NOP;
            }
        }
    }
    free(used);
}

/* ── Strength reduction ──────────────────────────────────── */
void opt_strength_reduce(IRBlock *bb, Arena *arena) {
    for (size_t i = 0; i < bb->insts.len; i++) {
        IRInst *inst = bb->insts.data[i];

        /* Division by power of 2 → shift */
        if (inst->op == IR_DIV && inst->src2.kind == VAL_INT) {
            int64_t v = inst->src2.int_val;
            if (v > 0 && (v & (v - 1)) == 0) {
                int shift = 0;
                while ((1LL << shift) < v) shift++;
                inst->op = IR_SHR;
                inst->src2 = ir_int(shift);
            }
        }

        /* Modulo by power of 2 → bitwise AND */
        if (inst->op == IR_MOD && inst->src2.kind == VAL_INT) {
            int64_t v = inst->src2.int_val;
            if (v > 0 && (v & (v - 1)) == 0) {
                inst->op = IR_AND;
                inst->src2 = ir_int(v - 1);
            }
        }
    }
}

/* ── Common subexpression elimination (basic) ────────────── */
void opt_cse(IRBlock *bb, Arena *arena) {
    /* Simple hash-based CSE within a basic block */
    /* For each arithmetic op, check if we've seen the same (op, src1, src2) before */
    typedef struct { IROp op; int s1; int s2; int result; } CSEEntry;
    CSEEntry entries[1024];
    int entry_count = 0;

    for (size_t i = 0; i < bb->insts.len; i++) {
        IRInst *inst = bb->insts.data[i];
        if (inst->op >= IR_ADD && inst->op <= IR_MOD &&
            inst->src1.kind == VAL_REG && inst->src2.kind == VAL_REG) {

            /* Look for matching previous computation */
            for (int j = 0; j < entry_count; j++) {
                if (entries[j].op == inst->op &&
                    entries[j].s1 == inst->src1.reg &&
                    entries[j].s2 == inst->src2.reg) {
                    /* Replace with move from previous result */
                    inst->op = IR_MOVE;
                    inst->src1 = ir_reg(entries[j].result, inst->dst.type);
                    inst->src2 = ir_none();
                    goto next_inst;
                }
            }
            if (entry_count < 1024 && inst->dst.kind == VAL_REG) {
                entries[entry_count].op = inst->op;
                entries[entry_count].s1 = inst->src1.reg;
                entries[entry_count].s2 = inst->src2.reg;
                entries[entry_count].result = inst->dst.reg;
                entry_count++;
            }
        }
        next_inst:;
        /* Invalidate on stores and calls */
        if (inst->op == IR_STORE || inst->op == IR_CALL)
            entry_count = 0;
    }
}

/* ══════════════════════════════════════════════════════════
 *  Pass 3: Loop Optimizations
 * ══════════════════════════════════════════════════════════ */

void opt_loop_tile(IRBlock *bb, ParLoopInfo *loop, int tile_size, Arena *arena) {
    if (tile_size <= 0) tile_size = 32; /* default */

    /* Insert tiling markers before the parallel loop */
    /* The code generator will use these to emit tiled loops */
    IRInst *marker = ir_emit(arena, bb, IR_COMMENT, ir_none(), ir_none(), ir_none());
    marker->comment = arena_strdup(arena, "TILED");

    /* Mark the loop info */
    loop->trip_count = tile_size; /* will be used by codegen */

    if (loop->start_inst >= 0 && loop->start_inst < (int)bb->insts.len) {
        IRInst *begin = bb->insts.data[loop->start_inst];
        begin->comment = "TILED_LOOP";
    }
}

void opt_loop_unroll(IRBlock *bb, ParLoopInfo *loop, int factor, Arena *arena) {
    if (factor <= 0) factor = 4; /* default unroll factor */

    /* Mark for unrolling - codegen will handle the actual unrolling */
    if (loop->start_inst >= 0 && loop->start_inst < (int)bb->insts.len) {
        IRInst *begin = bb->insts.data[loop->start_inst];
        char buf[64];
        snprintf(buf, sizeof(buf), "UNROLL_%d", factor);
        begin->comment = arena_strdup(arena, buf);
    }
}

void opt_loop_interchange(IRBlock *bb, Arena *arena) {
    /* Look for nested parallel loops where interchange improves locality */
    /* For now, mark candidates for the codegen */
    for (size_t i = 0; i < bb->insts.len; i++) {
        if (bb->insts.data[i]->op == IR_PAR_FOR_BEGIN) {
            /* Check for nested parallel for */
            for (size_t j = i + 1; j < bb->insts.len; j++) {
                if (bb->insts.data[j]->op == IR_PAR_FOR_BEGIN) {
                    /* Found nested loop - check if interchange is beneficial */
                    /* Heuristic: if inner loop has stride-1 access, don't interchange */
                    /* If outer loop has stride-1, interchange */
                    ir_emit_comment(arena, bb, "INTERCHANGE_CANDIDATE");
                    break;
                }
                if (bb->insts.data[j]->op == IR_PAR_FOR_END) break;
            }
        }
    }
}

void opt_loop_fuse(IRBlock *bb, Arena *arena) {
    /* Fuse adjacent parallel_for loops with same bounds */
    for (size_t i = 0; i < bb->insts.len; i++) {
        if (bb->insts.data[i]->op == IR_PAR_FOR_END) {
            /* Look for next PAR_FOR_BEGIN */
            for (size_t j = i + 1; j < bb->insts.len; j++) {
                IRInst *next = bb->insts.data[j];
                if (next->op == IR_NOP || next->op == IR_COMMENT) continue;
                if (next->op == IR_PAR_FOR_BEGIN) {
                    /* Check if bounds match (same lo/hi registers) */
                    ir_emit_comment(arena, bb, "FUSE_CANDIDATE");
                }
                break;
            }
        }
    }
}

/* ══════════════════════════════════════════════════════════
 *  Pass 4: SIMD Vectorization
 * ══════════════════════════════════════════════════════════ */

void opt_vectorize(IRBlock *bb, ParLoopInfo *loop, SIMDTarget target, Arena *arena) {
    if (!loop->can_vectorize || target == SIMD_NONE) return;

    int width = simd_width(target, loop->elem_type);
    if (width <= 1) return;

    /* Mark vectorization on the loop */
    char buf[128];
    snprintf(buf, sizeof(buf), "VECTORIZE_%s_WIDTH_%d",
             simd_target_str(target), width);

    if (loop->start_inst >= 0 && loop->start_inst < (int)bb->insts.len) {
        bb->insts.data[loop->start_inst]->comment = arena_strdup(arena, buf);
    }

    /* Transform loads/stores in loop body to SIMD operations */
    for (int i = loop->start_inst; i <= loop->end_inst && i < (int)bb->insts.len; i++) {
        IRInst *inst = bb->insts.data[i];

        switch (inst->op) {
        case IR_LOAD:
            inst->op = IR_SIMD_LOAD;
            inst->dst.simd_width = width;
            break;
        case IR_STORE:
            inst->op = IR_SIMD_STORE;
            inst->src1.simd_width = width;
            break;
        case IR_FADD:
            inst->op = IR_SIMD_ADD;
            inst->dst.simd_width = width;
            inst->src1.simd_width = width;
            inst->src2.simd_width = width;
            break;
        case IR_FSUB:
            inst->op = IR_SIMD_SUB;
            inst->dst.simd_width = width;
            inst->src1.simd_width = width;
            inst->src2.simd_width = width;
            break;
        case IR_FMUL:
            inst->op = IR_SIMD_MUL;
            inst->dst.simd_width = width;
            inst->src1.simd_width = width;
            inst->src2.simd_width = width;
            break;
        default:
            break;
        }
    }
}

void opt_vectorize_reduce(IRBlock *bb, ParLoopInfo *loop, SIMDTarget target, Arena *arena) {
    if (!loop->is_reduce || target == SIMD_NONE) return;

    int width = simd_width(target, loop->elem_type);
    if (width <= 1) return;

    /* Transform reduction:
     * scalar:  sum += a[i]
     * simd:    vsum = simd_add(vsum, simd_load(&a[i]))
     *          ... after loop: sum = horizontal_add(vsum)
     */
    char buf[128];
    snprintf(buf, sizeof(buf), "VECTORIZE_REDUCE_%s_WIDTH_%d",
             simd_target_str(target), width);

    if (loop->start_inst >= 0 && loop->start_inst < (int)bb->insts.len) {
        bb->insts.data[loop->start_inst]->comment = arena_strdup(arena, buf);
    }

    /* Transform the accumulation in loop body */
    for (int i = loop->start_inst; i <= loop->end_inst && i < (int)bb->insts.len; i++) {
        IRInst *inst = bb->insts.data[i];

        /* Look for add to accumulator */
        if ((inst->op == IR_FADD || inst->op == IR_ADD) &&
            inst->dst.kind == VAL_REG && inst->dst.reg == loop->accum_reg) {
            inst->op = IR_SIMD_ADD;
            inst->dst.simd_width = width;
        }
        if (inst->op == IR_LOAD && inst->dst.simd_width == 0) {
            /* Promote to SIMD load inside reduction */
            inst->op = IR_SIMD_LOAD;
            inst->dst.simd_width = width;
        }
    }

    /* Insert horizontal reduction after loop */
    if (loop->end_inst < (int)bb->insts.len) {
        int hadd_reg = loop->accum_reg;
        IRInst *hadd = ir_emit(arena, bb, IR_SIMD_HADD,
                               ir_reg(hadd_reg, loop->elem_type),
                               ir_simd_reg(hadd_reg, loop->elem_type, width),
                               ir_none());
        hadd->comment = "horizontal_reduce";
    }
}

/* ══════════════════════════════════════════════════════════
 *  Pass 5: Memory Optimizations
 * ══════════════════════════════════════════════════════════ */

void opt_insert_prefetch(IRBlock *bb, Arena *arena) {
    /* Insert prefetch hints before loads in loops */
    /* Look for load instructions that are likely to benefit from prefetching */
    size_t orig_len = bb->insts.len;
    for (size_t i = 0; i < orig_len; i++) {
        IRInst *inst = bb->insts.data[i];
        if ((inst->op == IR_LOAD || inst->op == IR_SIMD_LOAD) &&
            inst->src1.kind == VAL_REG) {
            /* Insert prefetch for next cache line */
            ir_emit_comment(arena, bb, "PREFETCH_NEXT_LINE");
        }
    }
}

void opt_align_data(IRBlock *bb, Arena *arena) {
    /* Mark SIMD loads/stores that could use aligned variants */
    for (size_t i = 0; i < bb->insts.len; i++) {
        IRInst *inst = bb->insts.data[i];
        if (inst->op == IR_SIMD_LOAD || inst->op == IR_SIMD_STORE) {
            inst->comment = "ALIGN_CANDIDATE";
        }
    }
}

/* ══════════════════════════════════════════════════════════
 *  Main Optimization Pipeline
 * ══════════════════════════════════════════════════════════ */

void optimize_module(IRModule *mod, OptContext *ctx) {
    if (ctx->level == OPT_O0) {
        if (ctx->report) p2s_note("optimization level O0: no optimizations applied");
        return;
    }

    for (IRFunc *f = mod->functions; f; f = f->next) {
        for (size_t bi = 0; bi < f->blocks.len; bi++) {
            IRBlock *bb = f->blocks.data[bi];

            /* ── O1: Scalar optimizations ────────────── */
            if (ctx->level >= OPT_O1) {
                opt_constant_fold(bb, ctx->arena);
                opt_strength_reduce(bb, ctx->arena);
                opt_cse(bb, ctx->arena);
                opt_dead_code_eliminate(bb);
                if (ctx->report)
                    p2s_note("O1: scalar optimizations applied to %s/%s",
                             f->name, bb->name);
            }

            /* ── O2: Loop + SIMD ─────────────────────── */
            if (ctx->level >= OPT_O2) {
                /* Detect parallel patterns */
                ParLoopVec loops = detect_parallel_patterns(bb);

                if (ctx->report)
                    p2s_note("O2: detected %zu parallel loop(s) in %s/%s",
                             loops.len, f->name, bb->name);

                for (size_t li = 0; li < loops.len; li++) {
                    ParLoopInfo *loop = &loops.data[li];

                    /* Loop optimizations */
                    opt_loop_unroll(bb, loop, ctx->unroll_factor, ctx->arena);

                    /* SIMD vectorization */
                    if (loop->is_reduce) {
                        opt_vectorize_reduce(bb, loop, ctx->simd, ctx->arena);
                        if (ctx->report)
                            p2s_note("  vectorized reduction (width=%d)",
                                     simd_width(ctx->simd, loop->elem_type));
                    } else if (loop->can_vectorize) {
                        opt_vectorize(bb, loop, ctx->simd, ctx->arena);
                        if (ctx->report)
                            p2s_note("  vectorized parallel_for (width=%d)",
                                     simd_width(ctx->simd, loop->elem_type));
                    }
                }

                /* Inter-loop optimizations */
                opt_loop_interchange(bb, ctx->arena);
                opt_loop_fuse(bb, ctx->arena);

                vec_free(&loops);
            }

            /* ── O3: Memory + aggressive ─────────────── */
            if (ctx->level >= OPT_O3) {
                ParLoopVec loops = detect_parallel_patterns(bb);

                for (size_t li = 0; li < loops.len; li++) {
                    opt_loop_tile(bb, &loops.data[li],
                                  ctx->tile_size > 0 ? ctx->tile_size : 32,
                                  ctx->arena);
                }

                opt_insert_prefetch(bb, ctx->arena);
                opt_align_data(bb, ctx->arena);

                if (ctx->report)
                    p2s_note("O3: memory optimizations applied to %s/%s",
                             f->name, bb->name);

                vec_free(&loops);
            }

            /* Remove barriers (serial execution is naturally ordered) */
            for (size_t i = 0; i < bb->insts.len; i++) {
                if (bb->insts.data[i]->op == IR_BARRIER) {
                    bb->insts.data[i]->op = IR_NOP;
                    bb->insts.data[i]->comment = "barrier removed (serial)";
                    if (ctx->report)
                        p2s_note("  removed barrier (serial execution)");
                }
            }
        }
    }
}