/*
 * par2serial-cc: optimize.h - Optimization pass manager
 */
#ifndef P2S_OPTIMIZE_H
#define P2S_OPTIMIZE_H

#include "ir.h"

/* ── Optimization levels ─────────────────────────────────── */
/*
 * Optimization level requested for a compilation. The levels build on
 * each other: every level adds to what the one below it enables.
 */
typedef enum {
    /* Serial lowering only; no optimization is performed. */
    OPT_O0 = 0,
    /* Basic scalar optimizations. */
    OPT_O1 = 1,
    /* Adds loop optimizations and SIMD. */
    OPT_O2 = 2,
    /* Adds memory-layout and aggressive optimizations. */
    OPT_O3 = 3
} OptLevel;

/* ── SIMD target ─────────────────────────────────────────── */
/*
 * SIMD instruction set to generate code for. Where a lane count is given
 * it is for 32-bit floats (vector width / 32).
 */
typedef enum {
    /* No SIMD target. */
    SIMD_NONE = 0,
    /* SSE: 128-bit vectors, 4 floats. */
    SIMD_SSE = 1,
    /* AVX: 256-bit vectors, 8 floats. */
    SIMD_AVX = 2,
    /*
     * AVX2: 256-bit vectors, treated here as also providing FMA.
     * NOTE(review): on x86, FMA is a separate ISA extension from AVX2;
     * confirm the code generator checks for it or documents the
     * assumption.
     */
    SIMD_AVX2 = 3,
    /* AVX-512: 512-bit vectors, 16 floats. */
    SIMD_AVX512 = 4,
    /* ARM NEON: 128-bit vectors. */
    SIMD_NEON = 5
} SIMDTarget;

/* ── Optimization context ────────────────────────────────── */
/*
 * Settings for an optimization run; this is the context argument taken by
 * optimize_module().
 *
 * NOTE(review): `bool` is used below, but this header only includes
 * "ir.h". Presumably <stdbool.h> is pulled in through it — confirm, or
 * include it here so the header is self-contained.
 */
typedef struct {
    OptLevel    level;         /* requested optimization level */
    SIMDTarget  simd;          /* SIMD instruction set to target */
    bool        report;        /* generate optimization report */
    int         tile_size;     /* default tile size (0 = auto) */
    int         unroll_factor; /* default unroll (0 = auto) */
    /*
     * Arena is declared outside this header. Presumably the allocator the
     * passes draw from; ownership/lifetime is not visible here — confirm.
     */
    Arena      *arena;
} OptContext;

/* ── Parallel loop info (detected by analysis) ───────────── */
/*
 * Record describing one parallel loop region in an IR block. A vector of
 * these (ParLoopVec) is what detect_parallel_patterns() returns, and a
 * pointer to one is taken by the per-loop passes (tile, unroll, vectorize).
 */
typedef struct {
    int         start_inst;    /* index of PAR_FOR_BEGIN */
    int         end_inst;      /* index of PAR_FOR_END */
    const char *iter_var;      /* presumably the iteration variable's name — confirm */
    /*
     * NOTE(review): the four *_reg fields below look like IR register
     * numbers for the iterator, lower bound, upper bound and step —
     * confirm against ir.h.
     */
    int         iter_reg;
    int         lo_reg;
    int         hi_reg;
    int         step_reg;
    ReduceOp    reduce_op;     /* for reductions */
    int         accum_reg;     /* for reductions */
    bool        is_reduce;     /* presumably set when the loop is a reduction — confirm */
    bool        is_scan;       /* presumably set when the loop is a scan — confirm */
    bool        can_vectorize; /* presumably the analysis verdict consumed by the vectorizer — confirm */
    bool        has_dependency; /* presumably a loop-carried dependency was found — confirm */
    int         trip_count;    /* estimated trip count, -1 if unknown */
    TypeKind    elem_type;     /* element type for SIMD */
} ParLoopInfo;

/*
 * Declares ParLoopVec through the project's VEC_TYPEDEF macro (defined
 * outside this header); presumably a growable array of ParLoopInfo —
 * confirm against the macro definition.
 */
VEC_TYPEDEF(ParLoopInfo, ParLoopVec);

/* ── Optimization report entry ───────────────────────────── */
/*
 * One entry of the optimization report.
 * NOTE(review): presumably only collected when OptContext.report is set;
 * that link is not visible in this header — confirm.
 */
typedef struct {
    const char *pass_name;     /* name of the pass this entry is about */
    const char *description;   /* free-form text for the entry */
    SourceLoc   loc;           /* source location the entry refers to */
    bool        applied;       /* presumably true if the transformation was performed — confirm */
} OptReport;

/*
 * Declares OptReportVec through the project's VEC_TYPEDEF macro (defined
 * outside this header); presumably a growable array of OptReport —
 * confirm against the macro definition.
 */
VEC_TYPEDEF(OptReport, OptReportVec);

/* ── Main optimization pipeline ──────────────────────────── */
/*
 * Entry point of the optimization pipeline for one IR module.
 *
 * mod: module to optimize. Taken as a non-const pointer and nothing is
 *      returned, so presumably it is rewritten in place — confirm.
 * ctx: settings for the run (level, SIMD target, tile/unroll defaults,
 *      report flag, arena).
 */
void optimize_module(IRModule *mod, OptContext *ctx);

/* ── Individual passes ───────────────────────────────────── */
/*
 * Pass 1: Parallel pattern detection & analysis.
 *
 * bb: IR block to analyse.
 * Returns a ParLoopVec of ParLoopInfo records for that block.
 *
 * NOTE(review): the vector is returned by value and no Arena is passed
 * in, so who owns and releases its storage is not visible here — confirm.
 */
ParLoopVec detect_parallel_patterns(IRBlock *bb);

/*
 * Pass 2: Scalar optimizations.
 *
 * Each function takes the IR block to work on; all except
 * opt_dead_code_eliminate() also take an Arena.
 * NOTE(review): the one-line descriptions below are inferred from the
 * function names only — confirm against the implementations.
 */
/* Constant folding. */
void opt_constant_fold(IRBlock *bb, Arena *arena);
/* Dead-code elimination. */
void opt_dead_code_eliminate(IRBlock *bb);
/* Strength reduction. */
void opt_strength_reduce(IRBlock *bb, Arena *arena);
/* Common-subexpression elimination (reading "cse" as that acronym). */
void opt_cse(IRBlock *bb, Arena *arena);

/*
 * Pass 3: Loop optimizations.
 *
 * Tiling and unrolling act on one specific loop (`loop`) inside `bb`;
 * interchange and fusion take only the block.
 * NOTE(review): OptContext documents 0 as "auto" for its tile_size and
 * unroll_factor defaults. Whether 0 is also accepted by `tile_size` and
 * `factor` here, or must be resolved by the caller first, is not visible
 * in this header — confirm.
 */
/* Tile `loop` using `tile_size`. */
void opt_loop_tile(IRBlock *bb, ParLoopInfo *loop, int tile_size, Arena *arena);
/* Unroll `loop` by `factor`. */
void opt_loop_unroll(IRBlock *bb, ParLoopInfo *loop, int factor, Arena *arena);
/* Loop interchange over the block (inferred from the name — confirm). */
void opt_loop_interchange(IRBlock *bb, Arena *arena);
/* Loop fusion over the block (inferred from the name — confirm). */
void opt_loop_fuse(IRBlock *bb, Arena *arena);

/*
 * Pass 4: Vectorization.
 *
 * Both functions take the block, the loop to vectorize, the SIMD target
 * and an Arena.
 * NOTE(review): presumably opt_vectorize_reduce() is the variant for
 * loops with ParLoopInfo.is_reduce set, and callers are expected to check
 * ParLoopInfo.can_vectorize first — neither is visible here; confirm.
 */
void opt_vectorize(IRBlock *bb, ParLoopInfo *loop, SIMDTarget target, Arena *arena);
void opt_vectorize_reduce(IRBlock *bb, ParLoopInfo *loop, SIMDTarget target, Arena *arena);

/*
 * Pass 5: Memory optimizations.
 *
 * Both functions take the IR block to work on and an Arena.
 * NOTE(review): descriptions are inferred from the function names only —
 * confirm against the implementations.
 */
/* Prefetch insertion. */
void opt_insert_prefetch(IRBlock *bb, Arena *arena);
/* Data alignment. */
void opt_align_data(IRBlock *bb, Arena *arena);

/* ── Utility ─────────────────────────────────────────────── */
/*
 * Width for a SIMD target and element type.
 * NOTE(review): presumably the number of `elem_type` lanes per vector
 * (the SIMDTarget comments list e.g. 4 floats for SSE). The unit and the
 * value returned for SIMD_NONE are not visible here — confirm.
 */
int simd_width(SIMDTarget target, TypeKind elem_type);
/*
 * String for a SIMD target. The result is const char *, so callers must
 * not modify it; presumably a static name string — confirm.
 */
const char *simd_target_str(SIMDTarget t);

#endif /* P2S_OPTIMIZE_H */