#!/usr/bin/env python3
import os
import sys
import time
import subprocess
import argparse


_SCRIPTS_DIR = os.path.dirname(os.path.abspath(__file__))
_PROJECT_ROOT = os.path.dirname(_SCRIPTS_DIR)

def check_artifacts_up_to_date(precomputed_dir: str, candidates_path: str) -> bool:
    """Check if precomputed artifacts exist and are newer than candidates.jsonl."""
    required_files = [
        "bm25_index.pkl",
        "candidate_ids.pkl",
        "lgbm_model.pkl",
        "static_features.pkl",
        "vocab.pkl",
        "bm25_matrix.npz",
        "candidate_offsets.pkl",
        "lgbm_model.txt"
    ]
    for f in required_files:
        fpath = os.path.join(precomputed_dir, f)
        if not os.path.isfile(fpath):
            return False
        
        #  mtime vs candidates.jsonl
        if os.path.isfile(candidates_path):
            if os.path.getmtime(fpath) < os.path.getmtime(candidates_path):
                return False
    return True

def run_step(command_list, step_label, step_num):
    print(f"\n[{step_num}/3] Running {step_label}...")
    t0 = time.time()
    
    # process
    result = subprocess.run(command_list, capture_output=True, text=True)
    
    elapsed = time.time() - t0
    
    if result.returncode != 0:
        print(f"\n[ERROR] Step {step_num}/3 ({step_label}) FAILED (Exit Code: {result.returncode})")
        print("--- STDOUT ---")
        print(result.stdout)
        print("--- STDERR ---")
        print(result.stderr)
        sys.exit(result.returncode)
    
    print(result.stdout.strip())
    print(f"[{step_num}/3] {step_label.capitalize()} complete ({elapsed:.2f}s)")
    return elapsed

def main():
    parser = argparse.ArgumentParser(description="Redrob Ranking Pipeline Runner")
    parser.add_argument("--candidates", default="./candidates.jsonl", help="Path to candidates JSONL")
    parser.add_argument("--out", default="./CTRL_COFFEE_REPEAT.csv", help="Path to output CSV")
    parser.add_argument("--force-precompute", action="store_true", help="Force rebuild precompute artifacts")
    args = parser.parse_args()

    candidates_path = os.path.abspath(args.candidates)
    out_path = os.path.abspath(args.out)
    precomputed_dir = os.path.join(_PROJECT_ROOT, "precomputed")
    
    t_start = time.time()
    
    artifacts_ready = check_artifacts_up_to_date(precomputed_dir, candidates_path)
    
    python_exe = sys.executable

    t_precompute = 0.0
    if not artifacts_ready or args.force_precompute:
        cmd = [python_exe, "scripts/precompute.py", "--candidates", candidates_path, "--base-dir", _PROJECT_ROOT]
        t_precompute = run_step(cmd, "precompute", 1)
    else:
        print("\n[1/3] Precompute skipped (artifacts up to date)")

    # rank
    cmd = [python_exe, "src/rank.py", "--candidates", candidates_path, "--out", out_path, "--base-dir", _PROJECT_ROOT]
    t_rank = run_step(cmd, "rank", 2)

    # validate
    cmd = [python_exe, "scripts/validate_submission.py", "--submission", out_path]
    t_validate = run_step(cmd, "validate_submission", 3)

    total_wall = time.time() - t_start
    
    print("\n" + "=" * 60)
    print("PIPELINE EXECUTION SUMMARY")
    print("=" * 60)
    print(f"  Total Clock Time: {total_wall:.2f} seconds")
    print(f"  Step 1 (Precompute):   {t_precompute:.2f}s" if t_precompute > 0 else "  Step 1 (Precompute):   Skipped (up to date)")
    print(f"  Step 2 (Ranking):      {t_rank:.2f}s")
    print(f"  Step 3 (Validation):   {t_validate:.2f}s")
    
    
    if os.path.isfile(out_path):
        try:
            import pandas as pd
            df = pd.read_csv(out_path)
            if len(df) == 100:
                print("  CONFIRMED:             submission.csv exists with exactly 100 rows.")
            else:
                print(f"  [ERROR]                submission.csv has {len(df)} rows, expected exactly 100!")
                sys.exit(1)
        except Exception as e:
            print(f"  [ERROR]                Error reading CSV: {e}")
            sys.exit(1)
    else:
        print("  [ERROR]                Missing output file submission.csv!")
        sys.exit(1)
    
    log_dir = os.path.join(_PROJECT_ROOT, "logs")
    if os.path.isdir(log_dir):
        logs = [os.path.join(log_dir, f) for f in os.listdir(log_dir) if f.startswith("rank_")]
        if logs:
            latest_log = max(logs, key=os.path.getmtime)
            print(f"  Latest Log File:       {latest_log}")
            
    print("=" * 60)

if __name__ == "__main__":
    main()