#!/usr/bin/env python3
"""
Phase 1: Open-ended inference on cluster (multi-GPU, no internet needed).

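Runs the model configured as SFT_MODEL on the 1533 open-ended physics test set,
sharding the samples across GPUs 0-7 (BASE_MODEL and the BASE_GPUS/SFT_GPUS
split defined below are not used by main()).
Saves raw model outputs for later judging.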

Usage (inside Docker container):
    cd /tmp && python3 /path/to/eval_openended_inference.py

Output:
    <OUTPUT_DIR>/<output name>.jsonl
    (one merged JSONL file; <output name> is the OUTPUT_NAME_PLACEHOLDER value
    in main(), and ".jsonl" is appended only if the name lacks that suffix)
"""
import os
import sys
import json
import re
import time
import torch
import multiprocessing as mp
from collections import Counter

# ============ CONFIG ============
# Put the Hugging Face libraries into offline mode; the cluster has no internet
# access (see the module docstring). These are set at import time, before the
# worker functions import `transformers`.
os.environ["HF_HUB_OFFLINE"] = "1"
os.environ["TRANSFORMERS_OFFLINE"] = "1"

# Local path of the base checkpoint. Not referenced by any function in this
# script as shown.
BASE_MODEL = "/workspace/rl4phyx/models/Qwen2.5-VL-3B-Instruct"
# Placeholder value; presumably substituted with a real checkpoint path before
# the script is run. This is the model main() evaluates.
SFT_MODEL = "MODEL_PATH_PLACEHOLDER"
# JSONL test set read by load_test_data().
TEST_FILE = "/workspace/rl4phyx/RL4Phyx/SFT/sft_eval_footprint/test_1533_openended.jsonl"
# Directory main() writes the result file into.
OUTPUT_DIR = "/workspace/rl4phyx/RL4Phyx/SFT/sft_eval_footprint"
# Directory that each sample's 'image' field is joined onto in worker_inference().
IMAGE_DIR = "/workspace/rl4phyx/RL4Phyx/MetaPhyX/test_images"

# Multi-GPU config: base on GPUs 0-3, SFT on GPUs 4-7
# NOTE: main() does not read these module-level lists; it builds its own list
# covering GPUs 0-7.
BASE_GPUS = [0, 1, 2, 3]
SFT_GPUS = [4, 5, 6, 7]
# Passed to model.generate() as max_new_tokens in worker_inference().
MAX_NEW_TOKENS = 2048
# ================================


def load_test_data():
    """Read TEST_FILE and return its records as a list of dicts.

    The file is parsed as JSONL: every line that is not blank (after
    stripping whitespace) is decoded with ``json.loads``. Blank lines are
    skipped.
    """
    with open(TEST_FILE, 'r', encoding='utf-8') as handle:
        return [json.loads(row) for row in handle if row.strip()]


def build_open_ended_prompt(sample):
    """Return the open-ended (no multiple-choice options) prompt for a sample.

    The prompt is four sections separated by blank lines: a fixed
    instruction, the sample's 'description', its 'question', and a fixed
    request to reason step by step and box the final answer. Missing keys
    are rendered as empty strings. Surrounding whitespace is stripped from
    the assembled prompt.
    """
    description = sample.get('description', '')
    question_text = sample.get('question', '')

    sections = (
        "Look at the image and answer the physics question.",
        f"{description}",
        f"{question_text}",
        "Please reason step by step, and put your final answer within \\boxed{}.",
    )
    return "\n\n".join(sections).strip()


def worker_inference(gpu_id, model_path, samples, output_file, model_name):
    """Worker: load the model on one GPU and run inference on assigned samples.

    Used as the ``multiprocessing.Process`` target in run_model_parallel();
    the heavy imports are done inside the function so they happen in the
    worker process.

    Args:
        gpu_id: CUDA device index; the model is placed on ``cuda:{gpu_id}``.
        model_path: Local path of the checkpoint (loaded with
            ``local_files_only=True``).
        samples: List of sample dicts. Each must provide 'index', 'image',
            'category', 'question' and 'ground_truth_value'; 'description',
            'subfield' and 'ground_truth_letter' are optional.
        output_file: Path of the JSONL file this worker writes.
        model_name: Label used in log lines and stored in every result.

    Returns:
        The number of result records written.

    Each record is written and flushed as soon as it is produced, so a crash
    or kill part-way through the shard keeps the results computed so far
    instead of losing the whole shard.
    """
    import torch
    from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
    from qwen_vl_utils import process_vision_info

    device = f"cuda:{gpu_id}"
    print(f"[{model_name}][GPU {gpu_id}] Loading model...", flush=True)

    processor = AutoProcessor.from_pretrained(
        model_path,
        min_pixels=3136,
        max_pixels=200704,
        local_files_only=True,
        trust_remote_code=True,
    )
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        model_path,
        torch_dtype=torch.bfloat16,
        device_map=device,
        local_files_only=True,
        trust_remote_code=True,
    )
    model.eval()
    total = len(samples)
    print(f"[{model_name}][GPU {gpu_id}] Model loaded. Processing {total} samples.", flush=True)

    written = 0
    # The file is opened only after the model has loaded, so a failed load
    # leaves no output file behind (same as before).
    with open(output_file, 'w', encoding='utf-8') as out:
        for i, sample in enumerate(samples):
            idx = sample['index']
            prompt_text = build_open_ended_prompt(sample)
            image_path = os.path.join(IMAGE_DIR, sample['image'])

            # Single-turn chat message: one image followed by the text prompt.
            messages = [{
                "role": "user",
                "content": [
                    {"type": "image", "image": f"file://{image_path}"},
                    {"type": "text", "text": prompt_text},
                ],
            }]

            try:
                text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
                image_inputs, video_inputs = process_vision_info(messages)
                inputs = processor(
                    text=[text],
                    images=image_inputs,
                    videos=video_inputs,
                    padding=True,
                    return_tensors="pt",
                ).to(device)

                with torch.no_grad():
                    output_ids = model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS)

                # Drop the prompt tokens so only the newly generated tokens are decoded.
                generated = output_ids[0][inputs.input_ids.shape[1]:]
                response = processor.decode(generated, skip_special_tokens=True)
            except Exception as e:
                # Best effort: one bad sample must not abort the shard. The
                # error is kept in the record and also logged so it is visible
                # during the run.
                print(f"[{model_name}][GPU {gpu_id}] sample {idx} failed: {e}", flush=True)
                response = f"ERROR: {str(e)}"

            result = {
                "index": idx,
                "category": sample['category'],
                "subfield": sample.get('subfield', ''),
                "question": sample['question'],
                "ground_truth_value": sample['ground_truth_value'],
                "ground_truth_letter": sample.get('ground_truth_letter', ''),
                "model_output": response,
                "model_name": model_name,
                "gpu_id": gpu_id,
            }
            out.write(json.dumps(result, ensure_ascii=False) + '\n')
            out.flush()
            written += 1

            if (i + 1) % 20 == 0 or (i + 1) == total:
                print(f"[{model_name}][GPU {gpu_id}] {i+1}/{total} done", flush=True)

    print(f"[{model_name}][GPU {gpu_id}] Saved {written} results to {output_file}", flush=True)
    return written


def run_model_parallel(model_path, model_name, gpu_ids, samples, output_base):
    """Split samples across GPUs and run one worker process per GPU.

    Samples are cut into contiguous chunks of ``ceil(len(samples) / len(gpu_ids))``
    items; GPUs whose chunk would be empty get no process. Each worker writes
    ``{output_base}_gpu{gpu_id}.jsonl``.

    Args:
        model_path: Checkpoint path forwarded to worker_inference().
        model_name: Label forwarded to worker_inference() and used in logs.
        gpu_ids: CUDA device indices to use.
        samples: List of sample dicts to distribute.
        output_base: Path prefix for the per-GPU output files.

    Returns:
        The list of per-GPU output file paths, one per started worker, in
        ``gpu_ids`` order. A path is listed even if its worker failed.

    Raises:
        ValueError: If there are samples to process but ``gpu_ids`` is empty.
    """
    n = len(samples)
    if n == 0:
        # Nothing to do; also avoids dividing by zero when gpu_ids is empty.
        return []
    k = len(gpu_ids)
    if k == 0:
        raise ValueError("gpu_ids must not be empty when there are samples to process")
    chunk_size = (n + k - 1) // k

    jobs = []
    for i, gpu_id in enumerate(gpu_ids):
        chunk = samples[i * chunk_size: (i + 1) * chunk_size]
        if not chunk:
            continue
        out_file = f"{output_base}_gpu{gpu_id}.jsonl"
        proc = mp.Process(
            target=worker_inference,
            args=(gpu_id, model_path, chunk, out_file, model_name)
        )
        jobs.append((gpu_id, out_file, proc))

    # Start every worker before joining any, so they run concurrently.
    for _, _, proc in jobs:
        proc.start()
    for gpu_id, _, proc in jobs:
        proc.join()
        # A crashed worker would otherwise go unnoticed and its shard would be
        # silently missing from the merged output. Warn rather than raise so
        # the other shards are still usable.
        if proc.exitcode != 0:
            print(
                f"[{model_name}][GPU {gpu_id}] WARNING: worker exited with code {proc.exitcode}",
                flush=True,
            )

    return [out_file for _, out_file, _ in jobs]


def merge_results(output_files, final_output):
    """Combine per-GPU JSONL files into one file ordered by 'index'.

    Every path in ``output_files`` that exists is read (blank lines are
    skipped). The records are sorted by their 'index' key and written to
    ``final_output`` as JSONL. Afterwards each path in ``output_files`` that
    exists is deleted. Paths that do not exist are ignored.

    Returns the sorted list of record dicts.
    """
    records = []
    for path in output_files:
        if not os.path.exists(path):
            continue
        with open(path, 'r', encoding='utf-8') as src:
            records.extend(json.loads(raw) for raw in src if raw.strip())

    # Keep output order independent of how samples were sharded.
    records.sort(key=lambda rec: rec['index'])

    with open(final_output, 'w', encoding='utf-8') as dst:
        dst.writelines(json.dumps(rec, ensure_ascii=False) + '\n' for rec in records)

    # Remove the per-GPU shards now that they are merged.
    for path in output_files:
        if os.path.exists(path):
            os.remove(path)

    return records


def main():
    """Run SFT_MODEL over the test set on GPUs 0-7 and write one merged file.

    Steps:
      1. Load the test samples with load_test_data().
      2. Shard them across GPUs 0-7 with run_model_parallel(), which writes
         one ``<output>_gpu<N>.jsonl`` file per worker.
      3. Merge the per-GPU files with merge_results() into the final JSONL
         file (sorted by 'index'; the per-GPU files are removed).

    The merge always runs, so a result file left over from an earlier run is
    replaced rather than being reported as the outcome of this run.
    """
    samples = load_test_data()
    print(f"Loaded {len(samples)} test samples")
    print(f"Model: {SFT_MODEL}")

    sft_output = os.path.join(OUTPUT_DIR, "OUTPUT_NAME_PLACEHOLDER")
    # All eight GPUs are used for this single model; a local name is used so
    # the module-level SFT_GPUS constant is not shadowed.
    gpu_ids = list(range(8))
    output_files = run_model_parallel(SFT_MODEL, "sft", gpu_ids, samples, sft_output)

    # Append ".jsonl" only when the configured name does not already end with it.
    sft_final = sft_output if sft_output.endswith(".jsonl") else sft_output + ".jsonl"
    results = merge_results(output_files, sft_final)

    count = len(results)
    if count != len(samples):
        # Usually means a worker died before finishing its shard.
        print(f"WARNING: expected {len(samples)} results but merged {count}", flush=True)
    print(f"Total: {count} results -> {sft_final}")


if __name__ == "__main__":
    main()