""" YOFO Benchmark Script. This script runs a rigorous comparison between YOFO and standard baselines. It measures: 1. Latency (Time per example) 2. Token Usage (Input + Output tokens) 3. Extrapolated Cost (Based on GPT-4 pricing) Baselines: - YOFO (Ours): Single forward pass - N-Call Judge: 12 separate API calls (one per requirement) - CoT Judge: 1 call generating detailed reasoning """ import time import torch import pandas as pd from transformers import AutoTokenizer, AutoModelForCausalLM from tqdm import tqdm import sys import os # Add src to path sys.path.append(os.getcwd()) from src.data.template import YOFOTemplateBuilder, YOFO_REQS, REQ_QUESTIONS # Pricing constants (GPT-4 Turbo Pricing - Nov 2024) PRICE_INPUT_1K = 0.01 PRICE_OUTPUT_1K = 0.03 class Benchmark: def __init__(self, model_id="Qwen/Qwen2.5-1.5B-Instruct"): self.device = "cuda" if torch.cuda.is_available() else "cpu" print(f"Initializing benchmark on {self.device}...") self.tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) self.model = AutoModelForCausalLM.from_pretrained( model_id, torch_dtype=torch.float16 if self.device == "cuda" else torch.float32, device_map=self.device, trust_remote_code=True ) self.model.eval() self.builder = YOFOTemplateBuilder(self.tokenizer) def _count_tokens(self, text): return len(self.tokenizer.encode(text, add_special_tokens=False)) def benchmark_yofo(self, prompt, response, n_repeats=5): """Measure YOFO performance (Single Forward Pass).""" # Prepare Input yofo_input = self.builder.build_template(prompt, response) # Count actual non-pad tokens actual_tokens = yofo_input.attention_mask.sum().item() print(f"DEBUG: YOFO actual tokens: {actual_tokens}") input_ids = yofo_input.input_ids.unsqueeze(0).to(self.device) # Warmup with torch.no_grad(): self.model(input_ids) # Timing latencies = [] for _ in range(n_repeats): start = time.time() with torch.no_grad(): self.model(input_ids) latencies.append(time.time() - start) avg_latency = sum(latencies) / len(latencies) return { "method": "YOFO (Ours)", "latency_ms": avg_latency * 1000, "input_tokens": actual_tokens, "output_tokens": 0, "calls": 1 } def benchmark_n_call(self, prompt, response, n_repeats=1): """Measure N-Call Baseline (12 separate calls).""" total_input_tokens = 0 total_output_tokens = 12 base_context = f"User: {prompt}\nResponse: {response}\n" base_tokens = self._count_tokens(base_context) print(f"DEBUG: Base context tokens: {base_tokens}") for req in YOFO_REQS: question_text = f"Question: {REQ_QUESTIONS[req]}\nAnswer (Yes/No):" q_tokens = self._count_tokens(question_text) total_input_tokens += (base_tokens + q_tokens) print(f"DEBUG: N-Call total input tokens: {total_input_tokens}") # Timing (Simulate 1 call * 12) sample_text = f"{base_context}Question: Violence?\nAnswer (Yes/No):" inputs = self.tokenizer(sample_text, return_tensors="pt").to(self.device) start = time.time() with torch.no_grad(): self.model.generate(**inputs, max_new_tokens=1) one_call_time = time.time() - start return { "method": "N-Call Baseline", "latency_ms": one_call_time * 12 * 1000, "input_tokens": total_input_tokens, "output_tokens": total_output_tokens, "calls": 12 } def benchmark_cot(self, prompt, response): """Measure CoT Baseline (Generate reasoning).""" # Prompt asking for reasoning text = f"User: {prompt}\nResponse: {response}\nAnalyze safety for all 12 requirements and explain your reasoning:" input_tokens = self._count_tokens(text) inputs = self.tokenizer(text, return_tensors="pt").to(self.device) # Measure generation time for 
~100 tokens of reasoning start = time.time() with torch.no_grad(): output = self.model.generate(**inputs, max_new_tokens=100) latency = time.time() - start output_tokens = len(output[0]) - len(inputs['input_ids'][0]) return { "method": "Chain-of-Thought", "latency_ms": latency * 1000, "input_tokens": input_tokens, "output_tokens": output_tokens, "calls": 1 } def calculate_cost(row): """Calculate cost per 1k evaluations based on tokens.""" cost_in = (row['input_tokens'] / 1000) * PRICE_INPUT_1K cost_out = (row['output_tokens'] / 1000) * PRICE_OUTPUT_1K total_cost_per_eval = cost_in + cost_out return total_cost_per_eval * 1000 # Per 1k evals def run_benchmark(): bench = Benchmark() # LONG CONTEXT Example (Realistic Chat) # This mimics a typical user query + long model explanation prompt = "Explain the history of the Roman Empire and its fall." * 10 # ~100 tokens response = "The Roman Empire was one of the largest... " * 20 # ~200 tokens # Total context approx 300-400 tokens. # This will show the penalty of repeating it 12 times. print("\nRunning benchmarks with Long Context (~400 tokens)...") results = [] # 1. Run YOFO print("1. Benchmarking YOFO...") results.append(bench.benchmark_yofo(prompt, response)) # 2. Run N-Call print("2. Benchmarking N-Call Baseline...") results.append(bench.benchmark_n_call(prompt, response)) # 3. Run CoT print("3. Benchmarking CoT Baseline...") results.append(bench.benchmark_cot(prompt, response)) # Analysis df = pd.DataFrame(results) df['cost_per_1k_evals'] = df.apply(calculate_cost, axis=1) df['speedup_vs_baseline'] = df.loc[1, 'latency_ms'] / df['latency_ms'] print("\n" + "="*80) print("BENCHMARK RESULTS (Extrapolated to GPT-4 Pricing)") print("="*80) print(df[['method', 'input_tokens', 'latency_ms', 'cost_per_1k_evals', 'speedup_vs_baseline']].to_string(index=False)) # Save results df.to_csv("benchmark_results.csv", index=False) print("\nSaved results to benchmark_results.csv") if __name__ == "__main__": run_benchmark()
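
# ---------------------------------------------------------------------------
# Worked example of the cost extrapolation (illustrative, hypothetical numbers
# only, not measured results). Assuming an N-Call-style row with roughly 4,800
# input tokens (e.g. a ~400-token context repeated across 12 calls) and 12
# output tokens, the GPT-4 Turbo prices above give:
#
#   (4800 / 1000) * 0.01 + (12 / 1000) * 0.03 = 0.04836 USD per evaluation
#
# i.e. about $48.36 per 1k evaluations, which is what calculate_cost returns
# for such a row (it accepts any mapping with these two keys):
#
#   >>> calculate_cost({"input_tokens": 4800, "output_tokens": 12})
#   48.36
# ---------------------------------------------------------------------------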