#!/usr/bin/env python3
"""
Evaluate ByT5 + XLM-RoBERTa reranker on Indo NLP Sinhala test sets.
Test Set 1: 10K formal sentences
Test Set 2: 5K informal sentences (ad-hoc, colloquial)
"""

import sys
import os
from pathlib import Path

# Add project root to path
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root))

import torch
import pandas as pd
import numpy as np
from collections import defaultdict

# Import our models
from core.decoder import BeamSearchDecoder

def load_test_set(filepath, max_samples=None):
    """
    Read an Indo NLP test file into a list of input/reference pairs.

    Blank (whitespace-only) lines are discarded first; the remaining lines are
    then consumed two at a time as (Singlish input, expected Sinhala output).
    A trailing line without a partner is ignored.

    Args:
        filepath: Path of the UTF-8 encoded test file.
        max_samples: Stop after this many pairs. A falsy value (None or 0)
            means "no limit".

    Returns:
        List of dicts with the keys 'singlish' and 'expected'.
    """
    with open(filepath, 'r', encoding='utf-8') as handle:
        stripped = (raw.strip() for raw in handle)
        content = [text for text in stripped if text]

    pairs = []
    # Even positions hold the inputs, odd positions the references; zip()
    # silently drops an unpaired final line.
    for source, target in zip(content[0::2], content[1::2]):
        pairs.append({'singlish': source, 'expected': target})
        if max_samples and len(pairs) >= max_samples:
            break

    return pairs

def _edit_distance(hypothesis, reference):
    """
    Return the Levenshtein distance between two sequences.

    Works on any pair of indexable sequences with comparable items (strings
    for character-level distance, lists of tokens for word-level distance).
    Insertions, deletions and substitutions each cost 1.
    """
    # The distance is symmetric, so iterate over the longer sequence and keep
    # the DP row as short as possible.
    if len(hypothesis) < len(reference):
        hypothesis, reference = reference, hypothesis

    previous = list(range(len(reference) + 1))
    for i, hyp_item in enumerate(hypothesis, start=1):
        current = [i]
        for j, ref_item in enumerate(reference, start=1):
            current.append(min(
                previous[j] + 1,                             # deletion
                current[j - 1] + 1,                          # insertion
                previous[j - 1] + (hyp_item != ref_item),    # substitution / match
            ))
        previous = current
    return previous[-1]


def compute_cer(predicted, expected):
    """
    Character Error Rate: edit distance divided by the reference length.

    Characters are Python string elements (Unicode code points). The value is
    0.0 for a perfect match and may exceed 1.0 when the prediction is much
    longer than the reference.

    Args:
        predicted: Hypothesis string.
        expected: Reference string.

    Returns:
        The CER as a float. With an empty reference the result is 1.0 if
        anything was predicted and 0.0 otherwise.
    """
    if not expected:
        return 1.0 if predicted else 0.0

    # Real character-level edit distance; difflib's similarity ratio is not an
    # error rate and its autojunk heuristic skews results on long inputs.
    return _edit_distance(predicted, expected) / len(expected)


def compute_wer(predicted, expected):
    """
    Word Error Rate over whitespace-separated tokens.

    Computed as the token-level edit distance divided by the number of
    reference tokens; like CER it may exceed 1.0.

    Args:
        predicted: Hypothesis string.
        expected: Reference string.

    Returns:
        The WER as a float. With no reference tokens the result is 1.0 if
        any token was predicted and 0.0 otherwise.
    """
    pred_words = predicted.split()
    exp_words = expected.split()

    if not exp_words:
        return 1.0 if pred_words else 0.0

    return _edit_distance(pred_words, exp_words) / len(exp_words)

def compute_em(predicted, expected):
    """Exact-match score: 1.0 when prediction equals the reference, else 0.0."""
    if predicted == expected:
        return 1.0
    return 0.0

def compute_bleu(predicted, expected, n=4):
    """
    Simple BLEU approximation: clipped unigram overlap with the reference.

    This is NOT full BLEU (no higher-order n-grams, no brevity penalty). It
    counts how many predicted tokens also occur in the reference, clipping
    each token's count at its count in the reference, and divides by the
    number of reference tokens. The result is therefore always in [0.0, 1.0].

    Args:
        predicted: Hypothesis string (whitespace-tokenised).
        expected: Reference string (whitespace-tokenised).
        n: Accepted for interface compatibility; currently unused because
            only unigrams are compared.

    Returns:
        The overlap score as a float. With an empty reference the result is
        1.0 if the prediction is empty too and 0.0 otherwise.
    """
    from collections import Counter

    pred_tokens = predicted.split()
    exp_tokens = expected.split()

    if not exp_tokens:
        return 1.0 if not pred_tokens else 0.0

    # Multiset intersection clips every token at its reference count, so a
    # prediction that repeats a reference word cannot score above 1.0.
    overlap = Counter(pred_tokens) & Counter(exp_tokens)
    return sum(overlap.values()) / len(exp_tokens)

def evaluate_samples(decoder, samples, device, batch_size=8):
    """
    Run the decoder over every sample and score each prediction.

    Samples are decoded one at a time via ``decoder.decode``; the first
    element of its 3-tuple return value is taken as the prediction. Any
    exception raised while decoding or scoring a sample is reported on stdout
    and recorded as a worst-case row instead of aborting the run.

    Args:
        decoder: Object exposing ``decode(text)``.
        samples: Iterable of dicts with 'singlish' and 'expected' keys.
        device: Not used by this function.
        batch_size: Not used by this function.

    Returns:
        One dict per sample with the keys 'singlish', 'expected',
        'predicted', 'cer', 'wer', 'bleu' and 'em'.
    """
    total = len(samples)
    results = []

    for idx, sample in enumerate(samples):
        source_text = sample['singlish']
        reference = sample['expected']

        # Progress heartbeat every 10 samples.
        if idx % 10 == 0:
            print(f"  Progress: {idx}/{total}", flush=True)

        try:
            # BeamSearchDecoder performs ByT5 decoding plus MLM reranking.
            predicted, _trace, _ = decoder.decode(source_text)
            outcome = {
                'predicted': predicted,
                'cer': compute_cer(predicted, reference),
                'wer': compute_wer(predicted, reference),
                'bleu': compute_bleu(predicted, reference),
                'em': compute_em(predicted, reference),
            }
        except Exception as err:
            print(f"  Error at {idx}/{total} processing '{source_text}': {err}")
            # Failed samples are kept and scored as completely wrong.
            outcome = {
                'predicted': '[ERROR]',
                'cer': 1.0,
                'wer': 1.0,
                'bleu': 0.0,
                'em': 0,
            }

        results.append({'singlish': source_text, 'expected': reference, **outcome})

    print(f"  Completed: {total}/{total}", flush=True)
    return results

def print_metrics(results, subset_name):
    """
    Print an aggregate report for one subset of evaluation results.

    Shows mean and standard deviation (pandas' default ``Series.std``) for
    CER, WER and BLEU, the exact-match rate with its raw count, and up to
    three examples whose 'em' value is 0.

    Args:
        results: List of per-sample result dicts (as built by the evaluator).
        subset_name: Label used in the report header.
    """
    if not results:
        print(f"{subset_name}: No results")
        return

    frame = pd.DataFrame(results)
    count = len(results)
    rule = '=' * 60
    cer, wer, bleu, em = frame['cer'], frame['wer'], frame['bleu'], frame['em']

    print(f"\n{rule}")
    print(f"Subset: {subset_name} (n={count})")
    print(f"{rule}")
    print(f"CER (lower is better):  {cer.mean():.4f} ± {cer.std():.4f}")
    print(f"WER (lower is better):  {wer.mean():.4f} ± {wer.std():.4f}")
    print(f"BLEU (higher is better): {bleu.mean():.4f} ± {bleu.std():.4f}")
    print(f"EM (higher is better):   {em.mean():.4f} ({int(em.sum())} / {count})")

    # A few non-exact-match examples for quick eyeballing.
    missed = frame.loc[em == 0].head(3)
    if missed.empty:
        return

    print(f"\nSample Failures (first 3):")
    for source, reference, hypothesis in zip(
        missed['singlish'], missed['expected'], missed['predicted']
    ):
        print(f"  Input:    {source}")
        print(f"  Expected: {reference}")
        print(f"  Got:      {hypothesis}")
        print()

def _mean_or_nan(results, metric):
    """Format the mean of *metric* over *results* with 4 decimals ("nan" if empty)."""
    # pd.DataFrame([]) has no columns, so indexing it by metric would raise
    # KeyError; an empty subset is reported as "nan" instead.
    if not results:
        return "nan"
    return f"{pd.DataFrame(results)[metric].mean():.4f}"


def main():
    """
    Evaluate the decoder on both Indo NLP Sinhala test sets and save reports.

    Command line (both arguments optional):
        argv[1]: maximum number of formal samples to evaluate
        argv[2]: maximum number of informal samples to evaluate

    Test files are read from "IndoNLP-2025-Shared-Task/Test Dataset/Sinhala"
    and results are written to "misc/", both relative to the current working
    directory. Per-sample results go to misc/indo_nlp_eval_results.csv and
    the aggregate table to misc/indo_nlp_eval_summary.csv.

    Raises:
        ValueError: If a supplied sample limit is not an integer.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using device: {device}")

    # Parse command line args for sample limits
    max_formal = int(sys.argv[1]) if len(sys.argv) > 1 else None
    max_informal = int(sys.argv[2]) if len(sys.argv) > 2 else None

    # Create the output directory up front so a missing folder cannot make the
    # final CSV writes fail after the (slow) evaluation has already finished.
    Path("misc").mkdir(parents=True, exist_ok=True)

    # Initialize model
    print("Loading BeamSearchDecoder (ByT5 + MLM reranker)...")
    decoder = BeamSearchDecoder(device=device)

    # Load test sets
    test_dir = Path("IndoNLP-2025-Shared-Task/Test Dataset/Sinhala")

    print("\nLoading Test Set 1 (formal, 10K)...")
    formal_samples = load_test_set(test_dir / "Sinhala Test set 1.txt", max_samples=max_formal)
    print(f"Loaded {len(formal_samples)} formal samples")

    print("Loading Test Set 2 (informal, 5K)...")
    informal_samples = load_test_set(test_dir / "Sinhala Test set 2.txt", max_samples=max_informal)
    print(f"Loaded {len(informal_samples)} informal samples")

    # Evaluate
    print("\n" + "="*60)
    print(f"EVALUATING FORMAL SUBSET ({len(formal_samples)} samples)")
    print("="*60)
    formal_results = evaluate_samples(decoder, formal_samples, device)

    print("\n" + "="*60)
    print(f"EVALUATING INFORMAL SUBSET ({len(informal_samples)} samples)")
    print("="*60)
    informal_results = evaluate_samples(decoder, informal_samples, device)

    # Print results
    print_metrics(formal_results, f"Formal ({len(formal_results)})")
    print_metrics(informal_results, f"Informal ({len(informal_results)})")

    # Overall
    all_results = formal_results + informal_results
    print_metrics(all_results, f"OVERALL ({len(all_results)} samples)")

    # Save detailed results
    results_df = pd.DataFrame(all_results)
    results_df.to_csv("misc/indo_nlp_eval_results.csv", index=False)
    print(f"\nDetailed results saved to: misc/indo_nlp_eval_results.csv")

    # Save summary: one row per subset, one column per metric.
    subsets = [
        (f'Formal ({len(formal_results)})', formal_results),
        (f'Informal ({len(informal_results)})', informal_results),
        (f'Overall ({len(all_results)})', all_results),
    ]
    summary = {'Subset': [label for label, _ in subsets]}
    for column, metric in (('CER', 'cer'), ('WER', 'wer'), ('BLEU', 'bleu'), ('EM', 'em')):
        summary[column] = [_mean_or_nan(rows, metric) for _, rows in subsets]

    summary_df = pd.DataFrame(summary)
    summary_df.to_csv("misc/indo_nlp_eval_summary.csv", index=False)
    print(f"Summary saved to: misc/indo_nlp_eval_summary.csv")

# Run the evaluation only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()