#!/usr/bin/env python
"""Benchmark ASR models on the Common Voice Danish dataset.

This script evaluates hvisketiske-v2 (Qwen3-ASR) and hviske-v3 (Whisper)
on the Mozilla Common Voice Danish test set for comparison.

IMPORTANT: Common Voice requires authentication and agreement to terms of use.
Before running this script:
1. Create a HuggingFace account at https://huggingface.co
2. Visit https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0
3. Agree to the dataset terms of use
4. Create an access token at https://huggingface.co/settings/tokens
5. Login via CLI: `huggingface-cli login`

Usage:
    # After logging in:
    python huggingface/evaluate_common_voice.py \\
        --hvisketiske-path ./outputs/hvisketiske-v2/checkpoint-23448 \\
        --max-samples 1000 \\
        --output-file ./results/common_voice_comparison.json

    # Quick test with fewer samples:
    python huggingface/evaluate_common_voice.py --max-samples 100

    # Use specific token:
    python huggingface/evaluate_common_voice.py --hf-token YOUR_TOKEN
"""

import argparse
import json
import sys
import tempfile
from dataclasses import dataclass
from pathlib import Path
from typing import List, Optional

import soundfile as sf
from datasets import load_dataset
from jiwer import cer, wer
from tqdm import tqdm

# Add src to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))

from hvisketiske.evaluation.model_adapters import (
    ASRModelAdapter,
    HviskeV3Adapter,
    Qwen3ASRAdapter,
    TranscriptionResult,
)
from hvisketiske.evaluation.timing import AggregatedTimingStats

# Substrings of a load_dataset() error message that indicate an
# authentication / gated-dataset failure.  "EmptyDatasetError" and
# "doesn't contain any data" are what `datasets` surfaces for
# unauthenticated access; "gated", "401" and "403" cover explicit
# gated-repo / HTTP authorization errors.
_AUTH_ERROR_MARKERS = (
    "EmptyDatasetError",
    "doesn't contain any data",
    "gated",
    "401",
    "403",
)


@dataclass
class CommonVoiceSample:
    """A single Common Voice sample."""

    # Path to the decoded WAV file on disk.
    audio_path: str
    # Ground-truth transcription ("sentence" field of the dataset).
    reference: str
    # Clip duration in seconds (len(array) / sampling_rate).
    audio_duration: float


def load_common_voice_danish(
    split: str = "test",
    max_samples: Optional[int] = None,
    cache_dir: Optional[str] = None,
    hf_token: Optional[str] = None,
) -> List[CommonVoiceSample]:
    """Load Common Voice Danish and materialize each clip as a WAV file.

    Args:
        split: Dataset split to load (test, validation, train).
        max_samples: Maximum number of samples to load.
        cache_dir: Directory to cache audio files. A fresh temporary
            directory is created if not provided (and intentionally not
            deleted, so files can be inspected or reused).
        hf_token: HuggingFace API token for authentication.

    Returns:
        List of CommonVoiceSample objects.

    Raises:
        Exception: Re-raises whatever ``load_dataset`` raised, after
            printing actionable help when the failure looks like an
            authentication / gated-dataset problem.
    """
    print(f"Loading Common Voice Danish ({split} split)...")
    print("Note: This requires HuggingFace authentication and agreement to dataset terms.")
    print("Visit: https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0")
    print()

    try:
        ds = load_dataset(
            "mozilla-foundation/common_voice_17_0",
            "da",
            split=split,
            trust_remote_code=True,
            token=hf_token,
        )
    except Exception as e:
        error_msg = str(e)
        # Show the login instructions for any auth-shaped failure, not only
        # the EmptyDatasetError shape (gated-repo 401/403 errors are the
        # common failure mode here).
        if any(marker in error_msg for marker in _AUTH_ERROR_MARKERS):
            print("\n" + "=" * 70)
            print("ERROR: Cannot access Common Voice dataset.")
            print("=" * 70)
            print("\nThis dataset requires authentication. Please:")
            print("1. Visit https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0")
            print("2. Log in and agree to the terms of use")
            print("3. Run: huggingface-cli login")
            print("4. Or pass --hf-token YOUR_TOKEN to this script")
            print("=" * 70 + "\n")
        raise

    if max_samples:
        ds = ds.select(range(min(max_samples, len(ds))))

    print(f"Loaded {len(ds)} samples")

    # Create temp directory for audio files if not provided.
    if cache_dir is None:
        cache_dir = tempfile.mkdtemp(prefix="cv_danish_")
    cache_path = Path(cache_dir)
    cache_path.mkdir(parents=True, exist_ok=True)

    samples = []
    print("Preparing audio files...")
    for i, item in enumerate(tqdm(ds, desc="Preparing samples")):
        # Extract the decoded audio array and its sample rate.
        audio_array = item["audio"]["array"]
        sample_rate = item["audio"]["sampling_rate"]

        # The model adapters consume file paths, so write each clip to WAV.
        audio_path = cache_path / f"sample_{i:06d}.wav"
        sf.write(str(audio_path), audio_array, sample_rate)

        samples.append(
            CommonVoiceSample(
                audio_path=str(audio_path),
                reference=item["sentence"],
                audio_duration=len(audio_array) / sample_rate,
            )
        )

    return samples


def normalize_text(text: str) -> str:
    """Normalize text for fair comparison (lowercase, collapse whitespace)."""
    text = text.lower()
    text = " ".join(text.split())
    return text


def evaluate_model(
    model: ASRModelAdapter,
    samples: List[CommonVoiceSample],
    warmup_samples: int = 3,
) -> dict:
    """Evaluate a model on the Common Voice samples.

    Args:
        model: Model adapter to evaluate.
        samples: List of samples to evaluate.
        warmup_samples: Number of warmup iterations (0 disables warmup).

    Returns:
        Dictionary with accuracy (WER/CER on normalized text) and
        performance (timing) results.
    """
    print(f"\nEvaluating: {model.model_name}")
    print("Loading model...")
    model.load()

    # Warmup so the first measured sample doesn't pay one-time costs
    # (CUDA kernels, caches, ...).
    if warmup_samples > 0 and samples:
        print(f"Running {warmup_samples} warmup iterations...")
        model.warmup(samples[0].audio_path, num_runs=warmup_samples)

    # Transcribe all samples, accumulating per-sample timings.
    predictions = []
    individual_times = []
    total_audio_duration = 0.0
    total_inference_time = 0.0

    print(f"Transcribing {len(samples)} samples...")
    for sample in tqdm(samples, desc=f"Evaluating {model.model_name[:30]}"):
        result: TranscriptionResult = model.transcribe(sample.audio_path)
        predictions.append(result.text)
        individual_times.append(result.inference_time_seconds)
        total_audio_duration += sample.audio_duration
        total_inference_time += result.inference_time_seconds

    # Normalize both sides identically so WER/CER compare content, not casing.
    predictions_norm = [normalize_text(p) for p in predictions]
    references_norm = [normalize_text(s.reference) for s in samples]

    word_error_rate = wer(references_norm, predictions_norm)
    char_error_rate = cer(references_norm, predictions_norm)

    timing_stats = AggregatedTimingStats(
        total_inference_time_seconds=total_inference_time,
        total_audio_duration_seconds=total_audio_duration,
        num_samples=len(samples),
        individual_times=individual_times,
    )

    return {
        "model_name": model.model_name,
        "model_size": model.model_size_params,
        "accuracy": {
            "wer": word_error_rate,
            "cer": char_error_rate,
        },
        "performance": {
            "total_inference_time_seconds": timing_stats.total_inference_time_seconds,
            "total_audio_duration_seconds": timing_stats.total_audio_duration_seconds,
            "real_time_factor": timing_stats.real_time_factor,
            "throughput_samples_per_second": timing_stats.throughput_samples_per_second,
            "mean_time_per_sample_seconds": timing_stats.mean_time_per_sample,
            "std_time_per_sample_seconds": timing_stats.std_time_per_sample,
        },
        "num_samples": len(samples),
    }


def print_summary(results: dict) -> None:
    """Print a formatted comparison summary of all evaluated models."""
    print("\n" + "=" * 80)
    print("COMMON VOICE DANISH - ASR MODEL COMPARISON")
    print("=" * 80)
    print("Dataset: mozilla-foundation/common_voice_17_0 (Danish)")
    print(f"Number of models: {len(results['models'])}")
    # All models saw the same sample set; read the count off any of them.
    sample_count = next(iter(results["models"].values()))["num_samples"]
    print(f"Samples evaluated: {sample_count}")

    # Accuracy comparison table, best (lowest WER) first.
    print("\n" + "-" * 80)
    print("ACCURACY METRICS (lower is better)")
    print("-" * 80)
    print(f"{'Model':<45} {'WER':>12} {'CER':>12}")
    print("-" * 80)
    for result in sorted(
        results["models"].values(), key=lambda r: r["accuracy"]["wer"]
    ):
        print(
            f"{result['model_name'][:45]:<45} "
            f"{result['accuracy']['wer']:>11.2%} "
            f"{result['accuracy']['cer']:>11.2%}"
        )

    # Performance comparison table, fastest (lowest RTF) first.
    print("\n" + "-" * 80)
    print("PERFORMANCE METRICS (RTF < 1.0 = faster than real-time)")
    print("-" * 80)
    print(f"{'Model':<35} {'RTF':>8} {'Throughput':>12} {'Mean Time':>12}")
    print(f"{'':35} {'':>8} {'(samples/s)':>12} {'(s/sample)':>12}")
    print("-" * 80)
    for result in sorted(
        results["models"].values(), key=lambda r: r["performance"]["real_time_factor"]
    ):
        perf = result["performance"]
        print(
            f"{result['model_name'][:35]:<35} "
            f"{perf['real_time_factor']:>8.3f} "
            f"{perf['throughput_samples_per_second']:>12.2f} "
            f"{perf['mean_time_per_sample_seconds']:>12.3f}"
        )

    print("=" * 80)


def parse_args() -> argparse.Namespace:
    """Parse command line arguments."""
    parser = argparse.ArgumentParser(
        description="Benchmark ASR models on Common Voice Danish"
    )
    parser.add_argument(
        "--output-file",
        type=Path,
        default=Path("results/common_voice_comparison.json"),
        help="Path to save comparison report (JSON)",
    )
    parser.add_argument(
        "--max-samples",
        type=int,
        default=None,
        help="Maximum samples to evaluate (for quick testing)",
    )
    parser.add_argument(
        "--warmup",
        type=int,
        default=3,
        help="Number of warmup iterations per model (default: 3)",
    )
    parser.add_argument(
        "--device",
        type=str,
        default="cuda:0",
        help="Device for inference (default: cuda:0)",
    )
    parser.add_argument(
        "--cache-dir",
        type=str,
        default=None,
        help="Directory to cache audio files",
    )
    parser.add_argument(
        "--hf-token",
        type=str,
        default=None,
        help="HuggingFace API token for authentication (or use huggingface-cli login)",
    )

    # Model selection
    parser.add_argument(
        "--skip-hviske-v3",
        action="store_true",
        help="Skip hviske-v3-conversation model",
    )
    parser.add_argument(
        "--skip-hvisketiske",
        action="store_true",
        help="Skip hvisketiske-v2 model",
    )
    parser.add_argument(
        "--hvisketiske-path",
        type=str,
        default="./outputs/hvisketiske-v2/checkpoint-23448",
        help="Path to local hvisketiske checkpoint",
    )

    return parser.parse_args()


def main() -> None:
    """Main entry point for Common Voice evaluation."""
    args = parse_args()

    # Load dataset (audio is decoded to WAV files on disk).
    samples = load_common_voice_danish(
        split="test",
        max_samples=args.max_samples,
        cache_dir=args.cache_dir,
        hf_token=args.hf_token,
    )

    # Configure models to evaluate.
    models = []

    if not args.skip_hviske_v3:
        models.append(
            HviskeV3Adapter(
                model_id="syvai/hviske-v3-conversation",
                device=args.device,
            )
        )

    if not args.skip_hvisketiske:
        models.append(
            Qwen3ASRAdapter(
                model_path=args.hvisketiske_path,
                device=args.device,
            )
        )

    if not models:
        print("Error: No models selected for evaluation")
        sys.exit(1)

    print("=" * 60)
    print("Common Voice Danish ASR Evaluation")
    print("=" * 60)
    print("Dataset: mozilla-foundation/common_voice_17_0")
    print(f"Samples: {len(samples)}")
    print(f"Device: {args.device}")
    print(f"Warmup iterations: {args.warmup}")
    print(f"Models to evaluate: {len(models)}")
    for m in models:
        print(f"  - {m.model_name} ({m.model_size_params})")
    print("=" * 60)

    # Evaluate all models sequentially on the same sample set.
    results = {"dataset": "mozilla-foundation/common_voice_17_0", "models": {}}
    for model in models:
        model_results = evaluate_model(model, samples, warmup_samples=args.warmup)
        results["models"][model.model_name] = model_results

    # Print summary
    print_summary(results)

    # Save results (ensure_ascii=False keeps Danish characters readable).
    args.output_file.parent.mkdir(parents=True, exist_ok=True)
    with open(args.output_file, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2, ensure_ascii=False)

    print(f"\nResults saved to: {args.output_file}")


if __name__ == "__main__":
    main()