"""
Lightweight statistical evaluation — no torch/transformers needed.
Computes the exact text dispersion metrics that AI detectors exploit:
  - Vocabulary richness (TTR, Hapax ratio)
  - Sentence length variance (burstiness)
  - Word frequency dispersion
  - Token repetition patterns
  - Readability scores

These are the features detectors like GPTZero (perplexity + burstiness),
Fast-DetectGPT (curvature), and Binoculars (cross-perplexity) exploit.
"""

from __future__ import annotations

import json
import math
import os
import re
import sys
from collections import Counter
from dataclasses import dataclass, field


@dataclass
class TextStats:
    """Surface-statistics record for one piece of text.

    All fields default to zero, so ``TextStats()`` stands for a text on which
    nothing has been measured. Fields are grouped below by what they describe;
    the declaration order is part of the positional constructor signature.
    """

    # --- Counts ---
    num_words: int = 0
    num_sentences: int = 0
    num_unique_words: int = 0

    # --- Vocabulary richness ---
    # Type-token ratio.
    type_token_ratio: float = 0.0
    # Share attributed to words that occur exactly once.
    hapax_ratio: float = 0.0

    # --- Word length ---
    avg_word_len: float = 0.0
    std_word_len: float = 0.0

    # --- Sentence length ---
    # Mean sentence length.
    avg_sentence_len: float = 0.0
    # Spread of sentence lengths; used as a burstiness proxy.
    std_sentence_len: float = 0.0
    # Coefficient of variation of sentence length.
    sentence_len_cv: float = 0.0

    # --- Word frequency ---
    # Mean frequency of words.
    avg_word_freq: float = 0.0
    # Dispersion of word frequencies.
    std_word_freq: float = 0.0

    # --- Readability ---
    # Flesch Reading Ease score.
    readability_flesch: float = 0.0


def tokenize_sentences(text: str) -> list[str]:
    """Split ``text`` into sentences on runs of ``.``, ``!`` or ``?``.

    Each fragment is whitespace-trimmed, and fragments with fewer than three
    whitespace-separated tokens are discarded.
    """
    kept = []
    for fragment in re.split(r'[.!?]+', text):
        candidate = fragment.strip()
        # An empty fragment splits into zero tokens, so this also drops blanks.
        if len(candidate.split()) >= 3:
            kept.append(candidate)
    return kept


def tokenize_words(text: str) -> list[str]:
    """Return the lowercase words of ``text`` with edge punctuation removed.

    The text is split on whitespace. Punctuation is stripped only from the
    two ends of each token (inner characters such as the hyphen in
    ``well-known`` stay), and tokens made up entirely of punctuation are
    dropped.
    """
    edge_punctuation = '.,;:!?()[]{}"\'-'
    tokens = []
    for raw in text.split():
        if not raw.strip(edge_punctuation):
            continue  # nothing but punctuation
        tokens.append(raw.lower().strip(edge_punctuation))
    return tokens


def compute_stats(text: str) -> TextStats:
    """Build the full ``TextStats`` profile for ``text``.

    Returns an all-zero ``TextStats`` when the text contains no words. The
    sentence-level fields and the Flesch score stay at zero when
    ``tokenize_sentences`` yields no sentences.
    """
    words = tokenize_words(text)
    sentences = tokenize_sentences(text)
    if not words:
        return TextStats()

    def mean_and_std(values):
        # Population statistics: the variance divides by N, not N - 1.
        n = max(len(values), 1)
        mean = sum(values) / n
        spread = float(math.sqrt(sum((v - mean) ** 2 for v in values) / n))
        return mean, spread

    # Vocabulary: one Counter gives both the distinct-word count and the
    # number of words seen exactly once.
    frequency = Counter(words)
    vocab_size = len(frequency)
    singletons = sum(1 for occurrences in frequency.values() if occurrences == 1)

    word_len_mean, word_len_std = mean_and_std([len(w) for w in words])
    freq_mean, freq_std = mean_and_std(list(frequency.values()))

    total_syllables = sum(count_syllables(w) for w in words)

    # Sentence-level figures are only defined when at least one sentence exists.
    sentence_len_mean = 0.0
    sentence_len_std = 0.0
    sentence_cv = 0.0
    flesch = 0.0
    words_per_sentence = [len(tokenize_words(s)) for s in sentences]
    if words_per_sentence:
        sentence_len_mean, sentence_len_std = mean_and_std(words_per_sentence)
        # Floor the mean so an all-punctuation "sentence" cannot divide by zero.
        sentence_cv = sentence_len_std / max(sentence_len_mean, 0.01)
        # Flesch Reading Ease; uses the total word count, not the per-sentence counts.
        flesch = 206.835 - 1.015 * (len(words) / len(sentences)) \
                 - 84.6 * (total_syllables / len(words))

    return TextStats(
        num_words=len(words),
        num_sentences=len(sentences),
        num_unique_words=vocab_size,
        type_token_ratio=vocab_size / max(len(words), 1),
        hapax_ratio=singletons / max(vocab_size, 1),
        avg_word_len=word_len_mean,
        std_word_len=word_len_std,
        avg_sentence_len=sentence_len_mean,
        std_sentence_len=sentence_len_std,
        sentence_len_cv=sentence_cv,
        avg_word_freq=freq_mean,
        std_word_freq=freq_std,
        readability_flesch=flesch,
    )


def count_syllables(word: str) -> int:
    """Approximate the number of syllables in ``word``.

    Heuristic, not a dictionary lookup:
      * words of three characters or fewer count as one syllable;
      * otherwise each maximal run of vowels (``aeiouy``) is one syllable;
      * a trailing ``e`` is treated as silent and removes one syllable,
        except in a consonant + ``le`` ending ("table", "simple"), where the
        ``le`` is itself a syllable.

    Args:
        word: A single token; case is ignored.

    Returns:
        The estimated syllable count, never less than 1.
    """
    word = word.lower()
    if len(word) <= 3:
        return 1
    vowels = "aeiouy"
    count = len(re.findall(r"[aeiouy]+", word))
    if word.endswith("e"):
        # len(word) > 3 here, so word[-3] always exists.
        syllabic_le = word.endswith("le") and word[-3] not in vowels
        if not syllabic_le:
            count -= 1
    return max(1, count)


def compute_dispersion_score(stats: TextStats) -> dict:
    """Score how 'human-like' a text's dispersion metrics are.

    Each metric is placed on a line running from a hard-coded "AI-typical"
    reference value (score 0) to a "human-typical" one (score 1), clamped to
    ``[0, 1]`` and rounded to three decimals. The premise encoded in the
    reference values is that AI text shows lower vocabulary richness, more
    uniform sentence lengths and lower word-frequency dispersion than human
    text.

    Returns:
        A dict with keys ``ttr``, ``hapax``, ``sent_cv``, ``word_freq_std``
        and ``overall_human_likeness`` (the mean of the four metric scores).
    """
    # (score key, TextStats attribute, AI-typical value, human-typical value).
    # The reference values are attributed to the detector literature
    # (GPTZero burstiness, Fast-DetectGPT curvature).
    references = (
        ("ttr", "type_token_ratio", 0.35, 0.55),
        ("hapax", "hapax_ratio", 0.40, 0.55),
        ("sent_cv", "sentence_len_cv", 0.40, 0.75),
        ("word_freq_std", "std_word_freq", 1.5, 3.0),
    )

    scores = {}
    for key, attr_name, ai_ref, human_ref in references:
        observed = getattr(stats, attr_name)
        # Position between the two references: 0 = AI-like, 1 = human-like.
        span = max(human_ref - ai_ref, 0.001)
        position = (observed - ai_ref) / span
        scores[key] = round(max(0.0, min(1.0, position)), 3)

    # Average is taken before the overall key is inserted, so it covers
    # exactly the four metric scores.
    overall = round(sum(scores.values()) / len(scores), 3)
    scores["overall_human_likeness"] = overall
    return scores


def evaluate_copa_results(input_path: str, output_path: str) -> None:
    """Run statistical evaluation on CoPA results and write a JSON report.

    Reads a JSON document from ``input_path`` whose ``"results"`` entry is a
    list of objects with ``"original"`` and ``"rewritten"`` text. Both texts
    of every sample are profiled with ``compute_stats`` and scored with
    ``compute_dispersion_score``; the per-sample values are averaged and the
    comparison is written to ``output_path`` and summarised on stdout.

    If ``"results"`` is missing or empty, a message is printed and no report
    is written.

    Args:
        input_path: Path of the results JSON. The optional top-level keys
            ``model`` (or ``config.model``), ``status``, ``elapsed_seconds``
            and ``tokens_per_second`` are copied into the report.
        output_path: Path of the report to write. Its parent directory is
            created when missing; a bare filename writes to the current
            directory.

    Raises:
        KeyError: If a result entry lacks ``"original"`` or ``"rewritten"``.
        OSError: If the input cannot be read or the output cannot be written.
        json.JSONDecodeError: If the input is not valid JSON.
    """
    with open(input_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    results = data.get("results", [])
    if not results:
        print("[Eval] No results to evaluate.")
        return

    print(f"[Eval] Analyzing {len(results)} samples...")

    orig_stats_list = [compute_stats(r["original"]) for r in results]
    rewritten_stats_list = [compute_stats(r["rewritten"]) for r in results]
    dispersion_orig = [compute_dispersion_score(s) for s in orig_stats_list]
    dispersion_rewritten = [compute_dispersion_score(s) for s in rewritten_stats_list]

    # Metrics that compute_stats only fills in when it found a sentence.
    sentence_level_attrs = {
        "avg_sentence_len",
        "std_sentence_len",
        "sentence_len_cv",
        "readability_flesch",
    }

    def avg_stats(stats_list, attr):
        # Skip a sample only when the metric could not be computed for it
        # (no words, or no sentences for sentence-level metrics). Filtering
        # on the value itself would also discard negative Flesch scores and
        # genuine zeros (e.g. a zero CV), which skews the mean.
        gate = "num_sentences" if attr in sentence_level_attrs else "num_words"
        vals = [getattr(s, attr) for s in stats_list if getattr(s, gate) > 0]
        return sum(vals) / max(len(vals), 1)

    def avg_disp(disp_list, key):
        vals = [d[key] for d in disp_list]
        return sum(vals) / max(len(vals), 1)

    # (report key, TextStats attribute, decimals kept in the report)
    stat_fields = (
        ("avg_words", "num_words", 1),
        ("avg_sentences", "num_sentences", 1),
        ("avg_sentence_len", "avg_sentence_len", 1),
        ("sentence_len_cv", "sentence_len_cv", 3),
        ("type_token_ratio", "type_token_ratio", 3),
        ("hapax_ratio", "hapax_ratio", 3),
        ("avg_word_len", "avg_word_len", 1),
        ("std_word_len", "std_word_len", 2),
        ("avg_word_freq", "avg_word_freq", 1),
        ("std_word_freq", "std_word_freq", 2),
        ("readability_flesch", "readability_flesch", 1),
    )
    disp_keys = ("ttr", "hapax", "sent_cv", "word_freq_std", "overall_human_likeness")

    def summarize_stats(stats_list):
        return {
            key: round(avg_stats(stats_list, attr), digits)
            for key, attr, digits in stat_fields
        }

    def summarize_disp(disp_list):
        return {key: round(avg_disp(disp_list, key), 3) for key in disp_keys}

    text_o = summarize_stats(orig_stats_list)
    text_r = summarize_stats(rewritten_stats_list)
    disp_o = summarize_disp(dispersion_orig)
    disp_r = summarize_disp(dispersion_rewritten)

    # Generate key findings
    orig_hl = disp_o["overall_human_likeness"]
    rew_hl = disp_r["overall_human_likeness"]
    delta_hl = rew_hl - orig_hl

    # NOTE: the "TTR" and "Sentence CV" lines show the normalized 0-1
    # dispersion scores, not the raw statistics.
    findings = [
        f"Human-likeness: {orig_hl:.3f} -> {rew_hl:.3f} (delta={delta_hl:+.3f})",
        f"TTR: {disp_o['ttr']:.3f} -> {disp_r['ttr']:.3f} "
        f"({'increased' if disp_r['ttr'] > disp_o['ttr'] else 'decreased'} vocabulary diversity)",
        f"Sentence CV: {disp_o['sent_cv']:.3f} -> {disp_r['sent_cv']:.3f} "
        f"({'more' if disp_r['sent_cv'] > disp_o['sent_cv'] else 'less'} bursty sentence structure)",
        f"Readability: {text_o['readability_flesch']:.0f} -> {text_r['readability_flesch']:.0f} Flesch "
        f"({'easier' if text_r['readability_flesch'] > text_o['readability_flesch'] else 'harder'} to read)",
        f"Word freq dispersion: {text_o['std_word_freq']:.2f} -> {text_r['std_word_freq']:.2f} "
        f"({'higher' if text_r['std_word_freq'] > text_o['std_word_freq'] else 'lower'} token dispersion)",
    ]

    # Detector evasion potential (heuristic thresholds on the score delta)
    if delta_hl > 0.15:
        evasion_potential = "HIGH"
    elif delta_hl > 0.05:
        evasion_potential = "MEDIUM"
    else:
        evasion_potential = "LOW"

    report = {
        "eval_type": "statistical_analysis",
        "model": data.get("model", data.get("config", {}).get("model", "unknown")),
        "num_samples": len(results),
        "status": data.get("status", "unknown"),
        "elapsed_seconds": data.get("elapsed_seconds", 0),
        "tokens_per_second": data.get("tokens_per_second", 0),
        "text_statistics": {"original": text_o, "rewritten": text_r},
        "dispersion_analysis": {"original": disp_o, "rewritten": disp_r},
        "key_findings": findings,
        "evasion_potential": {
            "rating": evasion_potential,
            "human_likeness_delta": round(delta_hl, 3),
            "note": "Statistical heuristic only. Real detector evaluation (Fast-DetectGPT, Binoculars, Pangram) requires Modal GPU — see next phase.",
        },
    }

    # os.makedirs("") raises FileNotFoundError, so only create a directory
    # when the output path actually has a directory component.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(report, f, indent=2, ensure_ascii=False)

    print(f"[Eval] Report saved to {output_path}")
    print(f"[Eval] Human-likeness: {orig_hl:.3f} -> {rew_hl:.3f} (delta={delta_hl:+.3f})")
    print(f"[Eval] Evasion potential: {evasion_potential}")
    for f_ in findings:
        print(f"  - {f_}")


if __name__ == "__main__":
    # CLI usage: [input_json] [output_json]; each missing argument falls back
    # to its default path under output/.
    cli_args = sys.argv[1:]
    input_file = cli_args[0] if cli_args else "output/copa_modal_results.json"
    output_file = cli_args[1] if len(cli_args) > 1 else "output/eval_statistical_report.json"
    evaluate_copa_results(input_file, output_file)