""" Lightweight statistical evaluation — no torch/transformers needed. Computes the exact text dispersion metrics that AI detectors exploit: - Vocabulary richness (TTR, Hapax ratio) - Sentence length variance (burstiness) - Word frequency dispersion - Token repetition patterns - Readability scores These are the features detectors like GPTZero (perplexity + burstiness), Fast-DetectGPT (curvature), and Binoculars (cross-perplexity) exploit. """ from __future__ import annotations import json import math import os import re import sys from collections import Counter from dataclasses import dataclass, field @dataclass class TextStats: """Statistical profile of a single text.""" num_words: int = 0 num_sentences: int = 0 num_unique_words: int = 0 type_token_ratio: float = 0.0 # vocabulary richness hapax_ratio: float = 0.0 # words appearing exactly once avg_word_len: float = 0.0 std_word_len: float = 0.0 avg_sentence_len: float = 0.0 # mean sentence length std_sentence_len: float = 0.0 # burstiness proxy sentence_len_cv: float = 0.0 # coefficient of variation avg_word_freq: float = 0.0 # mean frequency of words std_word_freq: float = 0.0 # dispersion of word frequencies readability_flesch: float = 0.0 # Flesch Reading Ease def tokenize_sentences(text: str) -> list[str]: """Simple sentence tokenizer.""" return [s.strip() for s in re.split(r'[.!?]+', text) if s.strip() and len(s.strip().split()) >= 3] def tokenize_words(text: str) -> list[str]: """Simple word tokenizer — lowercase, strip punctuation.""" return [w.lower().strip('.,;:!?()[]{}"\'-') for w in text.split() if w.strip('.,;:!?()[]{}"\'-')] def compute_stats(text: str) -> TextStats: """Compute all statistical metrics for a text.""" stats = TextStats() words = tokenize_words(text) sentences = tokenize_sentences(text) if not words: return stats # Word-level stats stats.num_words = len(words) stats.num_unique_words = len(set(words)) stats.type_token_ratio = stats.num_unique_words / max(stats.num_words, 1) word_counts = Counter(words) hapax = sum(1 for c in word_counts.values() if c == 1) stats.hapax_ratio = hapax / max(stats.num_unique_words, 1) word_lens = [len(w) for w in words] stats.avg_word_len = sum(word_lens) / max(len(word_lens), 1) stats.std_word_len = float( math.sqrt(sum((l - stats.avg_word_len) ** 2 for l in word_lens) / max(len(word_lens), 1)) ) # Word frequency dispersion freqs = list(word_counts.values()) stats.avg_word_freq = sum(freqs) / max(len(freqs), 1) stats.std_word_freq = float( math.sqrt(sum((f - stats.avg_word_freq) ** 2 for f in freqs) / max(len(freqs), 1)) ) # Sentence-level stats (burstiness) stats.num_sentences = len(sentences) sent_lens = [len(tokenize_words(s)) for s in sentences] if sent_lens: stats.avg_sentence_len = sum(sent_lens) / len(sent_lens) variance = sum((l - stats.avg_sentence_len) ** 2 for l in sent_lens) / len(sent_lens) stats.std_sentence_len = float(math.sqrt(variance)) stats.sentence_len_cv = stats.std_sentence_len / max(stats.avg_sentence_len, 0.01) # Flesch Reading Ease total_syllables = sum(count_syllables(w) for w in words) if stats.num_sentences > 0 and stats.num_words > 0: stats.readability_flesch = 206.835 - 1.015 * (stats.num_words / stats.num_sentences) \ - 84.6 * (total_syllables / stats.num_words) return stats def count_syllables(word: str) -> int: """Approximate syllable count.""" word = word.lower() if len(word) <= 3: return 1 vowels = "aeiouy" count = 0 prev_vowel = False for ch in word: is_vowel = ch in vowels if is_vowel and not prev_vowel: count += 1 prev_vowel = is_vowel if word.endswith("e"): count = max(1, count - 1) return max(1, count) def compute_dispersion_score(stats: TextStats) -> dict: """Compute a 'human-likeness' score based on dispersion metrics. AI text tends to have: - Lower TTR (more repetitive vocabulary) - Lower sentence length variance (less bursty) - Lower word frequency dispersion (tokens cluster in high-prob zones) - Higher readability (simpler, more uniform structure) Human text has HIGHER dispersion across all these dimensions. """ # Reference values for "AI-like" vs "Human-like" text # Based on literature (GPTZero burstiness, Fast-DetectGPT curvature) ai_typical = { "ttr": 0.35, # AI: ~0.30-0.40 TTR "hapax": 0.40, # AI: fewer rare words "sent_cv": 0.40, # AI: uniform sentence length "word_freq_std": 1.5, # AI: low dispersion (tokens cluster) } human_typical = { "ttr": 0.55, # Human: ~0.50-0.65 TTR "hapax": 0.55, # Human: more rare words "sent_cv": 0.75, # Human: varied sentence length "word_freq_std": 3.0, # Human: high dispersion (varied choices) } attr_map = { "ttr": "type_token_ratio", "hapax": "hapax_ratio", "sent_cv": "sentence_len_cv", "word_freq_std": "std_word_freq", } scores = {} for metric, ai_val in ai_typical.items(): human_val = human_typical[metric] actual = getattr(stats, attr_map[metric]) # Normalize: 0 = AI-like, 1 = Human-like normalized = (actual - ai_val) / max(human_val - ai_val, 0.001) normalized = max(0.0, min(1.0, normalized)) scores[metric] = round(normalized, 3) scores["overall_human_likeness"] = round(sum(scores.values()) / len(scores), 3) return scores def evaluate_copa_results(input_path: str, output_path: str) -> None: """Run statistical evaluation on CoPA results.""" with open(input_path, "r", encoding="utf-8") as f: data = json.load(f) results = data.get("results", []) if not results: print("[Eval] No results to evaluate.") return print(f"[Eval] Analyzing {len(results)} samples...") orig_stats_list = [] rewritten_stats_list = [] dispersion_orig = [] dispersion_rewritten = [] for i, r in enumerate(results): orig_text = r["original"] rewritten_text = r["rewritten"] orig_s = compute_stats(orig_text) rewritten_s = compute_stats(rewritten_text) orig_stats_list.append(orig_s) rewritten_stats_list.append(rewritten_s) disp_orig = compute_dispersion_score(orig_s) disp_rewritten = compute_dispersion_score(rewritten_s) dispersion_orig.append(disp_orig) dispersion_rewritten.append(disp_rewritten) # Aggregate def avg_stats(stats_list, attr): vals = [getattr(s, attr) for s in stats_list if getattr(s, attr) > 0] return sum(vals) / max(len(vals), 1) def avg_disp(disp_list, key): vals = [d[key] for d in disp_list] return sum(vals) / max(len(vals), 1) report = { "eval_type": "statistical_analysis", "model": data.get("model", data.get("config", {}).get("model", "unknown")), "num_samples": len(results), "status": data.get("status", "unknown"), "elapsed_seconds": data.get("elapsed_seconds", 0), "tokens_per_second": data.get("tokens_per_second", 0), "text_statistics": { "original": { "avg_words": round(avg_stats(orig_stats_list, "num_words"), 1), "avg_sentences": round(avg_stats(orig_stats_list, "num_sentences"), 1), "avg_sentence_len": round(avg_stats(orig_stats_list, "avg_sentence_len"), 1), "sentence_len_cv": round(avg_stats(orig_stats_list, "sentence_len_cv"), 3), "type_token_ratio": round(avg_stats(orig_stats_list, "type_token_ratio"), 3), "hapax_ratio": round(avg_stats(orig_stats_list, "hapax_ratio"), 3), "avg_word_len": round(avg_stats(orig_stats_list, "avg_word_len"), 1), "std_word_len": round(avg_stats(orig_stats_list, "std_word_len"), 2), "avg_word_freq": round(avg_stats(orig_stats_list, "avg_word_freq"), 1), "std_word_freq": round(avg_stats(orig_stats_list, "std_word_freq"), 2), "readability_flesch": round(avg_stats(orig_stats_list, "readability_flesch"), 1), }, "rewritten": { "avg_words": round(avg_stats(rewritten_stats_list, "num_words"), 1), "avg_sentences": round(avg_stats(rewritten_stats_list, "num_sentences"), 1), "avg_sentence_len": round(avg_stats(rewritten_stats_list, "avg_sentence_len"), 1), "sentence_len_cv": round(avg_stats(rewritten_stats_list, "sentence_len_cv"), 3), "type_token_ratio": round(avg_stats(rewritten_stats_list, "type_token_ratio"), 3), "hapax_ratio": round(avg_stats(rewritten_stats_list, "hapax_ratio"), 3), "avg_word_len": round(avg_stats(rewritten_stats_list, "avg_word_len"), 1), "std_word_len": round(avg_stats(rewritten_stats_list, "std_word_len"), 2), "avg_word_freq": round(avg_stats(rewritten_stats_list, "avg_word_freq"), 1), "std_word_freq": round(avg_stats(rewritten_stats_list, "std_word_freq"), 2), "readability_flesch": round(avg_stats(rewritten_stats_list, "readability_flesch"), 1), }, }, "dispersion_analysis": { "original": { "ttr": round(avg_disp(dispersion_orig, "ttr"), 3), "hapax": round(avg_disp(dispersion_orig, "hapax"), 3), "sent_cv": round(avg_disp(dispersion_orig, "sent_cv"), 3), "word_freq_std": round(avg_disp(dispersion_orig, "word_freq_std"), 3), "overall_human_likeness": round(avg_disp(dispersion_orig, "overall_human_likeness"), 3), }, "rewritten": { "ttr": round(avg_disp(dispersion_rewritten, "ttr"), 3), "hapax": round(avg_disp(dispersion_rewritten, "hapax"), 3), "sent_cv": round(avg_disp(dispersion_rewritten, "sent_cv"), 3), "word_freq_std": round(avg_disp(dispersion_rewritten, "word_freq_std"), 3), "overall_human_likeness": round(avg_disp(dispersion_rewritten, "overall_human_likeness"), 3), }, }, "key_findings": [], } # Generate key findings orig_hl = report["dispersion_analysis"]["original"]["overall_human_likeness"] rew_hl = report["dispersion_analysis"]["rewritten"]["overall_human_likeness"] delta_hl = rew_hl - orig_hl findings = [ f"Human-likeness: {orig_hl:.3f} -> {rew_hl:.3f} (delta={delta_hl:+.3f})", f"TTR: {report['dispersion_analysis']['original']['ttr']:.3f} -> {report['dispersion_analysis']['rewritten']['ttr']:.3f} " f"({'increased' if report['dispersion_analysis']['rewritten']['ttr'] > report['dispersion_analysis']['original']['ttr'] else 'decreased'} vocabulary diversity)", f"Sentence CV: {report['dispersion_analysis']['original']['sent_cv']:.3f} -> {report['dispersion_analysis']['rewritten']['sent_cv']:.3f} " f"({'more' if report['dispersion_analysis']['rewritten']['sent_cv'] > report['dispersion_analysis']['original']['sent_cv'] else 'less'} bursty sentence structure)", f"Readability: {report['text_statistics']['original']['readability_flesch']:.0f} -> {report['text_statistics']['rewritten']['readability_flesch']:.0f} Flesch " f"({'easier' if report['text_statistics']['rewritten']['readability_flesch'] > report['text_statistics']['original']['readability_flesch'] else 'harder'} to read)", f"Word freq dispersion: {report['text_statistics']['original']['std_word_freq']:.2f} -> {report['text_statistics']['rewritten']['std_word_freq']:.2f} " f"({'higher' if report['text_statistics']['rewritten']['std_word_freq'] > report['text_statistics']['original']['std_word_freq'] else 'lower'} token dispersion)", ] report["key_findings"] = findings # Detector evasion potential (heuristic) evasion_potential = "LOW" if delta_hl > 0.15: evasion_potential = "HIGH" elif delta_hl > 0.05: evasion_potential = "MEDIUM" report["evasion_potential"] = { "rating": evasion_potential, "human_likeness_delta": round(delta_hl, 3), "note": "Statistical heuristic only. Real detector evaluation (Fast-DetectGPT, Binoculars, Pangram) requires Modal GPU — see next phase.", } os.makedirs(os.path.dirname(output_path), exist_ok=True) with open(output_path, "w", encoding="utf-8") as f: json.dump(report, f, indent=2, ensure_ascii=False) print(f"[Eval] Report saved to {output_path}") print(f"[Eval] Human-likeness: {orig_hl:.3f} -> {rew_hl:.3f} (delta={delta_hl:+.3f})") print(f"[Eval] Evasion potential: {evasion_potential}") for f_ in findings: print(f" - {f_}") if __name__ == "__main__": input_file = sys.argv[1] if len(sys.argv) > 1 else "output/copa_modal_results.json" output_file = sys.argv[2] if len(sys.argv) > 2 else "output/eval_statistical_report.json" evaluate_copa_results(input_file, output_file)