| """ |
| Lightweight statistical evaluation — no torch/transformers needed. |
| Computes the exact text dispersion metrics that AI detectors exploit: |
| - Vocabulary richness (TTR, Hapax ratio) |
| - Sentence length variance (burstiness) |
| - Word frequency dispersion |
| - Token repetition patterns |
| - Readability scores |
| |
| These are the features detectors like GPTZero (perplexity + burstiness), |
| Fast-DetectGPT (curvature), and Binoculars (cross-perplexity) exploit. |
| """ |
|
|
| from __future__ import annotations |
|
|
| import json |
| import math |
| import os |
| import re |
| import sys |
| from collections import Counter |
| from dataclasses import dataclass, field |
|
|
|
|
@dataclass
class TextStats:
    """Surface-level statistics describing one piece of text.

    Every field defaults to zero, so a freshly constructed instance stands
    for "nothing measured" (for example, a text without any words).
    """

    # --- Size ---
    num_words: int = 0             # total word tokens
    num_sentences: int = 0         # sentences kept by the sentence tokenizer
    num_unique_words: int = 0      # distinct word tokens

    # --- Vocabulary richness ---
    type_token_ratio: float = 0.0  # num_unique_words / num_words
    hapax_ratio: float = 0.0       # words seen exactly once / num_unique_words

    # --- Word length ---
    avg_word_len: float = 0.0      # mean characters per word
    std_word_len: float = 0.0      # population std-dev of word length

    # --- Sentence length (in words) ---
    avg_sentence_len: float = 0.0  # mean words per sentence
    std_sentence_len: float = 0.0  # population std-dev of sentence length
    sentence_len_cv: float = 0.0   # std_sentence_len / avg_sentence_len

    # --- Word frequency ---
    avg_word_freq: float = 0.0     # mean occurrences per distinct word
    std_word_freq: float = 0.0     # population std-dev of those occurrences

    # --- Readability ---
    readability_flesch: float = 0.0  # Flesch reading-ease score
|
|
|
|
def tokenize_sentences(text: str) -> list[str]:
    """Split *text* into sentences on runs of ``.``, ``!`` or ``?``.

    Each fragment is whitespace-trimmed. Fragments with fewer than three
    whitespace-separated words are discarded, so very short sentences never
    appear in the result.
    """
    kept = []
    for fragment in re.split(r'[.!?]+', text):
        candidate = fragment.strip()
        # A blank candidate has zero words, so this single length check also
        # filters out empty fragments.
        if len(candidate.split()) >= 3:
            kept.append(candidate)
    return kept
|
|
|
|
def tokenize_words(text: str) -> list[str]:
    """Split *text* on whitespace into lowercase words.

    Punctuation is removed from both ends of every token (inner characters
    such as the apostrophe in ``don't`` are kept). Tokens that consist of
    punctuation only are dropped.
    """
    edge_punctuation = '.,;:!?()[]{}"\'-'
    words = []
    for raw in text.split():
        cleaned = raw.lower().strip(edge_punctuation)
        if cleaned:
            words.append(cleaned)
    return words
|
|
|
|
def compute_stats(text: str) -> TextStats:
    """Build the full statistical profile of *text*.

    Returns:
        A ``TextStats`` instance. If the word tokenizer finds no words, all
        fields keep their zero defaults. The sentence-level fields and the
        Flesch score stay at zero when no sentence survives tokenization.
    """

    def mean_and_std(values):
        """Return (mean, population standard deviation) of a non-empty list."""
        size = len(values)
        mean = sum(values) / size
        spread = math.sqrt(sum((v - mean) ** 2 for v in values) / size)
        return mean, spread

    profile = TextStats()
    tokens = tokenize_words(text)
    if not tokens:
        return profile
    sentence_list = tokenize_sentences(text)

    # Vocabulary richness.
    frequency = Counter(tokens)
    profile.num_words = len(tokens)
    profile.num_unique_words = len(frequency)
    profile.type_token_ratio = profile.num_unique_words / profile.num_words
    singletons = sum(1 for occurrences in frequency.values() if occurrences == 1)
    profile.hapax_ratio = singletons / profile.num_unique_words

    # Word-length distribution.
    profile.avg_word_len, profile.std_word_len = mean_and_std(
        [len(token) for token in tokens]
    )

    # Word-frequency dispersion (one data point per distinct word).
    profile.avg_word_freq, profile.std_word_freq = mean_and_std(
        list(frequency.values())
    )

    # Sentence-length distribution, measured in words per sentence.
    profile.num_sentences = len(sentence_list)
    if sentence_list:
        lengths = [len(tokenize_words(sentence)) for sentence in sentence_list]
        profile.avg_sentence_len, profile.std_sentence_len = mean_and_std(lengths)
        # The 0.01 floor keeps the ratio finite for a near-zero mean.
        profile.sentence_len_cv = profile.std_sentence_len / max(
            profile.avg_sentence_len, 0.01
        )

    # Flesch reading ease.
    # NOTE(review): the word count covers the whole text, while the sentence
    # count only includes sentences of three or more words, so texts with
    # many short sentences get an inflated words-per-sentence term.
    if profile.num_sentences > 0:
        syllable_total = sum(count_syllables(token) for token in tokens)
        words_per_sentence = profile.num_words / profile.num_sentences
        syllables_per_word = syllable_total / profile.num_words
        profile.readability_flesch = (
            206.835 - 1.015 * words_per_sentence - 84.6 * syllables_per_word
        )

    return profile
|
|
|
|
def count_syllables(word: str) -> int:
    """Estimate the number of syllables in an English *word*.

    Heuristic: every run of consecutive vowels (``a e i o u y``) counts as
    one syllable, and a silent trailing "e" is discounted. Words of three
    characters or fewer always count as one syllable.

    The silent-e discount is applied only when the final "e" stands alone
    after a consonant ("make", "whale"). It is skipped when:

    * the "e" closes a vowel group ("agree", "value") -- that group was
      counted once already, and subtracting would erase a real syllable;
    * the word ends in consonant + "le" ("table", "simple") -- that ending
      is pronounced as its own syllable.

    Returns:
        The estimated syllable count, never less than 1.
    """
    word = word.lower()
    if len(word) <= 3:
        return 1

    vowels = "aeiouy"
    count = 0
    prev_vowel = False
    for ch in word:
        is_vowel = ch in vowels
        if is_vowel and not prev_vowel:
            count += 1
        prev_vowel = is_vowel

    # len(word) >= 4 here, so word[-2] and word[-3] are always valid.
    if word.endswith("e") and word[-2] not in vowels:
        syllabic_le = word[-2] == "l" and word[-3] not in vowels
        if not syllabic_le:
            count -= 1

    return max(1, count)
|
|
|
|
def compute_dispersion_score(stats: TextStats) -> dict:
    """Score how "human-like" a text's dispersion metrics are.

    Four metrics are each placed on a 0..1 scale between a hard-coded
    "AI-typical" reference value (maps to 0) and a "human-typical" one
    (maps to 1); values outside that range are clamped. The working
    assumption behind the reference values is that human text shows higher
    dispersion than AI text on every one of these metrics.

    Returns:
        A dict with the rounded per-metric scores under ``ttr``, ``hapax``,
        ``sent_cv`` and ``word_freq_std``, plus their unweighted mean under
        ``overall_human_likeness``.
    """
    # (score key, TextStats attribute, AI-typical value, human-typical value)
    baselines = (
        ("ttr", "type_token_ratio", 0.35, 0.55),
        ("hapax", "hapax_ratio", 0.40, 0.55),
        ("sent_cv", "sentence_len_cv", 0.40, 0.75),
        ("word_freq_std", "std_word_freq", 1.5, 3.0),
    )

    scores = {}
    for key, attr, ai_ref, human_ref in baselines:
        # Guard against a degenerate (zero-width) reference interval.
        span = max(human_ref - ai_ref, 0.001)
        position = (getattr(stats, attr) - ai_ref) / span
        position = max(0.0, min(1.0, position))
        scores[key] = round(position, 3)

    # Averaged before the overall key is inserted, so only the four metric
    # scores contribute.
    mean_score = sum(scores.values()) / len(scores)
    scores["overall_human_likeness"] = round(mean_score, 3)
    return scores
|
|
|
|
# (report key, TextStats attribute, decimals kept in the report)
_STAT_REPORT_FIELDS = (
    ("avg_words", "num_words", 1),
    ("avg_sentences", "num_sentences", 1),
    ("avg_sentence_len", "avg_sentence_len", 1),
    ("sentence_len_cv", "sentence_len_cv", 3),
    ("type_token_ratio", "type_token_ratio", 3),
    ("hapax_ratio", "hapax_ratio", 3),
    ("avg_word_len", "avg_word_len", 1),
    ("std_word_len", "std_word_len", 2),
    ("avg_word_freq", "avg_word_freq", 1),
    ("std_word_freq", "std_word_freq", 2),
    ("readability_flesch", "readability_flesch", 1),
)

_DISPERSION_KEYS = ("ttr", "hapax", "sent_cv", "word_freq_std", "overall_human_likeness")


def _average_stat(stats_list: list, attr: str) -> float:
    """Average *attr* over the profiles in which it was actually computed.

    A value of exactly 0 is the ``TextStats`` default and is treated as
    "not computed", so it is skipped. Negative values are kept: the Flesch
    formula legitimately drops below zero for very dense text, and excluding
    those samples would bias the reported mean upwards.
    """
    values = [getattr(profile, attr) for profile in stats_list]
    values = [value for value in values if value != 0]
    return sum(values) / max(len(values), 1)


def _summarise_stats(stats_list: list) -> dict:
    """Return the rounded per-field averages for one side of the report."""
    return {
        key: round(_average_stat(stats_list, attr), digits)
        for key, attr, digits in _STAT_REPORT_FIELDS
    }


def _summarise_dispersion(score_dicts: list) -> dict:
    """Return the rounded mean of every dispersion score across samples."""
    count = max(len(score_dicts), 1)
    return {
        key: round(sum(scores[key] for scores in score_dicts) / count, 3)
        for key in _DISPERSION_KEYS
    }


def evaluate_copa_results(input_path: str, output_path: str) -> None:
    """Run the statistical evaluation on a CoPA results file.

    Reads the JSON file at *input_path*, profiles the ``original`` and
    ``rewritten`` text of every entry in its ``results`` list, and writes an
    aggregated JSON report to *output_path*. A short summary is printed to
    stdout. If the input has no results, a message is printed and no report
    is written.

    Args:
        input_path: JSON file holding a ``results`` list of objects with
            ``original`` and ``rewritten`` text fields.
        output_path: Destination of the JSON report. Missing parent
            directories are created; a bare file name (no directory part)
            is written to the current working directory.

    Raises:
        OSError: If the input cannot be read or the output cannot be written.
        json.JSONDecodeError: If the input is not valid JSON.
        KeyError: If a result entry lacks ``original`` or ``rewritten``.
    """
    with open(input_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    results = data.get("results", [])
    if not results:
        print("[Eval] No results to evaluate.")
        return

    print(f"[Eval] Analyzing {len(results)} samples...")

    sides = ("original", "rewritten")
    stats_by_side = {side: [] for side in sides}
    dispersion_by_side = {side: [] for side in sides}
    for sample in results:
        for side in sides:
            profile = compute_stats(sample[side])
            stats_by_side[side].append(profile)
            dispersion_by_side[side].append(compute_dispersion_score(profile))

    report = {
        "eval_type": "statistical_analysis",
        "model": data.get("model", data.get("config", {}).get("model", "unknown")),
        "num_samples": len(results),
        "status": data.get("status", "unknown"),
        "elapsed_seconds": data.get("elapsed_seconds", 0),
        "tokens_per_second": data.get("tokens_per_second", 0),
        "text_statistics": {
            side: _summarise_stats(stats_by_side[side]) for side in sides
        },
        "dispersion_analysis": {
            side: _summarise_dispersion(dispersion_by_side[side]) for side in sides
        },
        "key_findings": [],
    }

    disp_orig = report["dispersion_analysis"]["original"]
    disp_rew = report["dispersion_analysis"]["rewritten"]
    text_orig = report["text_statistics"]["original"]
    text_rew = report["text_statistics"]["rewritten"]

    orig_hl = disp_orig["overall_human_likeness"]
    rew_hl = disp_rew["overall_human_likeness"]
    delta_hl = rew_hl - orig_hl

    # The TTR and sentence-CV lines report the normalised 0..1 dispersion
    # scores; the readability and word-frequency lines report raw averages.
    findings = [
        f"Human-likeness: {orig_hl:.3f} -> {rew_hl:.3f} (delta={delta_hl:+.3f})",
        f"TTR: {disp_orig['ttr']:.3f} -> {disp_rew['ttr']:.3f} "
        f"({'increased' if disp_rew['ttr'] > disp_orig['ttr'] else 'decreased'} vocabulary diversity)",
        f"Sentence CV: {disp_orig['sent_cv']:.3f} -> {disp_rew['sent_cv']:.3f} "
        f"({'more' if disp_rew['sent_cv'] > disp_orig['sent_cv'] else 'less'} bursty sentence structure)",
        f"Readability: {text_orig['readability_flesch']:.0f} -> {text_rew['readability_flesch']:.0f} Flesch "
        f"({'easier' if text_rew['readability_flesch'] > text_orig['readability_flesch'] else 'harder'} to read)",
        f"Word freq dispersion: {text_orig['std_word_freq']:.2f} -> {text_rew['std_word_freq']:.2f} "
        f"({'higher' if text_rew['std_word_freq'] > text_orig['std_word_freq'] else 'lower'} token dispersion)",
    ]
    report["key_findings"] = findings

    # Rate the rewrite by how far it moved the overall human-likeness score.
    if delta_hl > 0.15:
        evasion_potential = "HIGH"
    elif delta_hl > 0.05:
        evasion_potential = "MEDIUM"
    else:
        evasion_potential = "LOW"

    report["evasion_potential"] = {
        "rating": evasion_potential,
        "human_likeness_delta": round(delta_hl, 3),
        "note": "Statistical heuristic only. Real detector evaluation (Fast-DetectGPT, Binoculars, Pangram) requires Modal GPU — see next phase.",
    }

    # os.makedirs("") raises FileNotFoundError, so only create the parent
    # directory when the output path actually has one.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(report, f, indent=2, ensure_ascii=False)

    print(f"[Eval] Report saved to {output_path}")
    print(f"[Eval] Human-likeness: {orig_hl:.3f} -> {rew_hl:.3f} (delta={delta_hl:+.3f})")
    print(f"[Eval] Evasion potential: {evasion_potential}")
    for finding in findings:
        print(f" - {finding}")
|
|
|
|
if __name__ == "__main__":
    # Usage: script [input_json] [output_json] -- both arguments are optional
    # and fall back to the default locations under output/.
    cli_args = sys.argv[1:]
    input_file = cli_args[0] if cli_args else "output/copa_modal_results.json"
    output_file = cli_args[1] if len(cli_args) > 1 else "output/eval_statistical_report.json"
    evaluate_copa_results(input_file, output_file)
|
|