"""
Stylometry analysis service.
Lightweight CPU-only feature extraction: character n-grams, function-word
frequencies, punctuation patterns, sentence-length variation, and readability
heuristics.
No external models required.
"""
import re
import math
from collections import Counter
from typing import Dict

# Lowercase vocabulary treated as "function words" by _function_word_freq.
# NOTE: besides true function words, this list also holds some very common
# content words (e.g. "people", "year", "good").
FUNCTION_WORDS = set(
    (
        "the be to of and a in that have i "
        "it for not on with he as you do at "
        "this but his by from they we say her she "
        "or an will my one all would there their "
        "what so up out if about who get which go "
        "me when make can like time no just him "
        "know take people into year your good some"
    ).split()
)


def _char_ngrams(text: str, n: int = 3) -> Dict[str, int]:
    """Extract character n-gram frequency distribution."""
    ngrams = Counter()
    text_lower = text.lower()
    for i in range(len(text_lower) - n + 1):
        ngrams[text_lower[i:i + n]] += 1
    return dict(ngrams)


def _function_word_freq(text: str) -> float:
    """Return the fraction of tokens that appear in ``FUNCTION_WORDS``.

    The text is lowercased and split on whitespace. Each token has leading
    and trailing ``.,!?;:"'`` characters stripped before the membership test,
    but the denominator is the raw token count. Returns 0.0 when the text
    contains no tokens.
    """
    tokens = text.lower().split()
    if not tokens:
        return 0.0
    matches = 0
    for token in tokens:
        if token.strip(".,!?;:\"'") in FUNCTION_WORDS:
            matches += 1
    return matches / len(tokens)


def _punctuation_pattern(text: str) -> Dict[str, float]:
    """Extract punctuation density and diversity metrics."""
    if not text:
        return {"density": 0.0, "diversity": 0.0}
    puncts = re.findall(r"[^\w\s]", text)
    density = len(puncts) / len(text)
    diversity = len(set(puncts)) / max(len(puncts), 1)
    return {"density": round(density, 4), "diversity": round(diversity, 4)}


def _readability_score(text: str) -> float:
    """Simple Automated Readability Index approximation."""
    sentences = max(len(re.split(r"[.!?]+", text)), 1)
    words = text.split()
    word_count = max(len(words), 1)
    char_count = sum(len(w) for w in words)
    ari = 4.71 * (char_count / word_count) + 0.5 * (word_count / sentences) - 21.43
    return max(0, min(20, ari))


def _sentence_length_variance(text: str) -> float:
    """Compute variance in sentence lengths (words per sentence)."""
    sentences = re.split(r"[.!?]+", text)
    lengths = [len(s.split()) for s in sentences if s.strip()]
    if len(lengths) < 2:
        return 0.0
    mean = sum(lengths) / len(lengths)
    variance = sum((l - mean) ** 2 for l in lengths) / len(lengths)
    return round(math.sqrt(variance), 4)


def compute_stylometry_score(text: str) -> float:
    """
    Compute a stylometry anomaly score (0-1).

    Higher scores indicate more anomalous writing patterns
    (potentially AI-generated or coordinated).

    Five heuristic features are each mapped to [0, 1] and combined as a
    weighted average: function-word ratio (0.25), punctuation density (0.15),
    sentence-length variation (0.25), readability (0.15) and normalised
    character-trigram entropy (0.20). The baselines used below are fixed
    constants of this heuristic, not values learned from data.

    Args:
        text: The text to score.

    Returns:
        The weighted score rounded to 4 decimals. ``0.0`` is returned for
        empty input, or input shorter than 20 characters once surrounding
        whitespace is ignored.
    """
    # Measure the guard on stripped text so that blank or whitespace-padded
    # input cannot pass the minimum-length check and earn a non-zero score.
    if not text or len(text.strip()) < 20:
        return 0.0

    # Feature 1: relative distance of the function-word ratio from the
    # assumed typical human ratio of 0.47.
    fw_ratio = _function_word_freq(text)
    fw_feature = min(1.0, abs(fw_ratio - 0.47) / 0.47)

    # Feature 2: relative distance of punctuation density from 0.06; text
    # with no punctuation at all gets a fixed mid-level value instead.
    density = _punctuation_pattern(text)["density"]
    punct_anomaly = abs(density - 0.06) / 0.06 if density > 0 else 0.5
    punct_feature = min(1.0, punct_anomaly)

    # Feature 3: low sentence-length spread is treated as anomalous. The
    # feature falls linearly from 1.0 (no spread) to 0.0 at a spread of 10
    # words or more.
    slv = _sentence_length_variance(text)
    slv_feature = max(0.0, 1.0 - slv / 10.0)

    # Feature 4: readability near the middle of the clamped 0-20 range is
    # treated as most anomalous, tapering to 0.0 at either extreme.
    ari = _readability_score(text)
    ari_feature = max(0.0, 1.0 - abs(ari - 10) / 10)

    # Feature 5: Shannon entropy of the character-trigram distribution,
    # normalised by the maximum possible for the number of distinct trigrams.
    # Only the part above 0.5 contributes, rescaled to [0, 1].
    ngrams = _char_ngrams(text, 3)
    entropy_feature = 0.0
    if ngrams:
        total = sum(ngrams.values())
        probs = [count / total for count in ngrams.values()]
        entropy = -sum(p * math.log2(p) for p in probs if p > 0)
        max_entropy = math.log2(max(len(ngrams), 1))
        # A single distinct trigram gives max_entropy == 0; treat as 0.
        norm_entropy = entropy / max_entropy if max_entropy > 0 else 0.0
        # Clamp the top end as well: float rounding can push the ratio
        # marginally above 1.0.
        entropy_feature = min(1.0, max(0.0, norm_entropy - 0.5) * 2)

    # Weighted average; weights sum to 1.0. Pairing each weight with its
    # feature explicitly keeps the two from drifting out of step.
    weighted_features = (
        (0.25, fw_feature),
        (0.15, punct_feature),
        (0.25, slv_feature),
        (0.15, ari_feature),
        (0.20, entropy_feature),
    )
    score = sum(weight * value for weight, value in weighted_features)
    return round(min(1.0, max(0.0, score)), 4)


def extract_features(text: str) -> Dict:
    """Collect every stylometry feature for ``text`` into one dict.

    Keys, in order: "function_word_ratio", "punctuation" (itself a dict with
    "density" and "diversity"), "readability_ari", "sentence_length_variance"
    and "stylometry_score".
    """
    features: Dict = {}
    features["function_word_ratio"] = _function_word_freq(text)
    features["punctuation"] = _punctuation_pattern(text)
    features["readability_ari"] = _readability_score(text)
    features["sentence_length_variance"] = _sentence_length_variance(text)
    features["stylometry_score"] = compute_stylometry_score(text)
    return features