"""Data quality analysis: low confidence, mixed language, duplicate detection."""

from __future__ import annotations

from collections import Counter
from typing import List

from app.models.schemas import AnalyzedEntry, DataQualityReport


def analyze_data_quality(entries: list[AnalyzedEntry]) -> DataQualityReport:
    """Build a data quality report from analyzed entries.

    The report covers four things:

    * low-confidence predictions: entries whose ``sentiment.confidence`` is
      below 0.5;
    * mixed language: entries whose ``language.language`` differs from the
      most common *known* language (``"unknown"`` never counts as the
      majority and is never flagged);
    * duplicates: entries whose normalized text (stripped, lower-cased,
      first 200 characters) matches an earlier entry; the first occurrence
      is kept and only later ones are reported;
    * the mean confidence and the per-language entry counts.

    Args:
        entries: Analyzed entries to inspect. May be empty.

    Returns:
        A ``DataQualityReport``. The ``*_count`` fields cover every matching
        entry, while the ``*_entries`` id lists are capped at 50 ids each.
        For empty input every count is zero and every collection is empty.
    """
    if not entries:
        return DataQualityReport(
            total_entries=0,
            low_confidence_count=0,
            low_confidence_entries=[],
            mixed_language_count=0,
            mixed_language_entries=[],
            duplicate_count=0,
            duplicate_entries=[],
            avg_confidence=0.0,
            language_distribution={},
        )

    low_confidence_threshold = 0.5
    max_listed_ids = 50
    dedup_prefix_length = 200
    unknown_language = "unknown"

    # Low confidence predictions (strictly below the threshold).
    low_conf = [
        e for e in entries if e.sentiment.confidence < low_confidence_threshold
    ]

    # Mixed language: entries whose detected language differs from the
    # majority. The majority is chosen among *known* languages only;
    # otherwise a dataset dominated by "unknown" detections would flag every
    # successfully detected entry as mixed.
    lang_counts = Counter(e.language.language for e in entries)
    known_counts = Counter(
        {
            lang: count
            for lang, count in lang_counts.items()
            if lang != unknown_language
        }
    )
    if known_counts:
        majority_lang = known_counts.most_common(1)[0][0]
        mixed_lang = [
            e
            for e in entries
            if e.language.language not in (majority_lang, unknown_language)
        ]
    else:
        # No language was detected at all, so there is nothing to compare.
        mixed_lang = []

    # Duplicate detection: exact match on a normalized text prefix. Texts
    # that only differ after the prefix are treated as duplicates.
    seen_texts: set[str] = set()
    duplicate_ids = []
    for e in entries:
        normalized = e.text.strip().lower()[:dedup_prefix_length]
        if normalized in seen_texts:
            duplicate_ids.append(e.id)
        else:
            seen_texts.add(normalized)

    # Average confidence; entries is non-empty here, so no division by zero.
    avg_conf = sum(e.sentiment.confidence for e in entries) / len(entries)

    return DataQualityReport(
        total_entries=len(entries),
        low_confidence_count=len(low_conf),
        low_confidence_entries=[e.id for e in low_conf[:max_listed_ids]],
        mixed_language_count=len(mixed_lang),
        mixed_language_entries=[e.id for e in mixed_lang[:max_listed_ids]],
        duplicate_count=len(duplicate_ids),
        duplicate_entries=duplicate_ids[:max_listed_ids],
        avg_confidence=round(avg_conf, 4),
        language_distribution=dict(lang_counts),
    )