# topic-analysis/backend/app/services/data_quality.py
# alexchilton
# Initial deployment: Sentiment & Topic Analysis Dashboard
# 6242ddb
"""Data quality analysis: low confidence, mixed language, duplicate detection."""
from __future__ import annotations
from collections import Counter
from typing import List
from app.models.schemas import AnalyzedEntry, DataQualityReport
def analyze_data_quality(
    entries: list[AnalyzedEntry],
    *,
    low_confidence_threshold: float = 0.5,
    max_listed_ids: int = 50,
) -> DataQualityReport:
    """Generate a data quality report from analyzed entries.

    Three checks are run over ``entries``:

    * Low confidence: entries whose ``sentiment.confidence`` is strictly
      below ``low_confidence_threshold``.
    * Mixed language: entries whose ``language.language`` is neither the
      majority detected language nor ``"unknown"``.
    * Duplicates: entries whose text, after stripping, lowercasing and
      truncating to 200 characters, matches an earlier entry. The first
      occurrence is not counted; only later repeats are.

    Args:
        entries: Analyzed entries to inspect.
        low_confidence_threshold: Confidence cut-off for the low-confidence
            check. Defaults to 0.5.
        max_listed_ids: Maximum number of entry ids included in each of the
            ``*_entries`` lists. The ``*_count`` fields always reflect the
            full number of matches. Defaults to 50.

    Returns:
        A ``DataQualityReport`` with counts, capped id lists, the average
        sentiment confidence rounded to 4 decimals, and the per-language
        entry counts. For empty input every count is zero and every
        collection is empty.
    """
    if not entries:
        return DataQualityReport(
            total_entries=0,
            low_confidence_count=0,
            low_confidence_entries=[],
            mixed_language_count=0,
            mixed_language_entries=[],
            duplicate_count=0,
            duplicate_entries=[],
            avg_confidence=0.0,
            language_distribution={},
        )

    # Low-confidence predictions.
    low_conf = [
        e for e in entries
        if e.sentiment.confidence < low_confidence_threshold
    ]

    # Mixed language: entries whose detected language differs from the
    # majority. The majority is chosen among *detected* languages only;
    # otherwise a dataset dominated by "unknown" would flag every entry
    # that does have a detected language.
    lang_counts = Counter(e.language.language for e in entries)
    majority_lang = next(
        (lang for lang, _ in lang_counts.most_common() if lang != "unknown"),
        "unknown",
    )
    mixed_lang = [
        e for e in entries
        if e.language.language not in (majority_lang, "unknown")
    ]

    # Duplicate detection on a normalized 200-character prefix, so texts
    # differing only in case, surrounding whitespace, or content past the
    # prefix are treated as duplicates of the first occurrence.
    seen_texts: set[str] = set()
    duplicate_ids = []
    for e in entries:
        normalized = e.text.strip().lower()[:200]
        if normalized in seen_texts:
            duplicate_ids.append(e.id)
        else:
            seen_texts.add(normalized)

    avg_conf = sum(e.sentiment.confidence for e in entries) / len(entries)

    return DataQualityReport(
        total_entries=len(entries),
        low_confidence_count=len(low_conf),
        low_confidence_entries=[e.id for e in low_conf[:max_listed_ids]],
        mixed_language_count=len(mixed_lang),
        mixed_language_entries=[e.id for e in mixed_lang[:max_listed_ids]],
        duplicate_count=len(duplicate_ids),
        duplicate_entries=duplicate_ids[:max_listed_ids],
        avg_confidence=round(avg_conf, 4),
        language_distribution=dict(lang_counts),
    )