Spaces:
Sleeping
Sleeping
| """Data quality analysis: low confidence, mixed language, duplicate detection.""" | |
| from __future__ import annotations | |
| from collections import Counter | |
| from typing import List | |
| from app.models.schemas import AnalyzedEntry, DataQualityReport | |
def analyze_data_quality(entries: list[AnalyzedEntry]) -> DataQualityReport:
    """Generate a data quality report from analyzed entries.

    Three kinds of issue are flagged:

    * Low confidence: ``sentiment.confidence`` is strictly below 0.5.
    * Mixed language: the detected language differs from the majority
      language. The majority is taken over entries whose language is not
      ``"unknown"``, and ``"unknown"`` entries are never flagged, so a
      batch dominated by undetected entries does not mark every detected
      entry as mixed.
    * Duplicates: the stripped, lower-cased first 200 characters of
      ``text`` equal those of an earlier entry. The first occurrence is
      kept; each later occurrence is flagged. This is a normalized
      prefix match, not a fuzzy similarity measure.

    Args:
        entries: Analyzed entries to inspect; may be empty.

    Returns:
        A ``DataQualityReport``. The ``*_count`` fields count every
        flagged entry, while the ``*_entries`` ID lists are capped at 50
        items each. ``avg_confidence`` is rounded to 4 decimal places and
        ``language_distribution`` maps each detected language (including
        ``"unknown"``) to its entry count. An empty input yields a report
        with zero counts, empty lists and an empty distribution.
    """
    if not entries:
        return DataQualityReport(
            total_entries=0,
            low_confidence_count=0,
            low_confidence_entries=[],
            mixed_language_count=0,
            mixed_language_entries=[],
            duplicate_count=0,
            duplicate_entries=[],
            avg_confidence=0.0,
            language_distribution={},
        )

    # Only the ID lists are capped; the counts always cover every hit.
    id_limit = 50
    # Texts are compared on this many leading characters after normalizing.
    prefix_len = 200

    # Low confidence predictions (< 0.5).
    low_conf = [e for e in entries if e.sentiment.confidence < 0.5]

    # Mixed language: pick the majority among *known* languages only.
    # "unknown" means detection failed, so it must not win the vote.
    lang_counts = Counter(e.language.language for e in entries)
    known_counts = {
        lang: count for lang, count in lang_counts.items() if lang != "unknown"
    }
    # max() returns the first maximal key, so ties resolve to the language
    # seen first, matching Counter.most_common() ordering.
    majority_lang = (
        max(known_counts, key=known_counts.get) if known_counts else "unknown"
    )
    mixed_lang = [
        e for e in entries
        if e.language.language not in (majority_lang, "unknown")
    ]

    # Duplicate detection on the normalized text prefix. A set suffices
    # because only membership is needed, not the ID of the first occurrence.
    seen_texts: set[str] = set()
    duplicate_ids = []
    for e in entries:
        normalized = e.text.strip().lower()[:prefix_len]
        if normalized in seen_texts:
            duplicate_ids.append(e.id)
        else:
            seen_texts.add(normalized)

    # Average confidence; entries is non-empty here, so no division by zero.
    avg_conf = sum(e.sentiment.confidence for e in entries) / len(entries)

    return DataQualityReport(
        total_entries=len(entries),
        low_confidence_count=len(low_conf),
        low_confidence_entries=[e.id for e in low_conf[:id_limit]],
        mixed_language_count=len(mixed_lang),
        mixed_language_entries=[e.id for e in mixed_lang[:id_limit]],
        duplicate_count=len(duplicate_ids),
        duplicate_entries=duplicate_ids[:id_limit],
        avg_confidence=round(avg_conf, 4),
        language_distribution=dict(lang_counts),
    )