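# Hosting-page residue kept for provenance (not part of the program):
#   uploader avatar caption: "thomascerniglia's picture"
#   commit message: "Upload 8 files"
#   commit: d0326ea (verified)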
from __future__ import annotations
from typing import Any, Dict, Mapping
from .explanation import explain_results
from .features import extract_features
from .normalization import normalize_text
from .scoring import DIALECTS, score_dialects
from .tokenization import tokenize
def classify_text(text: str, *, strip_diacritics: bool = False) -> Dict[str, Any]:
    """Run the end-to-end dialect classification pipeline on ``text``.

    The text is normalized, tokenized, converted to features and scored. The
    highest-scoring dialect is reported together with its largest feature
    contributions and an explanation.

    Args:
        text: Raw input text to classify.
        strip_diacritics: Forwarded unchanged to ``normalize_text``.

    Returns:
        A dict with the keys:

        * ``"dialect"``: the label with the highest score, or ``"Unknown"``
          when ``score_dialects`` returns an empty mapping.
        * ``"confidence"``: the winning score divided by 100, or ``0.0`` when
          there are no scores.
        * ``"scores"``: the mapping returned by ``score_dialects``.
        * ``"top_features"``: at most six entries of the form
          ``{name: {"contribution": float}}``, taken from
          ``features["_contributions"][dialect]`` and ordered by descending
          absolute contribution. Empty when that data is missing or ``None``.
        * ``"explanation"``: the result of ``explain_results(features, scores)``.
    """
    normalized = normalize_text(text, strip_diacritics=strip_diacritics)
    tokens = tokenize(normalized)
    features = extract_features(tokens)
    scores = score_dialects(features)

    if scores:
        # A single pass gives both the winning label and its score; on ties
        # ``max`` keeps the first maximal item in iteration order.
        dialect, top_score = max(scores.items(), key=lambda kv: kv[1])
        # NOTE(review): dividing by 100 assumes scores are on a 0-100 scale;
        # confirm against score_dialects.
        confidence = float(top_score) / 100.0
    else:
        dialect, confidence = "Unknown", 0.0

    # Both lookups fall back to an empty mapping so that a missing *or* None
    # entry yields no top features instead of raising AttributeError.
    all_contributions = features.get("_contributions") or {}
    contrib_map: Mapping[str, float] = all_contributions.get(dialect) or {}  # type: ignore[assignment]

    # Rank by magnitude so strong negative contributions are surfaced too.
    ranked = sorted(contrib_map.items(), key=lambda kv: abs(kv[1]), reverse=True)
    top_features: Dict[str, Any] = {
        name: {"contribution": float(delta)} for name, delta in ranked[:6]
    }

    explanation = explain_results(features, scores)

    return {
        "dialect": dialect,
        "confidence": confidence,
        "scores": scores,
        "top_features": top_features,
        "explanation": explanation,
    }