from __future__ import annotations

from typing import Any, Dict, Mapping

from .explanation import explain_results
from .features import extract_features
from .normalization import normalize_text
from .scoring import DIALECTS, score_dialects
from .tokenization import tokenize

# How many of the largest-magnitude feature contributions are reported.
_TOP_FEATURE_COUNT = 6

# Divisor applied to the winning score to obtain ``confidence``.
# NOTE(review): presumably scores are on a 0-100 scale -- confirm against
# ``score_dialects``.
_SCORE_SCALE = 100.0


def classify_text(text: str, *, strip_diacritics: bool = False) -> Dict[str, Any]:
    """End-to-end dialect classification pipeline.

    Runs ``normalize_text`` -> ``tokenize`` -> ``extract_features`` ->
    ``score_dialects`` and packages the outcome together with the output of
    ``explain_results``.

    Args:
        text: Raw input text to classify.
        strip_diacritics: Forwarded unchanged to ``normalize_text``.

    Returns:
        A dict with the keys:

        * ``"dialect"``: key of the highest score, or ``"Unknown"`` when
          ``score_dialects`` returns nothing.
        * ``"confidence"``: the winning score divided by 100, or ``0.0`` when
          there are no scores.
        * ``"scores"``: the mapping returned by ``score_dialects``.
        * ``"top_features"``: up to six entries of the form
          ``{name: {"contribution": float}}`` taken from
          ``features["_contributions"][dialect]``, ordered by descending
          absolute contribution.
        * ``"explanation"``: the value returned by ``explain_results``.
    """
    normalized = normalize_text(text, strip_diacritics=strip_diacritics)
    tokens = tokenize(normalized)
    features = extract_features(tokens)
    scores = score_dialects(features)

    if scores:
        # On ties ``max`` keeps the first maximal entry in iteration order.
        dialect = max(scores.items(), key=lambda kv: kv[1])[0]
        confidence = float(scores.get(dialect, 0.0)) / _SCORE_SCALE
    else:
        dialect = "Unknown"
        confidence = 0.0

    # Both lookup levels fall back to an empty mapping, so a missing key and
    # an explicit ``None`` value are handled the same way instead of raising
    # ``AttributeError`` on ``.items()``.
    contributions = features.get("_contributions") or {}
    contrib_map: Mapping[str, float] = contributions.get(dialect) or {}

    ranked = sorted(
        contrib_map.items(),
        key=lambda kv: abs(kv[1]),
        reverse=True,
    )
    top_features: Dict[str, Any] = {
        name: {"contribution": float(delta)}
        for name, delta in ranked[:_TOP_FEATURE_COUNT]
    }

    explanation = explain_results(features, scores)

    return {
        "dialect": dialect,
        "confidence": confidence,
        "scores": scores,
        "top_features": top_features,
        "explanation": explanation,
    }