"""
PhilVerify — Language Detector

Detects Tagalog / English / Taglish using a Filipino/English marker-word
ratio heuristic, with langdetect as a fallback for ambiguous text.
No heavy model needed — runs instantly.
"""
import re
import logging
from dataclasses import dataclass

logger = logging.getLogger(__name__)

# ── Filipino stopword set for heuristic ───────────────────────────────────────
_TL_MARKERS = {
    "ang", "ng", "na", "sa", "at", "ay", "mga", "ni", "nang", "si",
    "ko", "mo", "siya", "kami", "kayo", "sila", "ito", "raw", "daw",
    "ba", "po", "din", "rin", "naman", "lang", "kaya", "dahil", "kung",
    "pero", "kapag", "talaga", "pala", "sana", "grabe", "wala", "hindi",
    "may", "mayroon", "bakit", "paano", "kailan", "nasaan", "sino",
}

# English marker words (distinct from TL)
_EN_MARKERS = {
    "the", "and", "is", "are", "was", "were", "this", "that", "with",
    "from", "have", "has", "had", "will", "would", "could", "should",
    "not", "been", "being", "they", "their", "there",
}

# ── Heuristic thresholds ──────────────────────────────────────────────────────
# Stripped text shorter than this is not classified at all.
_MIN_TEXT_LENGTH = 5
# Marker ratio a language must reach to be considered dominant.
_DOMINANT_RATIO = 0.25
# The other language must stay below this ratio for a "clear" verdict.
_MINOR_RATIO = 0.15
# Both languages must reach this ratio for a heuristic Taglish verdict.
_TAGLISH_RATIO = 0.10
# Minimum confidence reported when langdetect supplies the label.
_LANGDETECT_MIN_CONFIDENCE = 0.5
# Confidence reported when neither the heuristic nor langdetect decides.
_FALLBACK_CONFIDENCE = 0.4

# Compiled once at import time; matches runs of word characters.
_TOKEN_RE = re.compile(r"\b\w+\b")

# langdetect language codes mapped to the labels used by LanguageResult.
_LANGDETECT_LABELS = {"tl": "Tagalog", "en": "English"}


@dataclass
class LanguageResult:
    """Outcome of a single language-detection call."""

    language: str      # "Tagalog" | "English" | "Taglish" | "Unknown"
    confidence: float  # 0.0 – 1.0
    tl_ratio: float    # share of tokens found in _TL_MARKERS
    en_ratio: float    # share of tokens found in _EN_MARKERS
    method: str        # "heuristic" | "langdetect" | "combined"


class LanguageDetector:
    """
    Two-pass language detector:
      Pass 1 — Filipino/English marker-word ratio (fast, handles code-switching)
      Pass 2 — langdetect (only when the ratios are ambiguous)

    Decision rules, evaluated in this order:
      tl_ratio >= 0.25 and en_ratio < 0.15   → Tagalog
      en_ratio >= 0.25 and tl_ratio < 0.15   → English
      tl_ratio >= 0.10 and en_ratio >= 0.10  → Taglish
      otherwise                              → langdetect result, or
                                               Taglish (confidence 0.4) when
                                               langdetect gives no usable answer
    """

    def _token_ratios(self, text: str) -> tuple[float, float]:
        """Return ``(tl_ratio, en_ratio)`` for *text*.

        Each ratio is the number of tokens found in the corresponding marker
        set divided by the total token count. Text with no tokens yields
        ``(0.0, 0.0)``.
        """
        tokens = _TOKEN_RE.findall(text.lower())
        if not tokens:
            return 0.0, 0.0
        total = len(tokens)
        tl_count = sum(1 for token in tokens if token in _TL_MARKERS)
        en_count = sum(1 for token in tokens if token in _EN_MARKERS)
        return tl_count / total, en_count / total

    def _langdetect(self, text: str) -> str:
        """Classify *text* with langdetect.

        Returns "Tagalog", "English", or "Unknown". "Unknown" covers any other
        detected language, a missing langdetect installation, and any error
        raised during detection; this pass is best-effort and never raises.
        """
        # NOTE(review): langdetect's README describes its results as
        # non-deterministic unless DetectorFactory.seed is set — confirm
        # whether the application seeds it elsewhere.
        try:
            # Imported lazily so the heuristic pass works without langdetect.
            from langdetect import detect as ld_detect

            code = ld_detect(text)
        except ImportError:
            logger.debug("langdetect is not installed; skipping fallback pass")
            return "Unknown"
        except Exception:
            # Deliberately broad: a failed fallback must not break detection,
            # but the failure is logged instead of being silently discarded.
            logger.debug("langdetect failed to classify text", exc_info=True)
            return "Unknown"
        # langdetect returns 'tl' for Tagalog
        return _LANGDETECT_LABELS.get(code, "Unknown")

    def detect(self, text: str) -> LanguageResult:
        """Detect the language of *text*.

        Empty input, or input shorter than 5 characters after stripping,
        yields an "Unknown" result with zero confidence. Otherwise the
        decision rules in the class docstring are applied in order.
        """
        if not text or len(text.strip()) < _MIN_TEXT_LENGTH:
            return LanguageResult("Unknown", 0.0, 0.0, 0.0, "heuristic")

        tl_ratio, en_ratio = self._token_ratios(text)

        # Clear Tagalog — confidence is the Tagalog marker ratio itself.
        if tl_ratio >= _DOMINANT_RATIO and en_ratio < _MINOR_RATIO:
            return LanguageResult(
                "Tagalog", tl_ratio, tl_ratio, en_ratio, "heuristic"
            )

        # Clear English — confidence is the English marker ratio itself.
        if en_ratio >= _DOMINANT_RATIO and tl_ratio < _MINOR_RATIO:
            return LanguageResult(
                "English", en_ratio, tl_ratio, en_ratio, "heuristic"
            )

        # Taglish — both markers present
        if tl_ratio >= _TAGLISH_RATIO and en_ratio >= _TAGLISH_RATIO:
            confidence = (tl_ratio + en_ratio) / 2
            return LanguageResult(
                "Taglish", confidence, tl_ratio, en_ratio, "heuristic"
            )

        # Ambiguous — fall back to langdetect
        ld_lang = self._langdetect(text)
        if ld_lang != "Unknown":
            confidence = max(tl_ratio, en_ratio, _LANGDETECT_MIN_CONFIDENCE)
            return LanguageResult(
                ld_lang, confidence, tl_ratio, en_ratio, "langdetect"
            )

        # Neither pass was decisive; the existing contract is a low-confidence
        # Taglish verdict rather than "Unknown".
        return LanguageResult(
            "Taglish", _FALLBACK_CONFIDENCE, tl_ratio, en_ratio, "combined"
        )