philverify-api / nlp /language_detector.py
Ryan Christian D. Deniega
feat: PhilVerify Phase 1-3 — FastAPI backend, NLP pipeline, TF-IDF classifier (23/23 tests)
6c9b8f1
"""
PhilVerify — Language Detector
Detects Tagalog / English / Taglish using langdetect + Filipino stopword ratio heuristic.
No heavy model needed β€” runs instantly.
"""
import re
import logging
from dataclasses import dataclass
logger = logging.getLogger(__name__)
# ── Filipino stopword set for heuristic ───────────────────────────────────────
_TL_MARKERS = {
"ang", "ng", "na", "sa", "at", "ay", "mga", "ni", "nang", "si",
"ko", "mo", "siya", "kami", "kayo", "sila", "ito", "raw", "daw",
"ba", "po", "din", "rin", "naman", "lang", "kaya", "dahil", "kung",
"pero", "kapag", "talaga", "pala", "sana", "grabe", "wala", "hindi",
"may", "mayroon", "bakit", "paano", "kailan", "nasaan", "sino",
}
# English marker words (distinct from TL)
_EN_MARKERS = {
"the", "and", "is", "are", "was", "were", "this", "that", "with",
"from", "have", "has", "had", "will", "would", "could", "should",
"not", "been", "being", "they", "their", "there",
}
@dataclass
class LanguageResult:
    """Outcome of one language-detection call.

    Attributes:
        language: One of "Tagalog", "English", "Taglish" or "Unknown".
        confidence: Score between 0.0 and 1.0.
        tl_ratio: Tagalog marker ratio recorded for the text.
        en_ratio: English marker ratio recorded for the text.
        method: How the label was decided: "heuristic", "langdetect"
            or "combined".
    """

    language: str
    confidence: float
    tl_ratio: float
    en_ratio: float
    method: str
class LanguageDetector:
    """
    Two-pass language detector for Tagalog / English / Taglish text.

    Pass 1 — marker-word ratios (fast, handles code-switching)
    Pass 2 — langdetect (consulted only when the ratios are ambiguous)

    Decision rules, evaluated in order:
        tl_ratio >= 0.25 and en_ratio < 0.15    → Tagalog
        en_ratio >= 0.25 and tl_ratio < 0.15    → English
        tl_ratio >= 0.10 and en_ratio >= 0.10   → Taglish
        otherwise                               → langdetect result, or
                                                  Taglish (confidence 0.4)
                                                  if langdetect cannot decide
    """

    # Marker-ratio thresholds used by detect(); keep the class docstring in
    # sync when changing them.
    _DOMINANT_MIN = 0.25  # share a language needs to count as clearly dominant
    _OTHER_MAX = 0.15     # the other language must stay below this share
    _MIXED_MIN = 0.10     # both languages need at least this share for Taglish

    def _token_ratios(self, text: str) -> tuple[float, float]:
        """Return ``(tl_ratio, en_ratio)`` for ``text``.

        Each ratio is the number of lower-cased word tokens found in the
        corresponding marker set divided by the total number of word tokens.
        Text without any word token yields ``(0.0, 0.0)``.
        """
        tokens = re.findall(r"\b\w+\b", text.lower())
        if not tokens:
            return 0.0, 0.0
        total = len(tokens)
        tl_count = sum(1 for t in tokens if t in _TL_MARKERS)
        en_count = sum(1 for t in tokens if t in _EN_MARKERS)
        return tl_count / total, en_count / total

    def _langdetect(self, text: str) -> str:
        """Classify ``text`` with langdetect.

        Returns:
            "Tagalog" for langdetect code ``tl``, "English" for ``en`` and
            "Unknown" for every other code, for a missing langdetect
            install, or when detection raises.
        """
        try:
            # Imported here so a missing langdetect install only disables the
            # fallback instead of breaking the module import.
            from langdetect import DetectorFactory, detect

            # langdetect is non-deterministic on short or ambiguous input,
            # which is exactly the input that reaches this fallback. Pinning
            # the seed makes repeated calls agree. The setting is process-wide
            # for langdetect.
            DetectorFactory.seed = 0
            code = detect(text)
        except ImportError:
            logger.warning(
                "langdetect is not installed; fallback language detection is disabled"
            )
            return "Unknown"
        except Exception:
            # Deliberately best-effort: any detection failure degrades to
            # "Unknown", but leave a trace for debugging.
            logger.debug("langdetect failed to classify the text", exc_info=True)
            return "Unknown"

        if code == "tl":
            return "Tagalog"
        if code == "en":
            return "English"
        return "Unknown"

    def detect(self, text: str) -> LanguageResult:
        """Detect the language of ``text``.

        Args:
            text: Raw input text.

        Returns:
            A ``LanguageResult``. Empty text, or text shorter than five
            characters after stripping, gives "Unknown" with confidence 0.0.
            Heuristic results use the dominant marker ratio as confidence
            (the mean of both ratios for Taglish). langdetect results use
            ``max(tl_ratio, en_ratio, 0.5)``. If langdetect cannot decide,
            the result is "Taglish" with confidence 0.4 and method
            "combined".
        """
        if not text or len(text.strip()) < 5:
            return LanguageResult("Unknown", 0.0, 0.0, 0.0, "heuristic")

        tl_ratio, en_ratio = self._token_ratios(text)

        # Clear Tagalog
        if tl_ratio >= self._DOMINANT_MIN and en_ratio < self._OTHER_MAX:
            return LanguageResult("Tagalog", tl_ratio, tl_ratio, en_ratio, "heuristic")

        # Clear English
        if en_ratio >= self._DOMINANT_MIN and tl_ratio < self._OTHER_MAX:
            return LanguageResult("English", en_ratio, tl_ratio, en_ratio, "heuristic")

        # Taglish — both marker sets are present
        if tl_ratio >= self._MIXED_MIN and en_ratio >= self._MIXED_MIN:
            confidence = (tl_ratio + en_ratio) / 2
            return LanguageResult("Taglish", confidence, tl_ratio, en_ratio, "heuristic")

        # Ambiguous — fall back to langdetect
        ld_lang = self._langdetect(text)
        if ld_lang != "Unknown":
            confidence = max(tl_ratio, en_ratio, 0.5)
            return LanguageResult(ld_lang, confidence, tl_ratio, en_ratio, "langdetect")

        # Neither pass was conclusive: default to Taglish with low confidence.
        return LanguageResult("Taglish", 0.4, tl_ratio, en_ratio, "combined")