Spaces:

SemiAutomat1c
/

philverify-api

Running

philverify-api / nlp /language_detector.py

Ryan Christian D. Deniega

feat: PhilVerify Phase 1-3 — FastAPI backend, NLP pipeline, TF-IDF classifier (23/23 tests)

6c9b8f1 24 days ago

3.71 kB

	"""
	PhilVerify — Language Detector
	Detects Tagalog / English / Taglish using langdetect + Filipino stopword ratio heuristic.
	No heavy model needed — runs instantly.
	"""
	import re
	import logging
	from dataclasses import dataclass

	logger = logging.getLogger(__name__)

	# ── Marker vocabularies for the ratio heuristic ───────────────────────────────
	# High-frequency Filipino function words. A token counts as a Tagalog marker
	# when it appears in this set (matching is done on lowercased tokens).
	_TL_MARKERS = {
	    # case markers, linkers and connectives
	    "ang", "ng", "sa", "si", "ni", "mga", "na", "at", "ay", "nang",
	    # pronouns and demonstratives
	    "ko", "mo", "siya", "kami", "kayo", "sila", "ito",
	    # particles and common conversational fillers
	    "raw", "daw", "ba", "po", "din", "rin", "naman", "lang", "kaya",
	    "talaga", "pala", "sana", "grabe",
	    # conjunctions
	    "dahil", "kung", "pero", "kapag",
	    # negation and existentials
	    "wala", "hindi", "may", "mayroon",
	    # question words
	    "bakit", "paano", "kailan", "nasaan", "sino",
	}

	# High-frequency English function words; none of them also appears in
	# _TL_MARKERS, so a token is never counted for both languages.
	_EN_MARKERS = {
	    # determiners, connectives and prepositions
	    "the", "and", "this", "that", "with", "from",
	    # forms of "to be"
	    "is", "are", "was", "were", "been", "being",
	    # auxiliaries and modals
	    "have", "has", "had", "will", "would", "could", "should",
	    # negation, pronouns and adverbs
	    "not", "they", "their", "there",
	}


	@dataclass
	class LanguageResult:
	    """Outcome of a single language-detection call.

	    Fields are positional, in this order: language, confidence, tl_ratio,
	    en_ratio, method.
	    """

	    # Detected label: "Tagalog", "English", "Taglish" or "Unknown".
	    language: str
	    # Confidence score in the range 0.0 – 1.0.
	    confidence: float
	    # Share of tokens recognised as Tagalog marker words.
	    tl_ratio: float
	    # Share of tokens recognised as English marker words.
	    en_ratio: float
	    # Which path produced the label: "heuristic", "langdetect" or "combined".
	    method: str


	class LanguageDetector:
	    """
	    Two-pass language detector for Tagalog / English / Taglish text.

	    Pass 1 — marker-word ratio (fast, handles code-switching)
	    Pass 2 — langdetect (consulted only when the ratios are inconclusive)

	    Decision rules, evaluated in this order:
	        tl_ratio >= 0.25 and en_ratio < 0.15    → Tagalog
	        en_ratio >= 0.25 and tl_ratio < 0.15    → English
	        tl_ratio >= 0.10 and en_ratio >= 0.10   → Taglish
	        otherwise                               → langdetect result, or
	                                                  Taglish (confidence 0.4)
	                                                  when langdetect cannot decide
	    """

	    # Inputs with fewer stripped characters than this are reported as "Unknown".
	    MIN_TEXT_LENGTH = 5
	    # A language is "clear" when its ratio reaches DOMINANT_RATIO while the
	    # other language's ratio stays below MINOR_RATIO.
	    DOMINANT_RATIO = 0.25
	    MINOR_RATIO = 0.15
	    # Both ratios must reach this value for the text to be called Taglish.
	    TAGLISH_RATIO = 0.10
	    # Lower bound on the confidence reported for a langdetect-based answer.
	    LANGDETECT_CONFIDENCE_FLOOR = 0.5
	    # Confidence reported when neither pass is conclusive.
	    FALLBACK_CONFIDENCE = 0.4

	    # Compiled once; every call tokenises with the same pattern.
	    _TOKEN_RE = re.compile(r"\b\w+\b")
	    # langdetect language codes that map onto labels used by this detector.
	    _LANGDETECT_LABELS = {"tl": "Tagalog", "en": "English"}

	    def _token_ratios(self, text: str) -> tuple[float, float]:
	        """Return ``(tl_ratio, en_ratio)`` for *text*.

	        Each ratio is the number of lowercased word tokens found in the
	        corresponding marker set divided by the total number of tokens.
	        Text without any word token yields ``(0.0, 0.0)``.
	        """
	        tokens = self._TOKEN_RE.findall(text.lower())
	        if not tokens:
	            return 0.0, 0.0
	        total = len(tokens)
	        tl_count = sum(1 for token in tokens if token in _TL_MARKERS)
	        en_count = sum(1 for token in tokens if token in _EN_MARKERS)
	        return tl_count / total, en_count / total

	    def _langdetect(self, text: str) -> str:
	        """Return "Tagalog", "English" or "Unknown" according to langdetect.

	        This pass is best-effort: a missing ``langdetect`` package or a
	        detection error is logged and reported as "Unknown", never raised.
	        """
	        # Imported lazily so the heuristic pass works without the package.
	        try:
	            from langdetect import detect
	        except ImportError:
	            logger.warning(
	                "langdetect is not installed; skipping the langdetect pass"
	            )
	            return "Unknown"

	        # NOTE(review): langdetect's README states results are not
	        # deterministic unless DetectorFactory.seed is set. That is a
	        # process-wide setting, so it is deliberately not changed here.
	        try:
	            code = detect(text)
	        except Exception:
	            # Kept broad on purpose: whatever goes wrong inside the library,
	            # the caller still gets a usable answer.
	            logger.debug("langdetect could not classify the text", exc_info=True)
	            return "Unknown"

	        # Any language other than Tagalog ('tl') or English ('en') is "Unknown".
	        return self._LANGDETECT_LABELS.get(code, "Unknown")

	    def detect(self, text: str) -> LanguageResult:
	        """Classify *text* as Tagalog, English, Taglish or Unknown.

	        Args:
	            text: Raw input text. Empty input, or input shorter than
	                ``MIN_TEXT_LENGTH`` characters after stripping, is "Unknown".

	        Returns:
	            A ``LanguageResult`` carrying the label, a confidence score, both
	            marker ratios and the name of the method that decided the label.
	        """
	        if not text or len(text.strip()) < self.MIN_TEXT_LENGTH:
	            return LanguageResult(
	                language="Unknown",
	                confidence=0.0,
	                tl_ratio=0.0,
	                en_ratio=0.0,
	                method="heuristic",
	            )

	        tl_ratio, en_ratio = self._token_ratios(text)

	        # Clear Tagalog: the confidence is the Tagalog marker ratio itself.
	        if tl_ratio >= self.DOMINANT_RATIO and en_ratio < self.MINOR_RATIO:
	            return LanguageResult(
	                language="Tagalog",
	                confidence=tl_ratio,
	                tl_ratio=tl_ratio,
	                en_ratio=en_ratio,
	                method="heuristic",
	            )

	        # Clear English: the confidence is the English marker ratio itself.
	        if en_ratio >= self.DOMINANT_RATIO and tl_ratio < self.MINOR_RATIO:
	            return LanguageResult(
	                language="English",
	                confidence=en_ratio,
	                tl_ratio=tl_ratio,
	                en_ratio=en_ratio,
	                method="heuristic",
	            )

	        # Taglish: markers of both languages are present.
	        if tl_ratio >= self.TAGLISH_RATIO and en_ratio >= self.TAGLISH_RATIO:
	            return LanguageResult(
	                language="Taglish",
	                confidence=(tl_ratio + en_ratio) / 2,
	                tl_ratio=tl_ratio,
	                en_ratio=en_ratio,
	                method="heuristic",
	            )

	        # Ambiguous ratios: ask langdetect for a second opinion.
	        ld_lang = self._langdetect(text)
	        if ld_lang != "Unknown":
	            return LanguageResult(
	                language=ld_lang,
	                confidence=max(
	                    tl_ratio, en_ratio, self.LANGDETECT_CONFIDENCE_FLOOR
	                ),
	                tl_ratio=tl_ratio,
	                en_ratio=en_ratio,
	                method="langdetect",
	            )

	        # Neither pass was conclusive: default to Taglish with low confidence.
	        return LanguageResult(
	            language="Taglish",
	            confidence=self.FALLBACK_CONFIDENCE,
	            tl_ratio=tl_ratio,
	            en_ratio=en_ratio,
	            method="combined",
	        )