Spaces:
Running
Running
File size: 5,064 Bytes
6c9b8f1 c78c2c1 6c9b8f1 c78c2c1 6c9b8f1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 | """
PhilVerify — Named Entity Recognition
Extracts persons, organizations, locations, and dates from text.
Uses calamanCy tl_calamancy_lg when available, falling back to spaCy en_core_web_sm and then to hint-list matching if no model is installed.
"""
import logging
import re
from dataclasses import dataclass, field
logger = logging.getLogger(__name__)

# Philippine-specific named entity hints.
# All entries are lowercase; the hint-based fallback compares them against
# lowercased input text.

# Person-name hints.
_PH_PERSONS = {
    "marcos",
    "duterte",
    "aquino",
    "robredo",
    "lacson",
    "pingping",
    "bongbong",
    "sara",
    "panelo",
    "roque",
    "calida",
    "ano",
    "teodoro",
}

# Organization / institution hints (may contain multi-word entries).
_PH_ORGS = {
    "doh",
    "deped",
    "dilg",
    "dfa",
    "dof",
    "dswd",
    "ched",
    "nbi",
    "pnp",
    "afp",
    "comelec",
    "sandiganbayan",
    "ombudsman",
    "pcso",
    "pagcor",
    "senate",
    "congress",
    "supreme court",
    "malacanang",
}

# Place-name hints (may contain multi-word entries).
_PH_LOCATIONS = {
    "manila",
    "quezon city",
    "makati",
    "pasig",
    "taguig",
    "cebu",
    "davao",
    "mindanao",
    "luzon",
    "visayas",
    "palawan",
    "boracay",
    "batangas",
    "laguna",
    "cavite",
    "rizal",
    "bulacan",
    "pampanga",
    "metro manila",
    "ncr",
    "philippines",
    "pilipinas",
}
@dataclass
class NERResult:
    """
    Entities found in one piece of text, grouped by category.

    ``persons``, ``organizations``, ``locations`` and ``dates`` each hold the
    extracted strings for that category and default to fresh empty lists.
    ``method`` names the extraction strategy that produced the result and
    defaults to "spacy".
    """

    persons: list[str] = field(default_factory=list)
    organizations: list[str] = field(default_factory=list)
    locations: list[str] = field(default_factory=list)
    dates: list[str] = field(default_factory=list)
    method: str = "spacy"

    def to_dict(self) -> dict:
        """Return the four entity lists keyed by category; ``method`` is not included."""
        exported = ("persons", "organizations", "locations", "dates")
        # The lists are returned as-is (not copied).
        return {name: getattr(self, name) for name in exported}
class EntityExtractor:
    """
    Extract persons, organizations, locations, and dates from text.

    The NER backend is resolved lazily on the first ``extract()`` call:
    calamanCy ``tl_calamancy_lg`` is tried first, then spaCy
    ``en_core_web_sm``. If neither loads, extraction falls back to matching
    the module-level PH hint sets plus date regexes. Both models are read
    through the same ``doc.ents`` interface, so ``extract()`` treats them
    identically.
    """

    # Upper bound on the number of characters handed to the model per call.
    _MAX_MODEL_CHARS = 5000

    # Entity label -> NERResult attribute. calamanCy's Tagalog pipelines tag
    # people as "PER" while spaCy's English pipelines use "PERSON"; accepting
    # both keeps persons regardless of which backend loaded.
    _LABEL_TO_FIELD = {
        "PER": "persons",
        "PERSON": "persons",
        "ORG": "organizations",
        "NORP": "organizations",
        "GPE": "locations",
        "LOC": "locations",
        "DATE": "dates",
        "TIME": "dates",
    }

    _MONTH = (
        r"(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|June?|July?"
        r"|Aug(?:ust)?|Sep(?:t(?:ember)?)?|Oct(?:ober)?|Nov(?:ember)?"
        r"|Dec(?:ember)?)"
    )

    # Date patterns: "February 2026", "Feb 24, 2026", "Feb. 24, 2026",
    # "2026-02-24", "2/24/2026". Compiled once; only non-capturing groups are
    # used so findall() yields whole matches.
    _DATE_PATTERNS = tuple(
        re.compile(pattern, re.IGNORECASE)
        for pattern in (
            r"\b" + _MONTH + r"\.?(?:\s+\d{1,2})?,?\s+\d{4}\b",
            r"\b\d{4}-\d{2}-\d{2}\b",
            r"\b\d{1,2}/\d{1,2}/\d{2,4}\b",
        )
    )

    def __init__(self):
        self._nlp = None       # loaded pipeline, or None when only hints are available
        self._loaded = False   # True once backend resolution has been attempted

    def _load_model(self):
        """Resolve the NER backend once; subsequent calls return immediately."""
        if self._loaded:
            return
        try:
            import calamancy
            self._nlp = calamancy.load("tl_calamancy_lg")
            logger.info("calamanCy tl_calamancy_lg loaded")
        except Exception as calamancy_err:
            # Best-effort: any failure here just moves on to the next backend,
            # but record why so a missing model is diagnosable.
            logger.debug("calamanCy unavailable (%s) — trying spaCy", calamancy_err)
            try:
                import spacy
                self._nlp = spacy.load("en_core_web_sm")
                logger.info("spaCy en_core_web_sm loaded (calamancy unavailable)")
            except Exception as e:
                logger.warning("spaCy not available (%s) — using hint-based NER", e)
                self._nlp = None
        self._loaded = True

    @staticmethod
    def _match_hints(hints: set[str], lowered: str) -> list[str]:
        """
        Return the hints that occur in ``lowered`` as whole words/phrases.

        Lookarounds reject matches embedded in a longer word (e.g. "ano" in
        "another", "sara" in "sarah"). Hints are visited in sorted order so
        the result does not depend on set iteration order.
        """
        return [
            hint
            for hint in sorted(hints)
            if re.search(r"(?<!\w)" + re.escape(hint) + r"(?!\w)", lowered)
        ]

    @staticmethod
    def _merge_missing(existing: list[str], extras: list[str]) -> None:
        """Append to ``existing`` each of ``extras`` not already present, ignoring case."""
        seen = {item.casefold() for item in existing}
        for extra in extras:
            key = extra.casefold()
            if key not in seen:
                seen.add(key)
                existing.append(extra)

    def _hint_based_extract(self, text: str) -> NERResult:
        """
        Fallback: match PH-specific entity hint lists + date regexes.

        Persons and locations are title-cased and organizations upper-cased
        in the result; dates are returned as written in ``text``, with
        duplicates removed. The result's ``method`` is "hints".
        """
        lowered = text.lower()
        result = NERResult(method="hints")
        result.persons = [p.title() for p in self._match_hints(_PH_PERSONS, lowered)]
        result.organizations = [o.upper() for o in self._match_hints(_PH_ORGS, lowered)]
        result.locations = [loc.title() for loc in self._match_hints(_PH_LOCATIONS, lowered)]

        found_dates = []
        for pattern in self._DATE_PATTERNS:
            found_dates.extend(pattern.findall(text))
        # Deduplicate while preserving order
        result.dates = list(dict.fromkeys(found_dates))
        return result

    def extract(self, text: str) -> NERResult:
        """
        Extract entities from ``text``.

        With a loaded model, the first ``_MAX_MODEL_CHARS`` characters are run
        through it, entities are bucketed by label and de-duplicated, and
        person/organization hints found in the full text are appended when
        missing. Without a model, or if the model path raises, the hint-based
        result is returned instead.
        """
        self._load_model()
        if self._nlp is None:
            return self._hint_based_extract(text)
        try:
            doc = self._nlp(text[: self._MAX_MODEL_CHARS])
            # method stays "spacy" for either model backend.
            result = NERResult(method="spacy")
            for ent in doc.ents:
                target = self._LABEL_TO_FIELD.get(ent.label_)
                ent_text = ent.text.strip()
                if target is not None and ent_text:
                    getattr(result, target).append(ent_text)

            # Deduplicate while preserving order
            result.persons = list(dict.fromkeys(result.persons))
            result.organizations = list(dict.fromkeys(result.organizations))
            result.locations = list(dict.fromkeys(result.locations))
            result.dates = list(dict.fromkeys(result.dates))

            # Supplement with PH hints for entities the model may miss.
            # Compared case-insensitively so "Senate" and "SENATE" are not
            # both reported.
            hint_result = self._hint_based_extract(text)
            self._merge_missing(result.persons, hint_result.persons)
            self._merge_missing(result.organizations, hint_result.organizations)
            # NOTE(review): hint locations and dates are not merged here,
            # matching prior behavior — confirm whether that is intentional.
            return result
        except Exception as e:
            logger.warning("spaCy extraction error: %s — falling back to hints", e)
            return self._hint_based_extract(text)
|