File size: 5,064 Bytes
6c9b8f1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c78c2c1
 
 
6c9b8f1
 
 
 
 
 
 
 
 
 
c78c2c1
 
 
 
 
 
 
 
 
 
 
6c9b8f1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
"""
PhilVerify — Named Entity Recognition
Extracts persons, organizations, locations, and dates from text.
Uses spaCy en_core_web_sm with graceful fallback if model not installed.
"""
import logging
import re
from dataclasses import dataclass, field

logger = logging.getLogger(__name__)

# Philippine-specific named entity hints
# Fallback lookup tables used when no NLP model is available. All entries are
# lowercase; they are matched case-insensitively against the input text, and
# the extractor re-cases hits (persons/locations via .title(), orgs via
# .upper()) before returning them.
_PH_PERSONS = {
    "marcos", "duterte", "aquino", "robredo", "lacson", "pingping",
    "bongbong", "sara", "panelo", "roque", "calida", "ano", "teodoro",
}
# Government agencies and institutions; mostly acronyms (DOH, DepEd, ...).
_PH_ORGS = {
    "doh", "deped", "dilg", "dfa", "dof", "dswd", "ched", "nbi", "pnp",
    "afp", "comelec", "sandiganbayan", "ombudsman", "pcso", "pagcor",
    "senate", "congress", "supreme court", "malacanang",
}
# Cities, provinces, regions, and country names (multi-word phrases allowed).
_PH_LOCATIONS = {
    "manila", "quezon city", "makati", "pasig", "taguig", "cebu",
    "davao", "mindanao", "luzon", "visayas", "palawan", "boracay",
    "batangas", "laguna", "cavite", "rizal", "bulacan", "pampanga",
    "metro manila", "ncr", "philippines", "pilipinas",
}


@dataclass
class NERResult:
    """Named entities extracted from a single text.

    `method` records which extraction backend produced the result
    ("spacy" for a model pipeline, "hints" for the regex fallback).
    """
    persons: list[str] = field(default_factory=list)
    organizations: list[str] = field(default_factory=list)
    locations: list[str] = field(default_factory=list)
    dates: list[str] = field(default_factory=list)
    method: str = "spacy"

    def to_dict(self) -> dict:
        """Serialize the four entity lists; the `method` field is not included."""
        return {
            key: getattr(self, key)
            for key in ("persons", "organizations", "locations", "dates")
        }


class EntityExtractor:
    """
    NER using calamanCy (tl_calamancy_lg) for Tagalog-aware entity extraction.
    Falls back to spaCy en_core_web_sm, then to regex-based hint extraction.
    calamanCy uses the same spaCy doc.ents interface so extract() is unchanged.
    """

    # Precompiled date patterns for the hint-based fallback. Handles
    # "February 2026", "Feb 24, 2026", "2026-02-24", and "2/24/2026".
    # The original pattern listed only full month names, so the advertised
    # "Feb 24, 2026" form never matched — abbreviations added.
    _DATE_PATTERNS = [
        re.compile(
            r"\b(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|"
            r"Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:t(?:ember)?)?|"
            r"Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\.?"
            r"(?:\s+\d{1,2})?,?\s+\d{4}\b",
            re.IGNORECASE,
        ),
        re.compile(r"\b\d{4}-\d{2}-\d{2}\b"),
        re.compile(r"\b\d{1,2}/\d{1,2}/\d{2,4}\b"),
    ]

    def __init__(self):
        self._nlp = None      # loaded pipeline, or None if no model is available
        self._loaded = False  # guards against repeated load attempts

    def _load_model(self):
        """Lazily load calamanCy; fall back to spaCy, then to None (hint mode)."""
        if self._loaded:
            return
        try:
            import calamancy
            self._nlp = calamancy.load("tl_calamancy_lg")
            logger.info("calamanCy tl_calamancy_lg loaded")
        except Exception:
            try:
                import spacy
                self._nlp = spacy.load("en_core_web_sm")
                logger.info("spaCy en_core_web_sm loaded (calamancy unavailable)")
            except Exception as e:
                logger.warning("spaCy not available (%s) — using hint-based NER", e)
                self._nlp = None
        self._loaded = True

    @staticmethod
    def _contains_term(term: str, lower_text: str) -> bool:
        """Return True if *term* occurs in *lower_text* as a whole word/phrase.

        Fixes a false-positive bug: the previous raw substring check let short
        hints like "ano", "dof", or "sara" match inside unrelated words
        ("piano", "dofollow", "Sarajevo"). Word boundaries prevent that while
        still matching multi-word phrases like "quezon city".
        """
        return re.search(r"\b" + re.escape(term) + r"\b", lower_text) is not None

    def _hint_based_extract(self, text: str) -> NERResult:
        """Fallback: match PH-specific entity hint lists + date regex.

        Matching is case-insensitive and whole-word; hits are re-cased
        (.title() for persons/locations, .upper() for org acronyms).
        """
        lower = text.lower()
        result = NERResult(method="hints")

        result.persons = [
            p.title() for p in _PH_PERSONS if self._contains_term(p, lower)
        ]
        result.organizations = [
            o.upper() for o in _PH_ORGS if self._contains_term(o, lower)
        ]
        result.locations = [
            loc.title() for loc in _PH_LOCATIONS if self._contains_term(loc, lower)
        ]

        for pattern in self._DATE_PATTERNS:
            result.dates.extend(pattern.findall(text))
        # Overlapping patterns / repeated mentions can duplicate dates;
        # dedupe while preserving first-seen order.
        result.dates = list(dict.fromkeys(result.dates))

        return result

    def extract(self, text: str) -> NERResult:
        """Extract persons, organizations, locations, and dates from *text*.

        Returns a NERResult whose `method` is "spacy" when a model pipeline
        handled the text and "hints" when only the regex fallback ran.
        Never raises: any pipeline error degrades to the hint-based path.
        """
        self._load_model()

        if not text:
            # Empty input: skip the pipeline entirely; result is empty either way.
            return NERResult(method="hints" if self._nlp is None else "spacy")

        if self._nlp is None:
            return self._hint_based_extract(text)

        try:
            # Truncate to bound latency and stay within model input limits.
            doc = self._nlp(text[:5000])
            # NOTE(review): labeled "spacy" even when the calamanCy pipeline
            # loaded (both expose doc.ents) — confirm downstream consumers
            # before distinguishing the two backends here.
            result = NERResult(method="spacy")

            for ent in doc.ents:
                ent_text = ent.text.strip()
                if ent.label_ == "PERSON":
                    result.persons.append(ent_text)
                elif ent.label_ in ("ORG", "NORP"):
                    result.organizations.append(ent_text)
                elif ent.label_ in ("GPE", "LOC"):
                    result.locations.append(ent_text)
                elif ent.label_ in ("DATE", "TIME"):
                    result.dates.append(ent_text)

            # Deduplicate while preserving order.
            result.persons = list(dict.fromkeys(result.persons))
            result.organizations = list(dict.fromkeys(result.organizations))
            result.locations = list(dict.fromkeys(result.locations))
            result.dates = list(dict.fromkeys(result.dates))

            # Supplement with PH hints for entities the model may miss.
            hint_result = self._hint_based_extract(text)
            for p in hint_result.persons:
                if p not in result.persons:
                    result.persons.append(p)
            for o in hint_result.organizations:
                if o not in result.organizations:
                    result.organizations.append(o)

            return result
        except Exception as e:
            logger.warning("spaCy extraction error: %s — falling back to hints", e)
            return self._hint_based_extract(text)