File size: 5,064 Bytes
6c9b8f1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c78c2c1
 
 
6c9b8f1
 
 
 
 
 
 
 
 
 
c78c2c1
 
 
 
 
 
 
 
 
 
 
6c9b8f1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
"""
PhilVerify — Named Entity Recognition
Extracts persons, organizations, locations, and dates from text.
Uses spaCy en_core_web_sm with graceful fallback if model not installed.
"""
import logging
import re
from dataclasses import dataclass, field

logger = logging.getLogger(__name__)

# Philippine-specific named entity hints
# Fallback lookup tables used when no NLP model is available. All entries are
# lowercase; they are matched case-insensitively against the input text, and
# the extractor re-cases hits (persons/locations via .title(), orgs via
# .upper()) before returning them.
_PH_PERSONS = {
    "marcos", "duterte", "aquino", "robredo", "lacson", "pingping",
    "bongbong", "sara", "panelo", "roque", "calida", "ano", "teodoro",
}
# Government agencies and institutions; mostly acronyms (DOH, DepEd, ...).
_PH_ORGS = {
    "doh", "deped", "dilg", "dfa", "dof", "dswd", "ched", "nbi", "pnp",
    "afp", "comelec", "sandiganbayan", "ombudsman", "pcso", "pagcor",
    "senate", "congress", "supreme court", "malacanang",
}
# Cities, provinces, regions, and country names (multi-word phrases allowed).
_PH_LOCATIONS = {
    "manila", "quezon city", "makati", "pasig", "taguig", "cebu",
    "davao", "mindanao", "luzon", "visayas", "palawan", "boracay",
    "batangas", "laguna", "cavite", "rizal", "bulacan", "pampanga",
    "metro manila", "ncr", "philippines", "pilipinas",
}


@dataclass
class NERResult:
    """Named entities extracted from a single text.

    `method` records which extraction backend produced the result
    ("spacy" for a model pipeline, "hints" for the regex fallback).
    """
    persons: list[str] = field(default_factory=list)
    organizations: list[str] = field(default_factory=list)
    locations: list[str] = field(default_factory=list)
    dates: list[str] = field(default_factory=list)
    method: str = "spacy"

    def to_dict(self) -> dict:
        """Serialize the four entity lists; the `method` field is not included."""
        return {
            key: getattr(self, key)
            for key in ("persons", "organizations", "locations", "dates")
        }


class EntityExtractor:
    """
    NER using calamanCy (tl_calamancy_lg) for Tagalog-aware entity extraction.
    Falls back to spaCy en_core_web_sm, then to regex-based hint extraction.
    calamanCy uses the same spaCy doc.ents interface so extract() is unchanged.
    """

    # Precompiled date patterns for the hint-based fallback. Handles
    # "February 2026", "Feb 24, 2026", "2026-02-24", and "2/24/2026".
    # The original pattern listed only full month names, so the advertised
    # "Feb 24, 2026" form never matched — abbreviations added.
    _DATE_PATTERNS = [
        re.compile(
            r"\b(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|"
            r"Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:t(?:ember)?)?|"
            r"Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\.?"
            r"(?:\s+\d{1,2})?,?\s+\d{4}\b",
            re.IGNORECASE,
        ),
        re.compile(r"\b\d{4}-\d{2}-\d{2}\b"),
        re.compile(r"\b\d{1,2}/\d{1,2}/\d{2,4}\b"),
    ]

    def __init__(self):
        self._nlp = None      # loaded pipeline, or None if no model is available
        self._loaded = False  # guards against repeated load attempts

    def _load_model(self):
        """Lazily load calamanCy; fall back to spaCy, then to None (hint mode)."""
        if self._loaded:
            return
        try:
            import calamancy
            self._nlp = calamancy.load("tl_calamancy_lg")
            logger.info("calamanCy tl_calamancy_lg loaded")
        except Exception:
            try:
                import spacy
                self._nlp = spacy.load("en_core_web_sm")
                logger.info("spaCy en_core_web_sm loaded (calamancy unavailable)")
            except Exception as e:
                logger.warning("spaCy not available (%s) — using hint-based NER", e)
                self._nlp = None
        self._loaded = True

    @staticmethod
    def _contains_term(term: str, lower_text: str) -> bool:
        """Return True if *term* occurs in *lower_text* as a whole word/phrase.

        Fixes a false-positive bug: the previous raw substring check let short
        hints like "ano", "dof", or "sara" match inside unrelated words
        ("piano", "dofollow", "Sarajevo"). Word boundaries prevent that while
        still matching multi-word phrases like "quezon city".
        """
        return re.search(r"\b" + re.escape(term) + r"\b", lower_text) is not None

    def _hint_based_extract(self, text: str) -> NERResult:
        """Fallback: match PH-specific entity hint lists + date regex.

        Matching is case-insensitive and whole-word; hits are re-cased
        (.title() for persons/locations, .upper() for org acronyms).
        """
        lower = text.lower()
        result = NERResult(method="hints")

        result.persons = [
            p.title() for p in _PH_PERSONS if self._contains_term(p, lower)
        ]
        result.organizations = [
            o.upper() for o in _PH_ORGS if self._contains_term(o, lower)
        ]
        result.locations = [
            loc.title() for loc in _PH_LOCATIONS if self._contains_term(loc, lower)
        ]

        for pattern in self._DATE_PATTERNS:
            result.dates.extend(pattern.findall(text))
        # Overlapping patterns / repeated mentions can duplicate dates;
        # dedupe while preserving first-seen order.
        result.dates = list(dict.fromkeys(result.dates))

        return result

    def extract(self, text: str) -> NERResult:
        """Extract persons, organizations, locations, and dates from *text*.

        Returns a NERResult whose `method` is "spacy" when a model pipeline
        handled the text and "hints" when only the regex fallback ran.
        Never raises: any pipeline error degrades to the hint-based path.
        """
        self._load_model()

        if not text:
            # Empty input: skip the pipeline entirely; result is empty either way.
            return NERResult(method="hints" if self._nlp is None else "spacy")

        if self._nlp is None:
            return self._hint_based_extract(text)

        try:
            # Truncate to bound latency and stay within model input limits.
            doc = self._nlp(text[:5000])
            # NOTE(review): labeled "spacy" even when the calamanCy pipeline
            # loaded (both expose doc.ents) — confirm downstream consumers
            # before distinguishing the two backends here.
            result = NERResult(method="spacy")

            for ent in doc.ents:
                ent_text = ent.text.strip()
                if ent.label_ == "PERSON":
                    result.persons.append(ent_text)
                elif ent.label_ in ("ORG", "NORP"):
                    result.organizations.append(ent_text)
                elif ent.label_ in ("GPE", "LOC"):
                    result.locations.append(ent_text)
                elif ent.label_ in ("DATE", "TIME"):
                    result.dates.append(ent_text)

            # Deduplicate while preserving order.
            result.persons = list(dict.fromkeys(result.persons))
            result.organizations = list(dict.fromkeys(result.organizations))
            result.locations = list(dict.fromkeys(result.locations))
            result.dates = list(dict.fromkeys(result.dates))

            # Supplement with PH hints for entities the model may miss.
            hint_result = self._hint_based_extract(text)
            for p in hint_result.persons:
                if p not in result.persons:
                    result.persons.append(p)
            for o in hint_result.organizations:
                if o not in result.organizations:
                    result.organizations.append(o)

            return result
        except Exception as e:
            logger.warning("spaCy extraction error: %s — falling back to hints", e)
            return self._hint_based_extract(text)