# philverify-api / nlp / claim_extractor.py
# Author: Ryan Christian D. Deniega
# feat: extension button placement, text extraction, OCR display + ML improvements
# commit: c78c2c1
"""
PhilVerify β€” Claim Extractor
Extracts the key falsifiable claim from noisy social media text.
Strategy: sentence scoring based on presence of named entities,
verbs, dates, and numbers β€” no heavy model required.
Filipino fake news headlines almost always embed the checkworthy
assertion in a sentence that contains a specific number/date + person/org
name + an attribution verb (sinabi, ayon, announced, confirmed, etc.).
Scoring these signals finds the right sentence faster and more reliably
than a summarization model that was trained on English news compression.
"""
import re
import logging
from dataclasses import dataclass
logger = logging.getLogger(__name__)
# Split on whitespace that follows terminal punctuation (., !, ?).
# The lookbehind keeps the punctuation attached to the preceding sentence.
_SENTENCE_SPLIT = re.compile(r"(?<=[.!?])\s+")

# Numbers, percentages, or month names signal a specific, verifiable claim.
# Matches a digit run (with optional thousands separators / decimal / %),
# or an English month name, or a Filipino month name.
_DATE_OR_NUM = re.compile(
    r"\b(\d[\d,.%]*"
    r"|(?:January|February|March|April|May|June|July|August|"
    r"September|October|November|December)"
    r"|(?:Enero|Pebrero|Marso|Abril|Mayo|Hunyo|Hulyo|Agosto|"
    r"Setyembre|Oktubre|Nobyembre|Disyembre))\b",
    re.IGNORECASE,
)

# Attribution / assertion verbs in English and Filipino (e.g. "sinabi" =
# "said", "ayon" = "according to"). A single capturing group, so
# .findall() yields one string per match.
_VERB_PATTERN = re.compile(
    r"\b(is|are|was|were|has|have|had|will|would"
    r"|said|says|announced|confirmed|reported|claims|showed"
    r"|found|revealed|arrested|killed|died|signed|approved|ordered"
    r"|sinabi|ipinahayag|inanunsyo|kinumpirma|ayon|nagpahayag"
    r"|inihayag|iniutos|nagsabi|ipinag-utos)\b",
    re.IGNORECASE,
)
@dataclass
class ClaimResult:
    """Outcome of claim extraction: the chosen claim text and the strategy used."""
    # The extracted claim sentence (may be the raw/truncated input text).
    claim: str
    # "sentence_scoring" | "sentence_heuristic" | "passthrough"
    # ("passthrough" is returned for empty or very short input).
    method: str
def _score_sentence(sent: str) -> float:
    """Score how likely *sent* is to contain a falsifiable claim.

    Signals: a concrete number/date (+2.0), attribution/assertion verbs
    (+1.0 each, capped at 3.0), and non-trivial length (+1.0).
    """
    total = 0.0
    # A specific number or month name is the strongest checkworthiness cue.
    if _DATE_OR_NUM.search(sent):
        total += 2.0
    # Verb density, capped so verb-heavy run-ons do not dominate.
    verb_hits = _VERB_PATTERN.findall(sent)
    total += min(3.0, float(len(verb_hits)))
    # Very short fragments rarely carry a complete assertion.
    if len(sent) > 30:
        total += 1.0
    return total
class ClaimExtractor:
    """
    Extracts the single most falsifiable claim from input text using
    sentence scoring. No heavy model required β€” spaCy already loaded
    for NER; this module uses only stdlib regex.
    The highest-scoring sentence (by date/number + verb density) is
    returned as the claim for downstream NewsAPI evidence retrieval.
    """

    def extract(self, text: str) -> ClaimResult:
        """Return the most checkworthy sentence in *text*.

        Args:
            text: Raw (possibly noisy) social media text; may be None/empty.

        Returns:
            ClaimResult with method:
              - "passthrough": input empty/None or under 20 chars, echoed back;
              - "sentence_scoring": the highest-scoring candidate sentence;
              - "sentence_heuristic": fallback (no candidates, or all scored 0).
        """
        # Too short to contain a separable claim β€” echo the input.
        # BUG FIX: the original called text.strip() here even when text was
        # None (the `not text` guard admits None), raising AttributeError.
        if not text or len(text.strip()) < 20:
            return ClaimResult(claim=(text or "").strip(), method="passthrough")
        sentences = [s.strip() for s in _SENTENCE_SPLIT.split(text.strip())]
        # Drop fragments too short to be a full assertion.
        candidates = [s for s in sentences if len(s) > 15]
        if not candidates:
            # No usable sentences β€” fall back to a truncated prefix.
            return ClaimResult(claim=text[:200].strip(), method="sentence_heuristic")
        scored = [(s, _score_sentence(s)) for s in candidates]
        # max() keeps the earliest sentence on ties, matching reading order.
        best_sent, best_score = max(scored, key=lambda x: x[1])
        if best_score > 0:
            return ClaimResult(claim=best_sent, method="sentence_scoring")
        # All scores zero β€” fall back to the first two sentences.
        return ClaimResult(
            claim=" ".join(candidates[:2]),
            method="sentence_heuristic",
        )