# philverify-api / nlp / claim_extractor.py
# Author: Ryan Christian D. Deniega
# feat: extension button placement, text extraction, OCR display + ML improvements
# commit: c78c2c1
"""
PhilVerify β€” Claim Extractor
Extracts the key falsifiable claim from noisy social media text.
Strategy: sentence scoring based on presence of named entities,
verbs, dates, and numbers β€” no heavy model required.
Filipino fake news headlines almost always embed the checkworthy
assertion in a sentence that contains a specific number/date + person/org
name + an attribution verb (sinabi, ayon, announced, confirmed, etc.).
Scoring these signals finds the right sentence faster and more reliably
than a summarization model that was trained on English news compression.
"""
import re
import logging
from dataclasses import dataclass
logger = logging.getLogger(__name__)
# Split on whitespace that follows terminal punctuation (., !, ?).
# The lookbehind keeps the punctuation attached to the preceding sentence.
_SENTENCE_SPLIT = re.compile(r"(?<=[.!?])\s+")

# Numbers, percentages, or month names signal a specific, verifiable claim.
# Matches a digit run (with optional thousands separators / decimal / %),
# or an English month name, or a Filipino month name.
_DATE_OR_NUM = re.compile(
    r"\b(\d[\d,.%]*"
    r"|(?:January|February|March|April|May|June|July|August|"
    r"September|October|November|December)"
    r"|(?:Enero|Pebrero|Marso|Abril|Mayo|Hunyo|Hulyo|Agosto|"
    r"Setyembre|Oktubre|Nobyembre|Disyembre))\b",
    re.IGNORECASE,
)

# Attribution / assertion verbs in English and Filipino (e.g. "sinabi" =
# "said", "ayon" = "according to"). A single capturing group, so
# .findall() yields one string per match.
_VERB_PATTERN = re.compile(
    r"\b(is|are|was|were|has|have|had|will|would"
    r"|said|says|announced|confirmed|reported|claims|showed"
    r"|found|revealed|arrested|killed|died|signed|approved|ordered"
    r"|sinabi|ipinahayag|inanunsyo|kinumpirma|ayon|nagpahayag"
    r"|inihayag|iniutos|nagsabi|ipinag-utos)\b",
    re.IGNORECASE,
)
@dataclass
class ClaimResult:
    """Outcome of claim extraction: the chosen claim text and the strategy used."""
    # The extracted claim sentence (may be the raw/truncated input text).
    claim: str
    # "sentence_scoring" | "sentence_heuristic" | "passthrough"
    # ("passthrough" is returned for empty or very short input).
    method: str
def _score_sentence(sent: str) -> float:
    """Score how likely *sent* is to contain a falsifiable claim.

    Signals: a concrete number/date (+2.0), attribution/assertion verbs
    (+1.0 each, capped at 3.0), and non-trivial length (+1.0).
    """
    total = 0.0
    # A specific number or month name is the strongest checkworthiness cue.
    if _DATE_OR_NUM.search(sent):
        total += 2.0
    # Verb density, capped so verb-heavy run-ons do not dominate.
    verb_hits = _VERB_PATTERN.findall(sent)
    total += min(3.0, float(len(verb_hits)))
    # Very short fragments rarely carry a complete assertion.
    if len(sent) > 30:
        total += 1.0
    return total
class ClaimExtractor:
    """
    Extracts the single most falsifiable claim from input text using
    sentence scoring. No heavy model required β€” spaCy already loaded
    for NER; this module uses only stdlib regex.
    The highest-scoring sentence (by date/number + verb density) is
    returned as the claim for downstream NewsAPI evidence retrieval.
    """

    def extract(self, text: str) -> ClaimResult:
        """Return the most checkworthy sentence in *text*.

        Args:
            text: Raw (possibly noisy) social media text; may be None/empty.

        Returns:
            ClaimResult with method:
              - "passthrough": input empty/None or under 20 chars, echoed back;
              - "sentence_scoring": the highest-scoring candidate sentence;
              - "sentence_heuristic": fallback (no candidates, or all scored 0).
        """
        # Too short to contain a separable claim β€” echo the input.
        # BUG FIX: the original called text.strip() here even when text was
        # None (the `not text` guard admits None), raising AttributeError.
        if not text or len(text.strip()) < 20:
            return ClaimResult(claim=(text or "").strip(), method="passthrough")
        sentences = [s.strip() for s in _SENTENCE_SPLIT.split(text.strip())]
        # Drop fragments too short to be a full assertion.
        candidates = [s for s in sentences if len(s) > 15]
        if not candidates:
            # No usable sentences β€” fall back to a truncated prefix.
            return ClaimResult(claim=text[:200].strip(), method="sentence_heuristic")
        scored = [(s, _score_sentence(s)) for s in candidates]
        # max() keeps the earliest sentence on ties, matching reading order.
        best_sent, best_score = max(scored, key=lambda x: x[1])
        if best_score > 0:
            return ClaimResult(claim=best_sent, method="sentence_scoring")
        # All scores zero β€” fall back to the first two sentences.
        return ClaimResult(
            claim=" ".join(candidates[:2]),
            method="sentence_heuristic",
        )