Spaces:
Running
Running
Ryan Christian D. Deniega
feat: extension button placement, text extraction, OCR display + ML improvements
c78c2c1 | """ | |
| PhilVerify — Claim Extractor | |
| Extracts the key falsifiable claim from noisy social media text. | |
| Strategy: sentence scoring based on presence of named entities, | |
| verbs, dates, and numbers — no heavy model required. | |
| Filipino fake news headlines almost always embed the checkworthy | |
| assertion in a sentence that contains a specific number/date + person/org | |
| name + an attribution verb (sinabi, ayon, announced, confirmed, etc.). | |
| Scoring these signals finds the right sentence faster and more reliably | |
| than a summarization model that was trained on English news compression. | |
| """ | |
| import re | |
| import logging | |
| from dataclasses import dataclass | |
logger = logging.getLogger(__name__)

# Sentence boundary: whitespace that directly follows '.', '!' or '?'.
# The lookbehind keeps the terminator attached to the preceding sentence.
_SENTENCE_SPLIT = re.compile(r"(?<=[.!?])\s+")

# Specificity signal: a number (optionally with ',', '.', '%' inside it) or
# an English / Filipino month name, matched as a whole word, any case.
_DATE_OR_NUM = re.compile(
    r"\b("
    r"\d[\d,.%]*"
    r"|(?:January|February|March|April|May|June"
    r"|July|August|September|October|November|December)"
    r"|(?:Enero|Pebrero|Marso|Abril|Mayo|Hunyo"
    r"|Hulyo|Agosto|Setyembre|Oktubre|Nobyembre|Disyembre)"
    r")\b",
    flags=re.IGNORECASE,
)

# Assertion signal: auxiliary, attribution and event verbs in English,
# followed by their Filipino counterparts; whole word, any case.
_VERB_PATTERN = re.compile(
    r"\b("
    r"is|are|was|were|has|have|had|will|would"
    r"|said|says|announced|confirmed|reported|claims|showed|found|revealed"
    r"|arrested|killed|died|signed|approved|ordered"
    r"|sinabi|ipinahayag|inanunsyo|kinumpirma|ayon|nagpahayag|inihayag"
    r"|iniutos|nagsabi|ipinag-utos"
    r")\b",
    flags=re.IGNORECASE,
)
@dataclass
class ClaimResult:
    """Result of claim extraction: the chosen text and how it was chosen.

    Declared as a dataclass so it can be built with keyword arguments
    (``ClaimResult(claim=..., method=...)``); without the decorator the
    class has no ``__init__`` accepting those fields and construction
    raises ``TypeError``.
    """

    claim: str  # the extracted claim text
    method: str  # "sentence_scoring" | "sentence_heuristic" | "passthrough"
def _score_sentence(sent: str) -> float:
    """Rate how likely *sent* is to carry a falsifiable claim.

    The score is the sum of three signals:

    * 2.0 if ``_DATE_OR_NUM`` matches anywhere in the sentence;
    * 1.0 per ``_VERB_PATTERN`` match, capped at 3.0;
    * 1.0 if the sentence is longer than 30 characters.
    """
    has_specifics = _DATE_OR_NUM.search(sent) is not None
    verb_hits = len(_VERB_PATTERN.findall(sent))

    total = 2.0 if has_specifics else 0.0
    # Cap the verb contribution so verb-heavy run-ons don't dominate.
    total += min(3.0, verb_hits * 1.0)
    if len(sent) > 30:
        total += 1.0
    return total
class ClaimExtractor:
    """
    Extracts the single most falsifiable claim from input text using
    sentence scoring. No heavy model required — this class relies only on
    the module's precompiled stdlib regexes.

    The highest-scoring sentence (by date/number + verb density) is
    returned as the claim. (The original author's note says the claim is
    used for downstream evidence retrieval; that consumer is not visible
    in this module.)
    """

    def extract(self, text: str) -> ClaimResult:
        """Pick the most checkworthy sentence from *text*.

        Args:
            text: Raw input text. ``None`` and empty strings are tolerated
                and treated as empty input.

        Returns:
            A ``ClaimResult`` whose ``method`` records how the claim was
            chosen:

            * ``"passthrough"`` - the stripped text is shorter than 20
              characters; it is returned as-is (``""`` for ``None``).
            * ``"sentence_scoring"`` - the candidate sentence with the
              highest positive score; ties go to the earliest sentence.
            * ``"sentence_heuristic"`` - no sentence is longer than 15
              characters (first 200 characters of the text are returned),
              or every candidate scored zero (first two candidates,
              joined by a space).
        """
        # `text or ""` sends None down the passthrough path instead of
        # raising AttributeError on `None.strip()`.
        cleaned = (text or "").strip()
        if len(cleaned) < 20:
            return ClaimResult(claim=cleaned, method="passthrough")

        sentences = [s.strip() for s in _SENTENCE_SPLIT.split(cleaned)]
        # Very short fragments (<= 15 chars) are not treated as sentences.
        candidates = [s for s in sentences if len(s) > 15]
        if not candidates:
            return ClaimResult(claim=text[:200].strip(), method="sentence_heuristic")

        scored = [(s, _score_sentence(s)) for s in candidates]
        # max() returns the first maximal item, so ties keep document order.
        best_sent, best_score = max(scored, key=lambda x: x[1])
        if best_score > 0:
            return ClaimResult(claim=best_sent, method="sentence_scoring")

        # All scores zero — fall back to first two sentences
        return ClaimResult(
            claim=" ".join(candidates[:2]),
            method="sentence_heuristic",
        )