philverify-api / nlp /clickbait.py
Ryan Christian D. Deniega
feat: PhilVerify Phase 1-3 — FastAPI backend, NLP pipeline, TF-IDF classifier (23/23 tests)
6c9b8f1
"""
PhilVerify — Clickbait Detector
Detects clickbait patterns common in Philippine fake news / viral content.
Uses regex patterns + feature flags (no model needed).
"""
import re
from dataclasses import dataclass, field
# ── Pattern library ───────────────────────────────────────────────────────────
# English trigger phrases, kept as regex source strings; they are compiled
# case-insensitively into _ALL_PHRASES below.
# NOTE(review): `miraculous?` only makes the trailing "s" optional, so it
# matches "miraculous" (and "miraculou") but not "miracle" — confirm intent.
_CLICKBAIT_PHRASES_EN = [
    r"\byou won'?t believe\b", r"\bshocking\b", r"\bviral\b", r"\bbreaking\b",
    r"\bexclusive\b", r"\bmust[\s-]?see\b", r"\bsecret\b", r"\bconfirmed\b",
    r"\bexposed\b", r"\bscandal\b", r"\bunbelievable\b", r"\bmiraculous?\b",
    r"\bhoax\b", r"\bfact[\s-]?check\b", r"\bthis is why\b", r"\bwatch this\b",
]

# Tagalog / Taglish trigger phrases. Some entries (e.g. "shocking") also
# appear in the English list; duplicates are removed when compiling below.
_CLICKBAIT_PHRASES_TL = [
    r"\bgrabe\b", r"\bwow\b", r"\bsurprise\b", r"\bshocking\b", r"\btrending\b",
    r"\bselo\b", r"\bbalita\b", r"\bnatuklasan\b", r"\bnahuli\b", r"\bsikat\b",
    r"\bpakinggan\b", r"\bpanoorin\b", r"\bkumpirmado\b", r"\bkatotohanan\b",
]

# A standalone run of two or more ASCII capital letters (case-sensitive).
_CAPS_WORD = re.compile(r"\b[A-Z]{2,}\b")

# Two or more consecutive "!" / "?" characters, in any mix.
_EXCESSIVE_PUNCT = re.compile(r"[!?]{2,}")

# Listicle bait: a number followed by a noun such as "reasons" or "bagay".
_NUMBER_BAIT = re.compile(r"\b\d+\s+(?:reasons?|things?|ways?|tips?|signs?|bagay)\b", re.I)

# A Tagalog question word followed, later on the same line, by "?".
_QUESTION_BAIT = re.compile(r"\b(?:ano|bakit|paano|kailan|sino|saan)\b.*\?", re.I)

# Compile each distinct phrase exactly once. dict.fromkeys() drops repeated
# pattern strings while keeping first-seen order; without it a phrase listed
# in both languages would be matched (and therefore scored) twice.
_ALL_PHRASES = [
    re.compile(p, re.IGNORECASE)
    for p in dict.fromkeys(_CLICKBAIT_PHRASES_EN + _CLICKBAIT_PHRASES_TL)
]
@dataclass
class ClickbaitResult:
    """Outcome of a clickbait check on a single piece of text."""

    # Final yes/no verdict.
    is_clickbait: bool

    # Heuristic score, 0.0 – 1.0.
    score: float

    # Labels of the patterns that fired; each instance gets its own empty
    # list by default.
    triggered_patterns: list[str] = field(default_factory=list)
class ClickbaitDetector:
    """
    Feature-flag based clickbait detector optimized for PH social media.

    Every heuristic that fires adds a fixed weight to a running score. The
    score is capped at 1.0, rounded to three decimals, and the text is
    flagged as clickbait when that rounded score is at least 0.4.
    """

    def detect(self, text: str) -> ClickbaitResult:
        """Run all clickbait heuristics over ``text``.

        Args:
            text: The headline or post text to inspect.

        Returns:
            A ``ClickbaitResult`` with the verdict, the rounded score, and
            one label per heuristic that fired (in detection order).
        """
        triggered: list[str] = []

        # ALL CAPS words: fires when two or more appear anywhere in the text.
        # Only the first three are echoed back in the label.
        caps_words = _CAPS_WORD.findall(text)
        if len(caps_words) >= 2:
            triggered.append(f"all_caps_words: {caps_words[:3]}")

        # Excessive punctuation !! ???
        if _EXCESSIVE_PUNCT.search(text):
            triggered.append("excessive_punctuation")

        # Number-based bait: "5 reasons why..."
        if _NUMBER_BAIT.search(text):
            triggered.append("number_bait")

        # Rhetorical question bait (Tagalog)
        if _QUESTION_BAIT.search(text):
            triggered.append("question_bait")

        # Title length signal (extremely short or extremely long),
        # measured in whitespace-separated words.
        word_count = len(text.split())
        if word_count < 5:
            triggered.append("title_too_short")
        elif word_count > 30:
            triggered.append("title_very_long")

        # Phrase patterns: one label per pattern that matches, quoting the
        # first match found.
        for pattern in _ALL_PHRASES:
            m = pattern.search(text)
            if m:
                triggered.append(f"clickbait_phrase: '{m.group(0)}'")

        # Score: each feature contributes a weight. Labels are matched by
        # prefix because some carry a payload after the feature name.
        weights = {
            "excessive_punctuation": 0.20,
            "all_caps_words": 0.20,
            "number_bait": 0.15,
            "question_bait": 0.10,
            "title_too_short": 0.05,
            "title_very_long": 0.05,
        }
        score = 0.0
        for feat in triggered:
            for key, w in weights.items():
                if feat.startswith(key):
                    score += w
                    break
            else:
                # No structural feature matched: clickbait_phrase triggers.
                if feat.startswith("clickbait_phrase"):
                    score += 0.25

        # Round BEFORE thresholding. Summing binary floats can land just
        # below the intended total (0.2 + 0.15 + 0.05 evaluates to
        # 0.39999999999999997), which would otherwise report score=0.4
        # together with is_clickbait=False.
        score = round(min(score, 1.0), 3)
        return ClickbaitResult(
            is_clickbait=score >= 0.4,
            score=score,
            triggered_patterns=triggered,
        )