Spaces:
Running
Running
Ryan Christian D. Deniega commited on
Commit ·
6c9b8f1
0
Parent(s):
feat: PhilVerify Phase 1-3 — FastAPI backend, NLP pipeline, TF-IDF classifier (23/23 tests)
Browse files- .env.example +27 -0
- .gitignore +28 -0
- api/__init__.py +0 -0
- api/routes/__init__.py +0 -0
- api/routes/history.py +56 -0
- api/routes/trends.py +84 -0
- api/routes/verify.py +147 -0
- api/schemas.py +151 -0
- config.py +55 -0
- domain_credibility.json +33 -0
- evidence/__init__.py +0 -0
- evidence/news_fetcher.py +108 -0
- inputs/__init__.py +0 -0
- inputs/asr.py +49 -0
- inputs/ocr.py +33 -0
- inputs/url_scraper.py +71 -0
- main.py +127 -0
- ml/__init__.py +0 -0
- ml/tfidf_classifier.py +128 -0
- nlp/__init__.py +0 -0
- nlp/claim_extractor.py +84 -0
- nlp/clickbait.py +100 -0
- nlp/language_detector.py +99 -0
- nlp/ner.py +129 -0
- nlp/preprocessor.py +124 -0
- nlp/sentiment.py +141 -0
- pytest.ini +6 -0
- requirements.txt +47 -0
- scoring/__init__.py +0 -0
- scoring/engine.py +212 -0
- tests/__init__.py +0 -0
- tests/test_philverify.py +181 -0
.env.example
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ── API Keys ──────────────────────────────────────────────────────────────────
|
| 2 |
+
NEWS_API_KEY=your_newsapi_key_here
|
| 3 |
+
GOOGLE_VISION_API_KEY=your_google_vision_key_here # Optional (alternative to Tesseract)
|
| 4 |
+
|
| 5 |
+
# ── Database ──────────────────────────────────────────────────────────────────
|
| 6 |
+
DATABASE_URL=postgresql+asyncpg://user:password@localhost:5432/philverify
|
| 7 |
+
|
| 8 |
+
# ── Redis Cache ───────────────────────────────────────────────────────────────
|
| 9 |
+
REDIS_URL=redis://localhost:6379/0
|
| 10 |
+
|
| 11 |
+
# ── App Settings ──────────────────────────────────────────────────────────────
|
| 12 |
+
APP_ENV=development # development | production
|
| 13 |
+
DEBUG=true
|
| 14 |
+
LOG_LEVEL=INFO
|
| 15 |
+
ALLOWED_ORIGINS=http://localhost:3000,http://localhost:5173
|
| 16 |
+
|
| 17 |
+
# ── Model Settings ────────────────────────────────────────────────────────────
|
| 18 |
+
# Options: xlm-roberta-base | joelito/roberta-tagalog-base | bert-base-multilingual-cased
|
| 19 |
+
ML_MODEL_NAME=xlm-roberta-base
|
| 20 |
+
WHISPER_MODEL_SIZE=base # base | medium | large-v3 (large-v3 for production)
|
| 21 |
+
USE_GPU=false
|
| 22 |
+
|
| 23 |
+
# ── Scoring Weights ───────────────────────────────────────────────────────────
|
| 24 |
+
ML_WEIGHT=0.40
|
| 25 |
+
EVIDENCE_WEIGHT=0.60
|
| 26 |
+
CREDIBLE_THRESHOLD=70.0
|
| 27 |
+
FAKE_THRESHOLD=40.0
|
.gitignore
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Python
|
| 2 |
+
venv/
|
| 3 |
+
__pycache__/
|
| 4 |
+
*.py[cod]
|
| 5 |
+
*.pkl
|
| 6 |
+
*.egg-info/
|
| 7 |
+
dist/
|
| 8 |
+
build/
|
| 9 |
+
|
| 10 |
+
# Environment
|
| 11 |
+
.env
|
| 12 |
+
|
| 13 |
+
# Cache
|
| 14 |
+
.cache/
|
| 15 |
+
.pytest_cache/
|
| 16 |
+
|
| 17 |
+
# IDE
|
| 18 |
+
.vscode/
|
| 19 |
+
.idea/
|
| 20 |
+
*.swp
|
| 21 |
+
|
| 22 |
+
# OS
|
| 23 |
+
.DS_Store
|
| 24 |
+
|
| 25 |
+
# ML models (too large for git)
|
| 26 |
+
ml/models/*.pkl
|
| 27 |
+
ml/models/*.bin
|
| 28 |
+
ml/models/*.pt
|
api/__init__.py
ADDED
|
File without changes
|
api/routes/__init__.py
ADDED
|
File without changes
|
api/routes/history.py
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
PhilVerify — History Route
|
| 3 |
+
GET /history — Returns past verification logs with pagination.
|
| 4 |
+
"""
|
| 5 |
+
import logging
|
| 6 |
+
from fastapi import APIRouter, Query
|
| 7 |
+
from api.schemas import HistoryResponse, HistoryEntry, Verdict
|
| 8 |
+
|
| 9 |
+
logger = logging.getLogger(__name__)
|
| 10 |
+
router = APIRouter(prefix="/history", tags=["History"])
|
| 11 |
+
|
| 12 |
+
# In-memory store for development. Will be replaced by DB queries in Phase 7.
_HISTORY: list[dict] = []


def record_verification(entry: dict) -> None:
    """Append one verification result to the in-memory history log.

    Invoked by the scoring engine after each completed verification.
    """
    _HISTORY.append(entry)
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
@router.get(
    "",
    response_model=HistoryResponse,
    summary="Get verification history",
    description="Returns past verifications ordered by most recent. Supports pagination.",
)
async def get_history(
    page: int = Query(1, ge=1, description="Page number"),
    limit: int = Query(20, ge=1, le=100, description="Results per page"),
    verdict_filter: Verdict | None = Query(None, alias="verdict", description="Filter by verdict"),
) -> HistoryResponse:
    """Return stored verification results, newest first, one page at a time."""
    logger.info("GET /history | page=%d limit=%d", page, limit)

    # Newest entries first; optionally narrow to a single verdict value.
    records = _HISTORY[::-1]
    if verdict_filter:
        wanted = verdict_filter.value
        records = [r for r in records if r.get("verdict") == wanted]

    # total reflects the filtered count, before the page slice is taken.
    offset = (page - 1) * limit
    page_slice = records[offset : offset + limit]

    items = []
    for rec in page_slice:
        items.append(
            HistoryEntry(
                id=rec["id"],
                timestamp=rec["timestamp"],
                input_type=rec.get("input_type", "text"),
                text_preview=rec.get("text_preview", "")[:120],
                verdict=Verdict(rec["verdict"]),
                confidence=rec["confidence"],
                final_score=rec["final_score"],
            )
        )

    return HistoryResponse(total=len(records), entries=items)
|
api/routes/trends.py
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
PhilVerify — Trends Route
|
| 3 |
+
GET /trends — Aggregates entities and topics from fake-news verifications.
|
| 4 |
+
"""
|
| 5 |
+
import logging
|
| 6 |
+
from collections import Counter
|
| 7 |
+
from fastapi import APIRouter, Query
|
| 8 |
+
from api.schemas import TrendsResponse, TrendingEntity, TrendingTopic, Verdict
|
| 9 |
+
|
| 10 |
+
logger = logging.getLogger(__name__)
|
| 11 |
+
router = APIRouter(prefix="/trends", tags=["Trends"])
|
| 12 |
+
|
| 13 |
+
# Reads from the same in-memory store as history (Phase 7 → DB aggregation).
|
| 14 |
+
from api.routes.history import _HISTORY
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
@router.get(
    "",
    response_model=TrendsResponse,
    summary="Get trending entities & topics",
    description="Aggregates NER entities and topics from recent verifications. Useful for identifying fake-news patterns.",
)
async def get_trends(
    days: int = Query(7, ge=1, le=90, description="Lookback window in days"),
    limit: int = Query(10, ge=1, le=50, description="Max results per category"),
) -> TrendsResponse:
    """Aggregate entity mentions and claim topics across stored verifications.

    NOTE(review): `days` is accepted and logged but the in-memory store is not
    yet filtered by entry age — confirm when DB aggregation lands in Phase 7.
    """
    logger.info("GET /trends | days=%d", days)

    entity_counter: Counter = Counter()
    entity_type_map: dict[str, str] = {}
    entity_fake_counter: Counter = Counter()
    topic_counter: Counter = Counter()
    topic_verdict_map: dict[str, list[str]] = {}

    # Maps each history-entry entity bucket to the label exposed in the API.
    # One data-driven loop replaces three copy-pasted per-bucket loops.
    entity_buckets = (
        ("persons", "person"),
        ("organizations", "org"),
        ("locations", "location"),
    )

    for entry in _HISTORY:
        # "Fake-leaning" counts both Likely Fake and Unverified verdicts.
        is_fake = entry.get("verdict") in (Verdict.LIKELY_FAKE.value, Verdict.UNVERIFIED.value)
        entities = entry.get("entities", {})

        for bucket_key, entity_type in entity_buckets:
            for name in entities.get(bucket_key, []):
                entity_counter[name] += 1
                entity_type_map[name] = entity_type
                if is_fake:
                    entity_fake_counter[name] += 1

        claim = entry.get("claim_used", "")
        if claim:
            # Truncate to 60 chars so near-identical claims bucket together.
            topic_key = claim[:60]
            topic_counter[topic_key] += 1
            topic_verdict_map.setdefault(topic_key, []).append(entry.get("verdict", "Unverified"))

    top_entities = [
        TrendingEntity(
            entity=entity,
            entity_type=entity_type_map.get(entity, "unknown"),
            count=count,
            fake_count=entity_fake_counter.get(entity, 0),
            fake_ratio=round(entity_fake_counter.get(entity, 0) / count, 2),
        )
        for entity, count in entity_counter.most_common(limit)
    ]

    top_topics = [
        TrendingTopic(
            topic=topic,
            count=count,
            dominant_verdict=Verdict(
                Counter(topic_verdict_map.get(topic, ["Unverified"])).most_common(1)[0][0]
            ),
        )
        for topic, count in topic_counter.most_common(limit)
    ]

    return TrendsResponse(top_entities=top_entities, top_topics=top_topics)
|
api/routes/verify.py
ADDED
|
@@ -0,0 +1,147 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
PhilVerify — Verify Routes
|
| 3 |
+
POST /verify/text | /verify/url | /verify/image | /verify/video
|
| 4 |
+
All routes funnel through run_verification() in the scoring engine.
|
| 5 |
+
"""
|
| 6 |
+
import time
|
| 7 |
+
import logging
|
| 8 |
+
from fastapi import APIRouter, HTTPException, UploadFile, File, status
|
| 9 |
+
from fastapi.responses import JSONResponse
|
| 10 |
+
|
| 11 |
+
from api.schemas import (
|
| 12 |
+
TextVerifyRequest,
|
| 13 |
+
URLVerifyRequest,
|
| 14 |
+
VerificationResponse,
|
| 15 |
+
ErrorResponse,
|
| 16 |
+
)
|
| 17 |
+
from scoring.engine import run_verification
|
| 18 |
+
from inputs.url_scraper import scrape_url
|
| 19 |
+
from inputs.ocr import extract_text_from_image
|
| 20 |
+
from inputs.asr import transcribe_video
|
| 21 |
+
|
| 22 |
+
logger = logging.getLogger(__name__)
|
| 23 |
+
router = APIRouter(prefix="/verify", tags=["Verification"])
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
# ── Text ──────────────────────────────────────────────────────────────────────
|
| 27 |
+
|
| 28 |
+
@router.post(
    "/text",
    response_model=VerificationResponse,
    summary="Verify raw text",
    description="Accepts plain text (Tagalog, English, or Taglish) and runs the full verification pipeline.",
)
async def verify_text(body: TextVerifyRequest) -> VerificationResponse:
    """Run the full verification pipeline on raw text and stamp wall time."""
    started_at = time.perf_counter()
    logger.info("verify/text called | chars=%d", len(body.text))
    try:
        response = await run_verification(body.text, input_type="text")
        elapsed_ms = (time.perf_counter() - started_at) * 1000
        response.processing_time_ms = round(elapsed_ms, 1)
        return response
    except Exception as exc:
        # Any pipeline failure surfaces as a 500 with the cause chained.
        logger.exception("verify/text error: %s", exc)
        raise HTTPException(status_code=500, detail=f"Verification failed: {exc}") from exc
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
# ── URL ───────────────────────────────────────────────────────────────────────
|
| 47 |
+
|
| 48 |
+
@router.post(
    "/url",
    response_model=VerificationResponse,
    summary="Verify a URL",
    description="Scrapes the article text from the given URL, then runs the full verification pipeline.",
)
async def verify_url(body: URLVerifyRequest) -> VerificationResponse:
    """Scrape the article behind a URL and verify the extracted text."""
    started_at = time.perf_counter()
    target = str(body.url)
    logger.info("verify/url called | url=%s", target)
    try:
        article_text, domain = await scrape_url(target)

        # Under 20 usable characters means the scrape effectively failed.
        if not article_text or len(article_text.strip()) < 20:
            raise HTTPException(
                status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
                detail="Could not extract meaningful text from the URL. The page may be paywalled or bot-protected.",
            )

        response = await run_verification(article_text, input_type="url", source_domain=domain)
        response.processing_time_ms = round((time.perf_counter() - started_at) * 1000, 1)
        return response
    except HTTPException:
        # Deliberate HTTP errors pass through untouched.
        raise
    except Exception as exc:
        logger.exception("verify/url error: %s", exc)
        raise HTTPException(status_code=500, detail=f"URL verification failed: {exc}") from exc
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
# ── Image ─────────────────────────────────────────────────────────────────────
|
| 76 |
+
|
| 77 |
+
@router.post(
    "/image",
    response_model=VerificationResponse,
    summary="Verify an image (OCR)",
    description="Accepts an uploaded image file. Runs Tesseract OCR to extract text, then verifies.",
)
async def verify_image(file: UploadFile = File(...)) -> VerificationResponse:
    """OCR the uploaded image, then verify whatever text was recovered."""
    started_at = time.perf_counter()
    logger.info("verify/image called | filename=%s | size=%s", file.filename, file.size)

    # Reject anything that is not a supported raster format up front.
    accepted = {"image/jpeg", "image/png", "image/webp", "image/gif", "image/bmp"}
    if file.content_type not in accepted:
        raise HTTPException(
            status_code=status.HTTP_415_UNSUPPORTED_MEDIA_TYPE,
            detail=f"Unsupported image type: {file.content_type}. Accepted: jpeg, png, webp, gif, bmp",
        )

    try:
        payload = await file.read()
        extracted = await extract_text_from_image(payload)

        # Under 10 usable characters means OCR found nothing meaningful.
        if not extracted or len(extracted.strip()) < 10:
            raise HTTPException(
                status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
                detail="No readable text found in the image.",
            )

        response = await run_verification(extracted, input_type="image")
        response.processing_time_ms = round((time.perf_counter() - started_at) * 1000, 1)
        return response
    except HTTPException:
        # Deliberate HTTP errors pass through untouched.
        raise
    except Exception as exc:
        logger.exception("verify/image error: %s", exc)
        raise HTTPException(status_code=500, detail=f"Image verification failed: {exc}") from exc
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
# ── Video ─────────────────────────────────────────────────────────────────────
|
| 112 |
+
|
| 113 |
+
@router.post(
    "/video",
    response_model=VerificationResponse,
    summary="Verify a video/audio (Whisper ASR)",
    description="Accepts a video or audio file. Runs Whisper ASR to transcribe, then verifies the transcript.",
)
async def verify_video(file: UploadFile = File(...)) -> VerificationResponse:
    """Transcribe uploaded media with Whisper, then verify the transcript."""
    started_at = time.perf_counter()
    logger.info("verify/video called | filename=%s", file.filename)

    # Only common browser-uploadable video/audio containers are accepted.
    accepted = {
        "video/mp4", "video/webm", "video/quicktime",
        "audio/mpeg", "audio/wav", "audio/ogg", "audio/mp4",
    }
    if file.content_type not in accepted:
        raise HTTPException(
            status_code=status.HTTP_415_UNSUPPORTED_MEDIA_TYPE,
            detail=f"Unsupported media type: {file.content_type}",
        )

    try:
        payload = await file.read()
        transcript = await transcribe_video(payload, filename=file.filename or "upload")

        # Under 10 usable characters means no meaningful speech was heard.
        if not transcript or len(transcript.strip()) < 10:
            raise HTTPException(
                status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
                detail="Could not transcribe meaningful speech from the media file.",
            )

        response = await run_verification(transcript, input_type="video")
        response.processing_time_ms = round((time.perf_counter() - started_at) * 1000, 1)
        return response
    except HTTPException:
        # Deliberate HTTP errors pass through untouched.
        raise
    except Exception as exc:
        logger.exception("verify/video error: %s", exc)
        raise HTTPException(status_code=500, detail=f"Video verification failed: {exc}") from exc
|
api/schemas.py
ADDED
|
@@ -0,0 +1,151 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
PhilVerify — Pydantic Request / Response Schemas
|
| 3 |
+
Matches the structured JSON output format from the system spec.
|
| 4 |
+
"""
|
| 5 |
+
from __future__ import annotations
|
| 6 |
+
|
| 7 |
+
from enum import Enum
|
| 8 |
+
from typing import Optional
|
| 9 |
+
from pydantic import BaseModel, HttpUrl, Field
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
# ── Enums ─────────────────────────────────────────────────────────────────────
|
| 13 |
+
|
| 14 |
+
class Verdict(str, Enum):
    """Final classification labels emitted by the scoring engine."""

    CREDIBLE = "Credible"
    UNVERIFIED = "Unverified"
    LIKELY_FAKE = "Likely Fake"
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
class Stance(str, Enum):
    """Relation of an evidence article to the claim under verification."""

    SUPPORTS = "Supports"
    REFUTES = "Refutes"
    NOT_ENOUGH_INFO = "Not Enough Info"
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
class Language(str, Enum):
    """Detected input language, including the mixed Taglish case."""

    TAGALOG = "Tagalog"
    ENGLISH = "English"
    TAGLISH = "Taglish"
    UNKNOWN = "Unknown"
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
class Sentiment(str, Enum):
    """Coarse sentiment buckets, with high-intensity variants at each pole."""

    POSITIVE = "positive"
    NEGATIVE = "negative"
    NEUTRAL = "neutral"
    HIGH_POSITIVE = "high positive"
    HIGH_NEGATIVE = "high negative"
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
class DomainTier(int, Enum):
    """Source-domain credibility tiers (1 = most credible, 4 = known fake)."""

    CREDIBLE = 1
    SATIRE_OPINION = 2
    SUSPICIOUS = 3
    KNOWN_FAKE = 4
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
# ── Request Models ─────────────────────────────────────────────────────────────
|
| 49 |
+
|
| 50 |
+
class TextVerifyRequest(BaseModel):
    """Request body for POST /verify/text."""

    # 10–10,000 characters of raw input to verify.
    text: str = Field(..., min_length=10, max_length=10_000, description="Raw text to verify")
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
class URLVerifyRequest(BaseModel):
    """Request body for POST /verify/url."""

    # Validated by pydantic's HttpUrl type before the route handler runs.
    url: HttpUrl = Field(..., description="URL of the news article or social media post")
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
# ── Nested Response Models ────────────────────────────────────────────────────
|
| 59 |
+
|
| 60 |
+
class EntitiesResult(BaseModel):
    """Named entities extracted from the input, grouped by NER category."""

    persons: list[str] = Field(default_factory=list)
    organizations: list[str] = Field(default_factory=list)
    locations: list[str] = Field(default_factory=list)
    dates: list[str] = Field(default_factory=list)
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
class Layer1Result(BaseModel):
    """Output of the ML-classifier layer (Layer 1) of the scoring engine."""

    verdict: Verdict
    confidence: float = Field(..., ge=0.0, le=100.0, description="Confidence % from ML classifier")
    triggered_features: list[str] = Field(
        default_factory=list,
        description="Human-readable list of suspicious features detected",
    )
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
class EvidenceSource(BaseModel):
    """One retrieved article used as evidence for or against the claim."""

    title: str
    url: str
    similarity: float = Field(..., ge=0.0, le=1.0, description="Cosine similarity to input claim")
    stance: Stance
    domain_tier: DomainTier
    # Optional metadata — not every news provider supplies these.
    published_at: Optional[str] = None
    source_name: Optional[str] = None
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
class Layer2Result(BaseModel):
    """Output of the evidence-retrieval layer (Layer 2) of the scoring engine."""

    verdict: Verdict
    evidence_score: float = Field(..., ge=0.0, le=100.0)
    sources: list[EvidenceSource] = Field(default_factory=list)
    claim_used: Optional[str] = Field(None, description="Extracted claim sent to evidence search")
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
# ── Main Response ─────────────────────────────────────────────────────────────
|
| 94 |
+
|
| 95 |
+
class VerificationResponse(BaseModel):
    """Top-level response for every /verify/* endpoint."""

    # Combined outcome across both layers.
    verdict: Verdict
    confidence: float = Field(..., ge=0.0, le=100.0)
    final_score: float = Field(..., ge=0.0, le=100.0)
    # Per-layer breakdowns.
    layer1: Layer1Result
    layer2: Layer2Result
    # NLP analysis of the input text.
    entities: EntitiesResult
    sentiment: str
    emotion: str
    language: Language
    # Only populated for URL inputs, where a source domain is known.
    domain_credibility: Optional[DomainTier] = None
    input_type: str = "text"
    # Filled in by the route handler after the pipeline finishes.
    processing_time_ms: Optional[float] = None
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
# ── History / Trends ──────────────────────────────────────────────────────────
|
| 111 |
+
|
| 112 |
+
class HistoryEntry(BaseModel):
    """A single past verification, summarized for the history listing."""

    id: str
    timestamp: str
    input_type: str
    # Truncated preview of the verified text (max 120 chars at the route).
    text_preview: str
    verdict: Verdict
    confidence: float
    final_score: float
|
| 120 |
+
|
| 121 |
+
|
| 122 |
+
class HistoryResponse(BaseModel):
    """Paginated history listing; `total` counts all filtered entries."""

    total: int
    entries: list[HistoryEntry]
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
class TrendingEntity(BaseModel):
    """An entity aggregated across verifications, with its fake-news share."""

    entity: str
    entity_type: str  # person | org | location
    count: int
    fake_count: int
    fake_ratio: float
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
class TrendingTopic(BaseModel):
    """A claim topic aggregated across verifications."""

    topic: str
    count: int
    dominant_verdict: Verdict
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
class TrendsResponse(BaseModel):
    """Response body for GET /trends."""

    top_entities: list[TrendingEntity]
    top_topics: list[TrendingTopic]
|
| 144 |
+
|
| 145 |
+
|
| 146 |
+
# ── Error ─────────────────────────────────────────────────────────────────────
|
| 147 |
+
|
| 148 |
+
class ErrorResponse(BaseModel):
    """Uniform error payload for failed API calls."""

    error: str
    detail: Optional[str] = None
    code: Optional[str] = None
|
config.py
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
PhilVerify — Application Settings
|
| 3 |
+
Loaded via pydantic-settings from environment variables / .env file.
|
| 4 |
+
"""
|
| 5 |
+
from functools import lru_cache
|
| 6 |
+
from pydantic_settings import BaseSettings, SettingsConfigDict
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class Settings(BaseSettings):
    """Central application configuration, sourced from the environment / .env."""

    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        case_sensitive=False,
        extra="ignore",
    )

    # External API credentials (blank string = feature unavailable).
    news_api_key: str = ""
    google_vision_api_key: str = ""

    # Persistence — async SQLite file keeps local development dependency-free.
    database_url: str = "sqlite+aiosqlite:///./philverify_dev.db"

    # Caching — an empty URL disables Redis caching entirely.
    redis_url: str = ""

    # Runtime behaviour.
    app_env: str = "development"
    debug: bool = True
    log_level: str = "INFO"
    allowed_origins: list[str] = [
        "http://localhost:3000",
        "http://localhost:5173",
    ]

    # Model selection.
    ml_model_name: str = "xlm-roberta-base"
    whisper_model_size: str = "base"
    use_gpu: bool = False

    # Scoring weights and verdict thresholds.
    ml_weight: float = 0.40
    evidence_weight: float = 0.60
    credible_threshold: float = 70.0
    fake_threshold: float = 40.0

    @property
    def is_production(self) -> bool:
        """True when APP_ENV is exactly "production"."""
        return self.app_env == "production"
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
@lru_cache
def get_settings() -> Settings:
    """Build the Settings object once and reuse it for every caller."""
    return Settings()
|
domain_credibility.json
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"tier1": {
|
| 3 |
+
"description": "Established credible Philippine news organizations",
|
| 4 |
+
"score": 100,
|
| 5 |
+
"domains": [
|
| 6 |
+
"rappler.com", "inquirer.net", "gmanetwork.com", "abs-cbn.com",
|
| 7 |
+
"mb.com.ph", "philstar.com", "manilatimes.net", "sunstar.com.ph",
|
| 8 |
+
"businessmirror.com.ph", "bworldonline.com", "pna.gov.ph",
|
| 9 |
+
"doh.gov.ph", "official.deped.gov.ph", "senate.gov.ph", "congress.gov.ph"
|
| 10 |
+
]
|
| 11 |
+
},
|
| 12 |
+
"tier2": {
|
| 13 |
+
"description": "Satire, opinion blogs, or entertainment sites",
|
| 14 |
+
"score": 50,
|
| 15 |
+
"domains": [
|
| 16 |
+
"knowyourmeme.com", "9gag.com", "buzzfeed.com",
|
| 17 |
+
"opinion.inquirer.net", "interaksyon.com"
|
| 18 |
+
]
|
| 19 |
+
},
|
| 20 |
+
"tier3": {
|
| 21 |
+
"description": "Unknown / unverified sources — newly registered or low-authority",
|
| 22 |
+
"score": 25,
|
| 23 |
+
"domains": []
|
| 24 |
+
},
|
| 25 |
+
"tier4": {
|
| 26 |
+
"description": "Known fake news / misinformation sites (Vera Files blacklist)",
|
| 27 |
+
"score": 0,
|
| 28 |
+
"domains": [
|
| 29 |
+
"duterte.news", "pinoyakoblog.com", "filipinonewsalert.com",
|
| 30 |
+
"pilipinostar.com", "pinoytrending.net", "maharlikanews.com"
|
| 31 |
+
]
|
| 32 |
+
}
|
| 33 |
+
}
|
evidence/__init__.py
ADDED
|
File without changes
|
evidence/news_fetcher.py
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
PhilVerify — Evidence Retrieval Module
|
| 3 |
+
Fetches related articles from NewsAPI, computes cosine similarity,
|
| 4 |
+
and produces an evidence score for Layer 2 of the scoring engine.
|
| 5 |
+
"""
|
| 6 |
+
import logging
|
| 7 |
+
import hashlib
|
| 8 |
+
from dataclasses import dataclass, field
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
import json
|
| 11 |
+
|
| 12 |
+
logger = logging.getLogger(__name__)
|
| 13 |
+
|
| 14 |
+
# Simple file-based cache to respect NewsAPI 100 req/day free tier limit
|
| 15 |
+
_CACHE_DIR = Path(__file__).parent.parent / ".cache" / "newsapi"
|
| 16 |
+
_CACHE_DIR.mkdir(parents=True, exist_ok=True)
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
@dataclass
class ArticleResult:
    """One candidate evidence article, annotated with similarity and stance."""

    title: str
    url: str
    description: str
    source_name: str
    published_at: str
    # Annotated after retrieval; defaults represent "not yet scored".
    similarity: float = 0.0
    stance: str = "Not Enough Info"
    domain_tier: int = 3
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
@dataclass
class EvidenceResult:
    """Aggregate outcome of the evidence-retrieval layer."""

    verdict: str  # "Supported" | "Contradicted" | "Insufficient"
    evidence_score: float  # 0–100
    sources: list[ArticleResult] = field(default_factory=list)
    claim_used: str = ""
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def _cache_key(claim: str) -> str:
|
| 40 |
+
return hashlib.md5(claim.lower().strip().encode()).hexdigest()
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def _load_cache(key: str) -> list[dict] | None:
    """Return cached NewsAPI results for *key*, or None on miss/corruption."""
    cache_file = _CACHE_DIR / f"{key}.json"
    if not cache_file.exists():
        return None
    try:
        return json.loads(cache_file.read_text())
    except Exception:
        # Corrupt cache entry — treat it as a miss rather than failing.
        return None
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
def _save_cache(key: str, data: list[dict]) -> None:
    """Persist NewsAPI results for *key* to the file cache."""
    (_CACHE_DIR / f"{key}.json").write_text(json.dumps(data))
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
async def fetch_evidence(claim: str, api_key: str, max_results: int = 5) -> list[dict]:
    """Fetch top articles from NewsAPI for the given claim. Cached."""
    cache_id = _cache_key(claim)
    hit = _load_cache(cache_id)
    if hit is not None:
        logger.info("NewsAPI cache hit for claim hash %s", cache_id[:8])
        return hit

    if not api_key:
        logger.warning("NEWS_API_KEY not set — returning empty evidence")
        return []

    try:
        from newsapi import NewsApiClient

        # NewsAPI rejects very long queries; the claim's head is sufficient.
        query = claim[:100]
        response = NewsApiClient(api_key=api_key).get_everything(
            q=query,
            language="en",
            sort_by="relevancy",
            page_size=max_results,
        )
        articles = response.get("articles", [])
        _save_cache(cache_id, articles)
        logger.info("NewsAPI returned %d articles for query '%s...'", len(articles), query[:30])
        return articles
    except Exception as e:
        # Best-effort: evidence retrieval failure degrades to "no evidence".
        logger.warning("NewsAPI fetch error: %s", e)
        return []
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
# Process-wide memoized sentence encoder. Loading the model is expensive
# (disk + RAM heavy), so it must happen at most once per process — previously
# it was reloaded on EVERY call to compute_similarity().
_ST_MODEL = None


def _get_st_model():
    """Load and memoize the MiniLM sentence encoder; raises if unavailable."""
    global _ST_MODEL
    if _ST_MODEL is None:
        from sentence_transformers import SentenceTransformer
        _ST_MODEL = SentenceTransformer("all-MiniLM-L6-v2")
    return _ST_MODEL


def compute_similarity(claim: str, article_text: str) -> float:
    """
    Compute cosine similarity between claim and article using sentence-transformers.
    Falls back to simple word-overlap Jaccard similarity when the model (or its
    dependency) is unavailable or inference fails.

    Returns:
        A float in [0.0, 1.0], rounded to 3 decimals.
    """
    try:
        from sentence_transformers import util
        model = _get_st_model()  # cached — avoids per-call model reload
        emb_claim = model.encode(claim, convert_to_tensor=True)
        # Truncate the article: MiniLM context is limited and the lead
        # paragraph carries most of the signal.
        emb_article = model.encode(article_text[:512], convert_to_tensor=True)
        score = float(util.cos_sim(emb_claim, emb_article)[0][0])
        return round(max(0.0, min(1.0, score)), 3)
    except Exception:
        # Jaccard fallback: shared words over total distinct words.
        a = set(claim.lower().split())
        b = set(article_text.lower().split())
        if not a or not b:
            return 0.0
        return round(len(a & b) / len(a | b), 3)
|
inputs/__init__.py
ADDED
|
File without changes
|
inputs/asr.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
PhilVerify — Whisper ASR Module
|
| 3 |
+
Transcribes video/audio files using OpenAI Whisper.
|
| 4 |
+
Recommended model: large-v3 (best Filipino speech accuracy).
|
| 5 |
+
"""
|
| 6 |
+
import io
|
| 7 |
+
import logging
|
| 8 |
+
import tempfile
|
| 9 |
+
import os
|
| 10 |
+
|
| 11 |
+
logger = logging.getLogger(__name__)

# Process-wide cache of loaded Whisper models, keyed by model size.
# large-v3 takes tens of seconds and several GB to load — previously the
# model was reloaded from scratch on EVERY transcription request.
_WHISPER_MODELS: dict = {}


def _get_whisper_model(model_size: str):
    """Load and memoize a Whisper model of the given size (raises if whisper missing)."""
    import whisper
    if model_size not in _WHISPER_MODELS:
        logger.info("Loading Whisper model: %s", model_size)
        _WHISPER_MODELS[model_size] = whisper.load_model(model_size)
    return _WHISPER_MODELS[model_size]


async def transcribe_video(media_bytes: bytes, filename: str = "upload") -> str:
    """
    Transcribe audio/video bytes using Whisper.

    Writes the bytes to a temp file (Whisper requires a file path, not bytes),
    transcribes with auto language detection, then removes the temp file.

    Args:
        media_bytes: Raw audio/video content.
        filename: Original upload name; its extension picks the temp suffix.

    Returns:
        The transcript string, or "" when Whisper/config is unavailable or
        transcription fails (graceful degradation — never raises).
    """
    try:
        from config import get_settings
        settings = get_settings()

        model = _get_whisper_model(settings.whisper_model_size)

        # Whisper needs a file path — write bytes to a temp file first.
        suffix = os.path.splitext(filename)[-1] or ".mp4"
        with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
            tmp.write(media_bytes)
            tmp_path = tmp.name

        try:
            result = model.transcribe(tmp_path, language=None)  # Auto-detect language
            transcript = result.get("text", "").strip()
            logger.info("Whisper transcribed %d chars (lang=%s)", len(transcript), result.get("language"))
            return transcript
        finally:
            os.unlink(tmp_path)  # Clean up temp file even when transcription fails

    except ImportError:
        logger.warning("openai-whisper not installed — ASR unavailable")
        return ""
    except Exception as e:
        logger.error("Whisper transcription failed: %s", e)
        return ""
|
inputs/ocr.py
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
PhilVerify — OCR Module (Tesseract)
|
| 3 |
+
Extracts text from images using pytesseract.
|
| 4 |
+
Falls back gracefully if Tesseract not installed.
|
| 5 |
+
"""
|
| 6 |
+
import io
|
| 7 |
+
import logging
|
| 8 |
+
|
| 9 |
+
logger = logging.getLogger(__name__)

# Tesseract language packs: Filipino plus English.
_TESSERACT_LANG = "fil+eng"


async def extract_text_from_image(image_bytes: bytes) -> str:
    """
    Run Tesseract OCR on raw image bytes and return the extracted text.

    Returns "" when the OCR dependencies are missing or recognition fails.
    """
    try:
        import pytesseract
        from PIL import Image

        pil_img = Image.open(io.BytesIO(image_bytes)).convert("RGB")
        extracted = pytesseract.image_to_string(pil_img, lang=_TESSERACT_LANG).strip()
        logger.info("OCR extracted %d chars from image", len(extracted))
        return extracted
    except ImportError:
        logger.warning("pytesseract / Pillow not installed — OCR unavailable")
        return ""
    except Exception as e:
        logger.error("OCR failed: %s", e)
        return ""
|
inputs/url_scraper.py
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
PhilVerify — URL Scraper (BeautifulSoup)
|
| 3 |
+
Extracts article text from news URLs. Respects robots.txt.
|
| 4 |
+
"""
|
| 5 |
+
import logging
|
| 6 |
+
import re
|
| 7 |
+
from urllib.parse import urlparse
|
| 8 |
+
from urllib.robotparser import RobotFileParser
|
| 9 |
+
|
| 10 |
+
logger = logging.getLogger(__name__)
|
| 11 |
+
|
| 12 |
+
_UNWANTED_TAGS = {"script", "style", "nav", "footer", "header", "aside", "figure", "figcaption"}
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def _get_domain(url: str) -> str:
|
| 16 |
+
return urlparse(url).netloc.replace("www.", "")
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def _robots_allow(url: str) -> bool:
    """Check the host's robots.txt for permission to fetch *url*.

    Fails open: when robots.txt itself can't be fetched or parsed, scraping
    is allowed rather than blocked.
    """
    try:
        parts = urlparse(url)
        parser = RobotFileParser()
        parser.set_url(f"{parts.scheme}://{parts.netloc}/robots.txt")
        parser.read()
        return parser.can_fetch("*", url)
    except Exception:
        return True  # Allow by default if robots.txt fetch fails
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
async def scrape_url(url: str) -> tuple[str, str]:
    """
    Fetch *url* and extract the main article text.

    Returns:
        (article_text, domain) — article_text is "" when fetching or parsing
        fails.

    Raises:
        ValueError: when robots.txt disallows scraping this URL.
    """
    domain = _get_domain(url)

    if not _robots_allow(url):
        logger.warning("robots.txt disallows scraping %s", url)
        raise ValueError(f"Scraping disallowed by robots.txt for {domain}")

    try:
        import httpx
        from bs4 import BeautifulSoup

        headers = {"User-Agent": "PhilVerifyBot/1.0 (fact-checking research)"}
        async with httpx.AsyncClient(timeout=15, follow_redirects=True) as client:
            resp = await client.get(url, headers=headers)
            resp.raise_for_status()

        soup = BeautifulSoup(resp.text, "lxml")

        # Strip boilerplate elements before extracting text.
        for node in soup(list(_UNWANTED_TAGS)):
            node.decompose()

        # Prefer semantic containers; fall back to the whole body.
        container = soup.find("article") or soup.find("main") or soup.body
        if container is None:
            return "", domain

        pieces = [p.get_text(separator=" ", strip=True) for p in container.find_all("p")]
        body_text = re.sub(r"\s+", " ", " ".join(pieces)).strip()

        logger.info("Scraped %d chars from %s", len(body_text), domain)
        return body_text, domain

    except Exception as e:
        logger.error("URL scraping failed for %s: %s", url, e)
        return "", domain
|
main.py
ADDED
|
@@ -0,0 +1,127 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
PhilVerify — FastAPI Application Entry Point
|
| 3 |
+
Run: uvicorn main:app --reload --port 8000
|
| 4 |
+
Docs: http://localhost:8000/docs
|
| 5 |
+
"""
|
| 6 |
+
import logging
|
| 7 |
+
import os
|
| 8 |
+
from contextlib import asynccontextmanager
|
| 9 |
+
|
| 10 |
+
from fastapi import FastAPI, Request, status
|
| 11 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 12 |
+
from fastapi.responses import JSONResponse
|
| 13 |
+
|
| 14 |
+
from config import get_settings
|
| 15 |
+
from api.routes.verify import router as verify_router
|
| 16 |
+
from api.routes.history import router as history_router
|
| 17 |
+
from api.routes.trends import router as trends_router
|
| 18 |
+
|
| 19 |
+
# ── Logging ───────────────────────────────────────────────────────────────────
|
| 20 |
+
logging.basicConfig(
|
| 21 |
+
level=getattr(logging, get_settings().log_level.upper(), logging.INFO),
|
| 22 |
+
format="%(asctime)s | %(levelname)-8s | %(name)s | %(message)s",
|
| 23 |
+
)
|
| 24 |
+
logger = logging.getLogger("philverify")
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
# ── Lifespan (startup / shutdown) ─────────────────────────────────────────────
|
| 28 |
+
|
| 29 |
+
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Warm up NLP models on startup so first request isn't slow.

    Loaded components are attached to ``app.state`` (preprocessor,
    language_detector, classifier) so route handlers can reuse them without
    re-loading. Missing heavy dependencies are downgraded to a warning
    instead of crashing startup.
    """
    logger.info("🚀 PhilVerify starting up...")
    try:
        # Lazy-import to avoid crashing if heavy deps not yet installed
        from nlp.language_detector import LanguageDetector
        from nlp.preprocessor import TextPreprocessor
        from ml.tfidf_classifier import TFIDFClassifier

        app.state.preprocessor = TextPreprocessor()
        app.state.language_detector = LanguageDetector()
        classifier = TFIDFClassifier()
        classifier.train()  # Trains on seed dataset if model not persisted
        app.state.classifier = classifier

        logger.info("✅ NLP models ready")
    except ImportError as e:
        logger.warning("⚠️ Some NLP modules not installed yet: %s — stubs will be used", e)

    yield  # ── App is running ──

    logger.info("👋 PhilVerify shutting down")
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
# ── App ───────────────────────────────────────────────────────────────────────

# Cached, env-driven settings instance; reused for CORS and the dev runner below.
settings = get_settings()

app = FastAPI(
    title="PhilVerify API",
    description=(
        "Multimodal fake news detection for Philippine social media. "
        "Supports text, URL, image (OCR), and video (Whisper ASR) inputs."
    ),
    version="0.1.0",
    docs_url="/docs",
    redoc_url="/redoc",
    lifespan=lifespan,
)


# ── CORS ──────────────────────────────────────────────────────────────────────

# Allowed origins come from settings so production can lock this down via env.
app.add_middleware(
    CORSMiddleware,
    allow_origins=settings.allowed_origins,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
# ── Global Error Handler ──────────────────────────────────────────────────────
|
| 83 |
+
|
| 84 |
+
@app.exception_handler(Exception)
async def global_exception_handler(request: Request, exc: Exception):
    """Catch-all handler: log the full traceback, return a generic 500 payload."""
    logger.exception("Unhandled error on %s %s: %s", request.method, request.url.path, exc)
    payload = {"error": "Internal server error", "detail": str(exc)}
    return JSONResponse(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, content=payload)
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
# ── Routers ───────────────────────────────────────────────────────────────────

# Feature routers: /verify (core pipeline), /history, /trends.
app.include_router(verify_router)
app.include_router(history_router)
app.include_router(trends_router)
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
# ── Health ────────────────────────────────────────────────────────────────────
|
| 101 |
+
|
| 102 |
+
@app.get("/", tags=["Health"])
|
| 103 |
+
async def root():
|
| 104 |
+
return {
|
| 105 |
+
"service": "PhilVerify",
|
| 106 |
+
"version": "0.1.0",
|
| 107 |
+
"status": "operational",
|
| 108 |
+
"docs": "/docs",
|
| 109 |
+
}
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
@app.get("/health", tags=["Health"])
|
| 113 |
+
async def health():
|
| 114 |
+
return {"status": "ok", "env": settings.app_env}
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
# ── Dev runner ────────────────────────────────────────────────────────────────
|
| 118 |
+
|
| 119 |
+
if __name__ == "__main__":
|
| 120 |
+
import uvicorn
|
| 121 |
+
uvicorn.run(
|
| 122 |
+
"main:app",
|
| 123 |
+
host="0.0.0.0",
|
| 124 |
+
port=int(os.getenv("PORT", 8000)),
|
| 125 |
+
reload=settings.debug,
|
| 126 |
+
log_level=settings.log_level.lower(),
|
| 127 |
+
)
|
ml/__init__.py
ADDED
|
File without changes
|
ml/tfidf_classifier.py
ADDED
|
@@ -0,0 +1,128 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
PhilVerify — TF-IDF + Logistic Regression Baseline Classifier (Layer 1)
|
| 3 |
+
Seed dataset of 30 labeled PH news headlines (10 per class).
|
| 4 |
+
Replaced by fine-tuned XLM-RoBERTa in Phase 10.
|
| 5 |
+
"""
|
| 6 |
+
import os
|
| 7 |
+
import logging
|
| 8 |
+
import pickle
|
| 9 |
+
from dataclasses import dataclass, field
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
|
| 12 |
+
logger = logging.getLogger(__name__)
|
| 13 |
+
|
| 14 |
+
MODEL_PATH = Path(__file__).parent / "models" / "tfidf_model.pkl"
|
| 15 |
+
|
| 16 |
+
# ── Seed dataset (30 samples — 10 per class) ──────────────────────────────────
|
| 17 |
+
# Labels: 0=Credible, 1=Unverified, 2=Fake
|
| 18 |
+
SEED_DATA = [
|
| 19 |
+
# Credible (0)
|
| 20 |
+
("DOH reports 500 new COVID-19 cases as vaccination drive continues in Metro Manila", 0),
|
| 21 |
+
("Rappler: Supreme Court upholds Comelec ruling on disqualification case", 0),
|
| 22 |
+
("GMA News: PNP arrests 12 suspects in Bulacan drug bust", 0),
|
| 23 |
+
("Philippine Star: GDP growth slows to 5.3% in Q3 says BSP", 0),
|
| 24 |
+
("Inquirer: Senate passes revised anti-terrorism bill on third reading", 0),
|
| 25 |
+
("Manila Bulletin: Typhoon Carina leaves P2B damage in Isabela province", 0),
|
| 26 |
+
("ABS-CBN News: Marcos signs executive order on agricultural modernization", 0),
|
| 27 |
+
("DOF confirms revenue collection targets met for fiscal year 2025", 0),
|
| 28 |
+
("DSWD distributes relief packs to 10,000 families in Cotabato", 0),
|
| 29 |
+
("PhilStar: Meralco rate hike of P0.18 per kilowatt-hour approved by ERC", 0),
|
| 30 |
+
|
| 31 |
+
# Unverified (1)
|
| 32 |
+
("SHOCKING: Politician caught taking selfie during Senate hearing", 1),
|
| 33 |
+
("VIRAL: Celebrity spotted at secret meeting with government official", 1),
|
| 34 |
+
("BREAKING: 'Anonymous source' says president planning cabinet reshuffle", 1),
|
| 35 |
+
("Rumor has it: New tax policy to affect OFW remittances starting 2026", 1),
|
| 36 |
+
("CLAIM: Government hiding true COVID-19 death count from public", 1),
|
| 37 |
+
("Unconfirmed: Military says there are 500 rebels still in Mindanao", 1),
|
| 38 |
+
("REPORT: Certain barangay officials accepting bribes according to residents", 1),
|
| 39 |
+
("Alleged: Shipment of smuggled goods found in Manila port last week", 1),
|
| 40 |
+
("CLAIM: New mandatory vaccine policy for all government employees", 1),
|
| 41 |
+
("Source says: Manila Water to increase rates by 20% next month", 1),
|
| 42 |
+
|
| 43 |
+
# Fake (2)
|
| 44 |
+
("GRABE! Namatay daw ang tatlong tao sa bagong sakit na kumakalat sa Pilipinas!", 2),
|
| 45 |
+
("TOTOO BA? Marcos nagsabi na libreng kuryente na simula bukas!", 2),
|
| 46 |
+
("SHOCKING TRUTH: Bill Gates microchip found in COVID vaccine in Cebu!", 2),
|
| 47 |
+
("WATCH: Senator caught stealing money in Senate vault - full video", 2),
|
| 48 |
+
("CONFIRMED: Philippines to become 51st state of the United States in 2026!", 2),
|
| 49 |
+
("KATOTOHANAN: DOH secretly poisoning water supply to control population", 2),
|
| 50 |
+
("EXPOSED: Duterte has secret family in Davao that government is hiding", 2),
|
| 51 |
+
("100% TOTOO: Garlic cures COVID-19, doctors don't want you to know this!", 2),
|
| 52 |
+
("GALING NG PILIPINAS: Filipino scientist discovers cure for cancer, suppressed by big pharma", 2),
|
| 53 |
+
("BREAKING: Entire Luzon to experience 3-day total blackout next week, says NGCP", 2),
|
| 54 |
+
]
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
@dataclass
class Layer1Result:
    """Output of the Layer-1 (TF-IDF) classifier for a single text."""
    verdict: str  # "Credible" | "Unverified" | "Likely Fake" (see TFIDFClassifier._LABELS)
    confidence: float  # 0.0 – 100.0, max class probability as a percentage
    triggered_features: list[str] = field(default_factory=list)  # top TF-IDF terms in the input
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
class TFIDFClassifier:
    """
    TF-IDF + Logistic Regression baseline (Layer 1).

    ``train()`` fits on the seed dataset and persists the fitted vectorizer +
    classifier to ``MODEL_PATH``; when a persisted model exists it is loaded
    instead. ``predict()`` lazily triggers ``train()`` on first use.
    """

    # Class index → human-readable verdict. Note index 2 renders as "Likely Fake".
    _LABELS = {0: "Credible", 1: "Unverified", 2: "Likely Fake"}

    def __init__(self):
        self._vectorizer = None
        self._clf = None

    def train(self) -> None:
        """Fit on seed data, or load the persisted model when one exists.

        A corrupt or version-incompatible persisted pickle no longer crashes
        startup: load failures are logged and the model is re-trained from the
        seed dataset (and re-persisted).
        """
        if MODEL_PATH.exists():
            try:
                self._load()
                return
            except Exception as e:
                logger.warning("Persisted TF-IDF model unreadable (%s) — retraining", e)

        from sklearn.feature_extraction.text import TfidfVectorizer
        from sklearn.linear_model import LogisticRegression

        texts, labels = zip(*SEED_DATA)
        self._vectorizer = TfidfVectorizer(
            ngram_range=(1, 2),
            max_features=1000,
            sublinear_tf=True,
        )
        X = self._vectorizer.fit_transform(texts)
        self._clf = LogisticRegression(max_iter=500, C=1.0, random_state=42)
        self._clf.fit(X, labels)

        MODEL_PATH.parent.mkdir(parents=True, exist_ok=True)
        with open(MODEL_PATH, "wb") as f:
            pickle.dump({"vectorizer": self._vectorizer, "clf": self._clf}, f)
        logger.info("TF-IDF model trained and saved to %s", MODEL_PATH)

    def _load(self) -> None:
        """Load the persisted vectorizer + classifier from MODEL_PATH.

        NOTE(security): pickle.load executes arbitrary code from the file.
        MODEL_PATH must remain app-owned; never point it at user-supplied data.
        """
        with open(MODEL_PATH, "rb") as f:
            data = pickle.load(f)
        self._vectorizer = data["vectorizer"]
        self._clf = data["clf"]
        logger.info("TF-IDF model loaded from %s", MODEL_PATH)

    def predict(self, text: str) -> Layer1Result:
        """Classify *text*; return verdict, confidence (%), and top TF-IDF terms."""
        if self._vectorizer is None:
            self.train()  # Lazy init so predict() works without an explicit train()

        X = self._vectorizer.transform([text])
        pred_label = int(self._clf.predict(X)[0])
        proba = self._clf.predict_proba(X)[0]
        confidence = round(float(max(proba)) * 100, 1)
        verdict = self._LABELS[pred_label]

        # Extract top TF-IDF features as human-readable triggers
        feature_names = self._vectorizer.get_feature_names_out()
        tfidf_scores = X.toarray()[0]
        top_indices = tfidf_scores.argsort()[-5:][::-1]
        triggered = [feature_names[i] for i in top_indices if tfidf_scores[i] > 0]

        return Layer1Result(
            verdict=verdict,
            confidence=confidence,
            triggered_features=triggered,
        )
|
nlp/__init__.py
ADDED
|
File without changes
|
nlp/claim_extractor.py
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
PhilVerify — Claim Extractor
|
| 3 |
+
Extracts the key falsifiable claim from noisy social media text.
|
| 4 |
+
Primary: HuggingFace summarization (t5-small)
|
| 5 |
+
Fallback: First 2 sentence heuristic
|
| 6 |
+
"""
|
| 7 |
+
import re
|
| 8 |
+
import logging
|
| 9 |
+
from dataclasses import dataclass
|
| 10 |
+
|
| 11 |
+
logger = logging.getLogger(__name__)

# Sentence boundary: split after terminal punctuation followed by whitespace.
_SENTENCE_SPLIT = re.compile(r"(?<=[.!?])\s+")


@dataclass
class ClaimResult:
    """A single extracted claim plus the strategy that produced it."""
    claim: str
    method: str  # "summarization" | "sentence_heuristic" | "passthrough"


class ClaimExtractor:
    """
    Extracts the single most falsifiable claim from input text; the claim
    feeds the NewsAPI evidence-retrieval step.

    Strategy: a summarization model (distilbart) biased toward assertion
    extraction via a task prefix when available; otherwise a fast
    first-two-sentences heuristic. Inputs shorter than 20 chars pass through
    unchanged.
    """

    _TASK_PREFIX = "Extract the main factual claim: "

    def __init__(self):
        self._pipe = None
        self._loaded = False

    def _load_model(self):
        """One-shot lazy load of the summarization pipeline (best-effort)."""
        if self._loaded:
            return
        try:
            from transformers import pipeline
            self._pipe = pipeline(
                "summarization",
                model="sshleifer/distilbart-cnn-6-6",
                max_length=80,
                min_length=10,
                do_sample=False,
            )
            logger.info("Claim extractor model loaded (distilbart-cnn-6-6)")
        except Exception as e:
            logger.warning("Summarization model not available (%s) — using sentence heuristic", e)
        self._loaded = True

    def _sentence_heuristic(self, text: str) -> str:
        """Return the first 1-2 substantial sentences (>20 chars each)."""
        pieces = [s.strip() for s in _SENTENCE_SPLIT.split(text.strip())]
        substantial = [s for s in pieces if len(s) > 20]
        if not substantial:
            return text[:200].strip()
        return " ".join(substantial[:2])

    def extract(self, text: str) -> ClaimResult:
        """Extract one claim from *text* via model or heuristic fallback."""
        self._load_model()

        if not text or len(text.strip()) < 20:
            return ClaimResult(claim=text.strip(), method="passthrough")

        if self._pipe:
            try:
                out = self._pipe(self._TASK_PREFIX + text[:1024], truncation=True)
                claim = out[0]["summary_text"].strip()
                # Strip the task prefix echo if model includes it
                claim = re.sub(r"^extract the main factual claim:?\s*", "", claim, flags=re.I)
                if len(claim) > 15:
                    return ClaimResult(claim=claim, method="summarization")
            except Exception as e:
                logger.warning("Summarization inference error: %s", e)

        return ClaimResult(claim=self._sentence_heuristic(text), method="sentence_heuristic")
|
nlp/clickbait.py
ADDED
|
@@ -0,0 +1,100 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
PhilVerify — Clickbait Detector
|
| 3 |
+
Detects clickbait patterns common in Philippine fake news / viral content.
|
| 4 |
+
Uses regex patterns + feature flags (no model needed).
|
| 5 |
+
"""
|
| 6 |
+
import re
|
| 7 |
+
from dataclasses import dataclass, field
|
| 8 |
+
|
| 9 |
+
# ── Pattern library ───────────────────────────────────────────────────────────
# Regex fragments are kept verbatim — they encode PH-specific clickbait cues.
_CLICKBAIT_PHRASES_EN = [
    r"\byou won'?t believe\b", r"\bshocking\b", r"\bviral\b", r"\bbreaking\b",
    r"\bexclusive\b", r"\bmust[\s-]?see\b", r"\bsecret\b", r"\bconfirmed\b",
    r"\bexposed\b", r"\bscandal\b", r"\bunbelievable\b", r"\bmiraculous?\b",
    r"\bhoax\b", r"\bfact[\s-]?check\b", r"\bthis is why\b", r"\bwatch this\b",
]
_CLICKBAIT_PHRASES_TL = [
    r"\bgrabe\b", r"\bwow\b", r"\bsurprise\b", r"\bshocking\b", r"\btrending\b",
    r"\bselo\b", r"\bbalita\b", r"\bnatuklasan\b", r"\bnahuli\b", r"\bsikat\b",
    r"\bpakinggan\b", r"\bpanoorin\b", r"\bkumpirmado\b", r"\bkatotohanan\b",
]

_CAPS_WORD = re.compile(r"\b[A-Z]{2,}\b")
_EXCESSIVE_PUNCT = re.compile(r"[!?]{2,}")
_NUMBER_BAIT = re.compile(r"\b\d+\s+(?:reasons?|things?|ways?|tips?|signs?|bagay)\b", re.I)
_QUESTION_BAIT = re.compile(r"\b(?:ano|bakit|paano|kailan|sino|saan)\b.*\?", re.I)
_ALL_PHRASES = [re.compile(p, re.IGNORECASE) for p in _CLICKBAIT_PHRASES_EN + _CLICKBAIT_PHRASES_TL]


@dataclass
class ClickbaitResult:
    """Outcome of one clickbait scan."""
    is_clickbait: bool  # True once score reaches the 0.4 threshold
    score: float  # 0.0 – 1.0
    triggered_patterns: list[str] = field(default_factory=list)


class ClickbaitDetector:
    """
    Feature-flag based clickbait detector optimized for PH social media.
    Each triggered heuristic adds a fixed weight; the capped sum is the score.
    """

    # Per-feature weights; phrase hits carry 0.25 each (handled separately).
    _WEIGHTS = {
        "excessive_punctuation": 0.20,
        "all_caps_words": 0.20,
        "number_bait": 0.15,
        "question_bait": 0.10,
        "title_too_short": 0.05,
        "title_very_long": 0.05,
    }

    def detect(self, text: str) -> ClickbaitResult:
        """Scan *text*; return flag, score in [0, 1], and triggered pattern labels."""
        hits: list[str] = []

        # Shouting: two or more ALL-CAPS words anywhere in the text.
        shouty = _CAPS_WORD.findall(text)
        if len(shouty) >= 2:
            hits.append(f"all_caps_words: {shouty[:3]}")

        # Repeated terminal punctuation (!! / ???).
        if _EXCESSIVE_PUNCT.search(text):
            hits.append("excessive_punctuation")

        # Listicle bait: "5 reasons why...".
        if _NUMBER_BAIT.search(text):
            hits.append("number_bait")

        # Rhetorical-question bait (Tagalog interrogatives ending in "?").
        if _QUESTION_BAIT.search(text):
            hits.append("question_bait")

        # Length extremes for a headline.
        n_words = len(text.split())
        if n_words < 5:
            hits.append("title_too_short")
        elif n_words > 30:
            hits.append("title_very_long")

        # Known clickbait phrases (EN + TL libraries).
        for rx in _ALL_PHRASES:
            found = rx.search(text)
            if found:
                hits.append(f"clickbait_phrase: '{found.group(0)}'")

        # Weighted sum over triggered features, capped at 1.0.
        total = 0.0
        for label in hits:
            weight = next((w for key, w in self._WEIGHTS.items() if label.startswith(key)), None)
            if weight is not None:
                total += weight
            elif label.startswith("clickbait_phrase"):
                total += 0.25

        total = min(total, 1.0)
        return ClickbaitResult(
            is_clickbait=total >= 0.4,
            score=round(total, 3),
            triggered_patterns=hits,
        )
|
nlp/language_detector.py
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
PhilVerify — Language Detector
|
| 3 |
+
Detects Tagalog / English / Taglish using langdetect + Filipino stopword ratio heuristic.
|
| 4 |
+
No heavy model needed — runs instantly.
|
| 5 |
+
"""
|
| 6 |
+
import re
|
| 7 |
+
import logging
|
| 8 |
+
from dataclasses import dataclass
|
| 9 |
+
|
| 10 |
+
logger = logging.getLogger(__name__)
|
| 11 |
+
|
| 12 |
+
# ── Filipino stopword set for heuristic ───────────────────────────────────────
_TL_MARKERS = {
    "ang", "ng", "na", "sa", "at", "ay", "mga", "ni", "nang", "si",
    "ko", "mo", "siya", "kami", "kayo", "sila", "ito", "raw", "daw",
    "ba", "po", "din", "rin", "naman", "lang", "kaya", "dahil", "kung",
    "pero", "kapag", "talaga", "pala", "sana", "grabe", "wala", "hindi",
    "may", "mayroon", "bakit", "paano", "kailan", "nasaan", "sino",
}

# English marker words (distinct from the Tagalog set above).
_EN_MARKERS = {
    "the", "and", "is", "are", "was", "were", "this", "that", "with",
    "from", "have", "has", "had", "will", "would", "could", "should",
    "not", "been", "being", "they", "their", "there",
}


@dataclass
class LanguageResult:
    """Detected language plus the marker-word ratios behind the decision."""
    language: str  # "Tagalog" | "English" | "Taglish" | "Unknown"
    confidence: float  # 0.0 – 1.0
    tl_ratio: float
    en_ratio: float
    method: str  # "heuristic" | "langdetect" | "combined"


class LanguageDetector:
    """
    Two-pass language detector:
      Pass 1 — Filipino/English stopword ratios (fast, handles code-switching)
      Pass 2 — langdetect, consulted only when the ratios are ambiguous

    Decision rules:
      tl_ratio >= 0.25 and en_ratio < 0.15 → Tagalog
      en_ratio >= 0.25 and tl_ratio < 0.15 → English
      both >= 0.10                         → Taglish
      otherwise                            → langdetect, else Taglish @ 0.4
    """

    def _token_ratios(self, text: str) -> tuple[float, float]:
        """Fraction of tokens that are Tagalog / English marker words."""
        words = re.findall(r"\b\w+\b", text.lower())
        if not words:
            return 0.0, 0.0
        n = len(words)
        tl_hits = sum(w in _TL_MARKERS for w in words)
        en_hits = sum(w in _EN_MARKERS for w in words)
        return tl_hits / n, en_hits / n

    def _langdetect(self, text: str) -> str:
        """Map langdetect's ISO code to our label; 'Unknown' on any failure."""
        try:
            from langdetect import detect
            code = detect(text)
        except Exception:
            return "Unknown"
        # langdetect returns 'tl' for Tagalog
        return {"tl": "Tagalog", "en": "English"}.get(code, "Unknown")

    def detect(self, text: str) -> LanguageResult:
        """Classify *text* as Tagalog / English / Taglish / Unknown."""
        if not text or len(text.strip()) < 5:
            return LanguageResult("Unknown", 0.0, 0.0, 0.0, "heuristic")

        tl_ratio, en_ratio = self._token_ratios(text)

        if tl_ratio >= 0.25 and en_ratio < 0.15:  # Clear Tagalog
            return LanguageResult("Tagalog", tl_ratio, tl_ratio, en_ratio, "heuristic")

        if en_ratio >= 0.25 and tl_ratio < 0.15:  # Clear English
            return LanguageResult("English", en_ratio, tl_ratio, en_ratio, "heuristic")

        if tl_ratio >= 0.10 and en_ratio >= 0.10:  # Code-switching → Taglish
            return LanguageResult("Taglish", (tl_ratio + en_ratio) / 2, tl_ratio, en_ratio, "heuristic")

        # Ambiguous — defer to langdetect when available.
        ld_lang = self._langdetect(text)
        if ld_lang != "Unknown":
            return LanguageResult(ld_lang, max(tl_ratio, en_ratio, 0.5), tl_ratio, en_ratio, "langdetect")

        return LanguageResult("Taglish", 0.4, tl_ratio, en_ratio, "combined")
return LanguageResult("Taglish", 0.4, tl_ratio, en_ratio, "combined")
|
nlp/ner.py
ADDED
|
@@ -0,0 +1,129 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
PhilVerify — Named Entity Recognition
|
| 3 |
+
Extracts persons, organizations, locations, and dates from text.
|
| 4 |
+
Uses spaCy en_core_web_sm with graceful fallback if model not installed.
|
| 5 |
+
"""
|
| 6 |
+
import logging
|
| 7 |
+
import re
|
| 8 |
+
from dataclasses import dataclass, field
|
| 9 |
+
|
| 10 |
+
logger = logging.getLogger(__name__)
|
| 11 |
+
|
| 12 |
+
# Philippine-specific named entity hints
|
| 13 |
+
_PH_PERSONS = {
|
| 14 |
+
"marcos", "duterte", "aquino", "robredo", "lacson", "pingping",
|
| 15 |
+
"bongbong", "sara", "panelo", "roque", "calida", "ano", "teodoro",
|
| 16 |
+
}
|
| 17 |
+
_PH_ORGS = {
|
| 18 |
+
"doh", "deped", "dilg", "dfa", "dof", "dswd", "ched", "nbi", "pnp",
|
| 19 |
+
"afp", "comelec", "sandiganbayan", "ombudsman", "pcso", "pagcor",
|
| 20 |
+
"senate", "congress", "supreme court", "malacanang",
|
| 21 |
+
}
|
| 22 |
+
_PH_LOCATIONS = {
|
| 23 |
+
"manila", "quezon city", "makati", "pasig", "taguig", "cebu",
|
| 24 |
+
"davao", "mindanao", "luzon", "visayas", "palawan", "boracay",
|
| 25 |
+
"batangas", "laguna", "cavite", "rizal", "bulacan", "pampanga",
|
| 26 |
+
"metro manila", "ncr", "philippines", "pilipinas",
|
| 27 |
+
}
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
@dataclass
class NERResult:
    """Entities extracted from one text, grouped by type."""
    persons: list[str] = field(default_factory=list)
    organizations: list[str] = field(default_factory=list)
    locations: list[str] = field(default_factory=list)
    dates: list[str] = field(default_factory=list)
    method: str = "spacy"  # "spacy" | "hints" — how the entities were obtained

    def to_dict(self) -> dict:
        """Serialize the four entity lists; `method` is intentionally omitted."""
        return {
            key: getattr(self, key)
            for key in ("persons", "organizations", "locations", "dates")
        }
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
class EntityExtractor:
    """
    Named Entity Recognition via spaCy (en_core_web_sm) plus a Philippine
    entity hint layer for local names spaCy tends to miss.

    Falls back to purely hint/regex-based extraction when spaCy or its model
    is not installed, or when inference raises.
    """

    def __init__(self):
        self._nlp = None      # spaCy pipeline, or None when unavailable
        self._loaded = False  # ensures the load is attempted only once

    def _load_model(self):
        """Lazy one-shot load of en_core_web_sm; failure leaves hint-only mode."""
        if self._loaded:
            return
        try:
            import spacy
            self._nlp = spacy.load("en_core_web_sm")
            logger.info("spaCy en_core_web_sm loaded")
        except Exception as e:
            logger.warning("spaCy not available (%s) — using hint-based NER", e)
            self._nlp = None
        self._loaded = True

    def _hint_based_extract(self, text: str) -> NERResult:
        """Fallback: whole-word matches against PH hint lists + date regexes.

        Hints are matched with word-boundary regexes rather than plain
        substring tests, so short entries such as "ano" cannot fire inside
        unrelated words (e.g. "paano").
        """
        lower = text.lower()
        result = NERResult(method="hints")

        def _mentioned(term: str) -> bool:
            # \b also brackets multi-word hints ("quezon city", "supreme court").
            return re.search(rf"\b{re.escape(term)}\b", lower) is not None

        result.persons = [p.title() for p in _PH_PERSONS if _mentioned(p)]
        result.organizations = [o.upper() for o in _PH_ORGS if _mentioned(o)]
        result.locations = [loc.title() for loc in _PH_LOCATIONS if _mentioned(loc)]

        # Date patterns: "February 2026", "Feb 24, 2026", "2026-02-24", "2/24/26"
        date_patterns = [
            r"\b(?:January|February|March|April|May|June|July|August|September"
            r"|October|November|December"
            r"|Jan|Feb|Mar|Apr|Jun|Jul|Aug|Sep|Sept|Oct|Nov|Dec)\.?"
            r"(?:\s+\d{1,2})?,?\s+\d{4}\b",
            r"\b\d{4}-\d{2}-\d{2}\b",
            r"\b\d{1,2}/\d{1,2}/\d{2,4}\b",
        ]
        for pattern in date_patterns:
            result.dates.extend(re.findall(pattern, text, re.IGNORECASE))

        return result

    def extract(self, text: str) -> NERResult:
        """Extract persons / organizations / locations / dates from *text*.

        Uses spaCy when available, deduplicates results, and supplements
        persons/orgs with the PH hint layer; any spaCy failure falls back to
        hint-based extraction.
        """
        self._load_model()

        if self._nlp is None:
            return self._hint_based_extract(text)

        try:
            # Truncate by characters to bound spaCy's work on very long inputs.
            doc = self._nlp(text[:5000])
            result = NERResult(method="spacy")

            for ent in doc.ents:
                ent_text = ent.text.strip()
                if ent.label_ == "PERSON":
                    result.persons.append(ent_text)
                elif ent.label_ in ("ORG", "NORP"):
                    result.organizations.append(ent_text)
                elif ent.label_ in ("GPE", "LOC"):
                    result.locations.append(ent_text)
                elif ent.label_ in ("DATE", "TIME"):
                    result.dates.append(ent_text)

            # Deduplicate while preserving first-seen order.
            result.persons = list(dict.fromkeys(result.persons))
            result.organizations = list(dict.fromkeys(result.organizations))
            result.locations = list(dict.fromkeys(result.locations))
            result.dates = list(dict.fromkeys(result.dates))

            # Supplement with PH hints for entities spaCy may miss.
            hint_result = self._hint_based_extract(text)
            for p in hint_result.persons:
                if p not in result.persons:
                    result.persons.append(p)
            for o in hint_result.organizations:
                if o not in result.organizations:
                    result.organizations.append(o)

            return result
        except Exception as e:
            logger.warning("spaCy extraction error: %s — falling back to hints", e)
            return self._hint_based_extract(text)
|
nlp/preprocessor.py
ADDED
|
@@ -0,0 +1,124 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
PhilVerify — Text Preprocessor
|
| 3 |
+
Handles cleaning, tokenizing, and normalizing Filipino/English/Taglish text.
|
| 4 |
+
"""
|
| 5 |
+
import re
|
| 6 |
+
import string
|
| 7 |
+
import unicodedata
|
| 8 |
+
from dataclasses import dataclass, field
|
| 9 |
+
|
| 10 |
+
# ── Filipino + English stopwords ──────────────────────────────────────────────
|
| 11 |
+
TAGALOG_STOPWORDS = {
|
| 12 |
+
"ang", "ng", "na", "sa", "at", "ay", "mga", "ni", "nang", "si",
|
| 13 |
+
"ko", "mo", "siya", "kami", "kayo", "sila", "ito", "iyon", "iyan",
|
| 14 |
+
"dito", "doon", "diyan", "nito", "noon", "niyan", "rin", "din", "pa",
|
| 15 |
+
"lang", "lamang", "nga", "naman", "kaya", "pero", "dahil", "kung",
|
| 16 |
+
"kapag", "habang", "bilang", "upang", "para", "mula", "hanggang",
|
| 17 |
+
"ayon", "sinabi", "raw", "daw", "ba", "po", "ho", "oh", "oo",
|
| 18 |
+
"hindi", "wala", "may", "mayroon", "talaga", "pala", "sana",
|
| 19 |
+
}
|
| 20 |
+
|
| 21 |
+
ENGLISH_STOPWORDS = {
|
| 22 |
+
"a", "an", "the", "and", "or", "but", "in", "on", "at", "to",
|
| 23 |
+
"for", "of", "with", "by", "from", "is", "are", "was", "were",
|
| 24 |
+
"be", "been", "being", "have", "has", "had", "do", "does", "did",
|
| 25 |
+
"will", "would", "could", "should", "may", "might", "shall", "can",
|
| 26 |
+
"not", "no", "nor", "so", "yet", "both", "either", "neither",
|
| 27 |
+
"this", "that", "these", "those", "it", "its", "i", "me", "my",
|
| 28 |
+
"we", "our", "you", "your", "they", "their", "he", "his", "she", "her",
|
| 29 |
+
}
|
| 30 |
+
|
| 31 |
+
ALL_STOPWORDS = TAGALOG_STOPWORDS | ENGLISH_STOPWORDS
|
| 32 |
+
|
| 33 |
+
# ── Patterns ──────────────────────────────────────────────────────────────────
|
| 34 |
+
_URL_PATTERN = re.compile(
|
| 35 |
+
r"http[s]?://(?:[a-zA-Z]|[0-9]|[$\-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
|
| 36 |
+
)
|
| 37 |
+
_HTML_TAG_PATTERN = re.compile(r"<[^>]+>")
|
| 38 |
+
_MENTION_PATTERN = re.compile(r"@\w+")
|
| 39 |
+
_HASHTAG_PATTERN = re.compile(r"#\w+")
|
| 40 |
+
_REPEATED_CHAR_PATTERN = re.compile(r"(.)\1{2,}")  # collapse 3+ repeats to 2: "graaabe" → "graabe"
|
| 41 |
+
_EXCESSIVE_PUNCT_PATTERN = re.compile(r"([!?.]){2,}")
|
| 42 |
+
_WHITESPACE_PATTERN = re.compile(r"\s+")
|
| 43 |
+
|
| 44 |
+
# Emoji removal via unicode category
|
| 45 |
+
def _remove_emojis(text: str) -> str:
|
| 46 |
+
return "".join(
|
| 47 |
+
ch for ch in text
|
| 48 |
+
if not unicodedata.category(ch).startswith("So") # Symbol, Other
|
| 49 |
+
and unicodedata.category(ch) not in ("Mn",) # Modifier letters
|
| 50 |
+
)
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
@dataclass
class PreprocessResult:
    """All intermediate artifacts of one preprocessing run."""
    original: str      # raw input text, untouched
    cleaned: str       # after HTML/URL/mention/hashtag/emoji stripping + lowercasing
    normalized: str    # after repeated-char collapse and punctuation removal
    tokens: list[str] = field(default_factory=list)           # whitespace tokens longer than 1 char
    filtered_tokens: list[str] = field(default_factory=list)  # tokens with EN + TL stopwords removed
    char_count: int = 0  # length of `normalized`
    word_count: int = 0  # number of entries in `tokens`
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
class TextPreprocessor:
    """
    Multi-step cleaner for Tagalog / English / Taglish content.

    Phases:
      clean()            — strip HTML, URLs, @mentions; drop '#' from hashtags
                           (word kept); remove emoji; lowercase; collapse spaces
      normalize()        — collapse 3+ repeated chars to two, squeeze runs of
                           !?. to one, replace punctuation (except ') with space
      tokenize()         — whitespace split, keeping tokens of length > 1
      remove_stopwords() — drop the combined EN + TL stopword set
      preprocess()       — run the whole pipeline, returning a PreprocessResult
    """

    def clean(self, text: str) -> str:
        """Phase 1 (steps 1–6): structural cleaning and lowercasing."""
        for pattern in (_HTML_TAG_PATTERN, _URL_PATTERN, _MENTION_PATTERN):
            text = pattern.sub(" ", text)
        # Hashtags keep their word but lose the leading '#'.
        text = _HASHTAG_PATTERN.sub(lambda m: m.group(0)[1:], text)
        text = _remove_emojis(text).lower()
        return _WHITESPACE_PATTERN.sub(" ", text).strip()

    def normalize(self, text: str) -> str:
        """Phase 2 (steps 7–8): character-level normalization."""
        text = _REPEATED_CHAR_PATTERN.sub(r"\1\1", text)    # "graaabe" → "graabe"
        text = _EXCESSIVE_PUNCT_PATTERN.sub(r"\1", text)    # "!!!" → "!"
        # Apostrophes survive ('di, ko'y); every other punctuation char → space.
        punct_to_space = {ord(ch): " " for ch in string.punctuation if ch != "'"}
        text = text.translate(punct_to_space)
        return _WHITESPACE_PATTERN.sub(" ", text).strip()

    def tokenize(self, text: str) -> list[str]:
        """Step 9: whitespace split, discarding single-character tokens."""
        return [tok for tok in text.split() if len(tok) >= 2]

    def remove_stopwords(self, tokens: list[str]) -> list[str]:
        """Step 10: drop any token found in the combined EN + TL stopword set."""
        return [tok for tok in tokens if tok not in ALL_STOPWORDS]

    def preprocess(self, text: str) -> PreprocessResult:
        """Run clean → normalize → tokenize → stopword removal end to end."""
        cleaned = self.clean(text)
        normalized = self.normalize(cleaned)
        tokens = self.tokenize(normalized)
        return PreprocessResult(
            original=text,
            cleaned=cleaned,
            normalized=normalized,
            tokens=tokens,
            filtered_tokens=self.remove_stopwords(tokens),
            char_count=len(normalized),
            word_count=len(tokens),
        )
|
nlp/sentiment.py
ADDED
|
@@ -0,0 +1,141 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
PhilVerify — Sentiment & Emotion Analyzer
|
| 3 |
+
Uses HuggingFace transformers with graceful fallback to lexicon-based scoring.
|
| 4 |
+
"""
|
| 5 |
+
import logging
|
| 6 |
+
from dataclasses import dataclass
|
| 7 |
+
|
| 8 |
+
logger = logging.getLogger(__name__)
|
| 9 |
+
|
| 10 |
+
# ── Simple lexicons for fallback ──────────────────────────────────────────────
|
| 11 |
+
_NEGATIVE_WORDS = {
|
| 12 |
+
"fake", "false", "lie", "liar", "hoax", "scam", "fraud", "corrupt",
|
| 13 |
+
"criminal", "illegal", "murder", "die", "death", "dead", "kill",
|
| 14 |
+
"patay", "namatay", "peke", "sinungaling", "corrupt", "magnanakaw",
|
| 15 |
+
"kasamaan", "krimen", "karahasan", "pandemic", "sakit", "epidemya",
|
| 16 |
+
"grabe", "nakakatakot", "nakakainis", "nakakagalit", "kahiya",
|
| 17 |
+
}
|
| 18 |
+
_POSITIVE_WORDS = {
|
| 19 |
+
"good", "great", "excellent", "amazing", "wonderful", "positive",
|
| 20 |
+
"success", "win", "victory", "help", "support", "safe", "free",
|
| 21 |
+
"maganda", "magaling", "mahusay", "maayos", "tagumpay", "ligtas",
|
| 22 |
+
"masaya", "mabuti", "mahalaga", "mahal", "salamat", "pagbabago",
|
| 23 |
+
}
|
| 24 |
+
_FEAR_WORDS = {
|
| 25 |
+
"takot", "fear", "scared", "afraid", "terror", "danger", "dangerous",
|
| 26 |
+
"banta", "panganib", "nakakatakot", "kalamidad", "lindol",
|
| 27 |
+
}
|
| 28 |
+
_ANGER_WORDS = {
|
| 29 |
+
"galit", "angry", "anger", "furious", "rage", "outrage", "poot",
|
| 30 |
+
"nakakagalit", "nakakaasar", "sumpain", "putang", "gago",
|
| 31 |
+
}
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
@dataclass
class SentimentResult:
    """Combined sentiment + emotion classification for one text."""
    sentiment: str        # positive | negative | neutral | high positive | high negative
    sentiment_score: float  # signed polarity, -1.0 (negative) to 1.0 (positive)
    emotion: str          # anger | fear | joy | sadness | neutral
    emotion_score: float  # intensity of the detected emotion, 0.0 to 1.0
    method: str           # "transformer" | "lexicon" — which strategy produced this
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
class SentimentAnalyzer:
    """
    Two-strategy sentiment + emotion analysis.

    Primary  — HuggingFace pipelines:
               cardiffnlp/twitter-roberta-base-sentiment-latest (sentiment,
               social-media optimized) and
               j-hartmann/emotion-english-distilroberta-base (emotion).
    Fallback — lexicon-based word counting, used when transformers are not
               installed or inference fails.
    """

    def __init__(self):
        # Pipelines are loaded lazily on the first analyze() call.
        self._sentiment_pipe = None
        self._emotion_pipe = None
        self._loaded = False  # guards the one-time load attempt

    def _load_models(self):
        """Attempt the one-time transformer load; on failure both pipes stay None."""
        if self._loaded:
            return
        try:
            from transformers import pipeline
            self._sentiment_pipe = pipeline(
                "text-classification",
                model="cardiffnlp/twitter-roberta-base-sentiment-latest",
                top_k=1,
            )
            self._emotion_pipe = pipeline(
                "text-classification",
                model="j-hartmann/emotion-english-distilroberta-base",
                top_k=1,
            )
            logger.info("Sentiment / emotion models loaded")
        except Exception as e:
            logger.warning("Transformer models not available (%s) — using lexicon fallback", e)
        self._loaded = True

    def _lexicon_analyze(self, text: str) -> SentimentResult:
        """Fallback scorer based on lexicon hits among the text's unique words.

        sentiment_score = (pos - neg) / (pos + neg), in [-1, 1]; 0.0 when no
        polarity words occur. Emotion precedence: fear > anger > joy > sadness.
        """
        words = set(text.lower().split())  # set: repeated words count once
        neg = len(words & _NEGATIVE_WORDS)
        pos = len(words & _POSITIVE_WORDS)
        fear = len(words & _FEAR_WORDS)
        anger = len(words & _ANGER_WORDS)

        total = neg + pos
        if total == 0:
            score = 0.0
        else:
            score = (pos - neg) / total

        # ±0.3 separates neutral from polar; ±0.6 marks "high" intensity.
        if score > 0.3:
            sentiment = "high positive" if score > 0.6 else "positive"
        elif score < -0.3:
            sentiment = "high negative" if score < -0.6 else "negative"
        else:
            sentiment = "neutral"

        # Emotion score scales hit density by 5 and caps at 1.0.
        emotion_score = 0.0
        if fear > anger:
            emotion = "fear"
            emotion_score = min(fear / max(len(words), 1) * 5, 1.0)
        elif anger > 0:
            emotion = "anger"
            emotion_score = min(anger / max(len(words), 1) * 5, 1.0)
        elif pos > neg:
            emotion = "joy"
            emotion_score = min(pos / max(len(words), 1) * 5, 1.0)
        elif neg > 0:
            emotion = "sadness"
            emotion_score = min(neg / max(len(words), 1) * 5, 1.0)
        else:
            emotion = "neutral"
            emotion_score = 0.0

        return SentimentResult(sentiment, round(score, 3), emotion, round(emotion_score, 3), "lexicon")

    def analyze(self, text: str) -> SentimentResult:
        """Analyze *text*: transformer path first, lexicon fallback otherwise."""
        self._load_models()
        snippet = text[:512]  # crude char truncation to respect model token limits

        if self._sentiment_pipe and self._emotion_pipe:
            try:
                # NOTE(review): with top_k=1, some transformers versions return
                # a nested list ([[{label, score}]]) rather than [{label, score}];
                # in that case the subscripting below raises and we silently fall
                # back to the lexicon — confirm against the pinned version.
                s_out = self._sentiment_pipe(snippet)[0]
                e_out = self._emotion_pipe(snippet)[0]

                raw_label = s_out["label"].lower()
                score = s_out["score"]
                # Signed score: positive stays positive, negative is negated;
                # >0.85 model confidence upgrades to the "high" variant.
                if "positive" in raw_label:
                    sentiment = "high positive" if score > 0.85 else "positive"
                    s_score = score
                elif "negative" in raw_label:
                    sentiment = "high negative" if score > 0.85 else "negative"
                    s_score = -score
                else:
                    sentiment = "neutral"
                    s_score = 0.0

                emotion = e_out["label"].lower()
                emotion_score = e_out["score"]
                return SentimentResult(sentiment, round(s_score, 3), emotion, round(emotion_score, 3), "transformer")
            except Exception as e:
                logger.warning("Transformer inference error: %s — falling back to lexicon", e)

        return self._lexicon_analyze(text)
|
pytest.ini
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[pytest]
|
| 2 |
+
asyncio_mode = auto
|
| 3 |
+
testpaths = tests
|
| 4 |
+
python_files = test_*.py
|
| 5 |
+
python_classes = Test*
|
| 6 |
+
python_functions = test_*
|
requirements.txt
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ── Core Framework ────────────────────────────────────────────────────────────
|
| 2 |
+
fastapi==0.115.6
|
| 3 |
+
uvicorn[standard]==0.32.1
|
| 4 |
+
python-multipart==0.0.17 # File upload support
|
| 5 |
+
pydantic==2.9.2
|
| 6 |
+
pydantic-settings==2.6.1
|
| 7 |
+
|
| 8 |
+
# ── NLP & ML ──────────────────────────────────────────────────────────────────
|
| 9 |
+
transformers==4.46.3
|
| 10 |
+
torch==2.5.1
|
| 11 |
+
sentence-transformers==3.3.1
|
| 12 |
+
scikit-learn==1.5.2
|
| 13 |
+
spacy==3.8.2
|
| 14 |
+
langdetect==1.0.9
|
| 15 |
+
nltk==3.9.1
|
| 16 |
+
|
| 17 |
+
# ── Input Modules ─────────────────────────────────────────────────────────────
|
| 18 |
+
pytesseract==0.3.13 # OCR
|
| 19 |
+
Pillow==11.0.0 # Image processing
|
| 20 |
+
openai-whisper==20240930 # ASR (Filipino speech)
|
| 21 |
+
beautifulsoup4==4.12.3 # URL scraping
|
| 22 |
+
requests==2.32.3
|
| 23 |
+
lxml==5.3.0
|
| 24 |
+
|
| 25 |
+
# ── Evidence Retrieval ────────────────────────────────────────────────────────
|
| 26 |
+
newsapi-python==0.2.7
|
| 27 |
+
|
| 28 |
+
# ── Database ──────────────────────────────────────────────────────────────────
|
| 29 |
+
sqlalchemy==2.0.36
|
| 30 |
+
asyncpg==0.30.0 # Async PostgreSQL driver
|
| 31 |
+
alembic==1.14.0
|
| 32 |
+
|
| 33 |
+
# ── Caching ───────────────────────────────────────────────────────────────────
|
| 34 |
+
redis==5.2.1
|
| 35 |
+
cachetools==5.5.0
|
| 36 |
+
|
| 37 |
+
# ── Utilities ─────────────────────────────────────────────────────────────────
|
| 38 |
+
python-dotenv==1.0.1
|
| 39 |
+
httpx==0.28.1 # Async HTTP client
|
| 40 |
+
aiofiles==24.1.0
|
| 41 |
+
tqdm==4.67.1
|
| 42 |
+
numpy==1.26.4
|
| 43 |
+
|
| 44 |
+
# ── Testing ───────────────────────────────────────────────────────────────────
|
| 45 |
+
pytest==8.3.4
|
| 46 |
+
pytest-asyncio==0.24.0
|
| 47 |
+
# httpx (used by FastAPI TestClient) is already pinned above under Utilities
|
scoring/__init__.py
ADDED
|
File without changes
|
scoring/engine.py
ADDED
|
@@ -0,0 +1,212 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
PhilVerify — Scoring Engine (Orchestrator)
|
| 3 |
+
Ties together all NLP modules, Layer 1, and Layer 2 into a final VerificationResponse.
|
| 4 |
+
Final Score = (ML Confidence × 0.40) + (Evidence Score × 0.60)
|
| 5 |
+
"""
|
| 6 |
+
import asyncio
|
| 7 |
+
import json
|
| 8 |
+
import logging
|
| 9 |
+
import uuid
|
| 10 |
+
from datetime import datetime, timezone
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
|
| 13 |
+
from config import get_settings
|
| 14 |
+
from api.schemas import (
|
| 15 |
+
VerificationResponse, Verdict, Language, DomainTier,
|
| 16 |
+
Layer1Result, Layer2Result, EntitiesResult, EvidenceSource, Stance,
|
| 17 |
+
)
|
| 18 |
+
|
| 19 |
+
logger = logging.getLogger(__name__)
|
| 20 |
+
settings = get_settings()
|
| 21 |
+
|
| 22 |
+
# ── Domain credibility lookup ─────────────────────────────────────────────────
|
| 23 |
+
_DOMAIN_DB_PATH = Path(__file__).parent.parent / "domain_credibility.json"
|
| 24 |
+
_DOMAIN_DB: dict = {}
|
| 25 |
+
|
| 26 |
+
def _load_domain_db() -> dict:
    """Load and memoize domain_credibility.json into the module-level cache.

    NOTE(review): on a failed/empty load the cache stays falsy, so every
    subsequent call retries the disk read — confirm that is intended rather
    than caching the failure.
    """
    global _DOMAIN_DB
    if not _DOMAIN_DB:
        try:
            _DOMAIN_DB = json.loads(_DOMAIN_DB_PATH.read_text())
        except Exception as e:
            logger.warning("Could not load domain_credibility.json: %s", e)
    return _DOMAIN_DB
|
| 34 |
+
|
| 35 |
+
def get_domain_tier(domain: str) -> DomainTier | None:
    """Look up a domain's credibility tier in the domain database.

    Matching is case-insensitive and ignores a leading "www.". Returns None
    for empty input; domains not in the database default to Tier 3
    (SUSPICIOUS).
    """
    if not domain:
        return None
    db = _load_domain_db()
    # removeprefix only strips a *leading* "www." — str.replace would also
    # mangle domains that merely contain "www." elsewhere in the name.
    domain = domain.lower().removeprefix("www.")
    for tier_key, tier_data in db.items():
        if domain in tier_data.get("domains", []):
            # Tier keys are expected to end in their numeric tier (e.g. "tier_1").
            return DomainTier(int(tier_key[-1]))
    return DomainTier.SUSPICIOUS  # Unknown domains default to Tier 3
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def _map_verdict(final_score: float) -> Verdict:
    """Translate a 0–100 final score into a Verdict via configured thresholds."""
    if final_score >= settings.credible_threshold:
        return Verdict.CREDIBLE
    # Below the credible threshold: unverified unless it also falls under
    # the fake threshold.
    return Verdict.UNVERIFIED if final_score >= settings.fake_threshold else Verdict.LIKELY_FAKE
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
async def run_verification(
    text: str,
    input_type: str = "text",
    source_domain: str | None = None,
) -> VerificationResponse:
    """
    Full verification pipeline orchestrator.

    Steps: preprocess → language detection → NLP analysis → Layer 1 (TF-IDF
    classifier) → Layer 2 (news-evidence retrieval when an API key is set) →
    weighted final score → response assembly → best-effort history record.

    Final Score = (ML credibility × ml_weight) + (evidence score × evidence_weight)

    Args:
        text: Raw text to verify.
        input_type: Origin of the text (e.g. "text", "url", "image").
        source_domain: Optional source domain for credibility lookup.
    """
    # ── Lazy imports so app starts without heavy deps ─────────────────────────
    from nlp.preprocessor import TextPreprocessor
    from nlp.language_detector import LanguageDetector
    from nlp.ner import EntityExtractor
    from nlp.sentiment import SentimentAnalyzer
    from nlp.clickbait import ClickbaitDetector
    from nlp.claim_extractor import ClaimExtractor
    from ml.tfidf_classifier import TFIDFClassifier
    from evidence.news_fetcher import fetch_evidence, compute_similarity

    # ── Step 1: Preprocess ────────────────────────────────────────────────────
    preprocessor = TextPreprocessor()
    proc = preprocessor.preprocess(text)

    # ── Step 2: Language detection ────────────────────────────────────────────
    lang_detector = LanguageDetector()
    lang_result = lang_detector.detect(text)
    # Map the detector label onto the Language enum; unrecognized labels
    # default to Taglish. (try/except replaces the previous reliance on the
    # private Language._value2member_map_ attribute.)
    try:
        language = Language(lang_result.language)
    except ValueError:
        language = Language.TAGLISH

    # ── Steps 3–6: NLP analysis (sequential) ──────────────────────────────────
    ner_extractor = EntityExtractor()
    sentiment_analyzer = SentimentAnalyzer()
    clickbait_detector = ClickbaitDetector()
    claim_extractor = ClaimExtractor()

    ner_result = ner_extractor.extract(text)
    sentiment_result = sentiment_analyzer.analyze(proc.cleaned)
    clickbait_result = clickbait_detector.detect(text)
    claim_result = claim_extractor.extract(proc.cleaned)

    # ── Step 7: Layer 1 — ML Classifier ──────────────────────────────────────
    classifier = TFIDFClassifier()
    # TODO(perf): training on every request is wasteful — cache a trained
    # classifier at app scope instead.
    classifier.train()
    l1 = classifier.predict(proc.cleaned)

    # Enrich triggered features with NLP signals.
    if clickbait_result.is_clickbait:
        l1.triggered_features.extend(clickbait_result.triggered_patterns[:3])
    if sentiment_result.sentiment in ("high negative",):
        l1.triggered_features.append("high emotional language")

    layer1 = Layer1Result(
        verdict=Verdict(l1.verdict),
        confidence=l1.confidence,
        triggered_features=l1.triggered_features,
    )

    # ── Step 8: Layer 2 — Evidence Retrieval ──────────────────────────────────
    evidence_score = 50.0  # Neutral default when API key absent or fetch fails
    evidence_sources: list[EvidenceSource] = []
    l2_verdict = Verdict.UNVERIFIED

    if settings.news_api_key:
        try:
            articles = await fetch_evidence(claim_result.claim, settings.news_api_key)
            for art in articles[:5]:
                article_text = f"{art.get('title', '')} {art.get('description', '')}"
                sim = compute_similarity(claim_result.claim, article_text)
                domain = (art.get("source", {}) or {}).get("name", "unknown").lower()
                tier = get_domain_tier(domain)

                # Simple stance heuristic — fact-check keywords in the title
                # refute; otherwise high similarity supports.
                title_lower = (art.get("title") or "").lower()
                stance = Stance.NOT_ENOUGH_INFO
                if any(w in title_lower for w in ["false", "fake", "hoax", "wrong", "debunked", "fact check"]):
                    stance = Stance.REFUTES
                elif sim > 0.6:
                    stance = Stance.SUPPORTS

                evidence_sources.append(EvidenceSource(
                    title=art.get("title", ""),
                    url=art.get("url", ""),
                    similarity=sim,
                    stance=stance,
                    domain_tier=tier or DomainTier.SUSPICIOUS,
                    published_at=art.get("publishedAt"),
                    # "source" may be present but None — guard like `domain` above
                    # (the previous art.get("source", {}).get("name") raised then).
                    source_name=(art.get("source") or {}).get("name"),
                ))

            # Evidence score: average similarity × 100, penalized 15 per
            # refuting source, clamped to [0, 100].
            if evidence_sources:
                supporting = [s for s in evidence_sources if s.stance == Stance.SUPPORTS]
                refuting = [s for s in evidence_sources if s.stance == Stance.REFUTES]
                avg_sim = sum(s.similarity for s in evidence_sources) / len(evidence_sources)
                refute_penalty = len(refuting) * 15
                evidence_score = max(0.0, min(100.0, avg_sim * 100 - refute_penalty))

                if len(refuting) > len(supporting):
                    l2_verdict = Verdict.LIKELY_FAKE
                elif len(supporting) >= 2:
                    l2_verdict = Verdict.CREDIBLE
        except Exception as e:
            logger.warning("Evidence retrieval failed: %s — using neutral score", e)

    layer2 = Layer2Result(
        verdict=l2_verdict,
        evidence_score=round(evidence_score, 1),
        sources=evidence_sources,
        claim_used=claim_result.claim,
    )

    # ── Step 9: Final Score ───────────────────────────────────────────────────
    # ML confidence is 0-100 for the *predicted* class; when the prediction is
    # not "Credible", that confidence counts against credibility.
    ml_credibility = l1.confidence if l1.verdict == "Credible" else (100 - l1.confidence)
    final_score = round(
        (ml_credibility * settings.ml_weight) + (evidence_score * settings.evidence_weight),
        1,
    )
    verdict = _map_verdict(final_score)

    # ── Step 10: Assemble response ────────────────────────────────────────────
    result = VerificationResponse(
        verdict=verdict,
        # Both operands are already on a 0-100 scale (the previous
        # `evidence_score / 100 * 100` was a no-op).
        confidence=round(max(l1.confidence, evidence_score), 1),
        final_score=final_score,
        layer1=layer1,
        layer2=layer2,
        entities=EntitiesResult(
            persons=ner_result.persons,
            organizations=ner_result.organizations,
            locations=ner_result.locations,
            dates=ner_result.dates,
        ),
        sentiment=sentiment_result.sentiment,
        emotion=sentiment_result.emotion,
        language=language,
        domain_credibility=get_domain_tier(source_domain) if source_domain else None,
        input_type=input_type,
    )

    # ── Record to history (best effort — failures never break verification) ──
    try:
        from api.routes.history import record_verification
        record_verification({
            "id": str(uuid.uuid4()),
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "input_type": input_type,
            "text_preview": text[:120],
            "verdict": verdict.value,
            "confidence": result.confidence,
            "final_score": final_score,
            "entities": ner_result.to_dict(),
            "claim_used": claim_result.claim,
        })
    except Exception as e:
        logger.warning("Failed to record history: %s", e)

    return result
|
tests/__init__.py
ADDED
|
File without changes
|
tests/test_philverify.py
ADDED
|
@@ -0,0 +1,181 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
PhilVerify — Unit Tests
|
| 3 |
+
Covers: text preprocessor, language detector, clickbait detector, and scoring engine.
|
| 4 |
+
Run: pytest tests/ -v
|
| 5 |
+
"""
|
| 6 |
+
import sys
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
|
| 9 |
+
# Ensure project root is on PYTHONPATH
|
| 10 |
+
sys.path.insert(0, str(Path(__file__).parent.parent))
|
| 11 |
+
|
| 12 |
+
import pytest
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
# ── TextPreprocessor ──────────────────────────────────────────────────────────
|
| 16 |
+
|
| 17 |
+
class TestTextPreprocessor:
    """Exercise the cleaning, stopword, and normalisation helpers of ``nlp.preprocessor``."""

    def setup_method(self):
        # Fresh instance per test so no state leaks between test methods.
        from nlp.preprocessor import TextPreprocessor

        self.preprocessor = TextPreprocessor()

    def test_lowercases_text(self):
        cleaned = self.preprocessor.clean("HELLO WORLD")
        assert cleaned == "hello world"

    def test_strips_urls(self):
        cleaned = self.preprocessor.clean("Check this out https://rappler.com/news/article123")
        # Neither the scheme nor the host should survive cleaning.
        assert "https://" not in cleaned
        assert "rappler.com" not in cleaned

    def test_strips_html_tags(self):
        cleaned = self.preprocessor.clean("<p>Hello <b>World</b></p>")
        assert not any(bracket in cleaned for bracket in ("<", ">"))

    def test_strips_mentions(self):
        cleaned = self.preprocessor.clean("Great post @PresidentPH and @DOH_Philippines!")
        assert "@" not in cleaned

    def test_removes_stopwords(self):
        kept = self.preprocessor.remove_stopwords(["ang", "fake", "news", "sa", "pilipinas"])
        # Tagalog stopwords are dropped; content words are retained.
        assert "ang" not in kept
        assert "fake" in kept

    def test_normalizes_repeated_chars(self):
        normalized = self.preprocessor.normalize("graaabe ang gaaalit ko")
        assert "graaabe" not in normalized

    def test_full_pipeline_returns_result(self):
        from nlp.preprocessor import PreprocessResult

        outcome = self.preprocessor.preprocess("GRABE! Namatay daw ang tatlong tao sa bagong sakit na kumakalat!")
        assert isinstance(outcome, PreprocessResult)
        assert outcome.char_count > 0
        assert len(outcome.tokens) > 0
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
# ── LanguageDetector ──────────────────────────────────────────────────────────
|
| 57 |
+
|
| 58 |
+
class TestLanguageDetector:
    """Sanity checks for the Tagalog/English/Taglish language identifier."""

    def setup_method(self):
        from nlp.language_detector import LanguageDetector

        self.detector = LanguageDetector()

    def test_detects_tagalog(self):
        detection = self.detector.detect(
            "Ang mga mamamayan ay nag-aalala sa bagong batas na isinusulong ng pangulo."
        )
        assert detection.language in ("Tagalog", "Taglish")

    def test_detects_english(self):
        detection = self.detector.detect(
            "The Supreme Court ruled in favor of the petition filed by the opposition."
        )
        assert detection.language in ("English", "Taglish")

    def test_detects_taglish(self):
        detection = self.detector.detect(
            "Grabe ang news ngayon! The president announced na libre ang lahat!"
        )
        # Mixed-language input: any of the three labels is acceptable,
        # as long as the detector stays within its known vocabulary.
        assert detection.language in ("Tagalog", "English", "Taglish")

    def test_unknown_for_empty(self):
        assert self.detector.detect("").language == "Unknown"

    def test_confidence_between_0_and_1(self):
        detection = self.detector.detect("Ang balita ay napakalaki!")
        assert 0.0 <= detection.confidence <= 1.0
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
# ── ClickbaitDetector ─────────────────────────────────────────────────────────
|
| 92 |
+
|
| 93 |
+
class TestClickbaitDetector:
    """Behavioural checks for the rule-based clickbait scorer."""

    def setup_method(self):
        from nlp.clickbait import ClickbaitDetector

        self.detector = ClickbaitDetector()

    def test_detects_clickbait_all_caps(self):
        verdict = self.detector.detect("SHOCKING NEWS: GOVERNMENT CAUGHT LYING TO EVERYONE!")
        # All-caps sensationalism should both flag and score above threshold.
        assert verdict.is_clickbait is True
        assert verdict.score > 0.3

    def test_detects_clickbait_tagalog(self):
        verdict = self.detector.detect("GRABE!! Natuklasan na ang katotohanan ng bigas scandal!!!")
        assert verdict.score > 0.3

    def test_clean_headline_not_clickbait(self):
        verdict = self.detector.detect(
            "DOH reports 500 new cases as vaccination drive continues in Metro Manila"
        )
        assert verdict.is_clickbait is False

    def test_score_between_0_and_1(self):
        verdict = self.detector.detect("Breaking news today")
        assert 0.0 <= verdict.score <= 1.0
|
| 116 |
+
|
| 117 |
+
|
| 118 |
+
# ── TF-IDF Classifier ─────────────────────────────────────────────────────────
|
| 119 |
+
|
| 120 |
+
class TestTFIDFClassifier:
    """Checks on the seed-trained TF-IDF fake-news classifier.

    The classifier is trained once per class (``setup_class``) rather than
    once per test: training is read-only for these tests and re-running it
    before every method just slows the suite down.
    """

    @classmethod
    def setup_class(cls):
        from ml.tfidf_classifier import TFIDFClassifier

        cls.clf = TFIDFClassifier()
        cls.clf.train()

    def test_predict_returns_valid_verdict(self):
        result = self.clf.predict("DOH reports 500 new COVID cases today in Metro Manila")
        # The classifier's verdict vocabulary is exactly these three labels.
        assert result.verdict in ("Credible", "Unverified", "Fake")

    def test_confidence_in_valid_range(self):
        result = self.clf.predict("SHOCKING: Government hid the truth about vaccines!")
        assert 0.0 <= result.confidence <= 100.0

    def test_triggered_features_are_strings(self):
        result = self.clf.predict("GRABE! Namatay daw ang tatlong tao sa bagong sakit!")
        assert all(isinstance(f, str) for f in result.triggered_features)

    def test_seed_fake_news_detected(self):
        result = self.clf.predict("CONFIRMED: Philippines to become 51st state of USA in 2026!")
        # An obviously fabricated claim must not come back Credible.  Phrased
        # as a negative so the check stays consistent with the classifier
        # vocabulary asserted in test_predict_returns_valid_verdict
        # ("Likely Fake" is a scoring-engine verdict, not a classifier one).
        assert result.verdict != "Credible"
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
# ── Scoring Engine (lightweight integration) ──────────────────────────────────
|
| 145 |
+
|
| 146 |
+
class TestScoringEngine:
    """Lightweight integration tests for the full verification pipeline.

    No API keys needed — when evidence retrieval is unavailable the engine
    falls back to a neutral evidence score of 50.
    """

    @pytest.mark.asyncio
    async def test_verify_text_returns_response(self):
        from api.schemas import VerificationResponse
        from scoring.engine import run_verification

        outcome = await run_verification(
            "GRABE! Nakita ko raw namatay ang tatlong tao sa bagong sakit na kumakalat sa Pilipinas!",
            input_type="text",
        )
        assert isinstance(outcome, VerificationResponse)
        assert outcome.verdict is not None
        assert 0.0 <= outcome.final_score <= 100.0

    @pytest.mark.asyncio
    async def test_verify_credible_text(self):
        from scoring.engine import run_verification

        outcome = await run_verification(
            "DOH reports 500 new COVID-19 cases as vaccination drive continues in Metro Manila",
            input_type="text",
        )
        assert outcome.final_score is not None
        assert outcome.language is not None

    @pytest.mark.asyncio
    async def test_entities_extracted(self):
        from scoring.engine import run_verification

        outcome = await run_verification(
            "President Marcos announced new policies in Manila regarding the AFP and PNP.",
            input_type="text",
        )
        assert outcome.entities is not None