Ryan Christian D. Deniega committed on
Commit
6c9b8f1
·
0 Parent(s):

feat: PhilVerify Phase 1-3 — FastAPI backend, NLP pipeline, TF-IDF classifier (23/23 tests)

Browse files
.env.example ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ── API Keys ──────────────────────────────────────────────────────────────────
2
+ NEWS_API_KEY=your_newsapi_key_here
3
+ GOOGLE_VISION_API_KEY=your_google_vision_key_here # Optional (alternative to Tesseract)
4
+
5
+ # ── Database ──────────────────────────────────────────────────────────────────
6
+ DATABASE_URL=postgresql+asyncpg://user:password@localhost:5432/philverify
7
+
8
+ # ── Redis Cache ───────────────────────────────────────────────────────────────
9
+ REDIS_URL=redis://localhost:6379/0
10
+
11
+ # ── App Settings ──────────────────────────────────────────────────────────────
12
+ APP_ENV=development # development | production
13
+ DEBUG=true
14
+ LOG_LEVEL=INFO
15
+ ALLOWED_ORIGINS=http://localhost:3000,http://localhost:5173
16
+
17
+ # ── Model Settings ────────────────────────────────────────────────────────────
18
+ # Options: xlm-roberta-base | joelito/roberta-tagalog-base | bert-base-multilingual-cased
19
+ ML_MODEL_NAME=xlm-roberta-base
20
+ WHISPER_MODEL_SIZE=base # base | medium | large-v3 (large-v3 for production)
21
+ USE_GPU=false
22
+
23
+ # ── Scoring Weights ───────────────────────────────────────────────────────────
24
+ ML_WEIGHT=0.40
25
+ EVIDENCE_WEIGHT=0.60
26
+ CREDIBLE_THRESHOLD=70.0
27
+ FAKE_THRESHOLD=40.0
.gitignore ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python
2
+ venv/
3
+ __pycache__/
4
+ *.py[cod]
5
+ *.pkl
6
+ *.egg-info/
7
+ dist/
8
+ build/
9
+
10
+ # Environment
11
+ .env
12
+
13
+ # Cache
14
+ .cache/
15
+ .pytest_cache/
16
+
17
+ # IDE
18
+ .vscode/
19
+ .idea/
20
+ *.swp
21
+
22
+ # OS
23
+ .DS_Store
24
+
25
+ # ML models (too large for git)
26
+ ml/models/*.pkl
27
+ ml/models/*.bin
28
+ ml/models/*.pt
api/__init__.py ADDED
File without changes
api/routes/__init__.py ADDED
File without changes
api/routes/history.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ PhilVerify — History Route
3
+ GET /history — Returns past verification logs with pagination.
4
+ """
5
+ import logging
6
+ from fastapi import APIRouter, Query
7
+ from api.schemas import HistoryResponse, HistoryEntry, Verdict
8
+
9
+ logger = logging.getLogger(__name__)
10
+ router = APIRouter(prefix="/history", tags=["History"])
11
+
12
+ # In-memory store for development. Will be replaced by DB queries in Phase 7.
13
+ _HISTORY: list[dict] = []
14
+
15
+
16
def record_verification(entry: dict) -> None:
    """Store a single verification result in the in-memory history log.

    Invoked by the scoring engine after each completed verification.
    """
    _HISTORY.append(entry)
19
+
20
+
21
@router.get(
    "",
    response_model=HistoryResponse,
    summary="Get verification history",
    description="Returns past verifications ordered by most recent. Supports pagination.",
)
async def get_history(
    page: int = Query(1, ge=1, description="Page number"),
    limit: int = Query(20, ge=1, le=100, description="Results per page"),
    verdict_filter: Verdict | None = Query(None, alias="verdict", description="Filter by verdict"),
) -> HistoryResponse:
    """Return one page of the in-memory verification history, newest first."""
    logger.info("GET /history | page=%d limit=%d", page, limit)

    # Newest entries first; optionally narrowed to a single verdict.
    ordered = _HISTORY[::-1]
    if verdict_filter:
        wanted = verdict_filter.value
        ordered = [item for item in ordered if item.get("verdict") == wanted]

    offset = (page - 1) * limit
    window = ordered[offset : offset + limit]

    models = []
    for item in window:
        models.append(
            HistoryEntry(
                id=item["id"],
                timestamp=item["timestamp"],
                input_type=item.get("input_type", "text"),
                text_preview=item.get("text_preview", "")[:120],
                verdict=Verdict(item["verdict"]),
                confidence=item["confidence"],
                final_score=item["final_score"],
            )
        )

    # `total` counts all matching entries, not just the returned page.
    return HistoryResponse(total=len(ordered), entries=models)
api/routes/trends.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ PhilVerify — Trends Route
3
+ GET /trends — Aggregates entities and topics from fake-news verifications.
4
+ """
5
+ import logging
6
+ from collections import Counter
7
+ from fastapi import APIRouter, Query
8
+ from api.schemas import TrendsResponse, TrendingEntity, TrendingTopic, Verdict
9
+
10
+ logger = logging.getLogger(__name__)
11
+ router = APIRouter(prefix="/trends", tags=["Trends"])
12
+
13
+ # Reads from the same in-memory store as history (Phase 7 → DB aggregation).
14
+ from api.routes.history import _HISTORY
15
+
16
+
17
@router.get(
    "",
    response_model=TrendsResponse,
    summary="Get trending entities & topics",
    description="Aggregates NER entities and topics from recent verifications. Useful for identifying fake-news patterns.",
)
async def get_trends(
    days: int = Query(7, ge=1, le=90, description="Lookback window in days"),
    limit: int = Query(10, ge=1, le=50, description="Max results per category"),
) -> TrendsResponse:
    """Aggregate trending entities and claim topics from the history store.

    NOTE(review): `days` is accepted and logged but never applied as a time
    filter — history entries are not filtered by timestamp here. Confirm
    whether this is intentional until the Phase 7 DB aggregation lands.
    """
    logger.info("GET /trends | days=%d", days)

    entity_counter: Counter = Counter()
    entity_type_map: dict[str, str] = {}
    entity_fake_counter: Counter = Counter()
    topic_counter: Counter = Counter()
    topic_verdict_map: dict[str, list[str]] = {}

    # History-entry key -> entity_type label exposed in the API response.
    entity_fields = (
        ("persons", "person"),
        ("organizations", "org"),
        ("locations", "location"),
    )

    for entry in _HISTORY:
        is_fake = entry.get("verdict") in (Verdict.LIKELY_FAKE.value, Verdict.UNVERIFIED.value)
        entities = entry.get("entities", {})

        # Single data-driven pass replaces three copy-pasted per-category loops.
        for field_name, type_label in entity_fields:
            for name in entities.get(field_name, []):
                entity_counter[name] += 1
                entity_type_map[name] = type_label
                if is_fake:
                    entity_fake_counter[name] += 1

        claim = entry.get("claim_used", "")
        if claim:
            # Truncate once; the 60-char prefix serves as the topic key.
            topic = claim[:60]
            topic_counter[topic] += 1
            topic_verdict_map.setdefault(topic, []).append(entry.get("verdict", "Unverified"))

    top_entities = [
        TrendingEntity(
            entity=entity,
            entity_type=entity_type_map.get(entity, "unknown"),
            count=count,
            fake_count=entity_fake_counter.get(entity, 0),
            # count >= 1 for anything the counter returns, so no div-by-zero.
            fake_ratio=round(entity_fake_counter.get(entity, 0) / count, 2),
        )
        for entity, count in entity_counter.most_common(limit)
    ]

    top_topics = [
        TrendingTopic(
            topic=topic,
            count=count,
            # Most frequent verdict observed for this topic; "Unverified" default.
            dominant_verdict=Verdict(
                Counter(topic_verdict_map.get(topic, ["Unverified"])).most_common(1)[0][0]
            ),
        )
        for topic, count in topic_counter.most_common(limit)
    ]

    return TrendsResponse(top_entities=top_entities, top_topics=top_topics)
api/routes/verify.py ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ PhilVerify — Verify Routes
3
+ POST /verify/text | /verify/url | /verify/image | /verify/video
4
+ All routes funnel through run_verification() in the scoring engine.
5
+ """
6
+ import time
7
+ import logging
8
+ from fastapi import APIRouter, HTTPException, UploadFile, File, status
9
+ from fastapi.responses import JSONResponse
10
+
11
+ from api.schemas import (
12
+ TextVerifyRequest,
13
+ URLVerifyRequest,
14
+ VerificationResponse,
15
+ ErrorResponse,
16
+ )
17
+ from scoring.engine import run_verification
18
+ from inputs.url_scraper import scrape_url
19
+ from inputs.ocr import extract_text_from_image
20
+ from inputs.asr import transcribe_video
21
+
22
+ logger = logging.getLogger(__name__)
23
+ router = APIRouter(prefix="/verify", tags=["Verification"])
24
+
25
+
26
+ # ── Text ──────────────────────────────────────────────────────────────────────
27
+
28
@router.post(
    "/text",
    response_model=VerificationResponse,
    summary="Verify raw text",
    description="Accepts plain text (Tagalog, English, or Taglish) and runs the full verification pipeline.",
)
async def verify_text(body: TextVerifyRequest) -> VerificationResponse:
    """Run the verification pipeline on raw text and stamp the elapsed time."""
    started_at = time.perf_counter()
    logger.info("verify/text called | chars=%d", len(body.text))
    try:
        response = await run_verification(body.text, input_type="text")
    except Exception as exc:
        logger.exception("verify/text error: %s", exc)
        raise HTTPException(status_code=500, detail=f"Verification failed: {exc}") from exc
    response.processing_time_ms = round((time.perf_counter() - started_at) * 1000, 1)
    return response
44
+
45
+
46
+ # ── URL ───────────────────────────────────────────────────────────────────────
47
+
48
@router.post(
    "/url",
    response_model=VerificationResponse,
    summary="Verify a URL",
    description="Scrapes the article text from the given URL, then runs the full verification pipeline.",
)
async def verify_url(body: URLVerifyRequest) -> VerificationResponse:
    """Scrape an article URL, then verify the extracted text."""
    started_at = time.perf_counter()
    target = str(body.url)
    logger.info("verify/url called | url=%s", target)
    try:
        text, domain = await scrape_url(target)
        # A near-empty scrape is a client-visible 422, not a server error.
        if not text or len(text.strip()) < 20:
            raise HTTPException(
                status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
                detail="Could not extract meaningful text from the URL. The page may be paywalled or bot-protected.",
            )
        response = await run_verification(text, input_type="url", source_domain=domain)
        response.processing_time_ms = round((time.perf_counter() - started_at) * 1000, 1)
        return response
    except HTTPException:
        # Re-raise our own HTTP errors untouched.
        raise
    except Exception as exc:
        logger.exception("verify/url error: %s", exc)
        raise HTTPException(status_code=500, detail=f"URL verification failed: {exc}") from exc
73
+
74
+
75
+ # ── Image ─────────────────────────────────────────────────────────────────────
76
+
77
@router.post(
    "/image",
    response_model=VerificationResponse,
    summary="Verify an image (OCR)",
    description="Accepts an uploaded image file. Runs Tesseract OCR to extract text, then verifies.",
)
async def verify_image(file: UploadFile = File(...)) -> VerificationResponse:
    """OCR an uploaded image and verify whatever text it contains."""
    started_at = time.perf_counter()
    logger.info("verify/image called | filename=%s | size=%s", file.filename, file.size)

    # Content-type gate runs before the body is even read.
    allowed_types = {"image/jpeg", "image/png", "image/webp", "image/gif", "image/bmp"}
    if file.content_type not in allowed_types:
        raise HTTPException(
            status_code=status.HTTP_415_UNSUPPORTED_MEDIA_TYPE,
            detail=f"Unsupported image type: {file.content_type}. Accepted: jpeg, png, webp, gif, bmp",
        )
    try:
        payload = await file.read()
        extracted = await extract_text_from_image(payload)
        # OCR that yields almost nothing is a client error, not a crash.
        if not extracted or len(extracted.strip()) < 10:
            raise HTTPException(
                status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
                detail="No readable text found in the image.",
            )
        response = await run_verification(extracted, input_type="image")
        response.processing_time_ms = round((time.perf_counter() - started_at) * 1000, 1)
        return response
    except HTTPException:
        raise
    except Exception as exc:
        logger.exception("verify/image error: %s", exc)
        raise HTTPException(status_code=500, detail=f"Image verification failed: {exc}") from exc
109
+
110
+
111
+ # ── Video ─────────────────────────────────────────────────────────────────────
112
+
113
@router.post(
    "/video",
    response_model=VerificationResponse,
    summary="Verify a video/audio (Whisper ASR)",
    description="Accepts a video or audio file. Runs Whisper ASR to transcribe, then verifies the transcript.",
)
async def verify_video(file: UploadFile = File(...)) -> VerificationResponse:
    """Transcribe an uploaded media file and verify the transcript."""
    started_at = time.perf_counter()
    logger.info("verify/video called | filename=%s", file.filename)

    allowed_types = {
        "video/mp4", "video/webm", "video/quicktime",
        "audio/mpeg", "audio/wav", "audio/ogg", "audio/mp4",
    }
    if file.content_type not in allowed_types:
        raise HTTPException(
            status_code=status.HTTP_415_UNSUPPORTED_MEDIA_TYPE,
            detail=f"Unsupported media type: {file.content_type}",
        )
    try:
        payload = await file.read()
        transcript = await transcribe_video(payload, filename=file.filename or "upload")
        # An empty/near-empty transcript is a client-visible 422.
        if not transcript or len(transcript.strip()) < 10:
            raise HTTPException(
                status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
                detail="Could not transcribe meaningful speech from the media file.",
            )
        response = await run_verification(transcript, input_type="video")
        response.processing_time_ms = round((time.perf_counter() - started_at) * 1000, 1)
        return response
    except HTTPException:
        raise
    except Exception as exc:
        logger.exception("verify/video error: %s", exc)
        raise HTTPException(status_code=500, detail=f"Video verification failed: {exc}") from exc
api/schemas.py ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ PhilVerify — Pydantic Request / Response Schemas
3
+ Matches the structured JSON output format from the system spec.
4
+ """
5
+ from __future__ import annotations
6
+
7
+ from enum import Enum
8
+ from typing import Optional
9
+ from pydantic import BaseModel, HttpUrl, Field
10
+
11
+
12
+ # ── Enums ─────────────────────────────────────────────────────────────────────
13
+
14
class Verdict(str, Enum):
    """Final classification label produced by the verification pipeline."""
    CREDIBLE = "Credible"
    UNVERIFIED = "Unverified"
    LIKELY_FAKE = "Likely Fake"
18
+
19
+
20
class Stance(str, Enum):
    """How a retrieved evidence article relates to the input claim."""
    SUPPORTS = "Supports"
    REFUTES = "Refutes"
    NOT_ENOUGH_INFO = "Not Enough Info"
24
+
25
+
26
class Language(str, Enum):
    """Detected language of the input text."""
    TAGALOG = "Tagalog"
    ENGLISH = "English"
    TAGLISH = "Taglish"  # Mixed Tagalog/English
    UNKNOWN = "Unknown"
31
+
32
+
33
class Sentiment(str, Enum):
    """Sentiment buckets reported alongside a verification result.

    Note the lowercase values — these differ in casing convention from the
    Title-Case Verdict/Stance enums.
    """
    POSITIVE = "positive"
    NEGATIVE = "negative"
    NEUTRAL = "neutral"
    HIGH_POSITIVE = "high positive"
    HIGH_NEGATIVE = "high negative"
39
+
40
+
41
class DomainTier(int, Enum):
    """Source-domain credibility tier (mirrors domain_credibility.json tiers 1-4)."""
    CREDIBLE = 1
    SATIRE_OPINION = 2
    SUSPICIOUS = 3
    KNOWN_FAKE = 4
46
+
47
+
48
+ # ── Request Models ─────────────────────────────────────────────────────────────
49
+
50
class TextVerifyRequest(BaseModel):
    """Request body for POST /verify/text."""
    text: str = Field(..., min_length=10, max_length=10_000, description="Raw text to verify")
52
+
53
+
54
class URLVerifyRequest(BaseModel):
    """Request body for POST /verify/url. HttpUrl validates scheme/host."""
    url: HttpUrl = Field(..., description="URL of the news article or social media post")
56
+
57
+
58
+ # ── Nested Response Models ────────────────────────────────────────────────────
59
+
60
class EntitiesResult(BaseModel):
    """Named entities extracted from the input text, grouped by NER category.

    NOTE: mutable [] defaults are safe on Pydantic models — Pydantic copies
    field defaults per instance (unlike plain Python classes).
    """
    persons: list[str] = []
    organizations: list[str] = []
    locations: list[str] = []
    dates: list[str] = []
65
+
66
+
67
class Layer1Result(BaseModel):
    """Output of Layer 1 — the ML classifier's verdict on the text alone."""
    verdict: Verdict
    confidence: float = Field(..., ge=0.0, le=100.0, description="Confidence % from ML classifier")
    triggered_features: list[str] = Field(
        default_factory=list,
        description="Human-readable list of suspicious features detected",
    )
74
+
75
+
76
class EvidenceSource(BaseModel):
    """One external article used as evidence for or against the claim."""
    title: str
    url: str
    similarity: float = Field(..., ge=0.0, le=1.0, description="Cosine similarity to input claim")
    stance: Stance
    domain_tier: DomainTier
    # Optional metadata — not all evidence providers supply these.
    published_at: Optional[str] = None
    source_name: Optional[str] = None
84
+
85
+
86
class Layer2Result(BaseModel):
    """Output of Layer 2 — the evidence-retrieval verdict and its sources."""
    verdict: Verdict
    evidence_score: float = Field(..., ge=0.0, le=100.0)
    sources: list[EvidenceSource] = []
    claim_used: Optional[str] = Field(None, description="Extracted claim sent to evidence search")
91
+
92
+
93
+ # ── Main Response ─────────────────────────────────────────────────────────────
94
+
95
class VerificationResponse(BaseModel):
    """Top-level response for all /verify/* endpoints.

    Combines the blended verdict with per-layer breakdowns and NLP metadata.
    """
    verdict: Verdict
    confidence: float = Field(..., ge=0.0, le=100.0)
    final_score: float = Field(..., ge=0.0, le=100.0)
    layer1: Layer1Result
    layer2: Layer2Result
    entities: EntitiesResult
    # Plain strings rather than the Sentiment enum — presumably to allow
    # emotion labels outside the enum; confirm before tightening the type.
    sentiment: str
    emotion: str
    language: Language
    domain_credibility: Optional[DomainTier] = None  # Only set for URL inputs
    input_type: str = "text"  # "text" | "url" | "image" | "video"
    processing_time_ms: Optional[float] = None  # Filled in by the route handlers
108
+
109
+
110
+ # ── History / Trends ──────────────────────────────────────────────────────────
111
+
112
class HistoryEntry(BaseModel):
    """Summary of one past verification, as returned by GET /history."""
    id: str
    timestamp: str
    input_type: str
    text_preview: str  # Truncated to 120 chars by the history route
    verdict: Verdict
    confidence: float
    final_score: float
120
+
121
+
122
class HistoryResponse(BaseModel):
    """Paginated history listing. `total` counts all matches, not just this page."""
    total: int
    entries: list[HistoryEntry]
125
+
126
+
127
class TrendingEntity(BaseModel):
    """Aggregate counts for one named entity across recent verifications."""
    entity: str
    entity_type: str  # person | org | location
    count: int
    fake_count: int  # How many of those mentions came from fake/unverified items
    fake_ratio: float  # fake_count / count, rounded to 2 decimals
133
+
134
+
135
class TrendingTopic(BaseModel):
    """Aggregate counts for one claim topic (60-char claim prefix)."""
    topic: str
    count: int
    dominant_verdict: Verdict  # Most frequent verdict among entries with this topic
139
+
140
+
141
class TrendsResponse(BaseModel):
    """Response for GET /trends — top entities and topics, each capped by `limit`."""
    top_entities: list[TrendingEntity]
    top_topics: list[TrendingTopic]
144
+
145
+
146
+ # ── Error ─────────────────────────────────────────────────────────────────────
147
+
148
class ErrorResponse(BaseModel):
    """Generic error envelope for documented non-2xx responses."""
    error: str
    detail: Optional[str] = None
    code: Optional[str] = None
config.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ PhilVerify — Application Settings
3
+ Loaded via pydantic-settings from environment variables / .env file.
4
+ """
5
+ from functools import lru_cache
6
+ from pydantic_settings import BaseSettings, SettingsConfigDict
7
+
8
+
9
class Settings(BaseSettings):
    """Central application configuration, loaded from env vars and the .env file.

    Field names map case-insensitively to environment variables
    (e.g. NEWS_API_KEY -> news_api_key); unknown variables are ignored.
    """

    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        case_sensitive=False,
        extra="ignore",
    )

    # ── API Keys ──────────────────────────────────────────────────────────────
    news_api_key: str = ""  # Empty disables NewsAPI evidence retrieval
    google_vision_api_key: str = ""  # Optional alternative to Tesseract OCR

    # ── Database ──────────────────────────────────────────────────────────────
    database_url: str = "sqlite+aiosqlite:///./philverify_dev.db"  # Dev fallback

    # ── Redis ─────────────────────────────────────────────────────────────────
    redis_url: str = ""  # Empty = disable caching

    # ── App ───────────────────────────────────────────────────────────────────
    app_env: str = "development"  # "development" | "production"
    debug: bool = True
    log_level: str = "INFO"
    allowed_origins: list[str] = [
        "http://localhost:3000",
        "http://localhost:5173",
    ]

    # ── ML Models ─────────────────────────────────────────────────────────────
    ml_model_name: str = "xlm-roberta-base"
    whisper_model_size: str = "base"  # base | medium | large-v3
    use_gpu: bool = False

    # ── Scoring Weights ───────────────────────────────────────────────────────
    # ML vs evidence blend weights. NOTE(review): they appear intended to sum
    # to 1.0 but nothing validates that here — confirm in the scoring engine.
    ml_weight: float = 0.40
    evidence_weight: float = 0.60
    credible_threshold: float = 70.0
    fake_threshold: float = 40.0

    @property
    def is_production(self) -> bool:
        """True when running with APP_ENV=production."""
        return self.app_env == "production"
50
+
51
+
52
@lru_cache
def get_settings() -> Settings:
    """Build the Settings object once; every later call reuses the cached instance."""
    settings = Settings()
    return settings
domain_credibility.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "tier1": {
3
+ "description": "Established credible Philippine news organizations",
4
+ "score": 100,
5
+ "domains": [
6
+ "rappler.com", "inquirer.net", "gmanetwork.com", "abs-cbn.com",
7
+ "mb.com.ph", "philstar.com", "manilatimes.net", "sunstar.com.ph",
8
+ "businessmirror.com.ph", "bworldonline.com", "pna.gov.ph",
9
+ "doh.gov.ph", "official.deped.gov.ph", "senate.gov.ph", "congress.gov.ph"
10
+ ]
11
+ },
12
+ "tier2": {
13
+ "description": "Satire, opinion blogs, or entertainment sites",
14
+ "score": 50,
15
+ "domains": [
16
+ "knowyourmeme.com", "9gag.com", "buzzfeed.com",
17
+ "opinion.inquirer.net", "interaksyon.com"
18
+ ]
19
+ },
20
+ "tier3": {
21
+ "description": "Unknown / unverified sources — newly registered or low-authority",
22
+ "score": 25,
23
+ "domains": []
24
+ },
25
+ "tier4": {
26
+ "description": "Known fake news / misinformation sites (Vera Files blacklist)",
27
+ "score": 0,
28
+ "domains": [
29
+ "duterte.news", "pinoyakoblog.com", "filipinonewsalert.com",
30
+ "pilipinostar.com", "pinoytrending.net", "maharlikanews.com"
31
+ ]
32
+ }
33
+ }
evidence/__init__.py ADDED
File without changes
evidence/news_fetcher.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ PhilVerify — Evidence Retrieval Module
3
+ Fetches related articles from NewsAPI, computes cosine similarity,
4
+ and produces an evidence score for Layer 2 of the scoring engine.
5
+ """
6
+ import logging
7
+ import hashlib
8
+ from dataclasses import dataclass, field
9
+ from pathlib import Path
10
+ import json
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+ # Simple file-based cache to respect NewsAPI 100 req/day free tier limit
15
+ _CACHE_DIR = Path(__file__).parent.parent / ".cache" / "newsapi"
16
+ _CACHE_DIR.mkdir(parents=True, exist_ok=True)
17
+
18
+
19
@dataclass
class ArticleResult:
    """One candidate evidence article from NewsAPI, enriched with scoring fields."""

    title: str
    url: str
    description: str
    source_name: str
    published_at: str
    similarity: float = 0.0  # Cosine/Jaccard similarity to the claim, in [0, 1]
    stance: str = "Supports"/"Refutes"-style label; defaults below
    domain_tier: int = 3
29
+
30
+
31
@dataclass
class EvidenceResult:
    """Aggregate Layer-2 outcome computed from the retrieved articles."""

    verdict: str  # "Supported" | "Contradicted" | "Insufficient"
    evidence_score: float  # 0–100
    sources: list[ArticleResult] = field(default_factory=list)
    claim_used: str = ""  # Claim text actually sent to the evidence search
37
+
38
+
39
+ def _cache_key(claim: str) -> str:
40
+ return hashlib.md5(claim.lower().strip().encode()).hexdigest()
41
+
42
+
43
def _load_cache(key: str) -> list[dict] | None:
    """Return the cached NewsAPI payload for *key*, or None on miss/corruption."""
    cache_file = _CACHE_DIR / f"{key}.json"
    if not cache_file.exists():
        return None
    try:
        return json.loads(cache_file.read_text())
    except Exception:
        # Unreadable or corrupt cache files count as a miss.
        return None
51
+
52
+
53
def _save_cache(key: str, data: list[dict]) -> None:
    """Persist the NewsAPI payload for *key* to the file cache."""
    (_CACHE_DIR / f"{key}.json").write_text(json.dumps(data))
56
+
57
+
58
async def fetch_evidence(claim: str, api_key: str, max_results: int = 5) -> list[dict]:
    """Fetch top articles from NewsAPI for the given claim. Cached.

    Results are cached on disk, keyed by a hash of the normalized claim, to
    stay within the free-tier request quota. Returns [] when no API key is
    configured or the remote call fails.
    """
    cache_id = _cache_key(claim)
    hit = _load_cache(cache_id)
    if hit is not None:
        logger.info("NewsAPI cache hit for claim hash %s", cache_id[:8])
        return hit

    if not api_key:
        logger.warning("NEWS_API_KEY not set — returning empty evidence")
        return []

    try:
        from newsapi import NewsApiClient

        # Use first 100 chars of claim as query
        query = claim[:100]
        response = NewsApiClient(api_key=api_key).get_everything(
            q=query,
            language="en",
            sort_by="relevancy",
            page_size=max_results,
        )
        articles = response.get("articles", [])
        _save_cache(cache_id, articles)
        logger.info("NewsAPI returned %d articles for query '%s...'", len(articles), query[:30])
        return articles
    except Exception as e:
        # Best-effort: evidence absence must not break the whole pipeline.
        logger.warning("NewsAPI fetch error: %s", e)
        return []
88
+
89
+
90
# Lazily-created sentence-transformers model, shared across calls.
# The original code instantiated SentenceTransformer on EVERY call, paying
# the full model-load cost each time similarity was computed.
_SENTENCE_MODEL = None


def compute_similarity(claim: str, article_text: str) -> float:
    """
    Compute cosine similarity between claim and article using sentence-transformers.
    Falls back to simple word-overlap Jaccard similarity.

    Returns a float in [0, 1], rounded to 3 decimals. Only the first 512
    characters of the article are embedded.
    """
    global _SENTENCE_MODEL
    try:
        from sentence_transformers import SentenceTransformer, util

        if _SENTENCE_MODEL is None:
            # Load once and reuse; the model is stateless for encoding.
            _SENTENCE_MODEL = SentenceTransformer("all-MiniLM-L6-v2")
        emb_claim = _SENTENCE_MODEL.encode(claim, convert_to_tensor=True)
        emb_article = _SENTENCE_MODEL.encode(article_text[:512], convert_to_tensor=True)
        score = float(util.cos_sim(emb_claim, emb_article)[0][0])
        return round(max(0.0, min(1.0, score)), 3)
    except Exception:
        # Jaccard fallback (also taken when sentence-transformers is absent).
        a = set(claim.lower().split())
        b = set(article_text.lower().split())
        if not a or not b:
            return 0.0
        return round(len(a & b) / len(a | b), 3)
inputs/__init__.py ADDED
File without changes
inputs/asr.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ PhilVerify — Whisper ASR Module
3
+ Transcribes video/audio files using OpenAI Whisper.
4
+ Recommended model: large-v3 (best Filipino speech accuracy).
5
+ """
6
+ import io
7
+ import logging
8
+ import tempfile
9
+ import os
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
async def transcribe_video(media_bytes: bytes, filename: str = "upload") -> str:
    """
    Transcribe audio/video bytes using Whisper.

    Whisper only accepts file paths, so the bytes are spilled to a temporary
    file that is deleted afterwards. Returns the transcript, or "" when
    Whisper is unavailable or transcription fails.
    """
    try:
        import whisper
        from config import get_settings

        size = get_settings().whisper_model_size
        logger.info("Loading Whisper model: %s", size)
        model = whisper.load_model(size)

        # Preserve the upload's extension so ffmpeg can sniff the container.
        ext = os.path.splitext(filename)[-1] or ".mp4"
        with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as handle:
            handle.write(media_bytes)
            temp_path = handle.name

        try:
            result = model.transcribe(temp_path, language=None)  # Auto-detect language
            transcript = result.get("text", "").strip()
            logger.info("Whisper transcribed %d chars (lang=%s)", len(transcript), result.get("language"))
            return transcript
        finally:
            os.unlink(temp_path)  # Clean up temp file

    except ImportError:
        logger.warning("openai-whisper not installed — ASR unavailable")
        return ""
    except Exception as e:
        logger.error("Whisper transcription failed: %s", e)
        return ""
inputs/ocr.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ PhilVerify — OCR Module (Tesseract)
3
+ Extracts text from images using pytesseract.
4
+ Falls back gracefully if Tesseract not installed.
5
+ """
6
+ import io
7
+ import logging
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+ # Supported languages: Filipino (fil) + English (eng)
12
+ _TESSERACT_LANG = "fil+eng"
13
+
14
+
15
+ async def extract_text_from_image(image_bytes: bytes) -> str:
16
+ """
17
+ Run Tesseract OCR on image bytes. Returns extracted text string.
18
+ """
19
+ try:
20
+ import pytesseract
21
+ from PIL import Image
22
+
23
+ image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
24
+ text = pytesseract.image_to_string(image, lang=_TESSERACT_LANG)
25
+ text = text.strip()
26
+ logger.info("OCR extracted %d chars from image", len(text))
27
+ return text
28
+ except ImportError:
29
+ logger.warning("pytesseract / Pillow not installed — OCR unavailable")
30
+ return ""
31
+ except Exception as e:
32
+ logger.error("OCR failed: %s", e)
33
+ return ""
inputs/url_scraper.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ PhilVerify — URL Scraper (BeautifulSoup)
3
+ Extracts article text from news URLs. Respects robots.txt.
4
+ """
5
+ import logging
6
+ import re
7
+ from urllib.parse import urlparse
8
+ from urllib.robotparser import RobotFileParser
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+ _UNWANTED_TAGS = {"script", "style", "nav", "footer", "header", "aside", "figure", "figcaption"}
13
+
14
+
15
+ def _get_domain(url: str) -> str:
16
+ return urlparse(url).netloc.replace("www.", "")
17
+
18
+
19
+ def _robots_allow(url: str) -> bool:
20
+ try:
21
+ parsed = urlparse(url)
22
+ robots_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt"
23
+ rp = RobotFileParser()
24
+ rp.set_url(robots_url)
25
+ rp.read()
26
+ return rp.can_fetch("*", url)
27
+ except Exception:
28
+ return True # Allow by default if robots.txt fetch fails
29
+
30
+
31
async def scrape_url(url: str) -> tuple[str, str]:
    """
    Returns (article_text, domain).
    Raises ValueError if robots.txt disallows scraping.

    On any fetch/parse failure, returns ("", domain) rather than raising,
    so callers can surface a 422 instead of a crash.
    """
    domain = _get_domain(url)

    if not _robots_allow(url):
        logger.warning("robots.txt disallows scraping %s", url)
        raise ValueError(f"Scraping disallowed by robots.txt for {domain}")

    try:
        import httpx
        from bs4 import BeautifulSoup

        headers = {"User-Agent": "PhilVerifyBot/1.0 (fact-checking research)"}
        async with httpx.AsyncClient(timeout=15, follow_redirects=True) as client:
            response = await client.get(url, headers=headers)
            response.raise_for_status()

        soup = BeautifulSoup(response.text, "lxml")

        # Strip navigation/boilerplate elements before extracting paragraphs.
        for element in soup(list(_UNWANTED_TAGS)):
            element.decompose()

        # Prefer semantic containers; fall back to the whole body.
        container = soup.find("article") or soup.find("main") or soup.body
        if container is None:
            return "", domain

        pieces = [p.get_text(separator=" ", strip=True) for p in container.find_all("p")]
        article_text = re.sub(r"\s+", " ", " ".join(pieces)).strip()

        logger.info("Scraped %d chars from %s", len(article_text), domain)
        return article_text, domain

    except Exception as e:
        logger.error("URL scraping failed for %s: %s", url, e)
        return "", domain
main.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ PhilVerify — FastAPI Application Entry Point
3
+ Run: uvicorn main:app --reload --port 8000
4
+ Docs: http://localhost:8000/docs
5
+ """
6
+ import logging
7
+ import os
8
+ from contextlib import asynccontextmanager
9
+
10
+ from fastapi import FastAPI, Request, status
11
+ from fastapi.middleware.cors import CORSMiddleware
12
+ from fastapi.responses import JSONResponse
13
+
14
+ from config import get_settings
15
+ from api.routes.verify import router as verify_router
16
+ from api.routes.history import router as history_router
17
+ from api.routes.trends import router as trends_router
18
+
19
+ # ── Logging ───────────────────────────────────────────────────────────────────
20
+ logging.basicConfig(
21
+ level=getattr(logging, get_settings().log_level.upper(), logging.INFO),
22
+ format="%(asctime)s | %(levelname)-8s | %(name)s | %(message)s",
23
+ )
24
+ logger = logging.getLogger("philverify")
25
+
26
+
27
+ # ── Lifespan (startup / shutdown) ─────────────────────────────────────────────
28
+
29
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Warm up NLP models on startup so first request isn't slow.

    Loaded objects are parked on app.state so route handlers can reuse them
    without re-loading per request.
    """
    logger.info("🚀 PhilVerify starting up...")
    try:
        # Lazy-import to avoid crashing if heavy deps not yet installed
        from nlp.language_detector import LanguageDetector
        from nlp.preprocessor import TextPreprocessor
        from ml.tfidf_classifier import TFIDFClassifier

        app.state.preprocessor = TextPreprocessor()
        app.state.language_detector = LanguageDetector()
        classifier = TFIDFClassifier()
        classifier.train()  # Trains on seed dataset if model not persisted
        app.state.classifier = classifier

        logger.info("✅ NLP models ready")
    except ImportError as e:
        # Missing optional NLP dependencies downgrade to stub behavior
        # instead of preventing the API from booting.
        logger.warning("⚠️ Some NLP modules not installed yet: %s — stubs will be used", e)

    yield  # ── App is running ──

    logger.info("👋 PhilVerify shutting down")
52
+
53
+
54
+ # ── App ───────────────────────────────────────────────────────────────────────
55
+
56
+ settings = get_settings()
57
+
58
+ app = FastAPI(
59
+ title="PhilVerify API",
60
+ description=(
61
+ "Multimodal fake news detection for Philippine social media. "
62
+ "Supports text, URL, image (OCR), and video (Whisper ASR) inputs."
63
+ ),
64
+ version="0.1.0",
65
+ docs_url="/docs",
66
+ redoc_url="/redoc",
67
+ lifespan=lifespan,
68
+ )
69
+
70
+
71
+ # ── CORS ──────────────────────────────────────────────────────────────────────
72
+
73
+ app.add_middleware(
74
+ CORSMiddleware,
75
+ allow_origins=settings.allowed_origins,
76
+ allow_credentials=True,
77
+ allow_methods=["*"],
78
+ allow_headers=["*"],
79
+ )
80
+
81
+
82
+ # ── Global Error Handler ──────────────────────────────────────────────────────
83
+
84
@app.exception_handler(Exception)
async def global_exception_handler(request: Request, exc: Exception):
    """Last-resort handler: log any uncaught error and return a generic 500."""
    logger.exception("Unhandled error on %s %s: %s", request.method, request.url.path, exc)
    payload = {"error": "Internal server error", "detail": str(exc)}
    return JSONResponse(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, content=payload)
91
+
92
+
93
+ # ── Routers ───────────────────────────────────────────────────────────────────
94
+
95
+ app.include_router(verify_router)
96
+ app.include_router(history_router)
97
+ app.include_router(trends_router)
98
+
99
+
100
+ # ── Health ────────────────────────────────────────────────────────────────────
101
+
102
+ @app.get("/", tags=["Health"])
103
+ async def root():
104
+ return {
105
+ "service": "PhilVerify",
106
+ "version": "0.1.0",
107
+ "status": "operational",
108
+ "docs": "/docs",
109
+ }
110
+
111
+
112
+ @app.get("/health", tags=["Health"])
113
+ async def health():
114
+ return {"status": "ok", "env": settings.app_env}
115
+
116
+
117
+ # ── Dev runner ────────────────────────────────────────────────────────────────
118
+
119
+ if __name__ == "__main__":
120
+ import uvicorn
121
+ uvicorn.run(
122
+ "main:app",
123
+ host="0.0.0.0",
124
+ port=int(os.getenv("PORT", 8000)),
125
+ reload=settings.debug,
126
+ log_level=settings.log_level.lower(),
127
+ )
ml/__init__.py ADDED
File without changes
ml/tfidf_classifier.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ PhilVerify — TF-IDF + Logistic Regression Baseline Classifier (Layer 1)
3
+ Seed dataset of 30 labeled PH news headlines (10 per class).
4
+ Replaced by fine-tuned XLM-RoBERTa in Phase 10.
5
+ """
6
+ import os
7
+ import logging
8
+ import pickle
9
+ from dataclasses import dataclass, field
10
+ from pathlib import Path
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+ MODEL_PATH = Path(__file__).parent / "models" / "tfidf_model.pkl"
15
+
16
# ── Seed dataset (30 samples — 10 per class) ──────────────────────────────────
# Labels: 0=Credible, 1=Unverified, 2=Fake
# (Integer labels map to verdict strings via TFIDFClassifier._LABELS.)
SEED_DATA = [
    # Credible (0)
    ("DOH reports 500 new COVID-19 cases as vaccination drive continues in Metro Manila", 0),
    ("Rappler: Supreme Court upholds Comelec ruling on disqualification case", 0),
    ("GMA News: PNP arrests 12 suspects in Bulacan drug bust", 0),
    ("Philippine Star: GDP growth slows to 5.3% in Q3 says BSP", 0),
    ("Inquirer: Senate passes revised anti-terrorism bill on third reading", 0),
    ("Manila Bulletin: Typhoon Carina leaves P2B damage in Isabela province", 0),
    ("ABS-CBN News: Marcos signs executive order on agricultural modernization", 0),
    ("DOF confirms revenue collection targets met for fiscal year 2025", 0),
    ("DSWD distributes relief packs to 10,000 families in Cotabato", 0),
    ("PhilStar: Meralco rate hike of P0.18 per kilowatt-hour approved by ERC", 0),

    # Unverified (1)
    ("SHOCKING: Politician caught taking selfie during Senate hearing", 1),
    ("VIRAL: Celebrity spotted at secret meeting with government official", 1),
    ("BREAKING: 'Anonymous source' says president planning cabinet reshuffle", 1),
    ("Rumor has it: New tax policy to affect OFW remittances starting 2026", 1),
    ("CLAIM: Government hiding true COVID-19 death count from public", 1),
    ("Unconfirmed: Military says there are 500 rebels still in Mindanao", 1),
    ("REPORT: Certain barangay officials accepting bribes according to residents", 1),
    ("Alleged: Shipment of smuggled goods found in Manila port last week", 1),
    ("CLAIM: New mandatory vaccine policy for all government employees", 1),
    ("Source says: Manila Water to increase rates by 20% next month", 1),

    # Fake (2)
    ("GRABE! Namatay daw ang tatlong tao sa bagong sakit na kumakalat sa Pilipinas!", 2),
    ("TOTOO BA? Marcos nagsabi na libreng kuryente na simula bukas!", 2),
    ("SHOCKING TRUTH: Bill Gates microchip found in COVID vaccine in Cebu!", 2),
    ("WATCH: Senator caught stealing money in Senate vault - full video", 2),
    ("CONFIRMED: Philippines to become 51st state of the United States in 2026!", 2),
    ("KATOTOHANAN: DOH secretly poisoning water supply to control population", 2),
    ("EXPOSED: Duterte has secret family in Davao that government is hiding", 2),
    ("100% TOTOO: Garlic cures COVID-19, doctors don't want you to know this!", 2),
    ("GALING NG PILIPINAS: Filipino scientist discovers cure for cancer, suppressed by big pharma", 2),
    ("BREAKING: Entire Luzon to experience 3-day total blackout next week, says NGCP", 2),
]
55
+
56
+
57
@dataclass
class Layer1Result:
    """Outcome of the Layer-1 (baseline TF-IDF) classification pass."""
    verdict: str  # "Credible" | "Unverified" | "Likely Fake" (see TFIDFClassifier._LABELS)
    confidence: float  # 0.0 – 100.0 (max class probability × 100)
    triggered_features: list[str] = field(default_factory=list)  # top TF-IDF terms present in the input
62
+
63
+
64
class TFIDFClassifier:
    """
    TF-IDF + Logistic Regression baseline (Layer 1).

    train() fits on the 30-headline seed dataset and persists the fitted
    vectorizer + classifier to MODEL_PATH; if an artifact already exists it
    is loaded instead. predict() lazily triggers train() on first use.
    """

    # Mapping from integer class labels (see SEED_DATA) to verdict strings.
    _LABELS = {0: "Credible", 1: "Unverified", 2: "Likely Fake"}

    def __init__(self):
        # Both stay None until train()/_load() runs (lazy initialization).
        self._vectorizer = None
        self._clf = None

    def train(self) -> None:
        """Fit on seed data. Skips training if persisted model exists."""
        if MODEL_PATH.exists():
            self._load()
            return

        # Imported lazily so the module imports cleanly without sklearn.
        from sklearn.feature_extraction.text import TfidfVectorizer
        from sklearn.linear_model import LogisticRegression

        texts, labels = zip(*SEED_DATA)
        self._vectorizer = TfidfVectorizer(
            ngram_range=(1, 2),  # unigrams + bigrams
            max_features=1000,
            sublinear_tf=True,  # 1 + log(tf) damping
        )
        X = self._vectorizer.fit_transform(texts)
        # Fixed random_state keeps training reproducible on the tiny seed set.
        self._clf = LogisticRegression(max_iter=500, C=1.0, random_state=42)
        self._clf.fit(X, labels)

        MODEL_PATH.parent.mkdir(parents=True, exist_ok=True)
        with open(MODEL_PATH, "wb") as f:
            pickle.dump({"vectorizer": self._vectorizer, "clf": self._clf}, f)
        logger.info("TF-IDF model trained and saved to %s", MODEL_PATH)

    def _load(self) -> None:
        """Restore the persisted {vectorizer, clf} dict from MODEL_PATH.

        NOTE(review): pickle.load executes code embedded in the file — safe
        only because the artifact is produced locally by train(); never point
        MODEL_PATH at untrusted data.
        """
        with open(MODEL_PATH, "rb") as f:
            data = pickle.load(f)
        self._vectorizer = data["vectorizer"]
        self._clf = data["clf"]
        logger.info("TF-IDF model loaded from %s", MODEL_PATH)

    def predict(self, text: str) -> Layer1Result:
        """Classify *text*; returns verdict, confidence (%) and top terms."""
        # Lazy bootstrap: the first call trains (or loads) the model.
        if self._vectorizer is None:
            self.train()

        X = self._vectorizer.transform([text])
        pred_label = int(self._clf.predict(X)[0])
        proba = self._clf.predict_proba(X)[0]
        confidence = round(float(max(proba)) * 100, 1)
        verdict = self._LABELS[pred_label]

        # Extract top TF-IDF features as human-readable triggers
        feature_names = self._vectorizer.get_feature_names_out()
        tfidf_scores = X.toarray()[0]
        top_indices = tfidf_scores.argsort()[-5:][::-1]
        triggered = [feature_names[i] for i in top_indices if tfidf_scores[i] > 0]

        return Layer1Result(
            verdict=verdict,
            confidence=confidence,
            triggered_features=triggered,
        )
nlp/__init__.py ADDED
File without changes
nlp/claim_extractor.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ PhilVerify — Claim Extractor
3
+ Extracts the key falsifiable claim from noisy social media text.
4
+ Primary: HuggingFace summarization (t5-small)
5
+ Fallback: First 2 sentence heuristic
6
+ """
7
+ import re
8
+ import logging
9
+ from dataclasses import dataclass
10
+
11
logger = logging.getLogger(__name__)

# Sentence boundary: split after ., ! or ? followed by whitespace.
_SENTENCE_SPLIT = re.compile(r"(?<=[.!?])\s+")


@dataclass
class ClaimResult:
    """A single extracted claim plus the strategy that produced it."""
    claim: str
    method: str  # "summarization" | "sentence_heuristic" | "passthrough"


class ClaimExtractor:
    """
    Extracts the single most falsifiable claim from input text.
    This claim is then sent to the NewsAPI evidence retrieval step.

    Primary: HuggingFace summarization (distilbart) with a task prefix that
    biases the model toward extracting assertions rather than summaries.
    Fallback: first 1-2 long-enough sentences.
    """

    _TASK_PREFIX = "Extract the main factual claim: "

    def __init__(self):
        self._pipe = None     # summarization pipeline, or None if unavailable
        self._loaded = False  # guards the one-time lazy model load

    def _load_model(self):
        """Lazily load the summarization pipeline; degrade gracefully."""
        if self._loaded:
            return
        try:
            from transformers import pipeline
            self._pipe = pipeline(
                "summarization",
                model="sshleifer/distilbart-cnn-6-6",
                max_length=80,
                min_length=10,
                do_sample=False,
            )
            logger.info("Claim extractor model loaded (distilbart-cnn-6-6)")
        except Exception as e:
            logger.warning("Summarization model not available (%s) — using sentence heuristic", e)
        self._loaded = True

    def _sentence_heuristic(self, text: str) -> str:
        """Return the first 1-2 sentences as the claim (fast fallback)."""
        sentences = _SENTENCE_SPLIT.split(text.strip())
        candidates = [s.strip() for s in sentences if len(s.strip()) > 20]
        if not candidates:
            return text[:200].strip()
        return " ".join(candidates[:2])

    def extract(self, text: str) -> ClaimResult:
        """Extract the key claim from *text*.

        Empty/None or very short input is returned unchanged with
        method="passthrough".
        """
        self._load_model()

        # Guard: None or too-short input — nothing falsifiable to extract.
        # FIX: the original called text.strip() here, which raised
        # AttributeError when text was None.
        if not text or len(text.strip()) < 20:
            return ClaimResult(claim=(text or "").strip(), method="passthrough")

        if self._pipe:
            try:
                input_text = self._TASK_PREFIX + text[:1024]
                out = self._pipe(input_text, truncation=True)
                claim = out[0]["summary_text"].strip()
                # Strip the task prefix echo if model includes it
                claim = re.sub(r"^extract the main factual claim:?\s*", "", claim, flags=re.I)
                if len(claim) > 15:
                    return ClaimResult(claim=claim, method="summarization")
            except Exception as e:
                logger.warning("Summarization inference error: %s", e)

        return ClaimResult(
            claim=self._sentence_heuristic(text),
            method="sentence_heuristic",
        )
nlp/clickbait.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ PhilVerify — Clickbait Detector
3
+ Detects clickbait patterns common in Philippine fake news / viral content.
4
+ Uses regex patterns + feature flags (no model needed).
5
+ """
6
+ import re
7
+ from dataclasses import dataclass, field
8
+
9
# ── Pattern library ───────────────────────────────────────────────────────────
_CLICKBAIT_PHRASES_EN = [
    r"\byou won'?t believe\b", r"\bshocking\b", r"\bviral\b", r"\bbreaking\b",
    r"\bexclusive\b", r"\bmust[\s-]?see\b", r"\bsecret\b", r"\bconfirmed\b",
    r"\bexposed\b", r"\bscandal\b", r"\bunbelievable\b", r"\bmiraculous?\b",
    r"\bhoax\b", r"\bfact[\s-]?check\b", r"\bthis is why\b", r"\bwatch this\b",
]
_CLICKBAIT_PHRASES_TL = [
    r"\bgrabe\b", r"\bwow\b", r"\bsurprise\b", r"\bshocking\b", r"\btrending\b",
    r"\bselo\b", r"\bbalita\b", r"\bnatuklasan\b", r"\bnahuli\b", r"\bsikat\b",
    r"\bpakinggan\b", r"\bpanoorin\b", r"\bkumpirmado\b", r"\bkatotohanan\b",
]

_CAPS_WORD = re.compile(r"\b[A-Z]{2,}\b")
_EXCESSIVE_PUNCT = re.compile(r"[!?]{2,}")
_NUMBER_BAIT = re.compile(r"\b\d+\s+(?:reasons?|things?|ways?|tips?|signs?|bagay)\b", re.I)
_QUESTION_BAIT = re.compile(r"\b(?:ano|bakit|paano|kailan|sino|saan)\b.*\?", re.I)
# FIX: "shocking" appears in both the EN and TL lists; dedupe (order-preserving)
# so a single occurrence is not double-counted in the score.
_ALL_PHRASES = [
    re.compile(p, re.IGNORECASE)
    for p in dict.fromkeys(_CLICKBAIT_PHRASES_EN + _CLICKBAIT_PHRASES_TL)
]


@dataclass
class ClickbaitResult:
    """Outcome of one clickbait scan."""
    is_clickbait: bool  # True when score >= 0.4
    score: float  # 0.0 – 1.0 (capped sum of feature weights)
    triggered_patterns: list[str] = field(default_factory=list)


class ClickbaitDetector:
    """
    Feature-flag based clickbait detector optimized for PH social media.
    Returns a continuous score based on how many patterns are triggered.
    """

    # Per-feature contribution to the final score; each phrase hit adds 0.25.
    _WEIGHTS = {
        "excessive_punctuation": 0.20,
        "all_caps_words": 0.20,
        "number_bait": 0.15,
        "question_bait": 0.10,
        "title_too_short": 0.05,
        "title_very_long": 0.05,
    }

    def detect(self, text: str) -> ClickbaitResult:
        """Scan *text* and return a score plus human-readable triggers."""
        triggered: list[str] = []

        # ALL CAPS words (2+ in a short span)
        caps_words = _CAPS_WORD.findall(text)
        if len(caps_words) >= 2:
            triggered.append(f"all_caps_words: {caps_words[:3]}")

        # Excessive punctuation !! ???
        if _EXCESSIVE_PUNCT.search(text):
            triggered.append("excessive_punctuation")

        # Number-based bait: "5 reasons why..."
        if _NUMBER_BAIT.search(text):
            triggered.append("number_bait")

        # Rhetorical question bait (Tagalog)
        if _QUESTION_BAIT.search(text):
            triggered.append("question_bait")

        # Title length signal (extremely short or extremely long)
        word_count = len(text.split())
        if word_count < 5:
            triggered.append("title_too_short")
        elif word_count > 30:
            triggered.append("title_very_long")

        # Known clickbait phrases (EN + TL, deduplicated at compile time)
        for pattern in _ALL_PHRASES:
            m = pattern.search(text)
            if m:
                triggered.append(f"clickbait_phrase: '{m.group(0)}'")

        # Score: each structural feature contributes its weight once;
        # every phrase hit contributes a flat 0.25.
        score = 0.0
        for feat in triggered:
            for key, w in self._WEIGHTS.items():
                if feat.startswith(key):
                    score += w
                    break
            else:
                if feat.startswith("clickbait_phrase"):
                    score += 0.25

        score = min(score, 1.0)
        return ClickbaitResult(
            is_clickbait=score >= 0.4,
            score=round(score, 3),
            triggered_patterns=triggered,
        )
nlp/language_detector.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ PhilVerify — Language Detector
3
+ Detects Tagalog / English / Taglish using langdetect + Filipino stopword ratio heuristic.
4
+ No heavy model needed — runs instantly.
5
+ """
6
+ import re
7
+ import logging
8
+ from dataclasses import dataclass
9
+
10
logger = logging.getLogger(__name__)

# ── Filipino stopword set for heuristic ───────────────────────────────────────
_TL_MARKERS = {
    "ang", "ng", "na", "sa", "at", "ay", "mga", "ni", "nang", "si",
    "ko", "mo", "siya", "kami", "kayo", "sila", "ito", "raw", "daw",
    "ba", "po", "din", "rin", "naman", "lang", "kaya", "dahil", "kung",
    "pero", "kapag", "talaga", "pala", "sana", "grabe", "wala", "hindi",
    "may", "mayroon", "bakit", "paano", "kailan", "nasaan", "sino",
}

# English marker words (distinct from TL)
_EN_MARKERS = {
    "the", "and", "is", "are", "was", "were", "this", "that", "with",
    "from", "have", "has", "had", "will", "would", "could", "should",
    "not", "been", "being", "they", "their", "there",
}

# Word tokenizer shared by the ratio computation.
_WORD_RE = re.compile(r"\b\w+\b")


@dataclass
class LanguageResult:
    """Detected language plus the marker-word ratios that drove the call."""
    language: str  # "Tagalog" | "English" | "Taglish" | "Unknown"
    confidence: float  # 0.0 – 1.0
    tl_ratio: float  # fraction of tokens that are Filipino markers
    en_ratio: float  # fraction of tokens that are English markers
    method: str  # "heuristic" | "langdetect" | "combined"


class LanguageDetector:
    """
    Two-pass language detector:
      Pass 1 — Filipino/English marker-word ratios (fast, handles code-switching)
      Pass 2 — langdetect (confirmation when the ratios are ambiguous)

    Decision rules (as implemented):
      tl_ratio >= 0.25 and en_ratio < 0.15 → Tagalog
      en_ratio >= 0.25 and tl_ratio < 0.15 → English
      tl_ratio >= 0.10 and en_ratio >= 0.10 → Taglish
      otherwise → langdetect result, defaulting to Taglish at 0.4 confidence
    """

    def _token_ratios(self, text: str) -> tuple[float, float]:
        """Return (tl_ratio, en_ratio) of marker words among all tokens."""
        words = _WORD_RE.findall(text.lower())
        if not words:
            return 0.0, 0.0
        n = len(words)
        tl_hits = sum(w in _TL_MARKERS for w in words)
        en_hits = sum(w in _EN_MARKERS for w in words)
        return tl_hits / n, en_hits / n

    def _langdetect(self, text: str) -> str:
        """Map langdetect's ISO code to our label; 'Unknown' on any failure."""
        try:
            from langdetect import detect
            code = detect(text)
        except Exception:
            return "Unknown"
        # langdetect returns 'tl' for Tagalog
        return {"tl": "Tagalog", "en": "English"}.get(code, "Unknown")

    def detect(self, text: str) -> LanguageResult:
        """Classify *text* as Tagalog / English / Taglish / Unknown."""
        if not text or len(text.strip()) < 5:
            return LanguageResult("Unknown", 0.0, 0.0, 0.0, "heuristic")

        tl, en = self._token_ratios(text)

        if tl >= 0.25 and en < 0.15:  # dominant Filipino markers
            return LanguageResult("Tagalog", tl, tl, en, "heuristic")
        if en >= 0.25 and tl < 0.15:  # dominant English markers
            return LanguageResult("English", en, tl, en, "heuristic")
        if tl >= 0.10 and en >= 0.10:  # both present → code-switching
            return LanguageResult("Taglish", (tl + en) / 2, tl, en, "heuristic")

        # Ambiguous — ask langdetect before defaulting.
        guess = self._langdetect(text)
        if guess != "Unknown":
            return LanguageResult(guess, max(tl, en, 0.5), tl, en, "langdetect")
        return LanguageResult("Taglish", 0.4, tl, en, "combined")
nlp/ner.py ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ PhilVerify — Named Entity Recognition
3
+ Extracts persons, organizations, locations, and dates from text.
4
+ Uses spaCy en_core_web_sm with graceful fallback if model not installed.
5
+ """
6
+ import logging
7
+ import re
8
+ from dataclasses import dataclass, field
9
+
10
logger = logging.getLogger(__name__)

# Philippine-specific named entity hints (lowercase; matched as whole words)
_PH_PERSONS = {
    "marcos", "duterte", "aquino", "robredo", "lacson", "pingping",
    "bongbong", "sara", "panelo", "roque", "calida", "ano", "teodoro",
}
_PH_ORGS = {
    "doh", "deped", "dilg", "dfa", "dof", "dswd", "ched", "nbi", "pnp",
    "afp", "comelec", "sandiganbayan", "ombudsman", "pcso", "pagcor",
    "senate", "congress", "supreme court", "malacanang",
}
_PH_LOCATIONS = {
    "manila", "quezon city", "makati", "pasig", "taguig", "cebu",
    "davao", "mindanao", "luzon", "visayas", "palawan", "boracay",
    "batangas", "laguna", "cavite", "rizal", "bulacan", "pampanga",
    "metro manila", "ncr", "philippines", "pilipinas",
}

# Date patterns: "February 2026", "Feb... 24, 2026", "2026-02-24", "2/24/26"
_DATE_PATTERNS = [
    r"\b(?:January|February|March|April|May|June|July|August|September|October|November|December)"
    r"(?:\s+\d{1,2})?,?\s+\d{4}\b",
    r"\b\d{4}-\d{2}-\d{2}\b",
    r"\b\d{1,2}/\d{1,2}/\d{2,4}\b",
]


def _contains_term(term: str, text: str) -> bool:
    """True if *term* occurs in *text* as a whole word / phrase.

    FIX: the original used plain substring containment, so e.g. the person
    hint "ano" fired inside ordinary Tagalog words such as "paano". Word
    boundaries also work correctly for multi-word hints ("supreme court").
    """
    return re.search(rf"\b{re.escape(term)}\b", text) is not None


@dataclass
class NERResult:
    """Entities extracted from one text, grouped by type."""
    persons: list[str] = field(default_factory=list)
    organizations: list[str] = field(default_factory=list)
    locations: list[str] = field(default_factory=list)
    dates: list[str] = field(default_factory=list)
    method: str = "spacy"  # "spacy" | "hints"

    def to_dict(self) -> dict:
        """Serializable view (method intentionally omitted)."""
        return {
            "persons": self.persons,
            "organizations": self.organizations,
            "locations": self.locations,
            "dates": self.dates,
        }


class EntityExtractor:
    """
    NER using spaCy (en_core_web_sm) + Philippine entity hint layer.
    Falls back to hint-list + regex-based date extraction if spaCy is not
    installed or fails at inference time.
    """

    def __init__(self):
        self._nlp = None     # spaCy pipeline, or None when unavailable
        self._loaded = False  # guards the one-time lazy model load

    def _load_model(self):
        """Lazily load spaCy; on failure fall back to hint-based NER."""
        if self._loaded:
            return
        try:
            import spacy
            self._nlp = spacy.load("en_core_web_sm")
            logger.info("spaCy en_core_web_sm loaded")
        except Exception as e:
            logger.warning("spaCy not available (%s) — using hint-based NER", e)
            self._nlp = None
        self._loaded = True

    def _hint_based_extract(self, text: str) -> NERResult:
        """Fallback: whole-word match against PH hint lists + date regex."""
        lower = text.lower()
        result = NERResult(method="hints")

        result.persons = [p.title() for p in _PH_PERSONS if _contains_term(p, lower)]
        result.organizations = [o.upper() for o in _PH_ORGS if _contains_term(o, lower)]
        result.locations = [loc.title() for loc in _PH_LOCATIONS if _contains_term(loc, lower)]

        for pattern in _DATE_PATTERNS:
            result.dates.extend(re.findall(pattern, text, re.IGNORECASE))

        return result

    def extract(self, text: str) -> NERResult:
        """Extract persons/orgs/locations/dates; spaCy first, hints fallback."""
        self._load_model()

        if self._nlp is None:
            return self._hint_based_extract(text)

        try:
            doc = self._nlp(text[:5000])  # spaCy has a token limit
            result = NERResult(method="spacy")

            for ent in doc.ents:
                ent_text = ent.text.strip()
                if ent.label_ == "PERSON":
                    result.persons.append(ent_text)
                elif ent.label_ in ("ORG", "NORP"):
                    result.organizations.append(ent_text)
                elif ent.label_ in ("GPE", "LOC"):
                    result.locations.append(ent_text)
                elif ent.label_ in ("DATE", "TIME"):
                    result.dates.append(ent_text)

            # Deduplicate while preserving order
            result.persons = list(dict.fromkeys(result.persons))
            result.organizations = list(dict.fromkeys(result.organizations))
            result.locations = list(dict.fromkeys(result.locations))
            result.dates = list(dict.fromkeys(result.dates))

            # Supplement with PH hints for entities spaCy may miss
            hint_result = self._hint_based_extract(text)
            for p in hint_result.persons:
                if p not in result.persons:
                    result.persons.append(p)
            for o in hint_result.organizations:
                if o not in result.organizations:
                    result.organizations.append(o)

            return result
        except Exception as e:
            logger.warning("spaCy extraction error: %s — falling back to hints", e)
            return self._hint_based_extract(text)
nlp/preprocessor.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ PhilVerify — Text Preprocessor
3
+ Handles cleaning, tokenizing, and normalizing Filipino/English/Taglish text.
4
+ """
5
+ import re
6
+ import string
7
+ import unicodedata
8
+ from dataclasses import dataclass, field
9
+
10
# ── Filipino + English stopwords ──────────────────────────────────────────────
TAGALOG_STOPWORDS = {
    "ang", "ng", "na", "sa", "at", "ay", "mga", "ni", "nang", "si",
    "ko", "mo", "siya", "kami", "kayo", "sila", "ito", "iyon", "iyan",
    "dito", "doon", "diyan", "nito", "noon", "niyan", "rin", "din", "pa",
    "lang", "lamang", "nga", "naman", "kaya", "pero", "dahil", "kung",
    "kapag", "habang", "bilang", "upang", "para", "mula", "hanggang",
    "ayon", "sinabi", "raw", "daw", "ba", "po", "ho", "oh", "oo",
    "hindi", "wala", "may", "mayroon", "talaga", "pala", "sana",
}

ENGLISH_STOPWORDS = {
    "a", "an", "the", "and", "or", "but", "in", "on", "at", "to",
    "for", "of", "with", "by", "from", "is", "are", "was", "were",
    "be", "been", "being", "have", "has", "had", "do", "does", "did",
    "will", "would", "could", "should", "may", "might", "shall", "can",
    "not", "no", "nor", "so", "yet", "both", "either", "neither",
    "this", "that", "these", "those", "it", "its", "i", "me", "my",
    "we", "our", "you", "your", "they", "their", "he", "his", "she", "her",
}

ALL_STOPWORDS = TAGALOG_STOPWORDS | ENGLISH_STOPWORDS

# ── Patterns ──────────────────────────────────────────────────────────────────
_RE_URL = re.compile(
    r"http[s]?://(?:[a-zA-Z]|[0-9]|[$\-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
)
_RE_HTML_TAG = re.compile(r"<[^>]+>")
_RE_MENTION = re.compile(r"@\w+")
_RE_HASHTAG = re.compile(r"#\w+")
_RE_CHAR_RUN = re.compile(r"(.)\1{2,}")     # 3+ repeats of one character
_RE_PUNCT_RUN = re.compile(r"([!?.]){2,}")  # "!!!", "??", "..."
_RE_SPACES = re.compile(r"\s+")

# Every punctuation char except the apostrophe maps to a space.
_PUNCT_TO_SPACE = str.maketrans({c: " " for c in string.punctuation if c != "'"})


def _strip_emoji(text: str) -> str:
    """Remove emoji/symbols (Unicode category So) and combining marks (Mn).

    NOTE: dropping Mn also strips combining diacritics from decomposed
    (NFD) text — same trade-off as always.
    """
    kept = []
    for ch in text:
        cat = unicodedata.category(ch)
        if cat.startswith("So") or cat == "Mn":
            continue
        kept.append(ch)
    return "".join(kept)


@dataclass
class PreprocessResult:
    """Output of TextPreprocessor.preprocess — every intermediate stage."""
    original: str  # raw input, untouched
    cleaned: str  # after structural cleaning (steps 1-6)
    normalized: str  # after character normalization (steps 7-8)
    tokens: list[str] = field(default_factory=list)  # whitespace tokens
    filtered_tokens: list[str] = field(default_factory=list)  # tokens minus stopwords
    char_count: int = 0  # len(normalized)
    word_count: int = 0  # len(tokens)


class TextPreprocessor:
    """
    Multi-step cleaner for Tagalog / English / Taglish social-media text.

    preprocess() runs, in order: HTML-tag and URL stripping, @mention
    removal, hashtag unwrapping (#Tag → Tag), emoji removal, lowercasing,
    repeated-character and punctuation-run collapsing, punctuation removal
    (apostrophes kept), whitespace tokenization, and EN+TL stopword
    filtering.
    """

    def clean(self, text: str) -> str:
        """Structural cleanup: markup, links, handles, emoji; lowercase."""
        out = _RE_HTML_TAG.sub(" ", text)
        out = _RE_URL.sub(" ", out)
        out = _RE_MENTION.sub(" ", out)
        out = _RE_HASHTAG.sub(lambda m: m.group(0)[1:], out)  # keep word, drop '#'
        out = _strip_emoji(out).lower()
        return _RE_SPACES.sub(" ", out).strip()

    def normalize(self, text: str) -> str:
        """Character-level normalization: squash repeats, drop punctuation."""
        out = _RE_CHAR_RUN.sub(r"\1\1", text)  # "graaabe" → "graabe" (3+ → 2)
        out = _RE_PUNCT_RUN.sub(r"\1", out)    # "!!!" → "!"
        out = out.translate(_PUNCT_TO_SPACE)   # apostrophes kept ('di, 'yung)
        return _RE_SPACES.sub(" ", out).strip()

    def tokenize(self, text: str) -> list[str]:
        """Whitespace split; single-character tokens are discarded."""
        return [tok for tok in text.split() if len(tok) > 1]

    def remove_stopwords(self, tokens: list[str]) -> list[str]:
        """Drop English and Tagalog stopwords."""
        return [tok for tok in tokens if tok not in ALL_STOPWORDS]

    def preprocess(self, text: str) -> PreprocessResult:
        """Run the full pipeline and return a structured result."""
        cleaned = self.clean(text)
        normalized = self.normalize(cleaned)
        tokens = self.tokenize(normalized)
        return PreprocessResult(
            original=text,
            cleaned=cleaned,
            normalized=normalized,
            tokens=tokens,
            filtered_tokens=self.remove_stopwords(tokens),
            char_count=len(normalized),
            word_count=len(tokens),
        )
nlp/sentiment.py ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ PhilVerify — Sentiment & Emotion Analyzer
3
+ Uses HuggingFace transformers with graceful fallback to lexicon-based scoring.
4
+ """
5
+ import logging
6
+ from dataclasses import dataclass
7
+
8
logger = logging.getLogger(__name__)

# ── Simple lexicons for fallback ──────────────────────────────────────────────
_NEGATIVE_WORDS = {
    "fake", "false", "lie", "liar", "hoax", "scam", "fraud", "corrupt",
    "criminal", "illegal", "murder", "die", "death", "dead", "kill",
    "patay", "namatay", "peke", "sinungaling", "magnanakaw",
    "kasamaan", "krimen", "karahasan", "pandemic", "sakit", "epidemya",
    "grabe", "nakakatakot", "nakakainis", "nakakagalit", "kahiya",
}
_POSITIVE_WORDS = {
    "good", "great", "excellent", "amazing", "wonderful", "positive",
    "success", "win", "victory", "help", "support", "safe", "free",
    "maganda", "magaling", "mahusay", "maayos", "tagumpay", "ligtas",
    "masaya", "mabuti", "mahalaga", "mahal", "salamat", "pagbabago",
}
_FEAR_WORDS = {
    "takot", "fear", "scared", "afraid", "terror", "danger", "dangerous",
    "banta", "panganib", "nakakatakot", "kalamidad", "lindol",
}
_ANGER_WORDS = {
    "galit", "angry", "anger", "furious", "rage", "outrage", "poot",
    "nakakagalit", "nakakaasar", "sumpain", "putang", "gago",
}


@dataclass
class SentimentResult:
    """Sentiment polarity + dominant emotion for one text."""
    sentiment: str  # positive | negative | neutral | high positive | high negative
    sentiment_score: float  # -1.0 to 1.0 (sign follows polarity)
    emotion: str  # anger | fear | joy | sadness | neutral
    emotion_score: float  # 0.0 to 1.0
    method: str  # "transformer" | "lexicon"


class SentimentAnalyzer:
    """
    Two-strategy sentiment analysis:
      Primary — cardiffnlp/twitter-roberta-base-sentiment-latest (social media)
      Fallback — lexicon-based word counting (EN + TL word sets above)
    """

    def __init__(self):
        self._sentiment_pipe = None
        self._emotion_pipe = None
        self._loaded = False  # guards the one-time lazy model load

    def _load_models(self):
        """Lazily load the HF pipelines; silently degrade to the lexicon."""
        if self._loaded:
            return
        try:
            from transformers import pipeline
            self._sentiment_pipe = pipeline(
                "text-classification",
                model="cardiffnlp/twitter-roberta-base-sentiment-latest",
                top_k=1,
            )
            self._emotion_pipe = pipeline(
                "text-classification",
                model="j-hartmann/emotion-english-distilroberta-base",
                top_k=1,
            )
            logger.info("Sentiment / emotion models loaded")
        except Exception as e:
            logger.warning("Transformer models not available (%s) — using lexicon fallback", e)
        self._loaded = True

    def _lexicon_analyze(self, text: str) -> SentimentResult:
        """Count lexicon hits and map them onto sentiment/emotion labels."""
        vocab = set(text.lower().split())
        neg_hits = len(vocab & _NEGATIVE_WORDS)
        pos_hits = len(vocab & _POSITIVE_WORDS)
        fear_hits = len(vocab & _FEAR_WORDS)
        anger_hits = len(vocab & _ANGER_WORDS)

        polarity_total = neg_hits + pos_hits
        polarity = (pos_hits - neg_hits) / polarity_total if polarity_total else 0.0

        if polarity > 0.3:
            label = "high positive" if polarity > 0.6 else "positive"
        elif polarity < -0.3:
            label = "high negative" if polarity < -0.6 else "negative"
        else:
            label = "neutral"

        # Dominant emotion: fear beats anger on strict majority, then the
        # winning polarity side; strength scales with hit density (capped).
        denom = max(len(vocab), 1)
        if fear_hits > anger_hits:
            emotion, strength = "fear", min(fear_hits / denom * 5, 1.0)
        elif anger_hits > 0:
            emotion, strength = "anger", min(anger_hits / denom * 5, 1.0)
        elif pos_hits > neg_hits:
            emotion, strength = "joy", min(pos_hits / denom * 5, 1.0)
        elif neg_hits > 0:
            emotion, strength = "sadness", min(neg_hits / denom * 5, 1.0)
        else:
            emotion, strength = "neutral", 0.0

        return SentimentResult(label, round(polarity, 3), emotion, round(strength, 3), "lexicon")

    def analyze(self, text: str) -> SentimentResult:
        """Analyze *text*: transformer pipelines first, lexicon as fallback."""
        self._load_models()
        snippet = text[:512]  # respect transformer input limits

        if self._sentiment_pipe and self._emotion_pipe:
            try:
                sent_pred = self._sentiment_pipe(snippet)[0]
                emo_pred = self._emotion_pipe(snippet)[0]

                raw_label = sent_pred["label"].lower()
                raw_score = sent_pred["score"]
                if "positive" in raw_label:
                    label = "high positive" if raw_score > 0.85 else "positive"
                    signed = raw_score
                elif "negative" in raw_label:
                    label = "high negative" if raw_score > 0.85 else "negative"
                    signed = -raw_score
                else:
                    label = "neutral"
                    signed = 0.0

                return SentimentResult(
                    label,
                    round(signed, 3),
                    emo_pred["label"].lower(),
                    round(emo_pred["score"], 3),
                    "transformer",
                )
            except Exception as e:
                logger.warning("Transformer inference error: %s — falling back to lexicon", e)

        return self._lexicon_analyze(text)
pytest.ini ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ [pytest]
2
+ asyncio_mode = auto
3
+ testpaths = tests
4
+ python_files = test_*.py
5
+ python_classes = Test*
6
+ python_functions = test_*
requirements.txt ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ── Core Framework ────────────────────────────────────────────────────────────
2
+ fastapi==0.115.6
3
+ uvicorn[standard]==0.32.1
4
+ python-multipart==0.0.17 # File upload support
5
+ pydantic==2.9.2
6
+ pydantic-settings==2.6.1
7
+
8
+ # ── NLP & ML ──────────────────────────────────────────────────────────────────
9
+ transformers==4.46.3
10
+ torch==2.5.1
11
+ sentence-transformers==3.3.1
12
+ scikit-learn==1.5.2
13
+ spacy==3.8.2
14
+ langdetect==1.0.9
15
+ nltk==3.9.1
16
+
17
+ # ── Input Modules ─────────────────────────────────────────────────────────────
18
+ pytesseract==0.3.13 # OCR
19
+ Pillow==11.0.0 # Image processing
20
+ openai-whisper==20240930 # ASR (Filipino speech)
21
+ beautifulsoup4==4.12.3 # URL scraping
22
+ requests==2.32.3
23
+ lxml==5.3.0
24
+
25
+ # ── Evidence Retrieval ────────────────────────────────────────────────────────
26
+ newsapi-python==0.2.7
27
+
28
+ # ── Database ──────────────────────────────────────────────────────────────────
29
+ sqlalchemy==2.0.36
30
+ asyncpg==0.30.0 # Async PostgreSQL driver
31
+ alembic==1.14.0
32
+
33
+ # ── Caching ───────────────────────────────────────────────────────────────────
34
+ redis==5.2.1
35
+ cachetools==5.5.0
36
+
37
+ # ── Utilities ─────────────────────────────────────────────────────────────────
38
+ python-dotenv==1.0.1
39
+ httpx==0.28.1 # Async HTTP client
40
+ aiofiles==24.1.0
41
+ tqdm==4.67.1
42
+ numpy==1.26.4
43
+
44
+ # ── Testing ───────────────────────────────────────────────────────────────────
45
+ pytest==8.3.4
46
+ pytest-asyncio==0.24.0
47
# httpx (already pinned above at ==0.28.1) also provides the FastAPI TestClient transport
scoring/__init__.py ADDED
File without changes
scoring/engine.py ADDED
@@ -0,0 +1,212 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ PhilVerify — Scoring Engine (Orchestrator)
3
+ Ties together all NLP modules, Layer 1, and Layer 2 into a final VerificationResponse.
4
+ Final Score = (ML Confidence × 0.40) + (Evidence Score × 0.60)
5
+ """
6
+ import asyncio
7
+ import json
8
+ import logging
9
+ import uuid
10
+ from datetime import datetime, timezone
11
+ from pathlib import Path
12
+
13
+ from config import get_settings
14
+ from api.schemas import (
15
+ VerificationResponse, Verdict, Language, DomainTier,
16
+ Layer1Result, Layer2Result, EntitiesResult, EvidenceSource, Stance,
17
+ )
18
+
19
+ logger = logging.getLogger(__name__)
20
+ settings = get_settings()
21
+
22
+ # ── Domain credibility lookup ─────────────────────────────────────────────────
23
+ _DOMAIN_DB_PATH = Path(__file__).parent.parent / "domain_credibility.json"
24
+ _DOMAIN_DB: dict = {}
25
+
26
def _load_domain_db() -> dict:
    """Return the domain-credibility table, loading it lazily on first use."""
    global _DOMAIN_DB
    if _DOMAIN_DB:
        return _DOMAIN_DB
    try:
        _DOMAIN_DB = json.loads(_DOMAIN_DB_PATH.read_text())
    except Exception as e:
        # A missing/corrupt DB degrades gracefully: callers see an empty table.
        logger.warning("Could not load domain_credibility.json: %s", e)
    return _DOMAIN_DB
34
+
35
def get_domain_tier(domain: str) -> DomainTier | None:
    """Look up the credibility tier for a news-source domain.

    Args:
        domain: Hostname (e.g. "www.rappler.com"); case-insensitive.

    Returns:
        The matching DomainTier, DomainTier.SUSPICIOUS for unknown domains,
        or None when ``domain`` is empty/falsy.
    """
    if not domain:
        return None
    db = _load_domain_db()
    # Fix: strip "www." only as a prefix — str.replace() would also mangle
    # domains that merely contain "www." mid-string.
    domain = domain.lower().removeprefix("www.")
    for tier_key, tier_data in db.items():
        if domain in tier_data.get("domains", []):
            # Tier keys are assumed to end in their numeric tier (e.g. "tier_1").
            return DomainTier(int(tier_key[-1]))
    return DomainTier.SUSPICIOUS  # Unknown domains default to Tier 3
44
+
45
+
46
def _map_verdict(final_score: float) -> Verdict:
    """Translate a 0–100 final score into the three-way verdict."""
    if final_score >= settings.credible_threshold:
        return Verdict.CREDIBLE
    if final_score >= settings.fake_threshold:
        return Verdict.UNVERIFIED
    return Verdict.LIKELY_FAKE
53
+
54
+
55
async def run_verification(
    text: str,
    input_type: str = "text",
    source_domain: str | None = None,
) -> VerificationResponse:
    """
    Full verification pipeline orchestrator.

    Runs preprocessing, language detection, NLP analysis, and the Layer 1 ML
    classifier synchronously, then Layer 2 evidence retrieval asynchronously:

        Final Score = (ML credibility × ml_weight) + (evidence score × evidence_weight)

    Args:
        text: Raw input text to verify.
        input_type: Origin of the text ("text", "url", "image", ...).
        source_domain: Optional source domain for credibility lookup.

    Returns:
        A fully populated VerificationResponse.
    """
    # ── Lazy imports so app starts without heavy deps ─────────────────────────
    from nlp.preprocessor import TextPreprocessor
    from nlp.language_detector import LanguageDetector
    from nlp.ner import EntityExtractor
    from nlp.sentiment import SentimentAnalyzer
    from nlp.clickbait import ClickbaitDetector
    from nlp.claim_extractor import ClaimExtractor
    from ml.tfidf_classifier import TFIDFClassifier
    from evidence.news_fetcher import fetch_evidence, compute_similarity

    # ── Step 1: Preprocess ────────────────────────────────────────────────────
    proc = TextPreprocessor().preprocess(text)

    # ── Step 2: Language detection ────────────────────────────────────────────
    lang_result = LanguageDetector().detect(text)
    try:
        language = Language(lang_result.language)
    except ValueError:
        # Detector labels outside the enum collapse to Taglish (was using the
        # private Language._value2member_map_ for the same check).
        language = Language.TAGLISH

    # ── Steps 3–6: NLP analysis (sequential; fast relative to Layer 2) ───────
    ner_result = EntityExtractor().extract(text)
    sentiment_result = SentimentAnalyzer().analyze(proc.cleaned)
    clickbait_result = ClickbaitDetector().detect(text)
    claim_result = ClaimExtractor().extract(proc.cleaned)

    # ── Step 7: Layer 1 — ML Classifier ──────────────────────────────────────
    classifier = TFIDFClassifier()
    # NOTE(review): train() runs per request — presumably a cached/no-op load
    # after first fit; confirm it is not refitting on every verification.
    classifier.train()
    l1 = classifier.predict(proc.cleaned)

    # Enrich triggered features with NLP signals.
    if clickbait_result.is_clickbait:
        l1.triggered_features.extend(clickbait_result.triggered_patterns[:3])
    if sentiment_result.sentiment == "high negative":
        l1.triggered_features.append("high emotional language")

    layer1 = Layer1Result(
        verdict=Verdict(l1.verdict),
        confidence=l1.confidence,
        triggered_features=l1.triggered_features,
    )

    # ── Step 8: Layer 2 — Evidence Retrieval ──────────────────────────────────
    evidence_score = 50.0  # Neutral default when API key absent
    evidence_sources: list[EvidenceSource] = []
    l2_verdict = Verdict.UNVERIFIED

    if settings.news_api_key:
        try:
            articles = await fetch_evidence(claim_result.claim, settings.news_api_key)
            for art in articles[:5]:
                article_text = f"{art.get('title', '')} {art.get('description', '')}"
                sim = compute_similarity(claim_result.claim, article_text)
                # Fix: guard a null "source" field once and reuse it — the
                # original re-fetched it unguarded for source_name below.
                source_info = art.get("source") or {}
                domain = source_info.get("name", "unknown").lower()
                tier = get_domain_tier(domain)

                # Simple stance heuristic — negative title keywords → Refutes
                title_lower = (art.get("title") or "").lower()
                stance = Stance.NOT_ENOUGH_INFO
                if any(w in title_lower for w in ["false", "fake", "hoax", "wrong", "debunked", "fact check"]):
                    stance = Stance.REFUTES
                elif sim > 0.6:
                    stance = Stance.SUPPORTS

                evidence_sources.append(EvidenceSource(
                    title=art.get("title", ""),
                    url=art.get("url", ""),
                    similarity=sim,
                    stance=stance,
                    domain_tier=tier or DomainTier.SUSPICIOUS,
                    published_at=art.get("publishedAt"),
                    source_name=source_info.get("name"),
                ))

            # Evidence score: average similarity × 100, penalized for refuting sources
            if evidence_sources:
                supporting = [s for s in evidence_sources if s.stance == Stance.SUPPORTS]
                refuting = [s for s in evidence_sources if s.stance == Stance.REFUTES]
                avg_sim = sum(s.similarity for s in evidence_sources) / len(evidence_sources)
                refute_penalty = len(refuting) * 15
                evidence_score = max(0.0, min(100.0, avg_sim * 100 - refute_penalty))

                if len(refuting) > len(supporting):
                    l2_verdict = Verdict.LIKELY_FAKE
                elif len(supporting) >= 2:
                    l2_verdict = Verdict.CREDIBLE
        except Exception as e:
            logger.warning("Evidence retrieval failed: %s — using neutral score", e)

    layer2 = Layer2Result(
        verdict=l2_verdict,
        evidence_score=round(evidence_score, 1),
        sources=evidence_sources,
        claim_used=claim_result.claim,
    )

    # ── Step 9: Final Score ───────────────────────────────────────────────────
    # ML confidence is 0-100 where high = more credible for the predicted class.
    # Adjust: if ML says Fake, its confidence works against credibility.
    ml_credibility = l1.confidence if l1.verdict == "Credible" else (100 - l1.confidence)
    final_score = round(
        (ml_credibility * settings.ml_weight) + (evidence_score * settings.evidence_weight),
        1,
    )
    verdict = _map_verdict(final_score)

    # ── Step 10: Assemble response ────────────────────────────────────────────
    result = VerificationResponse(
        verdict=verdict,
        # Fix: evidence_score / 100 * 100 was a no-op; both operands are 0-100.
        confidence=round(max(l1.confidence, evidence_score), 1),
        final_score=final_score,
        layer1=layer1,
        layer2=layer2,
        entities=EntitiesResult(
            persons=ner_result.persons,
            organizations=ner_result.organizations,
            locations=ner_result.locations,
            dates=ner_result.dates,
        ),
        sentiment=sentiment_result.sentiment,
        emotion=sentiment_result.emotion,
        language=language,
        domain_credibility=get_domain_tier(source_domain) if source_domain else None,
        input_type=input_type,
    )

    # ── Record to history (best effort — never fail the verification) ─────────
    try:
        from api.routes.history import record_verification
        record_verification({
            "id": str(uuid.uuid4()),
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "input_type": input_type,
            "text_preview": text[:120],
            "verdict": verdict.value,
            "confidence": result.confidence,
            "final_score": final_score,
            "entities": ner_result.to_dict(),
            "claim_used": claim_result.claim,
        })
    except Exception as e:
        logger.warning("Failed to record history: %s", e)

    return result
tests/__init__.py ADDED
File without changes
tests/test_philverify.py ADDED
@@ -0,0 +1,181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ PhilVerify — Unit Tests
3
+ Covers: text preprocessor, language detector, clickbait detector, and scoring engine.
4
+ Run: pytest tests/ -v
5
+ """
6
+ import sys
7
+ from pathlib import Path
8
+
9
+ # Ensure project root is on PYTHONPATH
10
+ sys.path.insert(0, str(Path(__file__).parent.parent))
11
+
12
+ import pytest
13
+
14
+
15
+ # ── TextPreprocessor ──────────────────────────────────────────────────────────
16
+
17
class TestTextPreprocessor:
    """Unit tests for the TextPreprocessor cleaning pipeline."""

    def setup_method(self):
        from nlp.preprocessor import TextPreprocessor
        self.preprocessor = TextPreprocessor()

    def test_lowercases_text(self):
        assert self.preprocessor.clean("HELLO WORLD") == "hello world"

    def test_strips_urls(self):
        cleaned = self.preprocessor.clean("Check this out https://rappler.com/news/article123")
        for fragment in ("https://", "rappler.com"):
            assert fragment not in cleaned

    def test_strips_html_tags(self):
        cleaned = self.preprocessor.clean("<p>Hello <b>World</b></p>")
        assert "<" not in cleaned
        assert ">" not in cleaned

    def test_strips_mentions(self):
        cleaned = self.preprocessor.clean("Great post @PresidentPH and @DOH_Philippines!")
        assert "@" not in cleaned

    def test_removes_stopwords(self):
        kept = self.preprocessor.remove_stopwords(["ang", "fake", "news", "sa", "pilipinas"])
        assert "ang" not in kept
        assert "fake" in kept

    def test_normalizes_repeated_chars(self):
        normalized = self.preprocessor.normalize("graaabe ang gaaalit ko")
        assert "graaabe" not in normalized

    def test_full_pipeline_returns_result(self):
        from nlp.preprocessor import PreprocessResult
        out = self.preprocessor.preprocess("GRABE! Namatay daw ang tatlong tao sa bagong sakit na kumakalat!")
        assert isinstance(out, PreprocessResult)
        assert out.char_count > 0
        assert len(out.tokens) > 0
55
+
56
+ # ── LanguageDetector ──────────────────────────────────────────────────────────
57
+
58
class TestLanguageDetector:
    """Unit tests for the LanguageDetector (Tagalog / English / Taglish)."""

    def setup_method(self):
        from nlp.language_detector import LanguageDetector
        self.detector = LanguageDetector()

    def test_detects_tagalog(self):
        out = self.detector.detect(
            "Ang mga mamamayan ay nag-aalala sa bagong batas na isinusulong ng pangulo."
        )
        assert out.language in ("Tagalog", "Taglish")

    def test_detects_english(self):
        out = self.detector.detect(
            "The Supreme Court ruled in favor of the petition filed by the opposition."
        )
        assert out.language in ("English", "Taglish")

    def test_detects_taglish(self):
        out = self.detector.detect(
            "Grabe ang news ngayon! The president announced na libre ang lahat!"
        )
        # Mixed-language input may legitimately land on any of the three labels.
        assert out.language in ("Tagalog", "English", "Taglish")

    def test_unknown_for_empty(self):
        assert self.detector.detect("").language == "Unknown"

    def test_confidence_between_0_and_1(self):
        out = self.detector.detect("Ang balita ay napakalaki!")
        assert 0.0 <= out.confidence <= 1.0
89
+
90
+
91
+ # ── ClickbaitDetector ─────────────────────────────────────────────────────────
92
+
93
class TestClickbaitDetector:
    """Unit tests for clickbait scoring on English and Tagalog headlines."""

    def setup_method(self):
        from nlp.clickbait import ClickbaitDetector
        self.detector = ClickbaitDetector()

    def test_detects_clickbait_all_caps(self):
        out = self.detector.detect("SHOCKING NEWS: GOVERNMENT CAUGHT LYING TO EVERYONE!")
        assert out.is_clickbait is True
        assert out.score > 0.3

    def test_detects_clickbait_tagalog(self):
        out = self.detector.detect("GRABE!! Natuklasan na ang katotohanan ng bigas scandal!!!")
        assert out.score > 0.3

    def test_clean_headline_not_clickbait(self):
        out = self.detector.detect(
            "DOH reports 500 new cases as vaccination drive continues in Metro Manila"
        )
        assert out.is_clickbait is False

    def test_score_between_0_and_1(self):
        assert 0.0 <= self.detector.detect("Breaking news today").score <= 1.0
116
+
117
+
118
+ # ── TF-IDF Classifier ─────────────────────────────────────────────────────────
119
+
120
class TestTFIDFClassifier:
    """Unit tests for the Layer 1 TF-IDF classifier."""

    def setup_method(self):
        from ml.tfidf_classifier import TFIDFClassifier
        self.clf = TFIDFClassifier()
        self.clf.train()

    def test_predict_returns_valid_verdict(self):
        pred = self.clf.predict("DOH reports 500 new COVID cases today in Metro Manila")
        assert pred.verdict in ("Credible", "Unverified", "Fake")

    def test_confidence_in_valid_range(self):
        pred = self.clf.predict("SHOCKING: Government hid the truth about vaccines!")
        assert 0.0 <= pred.confidence <= 100.0

    def test_triggered_features_are_strings(self):
        pred = self.clf.predict("GRABE! Namatay daw ang tatlong tao sa bagong sakit!")
        assert all(isinstance(feature, str) for feature in pred.triggered_features)

    def test_seed_fake_news_detected(self):
        pred = self.clf.predict("CONFIRMED: Philippines to become 51st state of USA in 2026!")
        # An obviously fabricated claim must never come back Credible.
        assert pred.verdict in ("Unverified", "Fake", "Likely Fake")
142
+
143
+
144
+ # ── Scoring Engine (lightweight integration) ──────────────────────────────────
145
+
146
class TestScoringEngine:
    """Integration test — no API keys needed, evidence score defaults to 50."""

    @pytest.mark.asyncio
    async def test_verify_text_returns_response(self):
        from scoring.engine import run_verification
        from api.schemas import VerificationResponse

        response = await run_verification(
            "GRABE! Nakita ko raw namatay ang tatlong tao sa bagong sakit na kumakalat sa Pilipinas!",
            input_type="text",
        )
        assert isinstance(response, VerificationResponse)
        assert response.verdict is not None
        assert 0.0 <= response.final_score <= 100.0

    @pytest.mark.asyncio
    async def test_verify_credible_text(self):
        from scoring.engine import run_verification

        response = await run_verification(
            "DOH reports 500 new COVID-19 cases as vaccination drive continues in Metro Manila",
            input_type="text",
        )
        assert response.final_score is not None
        assert response.language is not None

    @pytest.mark.asyncio
    async def test_entities_extracted(self):
        from scoring.engine import run_verification

        response = await run_verification(
            "President Marcos announced new policies in Manila regarding the AFP and PNP.",
            input_type="text",
        )
        assert response.entities is not None