Ryan Christian D. Deniega committed on
Commit
6c9b8f1
·
0 Parent(s):

feat: PhilVerify Phase 1-3 — FastAPI backend, NLP pipeline, TF-IDF classifier (23/23 tests)

Browse files
.env.example ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ── API Keys ──────────────────────────────────────────────────────────────────
2
+ NEWS_API_KEY=your_newsapi_key_here
3
+ GOOGLE_VISION_API_KEY=your_google_vision_key_here # Optional (alternative to Tesseract)
4
+
5
+ # ── Database ──────────────────────────────────────────────────────────────────
6
+ DATABASE_URL=postgresql+asyncpg://user:password@localhost:5432/philverify
7
+
8
+ # ── Redis Cache ───────────────────────────────────────────────────────────────
9
+ REDIS_URL=redis://localhost:6379/0
10
+
11
+ # ── App Settings ──────────────────────────────────────────────────────────────
12
+ APP_ENV=development # development | production
13
+ DEBUG=true
14
+ LOG_LEVEL=INFO
15
+ ALLOWED_ORIGINS=http://localhost:3000,http://localhost:5173
16
+
17
+ # ── Model Settings ────────────────────────────────────────────────────────────
18
+ # Options: xlm-roberta-base | joelito/roberta-tagalog-base | bert-base-multilingual-cased
19
+ ML_MODEL_NAME=xlm-roberta-base
20
+ WHISPER_MODEL_SIZE=base # base | medium | large-v3 (large-v3 for production)
21
+ USE_GPU=false
22
+
23
+ # ── Scoring Weights ───────────────────────────────────────────────────────────
24
+ ML_WEIGHT=0.40
25
+ EVIDENCE_WEIGHT=0.60
26
+ CREDIBLE_THRESHOLD=70.0
27
+ FAKE_THRESHOLD=40.0
.gitignore ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python
2
+ venv/
3
+ __pycache__/
4
+ *.py[cod]
5
+ *.pkl
6
+ *.egg-info/
7
+ dist/
8
+ build/
9
+
10
+ # Environment
11
+ .env
12
+
13
+ # Cache
14
+ .cache/
15
+ .pytest_cache/
16
+
17
+ # IDE
18
+ .vscode/
19
+ .idea/
20
+ *.swp
21
+
22
+ # OS
23
+ .DS_Store
24
+
25
+ # ML models (too large for git)
26
+ ml/models/*.pkl
27
+ ml/models/*.bin
28
+ ml/models/*.pt
api/__init__.py ADDED
File without changes
api/routes/__init__.py ADDED
File without changes
api/routes/history.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ PhilVerify — History Route
3
+ GET /history — Returns past verification logs with pagination.
4
+ """
5
+ import logging
6
+ from fastapi import APIRouter, Query
7
+ from api.schemas import HistoryResponse, HistoryEntry, Verdict
8
+
9
+ logger = logging.getLogger(__name__)
10
+ router = APIRouter(prefix="/history", tags=["History"])
11
+
12
+ # In-memory store for development. Will be replaced by DB queries in Phase 7.
13
+ _HISTORY: list[dict] = []
14
+
15
+
16
def record_verification(entry: dict) -> None:
    """Store a single verification result in the in-memory history log.

    Invoked by the scoring engine after each completed verification.
    """
    _HISTORY.append(entry)
19
+
20
+
21
@router.get(
    "",
    response_model=HistoryResponse,
    summary="Get verification history",
    description="Returns past verifications ordered by most recent. Supports pagination.",
)
async def get_history(
    page: int = Query(1, ge=1, description="Page number"),
    limit: int = Query(20, ge=1, le=100, description="Results per page"),
    verdict_filter: Verdict | None = Query(None, alias="verdict", description="Filter by verdict"),
) -> HistoryResponse:
    """Return one page of the in-memory verification history, newest first."""
    logger.info("GET /history | page=%d limit=%d", page, limit)

    # Newest entries first; optionally narrowed to a single verdict.
    ordered = _HISTORY[::-1]
    if verdict_filter:
        wanted = verdict_filter.value
        ordered = [item for item in ordered if item.get("verdict") == wanted]

    offset = (page - 1) * limit
    window = ordered[offset : offset + limit]

    models = []
    for item in window:
        models.append(
            HistoryEntry(
                id=item["id"],
                timestamp=item["timestamp"],
                input_type=item.get("input_type", "text"),
                text_preview=item.get("text_preview", "")[:120],
                verdict=Verdict(item["verdict"]),
                confidence=item["confidence"],
                final_score=item["final_score"],
            )
        )

    # `total` counts all matching entries, not just the returned page.
    return HistoryResponse(total=len(ordered), entries=models)
api/routes/trends.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ PhilVerify — Trends Route
3
+ GET /trends — Aggregates entities and topics from fake-news verifications.
4
+ """
5
+ import logging
6
+ from collections import Counter
7
+ from fastapi import APIRouter, Query
8
+ from api.schemas import TrendsResponse, TrendingEntity, TrendingTopic, Verdict
9
+
10
+ logger = logging.getLogger(__name__)
11
+ router = APIRouter(prefix="/trends", tags=["Trends"])
12
+
13
+ # Reads from the same in-memory store as history (Phase 7 → DB aggregation).
14
+ from api.routes.history import _HISTORY
15
+
16
+
17
@router.get(
    "",
    response_model=TrendsResponse,
    summary="Get trending entities & topics",
    description="Aggregates NER entities and topics from recent verifications. Useful for identifying fake-news patterns.",
)
async def get_trends(
    days: int = Query(7, ge=1, le=90, description="Lookback window in days"),
    limit: int = Query(10, ge=1, le=50, description="Max results per category"),
) -> TrendsResponse:
    """Aggregate trending entities and claim topics from the history store.

    NOTE(review): `days` is accepted and logged but never applied as a time
    filter — history entries are not filtered by timestamp here. Confirm
    whether this is intentional until the Phase 7 DB aggregation lands.
    """
    logger.info("GET /trends | days=%d", days)

    entity_counter: Counter = Counter()
    entity_type_map: dict[str, str] = {}
    entity_fake_counter: Counter = Counter()
    topic_counter: Counter = Counter()
    topic_verdict_map: dict[str, list[str]] = {}

    # History-entry key -> entity_type label exposed in the API response.
    entity_fields = (
        ("persons", "person"),
        ("organizations", "org"),
        ("locations", "location"),
    )

    for entry in _HISTORY:
        is_fake = entry.get("verdict") in (Verdict.LIKELY_FAKE.value, Verdict.UNVERIFIED.value)
        entities = entry.get("entities", {})

        # Single data-driven pass replaces three copy-pasted per-category loops.
        for field_name, type_label in entity_fields:
            for name in entities.get(field_name, []):
                entity_counter[name] += 1
                entity_type_map[name] = type_label
                if is_fake:
                    entity_fake_counter[name] += 1

        claim = entry.get("claim_used", "")
        if claim:
            # Truncate once; the 60-char prefix serves as the topic key.
            topic = claim[:60]
            topic_counter[topic] += 1
            topic_verdict_map.setdefault(topic, []).append(entry.get("verdict", "Unverified"))

    top_entities = [
        TrendingEntity(
            entity=entity,
            entity_type=entity_type_map.get(entity, "unknown"),
            count=count,
            fake_count=entity_fake_counter.get(entity, 0),
            # count >= 1 for anything the counter returns, so no div-by-zero.
            fake_ratio=round(entity_fake_counter.get(entity, 0) / count, 2),
        )
        for entity, count in entity_counter.most_common(limit)
    ]

    top_topics = [
        TrendingTopic(
            topic=topic,
            count=count,
            # Most frequent verdict observed for this topic; "Unverified" default.
            dominant_verdict=Verdict(
                Counter(topic_verdict_map.get(topic, ["Unverified"])).most_common(1)[0][0]
            ),
        )
        for topic, count in topic_counter.most_common(limit)
    ]

    return TrendsResponse(top_entities=top_entities, top_topics=top_topics)
api/routes/verify.py ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ PhilVerify — Verify Routes
3
+ POST /verify/text | /verify/url | /verify/image | /verify/video
4
+ All routes funnel through run_verification() in the scoring engine.
5
+ """
6
+ import time
7
+ import logging
8
+ from fastapi import APIRouter, HTTPException, UploadFile, File, status
9
+ from fastapi.responses import JSONResponse
10
+
11
+ from api.schemas import (
12
+ TextVerifyRequest,
13
+ URLVerifyRequest,
14
+ VerificationResponse,
15
+ ErrorResponse,
16
+ )
17
+ from scoring.engine import run_verification
18
+ from inputs.url_scraper import scrape_url
19
+ from inputs.ocr import extract_text_from_image
20
+ from inputs.asr import transcribe_video
21
+
22
+ logger = logging.getLogger(__name__)
23
+ router = APIRouter(prefix="/verify", tags=["Verification"])
24
+
25
+
26
+ # ── Text ──────────────────────────────────────────────────────────────────────
27
+
28
@router.post(
    "/text",
    response_model=VerificationResponse,
    summary="Verify raw text",
    description="Accepts plain text (Tagalog, English, or Taglish) and runs the full verification pipeline.",
)
async def verify_text(body: TextVerifyRequest) -> VerificationResponse:
    """Run the verification pipeline on raw text and stamp the elapsed time."""
    started_at = time.perf_counter()
    logger.info("verify/text called | chars=%d", len(body.text))
    try:
        response = await run_verification(body.text, input_type="text")
    except Exception as exc:
        logger.exception("verify/text error: %s", exc)
        raise HTTPException(status_code=500, detail=f"Verification failed: {exc}") from exc
    response.processing_time_ms = round((time.perf_counter() - started_at) * 1000, 1)
    return response
44
+
45
+
46
+ # ── URL ───────────────────────────────────────────────────────────────────────
47
+
48
@router.post(
    "/url",
    response_model=VerificationResponse,
    summary="Verify a URL",
    description="Scrapes the article text from the given URL, then runs the full verification pipeline.",
)
async def verify_url(body: URLVerifyRequest) -> VerificationResponse:
    """Scrape an article URL, then verify the extracted text."""
    started_at = time.perf_counter()
    target = str(body.url)
    logger.info("verify/url called | url=%s", target)
    try:
        text, domain = await scrape_url(target)
        # A near-empty scrape is a client-visible 422, not a server error.
        if not text or len(text.strip()) < 20:
            raise HTTPException(
                status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
                detail="Could not extract meaningful text from the URL. The page may be paywalled or bot-protected.",
            )
        response = await run_verification(text, input_type="url", source_domain=domain)
        response.processing_time_ms = round((time.perf_counter() - started_at) * 1000, 1)
        return response
    except HTTPException:
        # Re-raise our own HTTP errors untouched.
        raise
    except Exception as exc:
        logger.exception("verify/url error: %s", exc)
        raise HTTPException(status_code=500, detail=f"URL verification failed: {exc}") from exc
73
+
74
+
75
+ # ── Image ─────────────────────────────────────────────────────────────────────
76
+
77
@router.post(
    "/image",
    response_model=VerificationResponse,
    summary="Verify an image (OCR)",
    description="Accepts an uploaded image file. Runs Tesseract OCR to extract text, then verifies.",
)
async def verify_image(file: UploadFile = File(...)) -> VerificationResponse:
    """OCR an uploaded image and verify whatever text it contains."""
    started_at = time.perf_counter()
    logger.info("verify/image called | filename=%s | size=%s", file.filename, file.size)

    # Content-type gate runs before the body is even read.
    allowed_types = {"image/jpeg", "image/png", "image/webp", "image/gif", "image/bmp"}
    if file.content_type not in allowed_types:
        raise HTTPException(
            status_code=status.HTTP_415_UNSUPPORTED_MEDIA_TYPE,
            detail=f"Unsupported image type: {file.content_type}. Accepted: jpeg, png, webp, gif, bmp",
        )
    try:
        payload = await file.read()
        extracted = await extract_text_from_image(payload)
        # OCR that yields almost nothing is a client error, not a crash.
        if not extracted or len(extracted.strip()) < 10:
            raise HTTPException(
                status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
                detail="No readable text found in the image.",
            )
        response = await run_verification(extracted, input_type="image")
        response.processing_time_ms = round((time.perf_counter() - started_at) * 1000, 1)
        return response
    except HTTPException:
        raise
    except Exception as exc:
        logger.exception("verify/image error: %s", exc)
        raise HTTPException(status_code=500, detail=f"Image verification failed: {exc}") from exc
109
+
110
+
111
+ # ── Video ─────────────────────────────────────────────────────────────────────
112
+
113
@router.post(
    "/video",
    response_model=VerificationResponse,
    summary="Verify a video/audio (Whisper ASR)",
    description="Accepts a video or audio file. Runs Whisper ASR to transcribe, then verifies the transcript.",
)
async def verify_video(file: UploadFile = File(...)) -> VerificationResponse:
    """Transcribe an uploaded media file and verify the transcript."""
    started_at = time.perf_counter()
    logger.info("verify/video called | filename=%s", file.filename)

    allowed_types = {
        "video/mp4", "video/webm", "video/quicktime",
        "audio/mpeg", "audio/wav", "audio/ogg", "audio/mp4",
    }
    if file.content_type not in allowed_types:
        raise HTTPException(
            status_code=status.HTTP_415_UNSUPPORTED_MEDIA_TYPE,
            detail=f"Unsupported media type: {file.content_type}",
        )
    try:
        payload = await file.read()
        transcript = await transcribe_video(payload, filename=file.filename or "upload")
        # An empty/near-empty transcript is a client-visible 422.
        if not transcript or len(transcript.strip()) < 10:
            raise HTTPException(
                status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
                detail="Could not transcribe meaningful speech from the media file.",
            )
        response = await run_verification(transcript, input_type="video")
        response.processing_time_ms = round((time.perf_counter() - started_at) * 1000, 1)
        return response
    except HTTPException:
        raise
    except Exception as exc:
        logger.exception("verify/video error: %s", exc)
        raise HTTPException(status_code=500, detail=f"Video verification failed: {exc}") from exc
api/schemas.py ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ PhilVerify — Pydantic Request / Response Schemas
3
+ Matches the structured JSON output format from the system spec.
4
+ """
5
+ from __future__ import annotations
6
+
7
+ from enum import Enum
8
+ from typing import Optional
9
+ from pydantic import BaseModel, HttpUrl, Field
10
+
11
+
12
+ # ── Enums ─────────────────────────────────────────────────────────────────────
13
+
14
class Verdict(str, Enum):
    """Final classification label produced by the verification pipeline."""
    CREDIBLE = "Credible"
    UNVERIFIED = "Unverified"
    LIKELY_FAKE = "Likely Fake"
18
+
19
+
20
class Stance(str, Enum):
    """How a retrieved evidence article relates to the input claim."""
    SUPPORTS = "Supports"
    REFUTES = "Refutes"
    NOT_ENOUGH_INFO = "Not Enough Info"
24
+
25
+
26
class Language(str, Enum):
    """Detected language of the input text."""
    TAGALOG = "Tagalog"
    ENGLISH = "English"
    TAGLISH = "Taglish"  # Mixed Tagalog/English
    UNKNOWN = "Unknown"
31
+
32
+
33
class Sentiment(str, Enum):
    """Sentiment buckets reported alongside a verification result.

    Note the lowercase values — these differ in casing convention from the
    Title-Case Verdict/Stance enums.
    """
    POSITIVE = "positive"
    NEGATIVE = "negative"
    NEUTRAL = "neutral"
    HIGH_POSITIVE = "high positive"
    HIGH_NEGATIVE = "high negative"
39
+
40
+
41
class DomainTier(int, Enum):
    """Source-domain credibility tier (mirrors domain_credibility.json tiers 1-4)."""
    CREDIBLE = 1
    SATIRE_OPINION = 2
    SUSPICIOUS = 3
    KNOWN_FAKE = 4
46
+
47
+
48
+ # ── Request Models ─────────────────────────────────────────────────────────────
49
+
50
class TextVerifyRequest(BaseModel):
    """Request body for POST /verify/text."""
    text: str = Field(..., min_length=10, max_length=10_000, description="Raw text to verify")
52
+
53
+
54
class URLVerifyRequest(BaseModel):
    """Request body for POST /verify/url. HttpUrl validates scheme/host."""
    url: HttpUrl = Field(..., description="URL of the news article or social media post")
56
+
57
+
58
+ # ── Nested Response Models ────────────────────────────────────────────────────
59
+
60
class EntitiesResult(BaseModel):
    """Named entities extracted from the input text, grouped by NER category.

    NOTE: mutable [] defaults are safe on Pydantic models — Pydantic copies
    field defaults per instance (unlike plain Python classes).
    """
    persons: list[str] = []
    organizations: list[str] = []
    locations: list[str] = []
    dates: list[str] = []
65
+
66
+
67
class Layer1Result(BaseModel):
    """Output of Layer 1 — the ML classifier's verdict on the text alone."""
    verdict: Verdict
    confidence: float = Field(..., ge=0.0, le=100.0, description="Confidence % from ML classifier")
    triggered_features: list[str] = Field(
        default_factory=list,
        description="Human-readable list of suspicious features detected",
    )
74
+
75
+
76
class EvidenceSource(BaseModel):
    """One external article used as evidence for or against the claim."""
    title: str
    url: str
    similarity: float = Field(..., ge=0.0, le=1.0, description="Cosine similarity to input claim")
    stance: Stance
    domain_tier: DomainTier
    # Optional metadata — not all evidence providers supply these.
    published_at: Optional[str] = None
    source_name: Optional[str] = None
84
+
85
+
86
class Layer2Result(BaseModel):
    """Output of Layer 2 — the evidence-retrieval verdict and its sources."""
    verdict: Verdict
    evidence_score: float = Field(..., ge=0.0, le=100.0)
    sources: list[EvidenceSource] = []
    claim_used: Optional[str] = Field(None, description="Extracted claim sent to evidence search")
91
+
92
+
93
+ # ── Main Response ─────────────────────────────────────────────────────────────
94
+
95
class VerificationResponse(BaseModel):
    """Top-level response for all /verify/* endpoints.

    Combines the blended verdict with per-layer breakdowns and NLP metadata.
    """
    verdict: Verdict
    confidence: float = Field(..., ge=0.0, le=100.0)
    final_score: float = Field(..., ge=0.0, le=100.0)
    layer1: Layer1Result
    layer2: Layer2Result
    entities: EntitiesResult
    # Plain strings rather than the Sentiment enum — presumably to allow
    # emotion labels outside the enum; confirm before tightening the type.
    sentiment: str
    emotion: str
    language: Language
    domain_credibility: Optional[DomainTier] = None  # Only set for URL inputs
    input_type: str = "text"  # "text" | "url" | "image" | "video"
    processing_time_ms: Optional[float] = None  # Filled in by the route handlers
108
+
109
+
110
+ # ── History / Trends ──────────────────────────────────────────────────────────
111
+
112
class HistoryEntry(BaseModel):
    """Summary of one past verification, as returned by GET /history."""
    id: str
    timestamp: str
    input_type: str
    text_preview: str  # Truncated to 120 chars by the history route
    verdict: Verdict
    confidence: float
    final_score: float
120
+
121
+
122
class HistoryResponse(BaseModel):
    """Paginated history listing. `total` counts all matches, not just this page."""
    total: int
    entries: list[HistoryEntry]
125
+
126
+
127
class TrendingEntity(BaseModel):
    """Aggregate counts for one named entity across recent verifications."""
    entity: str
    entity_type: str  # person | org | location
    count: int
    fake_count: int  # How many of those mentions came from fake/unverified items
    fake_ratio: float  # fake_count / count, rounded to 2 decimals
133
+
134
+
135
class TrendingTopic(BaseModel):
    """Aggregate counts for one claim topic (60-char claim prefix)."""
    topic: str
    count: int
    dominant_verdict: Verdict  # Most frequent verdict among entries with this topic
139
+
140
+
141
class TrendsResponse(BaseModel):
    """Response for GET /trends — top entities and topics, each capped by `limit`."""
    top_entities: list[TrendingEntity]
    top_topics: list[TrendingTopic]
144
+
145
+
146
+ # ── Error ─────────────────────────────────────────────────────────────────────
147
+
148
class ErrorResponse(BaseModel):
    """Generic error envelope for documented non-2xx responses."""
    error: str
    detail: Optional[str] = None
    code: Optional[str] = None
config.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ PhilVerify — Application Settings
3
+ Loaded via pydantic-settings from environment variables / .env file.
4
+ """
5
+ from functools import lru_cache
6
+ from pydantic_settings import BaseSettings, SettingsConfigDict
7
+
8
+
9
class Settings(BaseSettings):
    """Central application configuration, loaded from env vars and the .env file.

    Field names map case-insensitively to environment variables
    (e.g. NEWS_API_KEY -> news_api_key); unknown variables are ignored.
    """

    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        case_sensitive=False,
        extra="ignore",
    )

    # ── API Keys ──────────────────────────────────────────────────────────────
    news_api_key: str = ""  # Empty disables NewsAPI evidence retrieval
    google_vision_api_key: str = ""  # Optional alternative to Tesseract OCR

    # ── Database ──────────────────────────────────────────────────────────────
    database_url: str = "sqlite+aiosqlite:///./philverify_dev.db"  # Dev fallback

    # ── Redis ─────────────────────────────────────────────────────────────────
    redis_url: str = ""  # Empty = disable caching

    # ── App ───────────────────────────────────────────────────────────────────
    app_env: str = "development"  # "development" | "production"
    debug: bool = True
    log_level: str = "INFO"
    allowed_origins: list[str] = [
        "http://localhost:3000",
        "http://localhost:5173",
    ]

    # ── ML Models ─────────────────────────────────────────────────────────────
    ml_model_name: str = "xlm-roberta-base"
    whisper_model_size: str = "base"  # base | medium | large-v3
    use_gpu: bool = False

    # ── Scoring Weights ───────────────────────────────────────────────────────
    # ML vs evidence blend weights. NOTE(review): they appear intended to sum
    # to 1.0 but nothing validates that here — confirm in the scoring engine.
    ml_weight: float = 0.40
    evidence_weight: float = 0.60
    credible_threshold: float = 70.0
    fake_threshold: float = 40.0

    @property
    def is_production(self) -> bool:
        """True when running with APP_ENV=production."""
        return self.app_env == "production"
50
+
51
+
52
@lru_cache
def get_settings() -> Settings:
    """Build the Settings object once; every later call reuses the cached instance."""
    settings = Settings()
    return settings
domain_credibility.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "tier1": {
3
+ "description": "Established credible Philippine news organizations",
4
+ "score": 100,
5
+ "domains": [
6
+ "rappler.com", "inquirer.net", "gmanetwork.com", "abs-cbn.com",
7
+ "mb.com.ph", "philstar.com", "manilatimes.net", "sunstar.com.ph",
8
+ "businessmirror.com.ph", "bworldonline.com", "pna.gov.ph",
9
+ "doh.gov.ph", "official.deped.gov.ph", "senate.gov.ph", "congress.gov.ph"
10
+ ]
11
+ },
12
+ "tier2": {
13
+ "description": "Satire, opinion blogs, or entertainment sites",
14
+ "score": 50,
15
+ "domains": [
16
+ "knowyourmeme.com", "9gag.com", "buzzfeed.com",
17
+ "opinion.inquirer.net", "interaksyon.com"
18
+ ]
19
+ },
20
+ "tier3": {
21
+ "description": "Unknown / unverified sources — newly registered or low-authority",
22
+ "score": 25,
23
+ "domains": []
24
+ },
25
+ "tier4": {
26
+ "description": "Known fake news / misinformation sites (Vera Files blacklist)",
27
+ "score": 0,
28
+ "domains": [
29
+ "duterte.news", "pinoyakoblog.com", "filipinonewsalert.com",
30
+ "pilipinostar.com", "pinoytrending.net", "maharlikanews.com"
31
+ ]
32
+ }
33
+ }
evidence/__init__.py ADDED
File without changes
evidence/news_fetcher.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ PhilVerify — Evidence Retrieval Module
3
+ Fetches related articles from NewsAPI, computes cosine similarity,
4
+ and produces an evidence score for Layer 2 of the scoring engine.
5
+ """
6
+ import logging
7
+ import hashlib
8
+ from dataclasses import dataclass, field
9
+ from pathlib import Path
10
+ import json
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+ # Simple file-based cache to respect NewsAPI 100 req/day free tier limit
15
+ _CACHE_DIR = Path(__file__).parent.parent / ".cache" / "newsapi"
16
+ _CACHE_DIR.mkdir(parents=True, exist_ok=True)
17
+
18
+
19
@dataclass
class ArticleResult:
    """One candidate evidence article from NewsAPI, enriched with scoring fields."""

    title: str
    url: str
    description: str
    source_name: str
    published_at: str
    similarity: float = 0.0  # Cosine/Jaccard similarity to the claim, in [0, 1]
    stance: str = "Supports"/"Refutes"-style label; defaults below
    domain_tier: int = 3
29
+
30
+
31
@dataclass
class EvidenceResult:
    """Aggregate Layer-2 outcome computed from the retrieved articles."""

    verdict: str  # "Supported" | "Contradicted" | "Insufficient"
    evidence_score: float  # 0–100
    sources: list[ArticleResult] = field(default_factory=list)
    claim_used: str = ""  # Claim text actually sent to the evidence search
37
+
38
+
39
+ def _cache_key(claim: str) -> str:
40
+ return hashlib.md5(claim.lower().strip().encode()).hexdigest()
41
+
42
+
43
def _load_cache(key: str) -> list[dict] | None:
    """Return the cached NewsAPI payload for *key*, or None on miss/corruption."""
    cache_file = _CACHE_DIR / f"{key}.json"
    if not cache_file.exists():
        return None
    try:
        return json.loads(cache_file.read_text())
    except Exception:
        # Unreadable or corrupt cache files count as a miss.
        return None
51
+
52
+
53
def _save_cache(key: str, data: list[dict]) -> None:
    """Persist the NewsAPI payload for *key* to the file cache."""
    (_CACHE_DIR / f"{key}.json").write_text(json.dumps(data))
56
+
57
+
58
async def fetch_evidence(claim: str, api_key: str, max_results: int = 5) -> list[dict]:
    """Fetch top articles from NewsAPI for the given claim. Cached.

    Results are cached on disk, keyed by a hash of the normalized claim, to
    stay within the free-tier request quota. Returns [] when no API key is
    configured or the remote call fails.
    """
    cache_id = _cache_key(claim)
    hit = _load_cache(cache_id)
    if hit is not None:
        logger.info("NewsAPI cache hit for claim hash %s", cache_id[:8])
        return hit

    if not api_key:
        logger.warning("NEWS_API_KEY not set — returning empty evidence")
        return []

    try:
        from newsapi import NewsApiClient

        # Use first 100 chars of claim as query
        query = claim[:100]
        response = NewsApiClient(api_key=api_key).get_everything(
            q=query,
            language="en",
            sort_by="relevancy",
            page_size=max_results,
        )
        articles = response.get("articles", [])
        _save_cache(cache_id, articles)
        logger.info("NewsAPI returned %d articles for query '%s...'", len(articles), query[:30])
        return articles
    except Exception as e:
        # Best-effort: evidence absence must not break the whole pipeline.
        logger.warning("NewsAPI fetch error: %s", e)
        return []
88
+
89
+
90
# Lazily-created sentence-transformers model, shared across calls.
# The original code instantiated SentenceTransformer on EVERY call, paying
# the full model-load cost each time similarity was computed.
_SENTENCE_MODEL = None


def compute_similarity(claim: str, article_text: str) -> float:
    """
    Compute cosine similarity between claim and article using sentence-transformers.
    Falls back to simple word-overlap Jaccard similarity.

    Returns a float in [0, 1], rounded to 3 decimals. Only the first 512
    characters of the article are embedded.
    """
    global _SENTENCE_MODEL
    try:
        from sentence_transformers import SentenceTransformer, util

        if _SENTENCE_MODEL is None:
            # Load once and reuse; the model is stateless for encoding.
            _SENTENCE_MODEL = SentenceTransformer("all-MiniLM-L6-v2")
        emb_claim = _SENTENCE_MODEL.encode(claim, convert_to_tensor=True)
        emb_article = _SENTENCE_MODEL.encode(article_text[:512], convert_to_tensor=True)
        score = float(util.cos_sim(emb_claim, emb_article)[0][0])
        return round(max(0.0, min(1.0, score)), 3)
    except Exception:
        # Jaccard fallback (also taken when sentence-transformers is absent).
        a = set(claim.lower().split())
        b = set(article_text.lower().split())
        if not a or not b:
            return 0.0
        return round(len(a & b) / len(a | b), 3)
inputs/__init__.py ADDED
File without changes
inputs/asr.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ PhilVerify — Whisper ASR Module
3
+ Transcribes video/audio files using OpenAI Whisper.
4
+ Recommended model: large-v3 (best Filipino speech accuracy).
5
+ """
6
+ import io
7
+ import logging
8
+ import tempfile
9
+ import os
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
async def transcribe_video(media_bytes: bytes, filename: str = "upload") -> str:
    """
    Transcribe audio/video bytes using Whisper.

    Whisper only accepts file paths, so the bytes are spilled to a temporary
    file that is deleted afterwards. Returns the transcript, or "" when
    Whisper is unavailable or transcription fails.
    """
    try:
        import whisper
        from config import get_settings

        size = get_settings().whisper_model_size
        logger.info("Loading Whisper model: %s", size)
        model = whisper.load_model(size)

        # Preserve the upload's extension so ffmpeg can sniff the container.
        ext = os.path.splitext(filename)[-1] or ".mp4"
        with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as handle:
            handle.write(media_bytes)
            temp_path = handle.name

        try:
            result = model.transcribe(temp_path, language=None)  # Auto-detect language
            transcript = result.get("text", "").strip()
            logger.info("Whisper transcribed %d chars (lang=%s)", len(transcript), result.get("language"))
            return transcript
        finally:
            os.unlink(temp_path)  # Clean up temp file

    except ImportError:
        logger.warning("openai-whisper not installed — ASR unavailable")
        return ""
    except Exception as e:
        logger.error("Whisper transcription failed: %s", e)
        return ""
inputs/ocr.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ PhilVerify — OCR Module (Tesseract)
3
+ Extracts text from images using pytesseract.
4
+ Falls back gracefully if Tesseract not installed.
5
+ """
6
+ import io
7
+ import logging
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+ # Supported languages: Filipino (fil) + English (eng)
12
+ _TESSERACT_LANG = "fil+eng"
13
+
14
+
15
+ async def extract_text_from_image(image_bytes: bytes) -> str:
16
+ """
17
+ Run Tesseract OCR on image bytes. Returns extracted text string.
18
+ """
19
+ try:
20
+ import pytesseract
21
+ from PIL import Image
22
+
23
+ image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
24
+ text = pytesseract.image_to_string(image, lang=_TESSERACT_LANG)
25
+ text = text.strip()
26
+ logger.info("OCR extracted %d chars from image", len(text))
27
+ return text
28
+ except ImportError:
29
+ logger.warning("pytesseract / Pillow not installed — OCR unavailable")
30
+ return ""
31
+ except Exception as e:
32
+ logger.error("OCR failed: %s", e)
33
+ return ""
inputs/url_scraper.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ PhilVerify — URL Scraper (BeautifulSoup)
3
+ Extracts article text from news URLs. Respects robots.txt.
4
+ """
5
+ import logging
6
+ import re
7
+ from urllib.parse import urlparse
8
+ from urllib.robotparser import RobotFileParser
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+ _UNWANTED_TAGS = {"script", "style", "nav", "footer", "header", "aside", "figure", "figcaption"}
13
+
14
+
15
+ def _get_domain(url: str) -> str:
16
+ return urlparse(url).netloc.replace("www.", "")
17
+
18
+
19
+ def _robots_allow(url: str) -> bool:
20
+ try:
21
+ parsed = urlparse(url)
22
+ robots_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt"
23
+ rp = RobotFileParser()
24
+ rp.set_url(robots_url)
25
+ rp.read()
26
+ return rp.can_fetch("*", url)
27
+ except Exception:
28
+ return True # Allow by default if robots.txt fetch fails
29
+
30
+
31
async def scrape_url(url: str) -> tuple[str, str]:
    """
    Returns (article_text, domain).
    Raises ValueError if robots.txt disallows scraping.

    On any fetch/parse failure, returns ("", domain) rather than raising,
    so callers can surface a 422 instead of a crash.
    """
    domain = _get_domain(url)

    if not _robots_allow(url):
        logger.warning("robots.txt disallows scraping %s", url)
        raise ValueError(f"Scraping disallowed by robots.txt for {domain}")

    try:
        import httpx
        from bs4 import BeautifulSoup

        headers = {"User-Agent": "PhilVerifyBot/1.0 (fact-checking research)"}
        async with httpx.AsyncClient(timeout=15, follow_redirects=True) as client:
            response = await client.get(url, headers=headers)
            response.raise_for_status()

        soup = BeautifulSoup(response.text, "lxml")

        # Strip navigation/boilerplate elements before extracting paragraphs.
        for element in soup(list(_UNWANTED_TAGS)):
            element.decompose()

        # Prefer semantic containers; fall back to the whole body.
        container = soup.find("article") or soup.find("main") or soup.body
        if container is None:
            return "", domain

        pieces = [p.get_text(separator=" ", strip=True) for p in container.find_all("p")]
        article_text = re.sub(r"\s+", " ", " ".join(pieces)).strip()

        logger.info("Scraped %d chars from %s", len(article_text), domain)
        return article_text, domain

    except Exception as e:
        logger.error("URL scraping failed for %s: %s", url, e)
        return "", domain
main.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ PhilVerify — FastAPI Application Entry Point
3
+ Run: uvicorn main:app --reload --port 8000
4
+ Docs: http://localhost:8000/docs
5
+ """
6
+ import logging
7
+ import os
8
+ from contextlib import asynccontextmanager
9
+
10
+ from fastapi import FastAPI, Request, status
11
+ from fastapi.middleware.cors import CORSMiddleware
12
+ from fastapi.responses import JSONResponse
13
+
14
+ from config import get_settings
15
+ from api.routes.verify import router as verify_router
16
+ from api.routes.history import router as history_router
17
+ from api.routes.trends import router as trends_router
18
+
19
+ # ── Logging ───────────────────────────────────────────────────────────────────
20
+ logging.basicConfig(
21
+ level=getattr(logging, get_settings().log_level.upper(), logging.INFO),
22
+ format="%(asctime)s | %(levelname)-8s | %(name)s | %(message)s",
23
+ )
24
+ logger = logging.getLogger("philverify")
25
+
26
+
27
+ # ── Lifespan (startup / shutdown) ─────────────────────────────────────────────
28
+
29
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Warm up NLP models on startup so first request isn't slow.

    Loaded objects are parked on app.state so route handlers can reuse them
    without re-loading per request.
    """
    logger.info("🚀 PhilVerify starting up...")
    try:
        # Lazy-import to avoid crashing if heavy deps not yet installed
        from nlp.language_detector import LanguageDetector
        from nlp.preprocessor import TextPreprocessor
        from ml.tfidf_classifier import TFIDFClassifier

        app.state.preprocessor = TextPreprocessor()
        app.state.language_detector = LanguageDetector()
        classifier = TFIDFClassifier()
        classifier.train()  # Trains on seed dataset if model not persisted
        app.state.classifier = classifier

        logger.info("✅ NLP models ready")
    except ImportError as e:
        # Missing optional NLP dependencies downgrade to stub behavior
        # instead of preventing the API from booting.
        logger.warning("⚠️ Some NLP modules not installed yet: %s — stubs will be used", e)

    yield  # ── App is running ──

    logger.info("👋 PhilVerify shutting down")
52
+
53
+
54
+ # ── App ───────────────────────────────────────────────────────────────────────
55
+
56
+ settings = get_settings()
57
+
58
+ app = FastAPI(
59
+ title="PhilVerify API",
60
+ description=(
61
+ "Multimodal fake news detection for Philippine social media. "
62
+ "Supports text, URL, image (OCR), and video (Whisper ASR) inputs."
63
+ ),
64
+ version="0.1.0",
65
+ docs_url="/docs",
66
+ redoc_url="/redoc",
67
+ lifespan=lifespan,
68
+ )
69
+
70
+
71
+ # ── CORS ──────────────────────────────────────────────────────────────────────
72
+
73
+ app.add_middleware(
74
+ CORSMiddleware,
75
+ allow_origins=settings.allowed_origins,
76
+ allow_credentials=True,
77
+ allow_methods=["*"],
78
+ allow_headers=["*"],
79
+ )
80
+
81
+
82
+ # ── Global Error Handler ──────────────────────────────────────────────────────
83
+
84
@app.exception_handler(Exception)
async def global_exception_handler(request: Request, exc: Exception):
    """Last-resort handler: log any uncaught error and return a generic 500."""
    logger.exception("Unhandled error on %s %s: %s", request.method, request.url.path, exc)
    payload = {"error": "Internal server error", "detail": str(exc)}
    return JSONResponse(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, content=payload)
91
+
92
+
93
+ # ── Routers ───────────────────────────────────────────────────────────────────
94
+
95
+ app.include_router(verify_router)
96
+ app.include_router(history_router)
97
+ app.include_router(trends_router)
98
+
99
+
100
+ # ── Health ────────────────────────────────────────────────────────────────────
101
+
102
+ @app.get("/", tags=["Health"])
103
+ async def root():
104
+ return {
105
+ "service": "PhilVerify",
106
+ "version": "0.1.0",
107
+ "status": "operational",
108
+ "docs": "/docs",
109
+ }
110
+
111
+
112
+ @app.get("/health", tags=["Health"])
113
+ async def health():
114
+ return {"status": "ok", "env": settings.app_env}
115
+
116
+
117
+ # ── Dev runner ────────────────────────────────────────────────────────────────
118
+
119
+ if __name__ == "__main__":
120
+ import uvicorn
121
+ uvicorn.run(
122
+ "main:app",
123
+ host="0.0.0.0",
124
+ port=int(os.getenv("PORT", 8000)),
125
+ reload=settings.debug,
126
+ log_level=settings.log_level.lower(),
127
+ )
ml/__init__.py ADDED
File without changes
ml/tfidf_classifier.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ PhilVerify — TF-IDF + Logistic Regression Baseline Classifier (Layer 1)
3
+ Seed dataset of 30 labeled PH news headlines (10 per class).
4
+ Replaced by fine-tuned XLM-RoBERTa in Phase 10.
5
+ """
6
+ import os
7
+ import logging
8
+ import pickle
9
+ from dataclasses import dataclass, field
10
+ from pathlib import Path
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+ MODEL_PATH = Path(__file__).parent / "models" / "tfidf_model.pkl"
15
+
16
# ── Seed dataset (30 samples — 10 per class) ──────────────────────────────────
# Labels: 0=Credible, 1=Unverified, 2=Fake
# (Integer labels map to verdict strings via TFIDFClassifier._LABELS.)
SEED_DATA = [
    # Credible (0)
    ("DOH reports 500 new COVID-19 cases as vaccination drive continues in Metro Manila", 0),
    ("Rappler: Supreme Court upholds Comelec ruling on disqualification case", 0),
    ("GMA News: PNP arrests 12 suspects in Bulacan drug bust", 0),
    ("Philippine Star: GDP growth slows to 5.3% in Q3 says BSP", 0),
    ("Inquirer: Senate passes revised anti-terrorism bill on third reading", 0),
    ("Manila Bulletin: Typhoon Carina leaves P2B damage in Isabela province", 0),
    ("ABS-CBN News: Marcos signs executive order on agricultural modernization", 0),
    ("DOF confirms revenue collection targets met for fiscal year 2025", 0),
    ("DSWD distributes relief packs to 10,000 families in Cotabato", 0),
    ("PhilStar: Meralco rate hike of P0.18 per kilowatt-hour approved by ERC", 0),

    # Unverified (1)
    ("SHOCKING: Politician caught taking selfie during Senate hearing", 1),
    ("VIRAL: Celebrity spotted at secret meeting with government official", 1),
    ("BREAKING: 'Anonymous source' says president planning cabinet reshuffle", 1),
    ("Rumor has it: New tax policy to affect OFW remittances starting 2026", 1),
    ("CLAIM: Government hiding true COVID-19 death count from public", 1),
    ("Unconfirmed: Military says there are 500 rebels still in Mindanao", 1),
    ("REPORT: Certain barangay officials accepting bribes according to residents", 1),
    ("Alleged: Shipment of smuggled goods found in Manila port last week", 1),
    ("CLAIM: New mandatory vaccine policy for all government employees", 1),
    ("Source says: Manila Water to increase rates by 20% next month", 1),

    # Fake (2)
    ("GRABE! Namatay daw ang tatlong tao sa bagong sakit na kumakalat sa Pilipinas!", 2),
    ("TOTOO BA? Marcos nagsabi na libreng kuryente na simula bukas!", 2),
    ("SHOCKING TRUTH: Bill Gates microchip found in COVID vaccine in Cebu!", 2),
    ("WATCH: Senator caught stealing money in Senate vault - full video", 2),
    ("CONFIRMED: Philippines to become 51st state of the United States in 2026!", 2),
    ("KATOTOHANAN: DOH secretly poisoning water supply to control population", 2),
    ("EXPOSED: Duterte has secret family in Davao that government is hiding", 2),
    ("100% TOTOO: Garlic cures COVID-19, doctors don't want you to know this!", 2),
    ("GALING NG PILIPINAS: Filipino scientist discovers cure for cancer, suppressed by big pharma", 2),
    ("BREAKING: Entire Luzon to experience 3-day total blackout next week, says NGCP", 2),
]
55
+
56
+
57
@dataclass
class Layer1Result:
    """Outcome of the Layer-1 (baseline TF-IDF) classification pass."""
    verdict: str  # "Credible" | "Unverified" | "Likely Fake" (see TFIDFClassifier._LABELS)
    confidence: float  # 0.0 – 100.0 (max class probability × 100)
    triggered_features: list[str] = field(default_factory=list)  # top TF-IDF terms present in the input
62
+
63
+
64
class TFIDFClassifier:
    """
    TF-IDF + Logistic Regression baseline (Layer 1).

    train() fits on the 30-headline seed dataset and persists the fitted
    vectorizer + classifier to MODEL_PATH; if an artifact already exists it
    is loaded instead. predict() lazily triggers train() on first use.
    """

    # Mapping from integer class labels (see SEED_DATA) to verdict strings.
    _LABELS = {0: "Credible", 1: "Unverified", 2: "Likely Fake"}

    def __init__(self):
        # Both stay None until train()/_load() runs (lazy initialization).
        self._vectorizer = None
        self._clf = None

    def train(self) -> None:
        """Fit on seed data. Skips training if persisted model exists."""
        if MODEL_PATH.exists():
            self._load()
            return

        # Imported lazily so the module imports cleanly without sklearn.
        from sklearn.feature_extraction.text import TfidfVectorizer
        from sklearn.linear_model import LogisticRegression

        texts, labels = zip(*SEED_DATA)
        self._vectorizer = TfidfVectorizer(
            ngram_range=(1, 2),  # unigrams + bigrams
            max_features=1000,
            sublinear_tf=True,  # 1 + log(tf) damping
        )
        X = self._vectorizer.fit_transform(texts)
        # Fixed random_state keeps training reproducible on the tiny seed set.
        self._clf = LogisticRegression(max_iter=500, C=1.0, random_state=42)
        self._clf.fit(X, labels)

        MODEL_PATH.parent.mkdir(parents=True, exist_ok=True)
        with open(MODEL_PATH, "wb") as f:
            pickle.dump({"vectorizer": self._vectorizer, "clf": self._clf}, f)
        logger.info("TF-IDF model trained and saved to %s", MODEL_PATH)

    def _load(self) -> None:
        """Restore the persisted {vectorizer, clf} dict from MODEL_PATH.

        NOTE(review): pickle.load executes code embedded in the file — safe
        only because the artifact is produced locally by train(); never point
        MODEL_PATH at untrusted data.
        """
        with open(MODEL_PATH, "rb") as f:
            data = pickle.load(f)
        self._vectorizer = data["vectorizer"]
        self._clf = data["clf"]
        logger.info("TF-IDF model loaded from %s", MODEL_PATH)

    def predict(self, text: str) -> Layer1Result:
        """Classify *text*; returns verdict, confidence (%) and top terms."""
        # Lazy bootstrap: the first call trains (or loads) the model.
        if self._vectorizer is None:
            self.train()

        X = self._vectorizer.transform([text])
        pred_label = int(self._clf.predict(X)[0])
        proba = self._clf.predict_proba(X)[0]
        confidence = round(float(max(proba)) * 100, 1)
        verdict = self._LABELS[pred_label]

        # Extract top TF-IDF features as human-readable triggers
        feature_names = self._vectorizer.get_feature_names_out()
        tfidf_scores = X.toarray()[0]
        top_indices = tfidf_scores.argsort()[-5:][::-1]
        triggered = [feature_names[i] for i in top_indices if tfidf_scores[i] > 0]

        return Layer1Result(
            verdict=verdict,
            confidence=confidence,
            triggered_features=triggered,
        )
nlp/__init__.py ADDED
File without changes
nlp/claim_extractor.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ PhilVerify — Claim Extractor
3
+ Extracts the key falsifiable claim from noisy social media text.
4
+ Primary: HuggingFace summarization (t5-small)
5
+ Fallback: First 2 sentence heuristic
6
+ """
7
+ import re
8
+ import logging
9
+ from dataclasses import dataclass
10
+
11
logger = logging.getLogger(__name__)

# Sentence boundary: split after ., ! or ? followed by whitespace.
_SENTENCE_SPLIT = re.compile(r"(?<=[.!?])\s+")


@dataclass
class ClaimResult:
    """A single extracted claim plus the strategy that produced it."""
    claim: str
    method: str  # "summarization" | "sentence_heuristic" | "passthrough"


class ClaimExtractor:
    """
    Extracts the single most falsifiable claim from input text.
    This claim is then sent to the NewsAPI evidence retrieval step.

    Primary: HuggingFace summarization (distilbart) with a task prefix that
    biases the model toward extracting assertions rather than summaries.
    Fallback: first 1-2 long-enough sentences.
    """

    _TASK_PREFIX = "Extract the main factual claim: "

    def __init__(self):
        self._pipe = None     # summarization pipeline, or None if unavailable
        self._loaded = False  # guards the one-time lazy model load

    def _load_model(self):
        """Lazily load the summarization pipeline; degrade gracefully."""
        if self._loaded:
            return
        try:
            from transformers import pipeline
            self._pipe = pipeline(
                "summarization",
                model="sshleifer/distilbart-cnn-6-6",
                max_length=80,
                min_length=10,
                do_sample=False,
            )
            logger.info("Claim extractor model loaded (distilbart-cnn-6-6)")
        except Exception as e:
            logger.warning("Summarization model not available (%s) — using sentence heuristic", e)
        self._loaded = True

    def _sentence_heuristic(self, text: str) -> str:
        """Return the first 1-2 sentences as the claim (fast fallback)."""
        sentences = _SENTENCE_SPLIT.split(text.strip())
        candidates = [s.strip() for s in sentences if len(s.strip()) > 20]
        if not candidates:
            return text[:200].strip()
        return " ".join(candidates[:2])

    def extract(self, text: str) -> ClaimResult:
        """Extract the key claim from *text*.

        Empty/None or very short input is returned unchanged with
        method="passthrough".
        """
        self._load_model()

        # Guard: None or too-short input — nothing falsifiable to extract.
        # FIX: the original called text.strip() here, which raised
        # AttributeError when text was None.
        if not text or len(text.strip()) < 20:
            return ClaimResult(claim=(text or "").strip(), method="passthrough")

        if self._pipe:
            try:
                input_text = self._TASK_PREFIX + text[:1024]
                out = self._pipe(input_text, truncation=True)
                claim = out[0]["summary_text"].strip()
                # Strip the task prefix echo if model includes it
                claim = re.sub(r"^extract the main factual claim:?\s*", "", claim, flags=re.I)
                if len(claim) > 15:
                    return ClaimResult(claim=claim, method="summarization")
            except Exception as e:
                logger.warning("Summarization inference error: %s", e)

        return ClaimResult(
            claim=self._sentence_heuristic(text),
            method="sentence_heuristic",
        )
nlp/clickbait.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ PhilVerify — Clickbait Detector
3
+ Detects clickbait patterns common in Philippine fake news / viral content.
4
+ Uses regex patterns + feature flags (no model needed).
5
+ """
6
+ import re
7
+ from dataclasses import dataclass, field
8
+
9
# ── Pattern library ───────────────────────────────────────────────────────────
_CLICKBAIT_PHRASES_EN = [
    r"\byou won'?t believe\b", r"\bshocking\b", r"\bviral\b", r"\bbreaking\b",
    r"\bexclusive\b", r"\bmust[\s-]?see\b", r"\bsecret\b", r"\bconfirmed\b",
    r"\bexposed\b", r"\bscandal\b", r"\bunbelievable\b", r"\bmiraculous?\b",
    r"\bhoax\b", r"\bfact[\s-]?check\b", r"\bthis is why\b", r"\bwatch this\b",
]
_CLICKBAIT_PHRASES_TL = [
    r"\bgrabe\b", r"\bwow\b", r"\bsurprise\b", r"\bshocking\b", r"\btrending\b",
    r"\bselo\b", r"\bbalita\b", r"\bnatuklasan\b", r"\bnahuli\b", r"\bsikat\b",
    r"\bpakinggan\b", r"\bpanoorin\b", r"\bkumpirmado\b", r"\bkatotohanan\b",
]

_CAPS_WORD = re.compile(r"\b[A-Z]{2,}\b")
_EXCESSIVE_PUNCT = re.compile(r"[!?]{2,}")
_NUMBER_BAIT = re.compile(r"\b\d+\s+(?:reasons?|things?|ways?|tips?|signs?|bagay)\b", re.I)
_QUESTION_BAIT = re.compile(r"\b(?:ano|bakit|paano|kailan|sino|saan)\b.*\?", re.I)
# FIX: "shocking" appears in both the EN and TL lists; dedupe (order-preserving)
# so a single occurrence is not double-counted in the score.
_ALL_PHRASES = [
    re.compile(p, re.IGNORECASE)
    for p in dict.fromkeys(_CLICKBAIT_PHRASES_EN + _CLICKBAIT_PHRASES_TL)
]


@dataclass
class ClickbaitResult:
    """Outcome of one clickbait scan."""
    is_clickbait: bool  # True when score >= 0.4
    score: float  # 0.0 – 1.0 (capped sum of feature weights)
    triggered_patterns: list[str] = field(default_factory=list)


class ClickbaitDetector:
    """
    Feature-flag based clickbait detector optimized for PH social media.
    Returns a continuous score based on how many patterns are triggered.
    """

    # Per-feature contribution to the final score; each phrase hit adds 0.25.
    _WEIGHTS = {
        "excessive_punctuation": 0.20,
        "all_caps_words": 0.20,
        "number_bait": 0.15,
        "question_bait": 0.10,
        "title_too_short": 0.05,
        "title_very_long": 0.05,
    }

    def detect(self, text: str) -> ClickbaitResult:
        """Scan *text* and return a score plus human-readable triggers."""
        triggered: list[str] = []

        # ALL CAPS words (2+ in a short span)
        caps_words = _CAPS_WORD.findall(text)
        if len(caps_words) >= 2:
            triggered.append(f"all_caps_words: {caps_words[:3]}")

        # Excessive punctuation !! ???
        if _EXCESSIVE_PUNCT.search(text):
            triggered.append("excessive_punctuation")

        # Number-based bait: "5 reasons why..."
        if _NUMBER_BAIT.search(text):
            triggered.append("number_bait")

        # Rhetorical question bait (Tagalog)
        if _QUESTION_BAIT.search(text):
            triggered.append("question_bait")

        # Title length signal (extremely short or extremely long)
        word_count = len(text.split())
        if word_count < 5:
            triggered.append("title_too_short")
        elif word_count > 30:
            triggered.append("title_very_long")

        # Known clickbait phrases (EN + TL, deduplicated at compile time)
        for pattern in _ALL_PHRASES:
            m = pattern.search(text)
            if m:
                triggered.append(f"clickbait_phrase: '{m.group(0)}'")

        # Score: each structural feature contributes its weight once;
        # every phrase hit contributes a flat 0.25.
        score = 0.0
        for feat in triggered:
            for key, w in self._WEIGHTS.items():
                if feat.startswith(key):
                    score += w
                    break
            else:
                if feat.startswith("clickbait_phrase"):
                    score += 0.25

        score = min(score, 1.0)
        return ClickbaitResult(
            is_clickbait=score >= 0.4,
            score=round(score, 3),
            triggered_patterns=triggered,
        )
nlp/language_detector.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ PhilVerify — Language Detector
3
+ Detects Tagalog / English / Taglish using langdetect + Filipino stopword ratio heuristic.
4
+ No heavy model needed — runs instantly.
5
+ """
6
+ import re
7
+ import logging
8
+ from dataclasses import dataclass
9
+
10
logger = logging.getLogger(__name__)

# ── Filipino stopword set for heuristic ───────────────────────────────────────
_TL_MARKERS = {
    "ang", "ng", "na", "sa", "at", "ay", "mga", "ni", "nang", "si",
    "ko", "mo", "siya", "kami", "kayo", "sila", "ito", "raw", "daw",
    "ba", "po", "din", "rin", "naman", "lang", "kaya", "dahil", "kung",
    "pero", "kapag", "talaga", "pala", "sana", "grabe", "wala", "hindi",
    "may", "mayroon", "bakit", "paano", "kailan", "nasaan", "sino",
}

# English marker words (distinct from TL)
_EN_MARKERS = {
    "the", "and", "is", "are", "was", "were", "this", "that", "with",
    "from", "have", "has", "had", "will", "would", "could", "should",
    "not", "been", "being", "they", "their", "there",
}

# Word tokenizer shared by the ratio computation.
_WORD_RE = re.compile(r"\b\w+\b")


@dataclass
class LanguageResult:
    """Detected language plus the marker-word ratios that drove the call."""
    language: str  # "Tagalog" | "English" | "Taglish" | "Unknown"
    confidence: float  # 0.0 – 1.0
    tl_ratio: float  # fraction of tokens that are Filipino markers
    en_ratio: float  # fraction of tokens that are English markers
    method: str  # "heuristic" | "langdetect" | "combined"


class LanguageDetector:
    """
    Two-pass language detector:
      Pass 1 — Filipino/English marker-word ratios (fast, handles code-switching)
      Pass 2 — langdetect (confirmation when the ratios are ambiguous)

    Decision rules (as implemented):
      tl_ratio >= 0.25 and en_ratio < 0.15 → Tagalog
      en_ratio >= 0.25 and tl_ratio < 0.15 → English
      tl_ratio >= 0.10 and en_ratio >= 0.10 → Taglish
      otherwise → langdetect result, defaulting to Taglish at 0.4 confidence
    """

    def _token_ratios(self, text: str) -> tuple[float, float]:
        """Return (tl_ratio, en_ratio) of marker words among all tokens."""
        words = _WORD_RE.findall(text.lower())
        if not words:
            return 0.0, 0.0
        n = len(words)
        tl_hits = sum(w in _TL_MARKERS for w in words)
        en_hits = sum(w in _EN_MARKERS for w in words)
        return tl_hits / n, en_hits / n

    def _langdetect(self, text: str) -> str:
        """Map langdetect's ISO code to our label; 'Unknown' on any failure."""
        try:
            from langdetect import detect
            code = detect(text)
        except Exception:
            return "Unknown"
        # langdetect returns 'tl' for Tagalog
        return {"tl": "Tagalog", "en": "English"}.get(code, "Unknown")

    def detect(self, text: str) -> LanguageResult:
        """Classify *text* as Tagalog / English / Taglish / Unknown."""
        if not text or len(text.strip()) < 5:
            return LanguageResult("Unknown", 0.0, 0.0, 0.0, "heuristic")

        tl, en = self._token_ratios(text)

        if tl >= 0.25 and en < 0.15:  # dominant Filipino markers
            return LanguageResult("Tagalog", tl, tl, en, "heuristic")
        if en >= 0.25 and tl < 0.15:  # dominant English markers
            return LanguageResult("English", en, tl, en, "heuristic")
        if tl >= 0.10 and en >= 0.10:  # both present → code-switching
            return LanguageResult("Taglish", (tl + en) / 2, tl, en, "heuristic")

        # Ambiguous — ask langdetect before defaulting.
        guess = self._langdetect(text)
        if guess != "Unknown":
            return LanguageResult(guess, max(tl, en, 0.5), tl, en, "langdetect")
        return LanguageResult("Taglish", 0.4, tl, en, "combined")
nlp/ner.py ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ PhilVerify — Named Entity Recognition
3
+ Extracts persons, organizations, locations, and dates from text.
4
+ Uses spaCy en_core_web_sm with graceful fallback if model not installed.
5
+ """
6
+ import logging
7
+ import re
8
+ from dataclasses import dataclass, field
9
+
10
logger = logging.getLogger(__name__)

# Philippine-specific named entity hints (lowercase; matched as whole words)
_PH_PERSONS = {
    "marcos", "duterte", "aquino", "robredo", "lacson", "pingping",
    "bongbong", "sara", "panelo", "roque", "calida", "ano", "teodoro",
}
_PH_ORGS = {
    "doh", "deped", "dilg", "dfa", "dof", "dswd", "ched", "nbi", "pnp",
    "afp", "comelec", "sandiganbayan", "ombudsman", "pcso", "pagcor",
    "senate", "congress", "supreme court", "malacanang",
}
_PH_LOCATIONS = {
    "manila", "quezon city", "makati", "pasig", "taguig", "cebu",
    "davao", "mindanao", "luzon", "visayas", "palawan", "boracay",
    "batangas", "laguna", "cavite", "rizal", "bulacan", "pampanga",
    "metro manila", "ncr", "philippines", "pilipinas",
}

# Date patterns: "February 2026", "Feb... 24, 2026", "2026-02-24", "2/24/26"
_DATE_PATTERNS = [
    r"\b(?:January|February|March|April|May|June|July|August|September|October|November|December)"
    r"(?:\s+\d{1,2})?,?\s+\d{4}\b",
    r"\b\d{4}-\d{2}-\d{2}\b",
    r"\b\d{1,2}/\d{1,2}/\d{2,4}\b",
]


def _contains_term(term: str, text: str) -> bool:
    """True if *term* occurs in *text* as a whole word / phrase.

    FIX: the original used plain substring containment, so e.g. the person
    hint "ano" fired inside ordinary Tagalog words such as "paano". Word
    boundaries also work correctly for multi-word hints ("supreme court").
    """
    return re.search(rf"\b{re.escape(term)}\b", text) is not None


@dataclass
class NERResult:
    """Entities extracted from one text, grouped by type."""
    persons: list[str] = field(default_factory=list)
    organizations: list[str] = field(default_factory=list)
    locations: list[str] = field(default_factory=list)
    dates: list[str] = field(default_factory=list)
    method: str = "spacy"  # "spacy" | "hints"

    def to_dict(self) -> dict:
        """Serializable view (method intentionally omitted)."""
        return {
            "persons": self.persons,
            "organizations": self.organizations,
            "locations": self.locations,
            "dates": self.dates,
        }


class EntityExtractor:
    """
    NER using spaCy (en_core_web_sm) + Philippine entity hint layer.
    Falls back to hint-list + regex-based date extraction if spaCy is not
    installed or fails at inference time.
    """

    def __init__(self):
        self._nlp = None     # spaCy pipeline, or None when unavailable
        self._loaded = False  # guards the one-time lazy model load

    def _load_model(self):
        """Lazily load spaCy; on failure fall back to hint-based NER."""
        if self._loaded:
            return
        try:
            import spacy
            self._nlp = spacy.load("en_core_web_sm")
            logger.info("spaCy en_core_web_sm loaded")
        except Exception as e:
            logger.warning("spaCy not available (%s) — using hint-based NER", e)
            self._nlp = None
        self._loaded = True

    def _hint_based_extract(self, text: str) -> NERResult:
        """Fallback: whole-word match against PH hint lists + date regex."""
        lower = text.lower()
        result = NERResult(method="hints")

        result.persons = [p.title() for p in _PH_PERSONS if _contains_term(p, lower)]
        result.organizations = [o.upper() for o in _PH_ORGS if _contains_term(o, lower)]
        result.locations = [loc.title() for loc in _PH_LOCATIONS if _contains_term(loc, lower)]

        for pattern in _DATE_PATTERNS:
            result.dates.extend(re.findall(pattern, text, re.IGNORECASE))

        return result

    def extract(self, text: str) -> NERResult:
        """Extract persons/orgs/locations/dates; spaCy first, hints fallback."""
        self._load_model()

        if self._nlp is None:
            return self._hint_based_extract(text)

        try:
            doc = self._nlp(text[:5000])  # spaCy has a token limit
            result = NERResult(method="spacy")

            for ent in doc.ents:
                ent_text = ent.text.strip()
                if ent.label_ == "PERSON":
                    result.persons.append(ent_text)
                elif ent.label_ in ("ORG", "NORP"):
                    result.organizations.append(ent_text)
                elif ent.label_ in ("GPE", "LOC"):
                    result.locations.append(ent_text)
                elif ent.label_ in ("DATE", "TIME"):
                    result.dates.append(ent_text)

            # Deduplicate while preserving order
            result.persons = list(dict.fromkeys(result.persons))
            result.organizations = list(dict.fromkeys(result.organizations))
            result.locations = list(dict.fromkeys(result.locations))
            result.dates = list(dict.fromkeys(result.dates))

            # Supplement with PH hints for entities spaCy may miss
            hint_result = self._hint_based_extract(text)
            for p in hint_result.persons:
                if p not in result.persons:
                    result.persons.append(p)
            for o in hint_result.organizations:
                if o not in result.organizations:
                    result.organizations.append(o)

            return result
        except Exception as e:
            logger.warning("spaCy extraction error: %s — falling back to hints", e)
            return self._hint_based_extract(text)
nlp/preprocessor.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ PhilVerify — Text Preprocessor
3
+ Handles cleaning, tokenizing, and normalizing Filipino/English/Taglish text.
4
+ """
5
+ import re
6
+ import string
7
+ import unicodedata
8
+ from dataclasses import dataclass, field
9
+
10
# ── Filipino + English stopwords ──────────────────────────────────────────────
TAGALOG_STOPWORDS = {
    "ang", "ng", "na", "sa", "at", "ay", "mga", "ni", "nang", "si",
    "ko", "mo", "siya", "kami", "kayo", "sila", "ito", "iyon", "iyan",
    "dito", "doon", "diyan", "nito", "noon", "niyan", "rin", "din", "pa",
    "lang", "lamang", "nga", "naman", "kaya", "pero", "dahil", "kung",
    "kapag", "habang", "bilang", "upang", "para", "mula", "hanggang",
    "ayon", "sinabi", "raw", "daw", "ba", "po", "ho", "oh", "oo",
    "hindi", "wala", "may", "mayroon", "talaga", "pala", "sana",
}

ENGLISH_STOPWORDS = {
    "a", "an", "the", "and", "or", "but", "in", "on", "at", "to",
    "for", "of", "with", "by", "from", "is", "are", "was", "were",
    "be", "been", "being", "have", "has", "had", "do", "does", "did",
    "will", "would", "could", "should", "may", "might", "shall", "can",
    "not", "no", "nor", "so", "yet", "both", "either", "neither",
    "this", "that", "these", "those", "it", "its", "i", "me", "my",
    "we", "our", "you", "your", "they", "their", "he", "his", "she", "her",
}

ALL_STOPWORDS = TAGALOG_STOPWORDS | ENGLISH_STOPWORDS

# ── Patterns ──────────────────────────────────────────────────────────────────
_RE_URL = re.compile(
    r"http[s]?://(?:[a-zA-Z]|[0-9]|[$\-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
)
_RE_HTML_TAG = re.compile(r"<[^>]+>")
_RE_MENTION = re.compile(r"@\w+")
_RE_HASHTAG = re.compile(r"#\w+")
_RE_CHAR_RUN = re.compile(r"(.)\1{2,}")     # 3+ repeats of one character
_RE_PUNCT_RUN = re.compile(r"([!?.]){2,}")  # "!!!", "??", "..."
_RE_SPACES = re.compile(r"\s+")

# Every punctuation char except the apostrophe maps to a space.
_PUNCT_TO_SPACE = str.maketrans({c: " " for c in string.punctuation if c != "'"})


def _strip_emoji(text: str) -> str:
    """Remove emoji/symbols (Unicode category So) and combining marks (Mn).

    NOTE: dropping Mn also strips combining diacritics from decomposed
    (NFD) text — same trade-off as always.
    """
    kept = []
    for ch in text:
        cat = unicodedata.category(ch)
        if cat.startswith("So") or cat == "Mn":
            continue
        kept.append(ch)
    return "".join(kept)


@dataclass
class PreprocessResult:
    """Output of TextPreprocessor.preprocess — every intermediate stage."""
    original: str  # raw input, untouched
    cleaned: str  # after structural cleaning (steps 1-6)
    normalized: str  # after character normalization (steps 7-8)
    tokens: list[str] = field(default_factory=list)  # whitespace tokens
    filtered_tokens: list[str] = field(default_factory=list)  # tokens minus stopwords
    char_count: int = 0  # len(normalized)
    word_count: int = 0  # len(tokens)


class TextPreprocessor:
    """
    Multi-step cleaner for Tagalog / English / Taglish social-media text.

    preprocess() runs, in order: HTML-tag and URL stripping, @mention
    removal, hashtag unwrapping (#Tag → Tag), emoji removal, lowercasing,
    repeated-character and punctuation-run collapsing, punctuation removal
    (apostrophes kept), whitespace tokenization, and EN+TL stopword
    filtering.
    """

    def clean(self, text: str) -> str:
        """Structural cleanup: markup, links, handles, emoji; lowercase."""
        out = _RE_HTML_TAG.sub(" ", text)
        out = _RE_URL.sub(" ", out)
        out = _RE_MENTION.sub(" ", out)
        out = _RE_HASHTAG.sub(lambda m: m.group(0)[1:], out)  # keep word, drop '#'
        out = _strip_emoji(out).lower()
        return _RE_SPACES.sub(" ", out).strip()

    def normalize(self, text: str) -> str:
        """Character-level normalization: squash repeats, drop punctuation."""
        out = _RE_CHAR_RUN.sub(r"\1\1", text)  # "graaabe" → "graabe" (3+ → 2)
        out = _RE_PUNCT_RUN.sub(r"\1", out)    # "!!!" → "!"
        out = out.translate(_PUNCT_TO_SPACE)   # apostrophes kept ('di, 'yung)
        return _RE_SPACES.sub(" ", out).strip()

    def tokenize(self, text: str) -> list[str]:
        """Whitespace split; single-character tokens are discarded."""
        return [tok for tok in text.split() if len(tok) > 1]

    def remove_stopwords(self, tokens: list[str]) -> list[str]:
        """Drop English and Tagalog stopwords."""
        return [tok for tok in tokens if tok not in ALL_STOPWORDS]

    def preprocess(self, text: str) -> PreprocessResult:
        """Run the full pipeline and return a structured result."""
        cleaned = self.clean(text)
        normalized = self.normalize(cleaned)
        tokens = self.tokenize(normalized)
        return PreprocessResult(
            original=text,
            cleaned=cleaned,
            normalized=normalized,
            tokens=tokens,
            filtered_tokens=self.remove_stopwords(tokens),
            char_count=len(normalized),
            word_count=len(tokens),
        )
nlp/sentiment.py ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ PhilVerify — Sentiment & Emotion Analyzer
3
+ Uses HuggingFace transformers with graceful fallback to lexicon-based scoring.
4
+ """
5
+ import logging
6
+ from dataclasses import dataclass
7
+
8
logger = logging.getLogger(__name__)

# ── Simple lexicons for fallback ──────────────────────────────────────────────
_NEGATIVE_WORDS = {
    "fake", "false", "lie", "liar", "hoax", "scam", "fraud", "corrupt",
    "criminal", "illegal", "murder", "die", "death", "dead", "kill",
    "patay", "namatay", "peke", "sinungaling", "magnanakaw",
    "kasamaan", "krimen", "karahasan", "pandemic", "sakit", "epidemya",
    "grabe", "nakakatakot", "nakakainis", "nakakagalit", "kahiya",
}
_POSITIVE_WORDS = {
    "good", "great", "excellent", "amazing", "wonderful", "positive",
    "success", "win", "victory", "help", "support", "safe", "free",
    "maganda", "magaling", "mahusay", "maayos", "tagumpay", "ligtas",
    "masaya", "mabuti", "mahalaga", "mahal", "salamat", "pagbabago",
}
_FEAR_WORDS = {
    "takot", "fear", "scared", "afraid", "terror", "danger", "dangerous",
    "banta", "panganib", "nakakatakot", "kalamidad", "lindol",
}
_ANGER_WORDS = {
    "galit", "angry", "anger", "furious", "rage", "outrage", "poot",
    "nakakagalit", "nakakaasar", "sumpain", "putang", "gago",
}


@dataclass
class SentimentResult:
    """Sentiment polarity + dominant emotion for one text."""
    sentiment: str  # positive | negative | neutral | high positive | high negative
    sentiment_score: float  # -1.0 to 1.0 (sign follows polarity)
    emotion: str  # anger | fear | joy | sadness | neutral
    emotion_score: float  # 0.0 to 1.0
    method: str  # "transformer" | "lexicon"


class SentimentAnalyzer:
    """
    Two-strategy sentiment analysis:
      Primary — cardiffnlp/twitter-roberta-base-sentiment-latest (social media)
      Fallback — lexicon-based word counting (EN + TL word sets above)
    """

    def __init__(self):
        self._sentiment_pipe = None
        self._emotion_pipe = None
        self._loaded = False  # guards the one-time lazy model load

    def _load_models(self):
        """Lazily load the HF pipelines; silently degrade to the lexicon."""
        if self._loaded:
            return
        try:
            from transformers import pipeline
            self._sentiment_pipe = pipeline(
                "text-classification",
                model="cardiffnlp/twitter-roberta-base-sentiment-latest",
                top_k=1,
            )
            self._emotion_pipe = pipeline(
                "text-classification",
                model="j-hartmann/emotion-english-distilroberta-base",
                top_k=1,
            )
            logger.info("Sentiment / emotion models loaded")
        except Exception as e:
            logger.warning("Transformer models not available (%s) — using lexicon fallback", e)
        self._loaded = True

    def _lexicon_analyze(self, text: str) -> SentimentResult:
        """Count lexicon hits and map them onto sentiment/emotion labels."""
        vocab = set(text.lower().split())
        neg_hits = len(vocab & _NEGATIVE_WORDS)
        pos_hits = len(vocab & _POSITIVE_WORDS)
        fear_hits = len(vocab & _FEAR_WORDS)
        anger_hits = len(vocab & _ANGER_WORDS)

        polarity_total = neg_hits + pos_hits
        polarity = (pos_hits - neg_hits) / polarity_total if polarity_total else 0.0

        if polarity > 0.3:
            label = "high positive" if polarity > 0.6 else "positive"
        elif polarity < -0.3:
            label = "high negative" if polarity < -0.6 else "negative"
        else:
            label = "neutral"

        # Dominant emotion: fear beats anger on strict majority, then the
        # winning polarity side; strength scales with hit density (capped).
        denom = max(len(vocab), 1)
        if fear_hits > anger_hits:
            emotion, strength = "fear", min(fear_hits / denom * 5, 1.0)
        elif anger_hits > 0:
            emotion, strength = "anger", min(anger_hits / denom * 5, 1.0)
        elif pos_hits > neg_hits:
            emotion, strength = "joy", min(pos_hits / denom * 5, 1.0)
        elif neg_hits > 0:
            emotion, strength = "sadness", min(neg_hits / denom * 5, 1.0)
        else:
            emotion, strength = "neutral", 0.0

        return SentimentResult(label, round(polarity, 3), emotion, round(strength, 3), "lexicon")

    def analyze(self, text: str) -> SentimentResult:
        """Analyze *text*: transformer pipelines first, lexicon as fallback."""
        self._load_models()
        snippet = text[:512]  # respect transformer input limits

        if self._sentiment_pipe and self._emotion_pipe:
            try:
                sent_pred = self._sentiment_pipe(snippet)[0]
                emo_pred = self._emotion_pipe(snippet)[0]

                raw_label = sent_pred["label"].lower()
                raw_score = sent_pred["score"]
                if "positive" in raw_label:
                    label = "high positive" if raw_score > 0.85 else "positive"
                    signed = raw_score
                elif "negative" in raw_label:
                    label = "high negative" if raw_score > 0.85 else "negative"
                    signed = -raw_score
                else:
                    label = "neutral"
                    signed = 0.0

                return SentimentResult(
                    label,
                    round(signed, 3),
                    emo_pred["label"].lower(),
                    round(emo_pred["score"], 3),
                    "transformer",
                )
            except Exception as e:
                logger.warning("Transformer inference error: %s — falling back to lexicon", e)

        return self._lexicon_analyze(text)
pytest.ini ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ [pytest]
2
+ asyncio_mode = auto
3
+ testpaths = tests
4
+ python_files = test_*.py
5
+ python_classes = Test*
6
+ python_functions = test_*
requirements.txt ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ── Core Framework ────────────────────────────────────────────────────────────
2
+ fastapi==0.115.6
3
+ uvicorn[standard]==0.32.1
4
+ python-multipart==0.0.17 # File upload support
5
+ pydantic==2.9.2
6
+ pydantic-settings==2.6.1
7
+
8
+ # ── NLP & ML ──────────────────────────────────────────────────────────────────
9
+ transformers==4.46.3
10
+ torch==2.5.1
11
+ sentence-transformers==3.3.1
12
+ scikit-learn==1.5.2
13
+ spacy==3.8.2
14
+ langdetect==1.0.9
15
+ nltk==3.9.1
16
+
17
+ # ── Input Modules ─────────────────────────────────────────────────────────────
18
+ pytesseract==0.3.13 # OCR
19
+ Pillow==11.0.0 # Image processing
20
+ openai-whisper==20240930 # ASR (Filipino speech)
21
+ beautifulsoup4==4.12.3 # URL scraping
22
+ requests==2.32.3
23
+ lxml==5.3.0
24
+
25
+ # ── Evidence Retrieval ────────────────────────────────────────────────────────
26
+ newsapi-python==0.2.7
27
+
28
+ # ── Database ──────────────────────────────────────────────────────────────────
29
+ sqlalchemy==2.0.36
30
+ asyncpg==0.30.0 # Async PostgreSQL driver
31
+ alembic==1.14.0
32
+
33
+ # ── Caching ───────────────────────────────────────────────────────────────────
34
+ redis==5.2.1
35
+ cachetools==5.5.0
36
+
37
+ # ── Utilities ─────────────────────────────────────────────────────────────────
38
+ python-dotenv==1.0.1
39
+ httpx==0.28.1 # Async HTTP client
40
+ aiofiles==24.1.0
41
+ tqdm==4.67.1
42
+ numpy==1.26.4
43
+
44
+ # ── Testing ───────────────────────────────────────────────────────────────────
45
+ pytest==8.3.4
46
+ pytest-asyncio==0.24.0
47
# httpx (already pinned above at ==0.28.1) also provides the FastAPI TestClient transport
scoring/__init__.py ADDED
File without changes
scoring/engine.py ADDED
@@ -0,0 +1,212 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ PhilVerify — Scoring Engine (Orchestrator)
3
+ Ties together all NLP modules, Layer 1, and Layer 2 into a final VerificationResponse.
4
+ Final Score = (ML Confidence × 0.40) + (Evidence Score × 0.60)
5
+ """
6
+ import asyncio
7
+ import json
8
+ import logging
9
+ import uuid
10
+ from datetime import datetime, timezone
11
+ from pathlib import Path
12
+
13
+ from config import get_settings
14
+ from api.schemas import (
15
+ VerificationResponse, Verdict, Language, DomainTier,
16
+ Layer1Result, Layer2Result, EntitiesResult, EvidenceSource, Stance,
17
+ )
18
+
19
+ logger = logging.getLogger(__name__)
20
+ settings = get_settings()
21
+
22
+ # ── Domain credibility lookup ─────────────────────────────────────────────────
23
+ _DOMAIN_DB_PATH = Path(__file__).parent.parent / "domain_credibility.json"
24
+ _DOMAIN_DB: dict = {}
25
+
26
def _load_domain_db() -> dict:
    """Return the domain-credibility table, loading it lazily on first use."""
    global _DOMAIN_DB
    if _DOMAIN_DB:
        return _DOMAIN_DB
    try:
        _DOMAIN_DB = json.loads(_DOMAIN_DB_PATH.read_text())
    except Exception as e:
        # A missing/corrupt DB degrades gracefully: callers see an empty table.
        logger.warning("Could not load domain_credibility.json: %s", e)
    return _DOMAIN_DB
34
+
35
def get_domain_tier(domain: str) -> DomainTier | None:
    """Look up the credibility tier for a news-source domain.

    Args:
        domain: Hostname (e.g. "www.rappler.com"); case-insensitive.

    Returns:
        The matching DomainTier, DomainTier.SUSPICIOUS for unknown domains,
        or None when ``domain`` is empty/falsy.
    """
    if not domain:
        return None
    db = _load_domain_db()
    # Fix: strip "www." only as a prefix — str.replace() would also mangle
    # domains that merely contain "www." mid-string.
    domain = domain.lower().removeprefix("www.")
    for tier_key, tier_data in db.items():
        if domain in tier_data.get("domains", []):
            # Tier keys are assumed to end in their numeric tier (e.g. "tier_1").
            return DomainTier(int(tier_key[-1]))
    return DomainTier.SUSPICIOUS  # Unknown domains default to Tier 3
44
+
45
+
46
def _map_verdict(final_score: float) -> Verdict:
    """Translate a 0–100 final score into the three-way verdict."""
    if final_score >= settings.credible_threshold:
        return Verdict.CREDIBLE
    if final_score >= settings.fake_threshold:
        return Verdict.UNVERIFIED
    return Verdict.LIKELY_FAKE
53
+
54
+
55
async def run_verification(
    text: str,
    input_type: str = "text",
    source_domain: str | None = None,
) -> VerificationResponse:
    """
    Full verification pipeline orchestrator.

    Runs preprocessing, language detection, NLP analysis, and the Layer 1 ML
    classifier synchronously, then Layer 2 evidence retrieval asynchronously:

        Final Score = (ML credibility × ml_weight) + (evidence score × evidence_weight)

    Args:
        text: Raw input text to verify.
        input_type: Origin of the text ("text", "url", "image", ...).
        source_domain: Optional source domain for credibility lookup.

    Returns:
        A fully populated VerificationResponse.
    """
    # ── Lazy imports so app starts without heavy deps ─────────────────────────
    from nlp.preprocessor import TextPreprocessor
    from nlp.language_detector import LanguageDetector
    from nlp.ner import EntityExtractor
    from nlp.sentiment import SentimentAnalyzer
    from nlp.clickbait import ClickbaitDetector
    from nlp.claim_extractor import ClaimExtractor
    from ml.tfidf_classifier import TFIDFClassifier
    from evidence.news_fetcher import fetch_evidence, compute_similarity

    # ── Step 1: Preprocess ────────────────────────────────────────────────────
    proc = TextPreprocessor().preprocess(text)

    # ── Step 2: Language detection ────────────────────────────────────────────
    lang_result = LanguageDetector().detect(text)
    try:
        language = Language(lang_result.language)
    except ValueError:
        # Detector labels outside the enum collapse to Taglish (was using the
        # private Language._value2member_map_ for the same check).
        language = Language.TAGLISH

    # ── Steps 3–6: NLP analysis (sequential; fast relative to Layer 2) ───────
    ner_result = EntityExtractor().extract(text)
    sentiment_result = SentimentAnalyzer().analyze(proc.cleaned)
    clickbait_result = ClickbaitDetector().detect(text)
    claim_result = ClaimExtractor().extract(proc.cleaned)

    # ── Step 7: Layer 1 — ML Classifier ──────────────────────────────────────
    classifier = TFIDFClassifier()
    # NOTE(review): train() runs per request — presumably a cached/no-op load
    # after first fit; confirm it is not refitting on every verification.
    classifier.train()
    l1 = classifier.predict(proc.cleaned)

    # Enrich triggered features with NLP signals.
    if clickbait_result.is_clickbait:
        l1.triggered_features.extend(clickbait_result.triggered_patterns[:3])
    if sentiment_result.sentiment == "high negative":
        l1.triggered_features.append("high emotional language")

    layer1 = Layer1Result(
        verdict=Verdict(l1.verdict),
        confidence=l1.confidence,
        triggered_features=l1.triggered_features,
    )

    # ── Step 8: Layer 2 — Evidence Retrieval ──────────────────────────────────
    evidence_score = 50.0  # Neutral default when API key absent
    evidence_sources: list[EvidenceSource] = []
    l2_verdict = Verdict.UNVERIFIED

    if settings.news_api_key:
        try:
            articles = await fetch_evidence(claim_result.claim, settings.news_api_key)
            for art in articles[:5]:
                article_text = f"{art.get('title', '')} {art.get('description', '')}"
                sim = compute_similarity(claim_result.claim, article_text)
                # Fix: guard a null "source" field once and reuse it — the
                # original re-fetched it unguarded for source_name below.
                source_info = art.get("source") or {}
                domain = source_info.get("name", "unknown").lower()
                tier = get_domain_tier(domain)

                # Simple stance heuristic — negative title keywords → Refutes
                title_lower = (art.get("title") or "").lower()
                stance = Stance.NOT_ENOUGH_INFO
                if any(w in title_lower for w in ["false", "fake", "hoax", "wrong", "debunked", "fact check"]):
                    stance = Stance.REFUTES
                elif sim > 0.6:
                    stance = Stance.SUPPORTS

                evidence_sources.append(EvidenceSource(
                    title=art.get("title", ""),
                    url=art.get("url", ""),
                    similarity=sim,
                    stance=stance,
                    domain_tier=tier or DomainTier.SUSPICIOUS,
                    published_at=art.get("publishedAt"),
                    source_name=source_info.get("name"),
                ))

            # Evidence score: average similarity × 100, penalized for refuting sources
            if evidence_sources:
                supporting = [s for s in evidence_sources if s.stance == Stance.SUPPORTS]
                refuting = [s for s in evidence_sources if s.stance == Stance.REFUTES]
                avg_sim = sum(s.similarity for s in evidence_sources) / len(evidence_sources)
                refute_penalty = len(refuting) * 15
                evidence_score = max(0.0, min(100.0, avg_sim * 100 - refute_penalty))

                if len(refuting) > len(supporting):
                    l2_verdict = Verdict.LIKELY_FAKE
                elif len(supporting) >= 2:
                    l2_verdict = Verdict.CREDIBLE
        except Exception as e:
            logger.warning("Evidence retrieval failed: %s — using neutral score", e)

    layer2 = Layer2Result(
        verdict=l2_verdict,
        evidence_score=round(evidence_score, 1),
        sources=evidence_sources,
        claim_used=claim_result.claim,
    )

    # ── Step 9: Final Score ───────────────────────────────────────────────────
    # ML confidence is 0-100 where high = more credible for the predicted class.
    # Adjust: if ML says Fake, its confidence works against credibility.
    ml_credibility = l1.confidence if l1.verdict == "Credible" else (100 - l1.confidence)
    final_score = round(
        (ml_credibility * settings.ml_weight) + (evidence_score * settings.evidence_weight),
        1,
    )
    verdict = _map_verdict(final_score)

    # ── Step 10: Assemble response ────────────────────────────────────────────
    result = VerificationResponse(
        verdict=verdict,
        # Fix: evidence_score / 100 * 100 was a no-op; both operands are 0-100.
        confidence=round(max(l1.confidence, evidence_score), 1),
        final_score=final_score,
        layer1=layer1,
        layer2=layer2,
        entities=EntitiesResult(
            persons=ner_result.persons,
            organizations=ner_result.organizations,
            locations=ner_result.locations,
            dates=ner_result.dates,
        ),
        sentiment=sentiment_result.sentiment,
        emotion=sentiment_result.emotion,
        language=language,
        domain_credibility=get_domain_tier(source_domain) if source_domain else None,
        input_type=input_type,
    )

    # ── Record to history (best effort — never fail the verification) ─────────
    try:
        from api.routes.history import record_verification
        record_verification({
            "id": str(uuid.uuid4()),
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "input_type": input_type,
            "text_preview": text[:120],
            "verdict": verdict.value,
            "confidence": result.confidence,
            "final_score": final_score,
            "entities": ner_result.to_dict(),
            "claim_used": claim_result.claim,
        })
    except Exception as e:
        logger.warning("Failed to record history: %s", e)

    return result
tests/__init__.py ADDED
File without changes
tests/test_philverify.py ADDED
@@ -0,0 +1,181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ PhilVerify — Unit Tests
3
+ Covers: text preprocessor, language detector, clickbait detector, and scoring engine.
4
+ Run: pytest tests/ -v
5
+ """
6
+ import sys
7
+ from pathlib import Path
8
+
9
+ # Ensure project root is on PYTHONPATH
10
+ sys.path.insert(0, str(Path(__file__).parent.parent))
11
+
12
+ import pytest
13
+
14
+
15
+ # ── TextPreprocessor ──────────────────────────────────────────────────────────
16
+
17
class TestTextPreprocessor:
    """Unit tests for the TextPreprocessor cleaning pipeline."""

    def setup_method(self):
        from nlp.preprocessor import TextPreprocessor
        self.preprocessor = TextPreprocessor()

    def test_lowercases_text(self):
        assert self.preprocessor.clean("HELLO WORLD") == "hello world"

    def test_strips_urls(self):
        cleaned = self.preprocessor.clean("Check this out https://rappler.com/news/article123")
        for fragment in ("https://", "rappler.com"):
            assert fragment not in cleaned

    def test_strips_html_tags(self):
        cleaned = self.preprocessor.clean("<p>Hello <b>World</b></p>")
        assert "<" not in cleaned
        assert ">" not in cleaned

    def test_strips_mentions(self):
        cleaned = self.preprocessor.clean("Great post @PresidentPH and @DOH_Philippines!")
        assert "@" not in cleaned

    def test_removes_stopwords(self):
        kept = self.preprocessor.remove_stopwords(["ang", "fake", "news", "sa", "pilipinas"])
        assert "ang" not in kept
        assert "fake" in kept

    def test_normalizes_repeated_chars(self):
        normalized = self.preprocessor.normalize("graaabe ang gaaalit ko")
        assert "graaabe" not in normalized

    def test_full_pipeline_returns_result(self):
        from nlp.preprocessor import PreprocessResult
        out = self.preprocessor.preprocess("GRABE! Namatay daw ang tatlong tao sa bagong sakit na kumakalat!")
        assert isinstance(out, PreprocessResult)
        assert out.char_count > 0
        assert len(out.tokens) > 0
55
+
56
+ # ── LanguageDetector ──────────────────────────────────────────────────────────
57
+
58
class TestLanguageDetector:
    """Unit tests for the LanguageDetector (Tagalog / English / Taglish)."""

    def setup_method(self):
        from nlp.language_detector import LanguageDetector
        self.detector = LanguageDetector()

    def test_detects_tagalog(self):
        out = self.detector.detect(
            "Ang mga mamamayan ay nag-aalala sa bagong batas na isinusulong ng pangulo."
        )
        assert out.language in ("Tagalog", "Taglish")

    def test_detects_english(self):
        out = self.detector.detect(
            "The Supreme Court ruled in favor of the petition filed by the opposition."
        )
        assert out.language in ("English", "Taglish")

    def test_detects_taglish(self):
        out = self.detector.detect(
            "Grabe ang news ngayon! The president announced na libre ang lahat!"
        )
        # Mixed-language input may legitimately land on any of the three labels.
        assert out.language in ("Tagalog", "English", "Taglish")

    def test_unknown_for_empty(self):
        assert self.detector.detect("").language == "Unknown"

    def test_confidence_between_0_and_1(self):
        out = self.detector.detect("Ang balita ay napakalaki!")
        assert 0.0 <= out.confidence <= 1.0
89
+
90
+
91
+ # ── ClickbaitDetector ─────────────────────────────────────────────────────────
92
+
93
class TestClickbaitDetector:
    """Unit tests for clickbait scoring on English and Tagalog headlines."""

    def setup_method(self):
        from nlp.clickbait import ClickbaitDetector
        self.detector = ClickbaitDetector()

    def test_detects_clickbait_all_caps(self):
        out = self.detector.detect("SHOCKING NEWS: GOVERNMENT CAUGHT LYING TO EVERYONE!")
        assert out.is_clickbait is True
        assert out.score > 0.3

    def test_detects_clickbait_tagalog(self):
        out = self.detector.detect("GRABE!! Natuklasan na ang katotohanan ng bigas scandal!!!")
        assert out.score > 0.3

    def test_clean_headline_not_clickbait(self):
        out = self.detector.detect(
            "DOH reports 500 new cases as vaccination drive continues in Metro Manila"
        )
        assert out.is_clickbait is False

    def test_score_between_0_and_1(self):
        assert 0.0 <= self.detector.detect("Breaking news today").score <= 1.0
116
+
117
+
118
+ # ── TF-IDF Classifier ─────────────────────────────────────────────────────────
119
+
120
class TestTFIDFClassifier:
    """Unit tests for the Layer 1 TF-IDF classifier."""

    def setup_method(self):
        from ml.tfidf_classifier import TFIDFClassifier
        self.clf = TFIDFClassifier()
        self.clf.train()

    def test_predict_returns_valid_verdict(self):
        pred = self.clf.predict("DOH reports 500 new COVID cases today in Metro Manila")
        assert pred.verdict in ("Credible", "Unverified", "Fake")

    def test_confidence_in_valid_range(self):
        pred = self.clf.predict("SHOCKING: Government hid the truth about vaccines!")
        assert 0.0 <= pred.confidence <= 100.0

    def test_triggered_features_are_strings(self):
        pred = self.clf.predict("GRABE! Namatay daw ang tatlong tao sa bagong sakit!")
        assert all(isinstance(feature, str) for feature in pred.triggered_features)

    def test_seed_fake_news_detected(self):
        pred = self.clf.predict("CONFIRMED: Philippines to become 51st state of USA in 2026!")
        # An obviously fabricated claim must never come back Credible.
        assert pred.verdict in ("Unverified", "Fake", "Likely Fake")
142
+
143
+
144
+ # ── Scoring Engine (lightweight integration) ──────────────────────────────────
145
+
146
class TestScoringEngine:
    """Integration test — no API keys needed, evidence score defaults to 50."""

    @pytest.mark.asyncio
    async def test_verify_text_returns_response(self):
        from scoring.engine import run_verification
        from api.schemas import VerificationResponse

        response = await run_verification(
            "GRABE! Nakita ko raw namatay ang tatlong tao sa bagong sakit na kumakalat sa Pilipinas!",
            input_type="text",
        )
        assert isinstance(response, VerificationResponse)
        assert response.verdict is not None
        assert 0.0 <= response.final_score <= 100.0

    @pytest.mark.asyncio
    async def test_verify_credible_text(self):
        from scoring.engine import run_verification

        response = await run_verification(
            "DOH reports 500 new COVID-19 cases as vaccination drive continues in Metro Manila",
            input_type="text",
        )
        assert response.final_score is not None
        assert response.language is not None

    @pytest.mark.asyncio
    async def test_entities_extracted(self):
        from scoring.engine import run_verification

        response = await run_verification(
            "President Marcos announced new policies in Manila regarding the AFP and PNP.",
            input_type="text",
        )
        assert response.entities is not None