Ryan Christian D. Deniega
feat: extension button placement, text extraction, OCR display + ML improvements
c78c2c1
"""
PhilVerify — Verify Routes
POST /verify/text | /verify/url | /verify/image | /verify/video
All routes funnel through run_verification() in the scoring engine.
"""
import time
import logging
from fastapi import APIRouter, HTTPException, UploadFile, File, status
from fastapi.responses import JSONResponse
from api.schemas import (
TextVerifyRequest,
URLVerifyRequest,
VerificationResponse,
ErrorResponse,
)
from scoring.engine import run_verification
from inputs.url_scraper import scrape_url
from inputs.ocr import extract_text_from_image
from inputs.asr import transcribe_and_ocr_video
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/verify", tags=["Verification"])
# ── OG meta fallback for bot-protected / social URLs ──────────────────────────
async def _fetch_og_text(url: str) -> str:
    """
    Fetch the OG/meta title + description of a page using a plain HTTP GET.

    Used as a last-resort fallback when the full scraper returns no content
    (e.g. Facebook share links, photo URLs that block the scraper).

    Args:
        url: Address of the page to fetch.

    Returns:
        The title and description (whichever are present) joined by a single
        space, or "" when the server answers with a status >= 400, nothing is
        found, or any exception occurs along the way.
    """
    try:
        # The imports sit inside the try block, so an ImportError is handled
        # by the except clause below like any other failure.
        import re

        import httpx
        from bs4 import BeautifulSoup

        headers = {
            "User-Agent": (
                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/122.0.0.0 Safari/537.36"
            ),
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.5",
        }
        async with httpx.AsyncClient(timeout=12, follow_redirects=True) as client:
            resp = await client.get(url, headers=headers)
        if resp.status_code >= 400:
            return ""

        html = resp.text
        # Only the <head> is parsed. HTML tag names are case-insensitive, so
        # the closing tag is matched regardless of case ("</HEAD>", "</head >").
        head_close = re.search(r"</head\s*>", html, re.IGNORECASE)
        # Without a closing tag, fall back to the first 8000 characters.
        head_html = html[:head_close.end()] if head_close else html[:8000]
        soup = BeautifulSoup(head_html, "lxml")

        def _meta(prop=None, name=None):
            # Look up <meta property=...> when prop is given, else <meta name=...>.
            if prop:
                el = soup.find("meta", property=prop)
            else:
                el = soup.find("meta", attrs={"name": name})
            if el is None:
                return ""
            return (el.get("content") or "").strip()

        # Preference order: Open Graph, then Twitter card, then the plain tag.
        title = (
            _meta(prop="og:title")
            or _meta(name="twitter:title")
            or (soup.title.get_text(strip=True) if soup.title else "")
        )
        description = (
            _meta(prop="og:description")
            or _meta(name="twitter:description")
            or _meta(name="description")
        )
        return " ".join(part for part in (title, description) if part)
    except Exception as exc:
        # Best-effort fallback: log the failure and report "no text".
        logger.warning("OG meta fallback failed for %s: %s", url, exc)
        return ""
# ── Text ──────────────────────────────────────────────────────────────────────
@router.post(
    "/text",
    response_model=VerificationResponse,
    summary="Verify raw text",
    description="Accepts plain text (Tagalog, English, or Taglish) and runs the full verification pipeline.",
)
async def verify_text(body: TextVerifyRequest) -> VerificationResponse:
    """
    Run the verification pipeline on raw text, optionally OCR-ing an image.

    Args:
        body: Request payload. ``body.text`` is verified; when
            ``body.image_url`` is set, that image is downloaded and OCR'd.

    Returns:
        The result of ``run_verification`` with ``processing_time_ms`` and
        ``extracted_text`` filled in, plus ``ocr_text`` when the image
        yielded text.

    Raises:
        HTTPException: Re-raised unchanged when raised by the pipeline; any
            other failure is reported as a 500. Failures while fetching or
            OCR-ing the image are logged and never fail the request.
    """
    start = time.perf_counter()
    logger.info("verify/text called | chars=%d | has_image=%s", len(body.text), bool(body.image_url))
    try:
        result = await run_verification(body.text, input_type="text")
        # NOTE(review): the timer stops before the optional OCR step below, so
        # OCR latency is not included in processing_time_ms — confirm intended.
        result.processing_time_ms = round((time.perf_counter() - start) * 1000, 1)
        result.extracted_text = body.text
        # If an image URL was provided, fetch it and run OCR — store result separately
        if body.image_url:
            try:
                import httpx

                # NOTE(review): this fetches a caller-supplied URL from the
                # server side (SSRF exposure) — consider restricting the
                # hosts/schemes that may be requested.
                async with httpx.AsyncClient(timeout=10) as client:
                    # str() mirrors verify_url: the request works whether the
                    # schema yields a plain string or a URL object.
                    img_resp = await client.get(str(body.image_url))
                if img_resp.status_code == 200:
                    ocr = await extract_text_from_image(img_resp.content)
                    if ocr:
                        result.ocr_text = ocr.strip()
                        logger.info("OCR from image_url: %d chars", len(result.ocr_text))
            except Exception as ocr_exc:
                # OCR is a best-effort extra; the text verdict is still returned.
                logger.warning("OCR for image_url failed (non-fatal): %s", ocr_exc)
        return result
    except HTTPException:
        # Keep deliberate HTTP errors intact, as the other verify routes do.
        raise
    except Exception as exc:
        logger.exception("verify/text error: %s", exc)
        raise HTTPException(status_code=500, detail=f"Verification failed: {exc}") from exc
# ── URL ───────────────────────────────────────────────────────────────────────
@router.post(
    "/url",
    response_model=VerificationResponse,
    summary="Verify a URL",
    description="Scrapes the article text from the given URL, then runs the full verification pipeline.",
)
async def verify_url(body: URLVerifyRequest) -> VerificationResponse:
    """
    Scrape the page behind ``body.url`` and verify the text found there.

    When the scraper yields nothing, or fewer than 20 characters after
    stripping, the page's OG/meta title and description are tried instead.
    If that is also too short, the request is rejected with a 422.

    Args:
        body: Request payload carrying the URL to check.

    Returns:
        The result of ``run_verification`` with ``processing_time_ms`` and
        ``extracted_text`` filled in.

    Raises:
        HTTPException: 422 when no meaningful text can be extracted or a
            ``ValueError`` is raised along the way; 500 for any other failure.
    """
    started_at = time.perf_counter()
    target = str(body.url)
    logger.info("verify/url called | url=%s", target)

    def _too_thin(candidate) -> bool:
        # Empty/None, or under 20 characters once surrounding whitespace is removed.
        return not candidate or len(candidate.strip()) < 20

    try:
        page_text, domain = await scrape_url(target)
        if _too_thin(page_text):
            logger.info("scrape_url returned no content for %s β€” trying OG meta fallback", target)
            page_text = await _fetch_og_text(target)
            if _too_thin(page_text):
                raise HTTPException(
                    status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
                    detail="Could not extract meaningful text from the URL. The page may be paywalled, private, or bot-protected. Try copying the post text and using the Text tab instead.",
                )

        outcome = await run_verification(page_text, input_type="url", source_domain=domain)
        elapsed_ms = round((time.perf_counter() - started_at) * 1000, 1)
        outcome.processing_time_ms = elapsed_ms
        outcome.extracted_text = page_text.strip()
        return outcome
    except HTTPException:
        # Already a well-formed HTTP error; pass it through untouched.
        raise
    except ValueError as err:
        # Expected user-facing errors (e.g. robots.txt block, bad URL)
        logger.warning("verify/url rejected: %s", err)
        raise HTTPException(status_code=422, detail=str(err)) from err
    except Exception as err:
        logger.exception("verify/url error: %s", err)
        raise HTTPException(status_code=500, detail=f"URL verification failed: {err}") from err
# ── Image ─────────────────────────────────────────────────────────────────────
@router.post(
    "/image",
    response_model=VerificationResponse,
    summary="Verify an image (OCR)",
    description="Accepts an uploaded image file. Runs Tesseract OCR to extract text, then verifies.",
)
async def verify_image(file: UploadFile = File(...)) -> VerificationResponse:
    """
    OCR an uploaded image and run the verification pipeline on its text.

    Args:
        file: Uploaded image; its content type must be one of jpeg, png,
            webp, gif or bmp.

    Returns:
        The result of ``run_verification`` with ``processing_time_ms`` and
        ``extracted_text`` filled in.

    Raises:
        HTTPException: 415 for an unsupported content type; 422 when the
            upload is empty or OCR yields fewer than 10 characters after
            stripping; 500 for any other failure.
    """
    start = time.perf_counter()
    logger.info("verify/image called | filename=%s | size=%s", file.filename, file.size)
    allowed_types = {"image/jpeg", "image/png", "image/webp", "image/gif", "image/bmp"}
    if file.content_type not in allowed_types:
        raise HTTPException(
            status_code=status.HTTP_415_UNSUPPORTED_MEDIA_TYPE,
            detail=f"Unsupported image type: {file.content_type}. Accepted: jpeg, png, webp, gif, bmp",
        )
    try:
        image_bytes = await file.read()
        # A zero-byte upload is a client error; reject it here instead of
        # handing it to OCR, where a failure would surface as a 500.
        if not image_bytes:
            raise HTTPException(
                status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
                detail="Uploaded image file is empty.",
            )
        text = await extract_text_from_image(image_bytes)
        if not text or len(text.strip()) < 10:
            raise HTTPException(
                status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
                detail="No readable text found in the image.",
            )
        result = await run_verification(text, input_type="image")
        result.processing_time_ms = round((time.perf_counter() - start) * 1000, 1)
        result.extracted_text = text.strip()
        return result
    except HTTPException:
        # Deliberate HTTP errors raised above must not be rewrapped as 500s.
        raise
    except Exception as exc:
        logger.exception("verify/image error: %s", exc)
        raise HTTPException(status_code=500, detail=f"Image verification failed: {exc}") from exc
# ── Video ─────────────────────────────────────────────────────────────────────
@router.post(
    "/video",
    response_model=VerificationResponse,
    summary="Verify a video/audio (Whisper ASR + Frame OCR)",
    description="Accepts a video or audio file. Runs Whisper ASR and frame OCR in parallel — handles speech-only, on-screen text only, or both.",
)
async def verify_video(file: UploadFile = File(...)) -> VerificationResponse:
    """
    Extract text from an uploaded video/audio file and verify it.

    The upload is read fully into memory and passed to
    ``transcribe_and_ocr_video`` together with its filename ("upload" when
    the client sent none).

    Args:
        file: Uploaded media; its content type must be one of the video/audio
            types listed in ``allowed_types`` below.

    Returns:
        The result of ``run_verification`` with ``processing_time_ms`` and
        ``extracted_text`` filled in.

    Raises:
        HTTPException: 415 for an unsupported content type; 422 when the
            upload is empty or fewer than 10 characters of text (after
            stripping) could be extracted; 500 for any other failure.
    """
    start = time.perf_counter()
    logger.info("verify/video called | filename=%s", file.filename)
    allowed_types = {
        "video/mp4", "video/webm", "video/quicktime",
        "audio/mpeg", "audio/wav", "audio/ogg", "audio/mp4",
    }
    if file.content_type not in allowed_types:
        raise HTTPException(
            status_code=status.HTTP_415_UNSUPPORTED_MEDIA_TYPE,
            detail=f"Unsupported media type: {file.content_type}",
        )
    try:
        media_bytes = await file.read()
        # A zero-byte upload is a client error; reject it here instead of
        # starting transcription, where a failure would surface as a 500.
        if not media_bytes:
            raise HTTPException(
                status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
                detail="Uploaded media file is empty.",
            )
        text = await transcribe_and_ocr_video(media_bytes, filename=file.filename or "upload")
        if not text or len(text.strip()) < 10:
            raise HTTPException(
                status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
                detail="Could not extract any usable text from the media file. "
                       "Ensure the video has audible speech or visible on-screen text.",
            )
        result = await run_verification(text, input_type="video")
        result.processing_time_ms = round((time.perf_counter() - start) * 1000, 1)
        result.extracted_text = text.strip()
        return result
    except HTTPException:
        # Deliberate HTTP errors raised above must not be rewrapped as 500s.
        raise
    except Exception as exc:
        logger.exception("verify/video error: %s", exc)
        raise HTTPException(status_code=500, detail=f"Video verification failed: {exc}") from exc