"""
PhilVerify — Verify Routes
POST /verify/text | /verify/url | /verify/image | /verify/video
All routes funnel through run_verification() in the scoring engine.
"""
import time
import logging
from fastapi import APIRouter, HTTPException, UploadFile, File, status
from fastapi.responses import JSONResponse
from api.schemas import (
TextVerifyRequest,
URLVerifyRequest,
VerificationResponse,
ErrorResponse,
)
from scoring.engine import run_verification
from inputs.url_scraper import scrape_url
from inputs.ocr import extract_text_from_image
from inputs.asr import transcribe_and_ocr_video
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/verify", tags=["Verification"])
# ── OG meta fallback for bot-protected / social URLs ──────────────────────────
async def _fetch_og_text(url: str) -> str:
    """
    Best-effort lookup of a page's title/description metadata.

    Issues a single browser-like HTTP GET (redirects followed, 12 s timeout)
    and parses only the ``<head>`` portion of the response. It serves as the
    fallback for pages where the full scraper comes back empty.

    Title preference:       og:title -> twitter:title -> <title>
    Description preference: og:description -> twitter:description -> description

    Returns the non-empty pieces joined by a single space. Returns "" when the
    response status is >= 400 or when anything at all goes wrong (the error is
    logged as a warning, never raised).
    """
    try:
        # Imported lazily so a missing optional dependency is caught by the
        # handler below and simply disables the fallback.
        import httpx
        from bs4 import BeautifulSoup

        user_agent = (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/122.0.0.0 Safari/537.36"
        )
        request_headers = {
            "User-Agent": user_agent,
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.5",
        }

        async with httpx.AsyncClient(timeout=12, follow_redirects=True) as client:
            response = await client.get(url, headers=request_headers)

        if response.status_code >= 400:
            return ""

        # Only the document head is parsed; when no closing tag is found,
        # fall back to the first 8000 characters of the body.
        markup = response.text
        closing_head = markup.find("</head>")
        if closing_head == -1:
            head_markup = markup[:8000]
        else:
            head_markup = markup[:closing_head + 7]  # 7 == len("</head>")
        document = BeautifulSoup(head_markup, "lxml")

        def _meta_content(prop=None, name=None):
            # Look up a <meta> tag by `property` when given, otherwise by `name`.
            if prop:
                tag = document.find("meta", property=prop)
            else:
                tag = document.find("meta", attrs={"name": name})
            if not tag:
                return ""
            return (tag.get("content") or "").strip()

        title = _meta_content(prop="og:title") or _meta_content(name="twitter:title")
        if not title and document.title:
            title = document.title.get_text(strip=True)

        description = (
            _meta_content(prop="og:description")
            or _meta_content(name="twitter:description")
            or _meta_content(name="description")
        )

        return " ".join(filter(None, (title, description)))
    except Exception as exc:
        logger.warning("OG meta fallback failed for %s: %s", url, exc)
        return ""
# ── Text ──────────────────────────────────────────────────────────────────────
@router.post(
    "/text",
    response_model=VerificationResponse,
    summary="Verify raw text",
    description="Accepts plain text (Tagalog, English, or Taglish) and runs the full verification pipeline.",
)
async def verify_text(body: TextVerifyRequest) -> VerificationResponse:
    """
    Verify a block of raw text, optionally OCR-ing an accompanying image.

    The text is passed to run_verification() as-is. When ``body.image_url`` is
    set, the image is downloaded and OCR'd on a best-effort basis; the OCR
    output is stored in ``result.ocr_text`` and is NOT fed into the
    verification itself. Any failure in that step is logged and ignored.

    Note: ``processing_time_ms`` is recorded before the optional OCR step, so
    it covers the verification pipeline only.

    Raises:
        HTTPException: 500 on any unexpected failure; an HTTPException raised
            by the pipeline is propagated unchanged.
    """
    start = time.perf_counter()
    logger.info("verify/text called | chars=%d | has_image=%s", len(body.text), bool(body.image_url))
    try:
        result = await run_verification(body.text, input_type="text")
        result.processing_time_ms = round((time.perf_counter() - start) * 1000, 1)
        result.extracted_text = body.text

        # If an image URL was provided, fetch it and run OCR; the OCR text is
        # stored separately from the verified text.
        if body.image_url:
            try:
                import httpx

                async with httpx.AsyncClient(timeout=10) as client:
                    # Coerce to str (as verify_url does): a non-str URL object
                    # would be rejected by httpx and the error swallowed below,
                    # silently disabling OCR.
                    img_resp = await client.get(str(body.image_url))
                if img_resp.status_code == 200:
                    ocr = await extract_text_from_image(img_resp.content)
                    if ocr:
                        result.ocr_text = ocr.strip()
                        logger.info("OCR from image_url: %d chars", len(result.ocr_text))
            except Exception as ocr_exc:
                # Deliberately non-fatal: the text verification already succeeded.
                logger.warning("OCR for image_url failed (non-fatal): %s", ocr_exc)
        return result
    except HTTPException:
        # Keep status codes chosen further down the stack, matching the
        # url/image/video routes, instead of rewrapping them as a 500.
        raise
    except Exception as exc:
        logger.exception("verify/text error: %s", exc)
        raise HTTPException(status_code=500, detail=f"Verification failed: {exc}") from exc
# ── URL ───────────────────────────────────────────────────────────────────────
@router.post(
    "/url",
    response_model=VerificationResponse,
    summary="Verify a URL",
    description="Scrapes the article text from the given URL, then runs the full verification pipeline.",
)
async def verify_url(body: URLVerifyRequest) -> VerificationResponse:
    """
    Scrape a URL and verify whatever text can be extracted from it.

    The full scraper is tried first. If it yields fewer than 20 meaningful
    characters, the OG/meta fallback is tried; if that is also too short, the
    request is rejected with a 422.

    Raises:
        HTTPException: 422 when no usable text is found or the scraper raises
            ValueError; 500 on any other failure.
    """
    started_at = time.perf_counter()
    target = str(body.url)
    logger.info("verify/url called | url=%s", target)

    def _too_short(candidate) -> bool:
        # Empty/None, or fewer than 20 characters once surrounding whitespace is removed.
        return not candidate or len(candidate.strip()) < 20

    try:
        content, domain = await scrape_url(target)

        if _too_short(content):
            logger.info("scrape_url returned no content for %s β trying OG meta fallback", target)
            content = await _fetch_og_text(target)

        if _too_short(content):
            raise HTTPException(
                status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
                detail="Could not extract meaningful text from the URL. The page may be paywalled, private, or bot-protected. Try copying the post text and using the Text tab instead.",
            )

        result = await run_verification(content, input_type="url", source_domain=domain)
        elapsed_ms = (time.perf_counter() - started_at) * 1000
        result.processing_time_ms = round(elapsed_ms, 1)
        result.extracted_text = content.strip()
        return result
    except HTTPException:
        # Already carries the intended status code; pass through untouched.
        raise
    except ValueError as exc:
        # Expected user-facing errors (e.g. robots.txt block, bad URL)
        logger.warning("verify/url rejected: %s", exc)
        raise HTTPException(status_code=422, detail=str(exc)) from exc
    except Exception as exc:
        logger.exception("verify/url error: %s", exc)
        raise HTTPException(status_code=500, detail=f"URL verification failed: {exc}") from exc
# ── Image ─────────────────────────────────────────────────────────────────────
@router.post(
    "/image",
    response_model=VerificationResponse,
    summary="Verify an image (OCR)",
    description="Accepts an uploaded image file. Runs Tesseract OCR to extract text, then verifies.",
)
async def verify_image(file: UploadFile = File(...)) -> VerificationResponse:
    """
    OCR an uploaded image and verify the text found in it.

    Raises:
        HTTPException: 415 when the upload's content type is not an accepted
            image type; 422 when OCR yields fewer than 10 meaningful
            characters; 500 on any other failure.
    """
    started_at = time.perf_counter()
    logger.info("verify/image called | filename=%s | size=%s", file.filename, file.size)

    accepted_mime_types = {"image/jpeg", "image/png", "image/webp", "image/gif", "image/bmp"}
    if file.content_type not in accepted_mime_types:
        raise HTTPException(
            status_code=status.HTTP_415_UNSUPPORTED_MEDIA_TYPE,
            detail=f"Unsupported image type: {file.content_type}. Accepted: jpeg, png, webp, gif, bmp",
        )

    try:
        payload = await file.read()
        ocr_text = await extract_text_from_image(payload)

        # Treat a missing result the same as an empty one for the length check.
        cleaned = ocr_text.strip() if ocr_text else ""
        if len(cleaned) < 10:
            raise HTTPException(
                status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
                detail="No readable text found in the image.",
            )

        # The pipeline receives the raw OCR output; only the stored copy is stripped.
        result = await run_verification(ocr_text, input_type="image")
        elapsed_ms = (time.perf_counter() - started_at) * 1000
        result.processing_time_ms = round(elapsed_ms, 1)
        result.extracted_text = cleaned
        return result
    except HTTPException:
        # Already carries the intended status code; pass through untouched.
        raise
    except Exception as exc:
        logger.exception("verify/image error: %s", exc)
        raise HTTPException(status_code=500, detail=f"Image verification failed: {exc}") from exc
# ── Video ─────────────────────────────────────────────────────────────────────
@router.post(
    "/video",
    response_model=VerificationResponse,
    summary="Verify a video/audio (Whisper ASR + Frame OCR)",
    description="Accepts a video or audio file. Runs Whisper ASR and frame OCR in parallel β handles speech-only, on-screen text only, or both.",
)
async def verify_video(file: UploadFile = File(...)) -> VerificationResponse:
    """
    Extract text from an uploaded video/audio file and verify it.

    The upload is handed to transcribe_and_ocr_video(); its filename falls
    back to "upload" when the client did not supply one.

    Raises:
        HTTPException: 415 when the upload's content type is not an accepted
            media type; 422 when fewer than 10 meaningful characters are
            extracted; 500 on any other failure.
    """
    started_at = time.perf_counter()
    logger.info("verify/video called | filename=%s", file.filename)

    accepted_mime_types = {
        "video/mp4", "video/webm", "video/quicktime",
        "audio/mpeg", "audio/wav", "audio/ogg", "audio/mp4",
    }
    if file.content_type not in accepted_mime_types:
        raise HTTPException(
            status_code=status.HTTP_415_UNSUPPORTED_MEDIA_TYPE,
            detail=f"Unsupported media type: {file.content_type}",
        )

    try:
        payload = await file.read()
        transcript = await transcribe_and_ocr_video(payload, filename=file.filename or "upload")

        # Treat a missing result the same as an empty one for the length check.
        cleaned = transcript.strip() if transcript else ""
        if len(cleaned) < 10:
            raise HTTPException(
                status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
                detail="Could not extract any usable text from the media file. "
                       "Ensure the video has audible speech or visible on-screen text.",
            )

        # The pipeline receives the raw extracted text; only the stored copy is stripped.
        result = await run_verification(transcript, input_type="video")
        elapsed_ms = (time.perf_counter() - started_at) * 1000
        result.processing_time_ms = round(elapsed_ms, 1)
        result.extracted_text = cleaned
        return result
    except HTTPException:
        # Already carries the intended status code; pass through untouched.
        raise
    except Exception as exc:
        logger.exception("verify/video error: %s", exc)
        raise HTTPException(status_code=500, detail=f"Video verification failed: {exc}") from exc
|