Spaces:

SemiAutomat1c
/

philverify-api

Running

File size: 10,167 Bytes

"""
PhilVerify — Verify Routes
POST /verify/text | /verify/url | /verify/image | /verify/video
All routes funnel through run_verification() in the scoring engine.
"""
import time
import logging
from fastapi import APIRouter, HTTPException, UploadFile, File, status
from fastapi.responses import JSONResponse

from api.schemas import (
    TextVerifyRequest,
    URLVerifyRequest,
    VerificationResponse,
    ErrorResponse,
)
from scoring.engine import run_verification
from inputs.url_scraper import scrape_url
from inputs.ocr import extract_text_from_image
from inputs.asr import transcribe_and_ocr_video

logger = logging.getLogger(__name__)
router = APIRouter(prefix="/verify", tags=["Verification"])

# ── OG meta fallback for bot-protected / social URLs ──────────────────────────
async def _fetch_og_text(url: str) -> str:
    """
    Fetch Open Graph / meta title and description for a URL via a plain HTTP GET.

    Used as a last-resort fallback when the full scraper returns no content
    (e.g. Facebook share links, photo URLs that block the scraper).

    Args:
        url: Address of the page to fetch.

    Returns:
        The title and description joined by a single space (whichever of the
        two were found), or "" when the server answers with a status >= 400,
        when neither field is present, or when anything raises. This function
        never propagates an exception: failures are logged as warnings.
    """
    try:
        # Imported inside the try so that a missing optional dependency is
        # handled like any other failure of this best-effort fallback.
        import re

        import httpx
        from bs4 import BeautifulSoup

        headers = {
            "User-Agent": (
                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/122.0.0.0 Safari/537.36"
            ),
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.5",
        }
        async with httpx.AsyncClient(timeout=12, follow_redirects=True) as client:
            resp = await client.get(url, headers=headers)
        if resp.status_code >= 400:
            return ""

        html = resp.text
        # Only the <head> is parsed. HTML tag names are case-insensitive, so
        # match "</HEAD>" as well as "</head>"; if no closing tag is found,
        # fall back to the first 8000 characters of the document.
        head_close = re.search(r"</head\s*>", html, re.IGNORECASE)
        head_html = html[:head_close.end()] if head_close else html[:8000]
        soup = BeautifulSoup(head_html, "lxml")

        def _meta(prop=None, name=None) -> str:
            """Return the stripped content of a <meta> tag, or "" if absent."""
            if prop:
                tag = soup.find("meta", property=prop)
            else:
                tag = soup.find("meta", attrs={"name": name})
            if not tag:
                return ""
            return (tag.get("content") or "").strip()

        # Preference order: Open Graph, then Twitter card, then plain HTML.
        title = (
            _meta(prop="og:title")
            or _meta(name="twitter:title")
            or (soup.title.get_text(strip=True) if soup.title else "")
        )
        description = (
            _meta(prop="og:description")
            or _meta(name="twitter:description")
            or _meta(name="description")
        )
        return " ".join(part for part in (title, description) if part)
    except Exception as exc:
        logger.warning("OG meta fallback failed for %s: %s", url, exc)
        return ""


# ── Text ──────────────────────────────────────────────────────────────────────

@router.post(
    "/text",
    response_model=VerificationResponse,
    summary="Verify raw text",
    description="Accepts plain text (Tagalog, English, or Taglish) and runs the full verification pipeline.",
)
async def verify_text(body: TextVerifyRequest) -> VerificationResponse:
    """
    Verify raw text and, optionally, OCR an accompanying image.

    Args:
        body: Request payload; ``body.text`` is sent through
            ``run_verification`` and ``body.image_url`` (if set) is fetched
            and passed to ``extract_text_from_image``.

    Returns:
        The result of ``run_verification`` with ``extracted_text`` set to the
        submitted text, ``ocr_text`` set when the image yielded text, and
        ``processing_time_ms`` covering the whole request.

    Raises:
        HTTPException: 500 when verification itself fails. An HTTPException
            raised inside the pipeline is passed through unchanged. Image
            fetch/OCR failures are logged and never fail the request.
    """
    start = time.perf_counter()
    logger.info("verify/text called | chars=%d | has_image=%s", len(body.text), bool(body.image_url))
    try:
        result = await run_verification(body.text, input_type="text")
        result.extracted_text = body.text

        # If an image URL was provided, fetch it and run OCR — store result separately
        if body.image_url:
            try:
                import httpx
                async with httpx.AsyncClient(timeout=10) as client:
                    # str() keeps this working whether the schema declares
                    # image_url as a plain string or as a URL object.
                    img_resp = await client.get(str(body.image_url))
                if img_resp.status_code == 200:
                    ocr = await extract_text_from_image(img_resp.content)
                    if ocr:
                        result.ocr_text = ocr.strip()
                        logger.info("OCR from image_url: %d chars", len(result.ocr_text))
            except Exception as ocr_exc:
                logger.warning("OCR for image_url failed (non-fatal): %s", ocr_exc)

        # Stamp the duration last so it includes the optional image fetch and
        # OCR, matching the other routes which time the full request.
        result.processing_time_ms = round((time.perf_counter() - start) * 1000, 1)
        return result
    except HTTPException:
        # Keep the status code chosen by whoever raised it (same as the
        # url/image/video routes) instead of rewrapping it as a 500.
        raise
    except Exception as exc:
        logger.exception("verify/text error: %s", exc)
        raise HTTPException(status_code=500, detail=f"Verification failed: {exc}") from exc


# ── URL ───────────────────────────────────────────────────────────────────────

@router.post(
    "/url",
    response_model=VerificationResponse,
    summary="Verify a URL",
    description="Scrapes the article text from the given URL, then runs the full verification pipeline.",
)
async def verify_url(body: URLVerifyRequest) -> VerificationResponse:
    """
    Scrape a URL and verify the text that was extracted from it.

    The page is first handed to ``scrape_url``; if that yields fewer than 20
    non-whitespace-trimmed characters, ``_fetch_og_text`` is tried instead.

    Args:
        body: Request payload; ``body.url`` is the page to verify.

    Returns:
        The result of ``run_verification`` with ``processing_time_ms`` and
        ``extracted_text`` (the stripped text) filled in.

    Raises:
        HTTPException: 422 when no usable text could be extracted or when a
            ``ValueError`` is raised along the way; 500 for any other error.
    """
    started_at = time.perf_counter()
    target = str(body.url)
    logger.info("verify/url called | url=%s", target)

    def _lacks_content(candidate) -> bool:
        # Empty/None, or shorter than 20 characters once stripped.
        return not candidate or len(candidate.strip()) < 20

    try:
        text, domain = await scrape_url(target)
        if _lacks_content(text):
            logger.info("scrape_url returned no content for %s — trying OG meta fallback", target)
            text = await _fetch_og_text(target)
            if _lacks_content(text):
                raise HTTPException(
                    status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
                    detail=(
                        "Could not extract meaningful text from the URL. "
                        "The page may be paywalled, private, or bot-protected. "
                        "Try copying the post text and using the Text tab instead."
                    ),
                )

        result = await run_verification(text, input_type="url", source_domain=domain)
        elapsed_ms = (time.perf_counter() - started_at) * 1000
        result.processing_time_ms = round(elapsed_ms, 1)
        result.extracted_text = text.strip()
        return result
    except HTTPException:
        # Already carries the intended status code; let it through untouched.
        raise
    except ValueError as exc:
        # Expected user-facing errors (e.g. robots.txt block, bad URL)
        logger.warning("verify/url rejected: %s", exc)
        raise HTTPException(status_code=422, detail=str(exc)) from exc
    except Exception as exc:
        logger.exception("verify/url error: %s", exc)
        raise HTTPException(status_code=500, detail=f"URL verification failed: {exc}") from exc


# ── Image ─────────────────────────────────────────────────────────────────────

@router.post(
    "/image",
    response_model=VerificationResponse,
    summary="Verify an image (OCR)",
    description="Accepts an uploaded image file. Runs Tesseract OCR to extract text, then verifies.",
)
async def verify_image(file: UploadFile = File(...)) -> VerificationResponse:
    """
    Extract text from an uploaded image and verify it.

    Args:
        file: The uploaded image; its content type must be one of jpeg, png,
            webp, gif or bmp.

    Returns:
        The result of ``run_verification`` with ``processing_time_ms`` and
        ``extracted_text`` (the stripped OCR text) filled in.

    Raises:
        HTTPException: 415 for an unsupported content type, 422 when the
            extracted text is missing or shorter than 10 characters after
            stripping, 500 for any other error.
    """
    started_at = time.perf_counter()
    logger.info("verify/image called | filename=%s | size=%s", file.filename, file.size)

    accepted = {"image/jpeg", "image/png", "image/webp", "image/gif", "image/bmp"}
    if file.content_type not in accepted:
        raise HTTPException(
            status_code=status.HTTP_415_UNSUPPORTED_MEDIA_TYPE,
            detail=f"Unsupported image type: {file.content_type}. Accepted: jpeg, png, webp, gif, bmp",
        )

    try:
        payload = await file.read()
        text = await extract_text_from_image(payload)

        has_enough_text = bool(text) and len(text.strip()) >= 10
        if not has_enough_text:
            raise HTTPException(
                status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
                detail="No readable text found in the image.",
            )

        result = await run_verification(text, input_type="image")
        elapsed_ms = (time.perf_counter() - started_at) * 1000
        result.processing_time_ms = round(elapsed_ms, 1)
        result.extracted_text = text.strip()
        return result
    except HTTPException:
        # The 422 above (or one raised deeper down) keeps its own status.
        raise
    except Exception as exc:
        logger.exception("verify/image error: %s", exc)
        raise HTTPException(status_code=500, detail=f"Image verification failed: {exc}") from exc


# ── Video ─────────────────────────────────────────────────────────────────────

@router.post(
    "/video",
    response_model=VerificationResponse,
    summary="Verify a video/audio (Whisper ASR + Frame OCR)",
    description="Accepts a video or audio file. Runs Whisper ASR and frame OCR in parallel — handles speech-only, on-screen text only, or both.",
)
async def verify_video(file: UploadFile = File(...)) -> VerificationResponse:
    """
    Extract text from an uploaded video or audio file and verify it.

    Args:
        file: The uploaded media; its content type must be one of the
            accepted video/audio MIME types listed in the body.

    Returns:
        The result of ``run_verification`` with ``processing_time_ms`` and
        ``extracted_text`` (the stripped extracted text) filled in.

    Raises:
        HTTPException: 415 for an unsupported content type, 422 when the
            extracted text is missing or shorter than 10 characters after
            stripping, 500 for any other error.
    """
    started_at = time.perf_counter()
    logger.info("verify/video called | filename=%s", file.filename)

    accepted = {
        "video/mp4",
        "video/webm",
        "video/quicktime",
        "audio/mpeg",
        "audio/wav",
        "audio/ogg",
        "audio/mp4",
    }
    if file.content_type not in accepted:
        raise HTTPException(
            status_code=status.HTTP_415_UNSUPPORTED_MEDIA_TYPE,
            detail=f"Unsupported media type: {file.content_type}",
        )

    try:
        payload = await file.read()
        # Fall back to a placeholder name when the upload carries none.
        upload_name = file.filename or "upload"
        text = await transcribe_and_ocr_video(payload, filename=upload_name)

        has_enough_text = bool(text) and len(text.strip()) >= 10
        if not has_enough_text:
            raise HTTPException(
                status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
                detail="Could not extract any usable text from the media file. "
                       "Ensure the video has audible speech or visible on-screen text.",
            )

        result = await run_verification(text, input_type="video")
        elapsed_ms = (time.perf_counter() - started_at) * 1000
        result.processing_time_ms = round(elapsed_ms, 1)
        result.extracted_text = text.strip()
        return result
    except HTTPException:
        # The 422 above (or one raised deeper down) keeps its own status.
        raise
    except Exception as exc:
        logger.exception("verify/video error: %s", exc)
        raise HTTPException(status_code=500, detail=f"Video verification failed: {exc}") from exc