File size: 10,167 Bytes
6c9b8f1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7097cb7
6c9b8f1
 
 
 
7e55328
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6c9b8f1
 
 
 
 
 
 
 
 
 
 
c78c2c1
6c9b8f1
 
 
7e55328
c78c2c1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6c9b8f1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7e55328
 
 
6c9b8f1
 
 
7e55328
6c9b8f1
 
 
7e55328
6c9b8f1
 
 
b1c84b5
 
 
 
6c9b8f1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7e55328
6c9b8f1
 
 
 
 
 
 
 
 
 
 
 
 
7097cb7
 
6c9b8f1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7097cb7
6c9b8f1
 
 
7097cb7
 
6c9b8f1
 
 
7e55328
6c9b8f1
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
"""
PhilVerify — Verify Routes
POST /verify/text | /verify/url | /verify/image | /verify/video
All routes funnel through run_verification() in the scoring engine.
"""
import time
import logging
from fastapi import APIRouter, HTTPException, UploadFile, File, status
from fastapi.responses import JSONResponse

from api.schemas import (
    TextVerifyRequest,
    URLVerifyRequest,
    VerificationResponse,
    ErrorResponse,
)
from scoring.engine import run_verification
from inputs.url_scraper import scrape_url
from inputs.ocr import extract_text_from_image
from inputs.asr import transcribe_and_ocr_video

logger = logging.getLogger(__name__)
router = APIRouter(prefix="/verify", tags=["Verification"])

# ── OG meta fallback for bot-protected / social URLs ──────────────────────────
async def _fetch_og_text(url: str) -> str:
    """
    Best-effort extraction of a page's title and description metadata.

    Issues a single browser-like GET to ``url`` and parses only the document
    head (or the first 8000 characters when no closing head tag is found).
    The title is taken from og:title, then twitter:title, then <title>; the
    description from og:description, then twitter:description, then the
    plain "description" meta tag.

    Returns the non-empty pieces joined by a single space. Returns "" when
    the response status is >= 400 or when anything at all goes wrong (the
    failure is logged as a warning, never raised).
    """
    try:
        import httpx
        from bs4 import BeautifulSoup

        user_agent = (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/122.0.0.0 Safari/537.36"
        )
        request_headers = {
            "User-Agent": user_agent,
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.5",
        }
        async with httpx.AsyncClient(timeout=12, follow_redirects=True) as client:
            response = await client.get(url, headers=request_headers)
            if response.status_code >= 400:
                return ""

            # Keep everything up to and including the first "</head>";
            # without one, fall back to a fixed-size prefix of the page.
            before_close, closing_tag, _ = response.text.partition("</head>")
            if closing_tag:
                head_markup = before_close + closing_tag
            else:
                head_markup = response.text[:8000]
            soup = BeautifulSoup(head_markup, "lxml")

            def _content_of(tag):
                # Stripped "content" attribute of a meta tag, or "" if absent.
                if not tag:
                    return ""
                return (tag.get("content") or "").strip()

            def _by_property(value):
                return _content_of(soup.find("meta", property=value))

            def _by_name(value):
                return _content_of(soup.find("meta", attrs={"name": value}))

            # Each fallback is only consulted when the previous one is empty.
            title = _by_property("og:title")
            if not title:
                title = _by_name("twitter:title")
            if not title:
                title = soup.title.get_text(strip=True) if soup.title else ""

            description = _by_property("og:description")
            if not description:
                description = _by_name("twitter:description")
            if not description:
                description = _by_name("description")

            return " ".join(piece for piece in (title, description) if piece)
    except Exception as exc:
        logger.warning("OG meta fallback failed for %s: %s", url, exc)
        return ""


# ── Text ──────────────────────────────────────────────────────────────────────

@router.post(
    "/text",
    response_model=VerificationResponse,
    summary="Verify raw text",
    description="Accepts plain text (Tagalog, English, or Taglish) and runs the full verification pipeline.",
)
async def verify_text(body: TextVerifyRequest) -> VerificationResponse:
    """
    Verify a raw text claim, optionally OCR-ing an accompanying image.

    Runs run_verification() on ``body.text`` and echoes the input back as
    ``extracted_text``. When ``body.image_url`` is set, the image is
    downloaded and any OCR output is stored separately on ``ocr_text``.
    The OCR step is best-effort: its failures are logged and never fail
    the request.

    Raises:
        HTTPException: re-raised unchanged when the pipeline raises one
            (matching the other /verify routes); otherwise a 500 wrapping
            any unexpected error.
    """
    start = time.perf_counter()
    logger.info("verify/text called | chars=%d | has_image=%s", len(body.text), bool(body.image_url))
    try:
        result = await run_verification(body.text, input_type="text")
        # NOTE(review): the timing is recorded before the optional OCR step
        # below, so it excludes image download/OCR latency - confirm intended.
        result.processing_time_ms = round((time.perf_counter() - start) * 1000, 1)
        result.extracted_text = body.text

        # If an image URL was provided, fetch it and run OCR — store result separately
        if body.image_url:
            # NOTE(review): this fetches a caller-supplied URL from the server
            # (SSRF surface). Consider restricting schemes/hosts upstream.
            try:
                import httpx
                async with httpx.AsyncClient(timeout=10) as client:
                    img_resp = await client.get(body.image_url)
                if img_resp.status_code == 200:
                    ocr = await extract_text_from_image(img_resp.content)
                    if ocr:
                        result.ocr_text = ocr.strip()
                        logger.info("OCR from image_url: %d chars", len(result.ocr_text))
                else:
                    # Previously skipped silently; leave a trace for debugging.
                    logger.warning(
                        "image_url fetch returned HTTP %s; skipping OCR",
                        img_resp.status_code,
                    )
            except Exception as ocr_exc:
                logger.warning("OCR for image_url failed (non-fatal): %s", ocr_exc)

        return result
    except HTTPException:
        # Preserve deliberate HTTP errors instead of masking them as 500s.
        raise
    except Exception as exc:
        logger.exception("verify/text error: %s", exc)
        raise HTTPException(status_code=500, detail=f"Verification failed: {exc}") from exc


# ── URL ───────────────────────────────────────────────────────────────────────

@router.post(
    "/url",
    response_model=VerificationResponse,
    summary="Verify a URL",
    description="Scrapes the article text from the given URL, then runs the full verification pipeline.",
)
async def verify_url(body: URLVerifyRequest) -> VerificationResponse:
    """
    Scrape text from ``body.url`` and run it through the verification pipeline.

    scrape_url() is tried first; if it yields fewer than 20 non-blank
    characters, _fetch_og_text() is used as a fallback. The domain returned
    by scrape_url() is passed to run_verification() as ``source_domain``.

    Raises:
        HTTPException: 422 when no usable text can be extracted or when a
            ValueError is raised while processing; 500 for any other
            unexpected error. HTTPExceptions raised inside are passed
            through unchanged.
    """
    start = time.perf_counter()
    url_str = str(body.url)
    logger.info("verify/url called | url=%s", url_str)
    try:
        text, domain = await scrape_url(url_str)
        if not text or len(text.strip()) < 20:
            logger.info("scrape_url returned no content for %s — trying OG meta fallback", url_str)
            text = await _fetch_og_text(url_str)
        # Re-check after the fallback: it may also have come back empty.
        if not text or len(text.strip()) < 20:
            raise HTTPException(
                status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
                detail="Could not extract meaningful text from the URL. The page may be paywalled, private, or bot-protected. Try copying the post text and using the Text tab instead.",
            )
        result = await run_verification(text, input_type="url", source_domain=domain)
        result.processing_time_ms = round((time.perf_counter() - start) * 1000, 1)
        result.extracted_text = text.strip()
        return result
    except HTTPException:
        # Keep the deliberate 422 above from being rewrapped as a 500.
        raise
    except ValueError as exc:
        # Expected user-facing errors (e.g. robots.txt block, bad URL)
        logger.warning("verify/url rejected: %s", exc)
        raise HTTPException(status_code=422, detail=str(exc)) from exc
    except Exception as exc:
        logger.exception("verify/url error: %s", exc)
        raise HTTPException(status_code=500, detail=f"URL verification failed: {exc}") from exc

# ── Image ─────────────────────────────────────────────────────────────────────

@router.post(
    "/image",
    response_model=VerificationResponse,
    summary="Verify an image (OCR)",
    description="Accepts an uploaded image file. Runs Tesseract OCR to extract text, then verifies.",
)
async def verify_image(file: UploadFile = File(...)) -> VerificationResponse:
    """
    OCR an uploaded image and verify the text that was recognised.

    Raises:
        HTTPException: 415 when the upload's content type is not one of the
            accepted image types; 422 when OCR yields fewer than 10
            non-blank characters; 500 for any other unexpected error.
    """
    started_at = time.perf_counter()
    logger.info("verify/image called | filename=%s | size=%s", file.filename, file.size)

    # Reject unsupported uploads before reading the file body.
    if file.content_type not in {
        "image/jpeg",
        "image/png",
        "image/webp",
        "image/gif",
        "image/bmp",
    }:
        raise HTTPException(
            status_code=status.HTTP_415_UNSUPPORTED_MEDIA_TYPE,
            detail=f"Unsupported image type: {file.content_type}. Accepted: jpeg, png, webp, gif, bmp",
        )

    try:
        ocr_text = await extract_text_from_image(await file.read())
        has_usable_text = bool(ocr_text) and len(ocr_text.strip()) >= 10
        if not has_usable_text:
            raise HTTPException(
                status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
                detail="No readable text found in the image.",
            )

        verification = await run_verification(ocr_text, input_type="image")
        elapsed_ms = (time.perf_counter() - started_at) * 1000
        verification.processing_time_ms = round(elapsed_ms, 1)
        verification.extracted_text = ocr_text.strip()
        return verification
    except HTTPException:
        # Let the 422 above reach the client untouched.
        raise
    except Exception as exc:
        logger.exception("verify/image error: %s", exc)
        raise HTTPException(status_code=500, detail=f"Image verification failed: {exc}") from exc


# ── Video ─────────────────────────────────────────────────────────────────────

@router.post(
    "/video",
    response_model=VerificationResponse,
    summary="Verify a video/audio (Whisper ASR + Frame OCR)",
    description="Accepts a video or audio file. Runs Whisper ASR and frame OCR in parallel — handles speech-only, on-screen text only, or both.",
)
async def verify_video(file: UploadFile = File(...)) -> VerificationResponse:
    """
    Extract text from an uploaded video/audio file and verify it.

    The upload is read fully into memory and handed to
    transcribe_and_ocr_video(); the text it returns is passed to
    run_verification() with ``input_type="video"``.

    Raises:
        HTTPException: 415 when the upload's content type is not in the
            accepted video/audio set; 422 when fewer than 10 non-blank
            characters are extracted; 500 for any other unexpected error.
    """
    start = time.perf_counter()
    logger.info("verify/video called | filename=%s", file.filename)

    allowed_types = {
        "video/mp4", "video/webm", "video/quicktime",
        "audio/mpeg", "audio/wav", "audio/ogg", "audio/mp4",
    }
    if file.content_type not in allowed_types:
        raise HTTPException(
            status_code=status.HTTP_415_UNSUPPORTED_MEDIA_TYPE,
            detail=f"Unsupported media type: {file.content_type}",
        )
    try:
        media_bytes = await file.read()
        # Uploads may arrive without a filename; pass a placeholder instead.
        text = await transcribe_and_ocr_video(media_bytes, filename=file.filename or "upload")
        if not text or len(text.strip()) < 10:
            raise HTTPException(
                status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
                detail="Could not extract any usable text from the media file. "
                       "Ensure the video has audible speech or visible on-screen text.",
            )
        result = await run_verification(text, input_type="video")
        result.processing_time_ms = round((time.perf_counter() - start) * 1000, 1)
        result.extracted_text = text.strip()
        return result
    except HTTPException:
        # Keep the deliberate 422 above from being rewrapped as a 500.
        raise
    except Exception as exc:
        logger.exception("verify/video error: %s", exc)
        raise HTTPException(status_code=500, detail=f"Video verification failed: {exc}") from exc