Ryan Christian D. Deniega committed on
Commit
0ee6199
·
1 Parent(s): ed3755c

feat: oEmbed scraping for Facebook/X URLs, remove social URL blocks

Browse files
extension/popup.js CHANGED
@@ -50,12 +50,7 @@ function isUrl(s) {
50
  try { new URL(s); return s.startsWith('http'); } catch { return false }
51
  }
52
 
53
- function isSocialUrl(s) {
54
- try {
55
- const h = new URL(s).hostname
56
- return h.includes('facebook.com') || h.includes('x.com') || h.includes('twitter.com')
57
- } catch { return false }
58
- }
59
 
60
  // ── Render helpers ────────────────────────────────────────────────────────────
61
 
@@ -142,16 +137,12 @@ const currentUrlEl = document.getElementById('current-url')
142
  // Auto-populate input with current tab URL if it's a news article
143
  chrome.tabs.query({ active: true, currentWindow: true }, ([tab]) => {
144
  const url = tab?.url ?? ''
145
- if (url && !url.startsWith('chrome') && !isSocialUrl(url)) {
146
  currentUrlEl.textContent = url
147
  currentUrlEl.title = url
148
  verifyInput.value = url
149
  } else {
150
- const h = (() => { try { return new URL(url).hostname } catch { return '' } })()
151
- const site = h.includes('x.com') || h.includes('twitter.com') ? 'x.com / twitter.com'
152
- : h.includes('facebook.com') ? 'facebook.com'
153
- : 'social media'
154
- currentUrlEl.textContent = `${site} — paste post text below`
155
  }
156
  })
157
 
@@ -167,21 +158,6 @@ btnVerify.addEventListener('click', async () => {
167
  <div class="spinner" aria-hidden="true"></div><br>Analyzing claim…
168
  </div>`
169
 
170
- // Block social media URLs — backend can't scrape them
171
- if (isSocialUrl(raw)) {
172
- btnVerify.disabled = false
173
- btnVerify.setAttribute('aria-busy', 'false')
174
- btnVerify.textContent = 'Verify Claim'
175
- verifyResult.innerHTML = `
176
- <div class="state-error" role="alert">
177
- Facebook, X, and Twitter URLs can't be scraped by the backend.<br>
178
- <span style="font-size:10px;color:var(--text-muted)">
179
- Paste the post's text/caption directly instead, or let the extension auto-scan your feed.
180
- </span>
181
- </div>`
182
- return
183
- }
184
-
185
  const type = isUrl(raw) ? 'VERIFY_URL' : 'VERIFY_TEXT'
186
  const payload = type === 'VERIFY_URL' ? { type, url: raw } : { type, text: raw }
187
  const resp = await msg(payload)
 
50
  try { new URL(s); return s.startsWith('http'); } catch { return false }
51
  }
52
 
53
+
 
 
 
 
 
54
 
55
  // ── Render helpers ────────────────────────────────────────────────────────────
56
 
 
137
  // Auto-populate input with current tab URL if it's a news article
138
  chrome.tabs.query({ active: true, currentWindow: true }, ([tab]) => {
139
  const url = tab?.url ?? ''
140
+ if (url && !url.startsWith('chrome')) {
141
  currentUrlEl.textContent = url
142
  currentUrlEl.title = url
143
  verifyInput.value = url
144
  } else {
145
+ currentUrlEl.textContent = 'No active page'
 
 
 
 
146
  }
147
  })
148
 
 
158
  <div class="spinner" aria-hidden="true"></div><br>Analyzing claim…
159
  </div>`
160
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
161
  const type = isUrl(raw) ? 'VERIFY_URL' : 'VERIFY_TEXT'
162
  const payload = type === 'VERIFY_URL' ? { type, url: raw } : { type, text: raw }
163
  const resp = await msg(payload)
frontend/src/pages/VerifyPage.jsx CHANGED
@@ -271,8 +271,6 @@ export default function VerifyPage() {
271
  useEffect(() => {
272
  if (tab !== 'url' || !input.trim()) { setUrlPreview(null); setUrlPreviewLoading(false); return }
273
  try { new URL(input.trim()) } catch { setUrlPreview(null); setUrlPreviewLoading(false); return }
274
- // Don't attempt to preview social media URLs — they're login-protected
275
- if (isSocialUrl(input.trim())) { setUrlPreview(null); setUrlPreviewLoading(false); return }
276
  setUrlPreviewLoading(true)
277
  const timer = setTimeout(async () => {
278
  try {
@@ -300,12 +298,6 @@ export default function VerifyPage() {
300
  e.preventDefault()
301
  if (!canSubmit) return
302
 
303
- /* Block social media URLs — backend can't scrape them */
304
- if (tab === 'url' && isSocialUrl(input)) {
305
- setError('Facebook, X, and Twitter URLs cannot be scraped — the page is login-protected.\n\nInstead: copy the post\'s text/caption and paste it into the Text tab.')
306
- return
307
- }
308
-
309
  /* Capture what the user submitted before any state resets */
310
  const previewUrl = (tab === 'image' || tab === 'video') && file
311
  ? URL.createObjectURL(file)
@@ -615,28 +607,17 @@ export default function VerifyPage() {
615
  <div id={errorId} role="alert"
616
  className="card p-4 flex items-start gap-2"
617
  style={{ borderColor: isSocialUrl(input) ? 'rgba(220,150,38,0.4)' : 'rgba(220,38,38,0.4)' }}>
618
- <AlertCircle size={15} style={{ color: isSocialUrl(input) ? '#fb923c' : '#f87171', marginTop: 1, flexShrink: 0 }} aria-hidden="true" />
619
  <div>
620
- <p className="text-sm font-semibold" style={{ color: isSocialUrl(input) ? '#fb923c' : '#f87171', fontFamily: 'var(--font-display)' }}>
621
- {isSocialUrl(input) ? 'Social media URLs are not supported' : 'Verification failed'}
 
 
 
 
 
 
622
  </p>
623
- {isSocialUrl(input) ? (
624
- <>
625
- <p className="text-xs mt-1" style={{ color: 'var(--text-secondary)', fontFamily: 'var(--font-body)' }}>
626
- Facebook, X, and Twitter block server-side scraping — the page requires a login.
627
- </p>
628
- <p className="text-xs mt-1.5 font-semibold" style={{ color: 'var(--text-primary)', fontFamily: 'var(--font-body)' }}>
629
- Instead: copy the post caption/text and paste it into the <strong>Text</strong> tab.
630
- </p>
631
- </>
632
- ) : (
633
- <p className="text-xs mt-0.5" style={{ color: 'var(--text-secondary)', fontFamily: 'var(--font-body)' }}>
634
- {error}
635
- {/failed to fetch|network|ERR_/i.test(error) && (
636
- <> — Make sure the backend is running at <code>localhost:8000</code>.</>
637
- )}
638
- </p>
639
- )}
640
  </div>
641
  </div>
642
  )}
 
271
  useEffect(() => {
272
  if (tab !== 'url' || !input.trim()) { setUrlPreview(null); setUrlPreviewLoading(false); return }
273
  try { new URL(input.trim()) } catch { setUrlPreview(null); setUrlPreviewLoading(false); return }
 
 
274
  setUrlPreviewLoading(true)
275
  const timer = setTimeout(async () => {
276
  try {
 
298
  e.preventDefault()
299
  if (!canSubmit) return
300
 
 
 
 
 
 
 
301
  /* Capture what the user submitted before any state resets */
302
  const previewUrl = (tab === 'image' || tab === 'video') && file
303
  ? URL.createObjectURL(file)
 
607
  <div id={errorId} role="alert"
608
  className="card p-4 flex items-start gap-2"
609
  style={{ borderColor: isSocialUrl(input) ? 'rgba(220,150,38,0.4)' : 'rgba(220,38,38,0.4)' }}>
610
+ <AlertCircle size={15} style={{ color: '#f87171', marginTop: 1, flexShrink: 0 }} aria-hidden="true" />
611
  <div>
612
+ <p className="text-sm font-semibold" style={{ color: '#f87171', fontFamily: 'var(--font-display)' }}>
613
+ Verification failed
614
+ </p>
615
+ <p className="text-xs mt-0.5" style={{ color: 'var(--text-secondary)', fontFamily: 'var(--font-body)' }}>
616
+ {error}
617
+ {/failed to fetch|network|ERR_/i.test(error) && (
618
+ <> — Make sure the backend is running at <code>localhost:8000</code>.</>
619
+ )}
620
  </p>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
621
  </div>
622
  </div>
623
  )}
inputs/url_scraper.py CHANGED
@@ -38,6 +38,58 @@ def _get_domain(url: str) -> str:
38
  return urlparse(url).netloc.replace("www.", "")
39
 
40
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
  def _slug_to_text(url: str) -> str:
42
  """
43
  Synthesize minimal article text from the URL slug and domain.
@@ -240,6 +292,20 @@ async def scrape_url(url: str) -> tuple[str, str]:
240
 
241
  domain = _get_domain(url)
242
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
243
  if not _robots_allow(url):
244
  logger.warning("robots.txt disallows scraping %s", url)
245
  raise ValueError(f"Scraping disallowed by robots.txt for {domain}")
 
38
  return urlparse(url).netloc.replace("www.", "")
39
 
40
 
41
+ def _is_social_url(url: str) -> str | None:
42
+ """Return 'facebook' | 'twitter' | None based on hostname."""
43
+ host = urlparse(url).netloc.lower()
44
+ if "facebook.com" in host:
45
+ return "facebook"
46
+ if "x.com" in host or "twitter.com" in host:
47
+ return "twitter"
48
+ return None
49
+
50
+
51
+ async def _scrape_social_oembed(url: str, platform: str, client) -> str:
52
+ """
53
+ Extract post text via the public oEmbed API β€” no login required.
54
+ Facebook: https://www.facebook.com/plugins/post/oembed.json/
55
+ Twitter/X: https://publish.twitter.com/oembed
56
+ Parses the returned HTML blockquote for plain text.
57
+ """
58
+ from bs4 import BeautifulSoup
59
+
60
+ encoded = urllib.parse.quote(url, safe="")
61
+ if platform == "facebook":
62
+ oembed_url = (
63
+ f"https://www.facebook.com/plugins/post/oembed.json/"
64
+ f"?url={encoded}&omitscript=1"
65
+ )
66
+ else:
67
+ oembed_url = (
68
+ f"https://publish.twitter.com/oembed"
69
+ f"?url={encoded}&omit_script=1"
70
+ )
71
+
72
+ try:
73
+ resp = await client.get(oembed_url, timeout=15)
74
+ if resp.status_code != 200:
75
+ logger.warning("oEmbed %s HTTP %d for %s", platform, resp.status_code, url)
76
+ return ""
77
+ data = resp.json()
78
+ html = data.get("html", "")
79
+ if not html:
80
+ return ""
81
+ soup = BeautifulSoup(html, "lxml")
82
+ # Drop the trailing attribution link / timestamp
83
+ for a in soup.find_all("a"):
84
+ a.decompose()
85
+ text = _clean_text(soup.get_text(separator=" ", strip=True))
86
+ logger.info("oEmbed %s: %d chars from %s", platform, len(text), url)
87
+ return text
88
+ except Exception as exc:
89
+ logger.warning("oEmbed failed for %s (%s): %s", url, platform, exc)
90
+ return ""
91
+
92
+
93
  def _slug_to_text(url: str) -> str:
94
  """
95
  Synthesize minimal article text from the URL slug and domain.
 
292
 
293
  domain = _get_domain(url)
294
 
295
+ # ── Social media: use public oEmbed API (no login required) ──────────────
296
+ platform = _is_social_url(url)
297
+ if platform:
298
+ try:
299
+ import httpx
300
+ except ImportError as exc:
301
+ raise RuntimeError(f"Missing dependency: {exc}") from exc
302
+ async with httpx.AsyncClient(timeout=20, follow_redirects=True) as client:
303
+ text = await _scrape_social_oembed(url, platform, client)
304
+ if text and len(text.strip()) >= 20:
305
+ return text, domain
306
+ # oEmbed failed — could be a profile/group URL rather than a specific post
307
+ return "", domain
308
+
309
  if not _robots_allow(url):
310
  logger.warning("robots.txt disallows scraping %s", url)
311
  raise ValueError(f"Scraping disallowed by robots.txt for {domain}")