Ryan Christian D. Deniega committed on
Commit
0ee6199
·
1 Parent(s): ed3755c

feat: oEmbed scraping for Facebook/X URLs, remove social URL blocks

Browse files
extension/popup.js CHANGED
@@ -50,12 +50,7 @@ function isUrl(s) {
50
  try { new URL(s); return s.startsWith('http'); } catch { return false }
51
  }
52
 
53
- function isSocialUrl(s) {
54
- try {
55
- const h = new URL(s).hostname
56
- return h.includes('facebook.com') || h.includes('x.com') || h.includes('twitter.com')
57
- } catch { return false }
58
- }
59
 
60
  // ── Render helpers ────────────────────────────────────────────────────────────
61
 
@@ -142,16 +137,12 @@ const currentUrlEl = document.getElementById('current-url')
142
  // Auto-populate input with current tab URL if it's a news article
143
  chrome.tabs.query({ active: true, currentWindow: true }, ([tab]) => {
144
  const url = tab?.url ?? ''
145
- if (url && !url.startsWith('chrome') && !isSocialUrl(url)) {
146
  currentUrlEl.textContent = url
147
  currentUrlEl.title = url
148
  verifyInput.value = url
149
  } else {
150
- const h = (() => { try { return new URL(url).hostname } catch { return '' } })()
151
- const site = h.includes('x.com') || h.includes('twitter.com') ? 'x.com / twitter.com'
152
- : h.includes('facebook.com') ? 'facebook.com'
153
- : 'social media'
154
- currentUrlEl.textContent = `${site} — paste post text below`
155
  }
156
  })
157
 
@@ -167,21 +158,6 @@ btnVerify.addEventListener('click', async () => {
167
  <div class="spinner" aria-hidden="true"></div><br>Analyzing claim…
168
  </div>`
169
 
170
- // Block social media URLs — backend can't scrape them
171
- if (isSocialUrl(raw)) {
172
- btnVerify.disabled = false
173
- btnVerify.setAttribute('aria-busy', 'false')
174
- btnVerify.textContent = 'Verify Claim'
175
- verifyResult.innerHTML = `
176
- <div class="state-error" role="alert">
177
- Facebook, X, and Twitter URLs can't be scraped by the backend.<br>
178
- <span style="font-size:10px;color:var(--text-muted)">
179
- Paste the post's text/caption directly instead, or let the extension auto-scan your feed.
180
- </span>
181
- </div>`
182
- return
183
- }
184
-
185
  const type = isUrl(raw) ? 'VERIFY_URL' : 'VERIFY_TEXT'
186
  const payload = type === 'VERIFY_URL' ? { type, url: raw } : { type, text: raw }
187
  const resp = await msg(payload)
 
50
  try { new URL(s); return s.startsWith('http'); } catch { return false }
51
  }
52
 
53
+
 
 
 
 
 
54
 
55
  // ── Render helpers ────────────────────────────────────────────────────────────
56
 
 
137
  // Auto-populate input with current tab URL if it's a news article
138
  chrome.tabs.query({ active: true, currentWindow: true }, ([tab]) => {
139
  const url = tab?.url ?? ''
140
+ if (url && !url.startsWith('chrome')) {
141
  currentUrlEl.textContent = url
142
  currentUrlEl.title = url
143
  verifyInput.value = url
144
  } else {
145
+ currentUrlEl.textContent = 'No active page'
 
 
 
 
146
  }
147
  })
148
 
 
158
  <div class="spinner" aria-hidden="true"></div><br>Analyzing claim…
159
  </div>`
160
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
161
  const type = isUrl(raw) ? 'VERIFY_URL' : 'VERIFY_TEXT'
162
  const payload = type === 'VERIFY_URL' ? { type, url: raw } : { type, text: raw }
163
  const resp = await msg(payload)
frontend/src/pages/VerifyPage.jsx CHANGED
@@ -271,8 +271,6 @@ export default function VerifyPage() {
271
  useEffect(() => {
272
  if (tab !== 'url' || !input.trim()) { setUrlPreview(null); setUrlPreviewLoading(false); return }
273
  try { new URL(input.trim()) } catch { setUrlPreview(null); setUrlPreviewLoading(false); return }
274
- // Don't attempt to preview social media URLs — they're login-protected
275
- if (isSocialUrl(input.trim())) { setUrlPreview(null); setUrlPreviewLoading(false); return }
276
  setUrlPreviewLoading(true)
277
  const timer = setTimeout(async () => {
278
  try {
@@ -300,12 +298,6 @@ export default function VerifyPage() {
300
  e.preventDefault()
301
  if (!canSubmit) return
302
 
303
- /* Block social media URLs — backend can't scrape them */
304
- if (tab === 'url' && isSocialUrl(input)) {
305
- setError('Facebook, X, and Twitter URLs cannot be scraped — the page is login-protected.\n\nInstead: copy the post\'s text/caption and paste it into the Text tab.')
306
- return
307
- }
308
-
309
  /* Capture what the user submitted before any state resets */
310
  const previewUrl = (tab === 'image' || tab === 'video') && file
311
  ? URL.createObjectURL(file)
@@ -615,28 +607,17 @@ export default function VerifyPage() {
615
  <div id={errorId} role="alert"
616
  className="card p-4 flex items-start gap-2"
617
  style={{ borderColor: isSocialUrl(input) ? 'rgba(220,150,38,0.4)' : 'rgba(220,38,38,0.4)' }}>
618
- <AlertCircle size={15} style={{ color: isSocialUrl(input) ? '#fb923c' : '#f87171', marginTop: 1, flexShrink: 0 }} aria-hidden="true" />
619
  <div>
620
- <p className="text-sm font-semibold" style={{ color: isSocialUrl(input) ? '#fb923c' : '#f87171', fontFamily: 'var(--font-display)' }}>
621
- {isSocialUrl(input) ? 'Social media URLs are not supported' : 'Verification failed'}
 
 
 
 
 
 
622
  </p>
623
- {isSocialUrl(input) ? (
624
- <>
625
- <p className="text-xs mt-1" style={{ color: 'var(--text-secondary)', fontFamily: 'var(--font-body)' }}>
626
- Facebook, X, and Twitter block server-side scraping — the page requires a login.
627
- </p>
628
- <p className="text-xs mt-1.5 font-semibold" style={{ color: 'var(--text-primary)', fontFamily: 'var(--font-body)' }}>
629
- Instead: copy the post caption/text and paste it into the <strong>Text</strong> tab.
630
- </p>
631
- </>
632
- ) : (
633
- <p className="text-xs mt-0.5" style={{ color: 'var(--text-secondary)', fontFamily: 'var(--font-body)' }}>
634
- {error}
635
- {/failed to fetch|network|ERR_/i.test(error) && (
636
- <> — Make sure the backend is running at <code>localhost:8000</code>.</>
637
- )}
638
- </p>
639
- )}
640
  </div>
641
  </div>
642
  )}
 
271
  useEffect(() => {
272
  if (tab !== 'url' || !input.trim()) { setUrlPreview(null); setUrlPreviewLoading(false); return }
273
  try { new URL(input.trim()) } catch { setUrlPreview(null); setUrlPreviewLoading(false); return }
 
 
274
  setUrlPreviewLoading(true)
275
  const timer = setTimeout(async () => {
276
  try {
 
298
  e.preventDefault()
299
  if (!canSubmit) return
300
 
 
 
 
 
 
 
301
  /* Capture what the user submitted before any state resets */
302
  const previewUrl = (tab === 'image' || tab === 'video') && file
303
  ? URL.createObjectURL(file)
 
607
  <div id={errorId} role="alert"
608
  className="card p-4 flex items-start gap-2"
609
  style={{ borderColor: isSocialUrl(input) ? 'rgba(220,150,38,0.4)' : 'rgba(220,38,38,0.4)' }}>
610
+ <AlertCircle size={15} style={{ color: '#f87171', marginTop: 1, flexShrink: 0 }} aria-hidden="true" />
611
  <div>
612
+ <p className="text-sm font-semibold" style={{ color: '#f87171', fontFamily: 'var(--font-display)' }}>
613
+ Verification failed
614
+ </p>
615
+ <p className="text-xs mt-0.5" style={{ color: 'var(--text-secondary)', fontFamily: 'var(--font-body)' }}>
616
+ {error}
617
+ {/failed to fetch|network|ERR_/i.test(error) && (
618
+ <> — Make sure the backend is running at <code>localhost:8000</code>.</>
619
+ )}
620
  </p>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
621
  </div>
622
  </div>
623
  )}
inputs/url_scraper.py CHANGED
@@ -38,6 +38,58 @@ def _get_domain(url: str) -> str:
38
  return urlparse(url).netloc.replace("www.", "")
39
 
40
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
  def _slug_to_text(url: str) -> str:
42
  """
43
  Synthesize minimal article text from the URL slug and domain.
@@ -240,6 +292,20 @@ async def scrape_url(url: str) -> tuple[str, str]:
240
 
241
  domain = _get_domain(url)
242
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
243
  if not _robots_allow(url):
244
  logger.warning("robots.txt disallows scraping %s", url)
245
  raise ValueError(f"Scraping disallowed by robots.txt for {domain}")
 
38
  return urlparse(url).netloc.replace("www.", "")
39
 
40
 
41
+ def _is_social_url(url: str) -> str | None:
42
+ """Return 'facebook' | 'twitter' | None based on hostname."""
43
+ host = urlparse(url).netloc.lower()
44
+ if "facebook.com" in host:
45
+ return "facebook"
46
+ if "x.com" in host or "twitter.com" in host:
47
+ return "twitter"
48
+ return None
49
+
50
+
51
+ async def _scrape_social_oembed(url: str, platform: str, client) -> str:
52
+ """
53
+ Extract post text via the public oEmbed API β€” no login required.
54
+ Facebook: https://www.facebook.com/plugins/post/oembed.json/
55
+ Twitter/X: https://publish.twitter.com/oembed
56
+ Parses the returned HTML blockquote for plain text.
57
+ """
58
+ from bs4 import BeautifulSoup
59
+
60
+ encoded = urllib.parse.quote(url, safe="")
61
+ if platform == "facebook":
62
+ oembed_url = (
63
+ f"https://www.facebook.com/plugins/post/oembed.json/"
64
+ f"?url={encoded}&omitscript=1"
65
+ )
66
+ else:
67
+ oembed_url = (
68
+ f"https://publish.twitter.com/oembed"
69
+ f"?url={encoded}&omit_script=1"
70
+ )
71
+
72
+ try:
73
+ resp = await client.get(oembed_url, timeout=15)
74
+ if resp.status_code != 200:
75
+ logger.warning("oEmbed %s HTTP %d for %s", platform, resp.status_code, url)
76
+ return ""
77
+ data = resp.json()
78
+ html = data.get("html", "")
79
+ if not html:
80
+ return ""
81
+ soup = BeautifulSoup(html, "lxml")
82
+ # Drop the trailing attribution link / timestamp
83
+ for a in soup.find_all("a"):
84
+ a.decompose()
85
+ text = _clean_text(soup.get_text(separator=" ", strip=True))
86
+ logger.info("oEmbed %s: %d chars from %s", platform, len(text), url)
87
+ return text
88
+ except Exception as exc:
89
+ logger.warning("oEmbed failed for %s (%s): %s", url, platform, exc)
90
+ return ""
91
+
92
+
93
  def _slug_to_text(url: str) -> str:
94
  """
95
  Synthesize minimal article text from the URL slug and domain.
 
292
 
293
  domain = _get_domain(url)
294
 
295
+ # ── Social media: use public oEmbed API (no login required) ──────────────
296
+ platform = _is_social_url(url)
297
+ if platform:
298
+ try:
299
+ import httpx
300
+ except ImportError as exc:
301
+ raise RuntimeError(f"Missing dependency: {exc}") from exc
302
+ async with httpx.AsyncClient(timeout=20, follow_redirects=True) as client:
303
+ text = await _scrape_social_oembed(url, platform, client)
304
+ if text and len(text.strip()) >= 20:
305
+ return text, domain
306
+ # oEmbed failed — could be a profile/group URL rather than a specific post
307
+ return "", domain
308
+
309
  if not _robots_allow(url):
310
  logger.warning("robots.txt disallows scraping %s", url)
311
  raise ValueError(f"Scraping disallowed by robots.txt for {domain}")