Spaces:
Running
Running
Ryan Christian D. Deniega committed on
Commit ·
0ee6199
1
Parent(s): ed3755c
feat: oEmbed scraping for Facebook/X URLs, remove social URL blocks
Browse files
- extension/popup.js +3 -27
- frontend/src/pages/VerifyPage.jsx +9 -28
- inputs/url_scraper.py +66 -0
extension/popup.js
CHANGED
|
@@ -50,12 +50,7 @@ function isUrl(s) {
|
|
| 50 |
try { new URL(s); return s.startsWith('http'); } catch { return false }
|
| 51 |
}
|
| 52 |
|
| 53 |
-
|
| 54 |
-
try {
|
| 55 |
-
const h = new URL(s).hostname
|
| 56 |
-
return h.includes('facebook.com') || h.includes('x.com') || h.includes('twitter.com')
|
| 57 |
-
} catch { return false }
|
| 58 |
-
}
|
| 59 |
|
| 60 |
// ── Render helpers ────────────────────────────────────────────────────────────
|
| 61 |
|
|
@@ -142,16 +137,12 @@ const currentUrlEl = document.getElementById('current-url')
|
|
| 142 |
// Auto-populate input with current tab URL if it's a news article
|
| 143 |
chrome.tabs.query({ active: true, currentWindow: true }, ([tab]) => {
|
| 144 |
const url = tab?.url ?? ''
|
| 145 |
-
if (url && !url.startsWith('chrome')
|
| 146 |
currentUrlEl.textContent = url
|
| 147 |
currentUrlEl.title = url
|
| 148 |
verifyInput.value = url
|
| 149 |
} else {
|
| 150 |
-
|
| 151 |
-
const site = h.includes('x.com') || h.includes('twitter.com') ? 'x.com / twitter.com'
|
| 152 |
-
: h.includes('facebook.com') ? 'facebook.com'
|
| 153 |
-
: 'social media'
|
| 154 |
-
currentUrlEl.textContent = `${site} — paste post text below`
|
| 155 |
}
|
| 156 |
})
|
| 157 |
|
|
@@ -167,21 +158,6 @@ btnVerify.addEventListener('click', async () => {
|
|
| 167 |
<div class="spinner" aria-hidden="true"></div><br>Analyzing claim…
|
| 168 |
</div>`
|
| 169 |
|
| 170 |
-
// Block social media URLs — backend can't scrape them
|
| 171 |
-
if (isSocialUrl(raw)) {
|
| 172 |
-
btnVerify.disabled = false
|
| 173 |
-
btnVerify.setAttribute('aria-busy', 'false')
|
| 174 |
-
btnVerify.textContent = 'Verify Claim'
|
| 175 |
-
verifyResult.innerHTML = `
|
| 176 |
-
<div class="state-error" role="alert">
|
| 177 |
-
Facebook, X, and Twitter URLs can't be scraped by the backend.<br>
|
| 178 |
-
<span style="font-size:10px;color:var(--text-muted)">
|
| 179 |
-
Paste the post's text/caption directly instead, or let the extension auto-scan your feed.
|
| 180 |
-
</span>
|
| 181 |
-
</div>`
|
| 182 |
-
return
|
| 183 |
-
}
|
| 184 |
-
|
| 185 |
const type = isUrl(raw) ? 'VERIFY_URL' : 'VERIFY_TEXT'
|
| 186 |
const payload = type === 'VERIFY_URL' ? { type, url: raw } : { type, text: raw }
|
| 187 |
const resp = await msg(payload)
|
|
|
|
| 50 |
try { new URL(s); return s.startsWith('http'); } catch { return false }
|
| 51 |
}
|
| 52 |
|
| 53 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
|
| 55 |
// ── Render helpers ────────────────────────────────────────────────────────────
|
| 56 |
|
|
|
|
| 137 |
// Auto-populate input with current tab URL if it's a news article
|
| 138 |
chrome.tabs.query({ active: true, currentWindow: true }, ([tab]) => {
|
| 139 |
const url = tab?.url ?? ''
|
| 140 |
+
if (url && !url.startsWith('chrome')) {
|
| 141 |
currentUrlEl.textContent = url
|
| 142 |
currentUrlEl.title = url
|
| 143 |
verifyInput.value = url
|
| 144 |
} else {
|
| 145 |
+
currentUrlEl.textContent = 'No active page'
|
|
|
|
|
|
|
|
|
|
|
|
|
| 146 |
}
|
| 147 |
})
|
| 148 |
|
|
|
|
| 158 |
<div class="spinner" aria-hidden="true"></div><br>Analyzing claim…
|
| 159 |
</div>`
|
| 160 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 161 |
const type = isUrl(raw) ? 'VERIFY_URL' : 'VERIFY_TEXT'
|
| 162 |
const payload = type === 'VERIFY_URL' ? { type, url: raw } : { type, text: raw }
|
| 163 |
const resp = await msg(payload)
|
frontend/src/pages/VerifyPage.jsx
CHANGED
|
@@ -271,8 +271,6 @@ export default function VerifyPage() {
|
|
| 271 |
useEffect(() => {
|
| 272 |
if (tab !== 'url' || !input.trim()) { setUrlPreview(null); setUrlPreviewLoading(false); return }
|
| 273 |
try { new URL(input.trim()) } catch { setUrlPreview(null); setUrlPreviewLoading(false); return }
|
| 274 |
-
// Don't attempt to preview social media URLs — they're login-protected
|
| 275 |
-
if (isSocialUrl(input.trim())) { setUrlPreview(null); setUrlPreviewLoading(false); return }
|
| 276 |
setUrlPreviewLoading(true)
|
| 277 |
const timer = setTimeout(async () => {
|
| 278 |
try {
|
|
@@ -300,12 +298,6 @@ export default function VerifyPage() {
|
|
| 300 |
e.preventDefault()
|
| 301 |
if (!canSubmit) return
|
| 302 |
|
| 303 |
-
/* Block social media URLs — backend can't scrape them */
|
| 304 |
-
if (tab === 'url' && isSocialUrl(input)) {
|
| 305 |
-
setError('Facebook, X, and Twitter URLs cannot be scraped — the page is login-protected.\n\nInstead: copy the post\'s text/caption and paste it into the Text tab.')
|
| 306 |
-
return
|
| 307 |
-
}
|
| 308 |
-
|
| 309 |
/* Capture what the user submitted before any state resets */
|
| 310 |
const previewUrl = (tab === 'image' || tab === 'video') && file
|
| 311 |
? URL.createObjectURL(file)
|
|
@@ -615,28 +607,17 @@ export default function VerifyPage() {
|
|
| 615 |
<div id={errorId} role="alert"
|
| 616 |
className="card p-4 flex items-start gap-2"
|
| 617 |
style={{ borderColor: isSocialUrl(input) ? 'rgba(220,150,38,0.4)' : 'rgba(220,38,38,0.4)' }}>
|
| 618 |
-
<AlertCircle size={15} style={{ color:
|
| 619 |
<div>
|
| 620 |
-
<p className="text-sm font-semibold" style={{ color:
|
| 621 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 622 |
</p>
|
| 623 |
-
{isSocialUrl(input) ? (
|
| 624 |
-
<>
|
| 625 |
-
<p className="text-xs mt-1" style={{ color: 'var(--text-secondary)', fontFamily: 'var(--font-body)' }}>
|
| 626 |
-
Facebook, X, and Twitter block server-side scraping — the page requires a login.
|
| 627 |
-
</p>
|
| 628 |
-
<p className="text-xs mt-1.5 font-semibold" style={{ color: 'var(--text-primary)', fontFamily: 'var(--font-body)' }}>
|
| 629 |
-
Instead: copy the post caption/text and paste it into the <strong>Text</strong> tab.
|
| 630 |
-
</p>
|
| 631 |
-
</>
|
| 632 |
-
) : (
|
| 633 |
-
<p className="text-xs mt-0.5" style={{ color: 'var(--text-secondary)', fontFamily: 'var(--font-body)' }}>
|
| 634 |
-
{error}
|
| 635 |
-
{/failed to fetch|network|ERR_/i.test(error) && (
|
| 636 |
-
<> — Make sure the backend is running at <code>localhost:8000</code>.</>
|
| 637 |
-
)}
|
| 638 |
-
</p>
|
| 639 |
-
)}
|
| 640 |
</div>
|
| 641 |
</div>
|
| 642 |
)}
|
|
|
|
| 271 |
useEffect(() => {
|
| 272 |
if (tab !== 'url' || !input.trim()) { setUrlPreview(null); setUrlPreviewLoading(false); return }
|
| 273 |
try { new URL(input.trim()) } catch { setUrlPreview(null); setUrlPreviewLoading(false); return }
|
|
|
|
|
|
|
| 274 |
setUrlPreviewLoading(true)
|
| 275 |
const timer = setTimeout(async () => {
|
| 276 |
try {
|
|
|
|
| 298 |
e.preventDefault()
|
| 299 |
if (!canSubmit) return
|
| 300 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 301 |
/* Capture what the user submitted before any state resets */
|
| 302 |
const previewUrl = (tab === 'image' || tab === 'video') && file
|
| 303 |
? URL.createObjectURL(file)
|
|
|
|
| 607 |
<div id={errorId} role="alert"
|
| 608 |
className="card p-4 flex items-start gap-2"
|
| 609 |
style={{ borderColor: isSocialUrl(input) ? 'rgba(220,150,38,0.4)' : 'rgba(220,38,38,0.4)' }}>
|
| 610 |
+
<AlertCircle size={15} style={{ color: '#f87171', marginTop: 1, flexShrink: 0 }} aria-hidden="true" />
|
| 611 |
<div>
|
| 612 |
+
<p className="text-sm font-semibold" style={{ color: '#f87171', fontFamily: 'var(--font-display)' }}>
|
| 613 |
+
Verification failed
|
| 614 |
+
</p>
|
| 615 |
+
<p className="text-xs mt-0.5" style={{ color: 'var(--text-secondary)', fontFamily: 'var(--font-body)' }}>
|
| 616 |
+
{error}
|
| 617 |
+
{/failed to fetch|network|ERR_/i.test(error) && (
|
| 618 |
+
<> — Make sure the backend is running at <code>localhost:8000</code>.</>
|
| 619 |
+
)}
|
| 620 |
</p>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 621 |
</div>
|
| 622 |
</div>
|
| 623 |
)}
|
inputs/url_scraper.py
CHANGED
|
@@ -38,6 +38,58 @@ def _get_domain(url: str) -> str:
|
|
| 38 |
return urlparse(url).netloc.replace("www.", "")
|
| 39 |
|
| 40 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
def _slug_to_text(url: str) -> str:
|
| 42 |
"""
|
| 43 |
Synthesize minimal article text from the URL slug and domain.
|
|
@@ -240,6 +292,20 @@ async def scrape_url(url: str) -> tuple[str, str]:
|
|
| 240 |
|
| 241 |
domain = _get_domain(url)
|
| 242 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 243 |
if not _robots_allow(url):
|
| 244 |
logger.warning("robots.txt disallows scraping %s", url)
|
| 245 |
raise ValueError(f"Scraping disallowed by robots.txt for {domain}")
|
|
|
|
| 38 |
return urlparse(url).netloc.replace("www.", "")
|
| 39 |
|
| 40 |
|
| 41 |
+
def _is_social_url(url: str) -> str | None:
|
| 42 |
+
"""Return 'facebook' | 'twitter' | None based on hostname."""
|
| 43 |
+
host = urlparse(url).netloc.lower()
|
| 44 |
+
if "facebook.com" in host:
|
| 45 |
+
return "facebook"
|
| 46 |
+
if "x.com" in host or "twitter.com" in host:
|
| 47 |
+
return "twitter"
|
| 48 |
+
return None
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
async def _scrape_social_oembed(url: str, platform: str, client) -> str:
|
| 52 |
+
"""
|
| 53 |
+
Extract post text via the public oEmbed API — no login required.
|
| 54 |
+
Facebook: https://www.facebook.com/plugins/post/oembed.json/
|
| 55 |
+
Twitter/X: https://publish.twitter.com/oembed
|
| 56 |
+
Parses the returned HTML blockquote for plain text.
|
| 57 |
+
"""
|
| 58 |
+
from bs4 import BeautifulSoup
|
| 59 |
+
|
| 60 |
+
encoded = urllib.parse.quote(url, safe="")
|
| 61 |
+
if platform == "facebook":
|
| 62 |
+
oembed_url = (
|
| 63 |
+
f"https://www.facebook.com/plugins/post/oembed.json/"
|
| 64 |
+
f"?url={encoded}&omitscript=1"
|
| 65 |
+
)
|
| 66 |
+
else:
|
| 67 |
+
oembed_url = (
|
| 68 |
+
f"https://publish.twitter.com/oembed"
|
| 69 |
+
f"?url={encoded}&omit_script=1"
|
| 70 |
+
)
|
| 71 |
+
|
| 72 |
+
try:
|
| 73 |
+
resp = await client.get(oembed_url, timeout=15)
|
| 74 |
+
if resp.status_code != 200:
|
| 75 |
+
logger.warning("oEmbed %s HTTP %d for %s", platform, resp.status_code, url)
|
| 76 |
+
return ""
|
| 77 |
+
data = resp.json()
|
| 78 |
+
html = data.get("html", "")
|
| 79 |
+
if not html:
|
| 80 |
+
return ""
|
| 81 |
+
soup = BeautifulSoup(html, "lxml")
|
| 82 |
+
# Drop the trailing attribution link / timestamp
|
| 83 |
+
for a in soup.find_all("a"):
|
| 84 |
+
a.decompose()
|
| 85 |
+
text = _clean_text(soup.get_text(separator=" ", strip=True))
|
| 86 |
+
logger.info("oEmbed %s: %d chars from %s", platform, len(text), url)
|
| 87 |
+
return text
|
| 88 |
+
except Exception as exc:
|
| 89 |
+
logger.warning("oEmbed failed for %s (%s): %s", url, platform, exc)
|
| 90 |
+
return ""
|
| 91 |
+
|
| 92 |
+
|
| 93 |
def _slug_to_text(url: str) -> str:
|
| 94 |
"""
|
| 95 |
Synthesize minimal article text from the URL slug and domain.
|
|
|
|
| 292 |
|
| 293 |
domain = _get_domain(url)
|
| 294 |
|
| 295 |
+
# ── Social media: use public oEmbed API (no login required) ──────────────
|
| 296 |
+
platform = _is_social_url(url)
|
| 297 |
+
if platform:
|
| 298 |
+
try:
|
| 299 |
+
import httpx
|
| 300 |
+
except ImportError as exc:
|
| 301 |
+
raise RuntimeError(f"Missing dependency: {exc}") from exc
|
| 302 |
+
async with httpx.AsyncClient(timeout=20, follow_redirects=True) as client:
|
| 303 |
+
text = await _scrape_social_oembed(url, platform, client)
|
| 304 |
+
if text and len(text.strip()) >= 20:
|
| 305 |
+
return text, domain
|
| 306 |
+
# oEmbed failed — could be a profile/group URL rather than a specific post
|
| 307 |
+
return "", domain
|
| 308 |
+
|
| 309 |
if not _robots_allow(url):
|
| 310 |
logger.warning("robots.txt disallows scraping %s", url)
|
| 311 |
raise ValueError(f"Scraping disallowed by robots.txt for {domain}")
|