Update app.py
Browse files
app.py
CHANGED
|
@@ -255,19 +255,58 @@ def scrape_website(url):
|
|
| 255 |
browser = p.chromium.launch(headless=True, args=['--no-sandbox','--disable-dev-shm-usage'])
|
| 256 |
page = browser.new_page()
|
| 257 |
try:
|
| 258 |
-
|
|
|
|
| 259 |
title = page.title()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 260 |
el = None
|
| 261 |
-
for sel in
|
| 262 |
try:
|
| 263 |
el = page.query_selector(sel)
|
| 264 |
-
if el:
|
| 265 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 266 |
if not el:
|
| 267 |
-
|
| 268 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 269 |
return {"title": title, "content": text, "url": url}
|
| 270 |
except Exception as e:
|
|
|
|
| 271 |
st.error(f"Scraping failed: {e}")
|
| 272 |
return None
|
| 273 |
finally:
|
|
|
|
| 255 |
browser = p.chromium.launch(headless=True, args=['--no-sandbox','--disable-dev-shm-usage'])
|
| 256 |
page = browser.new_page()
|
| 257 |
try:
|
| 258 |
+
# Wait for networkidle (a quiet network window) so most JS-rendered content has loaded; pages that poll continuously may hit the 45 s timeout instead
|
| 259 |
+
page.goto(url, wait_until="networkidle", timeout=45000)
|
| 260 |
title = page.title()
|
| 261 |
+
|
| 262 |
+
# Try semantic/common content selectors first
|
| 263 |
+
priority_selectors = [
|
| 264 |
+
"#content",
|
| 265 |
+
".mw-parser-output",
|
| 266 |
+
"main",
|
| 267 |
+
".main-content",
|
| 268 |
+
"#main",
|
| 269 |
+
"article",
|
| 270 |
+
".container", # generic bootstrap layouts
|
| 271 |
+
"#wrapper",
|
| 272 |
+
".page-content",
|
| 273 |
+
".site-content",
|
| 274 |
+
]
|
| 275 |
el = None
|
| 276 |
+
for sel in priority_selectors:
|
| 277 |
try:
|
| 278 |
el = page.query_selector(sel)
|
| 279 |
+
if el:
|
| 280 |
+
break
|
| 281 |
+
except:
|
| 282 |
+
continue
|
| 283 |
+
|
| 284 |
+
# If no semantic container found, extract structured text from
|
| 285 |
+
# headings, paragraphs, list items, table cells and spans page-wide (nav/footer text in those tags is still included)
|
| 286 |
if not el:
|
| 287 |
+
chunks = []
|
| 288 |
+
for tag in ["h1","h2","h3","h4","p","li","td","th","span"]:
|
| 289 |
+
elements = page.query_selector_all(tag)
|
| 290 |
+
for e in elements:
|
| 291 |
+
try:
|
| 292 |
+
t = e.inner_text().strip()
|
| 293 |
+
if t and len(t) > 2:
|
| 294 |
+
chunks.append(t)
|
| 295 |
+
except:
|
| 296 |
+
continue
|
| 297 |
+
text = clean_text(" ".join(chunks))
|
| 298 |
+
else:
|
| 299 |
+
text = clean_text(el.inner_text())
|
| 300 |
+
|
| 301 |
+
if not text or len(text) < 50:
|
| 302 |
+
# Last resort — full body
|
| 303 |
+
body = page.query_selector("body")
|
| 304 |
+
text = clean_text(body.inner_text())
|
| 305 |
+
|
| 306 |
+
logging.info(f"Scraped {len(text)} chars from {url}")
|
| 307 |
return {"title": title, "content": text, "url": url}
|
| 308 |
except Exception as e:
|
| 309 |
+
logging.error(f"Scrape error: {e}")
|
| 310 |
st.error(f"Scraping failed: {e}")
|
| 311 |
return None
|
| 312 |
finally:
|