muddasser committed on
Commit
b4c7a6b
·
verified ·
1 Parent(s): 971b841

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +45 -6
app.py CHANGED
@@ -255,19 +255,58 @@ def scrape_website(url):
255
  browser = p.chromium.launch(headless=True, args=['--no-sandbox','--disable-dev-shm-usage'])
256
  page = browser.new_page()
257
  try:
258
- page.goto(url, wait_until="domcontentloaded", timeout=30000)
 
259
  title = page.title()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
260
  el = None
261
- for sel in ["#content",".mw-parser-output","main",".main-content","#main","article"]:
262
  try:
263
  el = page.query_selector(sel)
264
- if el: break
265
- except: continue
 
 
 
 
 
266
  if not el:
267
- el = page.query_selector("body")
268
- text = clean_text(el.inner_text())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
269
  return {"title": title, "content": text, "url": url}
270
  except Exception as e:
 
271
  st.error(f"Scraping failed: {e}")
272
  return None
273
  finally:
 
255
  browser = p.chromium.launch(headless=True, args=['--no-sandbox','--disable-dev-shm-usage'])
256
  page = browser.new_page()
257
  try:
258
+ # Use networkidle so JS-rendered and late-loading content is fully ready
259
+ page.goto(url, wait_until="networkidle", timeout=45000)
260
  title = page.title()
261
+
262
+ # Try semantic/common content selectors first
263
+ priority_selectors = [
264
+ "#content",
265
+ ".mw-parser-output",
266
+ "main",
267
+ ".main-content",
268
+ "#main",
269
+ "article",
270
+ ".container", # generic bootstrap layouts
271
+ "#wrapper",
272
+ ".page-content",
273
+ ".site-content",
274
+ ]
275
  el = None
276
+ for sel in priority_selectors:
277
  try:
278
  el = page.query_selector(sel)
279
+ if el:
280
+ break
281
+ except:
282
+ continue
283
+
284
+ # If no semantic container found, extract structured text from
285
+ # headings + paragraphs + list items directly — avoids nav/footer noise
286
  if not el:
287
+ chunks = []
288
+ for tag in ["h1","h2","h3","h4","p","li","td","th","span"]:
289
+ elements = page.query_selector_all(tag)
290
+ for e in elements:
291
+ try:
292
+ t = e.inner_text().strip()
293
+ if t and len(t) > 2:
294
+ chunks.append(t)
295
+ except:
296
+ continue
297
+ text = clean_text(" ".join(chunks))
298
+ else:
299
+ text = clean_text(el.inner_text())
300
+
301
+ if not text or len(text) < 50:
302
+ # Last resort — full body
303
+ body = page.query_selector("body")
304
+ text = clean_text(body.inner_text())
305
+
306
+ logging.info(f"Scraped {len(text)} chars from {url}")
307
  return {"title": title, "content": text, "url": url}
308
  except Exception as e:
309
+ logging.error(f"Scrape error: {e}")
310
  st.error(f"Scraping failed: {e}")
311
  return None
312
  finally: