muddasser committed on
Commit
b4c7a6b
·
verified ·
1 Parent(s): 971b841

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +45 -6
app.py CHANGED
@@ -255,19 +255,58 @@ def scrape_website(url):
255
  browser = p.chromium.launch(headless=True, args=['--no-sandbox','--disable-dev-shm-usage'])
256
  page = browser.new_page()
257
  try:
258
- page.goto(url, wait_until="domcontentloaded", timeout=30000)
 
259
  title = page.title()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
260
  el = None
261
- for sel in ["#content",".mw-parser-output","main",".main-content","#main","article"]:
262
  try:
263
  el = page.query_selector(sel)
264
- if el: break
265
- except: continue
 
 
 
 
 
266
  if not el:
267
- el = page.query_selector("body")
268
- text = clean_text(el.inner_text())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
269
  return {"title": title, "content": text, "url": url}
270
  except Exception as e:
 
271
  st.error(f"Scraping failed: {e}")
272
  return None
273
  finally:
 
255
  browser = p.chromium.launch(headless=True, args=['--no-sandbox','--disable-dev-shm-usage'])
256
  page = browser.new_page()
257
  try:
258
+ # Use networkidle so JS-rendered and late-loading content is fully ready
259
+ page.goto(url, wait_until="networkidle", timeout=45000)
260
  title = page.title()
261
+
262
+ # Try semantic/common content selectors first
263
+ priority_selectors = [
264
+ "#content",
265
+ ".mw-parser-output",
266
+ "main",
267
+ ".main-content",
268
+ "#main",
269
+ "article",
270
+ ".container", # generic bootstrap layouts
271
+ "#wrapper",
272
+ ".page-content",
273
+ ".site-content",
274
+ ]
275
  el = None
276
+ for sel in priority_selectors:
277
  try:
278
  el = page.query_selector(sel)
279
+ if el:
280
+ break
281
+ except:
282
+ continue
283
+
284
+ # If no semantic container found, extract structured text from
285
+ # headings + paragraphs + list items directly — avoids nav/footer noise
286
  if not el:
287
+ chunks = []
288
+ for tag in ["h1","h2","h3","h4","p","li","td","th","span"]:
289
+ elements = page.query_selector_all(tag)
290
+ for e in elements:
291
+ try:
292
+ t = e.inner_text().strip()
293
+ if t and len(t) > 2:
294
+ chunks.append(t)
295
+ except:
296
+ continue
297
+ text = clean_text(" ".join(chunks))
298
+ else:
299
+ text = clean_text(el.inner_text())
300
+
301
+ if not text or len(text) < 50:
302
+ # Last resort — full body
303
+ body = page.query_selector("body")
304
+ text = clean_text(body.inner_text())
305
+
306
+ logging.info(f"Scraped {len(text)} chars from {url}")
307
  return {"title": title, "content": text, "url": url}
308
  except Exception as e:
309
+ logging.error(f"Scrape error: {e}")
310
  st.error(f"Scraping failed: {e}")
311
  return None
312
  finally: