muddasser committed on
Commit
bf68fe9
·
verified ·
1 Parent(s): b04d996

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +40 -44
app.py CHANGED
@@ -228,8 +228,9 @@ for key, default in [
228
  # ── Utilities ──────────────────────────────────────────────────────────────────
229
 
230
  def clean_text(text):
231
- text = re.sub(r'\s+', ' ', text)
232
- text = re.sub(r'[^\w\s.,!?;:]', ' ', text)
 
233
  return text.strip()
234
 
235
  def is_valid_url(url):
@@ -255,56 +256,51 @@ def scrape_website(url):
255
  browser = p.chromium.launch(headless=True, args=['--no-sandbox','--disable-dev-shm-usage'])
256
  page = browser.new_page()
257
  try:
258
- # Use networkidle so JS-rendered and late-loading content is fully ready
259
  page.goto(url, wait_until="networkidle", timeout=45000)
260
  title = page.title()
261
 
262
- # Try semantic/common content selectors first
263
- priority_selectors = [
264
- "#content",
265
- ".mw-parser-output",
266
- "main",
267
- ".main-content",
268
- "#main",
269
- "article",
270
- ".container", # generic bootstrap layouts
271
- "#wrapper",
272
- ".page-content",
273
- ".site-content",
274
- ]
275
- el = None
276
- for sel in priority_selectors:
277
  try:
278
- el = page.query_selector(sel)
279
- if el:
280
- break
 
281
  except:
282
  continue
283
 
284
- # If no semantic container found, extract structured text from
285
- # headings + paragraphs + list items directly — avoids nav/footer noise
286
- if not el:
287
- chunks = []
288
- for tag in ["h1","h2","h3","h4","p","li","td","th","span"]:
289
- elements = page.query_selector_all(tag)
290
- for e in elements:
291
- try:
292
- t = e.inner_text().strip()
293
- if t and len(t) > 2:
294
- chunks.append(t)
295
- except:
296
- continue
297
- text = clean_text(" ".join(chunks))
298
- else:
299
- text = clean_text(el.inner_text())
300
-
301
- if not text or len(text) < 50:
302
- # Last resort — full body
 
 
 
 
 
303
  body = page.query_selector("body")
304
- text = clean_text(body.inner_text())
305
 
306
- logging.info(f"Scraped {len(text)} chars from {url}")
307
- return {"title": title, "content": text, "url": url}
308
  except Exception as e:
309
  logging.error(f"Scrape error: {e}")
310
  st.error(f"Scraping failed: {e}")
@@ -484,4 +480,4 @@ else:
484
  </div>
485
  Enter a URL above and hit <strong>Scrape</strong> to get started.
486
  </div>
487
- """, unsafe_allow_html=True)
 
228
  # ── Utilities ──────────────────────────────────────────────────────────────────
229
 
230
  def clean_text(text):
231
+ # Only collapse whitespace — preserve Rs. prices, commas, symbols
232
+ text = re.sub(r'[ \t]+', ' ', text)
233
+ text = re.sub(r'\n{3,}', '\n\n', text)
234
  return text.strip()
235
 
236
  def is_valid_url(url):
 
256
  browser = p.chromium.launch(headless=True, args=['--no-sandbox','--disable-dev-shm-usage'])
257
  page = browser.new_page()
258
  try:
 
259
  page.goto(url, wait_until="networkidle", timeout=45000)
260
  title = page.title()
261
 
262
+ # Strategy 1: extract structured name+price pairs from <li> elements
263
+ # Works well for listing/price pages like whatmobile.com.pk
264
+ lines = []
265
+ li_elements = page.query_selector_all("li")
266
+ for li in li_elements:
 
 
 
 
 
 
 
 
 
 
267
  try:
268
+ text = li.inner_text().strip()
269
+ # Keep li items that contain a heading and a price-like pattern
270
+ if text and len(text) > 3 and len(text) < 300:
271
+ lines.append(text)
272
  except:
273
  continue
274
 
275
+ # Strategy 2: grab all headings and paragraphs too
276
+ for tag in ["h1", "h2", "h3", "h4", "p", "td"]:
277
+ elements = page.query_selector_all(tag)
278
+ for e in elements:
279
+ try:
280
+ text = e.inner_text().strip()
281
+ if text and len(text) > 3 and len(text) < 500:
282
+ lines.append(text)
283
+ except:
284
+ continue
285
+
286
+ # Deduplicate while preserving order
287
+ seen = set()
288
+ unique_lines = []
289
+ for line in lines:
290
+ normalised = re.sub(r'\s+', ' ', line).strip()
291
+ if normalised not in seen:
292
+ seen.add(normalised)
293
+ unique_lines.append(normalised)
294
+
295
+ content = "\n".join(unique_lines)
296
+
297
+ # Fallback to full body if we got almost nothing
298
+ if len(content) < 200:
299
  body = page.query_selector("body")
300
+ content = clean_text(body.inner_text()) if body else content
301
 
302
+ logging.info(f"Scraped {len(content)} chars from {url}")
303
+ return {"title": title, "content": content, "url": url}
304
  except Exception as e:
305
  logging.error(f"Scrape error: {e}")
306
  st.error(f"Scraping failed: {e}")
 
480
  </div>
481
  Enter a URL above and hit <strong>Scrape</strong> to get started.
482
  </div>
483
+ """, unsafe_allow_html=True)