amalsp committed
Commit 2db3fe7 · verified · Parent: 82f5373

Fix content extraction to provide clean, structured product data

Files changed (1): main.py (+66, -24)
main.py CHANGED
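In short: the old scrape_all_content returned one row per visible DOM element, while the rewritten version returns one row per detected product container (or per heading, in the fallback path). A hypothetical row under each schema; the field names come from the diff below, but the values are invented for illustration:

# Old schema: one row per DOM element
{"Type": "div", "Content": "Acme Widget $19.99 In stock ...", "Class": "product-card", "ID": ""}

# New schema: one row per product (or per heading in the fallback)
{"Title": "Acme Widget", "Price": "$19.99", "Description": "A sturdy widget.", "Link": "/p/acme-widget"}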
@@ -77,37 +77,79 @@ def scrape_links(soup: BeautifulSoup):
     return df
 
 def scrape_all_content(soup: BeautifulSoup):
-    # Extract ALL visible text content from the page
+    # IMPROVED: Extract only meaningful product/content data
     data = []
 
-    # Get all divs, spans, and p tags with text
-    for element in soup.find_all(["div", "span", "p", "h1", "h2", "h3", "h4", "h5", "h6", "li", "td", "th"]):
-        text = element.get_text(strip=True)
-        if text and len(text) > 2:  # Only include meaningful text
-            # Get element classes and id for context
-            classes = " ".join(element.get("class", []))
-            elem_id = element.get("id", "")
-            elem_type = element.name
-
-            data.append({
-                "Type": elem_type,
-                "Content": text[:500],  # Limit to 500 chars per element
-                "Class": classes[:100] if classes else "",
-                "ID": elem_id[:50] if elem_id else ""
-            })
+    # Remove unwanted elements (navigation, scripts, styles, ads)
+    for tag in soup(["script", "style", "nav", "header", "footer", "aside", "iframe"]):
+        tag.decompose()
+
+    # Try to find product/article containers first (common e-commerce patterns)
+    product_containers = soup.find_all(
+        attrs={
+            "class": re.compile(r"product|item|card|listing|article", re.I)
+        }
+    )
+
+    # If we find product containers, extract from them
+    if product_containers and len(product_containers) > 5:
+        for container in product_containers[:100]:  # Limit to first 100 items
+            # Extract title/name
+            title_elem = container.find(["h1", "h2", "h3", "h4", "a"],
+                                        attrs={"class": re.compile(r"title|name|heading", re.I)})
+            title = title_elem.get_text(strip=True) if title_elem else ""
+
+            # Extract price
+            price_elem = container.find(attrs={"class": re.compile(r"price|cost|amount", re.I)})
+            price = price_elem.get_text(strip=True) if price_elem else ""
+
+            # Extract description
+            desc_elem = container.find(["p", "div"],
+                                       attrs={"class": re.compile(r"desc|detail|summary", re.I)})
+            description = desc_elem.get_text(strip=True)[:200] if desc_elem else ""
+
+            # Extract link
+            link_elem = container.find("a", href=True)
+            link = link_elem["href"] if link_elem else ""
+
+            if title or price:  # Only add if we have meaningful data
+                data.append({
+                    "Title": title[:200],
+                    "Price": price[:50],
+                    "Description": description,
+                    "Link": link[:300]
+                })
+
+    # Fallback: If no product containers found, extract main content
+    else:
+        # Look for main content area
+        main_content = soup.find(["main", "article", "div"],
+                                 attrs={"id": re.compile(r"main|content|primary", re.I)}) or soup
+
+        # Extract headings and associated content
+        for heading in main_content.find_all(["h1", "h2", "h3"]):
+            heading_text = heading.get_text(strip=True)
+            if len(heading_text) > 5:  # Skip very short headings
+                # Get next sibling paragraph or div
+                content = ""
+                next_elem = heading.find_next_sibling(["p", "div", "ul"])
+                if next_elem:
+                    content = next_elem.get_text(strip=True)[:300]
+
+                data.append({
+                    "Title": heading_text[:200],
+                    "Price": "",
+                    "Description": content,
+                    "Link": ""
+                })
 
     if not data:
-        raise HTTPException(status_code=400, detail="No content found on page")
+        raise HTTPException(status_code=400, detail="No meaningful content found on page. Try 'Tables' or 'Links' mode instead.")
 
-    # Remove duplicate content
-    seen = set()
-    unique_data = []
-    for item in data:
-        if item["Content"] not in seen:
-            seen.add(item["Content"])
-            unique_data.append(item)
-
-    df = pd.DataFrame(unique_data)
+    # Remove exact duplicates
+    df = pd.DataFrame(data)
+    df = df.drop_duplicates(subset=["Title"], keep="first")
 
     return df
 
 @app.post("/scrape")
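To sanity-check the new extraction path, here is a minimal, self-contained sketch (not part of the commit) that exercises the same container-detection and deduplication patterns on a toy page. The HTML and values are made up; it assumes beautifulsoup4 and pandas are installed, and note that main.py must import re somewhere for the re.compile() calls in this diff to run. Also note the real function only takes the container path when it finds more than five candidates; with fewer, it falls through to heading extraction.

import re
import pandas as pd
from bs4 import BeautifulSoup

# Toy markup: two "product card" divs whose class names match the
# commit's regexes (product|item|card..., title|name..., price|cost...).
html = """
<main>
  <div class="product-card">
    <h3 class="name">Acme Widget</h3>
    <span class="price">$19.99</span>
    <p class="desc">A sturdy widget.</p>
    <a href="/p/acme-widget">View</a>
  </div>
  <div class="product-card">
    <h3 class="name">Acme Gadget</h3>
    <span class="price">$29.99</span>
    <p class="desc">A shiny gadget.</p>
    <a href="/p/acme-gadget">View</a>
  </div>
</main>
"""
soup = BeautifulSoup(html, "html.parser")

# Same container-detection pattern as the commit; BeautifulSoup applies
# the regex to each class value, so "product-card" matches "product".
containers = soup.find_all(attrs={"class": re.compile(r"product|item|card|listing|article", re.I)})

rows = []
for c in containers:
    title_elem = c.find(["h1", "h2", "h3", "h4", "a"],
                        attrs={"class": re.compile(r"title|name|heading", re.I)})
    price_elem = c.find(attrs={"class": re.compile(r"price|cost|amount", re.I)})
    link_elem = c.find("a", href=True)
    rows.append({
        "Title": title_elem.get_text(strip=True) if title_elem else "",
        "Price": price_elem.get_text(strip=True) if price_elem else "",
        "Link": link_elem["href"] if link_elem else "",
    })

# Same deduplication step as the commit
df = pd.DataFrame(rows).drop_duplicates(subset=["Title"], keep="first")
print(df)  # two rows: Acme Widget / $19.99 and Acme Gadget / $29.99

One behavioral caveat worth flagging in review: deduplicating on "Title" alone will drop distinct products that happen to share a name, which the old Content-based dedup would have kept.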