Spaces:

amalsp
/

web-scraper-app

Sleeping

App Files Files Community

amalsp committed 8 days ago

Commit

739af99

verified ·

1 Parent(s): 2a22627

Update main.py

Browse files

Files changed (1) hide show

main.py +53 -0

main.py CHANGED Viewed

@@ -80,6 +80,57 @@ def scrape_links(soup: BeautifulSoup):
     df = pd.DataFrame(links)
     return df
 @app.post("/scrape")
 def scrape_to_excel(req: ScrapeRequest):
     try:
@@ -97,6 +148,8 @@ def scrape_to_excel(req: ScrapeRequest):
         df = scrape_table(soup)
     elif req.mode == "links":
         df = scrape_links(soup)
     else:
         raise HTTPException(status_code=400, detail="Unsupported mode")

     df = pd.DataFrame(links)
     return df
def _class_contains(*keywords: str):
    """Build a ``class_`` filter that matches a CSS class containing any keyword.

    The returned predicate is meant to be passed as ``class_=`` to
    BeautifulSoup's ``find`` / ``find_all``. The comparison is
    case-insensitive on the class value; a falsy value (for example ``None``
    for a tag without a ``class`` attribute) never matches.
    """
    def matches(value) -> bool:
        if not value:
            return False
        lowered = str(value).lower()
        return any(keyword in lowered for keyword in keywords)

    return matches


def _text_or_empty(tag) -> str:
    """Return the stripped text of *tag*, or ``''`` when no tag was found."""
    return tag.get_text(strip=True) if tag is not None else ''


def scrape_products(soup: BeautifulSoup, *, max_products: int = 50) -> pd.DataFrame:
    """Extract product information from e-commerce sites.

    Product containers are located heuristically: several common selectors
    are tried in order and the first one that yields any element is used.
    For each container the name, price, rating, first link and first image
    are collected; fields that cannot be found are left as empty strings.

    Args:
        soup: Parsed page to search.
        max_products: Maximum number of containers to inspect. Defaults to 50.

    Returns:
        A DataFrame with the columns ``Product Name``, ``Price``, ``Rating``,
        ``Link`` and ``Image``, one row per product that has at least a name
        or a price.

    Raises:
        HTTPException: With status 400 when no container matches any
            selector, or when no container yields a name or a price.
    """
    heading_tags = ['h1', 'h2', 'h3', 'h4', 'a']

    # Try different common product selectors; ``or`` keeps the first
    # non-empty result so the more specific selectors take precedence.
    product_containers = (
        soup.find_all('div', class_=_class_contains('product', 'item'))
        or soup.find_all('div', {'data-id': True})
        or soup.find_all('article')
        or soup.find_all('li', class_=_class_contains('product'))
    )
    if not product_containers:
        raise HTTPException(status_code=400, detail="No products found on page")

    products = []
    for container in product_containers[:max_products]:
        # Prefer a heading/anchor whose class hints at a title; otherwise
        # fall back to the first heading/anchor of any kind.
        title = container.find(
            heading_tags,
            class_=_class_contains('title', 'name', 'product'),
        )
        if title is None:
            title = container.find(heading_tags)

        price = container.find(['span', 'div', 'p'], class_=_class_contains('price'))
        rating = container.find(['span', 'div'], class_=_class_contains('rating'))
        link = container.find('a', href=True)
        img = container.find('img')

        # An empty ``src`` must not hide a populated ``data-src``.
        image_url = ''
        if img is not None:
            image_url = img.get('src') or img.get('data-src') or ''

        product = {
            'Product Name': _text_or_empty(title),
            'Price': _text_or_empty(price),
            'Rating': _text_or_empty(rating),
            'Link': link['href'] if link is not None else '',
            'Image': image_url,
        }

        # Only add if we have at least a name or price
        if product['Product Name'] or product['Price']:
            products.append(product)

    if not products:
        raise HTTPException(status_code=400, detail="Could not extract product data")

    return pd.DataFrame(products)
 @app.post("/scrape")
 def scrape_to_excel(req: ScrapeRequest):
     try:
         df = scrape_table(soup)
     elif req.mode == "links":
         df = scrape_links(soup)
+            elif req.mode == "products":
+        df = scrape_products(soup)
     else:
         raise HTTPException(status_code=400, detail="Unsupported mode")