Spaces:
Sleeping
Sleeping
Update main.py
Browse files
main.py
CHANGED
|
@@ -80,6 +80,57 @@ def scrape_links(soup: BeautifulSoup):
|
|
| 80 |
df = pd.DataFrame(links)
|
| 81 |
return df
|
| 82 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 83 |
@app.post("/scrape")
|
| 84 |
def scrape_to_excel(req: ScrapeRequest):
|
| 85 |
try:
|
|
@@ -97,6 +148,8 @@ def scrape_to_excel(req: ScrapeRequest):
|
|
| 97 |
df = scrape_table(soup)
|
| 98 |
elif req.mode == "links":
|
| 99 |
df = scrape_links(soup)
|
|
|
|
|
|
|
| 100 |
else:
|
| 101 |
raise HTTPException(status_code=400, detail="Unsupported mode")
|
| 102 |
|
|
|
|
| 80 |
df = pd.DataFrame(links)
|
| 81 |
return df
|
| 82 |
|
| 83 |
+
def _class_contains(*keywords):
    """Build a ``class_`` filter that matches any of *keywords*.

    The returned predicate lower-cases the value it is given and reports
    whether any keyword occurs in it as a substring. Empty or missing
    values (e.g. ``None`` for a tag without a class attribute) never match.
    """
    def matches(value):
        if not value:
            return False
        lowered = str(value).lower()
        return any(keyword in lowered for keyword in keywords)

    return matches


def scrape_products(soup: BeautifulSoup, max_products: int = 50) -> pd.DataFrame:
    """Extract product information from e-commerce style pages.

    Container elements are located by trying several heuristics in order
    and using the first one that yields any match:

    1. ``<div>`` whose class contains "product" or "item"
    2. ``<div>`` carrying a ``data-id`` attribute
    3. ``<article>``
    4. ``<li>`` whose class contains "product"

    Args:
        soup: Parsed document to search.
        max_products: Maximum number of containers to inspect, counted
            from the start of the match list. Defaults to 50.

    Returns:
        A DataFrame with one row per product and the columns
        ``Product Name``, ``Price``, ``Rating``, ``Link`` and ``Image``.
        Fields that could not be found are empty strings.

    Raises:
        HTTPException: 400 if no container matches any heuristic, or if
            no inspected container yields a name or a price.
    """
    heading_tags = ['h1', 'h2', 'h3', 'h4', 'a']

    # Try different common product selectors; an empty result list is
    # falsy, so ``or`` falls through to the next heuristic.
    product_containers = (
        soup.find_all('div', class_=_class_contains('product', 'item'))
        or soup.find_all('div', {'data-id': True})
        or soup.find_all('article')
        or soup.find_all('li', class_=_class_contains('product'))
    )

    if not product_containers:
        raise HTTPException(status_code=400, detail="No products found on page")

    products = []
    for container in product_containers[:max_products]:
        # Prefer a heading/anchor whose class hints at a title, then fall
        # back to the first heading/anchor of any kind.
        title = container.find(
            heading_tags,
            class_=_class_contains('title', 'name', 'product'),
        )
        if title is None:
            title = container.find(heading_tags)

        price = container.find(['span', 'div', 'p'], class_=_class_contains('price'))
        rating = container.find(['span', 'div'], class_=_class_contains('rating'))
        link = container.find('a', href=True)
        img = container.find('img')

        if img is None:
            image_url = ''
        else:
            # Use ``or`` rather than a ``get`` default so that an empty
            # ``src`` attribute still falls back to ``data-src``.
            image_url = img.get('src') or img.get('data-src') or ''

        product = {
            'Product Name': title.get_text(strip=True) if title is not None else '',
            'Price': price.get_text(strip=True) if price is not None else '',
            'Rating': rating.get_text(strip=True) if rating is not None else '',
            'Link': link['href'] if link is not None else '',
            'Image': image_url,
        }

        # Only keep rows that carry at least a name or a price.
        if product['Product Name'] or product['Price']:
            products.append(product)

    if not products:
        raise HTTPException(status_code=400, detail="Could not extract product data")

    return pd.DataFrame(products)
|
| 132 |
+
|
| 133 |
+
|
| 134 |
@app.post("/scrape")
|
| 135 |
def scrape_to_excel(req: ScrapeRequest):
|
| 136 |
try:
|
|
|
|
| 148 |
df = scrape_table(soup)
|
| 149 |
elif req.mode == "links":
|
| 150 |
df = scrape_links(soup)
|
| 151 |
+
elif req.mode == "products":
|
| 152 |
+
df = scrape_products(soup)
|
| 153 |
else:
|
| 154 |
raise HTTPException(status_code=400, detail="Unsupported mode")
|
| 155 |
|