amalsp committed on
Commit
739af99
·
verified ·
1 Parent(s): 2a22627

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +53 -0
main.py CHANGED
@@ -80,6 +80,57 @@ def scrape_links(soup: BeautifulSoup):
80
  df = pd.DataFrame(links)
81
  return df
82
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
  @app.post("/scrape")
84
  def scrape_to_excel(req: ScrapeRequest):
85
  try:
@@ -97,6 +148,8 @@ def scrape_to_excel(req: ScrapeRequest):
97
  df = scrape_table(soup)
98
  elif req.mode == "links":
99
  df = scrape_links(soup)
 
 
100
  else:
101
  raise HTTPException(status_code=400, detail="Unsupported mode")
102
 
 
80
  df = pd.DataFrame(links)
81
  return df
82
 
83
def _class_has(*keywords):
    """Build a ``class_`` filter that matches class values containing a keyword.

    The returned predicate receives a class value (or ``None`` when the tag has
    no ``class`` attribute) and performs a case-insensitive substring test
    against every keyword.
    """
    def matches(value):
        if not value:
            return False
        lowered = str(value).lower()
        return any(keyword in lowered for keyword in keywords)

    return matches


def scrape_products(soup: "BeautifulSoup", max_products: int = 50) -> pd.DataFrame:
    """Extract product information from e-commerce sites.

    Candidate containers are located with a chain of heuristic selectors; the
    first selector that matches anything is used. For every container the
    name, price, rating, link and image are looked up among its descendants.

    Args:
        soup: Parsed document to search.
        max_products: Maximum number of rows to return. Expected to be
            positive; the limit counts extracted products, not inspected
            containers, so non-product matches do not use up the quota.

    Returns:
        A DataFrame with the columns ``Product Name``, ``Price``, ``Rating``,
        ``Link`` and ``Image``. Missing fields are empty strings.

    Raises:
        HTTPException: With status 400 when no container matches any selector
            or when no container yields a name or a price.
    """
    # Try different common product selectors, from most to least specific.
    product_containers = (
        soup.find_all('div', class_=_class_has('product', 'item'))
        or soup.find_all('div', {'data-id': True})
        or soup.find_all('article')
        or soup.find_all('li', class_=_class_has('product'))
    )

    if not product_containers:
        raise HTTPException(status_code=400, detail="No products found on page")

    # Hoisted out of the loop: these filters are identical for every container.
    heading_tags = ['h1', 'h2', 'h3', 'h4', 'a']
    title_filter = _class_has('title', 'name', 'product')
    price_filter = _class_has('price')
    rating_filter = _class_has('rating')

    products = []
    seen = set()
    for container in product_containers:
        if len(products) >= max_products:
            break

        # Prefer a heading/link whose class looks like a title; otherwise
        # fall back to the first heading or link of any kind.
        title = container.find(heading_tags, class_=title_filter)
        if title is None:
            title = container.find(heading_tags)
        price = container.find(['span', 'div', 'p'], class_=price_filter)
        rating = container.find(['span', 'div'], class_=rating_filter)
        link = container.find('a', href=True)
        img = container.find('img')

        product = {
            'Product Name': title.get_text(strip=True) if title is not None else '',
            'Price': price.get_text(strip=True) if price is not None else '',
            'Rating': rating.get_text(strip=True) if rating is not None else '',
            'Link': link['href'] if link is not None else '',
            # Lazy-loaded images often carry an empty ``src`` next to the real
            # URL in ``data-src``, so fall through on empty values as well.
            'Image': (img.get('src') or img.get('data-src') or '') if img is not None else '',
        }

        # Only add if we have at least a name or price.
        if not (product['Product Name'] or product['Price']):
            continue

        # A wrapper element that matches the selector yields the same row as
        # its first child product; keep only the first occurrence of a row.
        fingerprint = tuple(product.values())
        if fingerprint in seen:
            continue
        seen.add(fingerprint)
        products.append(product)

    if not products:
        raise HTTPException(status_code=400, detail="Could not extract product data")

    return pd.DataFrame(products)
132
+
133
+
134
  @app.post("/scrape")
135
  def scrape_to_excel(req: ScrapeRequest):
136
  try:
 
148
  df = scrape_table(soup)
149
  elif req.mode == "links":
150
  df = scrape_links(soup)
151
+ elif req.mode == "products":
152
+ df = scrape_products(soup)
153
  else:
154
  raise HTTPException(status_code=400, detail="Unsupported mode")
155