# IndiScan/utils/scraper.py — async price/product scraper for Indian e-commerce platforms
import aiohttp
import asyncio
from bs4 import BeautifulSoup
import json
from typing import Dict, List, Optional
import re
from urllib.parse import quote_plus
class Scraper:
    """Async scraper that aggregates product data from Indian e-commerce sites.

    Intended for use as an async context manager so the underlying HTTP
    session is opened and closed deterministically:

        async with Scraper() as scraper:
            prices = await scraper.get_all_prices("milk")
    """

    def __init__(self):
        # Present a desktop-browser UA so sites serve their regular HTML/JSON.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        # Created lazily in __aenter__; stays None until the context is entered.
        self.session = None

    async def __aenter__(self):
        self.session = aiohttp.ClientSession(headers=self.headers)
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        # Close the session only if __aenter__ actually created one.
        if self.session is not None:
            await self.session.close()
async def search_amazon(self, query: str) -> List[Dict]:
    """Search Amazon India and return up to 5 parsed product listings.

    Args:
        query: Free-text search term; URL-encoded before the request.

    Returns:
        List of dicts with keys 'platform', 'title', 'price' (float, rupees)
        and 'url'. Empty list on a non-200 response.
    """
    search_url = f"https://www.amazon.in/s?k={quote_plus(query)}"
    async with self.session.get(search_url) as response:
        if response.status != 200:
            return []
        html = await response.text()
        soup = BeautifulSoup(html, 'html.parser')
        products = []
        for item in soup.select('.s-result-item[data-asin]'):
            try:
                title = item.select_one('.a-text-normal')
                price = item.select_one('.a-price-whole')
                # FIX: renamed from `url`, which shadowed the request URL above.
                link = item.select_one('a.a-link-normal')
                if title and price and link:
                    products.append({
                        'platform': 'amazon',
                        'title': title.text.strip(),
                        'price': float(price.text.replace(',', '')),
                        'url': 'https://www.amazon.in' + link['href']
                    })
            except Exception:
                # Skip individual listings whose markup doesn't parse
                # (missing href, non-numeric price text, etc.).
                continue
        return products[:5]  # Return top 5 results
async def search_blinkit(self, query: str) -> List[Dict]:
    """Query Blinkit's search API and return up to 5 product entries.

    Returns an empty list on a non-200 response or if the JSON payload
    cannot be parsed/converted.
    """
    endpoint = f"https://blinkit.com/v2/search?q={quote_plus(query)}"
    async with self.session.get(endpoint) as response:
        if response.status != 200:
            return []
        try:
            payload = await response.json()
            return [
                {
                    'platform': 'blinkit',
                    'title': entry.get('name', ''),
                    'price': float(entry.get('price', 0)),
                    'url': f"https://blinkit.com/products/{entry.get('slug', '')}"
                }
                for entry in payload.get('products', [])[:5]
            ]
        except Exception:
            # Best-effort: any malformed payload yields no results.
            return []
async def search_zepto(self, query: str) -> List[Dict]:
    """Query Zepto's search API and return up to 5 product entries.

    Returns an empty list on a non-200 response or if the JSON payload
    cannot be parsed/converted.
    """
    endpoint = f"https://www.zeptonow.com/api/search?q={quote_plus(query)}"
    async with self.session.get(endpoint) as response:
        if response.status != 200:
            return []
        try:
            payload = await response.json()
            return [
                {
                    'platform': 'zepto',
                    'title': entry.get('name', ''),
                    # NOTE(review): reads 'mrp' here (vs 'price' on other
                    # platforms) — presumably Zepto's list price; confirm.
                    'price': float(entry.get('mrp', 0)),
                    'url': f"https://www.zeptonow.com/product/{entry.get('slug', '')}"
                }
                for entry in payload.get('products', [])[:5]
            ]
        except Exception:
            # Best-effort: any malformed payload yields no results.
            return []
async def search_swiggy_instamart(self, query: str) -> List[Dict]:
    """Query Swiggy Instamart's search API and return up to 5 product entries.

    Returns an empty list on a non-200 response or if the JSON payload
    cannot be parsed/converted.
    """
    endpoint = f"https://www.swiggy.com/api/instamart/search?q={quote_plus(query)}"
    async with self.session.get(endpoint) as response:
        if response.status != 200:
            return []
        try:
            payload = await response.json()
            # Products sit one level deeper here than on the other platforms.
            entries = payload.get('data', {}).get('products', [])[:5]
            return [
                {
                    'platform': 'swiggy_instamart',
                    'title': entry.get('name', ''),
                    'price': float(entry.get('price', 0)),
                    'url': f"https://www.swiggy.com/instamart/product/{entry.get('id', '')}"
                }
                for entry in entries
            ]
        except Exception:
            # Best-effort: any malformed payload yields no results.
            return []
def extract_ingredients(self, text: str) -> List[str]:
    """Extract an ingredient list from free-form product description text.

    Scans for an "Ingredients:"/"Contains:"/"Composition:" style marker,
    captures everything up to the next period (or end of line), and splits
    the captured span on commas, semicolons, or the word "and".

    Args:
        text: Product description text to scan.

    Returns:
        List of cleaned (stripped, non-empty) ingredient strings; empty
        list when no marker is found.
    """
    # FIX: the original character classes were written as [:|\s], which also
    # matched a literal '|' character, and "composition?" made only the final
    # 'n' optional instead of matching the whole word.
    markers = [
        r"ingredients?[:\s]+(.*?)(?=\.|$)",
        r"contains?[:\s]+(.*?)(?=\.|$)",
        r"composition[:\s]+(.*?)(?=\.|$)"
    ]
    for marker in markers:
        match = re.search(marker, text, re.IGNORECASE)
        if match:
            ingredients_text = match.group(1)
            # Split by common separators: "," ";" or a standalone "and".
            ingredients = re.split(r'[,;]|\sand\s', ingredients_text)
            # Clean up each ingredient, dropping empties.
            return [ing.strip() for ing in ingredients if ing.strip()]
    return []
def extract_nutrition_info(self, text: str) -> Dict:
    """Pull per-nutrient numeric values out of a product description.

    Returns:
        Dict mapping nutrient name ('calories', 'protein', 'carbohydrates',
        'fat', 'sugar', 'fiber') to a float amount; nutrients that do not
        appear in the text are simply absent from the result.
    """
    # Each regex captures the numeric amount; an optional intervening word
    # (e.g. "protein content 10 g") is tolerated before the number.
    patterns = {
        'calories': r'(\d+)\s*(?:kcal|calories)',
        'protein': r'protein\s*(?:\w+\s+)?(\d+(?:\.\d+)?)\s*g',
        'carbohydrates': r'carbohydrates?\s*(?:\w+\s+)?(\d+(?:\.\d+)?)\s*g',
        'fat': r'fat\s*(?:\w+\s+)?(\d+(?:\.\d+)?)\s*g',
        'sugar': r'sugar\s*(?:\w+\s+)?(\d+(?:\.\d+)?)\s*g',
        'fiber': r'fiber\s*(?:\w+\s+)?(\d+(?:\.\d+)?)\s*g'
    }
    found = {}
    for nutrient_name, pattern in patterns.items():
        hit = re.search(pattern, text, re.IGNORECASE)
        if hit is None:
            continue
        try:
            found[nutrient_name] = float(hit.group(1))
        except ValueError:
            # Defensive: skip captures that somehow fail numeric conversion.
            continue
    return found
async def get_all_prices(self, query: str) -> List[Dict]:
    """Fan the query out to every supported platform concurrently.

    Platform searches that raise are dropped rather than failing the whole
    aggregation (gather runs with return_exceptions=True).

    Returns:
        Flat list of product dicts from all platforms that responded.
    """
    searches = (
        self.search_amazon(query),
        self.search_blinkit(query),
        self.search_zepto(query),
        self.search_swiggy_instamart(query),
    )
    outcomes = await asyncio.gather(*searches, return_exceptions=True)
    combined = []
    for outcome in outcomes:
        # Failed searches come back as exception objects; keep only lists.
        if isinstance(outcome, list):
            combined.extend(outcome)
    return combined