import asyncio
import json
import logging
import re
from typing import Dict, List, Optional
from urllib.parse import quote_plus

import aiohttp
from bs4 import BeautifulSoup


class Scraper:
    """Best-effort async price scraper for several shopping platforms.

    The network methods need an open ``aiohttp`` session, which is created
    and closed by using the instance as an async context manager::

        async with Scraper() as scraper:
            prices = await scraper.get_all_prices("milk")

    Every product is returned as a dict with the keys ``platform``,
    ``title``, ``price`` (float) and ``url``.
    """

    _logger = logging.getLogger(__name__)

    # Maximum number of products returned per platform.
    _MAX_RESULTS = 5
    _AMAZON_BASE_URL = 'https://www.amazon.in'

    # A marker word, a separator run, then lazily captured text up to a
    # sentence-ending period or the end of the line.  ``\.(?!\d)`` keeps
    # decimal points such as "63.5%" from terminating the list, and
    # MULTILINE lets ``$`` end the list at a line break.  The ``|`` inside
    # the character class is a literal pipe, not alternation.
    _INGREDIENT_PATTERNS = tuple(
        re.compile(
            rf"\b{label}[:|\s]+(.*?)(?=\.(?!\d)|$)",
            re.IGNORECASE | re.MULTILINE,
        )
        for label in ("ingredients?", "contains?", "compositions?")
    )
    _INGREDIENT_SPLIT = re.compile(r'[,;]|\sand\s', re.IGNORECASE)

    # Tail shared by the gram-based nutrients: optional whitespace/colon
    # separators, at most one extra word, then the number and a "g" unit.
    _GRAM_VALUE = r"[\s:]*(?:\w+[\s:]+)?(\d+(?:\.\d+)?)\s*g"

    # Patterns per nutrient are tried in order; the first hit wins.  The
    # leading ``\b`` stops labels matching inside other words ("sulfate").
    _NUTRIENT_PATTERNS = {
        'calories': (
            re.compile(r'(\d+(?:\.\d+)?)\s*(?:kcal|calories)', re.IGNORECASE),
            # Fallback for the label-first form, e.g. "Calories: 150".
            re.compile(r'\bcalories[\s:]*(\d+(?:\.\d+)?)', re.IGNORECASE),
        ),
        'protein': (
            re.compile(r'\bprotein' + _GRAM_VALUE, re.IGNORECASE),
        ),
        'carbohydrates': (
            re.compile(r'\bcarbohydrates?' + _GRAM_VALUE, re.IGNORECASE),
        ),
        'fat': (
            re.compile(r'\bfat' + _GRAM_VALUE, re.IGNORECASE),
        ),
        'sugar': (
            re.compile(r'\bsugar' + _GRAM_VALUE, re.IGNORECASE),
        ),
        'fiber': (
            # Accept both the American and the British spelling.
            re.compile(r'\bfib(?:er|re)' + _GRAM_VALUE, re.IGNORECASE),
        ),
    }

    def __init__(self):
        """Prepare request headers; the HTTP session is opened lazily."""
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        self.session = None

    async def __aenter__(self):
        """Open the shared ``aiohttp`` session and return this scraper."""
        self.session = aiohttp.ClientSession(headers=self.headers)
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Close the session, if one is open, and forget it."""
        if self.session is not None:
            await self.session.close()
            # Drop the reference so later calls fail loudly instead of
            # trying to use a closed session.
            self.session = None

    def _require_session(self):
        """Return the open session or raise ``RuntimeError`` if there is none."""
        if self.session is None:
            raise RuntimeError(
                "Scraper session is not open; use 'async with Scraper() as scraper:'"
            )
        return self.session

    async def _fetch_json(self, url: str):
        """GET ``url`` and return the decoded JSON body, or ``None``.

        ``None`` is returned for a non-200 status or when the body cannot be
        decoded as JSON (the failure is logged, not raised).
        """
        async with self._require_session().get(url) as response:
            if response.status != 200:
                return None
            try:
                return await response.json()
            except Exception:
                # Deliberately broad: scraping is best-effort and any decode
                # failure simply means "no results" for this platform.
                self._logger.warning(
                    "Could not decode JSON response from %s", url, exc_info=True
                )
                return None

    def _collect_products(self, platform: str, items, price_key: str,
                          url_prefix: str, id_key: str) -> List[Dict]:
        """Turn raw API items into product dicts, skipping malformed ones."""
        products = []
        if not isinstance(items, list):
            return products
        for item in items[:self._MAX_RESULTS]:
            if not isinstance(item, dict):
                continue
            try:
                price = float(item.get(price_key, 0))
            except (TypeError, ValueError):
                # One unparseable price must not discard the other items.
                continue
            products.append({
                'platform': platform,
                'title': item.get('name', ''),
                'price': price,
                'url': f"{url_prefix}{item.get(id_key, '')}",
            })
        return products

    def _parse_amazon_item(self, item) -> Optional[Dict]:
        """Build a product dict from one Amazon result element, or ``None``."""
        title = item.select_one('.a-text-normal')
        price = item.select_one('.a-price-whole')
        link = item.select_one('a.a-link-normal')
        if title is None or price is None or link is None:
            return None

        href = link.get('href')
        if not href:
            return None
        try:
            amount = float(price.text.replace(',', ''))
        except ValueError:
            return None

        # Only relative links need the site prefix.
        if not href.startswith(('http://', 'https://')):
            href = self._AMAZON_BASE_URL + href
        return {
            'platform': 'amazon',
            'title': title.text.strip(),
            'price': amount,
            'url': href,
        }

    async def search_amazon(self, query: str) -> List[Dict]:
        """Search Amazon India for products.

        Returns up to five products parsed from the HTML result page, or an
        empty list when the response status is not 200.
        """
        search_url = f"https://www.amazon.in/s?k={quote_plus(query)}"
        async with self._require_session().get(search_url) as response:
            if response.status != 200:
                return []
            html = await response.text()

        soup = BeautifulSoup(html, 'html.parser')
        products = []
        for item in soup.select('.s-result-item[data-asin]'):
            product = self._parse_amazon_item(item)
            if product is None:
                continue
            products.append(product)
            if len(products) == self._MAX_RESULTS:
                break
        return products

    async def search_blinkit(self, query: str) -> List[Dict]:
        """Search Blinkit for products.

        Returns up to five products, or an empty list when the request does
        not yield a usable JSON object.
        """
        data = await self._fetch_json(
            f"https://blinkit.com/v2/search?q={quote_plus(query)}"
        )
        if not isinstance(data, dict):
            return []
        return self._collect_products(
            'blinkit', data.get('products'), 'price',
            'https://blinkit.com/products/', 'slug',
        )

    async def search_zepto(self, query: str) -> List[Dict]:
        """Search Zepto for products.

        Returns up to five products (priced by the ``mrp`` field), or an
        empty list when the request does not yield a usable JSON object.
        """
        data = await self._fetch_json(
            f"https://www.zeptonow.com/api/search?q={quote_plus(query)}"
        )
        if not isinstance(data, dict):
            return []
        return self._collect_products(
            'zepto', data.get('products'), 'mrp',
            'https://www.zeptonow.com/product/', 'slug',
        )

    async def search_swiggy_instamart(self, query: str) -> List[Dict]:
        """Search Swiggy Instamart for products.

        Products are read from ``data.products`` in the JSON response.
        Returns up to five products, or an empty list when the request does
        not yield a usable JSON object.
        """
        data = await self._fetch_json(
            f"https://www.swiggy.com/api/instamart/search?q={quote_plus(query)}"
        )
        if not isinstance(data, dict):
            return []
        payload = data.get('data', {})
        items = payload.get('products') if isinstance(payload, dict) else None
        return self._collect_products(
            'swiggy_instamart', items, 'price',
            'https://www.swiggy.com/instamart/product/', 'id',
        )

    def extract_ingredients(self, text: str) -> List[str]:
        """Extract ingredients from product description text.

        Looks for an "ingredients", "contains" or "composition" marker (in
        that order of preference) and splits the text that follows it on
        commas, semicolons and the word "and".  The list ends at the first
        sentence-ending period or line break; decimal points inside the list
        (e.g. "63.5%") do not end it.

        Returns an empty list when ``text`` is empty/``None`` or no marker
        yields any ingredient.
        """
        if not text:
            return []

        for pattern in self._INGREDIENT_PATTERNS:
            match = pattern.search(text)
            if match is None:
                continue
            pieces = self._INGREDIENT_SPLIT.split(match.group(1))
            ingredients = [piece.strip() for piece in pieces if piece.strip()]
            # An empty hit (e.g. "Ingredients: .") should not hide a later,
            # more useful marker.
            if ingredients:
                return ingredients

        return []

    def extract_nutrition_info(self, text: str) -> Dict:
        """Extract nutrition information from product description text.

        Returns a dict mapping any of ``calories``, ``protein``,
        ``carbohydrates``, ``fat``, ``sugar`` and ``fiber`` to a float.
        Nutrients that are not found are omitted; empty or ``None`` text
        gives an empty dict.
        """
        nutrition_info = {}
        if not text:
            return nutrition_info

        for nutrient, patterns in self._NUTRIENT_PATTERNS.items():
            for pattern in patterns:
                match = pattern.search(text)
                if match is not None:
                    # The capture group only admits digits with an optional
                    # fraction, so float() cannot fail here.
                    nutrition_info[nutrient] = float(match.group(1))
                    break

        return nutrition_info

    async def get_all_prices(self, query: str) -> List[Dict]:
        """Get prices from all supported platforms.

        The platform searches run concurrently.  A platform that raises is
        logged and skipped, so the result holds whatever the remaining
        platforms returned.
        """
        searches = (
            self.search_amazon,
            self.search_blinkit,
            self.search_zepto,
            self.search_swiggy_instamart,
        )
        results = await asyncio.gather(
            *(search(query) for search in searches),
            return_exceptions=True,
        )

        all_prices = []
        for search, result in zip(searches, results):
            if isinstance(result, list):
                all_prices.extend(result)
            elif isinstance(result, BaseException):
                self._logger.warning(
                    "%s failed for %r: %r", search.__name__, query, result
                )
        return all_prices