Spaces:

Wendgan
/

IndiScan

Sleeping

App Files Files Community

IndiScan / utils /scraper.py

Wendgan

Upload 9 files

2ae3f7c verified 9 months ago

raw

history blame contribute delete

7.29 kB

	import aiohttp
	import asyncio
	from bs4 import BeautifulSoup
	import json
	from typing import Dict, List, Optional
	import re
	from urllib.parse import quote_plus

	class Scraper:
	def __init__(self):
	self.headers = {
	'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
	}
	self.session = None

	async def __aenter__(self):
	self.session = aiohttp.ClientSession(headers=self.headers)
	return self

	async def __aexit__(self, exc_type, exc_val, exc_tb):
	if self.session:
	await self.session.close()

	async def search_amazon(self, query: str) -> List[Dict]:
	"""Search Amazon India for products"""
	url = f"https://www.amazon.in/s?k={quote_plus(query)}"
	async with self.session.get(url) as response:
	if response.status == 200:
	html = await response.text()
	soup = BeautifulSoup(html, 'html.parser')
	products = []

	for item in soup.select('.s-result-item[data-asin]'):
	try:
	title = item.select_one('.a-text-normal')
	price = item.select_one('.a-price-whole')
	url = item.select_one('a.a-link-normal')

	if title and price and url:
	products.append({
	'platform': 'amazon',
	'title': title.text.strip(),
	'price': float(price.text.replace(',', '')),
	'url': 'https://www.amazon.in' + url['href']
	})
	except Exception:
	continue

	return products[:5] # Return top 5 results
	return []

	async def search_blinkit(self, query: str) -> List[Dict]:
	"""Search Blinkit for products"""
	url = f"https://blinkit.com/v2/search?q={quote_plus(query)}"
	async with self.session.get(url) as response:
	if response.status == 200:
	try:
	data = await response.json()
	products = []

	for item in data.get('products', [])[:5]:
	products.append({
	'platform': 'blinkit',
	'title': item.get('name', ''),
	'price': float(item.get('price', 0)),
	'url': f"https://blinkit.com/products/{item.get('slug', '')}"
	})

	return products
	except Exception:
	return []
	return []

	async def search_zepto(self, query: str) -> List[Dict]:
	"""Search Zepto for products"""
	url = f"https://www.zeptonow.com/api/search?q={quote_plus(query)}"
	async with self.session.get(url) as response:
	if response.status == 200:
	try:
	data = await response.json()
	products = []

	for item in data.get('products', [])[:5]:
	products.append({
	'platform': 'zepto',
	'title': item.get('name', ''),
	'price': float(item.get('mrp', 0)),
	'url': f"https://www.zeptonow.com/product/{item.get('slug', '')}"
	})

	return products
	except Exception:
	return []
	return []

	async def search_swiggy_instamart(self, query: str) -> List[Dict]:
	"""Search Swiggy Instamart for products"""
	url = f"https://www.swiggy.com/api/instamart/search?q={quote_plus(query)}"
	async with self.session.get(url) as response:
	if response.status == 200:
	try:
	data = await response.json()
	products = []

	for item in data.get('data', {}).get('products', [])[:5]:
	products.append({
	'platform': 'swiggy_instamart',
	'title': item.get('name', ''),
	'price': float(item.get('price', 0)),
	'url': f"https://www.swiggy.com/instamart/product/{item.get('id', '')}"
	})

	return products
	except Exception:
	return []
	return []

	def extract_ingredients(self, text: str) -> List[str]:
	"""Extract ingredients from product description text"""
	# Common ingredient list markers
	markers = [
	r"ingredients?[:\|\s]+(.*?)(?=\.\|$)",
	r"contains?[:\|\s]+(.*?)(?=\.\|$)",
	r"composition?[:\|\s]+(.*?)(?=\.\|$)"
	]

	for marker in markers:
	match = re.search(marker, text, re.IGNORECASE)
	if match:
	ingredients_text = match.group(1)
	# Split by common separators
	ingredients = re.split(r'[,;]\|\sand\s', ingredients_text)
	# Clean up each ingredient
	return [ing.strip() for ing in ingredients if ing.strip()]

	return []

	def extract_nutrition_info(self, text: str) -> Dict:
	"""Extract nutrition information from product description text"""
	nutrition_info = {}

	# Common nutrition patterns
	patterns = {
	'calories': r'(\d+)\s*(?:kcal\|calories)',
	'protein': r'protein\s(?:\w+\s+)?(\d+(?:\.\d+)?)\sg',
	'carbohydrates': r'carbohydrates?\s(?:\w+\s+)?(\d+(?:\.\d+)?)\sg',
	'fat': r'fat\s(?:\w+\s+)?(\d+(?:\.\d+)?)\sg',
	'sugar': r'sugar\s(?:\w+\s+)?(\d+(?:\.\d+)?)\sg',
	'fiber': r'fiber\s(?:\w+\s+)?(\d+(?:\.\d+)?)\sg'
	}

	for nutrient, pattern in patterns.items():
	match = re.search(pattern, text, re.IGNORECASE)
	if match:
	try:
	nutrition_info[nutrient] = float(match.group(1))
	except ValueError:
	continue

	return nutrition_info

	async def get_all_prices(self, query: str) -> List[Dict]:
	"""Get prices from all supported platforms"""
	tasks = [
	self.search_amazon(query),
	self.search_blinkit(query),
	self.search_zepto(query),
	self.search_swiggy_instamart(query)
	]

	results = await asyncio.gather(*tasks, return_exceptions=True)
	all_prices = []

	for result in results:
	if isinstance(result, list):
	all_prices.extend(result)

	return all_prices