import aiohttp
import asyncio
from bs4 import BeautifulSoup
import json
from typing import Dict, List, Optional
import re
from urllib.parse import quote_plus


class Scraper:
    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        self.session = None

    async def __aenter__(self):
        self.session = aiohttp.ClientSession(headers=self.headers)
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        if self.session:
            await self.session.close()

    async def search_amazon(self, query: str) -> List[Dict]:
        """Search Amazon India for products"""
        url = f"https://www.amazon.in/s?k={quote_plus(query)}"
        async with self.session.get(url) as response:
            if response.status == 200:
                html = await response.text()
                soup = BeautifulSoup(html, 'html.parser')
                products = []
                for item in soup.select('.s-result-item[data-asin]'):
                    try:
                        title = item.select_one('.a-text-normal')
                        price = item.select_one('.a-price-whole')
                        link = item.select_one('a.a-link-normal')
                        if title and price and link:
                            products.append({
                                'platform': 'amazon',
                                'title': title.text.strip(),
                                'price': float(price.text.replace(',', '')),
                                'url': 'https://www.amazon.in' + link['href']
                            })
                    except Exception:
                        continue
                return products[:5]  # Return top 5 results
            return []

    async def search_blinkit(self, query: str) -> List[Dict]:
        """Search Blinkit for products"""
        url = f"https://blinkit.com/v2/search?q={quote_plus(query)}"
        async with self.session.get(url) as response:
            if response.status == 200:
                try:
                    data = await response.json()
                    products = []
                    for item in data.get('products', [])[:5]:
                        products.append({
                            'platform': 'blinkit',
                            'title': item.get('name', ''),
                            'price': float(item.get('price', 0)),
                            'url': f"https://blinkit.com/products/{item.get('slug', '')}"
                        })
                    return products
                except Exception:
                    return []
            return []

    async def search_zepto(self, query: str) -> List[Dict]:
        """Search Zepto for products"""
        url = f"https://www.zeptonow.com/api/search?q={quote_plus(query)}"
        async with self.session.get(url) as response:
            if response.status == 200:
                try:
                    data = await response.json()
                    products = []
                    for item in data.get('products', [])[:5]:
                        products.append({
                            'platform': 'zepto',
                            'title': item.get('name', ''),
                            'price': float(item.get('mrp', 0)),
                            'url': f"https://www.zeptonow.com/product/{item.get('slug', '')}"
                        })
                    return products
                except Exception:
                    return []
            return []

    async def search_swiggy_instamart(self, query: str) -> List[Dict]:
        """Search Swiggy Instamart for products"""
        url = f"https://www.swiggy.com/api/instamart/search?q={quote_plus(query)}"
        async with self.session.get(url) as response:
            if response.status == 200:
                try:
                    data = await response.json()
                    products = []
                    for item in data.get('data', {}).get('products', [])[:5]:
                        products.append({
                            'platform': 'swiggy_instamart',
                            'title': item.get('name', ''),
                            'price': float(item.get('price', 0)),
                            'url': f"https://www.swiggy.com/instamart/product/{item.get('id', '')}"
                        })
                    return products
                except Exception:
                    return []
            return []

    def extract_ingredients(self, text: str) -> List[str]:
        """Extract ingredients from product description text"""
        # Common ingredient list markers
        markers = [
            r"ingredients?[:|\s]+(.*?)(?=\.|$)",
            r"contains?[:|\s]+(.*?)(?=\.|$)",
            r"composition?[:|\s]+(.*?)(?=\.|$)"
        ]
        for marker in markers:
            match = re.search(marker, text, re.IGNORECASE)
            if match:
                ingredients_text = match.group(1)
                # Split by common separators
                ingredients = re.split(r'[,;]|\sand\s', ingredients_text)
                # Clean up each ingredient
                return [ing.strip() for ing in ingredients if ing.strip()]
        return []

    def extract_nutrition_info(self, text: str) -> Dict:
        """Extract nutrition information from product description text"""
        nutrition_info = {}
        # Common nutrition patterns
        patterns = {
            'calories': r'(\d+)\s*(?:kcal|calories)',
            'protein': r'protein\s*(?:\w+\s+)?(\d+(?:\.\d+)?)\s*g',
            'carbohydrates': r'carbohydrates?\s*(?:\w+\s+)?(\d+(?:\.\d+)?)\s*g',
            'fat': r'fat\s*(?:\w+\s+)?(\d+(?:\.\d+)?)\s*g',
            'sugar': r'sugar\s*(?:\w+\s+)?(\d+(?:\.\d+)?)\s*g',
            'fiber': r'fiber\s*(?:\w+\s+)?(\d+(?:\.\d+)?)\s*g'
        }
        for nutrient, pattern in patterns.items():
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                try:
                    nutrition_info[nutrient] = float(match.group(1))
                except ValueError:
                    continue
        return nutrition_info

    async def get_all_prices(self, query: str) -> List[Dict]:
        """Get prices from all supported platforms"""
        tasks = [
            self.search_amazon(query),
            self.search_blinkit(query),
            self.search_zepto(query),
            self.search_swiggy_instamart(query)
        ]
        results = await asyncio.gather(*tasks, return_exceptions=True)
        all_prices = []
        for result in results:
            if isinstance(result, list):
                all_prices.extend(result)
        return all_prices
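

# Minimal usage sketch (not part of the original module): shows how the async
# context manager is intended to be driven. The query "oats" and the sample
# description string below are purely illustrative assumptions.
async def _demo() -> None:
    async with Scraper() as scraper:
        # Query every supported platform concurrently and print the hits.
        for product in await scraper.get_all_prices("oats"):
            print(product['platform'], product['price'], product['title'])

        # The text parsers work on any product description string.
        sample = "Ingredients: rolled oats, almonds and honey. Protein 11 g, Fiber 9 g."
        print(scraper.extract_ingredients(sample))    # ['rolled oats', 'almonds', 'honey']
        print(scraper.extract_nutrition_info(sample)) # {'protein': 11.0, 'fiber': 9.0}


if __name__ == "__main__":
    asyncio.run(_demo())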