import logging
import asyncio
import aiohttp
import random
import re
from typing import List, Union
from app.utils.scrapers.google_scraper import GoogleScraper
from app.utils.scrapers.food_network_scraper import FoodNetworkScraper
from app.utils.scrapers.allrecipes_scraper import AllRecipesScraper
from app.utils.scrapers.wikimedia_scraper import WikimediaScraper
from app.utils.scrapers.fooddotcom_scraper import FoodDotComScraper
logger = logging.getLogger(__name__)
class ImageSearchService:
    """Find image URLs for a recipe.

    Resolution order:
      1. URLs already stored in the database's image column (``image_data``).
      2. Results from all registered scrapers, queried concurrently.
      3. Random placeholder images (also the fallback on unexpected errors).

    Use as an async context manager so every scraper shares one
    ``aiohttp.ClientSession``.
    """

    def __init__(self):
        # Each scraper must expose ``search_images(query, limit)`` and
        # accept a shared ``session`` attribute (set in __aenter__).
        self.scrapers = [
            GoogleScraper(),
            FoodNetworkScraper(),
            AllRecipesScraper(),
            WikimediaScraper(),
            FoodDotComScraper()
        ]
        self.session = None
        # Last-resort images returned when nothing else is found.
        self.placeholder_images = [
            "https://drive.google.com/file/d/1gYOjs06yiq7EUXaO19BE-L7MkrTR6wlc/view?usp=sharing",
            "https://drive.google.com/file/d/1ob4KbzVLtwsE_ckYKBu_70FLEXNCJRSr/view?usp=sharing",
            "https://drive.google.com/file/d/1UUv3zF1ouXteZVt8Oc_UXORcJrlWfRXR/view?usp=sharing"
        ]

    async def __aenter__(self):
        """Create the shared HTTP session and hand it to every scraper."""
        if self.session is None:
            self.session = aiohttp.ClientSession()
            for scraper in self.scrapers:
                scraper.session = self.session
            logger.info("ImageSearchService session initialized")
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Close the shared HTTP session, if one was opened."""
        if self.session:
            await self.session.close()
            self.session = None
            logger.info("ImageSearchService session closed")

    async def search_recipe_images(self, recipe_name: str, image_data: Union[str, float, int, None], num_images: int = 3) -> List[str]:
        """Return up to ``num_images`` image URLs for ``recipe_name``.

        Never raises: any unexpected failure is logged and placeholder
        images are returned instead.
        """
        logger.info("Searching images for recipe: %s", recipe_name)
        # First try to reuse URLs already stored in the database.
        existing_urls = self.extract_urls_from_image_column(image_data)
        if existing_urls:
            logger.info("Found %d existing URLs", len(existing_urls))
            return existing_urls[:num_images]
        try:
            scraped = await self._run_scrapers(recipe_name, num_images)
            # Deduplicate while preserving first-seen order.
            unique_results = list(dict.fromkeys(scraped))
            if unique_results:
                logger.info("Found %d unique image URLs", len(unique_results))
                return unique_results[:num_images]
            # No scraper produced anything usable.
            logger.info("No images found, using placeholder images")
            return self._pick_placeholders(num_images)
        except Exception:
            # Image search is best-effort: fall back to placeholders
            # rather than propagate the error to the caller.
            logger.exception("Error in image search")
            return random.sample(self.placeholder_images, min(num_images, len(self.placeholder_images)))

    async def _run_scrapers(self, recipe_name: str, num_images: int) -> List[str]:
        """Fan out to every scraper concurrently; collect results within 60 s."""
        tasks = [
            asyncio.create_task(scraper.search_images(recipe_name, num_images))
            for scraper in self.scrapers
        ]
        logger.info("Created %d scraper tasks", len(tasks))
        done, pending = await asyncio.wait(tasks, timeout=60)
        for task in pending:
            logger.warning("Cancelling pending task for %s", task.get_coro().__name__)
            task.cancel()
        if pending:
            # Await the cancellations so the event loop does not emit
            # "Task was destroyed but it is pending!" warnings.
            await asyncio.gather(*pending, return_exceptions=True)
        results: List[str] = []
        for task in done:
            try:
                found = await task
                logger.info("Scraper %s found %d images", task.get_coro().__name__, len(found))
                results.extend(found)
            except Exception as e:
                # One failing scraper must not sink the whole search.
                logger.error("Error in scraper task %s: %s", task.get_coro().__name__, str(e))
        return results

    def _pick_placeholders(self, num_images: int) -> List[str]:
        """Random placeholders: unique while the pool lasts, then repeats.

        Replaces an unbounded retry-until-unique loop with a single
        ``random.sample`` plus deterministic padding.
        """
        pool = self.placeholder_images
        unique_count = max(0, min(num_images, len(pool)))
        selected = random.sample(pool, unique_count)
        # Pad with (possibly duplicate) random picks when more images are
        # requested than distinct placeholders exist.
        selected.extend(random.choice(pool) for _ in range(num_images - unique_count))
        return selected

    def extract_urls_from_image_column(self, image_data: Union[str, float, int, None]) -> List[str]:
        """Parse image URLs out of the database image column.

        The column may hold an R-style vector literal ``c("url1", "url2")``,
        a plain string containing URLs, or a NaN/'NA' "no data" sentinel.
        Returns an empty list when nothing usable is present.
        """
        logger.debug("Extracting URLs from image data: %s", image_data)
        # Numeric values (including a pandas NaN float) and the 'NA'
        # sentinel mean "no data recorded".
        if image_data is None or image_data == 'NA' or isinstance(image_data, (float, int)):
            logger.debug("No valid image data found in database")
            return []
        try:
            image_data_str = str(image_data)
            if image_data_str.startswith('c(') and image_data_str.endswith(')'):
                # R vector literal: pull out each double-quoted element.
                content = image_data_str[2:-1].strip()
                parts = re.findall(r'"([^"]*)"', content)
                urls = [url for url in parts if url.startswith('http')]
            else:
                urls = re.findall(r'https?://[^\s,"\')]+', image_data_str)
            logger.info("Extracted %d URLs from image column", len(urls))
            return urls
        except Exception:
            logger.exception("Error extracting URLs from image data")
            return []