Spaces:
Running
Running
import asyncio
import logging
import random
import re
from typing import List, Union

import aiohttp

from app.utils.scrapers.allrecipes_scraper import AllRecipesScraper
from app.utils.scrapers.food_network_scraper import FoodNetworkScraper
from app.utils.scrapers.fooddotcom_scraper import FoodDotComScraper
from app.utils.scrapers.google_scraper import GoogleScraper
from app.utils.scrapers.wikimedia_scraper import WikimediaScraper

logger = logging.getLogger(__name__)
class ImageSearchService:
    """Finds image URLs for recipes.

    Resolution order:
      1. URLs already present in the dataset's image column,
      2. a concurrent fan-out search across the configured scrapers,
      3. random placeholder images as a last resort.

    Use as an async context manager so a single aiohttp session is
    shared by all scrapers.
    """

    def __init__(self):
        # Scrapers race each other; their results are merged and
        # de-duplicated in search_recipe_images().
        self.scrapers = [
            GoogleScraper(),
            FoodNetworkScraper(),
            AllRecipesScraper(),
            WikimediaScraper(),
            FoodDotComScraper(),
        ]
        # Shared aiohttp.ClientSession, created lazily in __aenter__.
        self.session = None
        # NOTE(review): these are Google Drive "view" pages, not direct
        # image files -- confirm downstream consumers can render them.
        self.placeholder_images = [
            "https://drive.google.com/file/d/1gYOjs06yiq7EUXaO19BE-L7MkrTR6wlc/view?usp=sharing",
            "https://drive.google.com/file/d/1ob4KbzVLtwsE_ckYKBu_70FLEXNCJRSr/view?usp=sharing",
            "https://drive.google.com/file/d/1UUv3zF1ouXteZVt8Oc_UXORcJrlWfRXR/view?usp=sharing",
        ]

    async def __aenter__(self):
        """Open one shared HTTP session and hand it to every scraper."""
        if self.session is None:
            self.session = aiohttp.ClientSession()
            for scraper in self.scrapers:
                scraper.session = self.session
            logger.info("ImageSearchService session initialized")
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Close the shared session on context exit."""
        if self.session:
            await self.session.close()
            self.session = None
            logger.info("ImageSearchService session closed")

    async def search_recipe_images(self, recipe_name: str, image_data: Union[str, float, int], num_images: int = 3) -> List[str]:
        """Return up to ``num_images`` image URLs for ``recipe_name``.

        Args:
            recipe_name: Search query forwarded to each scraper.
            image_data: Raw value of the dataset's image column; any URLs
                found in it are preferred over scraping.
            num_images: Maximum number of URLs to return.

        Returns:
            A list of URLs. Placeholder images are returned when nothing
            is found or the scraper fan-out raises -- this method never
            propagates exceptions.
        """
        logger.info("Searching images for recipe: %s", recipe_name)

        # 1) Prefer URLs that already exist in the database column.
        existing_urls = self.extract_urls_from_image_column(image_data)
        if existing_urls:
            logger.info("Found %d existing URLs", len(existing_urls))
            return existing_urls[:num_images]

        try:
            # 2) Fan out to all scrapers concurrently. Tasks are named
            # after the scraper class so logs identify which one failed
            # (a shared coroutine __name__ would always say the same thing).
            tasks = [
                asyncio.create_task(
                    scraper.search_images(recipe_name, num_images),
                    name=type(scraper).__name__,
                )
                for scraper in self.scrapers
            ]
            logger.info("Created %d scraper tasks", len(tasks))

            done, pending = await asyncio.wait(tasks, timeout=60)

            # Cancel stragglers AND await them so cancellation completes
            # cleanly (otherwise asyncio warns "Task was destroyed but it
            # is pending" at interpreter shutdown).
            for task in pending:
                logger.warning("Cancelling pending task for %s", task.get_name())
                task.cancel()
            if pending:
                await asyncio.gather(*pending, return_exceptions=True)

            all_results = []
            for task in done:
                try:
                    results = await task
                    logger.info("Scraper %s found %d images", task.get_name(), len(results))
                    all_results.extend(results)
                except Exception as e:
                    logger.error("Error in scraper task %s: %s", task.get_name(), e)

            # De-duplicate while preserving first-seen order.
            unique_results = list(dict.fromkeys(all_results))
            if unique_results:
                logger.info("Found %d unique image URLs", len(unique_results))
                return unique_results[:num_images]

            # 3) Nothing found: fall back to placeholders.
            logger.info("No images found, using placeholder images")
            return self._random_placeholders(num_images)
        except Exception as e:
            logger.error("Error in image search: %s", e)
            # Callers always get *some* URLs back, even on total failure.
            return self._random_placeholders(num_images)

    def _random_placeholders(self, num_images: int) -> List[str]:
        """Pick ``num_images`` placeholder URLs, unique while possible.

        Duplicates are only introduced once the placeholder pool is
        exhausted (i.e. num_images > len(self.placeholder_images)).
        """
        if num_images <= 0:
            return []
        pool = self.placeholder_images
        picks = random.sample(pool, min(num_images, len(pool)))
        while len(picks) < num_images:
            picks.append(random.choice(pool))
        return picks

    def extract_urls_from_image_column(self, image_data: Union[str, float, int]) -> List[str]:
        """Parse image URLs out of the raw dataset image column.

        Handles two formats:
          * R-style character vectors: ``c("url1", "url2", ...)``
          * free text containing one or more http(s) URLs

        ``None``, the literal string ``'NA'`` and numeric values (e.g.
        pandas NaN, which arrives as a float) are treated as "no data".

        Returns:
            The list of extracted URLs; empty on missing data or error.
        """
        logger.debug("Extracting URLs from image data: %s", image_data)

        if image_data is None or image_data == 'NA' or isinstance(image_data, (float, int)):
            logger.debug("No valid image data found in database")
            return []

        try:
            image_data_str = str(image_data)
            if image_data_str.startswith('c(') and image_data_str.endswith(')'):
                # R character vector: pull every double-quoted element,
                # keeping only those that look like URLs.
                content = image_data_str[2:-1].strip()
                parts = re.findall(r'"([^"]*)"', content)
                urls = [url for url in parts if url.startswith('http')]
            else:
                # Fallback: scan the raw text for http(s) URLs.
                urls = re.findall(r'https?://[^\s,"\')]+', image_data_str)
            logger.info("Extracted %d URLs from image column", len(urls))
            return urls
        except Exception as e:
            logger.error("Error extracting URLs from image data: %s", e)
            return []