Spaces:
Running
Running
| from bs4 import BeautifulSoup | |
| import re | |
| from urllib.parse import quote, unquote | |
| from .base_scraper import BaseScraper | |
| import logging | |
| from typing import List | |
| logger = logging.getLogger(__name__) | |
class GoogleScraper(BaseScraper):
    """Scrapes Google Images search results for recipe photographs."""

    async def search_images(self, recipe_name: str, num_images: int) -> List[str]:
        """Return up to ``num_images`` verified image URLs for a recipe.

        Builds a Google Images query from ``recipe_name``, extracts candidate
        image URLs from the inline ``AF_initDataCallback`` <script> payloads,
        and keeps only candidates that pass ``self.verify_image_url``.

        Args:
            recipe_name: Human-readable recipe name to search for.
            num_images: Maximum number of verified URLs to return.

        Returns:
            A list of at most ``num_images`` image URLs, in discovery order.
            Returns an empty list on a non-200 response or any scraping error.
        """
        search_query = f"{recipe_name} recipe food"
        url = f"https://www.google.com/search?q={quote(search_query)}&tbm=isch"
        try:
            async with self.session.get(url, headers=await self.get_headers()) as response:
                if response.status != 200:
                    return []
                html = await response.text()

            soup = BeautifulSoup(html, 'html.parser')

            # The URLs live inside JSON-like script payloads. Exclude quotes
            # and backslashes from the URL body so a match cannot swallow
            # JSON-escaped text (e.g. https:\/\/...) or the quoted string
            # that follows the real URL — the previous \S+ pattern did both.
            url_pattern = re.compile(
                r'(https?://[^"\'\\\s]+\.(?:jpg|jpeg|png))',
                re.IGNORECASE,
            )

            # dict.fromkeys-style dedupe preserves discovery order, so the
            # first num_images verified URLs are deterministic. A plain set
            # made the returned subset depend on hash ordering.
            candidates: dict = {}
            for script in soup.find_all('script'):
                if script.string and 'AF_initDataCallback' in script.string:
                    for match in url_pattern.findall(script.string):
                        candidates[unquote(match)] = None

            # Verify candidates until we have enough valid images.
            valid_images: List[str] = []
            for img_url in candidates:
                if len(valid_images) >= num_images:
                    break
                if await self.verify_image_url(img_url):
                    valid_images.append(img_url)
            return valid_images
        except Exception as e:
            # Boundary catch: scraping is best-effort; log and degrade to [].
            logger.error("Google scraping error: %s", e)
            return []