Spaces:
Running
Running
File size: 1,619 Bytes
c30b4ba | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 | from bs4 import BeautifulSoup
import re
from urllib.parse import quote, unquote
from .base_scraper import BaseScraper
import logging
from typing import List
logger = logging.getLogger(__name__)
class GoogleScraper(BaseScraper):
    """Scraper that collects candidate image URLs from a Google Images results page.

    Relies on ``self.session``, ``self.get_headers()`` and
    ``self.verify_image_url()``, none of which are defined in this class
    (presumably provided by ``BaseScraper`` -- confirm against that module).
    """

    # The URL body deliberately excludes quotes, angle brackets and
    # backslashes: the script payload is dense JSON-like text with no
    # whitespace between entries, so a plain ``\S+`` would greedily run from
    # the first "http" to the last ".png" and glue several URLs together.
    _IMAGE_URL_RE = re.compile(
        r"""https?://[^\s"'<>\\]+\.(?:jpg|jpeg|png)""",
        re.IGNORECASE,
    )

    async def search_images(self, recipe_name: str, num_images: int) -> List[str]:
        """Return up to ``num_images`` verified image URLs for ``recipe_name``.

        Args:
            recipe_name: Free-text recipe name; " recipe food" is appended to
                build the search query.
            num_images: Maximum number of URLs to return. Values <= 0 yield
                an empty list without issuing a request.

        Returns:
            Image URLs that passed ``self.verify_image_url``, in the order
            they first appear in the page. Empty on a non-200 response or on
            any error (errors are logged, never raised).
        """
        if num_images <= 0:
            return []

        search_query = f"{recipe_name} recipe food"
        url = f"https://www.google.com/search?q={quote(search_query)}&tbm=isch"
        try:
            async with self.session.get(url, headers=await self.get_headers()) as response:
                if response.status != 200:
                    logger.warning(
                        "Google image search returned HTTP %s", response.status
                    )
                    return []
                html = await response.text()

            # Verify candidates one by one and stop as soon as enough are
            # found, so no more verification calls are made than necessary.
            valid_images: List[str] = []
            for img_url in self._extract_image_urls(html):
                if await self.verify_image_url(img_url):
                    valid_images.append(img_url)
                    if len(valid_images) >= num_images:
                        break
            return valid_images
        except Exception as e:
            # Best-effort scraper: any failure degrades to "no images".
            logger.error("Google scraping error: %s", e)
            return []

    @classmethod
    def _extract_image_urls(cls, html: str) -> List[str]:
        """Return de-duplicated image URLs found in the page's data scripts.

        Only ``<script>`` elements whose text contains ``AF_initDataCallback``
        are scanned. First-seen order is preserved so results are
        deterministic for a given page.
        """
        soup = BeautifulSoup(html, 'html.parser')
        found = []
        for script in soup.find_all('script'):
            text = script.string
            if text and 'AF_initDataCallback' in text:
                found.extend(unquote(match) for match in cls._IMAGE_URL_RE.findall(text))
        # dict.fromkeys removes duplicates while keeping insertion order.
        return list(dict.fromkeys(found))