File size: 1,619 Bytes
c30b4ba
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
from bs4 import BeautifulSoup
import re
from urllib.parse import quote, unquote
from .base_scraper import BaseScraper
import logging
from typing import List


logger = logging.getLogger(__name__)

class GoogleScraper(BaseScraper):
    """Scrape candidate food-image URLs from Google Images search results."""

    async def search_images(self, recipe_name: str, num_images: int) -> List[str]:
        """Return up to ``num_images`` verified image URLs for a recipe.

        Args:
            recipe_name: Recipe to search for; "recipe food" is appended to
                bias results toward food photography.
            num_images: Maximum number of URLs to return.

        Returns:
            A list of image URLs that passed ``verify_image_url``; empty on
            any HTTP failure, scrape error, or when ``num_images <= 0``.
        """
        # Nothing to fetch — skip the HTTP round-trip entirely.
        if num_images <= 0:
            return []

        search_query = f"{recipe_name} recipe food"
        url = f"https://www.google.com/search?q={quote(search_query)}&tbm=isch"

        try:
            async with self.session.get(url, headers=await self.get_headers()) as response:
                if response.status != 200:
                    return []

                html = await response.text()
                soup = BeautifulSoup(html, 'html.parser')
                # Set: the same URL often appears in several script blobs.
                images = set()

                # Image URLs live in JSON-ish blobs inside AF_initDataCallback
                # scripts, not in <img> tags (those are lazy-loaded).
                # [^\s"'\\]+ stops the match at whitespace, quotes, and JSON
                # backslash-escapes so we never capture across a string
                # boundary (the old \S+ could swallow a closing quote and the
                # start of the next JSON field).
                url_pattern = re.compile(r'(https?://[^\s"\'\\]+\.(?:jpg|jpeg|png))')
                for script in soup.find_all('script'):
                    if script.string and 'AF_initDataCallback' in script.string:
                        found = url_pattern.findall(script.string)
                        images.update(unquote(u) for u in found)

                # Verify candidates one at a time; stop as soon as we have
                # enough, so we don't hammer hosts we won't use.
                valid_images = []
                for img_url in images:
                    if len(valid_images) >= num_images:
                        break
                    if await self.verify_image_url(img_url):
                        valid_images.append(img_url)

                return valid_images
        except Exception:
            # Boundary handler: scraping is best-effort, so swallow and log
            # (with traceback) rather than propagate to the caller.
            logger.exception("Google scraping error")
            return []