from urllib.parse import quote
from typing import List

from .base_scraper import BaseScraper

import logging

logger = logging.getLogger(__name__)


class WikimediaScraper(BaseScraper):
    async def search_images(self, recipe_name: str, num_images: int) -> List[str]:
        url = "https://commons.wikimedia.org/w/api.php"
        params = {
            "action": "query",
            "format": "json",
            "list": "search",
            # aiohttp URL-encodes params itself, so pass the raw recipe name;
            # pre-quoting it here would double-encode the query.
            "srsearch": f"{recipe_name} food",
            "srnamespace": "6",  # File: namespace (images and other media)
            "srlimit": str(num_images),
        }
        try:
            async with self.session.get(url, params=params, headers=await self.get_headers()) as response:
                if response.status != 200:
                    return []
                data = await response.json()

                # Deduplicate while preserving the API's result order
                # (iterating a plain set would make results nondeterministic).
                images: List[str] = []
                seen = set()
                for item in data.get("query", {}).get("search", []):
                    title = item.get("title", "")
                    if title.startswith("File:"):
                        # Special:FilePath redirects to the original file URL.
                        file_url = f"https://commons.wikimedia.org/wiki/Special:FilePath/{quote(title[5:])}"
                        if file_url not in seen:
                            seen.add(file_url)
                            images.append(file_url)

                # Keep only URLs that pass verification, up to num_images.
                valid_images: List[str] = []
                for img_url in images:
                    if len(valid_images) >= num_images:
                        break
                    if await self.verify_image_url(img_url):
                        valid_images.append(img_url)
                return valid_images
        except Exception as e:
            logger.error(f"Wikimedia scraping error: {e}")
            return []
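

# --- Usage sketch (commented out; not part of the module's public surface) ---
# A minimal, hypothetical driver showing how this scraper might be called.
# Assumptions not confirmed by this file: BaseScraper can be constructed with
# an aiohttp.ClientSession, and it provides self.session, get_headers(), and
# verify_image_url(). Run from package context due to the relative import.
#
# import asyncio
# import aiohttp
#
# async def main():
#     async with aiohttp.ClientSession() as session:
#         scraper = WikimediaScraper(session)  # constructor signature is an assumption
#         urls = await scraper.search_images("shakshuka", num_images=3)
#         for u in urls:
#             print(u)
#
# asyncio.run(main())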