recipe-rover-api / app /utils /scrapers /wikimedia_scraper.py
garvitcpp's picture
Upload 48 files
c30b4ba verified
from urllib.parse import quote
from typing import List
from .base_scraper import BaseScraper
import logging
logger = logging.getLogger(__name__)
class WikimediaScraper(BaseScraper):
    """Image scraper backed by the Wikimedia Commons search API.

    Uses ``self.session``, ``self.get_headers()`` and
    ``self.verify_image_url()``, none of which are defined in this class
    (presumably inherited from ``BaseScraper`` -- confirm there).
    """

    async def search_images(self, recipe_name: str, num_images: int) -> List[str]:
        """Search Wikimedia Commons for food images matching a recipe name.

        Args:
            recipe_name: Free-text recipe name; the word ``food`` is appended
                to form the search query.
            num_images: Maximum number of image URLs to return. Also sent to
                the API as ``srlimit``.

        Returns:
            Up to ``num_images`` ``Special:FilePath`` URLs that passed
            ``self.verify_image_url``, in the order the API returned them.
            An empty list is returned for a non-positive ``num_images``, a
            non-200 response, or any error raised while scraping (the error
            is logged, never propagated).
        """
        if num_images <= 0:
            # Nothing to fetch; also avoids sending an invalid srlimit.
            return []

        url = "https://commons.wikimedia.org/w/api.php"
        params = {
            "action": "query",
            "format": "json",
            "list": "search",
            # Pass the raw text: the HTTP client percent-encodes query
            # parameters itself, so pre-quoting here would double-encode
            # (e.g. a space would be searched as the literal "%20").
            "srsearch": f"{recipe_name} food",
            "srnamespace": "6",  # File namespace
            "srlimit": num_images,
        }

        try:
            async with self.session.get(
                url, params=params, headers=await self.get_headers()
            ) as response:
                if response.status != 200:
                    return []
                data = await response.json()

            # The search response is released above so its connection is not
            # held open while each candidate URL is verified.
            titles = (
                item.get("title", "")
                for item in data.get("query", {}).get("search", [])
            )
            # dict.fromkeys de-duplicates while keeping the API's result
            # order, so the output is deterministic.
            candidates = dict.fromkeys(
                f"https://commons.wikimedia.org/wiki/Special:FilePath/{quote(title[5:])}"
                for title in titles
                if title.startswith("File:")
            )

            valid_images: List[str] = []
            for img_url in candidates:
                if len(valid_images) >= num_images:
                    break
                if await self.verify_image_url(img_url):
                    valid_images.append(img_url)
            return valid_images
        except Exception as e:
            # Best-effort scraper: a failure here must not break the caller.
            logger.error("Wikimedia scraping error: %s", e)
            return []