Spaces:
Sleeping
Sleeping
| # modules/api_utils.py | |
| """API utility functions for Wikimedia services""" | |
| import requests | |
| from typing import Dict, List, Optional | |
| import time | |
| import random # Import the random module | |
| from config.settings import ( | |
| WIKIPEDIA_API, | |
| WIKIDATA_API, | |
| WIKIBOOKS_API, | |
| WIKI_REST_API, | |
| CACHE_TIMEOUT, | |
| ) | |
# Module-level in-memory cache for API responses.
# Maps "{url}_{params}" string keys to (data, timestamp) tuples; entries are
# considered fresh for CACHE_TIMEOUT seconds (see _get_cached_or_fetch).
_cache = {}
def _get_cached_or_fetch(url: str, params: Optional[Dict] = None) -> Optional[Dict]:
    """Get a cached API response or fetch it from the network.

    Uses the module-level ``_cache`` dict, keyed on the URL plus the stringified
    params, with a freshness window of ``CACHE_TIMEOUT`` seconds.

    Args:
        url: Endpoint URL to request.
        params: Optional query parameters forwarded to ``requests.get``.

    Returns:
        The decoded JSON payload, or ``None`` on a non-200 status code or
        request error (errors are printed, not raised).
    """
    cache_key = f"{url}_{str(params)}"
    if cache_key in _cache:
        cached_data, timestamp = _cache[cache_key]
        if time.time() - timestamp < CACHE_TIMEOUT:
            return cached_data
    try:
        response = requests.get(url, params=params, timeout=10)
        if response.status_code == 200:
            data = response.json()
            # Store the fetch time itself; the freshness check above adds
            # CACHE_TIMEOUT. (Previously ``time.time() + CACHE_TIMEOUT`` was
            # stored, which kept entries alive for 2x the intended TTL.)
            _cache[cache_key] = (data, time.time())
            return data
    except requests.exceptions.RequestException as e:
        print(f"API request error: {e}")
    return None
def fetch_wikipedia_summary(topic: str) -> Optional[Dict]:
    """Return the Wikipedia REST summary for *topic* (cached), or None on failure."""
    summary_url = f"{WIKI_REST_API}{topic}"
    return _get_cached_or_fetch(summary_url)
def search_wikipedia(query: str, limit: int = 5) -> List[str]:
    """Search Wikipedia with the opensearch API; return up to *limit* titles."""
    opensearch_params = {
        "action": "opensearch",
        "search": query,
        "limit": limit,
        "format": "json",
    }
    response = _get_cached_or_fetch(WIKIPEDIA_API, opensearch_params)
    # opensearch responses are positional: [query, titles, descriptions, urls];
    # index 1 holds the matched page titles.
    if not response or len(response) < 2:
        return []
    return response[1]
def fetch_wikidata_entity(entity_id: str) -> Optional[Dict]:
    """Fetch a Wikidata entity's information (English labels) by its ID."""
    query = dict(
        action="wbgetentities",
        ids=entity_id,
        format="json",
        languages="en",
    )
    return _get_cached_or_fetch(WIKIDATA_API, query)
def fetch_wikipedia_categories(page_title: str) -> List[str]:
    """Return up to 10 category names (without the 'Category:' prefix) for a page."""
    query = {
        "action": "query",
        "prop": "categories",
        "titles": page_title,
        "format": "json",
        "cllimit": 10,
    }
    response = _get_cached_or_fetch(WIKIPEDIA_API, query)
    if not response:
        return []
    pages = response.get("query", {}).get("pages", {})
    # Only one title was requested, so the first page entry is the answer.
    for page_info in pages.values():
        cats = page_info.get("categories", [])
        return [entry["title"].replace("Category:", "") for entry in cats]
    return []
def fetch_related_topics(topic: str, limit: int = 5) -> List[str]:
    """Search Wikipedia for *topic* and return titles of related articles."""
    query = {
        "action": "query",
        "list": "search",
        "srsearch": topic,
        "srlimit": limit,
        "format": "json",
    }
    response = _get_cached_or_fetch(WIKIPEDIA_API, query)
    if not response:
        return []
    hits = response.get("query", {}).get("search", [])
    # Exclude the exact topic itself so callers only see *other* articles.
    return [hit["title"] for hit in hits if hit["title"] != topic]
def fetch_wikibooks_content(topic: str) -> Optional[str]:
    """Return the snippet of the top Wikibooks search hit for *topic*, or None."""
    query = {"action": "query", "list": "search", "srsearch": topic, "format": "json"}
    response = _get_cached_or_fetch(WIKIBOOKS_API, query)
    if response:
        hits = response.get("query", {}).get("search", [])
        if hits:
            # Only the best match's snippet is returned.
            return hits[0].get("snippet", "")
    return None
def fetch_wikipedia_page_details(topic: str) -> Optional[Dict]:
    """Fetch the full plain-text extract and main thumbnail for a Wikipedia page.

    Args:
        topic: Page title; redirects are resolved by the API.

    Returns:
        Dict with keys ``title``, ``extract``, ``images`` (list of thumbnail
        URLs) and ``full_url``, or ``None`` if the page is missing or the
        request failed.
    """
    params = {
        "action": "query",
        "format": "json",
        "titles": topic,
        "prop": "extracts|pageimages",
        "exintro": False,     # full extract, not just the intro
        "explaintext": True,  # plain text instead of HTML
        "pithumbsize": 200,   # thumbnail size for images
        "redirects": 1,       # resolve redirects
    }
    data = _get_cached_or_fetch(WIKIPEDIA_API, params)
    if data and "query" in data and "pages" in data["query"]:
        page_id = next(iter(data["query"]["pages"]))  # single title requested
        page_data = data["query"]["pages"][page_id]
        if "missing" in page_data:
            return None  # page not found
        extract = page_data.get("extract", "")
        images = []
        # With prop=pageimages the API places 'thumbnail' directly on the page
        # object; the previous check for a nested 'pageimages' key never
        # matched, so no image was ever collected.
        thumbnail = page_data.get("thumbnail", {})
        if thumbnail.get("source"):
            images.append(thumbnail["source"])
        # Guard against a missing title so .replace() can't hit None.
        title = page_data.get("title") or ""
        return {
            "title": page_data.get("title"),
            "extract": extract,
            "images": images,
            "full_url": f"https://en.wikipedia.org/wiki/{title.replace(' ', '_')}",
        }
    return None
def fetch_wikipedia_images(topic: str, limit: int = 1) -> List[str]:
    """Fetch up to *limit* image URLs for a topic from Wikipedia.

    Prefers the page's main thumbnail, then falls back to other images used on
    the page whose titles look relevant to the topic.

    Args:
        topic: Wikipedia page title.
        limit: Maximum number of image URLs to return.

    Returns:
        A list of direct image URLs (possibly empty).
    """
    image_urls: List[str] = []

    # 1) Main page image (thumbnail) — usually the most relevant.
    page_details_params = {
        "action": "query",
        "format": "json",
        "titles": topic,
        "prop": "pageimages",
        "pithumbsize": 400,  # larger thumbnail
        "redirects": 1,
    }
    page_details_data = _get_cached_or_fetch(WIKIPEDIA_API, page_details_params)
    if page_details_data and "query" in page_details_data and "pages" in page_details_data["query"]:
        page_id = next(iter(page_details_data["query"]["pages"]))
        page_data = page_details_data["query"]["pages"][page_id]
        # With prop=pageimages, 'thumbnail' sits directly on the page object;
        # the previous check for a nested 'pageimages' key never matched, so
        # the main image was silently skipped.
        thumbnail = page_data.get("thumbnail", {})
        if thumbnail.get("source"):
            image_urls.append(thumbnail["source"])
            if len(image_urls) >= limit:
                return image_urls

    # 2) Fall back to all images used on the page, filtered for relevance.
    params = {
        "action": "query",
        "format": "json",
        "titles": topic,
        "prop": "images",
        "imlimit": "max",  # fetch all images used on the page
        "redirects": 1,
    }
    data = _get_cached_or_fetch(WIKIPEDIA_API, params)
    if data and "query" in data and "pages" in data["query"]:
        page_id = next(iter(data["query"]["pages"]))
        page_data = data["query"]["pages"][page_id]
        # Hoist loop-invariant lowercasing out of the filter loop.
        topic_lower = topic.lower()
        keywords = [word.lower() for word in topic.split()]
        candidates = []
        for img_info in page_data.get("images", []):
            title_lower = img_info["title"].lower()
            # Basic relevance heuristic: the topic (or any of its words)
            # appears in the image's title.
            if topic_lower in title_lower or any(kw in title_lower for kw in keywords):
                candidates.append(img_info["title"])
        random.shuffle(candidates)  # vary which relevant images are returned
        for image_title in candidates:
            if len(image_urls) >= limit:
                break
            # Resolve the 'File:' title to a direct URL via imageinfo.
            image_params = {
                "action": "query",
                "format": "json",
                "titles": image_title,
                "prop": "imageinfo",
                "iiprop": "url",
            }
            image_data = _get_cached_or_fetch(WIKIPEDIA_API, image_params)
            if image_data and "query" in image_data and "pages" in image_data["query"]:
                img_page_id = next(iter(image_data["query"]["pages"]))
                img_page_data = image_data["query"]["pages"][img_page_id]
                if img_page_data.get("imageinfo"):
                    image_urls.append(img_page_data["imageinfo"][0]["url"])
    return image_urls[:limit]  # ensure limit is respected