# modules/api_utils.py
"""API utility functions for Wikimedia services"""

import random
import time
from typing import Dict, List, Optional

import requests

from config.settings import (
    WIKIPEDIA_API,
    WIKIDATA_API,
    WIKIBOOKS_API,
    WIKI_REST_API,
    CACHE_TIMEOUT,
)

# Simple in-memory cache for API responses: maps cache key -> (data, fetch time)
_cache = {}


def _get_cached_or_fetch(url: str, params: Optional[Dict] = None) -> Optional[Dict]:
    """Return a cached response if it is still fresh, otherwise fetch from the API.

    Responses are JSON-decoded (a dict for most endpoints, a list for opensearch).
    """
    cache_key = f"{url}_{str(params)}"
    if cache_key in _cache:
        cached_data, timestamp = _cache[cache_key]
        if time.time() - timestamp < CACHE_TIMEOUT:
            return cached_data
    try:
        response = requests.get(url, params=params, timeout=10)
        if response.status_code == 200:
            data = response.json()
            # Store the fetch time itself; the freshness check above adds
            # CACHE_TIMEOUT, so adding it here as well would double the lifetime.
            _cache[cache_key] = (data, time.time())
            return data
    except requests.exceptions.RequestException as e:
        print(f"API request error: {e}")
    return None


def fetch_wikipedia_summary(topic: str) -> Optional[Dict]:
    """Fetch a Wikipedia page summary (REST API) with caching."""
    return _get_cached_or_fetch(f"{WIKI_REST_API}{topic}")


def search_wikipedia(query: str, limit: int = 5) -> List[str]:
    """Search Wikipedia for topics matching a query."""
    params = {"action": "opensearch", "search": query, "limit": limit, "format": "json"}
    data = _get_cached_or_fetch(WIKIPEDIA_API, params)
    # The opensearch response is a list: [query, titles, descriptions, urls].
    if data and len(data) > 1:
        return data[1]
    return []


def fetch_wikidata_entity(entity_id: str) -> Optional[Dict]:
    """Fetch Wikidata entity information."""
    params = {
        "action": "wbgetentities",
        "ids": entity_id,
        "format": "json",
        "languages": "en",
    }
    return _get_cached_or_fetch(WIKIDATA_API, params)


def fetch_wikipedia_categories(page_title: str) -> List[str]:
    """Fetch categories for a Wikipedia page."""
    params = {
        "action": "query",
        "prop": "categories",
        "titles": page_title,
        "format": "json",
        "cllimit": 10,
    }
    data = _get_cached_or_fetch(WIKIPEDIA_API, params)
    if data:
        pages = data.get("query", {}).get("pages", {})
        # Only one title is requested, so return categories from the first page.
        for page_data in pages.values():
            categories = page_data.get("categories", [])
            return [cat["title"].replace("Category:", "") for cat in categories]
    return []


def fetch_related_topics(topic: str, limit: int = 5) -> List[str]:
    """Fetch related topics from Wikipedia search results."""
    params = {
        "action": "query",
        "list": "search",
        "srsearch": topic,
        "srlimit": limit,
        "format": "json",
    }
    data = _get_cached_or_fetch(WIKIPEDIA_API, params)
    if data:
        search_results = data.get("query", {}).get("search", [])
        return [
            result["title"] for result in search_results if result["title"] != topic
        ]
    return []


def fetch_wikibooks_content(topic: str) -> Optional[str]:
    """Fetch a content snippet from Wikibooks."""
    params = {"action": "query", "list": "search", "srsearch": topic, "format": "json"}
    data = _get_cached_or_fetch(WIKIBOOKS_API, params)
    if data:
        search_results = data.get("query", {}).get("search", [])
        if search_results:
            return search_results[0].get("snippet", "")
    return None
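
# Illustrative sketch, not part of the original module: it shows how the
# search and summary helpers above compose. The function name and the
# underscore substitution for the REST path are assumptions, not established API.
def example_search_and_summarize(query: str) -> Optional[Dict]:
    """Resolve a free-text query to a page summary (sketch only).

    Assumes WIKI_REST_API ends at a summary endpoint so that appending a page
    title (with spaces replaced by underscores) forms a valid URL.
    """
    titles = search_wikipedia(query, limit=1)
    if not titles:
        return None
    return fetch_wikipedia_summary(titles[0].replace(" ", "_"))
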
data["query"]["pages"][page_id] if "missing" in page_data: return None # Page not found extract = page_data.get("extract", "") images = [] if "pageimages" in page_data and "thumbnail" in page_data["pageimages"]: images.append(page_data["pageimages"]["thumbnail"]["source"]) return { "title": page_data.get("title"), "extract": extract, "images": images, "full_url": f"https://en.wikipedia.org/wiki/{page_data.get('title').replace(' ', '_')}" } return None def fetch_wikipedia_images(topic: str, limit: int = 1) -> List[str]: """Fetch image URLs for a given topic from Wikipedia, prioritizing relevant images.""" image_urls = [] # First, try to get images directly from pageimages (main image/thumbnail) page_details_params = { "action": "query", "format": "json", "titles": topic, "prop": "pageimages", "pithumbsize": 400, # Larger thumbnail "redirects": 1 } page_details_data = _get_cached_or_fetch(WIKIPEDIA_API, page_details_params) if page_details_data and "query" in page_details_data and "pages" in page_details_data["query"]: page_id = next(iter(page_details_data["query"]["pages"])) page_data = page_details_data["query"]["pages"][page_id] if "pageimages" in page_data and "thumbnail" in page_data["pageimages"]: image_urls.append(page_data["pageimages"]["thumbnail"]["source"]) if len(image_urls) >= limit: return image_urls # If not enough images, try fetching all images from the page and filter params = { "action": "query", "format": "json", "titles": topic, "prop": "images", "imlimit": "max", # Fetch all images used on the page "redirects": 1 } data = _get_cached_or_fetch(WIKIPEDIA_API, params) if data and "query" in data and "pages" in data["query"]: page_id = next(iter(data["query"]["pages"])) page_data = data["query"]["pages"][page_id] if "images" in page_data: all_page_images = [] for img_info in page_data["images"]: # Basic relevance heuristic: check if topic keywords are in image title if topic.lower() in img_info["title"].lower() or any(keyword.lower() in img_info["title"].lower() for keyword in topic.split()): all_page_images.append(img_info["title"]) random.shuffle(all_page_images) # Shuffle to get different relevant images each time for image_title in all_page_images: if len(image_urls) >= limit: break # Fetch image info to get direct URL image_params = { "action": "query", "format": "json", "titles": image_title, "prop": "imageinfo", "iiprop": "url", } image_data = _get_cached_or_fetch(WIKIPEDIA_API, image_params) if image_data and "query" in image_data and "pages" in image_data["query"]: img_page_id = next(iter(image_data["query"]["pages"])) img_page_data = image_data["query"]["pages"][img_page_id] if "imageinfo" in img_page_data and img_page_data["imageinfo"]: image_urls.append(img_page_data["imageinfo"][0]["url"]) return image_urls[:limit] # Ensure limit is respected