# Trivia5 / src/modules/api_utils.py
# Uploaded by Bharath370 ("Upload 102 files", commit 582bf6b, verified)
# modules/api_utils.py
"""API utility functions for Wikimedia services"""
import requests
from typing import Dict, List, Optional
import time
import random # Import the random module
from config.settings import (
WIKIPEDIA_API,
WIKIDATA_API,
WIKIBOOKS_API,
WIKI_REST_API,
CACHE_TIMEOUT,
)
# Cache for API responses
_cache = {}
def _get_cached_or_fetch(url: str, params: Optional[Dict] = None) -> Optional[Dict]:
    """Return a cached JSON response for (url, params) or fetch it from the API.

    Uses a simple module-level in-memory cache keyed by the URL plus the
    stringified params. Entries expire CACHE_TIMEOUT seconds after they
    were stored.

    Args:
        url: Endpoint to GET.
        params: Optional query parameters forwarded to requests.get.

    Returns:
        The decoded JSON payload, or None on a non-200 status or request error.
    """
    cache_key = f"{url}_{str(params)}"
    if cache_key in _cache:
        cached_data, timestamp = _cache[cache_key]
        if time.time() - timestamp < CACHE_TIMEOUT:
            return cached_data
    try:
        response = requests.get(url, params=params, timeout=10)
        if response.status_code == 200:
            data = response.json()
            # Fixed: store the time of the fetch, not time.time() + CACHE_TIMEOUT.
            # The expiry check above computes time.time() - timestamp, so storing
            # a future timestamp made entries live for twice the intended timeout.
            _cache[cache_key] = (data, time.time())
            return data
    except requests.exceptions.RequestException as e:
        print(f"API request error: {e}")
    return None
def fetch_wikipedia_summary(topic: str) -> Optional[Dict]:
    """Fetch the Wikipedia REST summary for *topic*, using the shared cache."""
    summary_url = f"{WIKI_REST_API}{topic}"
    return _get_cached_or_fetch(summary_url)
def search_wikipedia(query: str, limit: int = 5) -> List[str]:
    """Search Wikipedia via opensearch and return up to *limit* matching titles."""
    response = _get_cached_or_fetch(
        WIKIPEDIA_API,
        {"action": "opensearch", "search": query, "limit": limit, "format": "json"},
    )
    # opensearch responds with [query, titles, descriptions, urls];
    # the title list is the second element.
    if not response or len(response) <= 1:
        return []
    return response[1]
def fetch_wikidata_entity(entity_id: str) -> Optional[Dict]:
    """Fetch an entity record from Wikidata (English labels only)."""
    query = {
        "action": "wbgetentities",
        "format": "json",
        "ids": entity_id,
        "languages": "en",
    }
    return _get_cached_or_fetch(WIKIDATA_API, query)
def fetch_wikipedia_categories(page_title: str) -> List[str]:
    """Return up to 10 category names (without the "Category:" prefix) for a page."""
    query = {
        "action": "query",
        "prop": "categories",
        "titles": page_title,
        "format": "json",
        "cllimit": 10,
    }
    response = _get_cached_or_fetch(WIKIPEDIA_API, query)
    if response:
        pages = response.get("query", {}).get("pages", {})
        for page in pages.values():
            # Only one title is queried, so the first page holds the answer.
            cats = page.get("categories", [])
            return [cat["title"].replace("Category:", "") for cat in cats]
    return []
def fetch_related_topics(topic: str, limit: int = 5) -> List[str]:
    """Search Wikipedia for *topic* and return related page titles."""
    response = _get_cached_or_fetch(
        WIKIPEDIA_API,
        {
            "action": "query",
            "list": "search",
            "srsearch": topic,
            "srlimit": limit,
            "format": "json",
        },
    )
    if not response:
        return []
    hits = response.get("query", {}).get("search", [])
    # Exclude the topic itself so the list only contains *related* pages.
    return [hit["title"] for hit in hits if hit["title"] != topic]
def fetch_wikibooks_content(topic: str) -> Optional[str]:
    """Search Wikibooks for *topic*; return the top hit's snippet, if any."""
    query = {"action": "query", "list": "search", "srsearch": topic, "format": "json"}
    response = _get_cached_or_fetch(WIKIBOOKS_API, query)
    if not response:
        return None
    hits = response.get("query", {}).get("search", [])
    if not hits:
        return None
    return hits[0].get("snippet", "")
def fetch_wikipedia_page_details(topic: str) -> Optional[Dict]:
    """Fetch detailed Wikipedia page content and images.

    Args:
        topic: Page title to look up (redirects are resolved).

    Returns:
        A dict with keys "title", "extract" (plain-text article body),
        "images" (list of thumbnail URLs, possibly empty) and "full_url",
        or None if the page is missing or the request failed.
    """
    params = {
        "action": "query",
        "format": "json",
        "titles": topic,
        "prop": "extracts|pageimages",
        "exintro": False,  # Get full extract
        "explaintext": True,  # Get plain text
        "pithumbsize": 200,  # Thumbnail size for images
        "redirects": 1,  # Resolve redirects
    }
    data = _get_cached_or_fetch(WIKIPEDIA_API, params)
    if data and "query" in data and "pages" in data["query"]:
        page_id = next(iter(data["query"]["pages"]))  # Get the first page ID
        page_data = data["query"]["pages"][page_id]
        if "missing" in page_data:
            return None  # Page not found
        extract = page_data.get("extract", "")
        images = []
        # Fixed: with prop=pageimages the API places "thumbnail" directly on
        # the page object, not nested under a "pageimages" key, so the old
        # check ("pageimages" in page_data) never matched and images was
        # always empty.
        thumbnail = page_data.get("thumbnail")
        if thumbnail and "source" in thumbnail:
            images.append(thumbnail["source"])
        # Fall back to the requested topic if the API omits the title, so the
        # URL construction below cannot raise on None.
        title = page_data.get("title") or topic
        return {
            "title": page_data.get("title"),
            "extract": extract,
            "images": images,
            "full_url": f"https://en.wikipedia.org/wiki/{title.replace(' ', '_')}",
        }
    return None
def fetch_wikipedia_images(topic: str, limit: int = 1) -> List[str]:
    """Fetch image URLs for *topic* from Wikipedia, prioritizing relevant images.

    Strategy: first take the page's lead image (pageimages thumbnail); if more
    are needed, list every image used on the page, keep those whose file title
    mentions the topic (or any word of it), shuffle for variety, and resolve
    each to a direct URL via imageinfo.

    Args:
        topic: Page title to look up (redirects are resolved).
        limit: Maximum number of image URLs to return.

    Returns:
        Up to *limit* direct image URLs (possibly empty on failure/no match).
    """
    image_urls: List[str] = []
    # First, try to get the main image/thumbnail via pageimages.
    page_details_params = {
        "action": "query",
        "format": "json",
        "titles": topic,
        "prop": "pageimages",
        "pithumbsize": 400,  # Larger thumbnail
        "redirects": 1,
    }
    page_details_data = _get_cached_or_fetch(WIKIPEDIA_API, page_details_params)
    if page_details_data and "query" in page_details_data and "pages" in page_details_data["query"]:
        page_id = next(iter(page_details_data["query"]["pages"]))
        page_data = page_details_data["query"]["pages"][page_id]
        # Fixed: prop=pageimages returns "thumbnail" directly on the page
        # object, not under a "pageimages" key, so the original check never
        # matched and the lead-image fast path never fired.
        thumbnail = page_data.get("thumbnail")
        if thumbnail and "source" in thumbnail:
            image_urls.append(thumbnail["source"])
            if len(image_urls) >= limit:
                return image_urls
    # If not enough images, list all images on the page and filter by relevance.
    params = {
        "action": "query",
        "format": "json",
        "titles": topic,
        "prop": "images",
        "imlimit": "max",  # Fetch all images used on the page
        "redirects": 1,
    }
    data = _get_cached_or_fetch(WIKIPEDIA_API, params)
    if data and "query" in data and "pages" in data["query"]:
        page_id = next(iter(data["query"]["pages"]))
        page_data = data["query"]["pages"][page_id]
        if "images" in page_data:
            # Basic relevance heuristic: keep files whose title mentions the
            # topic or any individual word of it.
            all_page_images = [
                img_info["title"]
                for img_info in page_data["images"]
                if topic.lower() in img_info["title"].lower()
                or any(word.lower() in img_info["title"].lower() for word in topic.split())
            ]
            random.shuffle(all_page_images)  # Vary which relevant images are returned
            for image_title in all_page_images:
                if len(image_urls) >= limit:
                    break
                # Resolve the file title to a direct URL via imageinfo.
                image_params = {
                    "action": "query",
                    "format": "json",
                    "titles": image_title,
                    "prop": "imageinfo",
                    "iiprop": "url",
                }
                image_data = _get_cached_or_fetch(WIKIPEDIA_API, image_params)
                if image_data and "query" in image_data and "pages" in image_data["query"]:
                    img_page_id = next(iter(image_data["query"]["pages"]))
                    img_page_data = image_data["query"]["pages"][img_page_id]
                    if "imageinfo" in img_page_data and img_page_data["imageinfo"]:
                        image_urls.append(img_page_data["imageinfo"][0]["url"])
    return image_urls[:limit]  # Ensure limit is respected