Spaces:
Sleeping
Sleeping
File size: 7,997 Bytes
44073c9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 |
# modules/api_utils.py
"""API utility functions for Wikimedia services"""
import requests
from typing import Dict, List, Optional
import time
import random # Import the random module
from config.settings import (
WIKIPEDIA_API,
WIKIDATA_API,
WIKIBOOKS_API,
WIKI_REST_API,
CACHE_TIMEOUT,
)
# Cache for API responses
_cache = {}
def _get_cached_or_fetch(url: str, params: Optional[Dict] = None) -> Optional[Dict]:
    """Get a cached API response or fetch it from the network.

    Responses are kept in a simple in-memory cache keyed on (url, params)
    and expire CACHE_TIMEOUT seconds after they were fetched.

    Args:
        url: The API endpoint to request.
        params: Optional query parameters passed through to requests.get.

    Returns:
        The decoded JSON payload, or None on a non-200 response or any
        network/timeout error.
    """
    cache_key = f"{url}_{str(params)}"
    if cache_key in _cache:
        cached_data, timestamp = _cache[cache_key]
        # An entry is fresh while its age stays below the configured timeout.
        if time.time() - timestamp < CACHE_TIMEOUT:
            return cached_data
    try:
        response = requests.get(url, params=params, timeout=10)
        if response.status_code == 200:
            data = response.json()
            # Store the fetch time itself. Previously this stored
            # time.time() + CACHE_TIMEOUT, which the freshness check above
            # subtracts from "now" — doubling the effective cache lifetime.
            _cache[cache_key] = (data, time.time())
            return data
    except requests.exceptions.RequestException as e:
        print(f"API request error: {e}")
    # Non-200 responses and request failures both fall through to None.
    return None
def fetch_wikipedia_summary(topic: str) -> Optional[Dict]:
    """Return the cached REST summary payload for *topic*, or None on failure."""
    summary_url = f"{WIKI_REST_API}{topic}"
    return _get_cached_or_fetch(summary_url)
def search_wikipedia(query: str, limit: int = 5) -> List[str]:
    """Search Wikipedia and return up to *limit* matching page titles."""
    data = _get_cached_or_fetch(
        WIKIPEDIA_API,
        {"action": "opensearch", "search": query, "limit": limit, "format": "json"},
    )
    # An opensearch response is a list; index 1 holds the matched titles.
    if not data or len(data) < 2:
        return []
    return data[1]
def fetch_wikidata_entity(entity_id: str) -> Optional[Dict]:
    """Fetch English-language entity data for *entity_id* from Wikidata."""
    query = {
        "action": "wbgetentities",
        "ids": entity_id,
        "format": "json",
        "languages": "en",
    }
    result = _get_cached_or_fetch(WIKIDATA_API, query)
    return result
def fetch_wikipedia_categories(page_title: str) -> List[str]:
    """Return up to 10 category names (without the "Category:" prefix) for a page."""
    query = {
        "action": "query",
        "prop": "categories",
        "titles": page_title,
        "format": "json",
        "cllimit": 10,
    }
    data = _get_cached_or_fetch(WIKIPEDIA_API, query)
    if data:
        pages = data.get("query", {}).get("pages", {})
        # Only one title is requested, so the first page entry is the answer.
        for page_data in pages.values():
            cats = page_data.get("categories", [])
            return [cat["title"].replace("Category:", "") for cat in cats]
    return []
def fetch_related_topics(topic: str, limit: int = 5) -> List[str]:
    """Return up to *limit* search-result titles for *topic*, excluding the topic itself."""
    data = _get_cached_or_fetch(
        WIKIPEDIA_API,
        {
            "action": "query",
            "list": "search",
            "srsearch": topic,
            "srlimit": limit,
            "format": "json",
        },
    )
    if not data:
        return []
    hits = data.get("query", {}).get("search", [])
    related = []
    for hit in hits:
        title = hit["title"]
        # Skip the exact topic so callers get genuinely different pages.
        if title != topic:
            related.append(title)
    return related
def fetch_wikibooks_content(topic: str) -> Optional[str]:
    """Return the snippet of the top Wikibooks search hit for *topic*, if any."""
    data = _get_cached_or_fetch(
        WIKIBOOKS_API,
        {"action": "query", "list": "search", "srsearch": topic, "format": "json"},
    )
    if not data:
        return None
    hits = data.get("query", {}).get("search", [])
    if not hits:
        return None
    return hits[0].get("snippet", "")
def fetch_wikipedia_page_details(topic: str) -> Optional[Dict]:
    """Fetch detailed Wikipedia page content and images.

    Args:
        topic: Page title to look up (redirects are resolved).

    Returns:
        A dict with "title", "extract" (plain text), "images" (thumbnail
        URLs) and "full_url", or None if the page is missing or the
        request failed.
    """
    params = {
        "action": "query",
        "format": "json",
        "titles": topic,
        "prop": "extracts|pageimages",
        "exintro": False,  # Get full extract
        "explaintext": True,  # Get plain text
        "pithumbsize": 200,  # Thumbnail size for images
        "redirects": 1,  # Resolve redirects
    }
    data = _get_cached_or_fetch(WIKIPEDIA_API, params)
    if data and "query" in data and "pages" in data["query"]:
        page_id = next(iter(data["query"]["pages"]))  # Get the first page ID
        page_data = data["query"]["pages"][page_id]
        if "missing" in page_data:
            return None  # Page not found
        extract = page_data.get("extract", "")
        images = []
        # Fixed: prop=pageimages puts "thumbnail" directly on the page
        # object; there is no "pageimages" key, so the old check
        # ('"pageimages" in page_data') never matched and images were
        # always empty.
        if "thumbnail" in page_data:
            images.append(page_data["thumbnail"]["source"])
        # Fall back to the requested topic if the response lacks a title,
        # so building full_url cannot crash on None.
        title = page_data.get("title") or topic
        return {
            "title": title,
            "extract": extract,
            "images": images,
            "full_url": f"https://en.wikipedia.org/wiki/{title.replace(' ', '_')}",
        }
    return None
def fetch_wikipedia_images(topic: str, limit: int = 1) -> List[str]:
    """Fetch image URLs for a given topic from Wikipedia, prioritizing relevant images.

    Strategy: first try the page's lead thumbnail (prop=pageimages); if
    that does not satisfy *limit*, list every image used on the page,
    keep those whose titles mention the topic, and resolve them to
    direct URLs.

    Args:
        topic: Page title to look up (redirects are resolved).
        limit: Maximum number of image URLs to return.

    Returns:
        A list of at most *limit* direct image URLs (possibly empty).
    """
    image_urls = []
    # First, try to get images directly from pageimages (main image/thumbnail)
    page_details_params = {
        "action": "query",
        "format": "json",
        "titles": topic,
        "prop": "pageimages",
        "pithumbsize": 400,  # Larger thumbnail
        "redirects": 1,
    }
    page_details_data = _get_cached_or_fetch(WIKIPEDIA_API, page_details_params)
    if page_details_data and "query" in page_details_data and "pages" in page_details_data["query"]:
        page_id = next(iter(page_details_data["query"]["pages"]))
        page_data = page_details_data["query"]["pages"][page_id]
        # Fixed: prop=pageimages returns "thumbnail" directly on the page
        # object (no "pageimages" key), so the old check
        # ('"pageimages" in page_data') never matched and the fast path
        # below was dead code.
        if "thumbnail" in page_data:
            image_urls.append(page_data["thumbnail"]["source"])
            if len(image_urls) >= limit:
                return image_urls
    # If not enough images, try fetching all images from the page and filter
    params = {
        "action": "query",
        "format": "json",
        "titles": topic,
        "prop": "images",
        "imlimit": "max",  # Fetch all images used on the page
        "redirects": 1,
    }
    data = _get_cached_or_fetch(WIKIPEDIA_API, params)
    if data and "query" in data and "pages" in data["query"]:
        page_id = next(iter(data["query"]["pages"]))
        page_data = data["query"]["pages"][page_id]
        if "images" in page_data:
            topic_lower = topic.lower()
            keywords = [word.lower() for word in topic.split()]
            all_page_images = []
            for img_info in page_data["images"]:
                title_lower = img_info["title"].lower()
                # Basic relevance heuristic: keep images whose title mentions
                # the topic or any of its individual words.
                if topic_lower in title_lower or any(kw in title_lower for kw in keywords):
                    all_page_images.append(img_info["title"])
            random.shuffle(all_page_images)  # Shuffle to get different relevant images each time
            for image_title in all_page_images:
                if len(image_urls) >= limit:
                    break
                # Fetch image info to get direct URL
                image_params = {
                    "action": "query",
                    "format": "json",
                    "titles": image_title,
                    "prop": "imageinfo",
                    "iiprop": "url",
                }
                image_data = _get_cached_or_fetch(WIKIPEDIA_API, image_params)
                if image_data and "query" in image_data and "pages" in image_data["query"]:
                    img_page_id = next(iter(image_data["query"]["pages"]))
                    img_page_data = image_data["query"]["pages"][img_page_id]
                    if "imageinfo" in img_page_data and img_page_data["imageinfo"]:
                        image_urls.append(img_page_data["imageinfo"][0]["url"])
    return image_urls[:limit]  # Ensure limit is respected
|