File size: 7,997 Bytes
44073c9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
# modules/api_utils.py
"""API utility functions for Wikimedia services"""

import requests
from typing import Dict, List, Optional
import time
import random # Import the random module
from config.settings import (
    WIKIPEDIA_API,
    WIKIDATA_API,
    WIKIBOOKS_API,
    WIKI_REST_API,
    CACHE_TIMEOUT,
)

# Cache for API responses
_cache = {}


def _get_cached_or_fetch(url: str, params: Optional[Dict] = None) -> Optional[Dict]:
    """Fetch a JSON response from *url*, serving from an in-memory cache when fresh.

    Args:
        url: Endpoint to request.
        params: Optional query parameters (also part of the cache key).

    Returns:
        The parsed JSON dict on HTTP 200, otherwise None (request error,
        non-200 status, or invalid JSON bubbles up as None / an exception
        from ``response.json()``).
    """
    cache_key = f"{url}_{str(params)}"

    if cache_key in _cache:
        cached_data, timestamp = _cache[cache_key]
        # An entry is fresh while it is younger than CACHE_TIMEOUT seconds.
        if time.time() - timestamp < CACHE_TIMEOUT:
            return cached_data

    try:
        response = requests.get(
            url, params=params, timeout=10
        )  # Increased timeout for robustness
        if response.status_code == 200:
            data = response.json()
            # Bug fix: store the fetch time itself, not time.time() + CACHE_TIMEOUT.
            # The freshness check above computes "now - stored", so storing a
            # future timestamp silently doubled the effective cache lifetime.
            _cache[cache_key] = (data, time.time())
            return data
    except requests.exceptions.RequestException as e:
        print(f"API request error: {e}")

    return None


def fetch_wikipedia_summary(topic: str) -> Optional[Dict]:
    """Return the REST-API summary payload for *topic*, served via the shared cache."""
    summary_url = f"{WIKI_REST_API}{topic}"
    return _get_cached_or_fetch(summary_url)


def search_wikipedia(query: str, limit: int = 5) -> List[str]:
    """Return up to *limit* Wikipedia title suggestions matching *query*."""
    data = _get_cached_or_fetch(
        WIKIPEDIA_API,
        {"action": "opensearch", "search": query, "limit": limit, "format": "json"},
    )
    # opensearch replies with [query, titles, descriptions, urls];
    # index 1 holds the matching titles.
    if not data or len(data) <= 1:
        return []
    return data[1]


def fetch_wikidata_entity(entity_id: str) -> Optional[Dict]:
    """Look up a Wikidata entity by ID (e.g. 'Q42') and return its raw JSON payload."""
    query = {
        "action": "wbgetentities",
        "ids": entity_id,
        "format": "json",
        "languages": "en",
    }
    return _get_cached_or_fetch(WIKIDATA_API, query)


def fetch_wikipedia_categories(page_title: str) -> List[str]:
    """Return the category names (without the 'Category:' prefix) of a Wikipedia page."""
    request = {
        "action": "query",
        "prop": "categories",
        "titles": page_title,
        "format": "json",
        "cllimit": 10,
    }
    data = _get_cached_or_fetch(WIKIPEDIA_API, request)
    if not data:
        return []
    pages = data.get("query", {}).get("pages", {})
    # Only one title was requested, so the first page entry is the answer.
    for entry in pages.values():
        return [
            category["title"].replace("Category:", "")
            for category in entry.get("categories", [])
        ]
    return []


def fetch_related_topics(topic: str, limit: int = 5) -> List[str]:
    """Search Wikipedia for *topic* and return up to *limit* related article titles.

    The topic itself is excluded from the result list.
    """
    request = {
        "action": "query",
        "list": "search",
        "srsearch": topic,
        "srlimit": limit,
        "format": "json",
    }
    data = _get_cached_or_fetch(WIKIPEDIA_API, request)
    if not data:
        return []
    related = []
    for hit in data.get("query", {}).get("search", []):
        title = hit["title"]
        if title != topic:
            related.append(title)
    return related


def fetch_wikibooks_content(topic: str) -> Optional[str]:
    """Return the snippet of the top Wikibooks search hit for *topic*, if any."""
    data = _get_cached_or_fetch(
        WIKIBOOKS_API,
        {"action": "query", "list": "search", "srsearch": topic, "format": "json"},
    )
    if not data:
        return None
    hits = data.get("query", {}).get("search", [])
    if not hits:
        return None
    return hits[0].get("snippet", "")

def fetch_wikipedia_page_details(topic: str) -> Optional[Dict]:
    """Fetch detailed Wikipedia page content and images.

    Args:
        topic: Page title to look up (redirects are resolved).

    Returns:
        A dict with keys "title", "extract", "images" (list of thumbnail URLs)
        and "full_url", or None when the page is missing or the request fails.
    """
    params = {
        "action": "query",
        "format": "json",
        "titles": topic,
        "prop": "extracts|pageimages",
        "exintro": False,  # Get full extract
        "explaintext": True,  # Get plain text
        "pithumbsize": 200,  # Thumbnail size for images
        "redirects": 1,  # Resolve redirects
    }
    data = _get_cached_or_fetch(WIKIPEDIA_API, params)

    if data and "query" in data and "pages" in data["query"]:
        page_id = next(iter(data["query"]["pages"]))  # Single title -> single page
        page_data = data["query"]["pages"][page_id]

        if "missing" in page_data:
            return None  # Page not found

        extract = page_data.get("extract", "")
        images = []
        # Bug fix: the pageimages prop puts "thumbnail" directly on the page
        # object; there is no nested "pageimages" key, so the previous
        # '"pageimages" in page_data' check never matched and images stayed empty.
        thumbnail = page_data.get("thumbnail")
        if thumbnail and "source" in thumbnail:
            images.append(thumbnail["source"])

        # Fall back to the requested topic if the API omits the title, so the
        # URL construction below cannot raise AttributeError on None.
        title = page_data.get("title") or topic
        return {
            "title": title,
            "extract": extract,
            "images": images,
            "full_url": f"https://en.wikipedia.org/wiki/{title.replace(' ', '_')}",
        }
    return None

def fetch_wikipedia_images(topic: str, limit: int = 1) -> List[str]:
    """Fetch image URLs for a given topic from Wikipedia, prioritizing relevant images.

    Prefers the page's lead image (pageimages thumbnail); when that does not
    satisfy *limit*, falls back to scanning every image embedded in the page
    and keeping those whose file title mentions the topic.

    Args:
        topic: Page title to look up (redirects are resolved).
        limit: Maximum number of URLs to return.

    Returns:
        Up to *limit* direct image URLs (possibly empty).
    """
    image_urls = []

    # First, try to get the page's lead image via the pageimages prop.
    page_details_params = {
        "action": "query",
        "format": "json",
        "titles": topic,
        "prop": "pageimages",
        "pithumbsize": 400,  # Larger thumbnail
        "redirects": 1,
    }
    page_details_data = _get_cached_or_fetch(WIKIPEDIA_API, page_details_params)
    if page_details_data and "query" in page_details_data and "pages" in page_details_data["query"]:
        page_id = next(iter(page_details_data["query"]["pages"]))
        page_data = page_details_data["query"]["pages"][page_id]
        # Bug fix: "thumbnail" sits directly on the page object; the old
        # nested 'page_data["pageimages"]["thumbnail"]' lookup never matched,
        # so the lead image was always skipped.
        thumbnail = page_data.get("thumbnail")
        if thumbnail and "source" in thumbnail:
            image_urls.append(thumbnail["source"])
            if len(image_urls) >= limit:
                return image_urls

    # If not enough images, fetch all images used on the page and filter.
    params = {
        "action": "query",
        "format": "json",
        "titles": topic,
        "prop": "images",
        "imlimit": "max",  # Fetch all images used on the page
        "redirects": 1,
    }
    data = _get_cached_or_fetch(WIKIPEDIA_API, params)

    if data and "query" in data and "pages" in data["query"]:
        page_id = next(iter(data["query"]["pages"]))
        page_data = data["query"]["pages"][page_id]

        # Relevance heuristic: keep images whose file title contains the full
        # topic phrase or any individual word of it.
        candidates = [
            img_info["title"]
            for img_info in page_data.get("images", [])
            if topic.lower() in img_info["title"].lower()
            or any(word.lower() in img_info["title"].lower() for word in topic.split())
        ]
        random.shuffle(candidates)  # Vary which relevant images are returned

        for image_title in candidates:
            if len(image_urls) >= limit:
                break
            # Resolve the "File:..." title to a direct URL via imageinfo.
            image_params = {
                "action": "query",
                "format": "json",
                "titles": image_title,
                "prop": "imageinfo",
                "iiprop": "url",
            }
            image_data = _get_cached_or_fetch(WIKIPEDIA_API, image_params)
            if image_data and "query" in image_data and "pages" in image_data["query"]:
                img_page_id = next(iter(image_data["query"]["pages"]))
                img_page_data = image_data["query"]["pages"][img_page_id]
                if img_page_data.get("imageinfo"):
                    image_urls.append(img_page_data["imageinfo"][0]["url"])

    return image_urls[:limit]  # Ensure limit is respected