# TriviaVerse — utils/wiki_api.py
import json
import time
from typing import Any, Dict, Optional
from urllib.parse import quote

import requests
class WikiAPI:
    """Wikipedia and Wikidata API client.

    Wraps the Wikimedia REST API (summaries, search, random articles),
    the MediaWiki Action API (long extracts), and the Wikidata API
    (structured entity data). All network failures are logged and
    degrade to ``None``/``[]`` rather than raising.
    """

    # Supported language editions; any other code falls back to English.
    # Single source of truth — previously duplicated in three methods.
    LANG_DOMAINS = {
        'en': 'en.wikipedia.org',
        'hi': 'hi.wikipedia.org',
        'te': 'te.wikipedia.org',
        'ta': 'ta.wikipedia.org',
        'kn': 'kn.wikipedia.org',
        'bn': 'bn.wikipedia.org',
    }

    def __init__(self):
        self.wikipedia_base_url = "https://en.wikipedia.org/api/rest_v1"
        self.wikidata_base_url = "https://www.wikidata.org/w/api.php"
        self.session = requests.Session()
        # Wikimedia APIs ask clients to identify themselves via User-Agent.
        self.session.headers.update({
            'User-Agent': 'TriviaVerse/1.0 (https://github.com/your-repo/triviaverse)'
        })

    def _domain(self, language: str) -> str:
        """Return the Wikipedia domain for *language*, defaulting to English."""
        return self.LANG_DOMAINS.get(language, 'en.wikipedia.org')

    def fetch_content(self, topic: str, language: str = 'en') -> Optional[str]:
        """Fetch article text for *topic* from Wikipedia.

        Searches the requested language edition, then returns the plain-text
        extract of the top hit. Returns ``None`` when nothing matches or a
        request fails.
        """
        try:
            search_results = self._search_wikipedia(topic, language)
            if not search_results:
                return None
            # Take the best-ranked search hit.
            page_title = search_results[0]['title']
            return self._get_wikipedia_content(page_title, language)
        except Exception as e:
            # Best-effort: failures degrade to "no content" for the caller.
            print(f"Error fetching content: {e}")
            return None

    def _search_wikipedia(self, query: str, language: str = 'en') -> Optional[list]:
        """Search Wikipedia; return the raw ``pages`` list, or ``None`` on error."""
        domain = self._domain(language)
        # Percent-encode the query: spaces, slashes or '?' in a raw topic
        # string would otherwise corrupt the REST URL path.
        url = f"https://{domain}/api/rest_v1/page/search/{quote(query, safe='')}"
        try:
            response = self.session.get(url, timeout=10)
            response.raise_for_status()
            data = response.json()
            return data.get('pages', [])
        except Exception as e:
            print(f"Search error: {e}")
            return None

    def _get_wikipedia_content(self, page_title: str, language: str = 'en') -> Optional[str]:
        """Return the longest available plain-text extract for *page_title*.

        Tries the REST summary first, then the Action API intro extract,
        and returns whichever text is longer. ``None`` on request failure.
        """
        domain = self._domain(language)
        # Percent-encode the title for the REST path segment.
        summary_url = f"https://{domain}/api/rest_v1/page/summary/{quote(page_title, safe='')}"
        try:
            response = self.session.get(summary_url, timeout=10)
            response.raise_for_status()
            extract = response.json().get('extract', '')

            # The Action API intro extract is often longer than the REST
            # summary; fetch it and keep the better of the two.
            params = {
                'action': 'query',
                'format': 'json',
                'titles': page_title,
                'prop': 'extracts',
                'exintro': True,
                'explaintext': True,
                'exsectionformat': 'plain',
            }
            response = self.session.get(f"https://{domain}/w/api.php",
                                        params=params, timeout=10)
            response.raise_for_status()
            pages = response.json().get('query', {}).get('pages', {})
            for page_data in pages.values():
                if 'extract' in page_data:
                    full_extract = page_data['extract']
                    # Prefer whichever source produced more text.
                    return full_extract if len(full_extract) > len(extract) else extract
            return extract
        except Exception as e:
            print(f"Content fetch error: {e}")
            return None

    def get_wikidata_info(self, topic: str) -> Optional[Dict[str, Any]]:
        """Look up *topic* on Wikidata and return processed entity data.

        Returns ``None`` when no entity matches or a request fails.
        """
        try:
            # Step 1: resolve the topic to a Wikidata entity id (e.g. Q42).
            params = {
                'action': 'wbsearchentities',
                'search': topic,
                'language': 'en',
                'format': 'json',
            }
            response = self.session.get(self.wikidata_base_url, params=params, timeout=10)
            response.raise_for_status()
            entities = response.json().get('search', [])
            if not entities:
                return None
            entity_id = entities[0]['id']

            # Step 2: fetch the full entity record for that id.
            params = {
                'action': 'wbgetentities',
                'ids': entity_id,
                'format': 'json',
                'languages': 'en',
            }
            response = self.session.get(self.wikidata_base_url, params=params, timeout=10)
            response.raise_for_status()
            entity_data = response.json().get('entities', {}).get(entity_id, {})
            return self._process_wikidata_entity(entity_data)
        except Exception as e:
            print(f"Wikidata error: {e}")
            return None

    def _process_wikidata_entity(self, entity_data: Dict[str, Any]) -> Dict[str, Any]:
        """Flatten a raw Wikidata entity into label/description/aliases/claims."""
        processed_data: Dict[str, Any] = {
            'label': '',
            'description': '',
            'claims': {},
            'aliases': [],
        }
        # English label, if present.
        labels = entity_data.get('labels', {})
        if 'en' in labels:
            processed_data['label'] = labels['en']['value']
        # English description, if present.
        descriptions = entity_data.get('descriptions', {})
        if 'en' in descriptions:
            processed_data['description'] = descriptions['en']['value']
        # English aliases, flattened to plain strings.
        aliases = entity_data.get('aliases', {})
        if 'en' in aliases:
            processed_data['aliases'] = [alias['value'] for alias in aliases['en']]
        # Keep only a whitelist of properties useful for trivia generation.
        claims = entity_data.get('claims', {})
        important_properties = [
            'P31',   # instance of
            'P279',  # subclass of
            'P17',   # country
            'P569',  # date of birth
            'P570',  # date of death
            'P571',  # inception
            'P576',  # dissolved
        ]
        for prop in important_properties:
            if prop in claims:
                processed_data['claims'][prop] = claims[prop]
        return processed_data

    def get_related_topics(self, topic: str, limit: int = 5) -> list:
        """Return up to *limit* topics related to *topic*.

        Simplified implementation: reuses search results, skipping the
        first hit (assumed to be the exact match). Returns ``[]`` on error.
        """
        try:
            search_results = self._search_wikipedia(topic)
            if not search_results:
                return []
            # Skip first result (exact match); take the next `limit` entries.
            return [
                {
                    'title': result['title'],
                    'description': result.get('description', ''),
                    'url': result.get('content_urls', {}).get('desktop', {}).get('page', ''),
                }
                for result in search_results[1:limit + 1]
            ]
        except Exception as e:
            print(f"Related topics error: {e}")
            return []

    def get_random_article(self, language: str = 'en') -> Optional[Dict[str, str]]:
        """Fetch a random article summary; ``None`` on request failure."""
        domain = self._domain(language)
        try:
            url = f"https://{domain}/api/rest_v1/page/random/summary"
            response = self.session.get(url, timeout=10)
            response.raise_for_status()
            data = response.json()
            return {
                'title': data.get('title', ''),
                'extract': data.get('extract', ''),
                'url': data.get('content_urls', {}).get('desktop', {}).get('page', ''),
            }
        except Exception as e:
            print(f"Random article error: {e}")
            return None