wikiverse-backend / Backend /utils /wikipedia_compare.py
trretretret's picture
Deploy FastAPI backend
95cca31
import requests
# Wikipedia language editions handled by default; edit this list to add or
# remove languages.
INDIC_LANGS = [
    "en",
    "hi",
    "te",
    "ta",
    "ml",
    "kn",
]
def get_multilang_titles(topic: str, source_lang: str, langs: list) -> dict:
    """
    Map each requested language to the title of the article about ``topic``.

    The topic is resolved to a Wikidata item through the source-language
    Wikipedia, then the item's per-language data is read from Wikidata.

    Args:
        topic: Article title as written in the source-language Wikipedia.
        source_lang: Language code of the Wikipedia that ``topic`` belongs to.
        langs: Language codes to look up titles for.

    Returns:
        A dict ``{lang: title}`` containing only the languages for which a
        title was found. An empty dict is returned when the topic is empty,
        the language code is malformed, the page has no Wikidata item, or a
        request fails (the lookup is best-effort and never raises for
        network or parsing problems).
    """
    # The language code becomes part of the request host name, so accept only
    # plain codes (ASCII letters/digits and hyphens) to keep the request on
    # wikipedia.org.
    if not topic or not (
        isinstance(source_lang, str)
        and source_lang.isascii()
        and source_lang.replace("-", "").isalnum()
    ):
        return {}

    timeout = 10  # seconds; without it a stalled connection blocks forever

    # Step 1: Get Wikidata ID from source language Wikipedia
    wiki_id_url = f"https://{source_lang}.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "titles": topic,
        "prop": "pageprops",
        "redirects": 1,  # follow redirects so alias titles still resolve
        "format": "json",
    }
    try:
        id_res = requests.get(wiki_id_url, params=params, timeout=timeout)
        id_res.raise_for_status()
        pages = id_res.json().get("query", {}).get("pages", {})
        page = next(iter(pages.values()), {})
        wikidata_id = page.get("pageprops", {}).get("wikibase_item", "")
    except (requests.RequestException, ValueError, AttributeError):
        # Network failure, HTTP error status, invalid JSON or unexpected shape.
        return {}

    if not wikidata_id:
        return {}

    # Step 2: Get language-specific titles from Wikidata
    entity_url = f"https://www.wikidata.org/wiki/Special:EntityData/{wikidata_id}.json"
    try:
        entity_res = requests.get(entity_url, timeout=timeout)
        entity_res.raise_for_status()
        entity = entity_res.json().get("entities", {}).get(wikidata_id, {})
        sitelinks = entity.get("sitelinks", {})
        labels = entity.get("labels", {})
    except (requests.RequestException, ValueError, AttributeError):
        return {}

    titles = {}
    for lang in langs:
        # A sitelink holds the actual article title on that Wikipedia; a label
        # is only the item's display name and may not match any page, so it
        # is used as a fallback only.
        site_key = lang.replace("-", "_") + "wiki"
        title = (
            sitelinks.get(site_key, {}).get("title")
            or labels.get(lang, {}).get("value")
        )
        if title:
            titles[lang] = title
    return titles
def fetch_summary(title: str, lang: str) -> str:
    """
    Fetch the plain-text introduction of a Wikipedia page in a given language.

    Args:
        title: Page title on the ``lang`` Wikipedia.
        lang: Language code selecting which Wikipedia to query.

    Returns:
        The page's intro extract as plain text, or the string
        ``"No summary found."`` when the title is empty, the language code is
        malformed, the page has no extract, or the request fails (the lookup
        is best-effort and never raises for network or parsing problems).
    """
    not_found = "No summary found."

    # The language code becomes part of the request host name, so accept only
    # plain codes (ASCII letters/digits and hyphens) to keep the request on
    # wikipedia.org.
    if not title or not (
        isinstance(lang, str)
        and lang.isascii()
        and lang.replace("-", "").isalnum()
    ):
        return not_found

    url = f"https://{lang}.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "format": "json",
        "prop": "extracts",
        "titles": title,
        "exintro": True,
        "explaintext": True,
        "redirects": 1,  # a redirect page itself has no extract; follow it
    }
    try:
        # Timeout (seconds) keeps a stalled connection from blocking forever.
        res = requests.get(url, params=params, timeout=10)
        res.raise_for_status()
        pages = res.json().get("query", {}).get("pages", {})
        page = next(iter(pages.values()), {})
        return page.get("extract", not_found)
    except (requests.RequestException, ValueError, AttributeError):
        # Network failure, HTTP error status, invalid JSON or unexpected shape.
        return not_found
def fetch_multilingual_summaries(topic: str, source_lang: str, langs: list) -> dict:
    """
    Collect summaries of ``topic`` for the source language and every other
    language in ``langs``.

    The source-language summary is fetched with ``topic`` as given. For the
    remaining languages the title returned by ``get_multilang_titles`` is
    used, falling back to ``topic`` itself when no title is available.

    Returns a dict mapping language code to the summary text.
    """
    # Every requested language other than the source one.
    other_langs = [code for code in langs if code != source_lang]

    # Per-language titles, resolved through Wikidata.
    translated = get_multilang_titles(topic, source_lang, other_langs)

    # The source language uses the caller's topic unchanged.
    result = {source_lang: fetch_summary(topic, source_lang)}

    for code in other_langs:
        # No translated title: try the original topic on that wiki.
        result[code] = fetch_summary(translated.get(code, topic), code)

    return result