# anime-gen-api / app/services/fandom.py
# Uploaded via huggingface_hub (commit 7190fd0, verified) by AswinMathew
"""Fandom/MediaWiki API client for character enrichment."""
import re
import httpx
async def search_fandom_wiki(
novel_title: str,
character_name: str,
) -> dict | None:
"""Search Fandom wikis for character appearance data.
Returns {description, image_url} or None."""
# Generate wiki subdomain from novel title
slug = re.sub(r'[^a-zA-Z0-9]', '', novel_title.lower().replace(' ', ''))
wiki_url = f"https://{slug}.fandom.com"
try:
async with httpx.AsyncClient(timeout=15.0) as client:
# Search for character page
resp = await client.get(
f"{wiki_url}/api.php",
params={
"action": "query",
"list": "search",
"srsearch": character_name,
"srnamespace": "0",
"srlimit": "3",
"format": "json",
},
)
if resp.status_code != 200:
return None
data = resp.json()
results = data.get("query", {}).get("search", [])
if not results:
return None
page_title = results[0]["title"]
# Get page content (parse for appearance section)
resp2 = await client.get(
f"{wiki_url}/api.php",
params={
"action": "parse",
"page": page_title,
"prop": "wikitext|images",
"format": "json",
},
)
if resp2.status_code != 200:
return None
parse_data = resp2.json().get("parse", {})
wikitext = parse_data.get("wikitext", {}).get("*", "")
images = parse_data.get("images", [])
# Extract appearance section
appearance = _extract_section(wikitext, ["Appearance", "Physical Description", "Description"])
# Get first image URL
image_url = None
if images:
img_name = images[0]
img_resp = await client.get(
f"{wiki_url}/api.php",
params={
"action": "query",
"titles": f"File:{img_name}",
"prop": "imageinfo",
"iiprop": "url",
"format": "json",
},
)
if img_resp.status_code == 200:
pages = img_resp.json().get("query", {}).get("pages", {})
for page in pages.values():
ii = page.get("imageinfo", [])
if ii:
image_url = ii[0].get("url")
return {
"description": _clean_wikitext(appearance) if appearance else None,
"image_url": image_url,
}
except Exception:
return None
def _extract_section(wikitext: str, headings: list[str]) -> str | None:
"""Extract content under a wiki section heading."""
for heading in headings:
pattern = rf'==+\s*{re.escape(heading)}\s*==+(.*?)(?===|\Z)'
match = re.search(pattern, wikitext, re.DOTALL | re.IGNORECASE)
if match:
return match.group(1).strip()
return None
def _clean_wikitext(text: str) -> str:
"""Remove wiki markup, leaving plain text."""
text = re.sub(r'\[\[(?:[^|\]]*\|)?([^\]]*)\]\]', r'\1', text) # [[link|text]] → text
text = re.sub(r'\{\{[^}]*\}\}', '', text) # {{templates}}
text = re.sub(r"'''?", '', text) # bold/italic
text = re.sub(r'<[^>]+>', '', text) # HTML tags
text = re.sub(r'\n{3,}', '\n\n', text)
return text.strip()