# anime-gen-api / app/services/fandom.py
# Uploaded via huggingface_hub (commit 7190fd0, verified) by AswinMathew
"""Fandom/MediaWiki API client for character enrichment."""
import re
import httpx
async def search_fandom_wiki(
novel_title: str,
character_name: str,
) -> dict | None:
"""Search Fandom wikis for character appearance data.
Returns {description, image_url} or None."""
# Generate wiki subdomain from novel title
slug = re.sub(r'[^a-zA-Z0-9]', '', novel_title.lower().replace(' ', ''))
wiki_url = f"https://{slug}.fandom.com"
try:
async with httpx.AsyncClient(timeout=15.0) as client:
# Search for character page
resp = await client.get(
f"{wiki_url}/api.php",
params={
"action": "query",
"list": "search",
"srsearch": character_name,
"srnamespace": "0",
"srlimit": "3",
"format": "json",
},
)
if resp.status_code != 200:
return None
data = resp.json()
results = data.get("query", {}).get("search", [])
if not results:
return None
page_title = results[0]["title"]
# Get page content (parse for appearance section)
resp2 = await client.get(
f"{wiki_url}/api.php",
params={
"action": "parse",
"page": page_title,
"prop": "wikitext|images",
"format": "json",
},
)
if resp2.status_code != 200:
return None
parse_data = resp2.json().get("parse", {})
wikitext = parse_data.get("wikitext", {}).get("*", "")
images = parse_data.get("images", [])
# Extract appearance section
appearance = _extract_section(wikitext, ["Appearance", "Physical Description", "Description"])
# Get first image URL
image_url = None
if images:
img_name = images[0]
img_resp = await client.get(
f"{wiki_url}/api.php",
params={
"action": "query",
"titles": f"File:{img_name}",
"prop": "imageinfo",
"iiprop": "url",
"format": "json",
},
)
if img_resp.status_code == 200:
pages = img_resp.json().get("query", {}).get("pages", {})
for page in pages.values():
ii = page.get("imageinfo", [])
if ii:
image_url = ii[0].get("url")
return {
"description": _clean_wikitext(appearance) if appearance else None,
"image_url": image_url,
}
except Exception:
return None
def _extract_section(wikitext: str, headings: list[str]) -> str | None:
"""Extract content under a wiki section heading."""
for heading in headings:
pattern = rf'==+\s*{re.escape(heading)}\s*==+(.*?)(?===|\Z)'
match = re.search(pattern, wikitext, re.DOTALL | re.IGNORECASE)
if match:
return match.group(1).strip()
return None
def _clean_wikitext(text: str) -> str:
"""Remove wiki markup, leaving plain text."""
text = re.sub(r'\[\[(?:[^|\]]*\|)?([^\]]*)\]\]', r'\1', text) # [[link|text]] → text
text = re.sub(r'\{\{[^}]*\}\}', '', text) # {{templates}}
text = re.sub(r"'''?", '', text) # bold/italic
text = re.sub(r'<[^>]+>', '', text) # HTML tags
text = re.sub(r'\n{3,}', '\n\n', text)
return text.strip()