"""Fandom/MediaWiki API client for character enrichment."""

import re

import httpx
async def search_fandom_wiki(
    novel_title: str,
    character_name: str,
) -> dict | None:
    """Search a Fandom wiki for character appearance data.

    Derives the wiki subdomain from ``novel_title``, searches the wiki for
    the character's article, extracts an "Appearance"-style section, and
    resolves the page's first listed image to a URL.

    Args:
        novel_title: Novel title used to guess the Fandom subdomain.
        character_name: Name searched in the wiki's main (article) namespace.

    Returns:
        ``{"description": str | None, "image_url": str | None}`` on success,
        or ``None`` when the wiki is unreachable, no page matches, or a
        response is malformed.
    """
    # Fandom subdomains are bare lowercase alphanumerics
    # (e.g. "The Stormlight Archive" -> "thestormlightarchive").
    # Note: lowercasing first makes a separate space-strip redundant.
    slug = re.sub(r'[^a-z0-9]', '', novel_title.lower())
    if not slug:
        # Nothing to build a subdomain from (e.g. all-punctuation title).
        return None
    wiki_url = f"https://{slug}.fandom.com"
    try:
        async with httpx.AsyncClient(timeout=15.0) as client:
            # Step 1: full-text search for the character's article.
            resp = await client.get(
                f"{wiki_url}/api.php",
                params={
                    "action": "query",
                    "list": "search",
                    "srsearch": character_name,
                    "srnamespace": "0",  # main/article namespace only
                    "srlimit": "3",
                    "format": "json",
                },
            )
            if resp.status_code != 200:
                return None
            results = resp.json().get("query", {}).get("search", [])
            if not results:
                return None
            page_title = results[0]["title"]
            # Step 2: fetch the page's wikitext plus the list of images it uses.
            resp2 = await client.get(
                f"{wiki_url}/api.php",
                params={
                    "action": "parse",
                    "page": page_title,
                    "prop": "wikitext|images",
                    "format": "json",
                },
            )
            if resp2.status_code != 200:
                return None
            parse_data = resp2.json().get("parse", {})
            wikitext = parse_data.get("wikitext", {}).get("*", "")
            images = parse_data.get("images", [])
            # Pull the prose under an "Appearance"-like heading, if present.
            appearance = _extract_section(
                wikitext, ["Appearance", "Physical Description", "Description"]
            )
            # Step 3: resolve the first listed image name to a concrete URL.
            image_url = None
            if images:
                img_resp = await client.get(
                    f"{wiki_url}/api.php",
                    params={
                        "action": "query",
                        "titles": f"File:{images[0]}",
                        "prop": "imageinfo",
                        "iiprop": "url",
                        "format": "json",
                    },
                )
                if img_resp.status_code == 200:
                    pages = img_resp.json().get("query", {}).get("pages", {})
                    for page in pages.values():
                        ii = page.get("imageinfo", [])
                        if ii:
                            image_url = ii[0].get("url")
            return {
                "description": _clean_wikitext(appearance) if appearance else None,
                "image_url": image_url,
            }
    except (httpx.HTTPError, ValueError, KeyError, TypeError):
        # Best-effort enrichment: network failures, JSON-decode errors
        # (ValueError), and unexpected payload shapes (KeyError/TypeError)
        # all degrade to "no data" instead of propagating. Deliberately NOT
        # a bare `except Exception` so real programming bugs still surface.
        return None
| def _extract_section(wikitext: str, headings: list[str]) -> str | None: | |
| """Extract content under a wiki section heading.""" | |
| for heading in headings: | |
| pattern = rf'==+\s*{re.escape(heading)}\s*==+(.*?)(?===|\Z)' | |
| match = re.search(pattern, wikitext, re.DOTALL | re.IGNORECASE) | |
| if match: | |
| return match.group(1).strip() | |
| return None | |
| def _clean_wikitext(text: str) -> str: | |
| """Remove wiki markup, leaving plain text.""" | |
| text = re.sub(r'\[\[(?:[^|\]]*\|)?([^\]]*)\]\]', r'\1', text) # [[link|text]] → text | |
| text = re.sub(r'\{\{[^}]*\}\}', '', text) # {{templates}} | |
| text = re.sub(r"'''?", '', text) # bold/italic | |
| text = re.sub(r'<[^>]+>', '', text) # HTML tags | |
| text = re.sub(r'\n{3,}', '\n\n', text) | |
| return text.strip() | |