"""Extract tag group memberships and wiki definitions from wiki_pages CSV. Usage: python scripts/extract_wiki_data.py Outputs: data/tag_groups.json — {group_name: [member_tags]} data/tag_wiki_defs.json — {tag: first_sentence_of_wiki} """ from __future__ import annotations import csv, json, re, sys from pathlib import Path from typing import Dict, List _REPO_ROOT = Path(__file__).resolve().parents[1] def _extract_tag_links(body: str) -> List[str]: """Extract tag names from DText wiki markup. Patterns: - [[#tagname|display]] — anchor links in tag group pages - [[tagname]] — simple wiki links - * [[tagname|display]] — list items """ tags = [] # Navigation/heading anchors to skip _SKIP = {"top", "see_also", "related", "back", "contents", "toc"} # Anchor links: [[#tag_name|display_text]] for m in re.finditer(r'\[\[#([a-z0-9_]+)\|', body): tag = m.group(1) if tag not in _SKIP: tags.append(tag) # If no anchor links found, try regular wiki links in list items if not tags: for m in re.finditer(r'\*\s*\[\[([a-z0-9_()]+?)(?:\||\]\])', body): tag = m.group(1) if tag not in _SKIP and not tag.startswith('tag_group:') and not tag.startswith('tag '): tags.append(tag) # Deduplicate while preserving order seen = set() deduped = [] for t in tags: if t not in seen: seen.add(t) deduped.append(t) return deduped def _first_sentence(body: str) -> str: """Extract first meaningful sentence from a wiki body for use as a tag definition.""" # Strip DText markup text = re.sub(r'\[\[#?\w+\|([^\]]+)\]\]', r'\1', body) # [[link|text]] -> text text = re.sub(r'\[\[([^\]|]+)\]\]', r'\1', text) # [[text]] -> text text = re.sub(r'h[1-6]\.\s*', '', text) # headings text = re.sub(r'\[/?[a-z]+\]', '', text) # [b], [/b], etc. text = re.sub(r'"[^"]*":\S+', '', text) # DText links "text":url # Find first sentence that's actually descriptive (not navigation/see-also) for line in text.split('\n'): line = line.strip().lstrip('* ') if not line: continue if line.startswith(('Back:', 'See ', 'Related:', 'Not to be confused')): continue if len(line) < 10: continue # Skip lines that are just thumbnail references (e.g. "thumb #12345 thumb #67890") if re.fullmatch(r'(thumb\s*#\d+\s*)+', line): continue # Skip lines that are mostly thumbnail references with little text thumb_stripped = re.sub(r'thumb\s*#\d+', '', line).strip() if len(thumb_stripped) < 10: continue # Use the thumb-stripped version for the definition line = thumb_stripped # Truncate at first period if it's a real sentence period = line.find('. ') if period > 20: return line[:period + 1] if len(line) > 30: return line[:300] return "" def main(): if len(sys.argv) < 2: print("Usage: python scripts/extract_wiki_data.py ") sys.exit(1) csv_path = Path(sys.argv[1]) if not csv_path.is_file(): print(f"File not found: {csv_path}") sys.exit(1) # The CSV columns are: id, created_at, updated_at, title, body, creator_id, updater_id, is_locked tag_groups: Dict[str, List[str]] = {} tag_defs: Dict[str, str] = {} print(f"Reading {csv_path}...") with csv_path.open("r", encoding="utf-8") as f: reader = csv.reader(f) header = next(reader) print(f"Columns: {header}") # Find column indices title_idx = header.index("title") if "title" in header else 3 body_idx = header.index("body") if "body" in header else 4 for row in reader: if len(row) <= max(title_idx, body_idx): continue title = row[title_idx].strip() body = row[body_idx] if title.startswith("tag_group:"): group_name = title[len("tag_group:"):] members = _extract_tag_links(body) if members: tag_groups[group_name] = members elif not title.startswith(("help:", "howto:", "about:", "forum_")): # It's a tag wiki page — extract first sentence as definition defn = _first_sentence(body) if defn: tag_defs[title] = defn # Write outputs out_dir = _REPO_ROOT / "data" out_dir.mkdir(exist_ok=True) groups_path = out_dir / "tag_groups.json" with groups_path.open("w", encoding="utf-8") as f: json.dump(tag_groups, f, indent=2, ensure_ascii=False) print(f"\nTag groups: {len(tag_groups)} groups written to {groups_path}") for g, members in sorted(tag_groups.items(), key=lambda x: -len(x[1]))[:20]: print(f" {g}: {len(members)} tags") defs_path = out_dir / "tag_wiki_defs.json" with defs_path.open("w", encoding="utf-8") as f: json.dump(tag_defs, f, indent=2, ensure_ascii=False) print(f"\nTag definitions: {len(tag_defs)} tags written to {defs_path}") # Show definitions for key structural tags structural = ["anthro", "feral", "humanoid", "solo", "duo", "male", "female", "looking_at_viewer", "standing", "clothed", "clothing"] print(f"\nKey tag definitions:") for tag in structural: defn = tag_defs.get(tag, "(not found)") print(f" {tag}: {defn[:120]}") if __name__ == "__main__": main()