Spaces:
Running
Running
| """Extract tag group memberships and wiki definitions from wiki_pages CSV. | |
| Usage: | |
| python scripts/extract_wiki_data.py <path_to_wiki_pages_csv> | |
| Outputs: | |
| data/tag_groups.json β {group_name: [member_tags]} | |
| data/tag_wiki_defs.json β {tag: first_sentence_of_wiki} | |
| """ | |
| from __future__ import annotations | |
| import csv, json, re, sys | |
| from pathlib import Path | |
| from typing import Dict, List | |
| _REPO_ROOT = Path(__file__).resolve().parents[1] | |
| def _extract_tag_links(body: str) -> List[str]: | |
| """Extract tag names from DText wiki markup. | |
| Patterns: | |
| - [[#tagname|display]] β anchor links in tag group pages | |
| - [[tagname]] β simple wiki links | |
| - * [[tagname|display]] β list items | |
| """ | |
| tags = [] | |
| # Navigation/heading anchors to skip | |
| _SKIP = {"top", "see_also", "related", "back", "contents", "toc"} | |
| # Anchor links: [[#tag_name|display_text]] | |
| for m in re.finditer(r'\[\[#([a-z0-9_]+)\|', body): | |
| tag = m.group(1) | |
| if tag not in _SKIP: | |
| tags.append(tag) | |
| # If no anchor links found, try regular wiki links in list items | |
| if not tags: | |
| for m in re.finditer(r'\*\s*\[\[([a-z0-9_()]+?)(?:\||\]\])', body): | |
| tag = m.group(1) | |
| if tag not in _SKIP and not tag.startswith('tag_group:') and not tag.startswith('tag '): | |
| tags.append(tag) | |
| # Deduplicate while preserving order | |
| seen = set() | |
| deduped = [] | |
| for t in tags: | |
| if t not in seen: | |
| seen.add(t) | |
| deduped.append(t) | |
| return deduped | |
| def _first_sentence(body: str) -> str: | |
| """Extract first meaningful sentence from a wiki body for use as a tag definition.""" | |
| # Strip DText markup | |
| text = re.sub(r'\[\[#?\w+\|([^\]]+)\]\]', r'\1', body) # [[link|text]] -> text | |
| text = re.sub(r'\[\[([^\]|]+)\]\]', r'\1', text) # [[text]] -> text | |
| text = re.sub(r'h[1-6]\.\s*', '', text) # headings | |
| text = re.sub(r'\[/?[a-z]+\]', '', text) # [b], [/b], etc. | |
| text = re.sub(r'"[^"]*":\S+', '', text) # DText links "text":url | |
| # Find first sentence that's actually descriptive (not navigation/see-also) | |
| for line in text.split('\n'): | |
| line = line.strip().lstrip('* ') | |
| if not line: | |
| continue | |
| if line.startswith(('Back:', 'See ', 'Related:', 'Not to be confused')): | |
| continue | |
| if len(line) < 10: | |
| continue | |
| # Skip lines that are just thumbnail references (e.g. "thumb #12345 thumb #67890") | |
| if re.fullmatch(r'(thumb\s*#\d+\s*)+', line): | |
| continue | |
| # Skip lines that are mostly thumbnail references with little text | |
| thumb_stripped = re.sub(r'thumb\s*#\d+', '', line).strip() | |
| if len(thumb_stripped) < 10: | |
| continue | |
| # Use the thumb-stripped version for the definition | |
| line = thumb_stripped | |
| # Truncate at first period if it's a real sentence | |
| period = line.find('. ') | |
| if period > 20: | |
| return line[:period + 1] | |
| if len(line) > 30: | |
| return line[:300] | |
| return "" | |
def main():
    """Parse a wiki_pages CSV and write tag group / definition JSON files.

    argv[1] must be the path to the CSV export.  Writes two files under
    <repo>/data/: tag_groups.json and tag_wiki_defs.json.  Exits with
    status 1 on a missing argument or nonexistent input path.
    """
    if len(sys.argv) < 2:
        print("Usage: python scripts/extract_wiki_data.py <wiki_pages_csv>")
        sys.exit(1)
    csv_path = Path(sys.argv[1])
    if not csv_path.is_file():
        print(f"File not found: {csv_path}")
        sys.exit(1)

    # Wiki bodies can exceed csv's default 128 KiB field limit, which would
    # raise _csv.Error mid-file.  sys.maxsize overflows the C long on some
    # platforms, hence the fallback.
    try:
        csv.field_size_limit(sys.maxsize)
    except OverflowError:
        csv.field_size_limit(2**31 - 1)

    # The CSV columns are: id, created_at, updated_at, title, body, creator_id, updater_id, is_locked
    tag_groups: Dict[str, List[str]] = {}
    tag_defs: Dict[str, str] = {}

    print(f"Reading {csv_path}...")
    # newline="" is required by the csv module so quoted fields containing
    # embedded newlines (wiki bodies) round-trip correctly.
    with csv_path.open("r", encoding="utf-8", newline="") as f:
        reader = csv.reader(f)
        header = next(reader)
        print(f"Columns: {header}")
        # Find column indices, falling back to the known export layout.
        title_idx = header.index("title") if "title" in header else 3
        body_idx = header.index("body") if "body" in header else 4
        for row in reader:
            if len(row) <= max(title_idx, body_idx):
                continue  # malformed/short row
            title = row[title_idx].strip()
            body = row[body_idx]
            if title.startswith("tag_group:"):
                group_name = title[len("tag_group:"):]
                members = _extract_tag_links(body)
                if members:
                    tag_groups[group_name] = members
            elif not title.startswith(("help:", "howto:", "about:", "forum_")):
                # It's a tag wiki page -- extract first sentence as definition.
                defn = _first_sentence(body)
                if defn:
                    tag_defs[title] = defn

    # Write outputs.
    out_dir = _REPO_ROOT / "data"
    out_dir.mkdir(parents=True, exist_ok=True)

    groups_path = out_dir / "tag_groups.json"
    with groups_path.open("w", encoding="utf-8") as f:
        json.dump(tag_groups, f, indent=2, ensure_ascii=False)
    print(f"\nTag groups: {len(tag_groups)} groups written to {groups_path}")
    for g, members in sorted(tag_groups.items(), key=lambda x: -len(x[1]))[:20]:
        print(f"  {g}: {len(members)} tags")

    defs_path = out_dir / "tag_wiki_defs.json"
    with defs_path.open("w", encoding="utf-8") as f:
        json.dump(tag_defs, f, indent=2, ensure_ascii=False)
    print(f"\nTag definitions: {len(tag_defs)} tags written to {defs_path}")

    # Show definitions for key structural tags as a quick sanity check.
    structural = ["anthro", "feral", "humanoid", "solo", "duo", "male", "female",
                  "looking_at_viewer", "standing", "clothed", "clothing"]
    print("\nKey tag definitions:")
    for tag in structural:
        defn = tag_defs.get(tag, "(not found)")
        print(f"  {tag}: {defn[:120]}")


if __name__ == "__main__":
    main()