# Prompt_Squirrel_RAG / scripts / extract_wiki_data.py
# Claude — "Redesign structural inference as group-based system with wiki data" (commit 684cf99)
"""Extract tag group memberships and wiki definitions from wiki_pages CSV.
Usage:
python scripts/extract_wiki_data.py <path_to_wiki_pages_csv>
Outputs:
data/tag_groups.json β€” {group_name: [member_tags]}
data/tag_wiki_defs.json β€” {tag: first_sentence_of_wiki}
"""
from __future__ import annotations
import csv, json, re, sys
from pathlib import Path
from typing import Dict, List
_REPO_ROOT = Path(__file__).resolve().parents[1]
def _extract_tag_links(body: str) -> List[str]:
"""Extract tag names from DText wiki markup.
Patterns:
- [[#tagname|display]] β€” anchor links in tag group pages
- [[tagname]] β€” simple wiki links
- * [[tagname|display]] β€” list items
"""
tags = []
# Navigation/heading anchors to skip
_SKIP = {"top", "see_also", "related", "back", "contents", "toc"}
# Anchor links: [[#tag_name|display_text]]
for m in re.finditer(r'\[\[#([a-z0-9_]+)\|', body):
tag = m.group(1)
if tag not in _SKIP:
tags.append(tag)
# If no anchor links found, try regular wiki links in list items
if not tags:
for m in re.finditer(r'\*\s*\[\[([a-z0-9_()]+?)(?:\||\]\])', body):
tag = m.group(1)
if tag not in _SKIP and not tag.startswith('tag_group:') and not tag.startswith('tag '):
tags.append(tag)
# Deduplicate while preserving order
seen = set()
deduped = []
for t in tags:
if t not in seen:
seen.add(t)
deduped.append(t)
return deduped
def _first_sentence(body: str) -> str:
"""Extract first meaningful sentence from a wiki body for use as a tag definition."""
# Strip DText markup
text = re.sub(r'\[\[#?\w+\|([^\]]+)\]\]', r'\1', body) # [[link|text]] -> text
text = re.sub(r'\[\[([^\]|]+)\]\]', r'\1', text) # [[text]] -> text
text = re.sub(r'h[1-6]\.\s*', '', text) # headings
text = re.sub(r'\[/?[a-z]+\]', '', text) # [b], [/b], etc.
text = re.sub(r'"[^"]*":\S+', '', text) # DText links "text":url
# Find first sentence that's actually descriptive (not navigation/see-also)
for line in text.split('\n'):
line = line.strip().lstrip('* ')
if not line:
continue
if line.startswith(('Back:', 'See ', 'Related:', 'Not to be confused')):
continue
if len(line) < 10:
continue
# Skip lines that are just thumbnail references (e.g. "thumb #12345 thumb #67890")
if re.fullmatch(r'(thumb\s*#\d+\s*)+', line):
continue
# Skip lines that are mostly thumbnail references with little text
thumb_stripped = re.sub(r'thumb\s*#\d+', '', line).strip()
if len(thumb_stripped) < 10:
continue
# Use the thumb-stripped version for the definition
line = thumb_stripped
# Truncate at first period if it's a real sentence
period = line.find('. ')
if period > 20:
return line[:period + 1]
if len(line) > 30:
return line[:300]
return ""
def main():
    """Parse a wiki_pages CSV dump into tag-group and tag-definition JSON files.

    Reads the CSV path from argv[1], then writes:
      data/tag_groups.json    — {group_name: [member_tags]}
      data/tag_wiki_defs.json — {tag: first_sentence_of_wiki}
    Exits with status 1 on missing/invalid arguments.
    """
    if len(sys.argv) < 2:
        print("Usage: python scripts/extract_wiki_data.py <wiki_pages_csv>")
        sys.exit(1)
    csv_path = Path(sys.argv[1])
    if not csv_path.is_file():
        print(f"File not found: {csv_path}")
        sys.exit(1)

    # Wiki bodies routinely exceed csv's default 128 KiB per-field limit, which
    # would make csv.reader raise _csv.Error mid-file. Raise the limit as far
    # as the platform allows (sys.maxsize overflows a C long on some builds,
    # hence the halving fallback).
    limit = sys.maxsize
    while True:
        try:
            csv.field_size_limit(limit)
            break
        except OverflowError:
            limit //= 2

    # The CSV columns are: id, created_at, updated_at, title, body, creator_id, updater_id, is_locked
    tag_groups: Dict[str, List[str]] = {}
    tag_defs: Dict[str, str] = {}
    print(f"Reading {csv_path}...")
    # newline="" is required by the csv module so quoted fields containing
    # embedded newlines (wiki bodies) are parsed correctly.
    with csv_path.open("r", encoding="utf-8", newline="") as f:
        reader = csv.reader(f)
        header = next(reader)
        print(f"Columns: {header}")
        # Find column indices; fall back to the known dump layout if the
        # header row is absent or nonstandard.
        title_idx = header.index("title") if "title" in header else 3
        body_idx = header.index("body") if "body" in header else 4
        for row in reader:
            if len(row) <= max(title_idx, body_idx):
                continue  # malformed/truncated row
            title = row[title_idx].strip()
            body = row[body_idx]
            if title.startswith("tag_group:"):
                group_name = title[len("tag_group:"):]
                members = _extract_tag_links(body)
                if members:
                    tag_groups[group_name] = members
            elif not title.startswith(("help:", "howto:", "about:", "forum_")):
                # It's a tag wiki page — extract first sentence as definition
                defn = _first_sentence(body)
                if defn:
                    tag_defs[title] = defn

    # Write outputs
    out_dir = _REPO_ROOT / "data"
    out_dir.mkdir(exist_ok=True)
    groups_path = out_dir / "tag_groups.json"
    with groups_path.open("w", encoding="utf-8") as f:
        json.dump(tag_groups, f, indent=2, ensure_ascii=False)
    print(f"\nTag groups: {len(tag_groups)} groups written to {groups_path}")
    # Show the 20 largest groups as a sanity check.
    for g, members in sorted(tag_groups.items(), key=lambda x: -len(x[1]))[:20]:
        print(f"  {g}: {len(members)} tags")
    defs_path = out_dir / "tag_wiki_defs.json"
    with defs_path.open("w", encoding="utf-8") as f:
        json.dump(tag_defs, f, indent=2, ensure_ascii=False)
    print(f"\nTag definitions: {len(tag_defs)} tags written to {defs_path}")
    # Show definitions for key structural tags
    structural = ["anthro", "feral", "humanoid", "solo", "duo", "male", "female",
                  "looking_at_viewer", "standing", "clothed", "clothing"]
    print("\nKey tag definitions:")
    for tag in structural:
        defn = tag_defs.get(tag, "(not found)")
        print(f"  {tag}: {defn[:120]}")
if __name__ == "__main__":
    main()