# Prompt_Squirrel_RAG / scripts / extract_wiki_data.py
# Claude — "Redesign structural inference as group-based system with wiki data" (commit 684cf99)
"""Extract tag group memberships and wiki definitions from wiki_pages CSV.
Usage:
python scripts/extract_wiki_data.py <path_to_wiki_pages_csv>
Outputs:
data/tag_groups.json β€” {group_name: [member_tags]}
data/tag_wiki_defs.json β€” {tag: first_sentence_of_wiki}
"""
from __future__ import annotations
import csv, json, re, sys
from pathlib import Path
from typing import Dict, List
_REPO_ROOT = Path(__file__).resolve().parents[1]
def _extract_tag_links(body: str) -> List[str]:
"""Extract tag names from DText wiki markup.
Patterns:
- [[#tagname|display]] β€” anchor links in tag group pages
- [[tagname]] β€” simple wiki links
- * [[tagname|display]] β€” list items
"""
tags = []
# Navigation/heading anchors to skip
_SKIP = {"top", "see_also", "related", "back", "contents", "toc"}
# Anchor links: [[#tag_name|display_text]]
for m in re.finditer(r'\[\[#([a-z0-9_]+)\|', body):
tag = m.group(1)
if tag not in _SKIP:
tags.append(tag)
# If no anchor links found, try regular wiki links in list items
if not tags:
for m in re.finditer(r'\*\s*\[\[([a-z0-9_()]+?)(?:\||\]\])', body):
tag = m.group(1)
if tag not in _SKIP and not tag.startswith('tag_group:') and not tag.startswith('tag '):
tags.append(tag)
# Deduplicate while preserving order
seen = set()
deduped = []
for t in tags:
if t not in seen:
seen.add(t)
deduped.append(t)
return deduped
def _first_sentence(body: str) -> str:
"""Extract first meaningful sentence from a wiki body for use as a tag definition."""
# Strip DText markup
text = re.sub(r'\[\[#?\w+\|([^\]]+)\]\]', r'\1', body) # [[link|text]] -> text
text = re.sub(r'\[\[([^\]|]+)\]\]', r'\1', text) # [[text]] -> text
text = re.sub(r'h[1-6]\.\s*', '', text) # headings
text = re.sub(r'\[/?[a-z]+\]', '', text) # [b], [/b], etc.
text = re.sub(r'"[^"]*":\S+', '', text) # DText links "text":url
# Find first sentence that's actually descriptive (not navigation/see-also)
for line in text.split('\n'):
line = line.strip().lstrip('* ')
if not line:
continue
if line.startswith(('Back:', 'See ', 'Related:', 'Not to be confused')):
continue
if len(line) < 10:
continue
# Skip lines that are just thumbnail references (e.g. "thumb #12345 thumb #67890")
if re.fullmatch(r'(thumb\s*#\d+\s*)+', line):
continue
# Skip lines that are mostly thumbnail references with little text
thumb_stripped = re.sub(r'thumb\s*#\d+', '', line).strip()
if len(thumb_stripped) < 10:
continue
# Use the thumb-stripped version for the definition
line = thumb_stripped
# Truncate at first period if it's a real sentence
period = line.find('. ')
if period > 20:
return line[:period + 1]
if len(line) > 30:
return line[:300]
return ""
def main():
    """Parse a wiki_pages CSV dump into tag-group and tag-definition JSON files.

    Reads the CSV path from argv[1], then writes:
      data/tag_groups.json    — {group_name: [member_tags]}
      data/tag_wiki_defs.json — {tag: first_sentence_of_wiki}
    Exits with status 1 on missing/invalid arguments.
    """
    if len(sys.argv) < 2:
        print("Usage: python scripts/extract_wiki_data.py <wiki_pages_csv>")
        sys.exit(1)
    csv_path = Path(sys.argv[1])
    if not csv_path.is_file():
        print(f"File not found: {csv_path}")
        sys.exit(1)

    # Wiki bodies routinely exceed csv's default 128 KiB per-field limit, which
    # would make csv.reader raise _csv.Error mid-file. Raise the limit as far
    # as the platform allows (sys.maxsize overflows a C long on some builds,
    # hence the halving fallback).
    limit = sys.maxsize
    while True:
        try:
            csv.field_size_limit(limit)
            break
        except OverflowError:
            limit //= 2

    # The CSV columns are: id, created_at, updated_at, title, body, creator_id, updater_id, is_locked
    tag_groups: Dict[str, List[str]] = {}
    tag_defs: Dict[str, str] = {}
    print(f"Reading {csv_path}...")
    # newline="" is required by the csv module so quoted fields containing
    # embedded newlines (wiki bodies) are parsed correctly.
    with csv_path.open("r", encoding="utf-8", newline="") as f:
        reader = csv.reader(f)
        header = next(reader)
        print(f"Columns: {header}")
        # Find column indices; fall back to the known dump layout if the
        # header row is absent or nonstandard.
        title_idx = header.index("title") if "title" in header else 3
        body_idx = header.index("body") if "body" in header else 4
        for row in reader:
            if len(row) <= max(title_idx, body_idx):
                continue  # malformed/truncated row
            title = row[title_idx].strip()
            body = row[body_idx]
            if title.startswith("tag_group:"):
                group_name = title[len("tag_group:"):]
                members = _extract_tag_links(body)
                if members:
                    tag_groups[group_name] = members
            elif not title.startswith(("help:", "howto:", "about:", "forum_")):
                # It's a tag wiki page — extract first sentence as definition
                defn = _first_sentence(body)
                if defn:
                    tag_defs[title] = defn

    # Write outputs
    out_dir = _REPO_ROOT / "data"
    out_dir.mkdir(exist_ok=True)
    groups_path = out_dir / "tag_groups.json"
    with groups_path.open("w", encoding="utf-8") as f:
        json.dump(tag_groups, f, indent=2, ensure_ascii=False)
    print(f"\nTag groups: {len(tag_groups)} groups written to {groups_path}")
    # Show the 20 largest groups as a sanity check.
    for g, members in sorted(tag_groups.items(), key=lambda x: -len(x[1]))[:20]:
        print(f"  {g}: {len(members)} tags")
    defs_path = out_dir / "tag_wiki_defs.json"
    with defs_path.open("w", encoding="utf-8") as f:
        json.dump(tag_defs, f, indent=2, ensure_ascii=False)
    print(f"\nTag definitions: {len(tag_defs)} tags written to {defs_path}")
    # Show definitions for key structural tags
    structural = ["anthro", "feral", "humanoid", "solo", "duo", "male", "female",
                  "looking_at_viewer", "standing", "clothed", "clothing"]
    print("\nKey tag definitions:")
    for tag in structural:
        defn = tag_defs.get(tag, "(not found)")
        print(f"  {tag}: {defn[:120]}")
if __name__ == "__main__":
    main()