"""Extract tag group memberships and wiki definitions from wiki_pages CSV.
Usage:
python scripts/extract_wiki_data.py <path_to_wiki_pages_csv>
Outputs:
data/tag_groups.json — {group_name: [member_tags]}
data/tag_wiki_defs.json — {tag: first_sentence_of_wiki}
"""
from __future__ import annotations
import csv, json, re, sys
from pathlib import Path
from typing import Dict, List
# Repository root (parent of the scripts/ directory); outputs go under <root>/data.
_REPO_ROOT = Path(__file__).resolve().parents[1]
def _extract_tag_links(body: str) -> List[str]:
"""Extract tag names from DText wiki markup.
Patterns:
- [[#tagname|display]] — anchor links in tag group pages
- [[tagname]] — simple wiki links
- * [[tagname|display]] — list items
"""
tags = []
# Navigation/heading anchors to skip
_SKIP = {"top", "see_also", "related", "back", "contents", "toc"}
# Anchor links: [[#tag_name|display_text]]
for m in re.finditer(r'\[\[#([a-z0-9_]+)\|', body):
tag = m.group(1)
if tag not in _SKIP:
tags.append(tag)
# If no anchor links found, try regular wiki links in list items
if not tags:
for m in re.finditer(r'\*\s*\[\[([a-z0-9_()]+?)(?:\||\]\])', body):
tag = m.group(1)
if tag not in _SKIP and not tag.startswith('tag_group:') and not tag.startswith('tag '):
tags.append(tag)
# Deduplicate while preserving order
seen = set()
deduped = []
for t in tags:
if t not in seen:
seen.add(t)
deduped.append(t)
return deduped
def _first_sentence(body: str) -> str:
"""Extract first meaningful sentence from a wiki body for use as a tag definition."""
# Strip DText markup
text = re.sub(r'\[\[#?\w+\|([^\]]+)\]\]', r'\1', body) # [[link|text]] -> text
text = re.sub(r'\[\[([^\]|]+)\]\]', r'\1', text) # [[text]] -> text
text = re.sub(r'h[1-6]\.\s*', '', text) # headings
text = re.sub(r'\[/?[a-z]+\]', '', text) # [b], [/b], etc.
text = re.sub(r'"[^"]*":\S+', '', text) # DText links "text":url
# Find first sentence that's actually descriptive (not navigation/see-also)
for line in text.split('\n'):
line = line.strip().lstrip('* ')
if not line:
continue
if line.startswith(('Back:', 'See ', 'Related:', 'Not to be confused')):
continue
if len(line) < 10:
continue
# Skip lines that are just thumbnail references (e.g. "thumb #12345 thumb #67890")
if re.fullmatch(r'(thumb\s*#\d+\s*)+', line):
continue
# Skip lines that are mostly thumbnail references with little text
thumb_stripped = re.sub(r'thumb\s*#\d+', '', line).strip()
if len(thumb_stripped) < 10:
continue
# Use the thumb-stripped version for the definition
line = thumb_stripped
# Truncate at first period if it's a real sentence
period = line.find('. ')
if period > 20:
return line[:period + 1]
if len(line) > 30:
return line[:300]
return ""
def main():
    """CLI entry point: parse a wiki_pages CSV and write the two JSON outputs.

    Reads the CSV path from ``sys.argv[1]``; writes ``data/tag_groups.json``
    and ``data/tag_wiki_defs.json`` under the repository root, then prints a
    summary. Exits with status 1 on a missing argument, missing file, or an
    empty CSV.
    """
    if len(sys.argv) < 2:
        print("Usage: python scripts/extract_wiki_data.py <wiki_pages_csv>")
        sys.exit(1)
    csv_path = Path(sys.argv[1])
    if not csv_path.is_file():
        print(f"File not found: {csv_path}")
        sys.exit(1)

    # The CSV columns are: id, created_at, updated_at, title, body, creator_id, updater_id, is_locked
    tag_groups: Dict[str, List[str]] = {}
    tag_defs: Dict[str, str] = {}

    print(f"Reading {csv_path}...")
    # newline="" is required by the csv module: without it, quoted multi-line
    # wiki bodies can be corrupted by universal-newline translation.
    with csv_path.open("r", encoding="utf-8", newline="") as f:
        reader = csv.reader(f)
        try:
            header = next(reader)
        except StopIteration:
            # Empty file: nothing to extract.
            print(f"Empty CSV: {csv_path}")
            sys.exit(1)
        print(f"Columns: {header}")
        # Locate columns by name, falling back to the documented positions above.
        title_idx = header.index("title") if "title" in header else 3
        body_idx = header.index("body") if "body" in header else 4
        for row in reader:
            # Defensive: skip malformed/short rows.
            if len(row) <= max(title_idx, body_idx):
                continue
            title = row[title_idx].strip()
            body = row[body_idx]
            if title.startswith("tag_group:"):
                group_name = title[len("tag_group:"):]
                members = _extract_tag_links(body)
                if members:
                    tag_groups[group_name] = members
            elif not title.startswith(("help:", "howto:", "about:", "forum_")):
                # It's a tag wiki page — extract first sentence as definition
                defn = _first_sentence(body)
                if defn:
                    tag_defs[title] = defn

    # Write outputs
    out_dir = _REPO_ROOT / "data"
    out_dir.mkdir(exist_ok=True)

    groups_path = out_dir / "tag_groups.json"
    with groups_path.open("w", encoding="utf-8") as f:
        json.dump(tag_groups, f, indent=2, ensure_ascii=False)
    print(f"\nTag groups: {len(tag_groups)} groups written to {groups_path}")
    # Show the 20 largest groups as a sanity check.
    for g, members in sorted(tag_groups.items(), key=lambda x: -len(x[1]))[:20]:
        print(f"  {g}: {len(members)} tags")

    defs_path = out_dir / "tag_wiki_defs.json"
    with defs_path.open("w", encoding="utf-8") as f:
        json.dump(tag_defs, f, indent=2, ensure_ascii=False)
    print(f"\nTag definitions: {len(tag_defs)} tags written to {defs_path}")

    # Show definitions for key structural tags
    structural = ["anthro", "feral", "humanoid", "solo", "duo", "male", "female",
                  "looking_at_viewer", "standing", "clothed", "clothing"]
    print("\nKey tag definitions:")
    for tag in structural:
        defn = tag_defs.get(tag, "(not found)")
        print(f"  {tag}: {defn[:120]}")
# Script entry point guard: only run when executed directly, not on import.
if __name__ == "__main__":
    main()