File size: 5,815 Bytes
019823a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
684cf99
 
019823a
 
684cf99
 
 
019823a
 
 
 
684cf99
019823a
684cf99
 
 
 
 
 
 
 
019823a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
684cf99
 
 
 
 
 
 
 
 
019823a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
"""Extract tag group memberships and wiki definitions from wiki_pages CSV.



Usage:

    python scripts/extract_wiki_data.py <path_to_wiki_pages_csv>



Outputs:

    data/tag_groups.json   — {group_name: [member_tags]}

    data/tag_wiki_defs.json — {tag: first_sentence_of_wiki}

"""
from __future__ import annotations
import csv, json, re, sys
from pathlib import Path
from typing import Dict, List

# Repository root — this script lives in <repo>/scripts/, so one parent up
# from the script's directory is the repo root where data/ is written.
_REPO_ROOT = Path(__file__).resolve().parents[1]


def _extract_tag_links(body: str) -> List[str]:
    """Extract tag names from DText wiki markup.



    Patterns:

    - [[#tagname|display]] — anchor links in tag group pages

    - [[tagname]] — simple wiki links

    - * [[tagname|display]] — list items

    """
    tags = []
    # Navigation/heading anchors to skip
    _SKIP = {"top", "see_also", "related", "back", "contents", "toc"}
    # Anchor links: [[#tag_name|display_text]]
    for m in re.finditer(r'\[\[#([a-z0-9_]+)\|', body):
        tag = m.group(1)
        if tag not in _SKIP:
            tags.append(tag)
    # If no anchor links found, try regular wiki links in list items
    if not tags:
        for m in re.finditer(r'\*\s*\[\[([a-z0-9_()]+?)(?:\||\]\])', body):
            tag = m.group(1)
            if tag not in _SKIP and not tag.startswith('tag_group:') and not tag.startswith('tag '):
                tags.append(tag)
    # Deduplicate while preserving order
    seen = set()
    deduped = []
    for t in tags:
        if t not in seen:
            seen.add(t)
            deduped.append(t)
    return deduped


def _first_sentence(body: str) -> str:
    """Extract first meaningful sentence from a wiki body for use as a tag definition."""
    # Strip DText markup
    text = re.sub(r'\[\[#?\w+\|([^\]]+)\]\]', r'\1', body)  # [[link|text]] -> text
    text = re.sub(r'\[\[([^\]|]+)\]\]', r'\1', text)  # [[text]] -> text
    text = re.sub(r'h[1-6]\.\s*', '', text)  # headings
    text = re.sub(r'\[/?[a-z]+\]', '', text)  # [b], [/b], etc.
    text = re.sub(r'"[^"]*":\S+', '', text)  # DText links "text":url

    # Find first sentence that's actually descriptive (not navigation/see-also)
    for line in text.split('\n'):
        line = line.strip().lstrip('* ')
        if not line:
            continue
        if line.startswith(('Back:', 'See ', 'Related:', 'Not to be confused')):
            continue
        if len(line) < 10:
            continue
        # Skip lines that are just thumbnail references (e.g. "thumb #12345 thumb #67890")
        if re.fullmatch(r'(thumb\s*#\d+\s*)+', line):
            continue
        # Skip lines that are mostly thumbnail references with little text
        thumb_stripped = re.sub(r'thumb\s*#\d+', '', line).strip()
        if len(thumb_stripped) < 10:
            continue
        # Use the thumb-stripped version for the definition
        line = thumb_stripped
        # Truncate at first period if it's a real sentence
        period = line.find('. ')
        if period > 20:
            return line[:period + 1]
        if len(line) > 30:
            return line[:300]
    return ""


def main():
    """CLI entry point: read a wiki_pages CSV and write tag-group/definition JSON.

    Expects sys.argv[1] to be the path of the wiki_pages CSV dump. Writes
    data/tag_groups.json ({group_name: [member_tags]}) and
    data/tag_wiki_defs.json ({tag: first_sentence}) under the repo root,
    then prints a summary. Exits with status 1 on missing/invalid arguments.
    """
    if len(sys.argv) < 2:
        print("Usage: python scripts/extract_wiki_data.py <wiki_pages_csv>")
        sys.exit(1)

    csv_path = Path(sys.argv[1])
    if not csv_path.is_file():
        print(f"File not found: {csv_path}")
        sys.exit(1)

    # Wiki bodies routinely exceed csv's default 128 KiB field limit; raise it
    # so long pages don't abort the run with _csv.Error. The min() guards
    # against OverflowError on platforms where the C long is 32-bit.
    csv.field_size_limit(min(sys.maxsize, 2**31 - 1))

    # The CSV columns are: id, created_at, updated_at, title, body, creator_id, updater_id, is_locked
    tag_groups: Dict[str, List[str]] = {}
    tag_defs: Dict[str, str] = {}

    print(f"Reading {csv_path}...")
    # newline="" is required by the csv module so quoted embedded newlines
    # inside wiki bodies are parsed correctly.
    with csv_path.open("r", encoding="utf-8", newline="") as f:
        reader = csv.reader(f)
        header = next(reader)
        print(f"Columns: {header}")

        # Resolve column positions by name, falling back to the documented layout.
        title_idx = header.index("title") if "title" in header else 3
        body_idx = header.index("body") if "body" in header else 4

        for row in reader:
            # Skip malformed/truncated rows.
            if len(row) <= max(title_idx, body_idx):
                continue
            title = row[title_idx].strip()
            body = row[body_idx]

            if title.startswith("tag_group:"):
                # Tag-group page: collect the member tags it links to.
                group_name = title[len("tag_group:"):]
                members = _extract_tag_links(body)
                if members:
                    tag_groups[group_name] = members

            elif not title.startswith(("help:", "howto:", "about:", "forum_")):
                # Ordinary tag wiki page — its first sentence becomes the definition.
                defn = _first_sentence(body)
                if defn:
                    tag_defs[title] = defn

    # Write outputs under <repo>/data.
    out_dir = _REPO_ROOT / "data"
    out_dir.mkdir(exist_ok=True)

    groups_path = out_dir / "tag_groups.json"
    with groups_path.open("w", encoding="utf-8") as f:
        json.dump(tag_groups, f, indent=2, ensure_ascii=False)
    print(f"\nTag groups: {len(tag_groups)} groups written to {groups_path}")
    # Show the 20 largest groups as a sanity check.
    for g, members in sorted(tag_groups.items(), key=lambda x: -len(x[1]))[:20]:
        print(f"  {g}: {len(members)} tags")

    defs_path = out_dir / "tag_wiki_defs.json"
    with defs_path.open("w", encoding="utf-8") as f:
        json.dump(tag_defs, f, indent=2, ensure_ascii=False)
    print(f"\nTag definitions: {len(tag_defs)} tags written to {defs_path}")

    # Spot-check definitions for key structural tags.
    structural = ["anthro", "feral", "humanoid", "solo", "duo", "male", "female",
                   "looking_at_viewer", "standing", "clothed", "clothing"]
    print("\nKey tag definitions:")
    for tag in structural:
        defn = tag_defs.get(tag, "(not found)")
        print(f"  {tag}: {defn[:120]}")


# Run only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()