"""Extract tag group memberships and wiki definitions from wiki_pages CSV.
Usage:
python scripts/extract_wiki_data.py <path_to_wiki_pages_csv>
Outputs:
data/tag_groups.json — {group_name: [member_tags]}
data/tag_wiki_defs.json — {tag: first_sentence_of_wiki}
"""
from __future__ import annotations
import csv, json, re, sys
from pathlib import Path
from typing import Dict, List
# Repository root (parent of the scripts/ directory); outputs go under <root>/data.
_REPO_ROOT = Path(__file__).resolve().parents[1]
def _extract_tag_links(body: str) -> List[str]:
"""Extract tag names from DText wiki markup.
Patterns:
- [[#tagname|display]] — anchor links in tag group pages
- [[tagname]] — simple wiki links
- * [[tagname|display]] — list items
"""
tags = []
# Navigation/heading anchors to skip
_SKIP = {"top", "see_also", "related", "back", "contents", "toc"}
# Anchor links: [[#tag_name|display_text]]
for m in re.finditer(r'\[\[#([a-z0-9_]+)\|', body):
tag = m.group(1)
if tag not in _SKIP:
tags.append(tag)
# If no anchor links found, try regular wiki links in list items
if not tags:
for m in re.finditer(r'\*\s*\[\[([a-z0-9_()]+?)(?:\||\]\])', body):
tag = m.group(1)
if tag not in _SKIP and not tag.startswith('tag_group:') and not tag.startswith('tag '):
tags.append(tag)
# Deduplicate while preserving order
seen = set()
deduped = []
for t in tags:
if t not in seen:
seen.add(t)
deduped.append(t)
return deduped
def _first_sentence(body: str) -> str:
"""Extract first meaningful sentence from a wiki body for use as a tag definition."""
# Strip DText markup
text = re.sub(r'\[\[#?\w+\|([^\]]+)\]\]', r'\1', body) # [[link|text]] -> text
text = re.sub(r'\[\[([^\]|]+)\]\]', r'\1', text) # [[text]] -> text
text = re.sub(r'h[1-6]\.\s*', '', text) # headings
text = re.sub(r'\[/?[a-z]+\]', '', text) # [b], [/b], etc.
text = re.sub(r'"[^"]*":\S+', '', text) # DText links "text":url
# Find first sentence that's actually descriptive (not navigation/see-also)
for line in text.split('\n'):
line = line.strip().lstrip('* ')
if not line:
continue
if line.startswith(('Back:', 'See ', 'Related:', 'Not to be confused')):
continue
if len(line) < 10:
continue
# Skip lines that are just thumbnail references (e.g. "thumb #12345 thumb #67890")
if re.fullmatch(r'(thumb\s*#\d+\s*)+', line):
continue
# Skip lines that are mostly thumbnail references with little text
thumb_stripped = re.sub(r'thumb\s*#\d+', '', line).strip()
if len(thumb_stripped) < 10:
continue
# Use the thumb-stripped version for the definition
line = thumb_stripped
# Truncate at first period if it's a real sentence
period = line.find('. ')
if period > 20:
return line[:period + 1]
if len(line) > 30:
return line[:300]
return ""
def main():
    """CLI entry point: parse a wiki_pages CSV and write the two JSON outputs.

    Reads the CSV path from ``sys.argv[1]``; writes ``data/tag_groups.json``
    and ``data/tag_wiki_defs.json`` under the repository root, then prints a
    summary. Exits with status 1 on a missing argument, missing file, or an
    empty CSV.
    """
    if len(sys.argv) < 2:
        print("Usage: python scripts/extract_wiki_data.py <wiki_pages_csv>")
        sys.exit(1)
    csv_path = Path(sys.argv[1])
    if not csv_path.is_file():
        print(f"File not found: {csv_path}")
        sys.exit(1)

    # The CSV columns are: id, created_at, updated_at, title, body, creator_id, updater_id, is_locked
    tag_groups: Dict[str, List[str]] = {}
    tag_defs: Dict[str, str] = {}

    print(f"Reading {csv_path}...")
    # newline="" is required by the csv module: without it, quoted multi-line
    # wiki bodies can be corrupted by universal-newline translation.
    with csv_path.open("r", encoding="utf-8", newline="") as f:
        reader = csv.reader(f)
        try:
            header = next(reader)
        except StopIteration:
            # Empty file: nothing to extract.
            print(f"Empty CSV: {csv_path}")
            sys.exit(1)
        print(f"Columns: {header}")
        # Locate columns by name, falling back to the documented positions above.
        title_idx = header.index("title") if "title" in header else 3
        body_idx = header.index("body") if "body" in header else 4
        for row in reader:
            # Defensive: skip malformed/short rows.
            if len(row) <= max(title_idx, body_idx):
                continue
            title = row[title_idx].strip()
            body = row[body_idx]
            if title.startswith("tag_group:"):
                group_name = title[len("tag_group:"):]
                members = _extract_tag_links(body)
                if members:
                    tag_groups[group_name] = members
            elif not title.startswith(("help:", "howto:", "about:", "forum_")):
                # It's a tag wiki page — extract first sentence as definition
                defn = _first_sentence(body)
                if defn:
                    tag_defs[title] = defn

    # Write outputs
    out_dir = _REPO_ROOT / "data"
    out_dir.mkdir(exist_ok=True)

    groups_path = out_dir / "tag_groups.json"
    with groups_path.open("w", encoding="utf-8") as f:
        json.dump(tag_groups, f, indent=2, ensure_ascii=False)
    print(f"\nTag groups: {len(tag_groups)} groups written to {groups_path}")
    # Show the 20 largest groups as a sanity check.
    for g, members in sorted(tag_groups.items(), key=lambda x: -len(x[1]))[:20]:
        print(f"  {g}: {len(members)} tags")

    defs_path = out_dir / "tag_wiki_defs.json"
    with defs_path.open("w", encoding="utf-8") as f:
        json.dump(tag_defs, f, indent=2, ensure_ascii=False)
    print(f"\nTag definitions: {len(tag_defs)} tags written to {defs_path}")

    # Show definitions for key structural tags
    structural = ["anthro", "feral", "humanoid", "solo", "duo", "male", "female",
                  "looking_at_viewer", "standing", "clothed", "clothing"]
    print("\nKey tag definitions:")
    for tag in structural:
        defn = tag_defs.get(tag, "(not found)")
        print(f"  {tag}: {defn[:120]}")
# Script entry point guard: only run when executed directly, not on import.
if __name__ == "__main__":
    main()