Spaces:

FoodDesert
/

Prompt_Squirrel_RAG

Sleeping

Claude committed on Feb 12

Commit

019823a

1 Parent(s): 16c5aa4

Add per-tag evidence tracking and wiki extraction script

Evidence tracking: each selected tag now records its source (stage3/structural/
implied), the LLM's 'why' level, and retrieval score. Stored in compact output
as extra_evidence (for false positives only) and in detail output as full
tag_evidence dict. Analysis script reports evidence source breakdown.

Wiki extraction: new script to parse wiki_pages CSV into tag_groups.json
(group memberships) and tag_wiki_defs.json (first-sentence definitions).
These will be used for principled structural inference and tag presentation.

https://claude.ai/code/session_019PY5TEXTWGtToUbowunSRG

Files changed (3) hide show

scripts/analyze_compact_eval.py +38 -0
scripts/eval_pipeline.py +18 -0
scripts/extract_wiki_data.py +134 -0

scripts/analyze_compact_eval.py CHANGED Viewed

@@ -173,6 +173,44 @@ def main():
             freq = tag_count.get(tag, 0)
             print(f"    {tag:40s} extra {cnt:>2}/{N}  freq={freq:>9,}")
     # ── REPORT 4: Leaf vs non-leaf in missed ──
     print("\n" + "=" * 70)
     print("MISSED: LEAF vs IMPLIED ANCESTORS")

             freq = tag_count.get(tag, 0)
             print(f"    {tag:40s} extra {cnt:>2}/{N}  freq={freq:>9,}")
+    # ── REPORT 3b: Evidence sources for false positives ──
+    # (Only available in new format with extra_evidence field)
+    source_counts = Counter()  # source -> count of FP tags
+    why_fp_counts = Counter()  # why level -> count of FP tags from stage3
+    score_buckets = {"high (>0.5)": 0, "medium (0.2-0.5)": 0, "low (<0.2)": 0}
+    has_evidence = False
+    for s in samples:
+        ev = s.get("extra_evidence", {})
+        if ev:
+            has_evidence = True
+        for tag, info in ev.items():
+            src = info.get("source", "unknown")
+            source_counts[src] += 1
+            if src == "stage3":
+                why_fp_counts[info.get("why", "unknown")] += 1
+                score = info.get("retrieval_score", 0)
+                if score > 0.5: score_buckets["high (>0.5)"] += 1
+                elif score > 0.2: score_buckets["medium (0.2-0.5)"] += 1
+                else: score_buckets["low (<0.2)"] += 1
+    if has_evidence:
+        print("\n" + "=" * 70)
+        print("FALSE POSITIVE EVIDENCE SOURCES")
+        print("=" * 70)
+        total_fp = sum(source_counts.values())
+        print(f"\n  How did {total_fp} false positive tags get through?")
+        for src, cnt in source_counts.most_common():
+            print(f"    {src:20s} {cnt:>4} ({cnt/max(1,total_fp)*100:.0f}%)")
+        if why_fp_counts:
+            print(f"\n  Stage 3 false positives by 'why' level:")
+            for why, cnt in why_fp_counts.most_common():
+                print(f"    {why:20s} {cnt:>4}")
+        print(f"\n  Stage 3 false positives by retrieval score:")
+        for bucket, cnt in score_buckets.items():
+            print(f"    {bucket:20s} {cnt:>4}")
     # ── REPORT 4: Leaf vs non-leaf in missed ──
     print("\n" + "=" * 70)
     print("MISSED: LEAF vs IMPLIED ANCESTORS")

scripts/eval_pipeline.py CHANGED Viewed

@@ -153,6 +153,8 @@ class SampleResult:
     implied_tags: Set[str] = field(default_factory=set)  # tags added via implications (not LLM-selected)
     # Structural inference tags (solo/duo/male/female/anthro/biped etc.)
     structural_tags: List[str] = field(default_factory=list)
     # Leaf-only metrics (strips implied ancestors from both sides)
     leaf_precision: float = 0.0
     leaf_recall: float = 0.0
@@ -286,6 +288,15 @@ def _process_one_sample(
         result.selected_tags = {candidates[idx].tag for idx in picked_indices} if picked_indices else set()
         # Why distribution
         why_counts: Dict[str, int] = {}
         for w in tag_why.values():
@@ -302,6 +313,8 @@ def _process_one_sample(
             result.structural_tags = structural
             # Add structural tags not already selected
             for st in structural:
                 result.selected_tags.add(st)
             log(f"Structural: {structural}")
@@ -309,6 +322,8 @@ def _process_one_sample(
         if expand_implications and result.selected_tags:
             expanded, implied_only = expand_tags_via_implications(result.selected_tags)
             result.implied_tags = implied_only
             result.selected_tags = expanded
             log(f"Implications: +{len(implied_only)} tags")
@@ -873,6 +888,8 @@ def main(argv=None) -> int:
                 # Diff sets (small — only the errors, not the full lists)
                 "missed": missed_tags,
                 "extra": extra_tags,
                 # Structural tags inferred
                 "structural": r.structural_tags,
                 # Timing
@@ -899,6 +916,7 @@ def main(argv=None) -> int:
                 "implied_tags": sorted(r.implied_tags),
                 "structural_tags": r.structural_tags,
                 "why_counts": r.why_counts,
                 "gt_character_tags": sorted(r.gt_character_tags),
                 "selected_character_tags": sorted(r.selected_character_tags),
                 "gt_general_tags": sorted(r.gt_general_tags),

     implied_tags: Set[str] = field(default_factory=set)  # tags added via implications (not LLM-selected)
     # Structural inference tags (solo/duo/male/female/anthro/biped etc.)
     structural_tags: List[str] = field(default_factory=list)
+    # Per-tag evidence: tag -> {"source": "stage3"|"structural"|"implied"}; stage3 entries also carry "why" and "retrieval_score"
+    tag_evidence: Dict[str, Dict[str, Any]] = field(default_factory=dict)
     # Leaf-only metrics (strips implied ancestors from both sides)
     leaf_precision: float = 0.0
     leaf_recall: float = 0.0
         result.selected_tags = {candidates[idx].tag for idx in picked_indices} if picked_indices else set()
+        # Build per-tag evidence from Stage 3 selection
+        for idx in picked_indices:
+            tag = candidates[idx].tag
+            result.tag_evidence[tag] = {
+                "source": "stage3",
+                "why": tag_why.get(tag, "unknown"),
+                "retrieval_score": round(candidates[idx].score_combined, 4),
+            }
         # Why distribution
         why_counts: Dict[str, int] = {}
         for w in tag_why.values():
             result.structural_tags = structural
             # Add structural tags not already selected
             for st in structural:
+                if st not in result.selected_tags:
+                    result.tag_evidence[st] = {"source": "structural"}
                 result.selected_tags.add(st)
             log(f"Structural: {structural}")
         if expand_implications and result.selected_tags:
             expanded, implied_only = expand_tags_via_implications(result.selected_tags)
             result.implied_tags = implied_only
+            for imp_tag in implied_only:
+                result.tag_evidence[imp_tag] = {"source": "implied"}
             result.selected_tags = expanded
             log(f"Implications: +{len(implied_only)} tags")
                 # Diff sets (small — only the errors, not the full lists)
                 "missed": missed_tags,
                 "extra": extra_tags,
+                # Evidence for extra tags (why did these false positives get through?)
+                "extra_evidence": {t: r.tag_evidence.get(t, {}) for t in extra_tags},
                 # Structural tags inferred
                 "structural": r.structural_tags,
                 # Timing
                 "implied_tags": sorted(r.implied_tags),
                 "structural_tags": r.structural_tags,
                 "why_counts": r.why_counts,
+                "tag_evidence": r.tag_evidence,
                 "gt_character_tags": sorted(r.gt_character_tags),
                 "selected_character_tags": sorted(r.selected_character_tags),
                 "gt_general_tags": sorted(r.gt_general_tags),

scripts/extract_wiki_data.py ADDED Viewed

	@@ -0,0 +1,134 @@

+"""Extract tag group memberships and wiki definitions from wiki_pages CSV.
+Usage:
+    python scripts/extract_wiki_data.py <path_to_wiki_pages_csv>
+Outputs:
+    data/tag_groups.json   — {group_name: [member_tags]}
+    data/tag_wiki_defs.json — {tag: first_sentence_of_wiki}
+"""
+from __future__ import annotations
+import csv, json, re, sys
+from pathlib import Path
+from typing import Dict, List
+_REPO_ROOT = Path(__file__).resolve().parents[1]
+def _extract_tag_links(body: str) -> List[str]:
+    """Extract tag names from DText wiki markup.
+    Patterns:
+    - [[#tagname|display]] — anchor links in tag group pages
+    - [[tagname]] — simple wiki links
+    - * [[tagname|display]] — list items
+    """
+    tags = []
+    # Anchor links: [[#tag_name|display_text]]
+    for m in re.finditer(r'\[\[#([a-z0-9_]+)\|', body):
+        tags.append(m.group(1))
+    # If no anchor links found, try regular wiki links in list items
+    if not tags:
+        for m in re.finditer(r'\*\s*\[\[([a-z0-9_()]+?)(?:\||\]\])', body):
+            tag = m.group(1)
+            if not tag.startswith('tag_group:') and not tag.startswith('tag '):
+                tags.append(tag)
+    return tags
+def _first_sentence(body: str) -> str:
+    """Extract first meaningful sentence from a wiki body for use as a tag definition."""
+    # Strip DText markup
+    text = re.sub(r'\[\[#?\w+\|([^\]]+)\]\]', r'\1', body)  # [[link|text]] -> text
+    text = re.sub(r'\[\[([^\]|]+)\]\]', r'\1', text)  # [[text]] -> text
+    text = re.sub(r'h[1-6]\.\s*', '', text)  # headings
+    text = re.sub(r'\[/?[a-z]+\]', '', text)  # [b], [/b], etc.
+    text = re.sub(r'"[^"]*":\S+', '', text)  # DText links "text":url
+    # Find first sentence that's actually descriptive (not navigation/see-also)
+    for line in text.split('\n'):
+        line = line.strip().lstrip('* ')
+        if not line:
+            continue
+        if line.startswith(('Back:', 'See ', 'Related:', 'Not to be confused')):
+            continue
+        if len(line) < 10:
+            continue
+        # Truncate at first period if it's a real sentence
+        period = line.find('. ')
+        if period > 20:
+            return line[:period + 1]
+        if len(line) > 30:
+            return line[:300]
+    return ""
def main():
    """Parse a wiki_pages CSV and write tag group / definition JSON files.

    Reads the CSV path from ``sys.argv[1]``. Rows whose title starts with
    ``tag_group:`` contribute to the group table via ``_extract_tag_links``;
    every other row, except help/howto/about/forum pages, contributes a
    definition via ``_first_sentence``. Results are written to
    ``data/tag_groups.json`` and ``data/tag_wiki_defs.json`` under the
    repository root, and a short summary is printed.

    Exits with status 1 when the argument is missing, the file does not
    exist, or the file contains no header row.
    """
    if len(sys.argv) < 2:
        print("Usage: python scripts/extract_wiki_data.py <wiki_pages_csv>")
        sys.exit(1)
    csv_path = Path(sys.argv[1])
    if not csv_path.is_file():
        print(f"File not found: {csv_path}")
        sys.exit(1)

    # Wiki bodies can be longer than the csv module's default per-field size
    # limit, which would abort the read with _csv.Error.  Raise the limit as
    # far as the platform allows (sys.maxsize overflows a C long on some
    # platforms, hence the back-off loop).
    field_limit = sys.maxsize
    while True:
        try:
            csv.field_size_limit(field_limit)
            break
        except OverflowError:
            field_limit //= 10

    # The CSV columns are: id, created_at, updated_at, title, body, creator_id, updater_id, is_locked
    tag_groups: Dict[str, List[str]] = {}
    tag_defs: Dict[str, str] = {}
    print(f"Reading {csv_path}...")
    # newline="" lets the csv module handle line endings itself, which is
    # required for quoted fields that contain embedded newlines.
    with csv_path.open("r", encoding="utf-8", newline="") as f:
        reader = csv.reader(f)
        header = next(reader, None)
        if header is None:
            print(f"Empty CSV file: {csv_path}")
            sys.exit(1)
        print(f"Columns: {header}")
        # Find column indices, falling back to the documented column order
        title_idx = header.index("title") if "title" in header else 3
        body_idx = header.index("body") if "body" in header else 4
        min_len = max(title_idx, body_idx)
        for row in reader:
            if len(row) <= min_len:
                continue  # short / malformed row
            title = row[title_idx].strip()
            body = row[body_idx]
            if title.startswith("tag_group:"):
                group_name = title[len("tag_group:"):]
                members = _extract_tag_links(body)
                if members:
                    tag_groups[group_name] = members
            elif not title.startswith(("help:", "howto:", "about:", "forum_")):
                # It's a tag wiki page — extract first sentence as definition
                defn = _first_sentence(body)
                if defn:
                    tag_defs[title] = defn

    # Write outputs
    out_dir = _REPO_ROOT / "data"
    out_dir.mkdir(exist_ok=True)
    groups_path = out_dir / "tag_groups.json"
    with groups_path.open("w", encoding="utf-8") as f:
        json.dump(tag_groups, f, indent=2, ensure_ascii=False)
    print(f"\nTag groups: {len(tag_groups)} groups written to {groups_path}")
    # Show the 20 largest groups
    for g, members in sorted(tag_groups.items(), key=lambda x: -len(x[1]))[:20]:
        print(f"  {g}: {len(members)} tags")

    defs_path = out_dir / "tag_wiki_defs.json"
    with defs_path.open("w", encoding="utf-8") as f:
        json.dump(tag_defs, f, indent=2, ensure_ascii=False)
    print(f"\nTag definitions: {len(tag_defs)} tags written to {defs_path}")

    # Show definitions for key structural tags
    structural = ["anthro", "feral", "humanoid", "solo", "duo", "male", "female",
                  "looking_at_viewer", "standing", "clothed", "clothing"]
    print(f"\nKey tag definitions:")
    for tag in structural:
        defn = tag_defs.get(tag, "(not found)")
        print(f"  {tag}: {defn[:120]}")


if __name__ == "__main__":
    main()