Claude committed on
Commit
019823a
·
1 Parent(s): 16c5aa4

Add per-tag evidence tracking and wiki extraction script

Browse files

Evidence tracking: each selected tag now records its source (stage3/structural/
implied), the LLM's 'why' level, and retrieval score. Stored in compact output
as extra_evidence (for false positives only) and in detail output as full
tag_evidence dict. Analysis script reports evidence source breakdown.

Wiki extraction: new script to parse wiki_pages CSV into tag_groups.json
(group memberships) and tag_wiki_defs.json (first-sentence definitions).
These will be used for principled structural inference and tag presentation.

https://claude.ai/code/session_019PY5TEXTWGtToUbowunSRG

scripts/analyze_compact_eval.py CHANGED
@@ -173,6 +173,44 @@ def main():
173
  freq = tag_count.get(tag, 0)
174
  print(f" {tag:40s} extra {cnt:>2}/{N} freq={freq:>9,}")
175
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
176
  # ── REPORT 4: Leaf vs non-leaf in missed ──
177
  print("\n" + "=" * 70)
178
  print("MISSED: LEAF vs IMPLIED ANCESTORS")
 
173
  freq = tag_count.get(tag, 0)
174
  print(f" {tag:40s} extra {cnt:>2}/{N} freq={freq:>9,}")
175
 
176
+ # ── REPORT 3b: Evidence sources for false positives ──
177
+ # (Only available in new format with extra_evidence field)
178
+ source_counts = Counter() # source -> count of FP tags
179
+ why_fp_counts = Counter() # why level -> count of FP tags from stage3
180
+ score_buckets = {"high (>0.5)": 0, "medium (0.2-0.5)": 0, "low (<0.2)": 0}
181
+ has_evidence = False
182
+ for s in samples:
183
+ ev = s.get("extra_evidence", {})
184
+ if ev:
185
+ has_evidence = True
186
+ for tag, info in ev.items():
187
+ src = info.get("source", "unknown")
188
+ source_counts[src] += 1
189
+ if src == "stage3":
190
+ why_fp_counts[info.get("why", "unknown")] += 1
191
+ score = info.get("retrieval_score", 0)
192
+ if score > 0.5: score_buckets["high (>0.5)"] += 1
193
+ elif score > 0.2: score_buckets["medium (0.2-0.5)"] += 1
194
+ else: score_buckets["low (<0.2)"] += 1
195
+
196
+ if has_evidence:
197
+ print("\n" + "=" * 70)
198
+ print("FALSE POSITIVE EVIDENCE SOURCES")
199
+ print("=" * 70)
200
+ total_fp = sum(source_counts.values())
201
+ print(f"\n How did {total_fp} false positive tags get through?")
202
+ for src, cnt in source_counts.most_common():
203
+ print(f" {src:20s} {cnt:>4} ({cnt/max(1,total_fp)*100:.0f}%)")
204
+
205
+ if why_fp_counts:
206
+ print(f"\n Stage 3 false positives by 'why' level:")
207
+ for why, cnt in why_fp_counts.most_common():
208
+ print(f" {why:20s} {cnt:>4}")
209
+
210
+ print(f"\n Stage 3 false positives by retrieval score:")
211
+ for bucket, cnt in score_buckets.items():
212
+ print(f" {bucket:20s} {cnt:>4}")
213
+
214
  # ── REPORT 4: Leaf vs non-leaf in missed ──
215
  print("\n" + "=" * 70)
216
  print("MISSED: LEAF vs IMPLIED ANCESTORS")
scripts/eval_pipeline.py CHANGED
@@ -153,6 +153,8 @@ class SampleResult:
153
  implied_tags: Set[str] = field(default_factory=set) # tags added via implications (not LLM-selected)
154
  # Structural inference tags (solo/duo/male/female/anthro/biped etc.)
155
  structural_tags: List[str] = field(default_factory=list)
 
 
156
  # Leaf-only metrics (strips implied ancestors from both sides)
157
  leaf_precision: float = 0.0
158
  leaf_recall: float = 0.0
@@ -286,6 +288,15 @@ def _process_one_sample(
286
 
287
  result.selected_tags = {candidates[idx].tag for idx in picked_indices} if picked_indices else set()
288
 
 
 
 
 
 
 
 
 
 
289
  # Why distribution
290
  why_counts: Dict[str, int] = {}
291
  for w in tag_why.values():
@@ -302,6 +313,8 @@ def _process_one_sample(
302
  result.structural_tags = structural
303
  # Add structural tags not already selected
304
  for st in structural:
 
 
305
  result.selected_tags.add(st)
306
  log(f"Structural: {structural}")
307
 
@@ -309,6 +322,8 @@ def _process_one_sample(
309
  if expand_implications and result.selected_tags:
310
  expanded, implied_only = expand_tags_via_implications(result.selected_tags)
311
  result.implied_tags = implied_only
 
 
312
  result.selected_tags = expanded
313
  log(f"Implications: +{len(implied_only)} tags")
314
 
@@ -873,6 +888,8 @@ def main(argv=None) -> int:
873
  # Diff sets (small — only the errors, not the full lists)
874
  "missed": missed_tags,
875
  "extra": extra_tags,
 
 
876
  # Structural tags inferred
877
  "structural": r.structural_tags,
878
  # Timing
@@ -899,6 +916,7 @@ def main(argv=None) -> int:
899
  "implied_tags": sorted(r.implied_tags),
900
  "structural_tags": r.structural_tags,
901
  "why_counts": r.why_counts,
 
902
  "gt_character_tags": sorted(r.gt_character_tags),
903
  "selected_character_tags": sorted(r.selected_character_tags),
904
  "gt_general_tags": sorted(r.gt_general_tags),
 
153
  implied_tags: Set[str] = field(default_factory=set) # tags added via implications (not LLM-selected)
154
  # Structural inference tags (solo/duo/male/female/anthro/biped etc.)
155
  structural_tags: List[str] = field(default_factory=list)
156
+ # Per-tag evidence: tag -> {"source": "stage3"|"structural"|"implied", "why": ..., "score": ...}
157
+ tag_evidence: Dict[str, Dict[str, Any]] = field(default_factory=dict)
158
  # Leaf-only metrics (strips implied ancestors from both sides)
159
  leaf_precision: float = 0.0
160
  leaf_recall: float = 0.0
 
288
 
289
  result.selected_tags = {candidates[idx].tag for idx in picked_indices} if picked_indices else set()
290
 
291
+ # Build per-tag evidence from Stage 3 selection
292
+ for idx in picked_indices:
293
+ tag = candidates[idx].tag
294
+ result.tag_evidence[tag] = {
295
+ "source": "stage3",
296
+ "why": tag_why.get(tag, "unknown"),
297
+ "retrieval_score": round(candidates[idx].score_combined, 4),
298
+ }
299
+
300
  # Why distribution
301
  why_counts: Dict[str, int] = {}
302
  for w in tag_why.values():
 
313
  result.structural_tags = structural
314
  # Add structural tags not already selected
315
  for st in structural:
316
+ if st not in result.selected_tags:
317
+ result.tag_evidence[st] = {"source": "structural"}
318
  result.selected_tags.add(st)
319
  log(f"Structural: {structural}")
320
 
 
322
  if expand_implications and result.selected_tags:
323
  expanded, implied_only = expand_tags_via_implications(result.selected_tags)
324
  result.implied_tags = implied_only
325
+ for imp_tag in implied_only:
326
+ result.tag_evidence[imp_tag] = {"source": "implied"}
327
  result.selected_tags = expanded
328
  log(f"Implications: +{len(implied_only)} tags")
329
 
 
888
  # Diff sets (small — only the errors, not the full lists)
889
  "missed": missed_tags,
890
  "extra": extra_tags,
891
+ # Evidence for extra tags (why did these false positives get through?)
892
+ "extra_evidence": {t: r.tag_evidence.get(t, {}) for t in extra_tags},
893
  # Structural tags inferred
894
  "structural": r.structural_tags,
895
  # Timing
 
916
  "implied_tags": sorted(r.implied_tags),
917
  "structural_tags": r.structural_tags,
918
  "why_counts": r.why_counts,
919
+ "tag_evidence": r.tag_evidence,
920
  "gt_character_tags": sorted(r.gt_character_tags),
921
  "selected_character_tags": sorted(r.selected_character_tags),
922
  "gt_general_tags": sorted(r.gt_general_tags),
scripts/extract_wiki_data.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Extract tag group memberships and wiki definitions from wiki_pages CSV.
2
+
3
+ Usage:
4
+ python scripts/extract_wiki_data.py <path_to_wiki_pages_csv>
5
+
6
+ Outputs:
7
+ data/tag_groups.json — {group_name: [member_tags]}
8
+ data/tag_wiki_defs.json — {tag: first_sentence_of_wiki}
9
+ """
10
+ from __future__ import annotations
11
+ import csv, json, re, sys
12
+ from pathlib import Path
13
+ from typing import Dict, List
14
+
15
+ _REPO_ROOT = Path(__file__).resolve().parents[1]
16
+
17
+
18
+ def _extract_tag_links(body: str) -> List[str]:
19
+ """Extract tag names from DText wiki markup.
20
+
21
+ Patterns:
22
+ - [[#tagname|display]] — anchor links in tag group pages
23
+ - [[tagname]] — simple wiki links
24
+ - * [[tagname|display]] — list items
25
+ """
26
+ tags = []
27
+ # Anchor links: [[#tag_name|display_text]]
28
+ for m in re.finditer(r'\[\[#([a-z0-9_]+)\|', body):
29
+ tags.append(m.group(1))
30
+ # If no anchor links found, try regular wiki links in list items
31
+ if not tags:
32
+ for m in re.finditer(r'\*\s*\[\[([a-z0-9_()]+?)(?:\||\]\])', body):
33
+ tag = m.group(1)
34
+ if not tag.startswith('tag_group:') and not tag.startswith('tag '):
35
+ tags.append(tag)
36
+ return tags
37
+
38
+
39
+ def _first_sentence(body: str) -> str:
40
+ """Extract first meaningful sentence from a wiki body for use as a tag definition."""
41
+ # Strip DText markup
42
+ text = re.sub(r'\[\[#?\w+\|([^\]]+)\]\]', r'\1', body) # [[link|text]] -> text
43
+ text = re.sub(r'\[\[([^\]|]+)\]\]', r'\1', text) # [[text]] -> text
44
+ text = re.sub(r'h[1-6]\.\s*', '', text) # headings
45
+ text = re.sub(r'\[/?[a-z]+\]', '', text) # [b], [/b], etc.
46
+ text = re.sub(r'"[^"]*":\S+', '', text) # DText links "text":url
47
+
48
+ # Find first sentence that's actually descriptive (not navigation/see-also)
49
+ for line in text.split('\n'):
50
+ line = line.strip().lstrip('* ')
51
+ if not line:
52
+ continue
53
+ if line.startswith(('Back:', 'See ', 'Related:', 'Not to be confused')):
54
+ continue
55
+ if len(line) < 10:
56
+ continue
57
+ # Truncate at first period if it's a real sentence
58
+ period = line.find('. ')
59
+ if period > 20:
60
+ return line[:period + 1]
61
+ if len(line) > 30:
62
+ return line[:300]
63
+ return ""
64
+
65
+
66
def main():
    """Parse a wiki_pages CSV into tag-group and tag-definition JSON files.

    Reads the CSV path from ``sys.argv[1]`` and writes:
      data/tag_groups.json    — {group_name: [member_tags]}
      data/tag_wiki_defs.json — {tag: first_sentence_of_wiki}

    Exits with status 1 on a missing argument or nonexistent file.
    """
    if len(sys.argv) < 2:
        print("Usage: python scripts/extract_wiki_data.py <wiki_pages_csv>")
        sys.exit(1)

    csv_path = Path(sys.argv[1])
    if not csv_path.is_file():
        print(f"File not found: {csv_path}")
        sys.exit(1)

    # Wiki bodies can exceed csv's default 128 KiB field limit, which would
    # raise _csv.Error mid-file. Raise the limit (capped at 2**31-1 because
    # field_size_limit overflows where the C long is 32-bit).
    csv.field_size_limit(min(sys.maxsize, 2**31 - 1))

    # The CSV columns are: id, created_at, updated_at, title, body, creator_id, updater_id, is_locked
    tag_groups: Dict[str, List[str]] = {}
    tag_defs: Dict[str, str] = {}

    print(f"Reading {csv_path}...")
    with csv_path.open("r", encoding="utf-8") as f:
        reader = csv.reader(f)
        header = next(reader)
        print(f"Columns: {header}")

        # Find column indices; fall back to the known layout if headers differ.
        title_idx = header.index("title") if "title" in header else 3
        body_idx = header.index("body") if "body" in header else 4

        for row in reader:
            if len(row) <= max(title_idx, body_idx):
                continue  # malformed/short row — skip rather than crash
            title = row[title_idx].strip()
            body = row[body_idx]

            if title.startswith("tag_group:"):
                group_name = title[len("tag_group:"):]
                members = _extract_tag_links(body)
                if members:
                    tag_groups[group_name] = members

            elif not title.startswith(("help:", "howto:", "about:", "forum_")):
                # It's a tag wiki page — extract first sentence as definition
                defn = _first_sentence(body)
                if defn:
                    tag_defs[title] = defn

    # Write outputs (parents=True so a fresh checkout without data/ still works).
    out_dir = _REPO_ROOT / "data"
    out_dir.mkdir(parents=True, exist_ok=True)

    groups_path = out_dir / "tag_groups.json"
    with groups_path.open("w", encoding="utf-8") as f:
        json.dump(tag_groups, f, indent=2, ensure_ascii=False)
    print(f"\nTag groups: {len(tag_groups)} groups written to {groups_path}")
    for g, members in sorted(tag_groups.items(), key=lambda x: -len(x[1]))[:20]:
        print(f"  {g}: {len(members)} tags")

    defs_path = out_dir / "tag_wiki_defs.json"
    with defs_path.open("w", encoding="utf-8") as f:
        json.dump(tag_defs, f, indent=2, ensure_ascii=False)
    print(f"\nTag definitions: {len(tag_defs)} tags written to {defs_path}")

    # Spot-check: show definitions for key structural tags.
    structural = ["anthro", "feral", "humanoid", "solo", "duo", "male", "female",
                  "looking_at_viewer", "standing", "clothed", "clothing"]
    print("\nKey tag definitions:")
    for tag in structural:
        defn = tag_defs.get(tag, "(not found)")
        print(f"  {tag}: {defn[:120]}")


if __name__ == "__main__":
    main()