Claude committed on
Commit
14e5c38
·
1 Parent(s): 6fc4b56

Normalize GT annotations: expand implications, exclude non-evaluable tags

Browse files

Addresses annotation inconsistency where 30% of GT samples were missing
implied taxonomy tags (e.g. fox present but canid/mammal absent).

- preprocess_eval_data.py: expands GT through implication graph, writes
_expanded.jsonl with tags_ground_truth_expanded field
- eval_pipeline.py: uses expanded GT, strips _EVAL_EXCLUDED_TAGS
(invalid_*, hi_res, structural backgrounds) from both sides,
reports leaf-only metrics alongside expanded metrics
- state.py: adds get_leaf_tags() to strip implied ancestors from a tag set

https://claude.ai/code/session_019PY5TEXTWGtToUbowunSRG

data/eval_samples/e621_sfw_sample_1000_seed123_buffer10000_expanded.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
psq_rag/retrieval/state.py CHANGED
@@ -327,6 +327,29 @@ def expand_tags_via_implications(tags: Set[str]) -> Tuple[Set[str], Set[str]]:
327
  return expanded, implied_only
328
 
329
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
330
  def get_tfidf_tag_vectors() -> Dict[str, Any]:
331
  global _tfidf_tag_vectors
332
  if _tfidf_tag_vectors is not None:
 
327
  return expanded, implied_only
328
 
329
 
330
def get_leaf_tags(tags: Set[str]) -> Set[str]:
    """Return only leaf tags — those not implied by any other tag in the set.

    For example, given {fox, canine, canid, mammal}, returns {fox} because
    canine/canid/mammal are all reachable from fox via implications.
    """
    implications = get_tag_implications()
    # Every tag reachable from some member of `tags` that is itself in `tags`
    # is an implied ancestor and therefore not a leaf.
    implied_ancestors: Set[str] = set()
    for start in tags:
        # Depth-first walk of everything `start` implies (transitively).
        seen: Set[str] = set()
        stack = [start]
        while stack:
            current = stack.pop()
            for ancestor in implications.get(current, ()):
                if ancestor in seen:
                    continue
                seen.add(ancestor)
                if ancestor in tags:
                    implied_ancestors.add(ancestor)
                stack.append(ancestor)
    return tags - implied_ancestors
351
+
352
+
353
  def get_tfidf_tag_vectors() -> Dict[str, Any]:
354
  global _tfidf_tag_vectors
355
  if _tfidf_tag_vectors is not None:
scripts/eval_pipeline.py CHANGED
@@ -57,13 +57,29 @@ if str(_REPO_ROOT) not in sys.path:
57
  sys.path.insert(0, str(_REPO_ROOT))
58
  os.chdir(_REPO_ROOT)
59
 
60
- EVAL_DATA_PATH = _REPO_ROOT / "data" / "eval_samples" / "e621_sfw_sample_1000_seed123_buffer10000.jsonl"
 
61
 
62
  # Character tag types that go through the alias filter pipeline
63
  _CHARACTER_TYPES = {"character"}
64
  # Copyright tags are filtered out entirely
65
  _COPYRIGHT_TYPES = {"copyright"}
66
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
 
68
  def _classify_tags(tags: Set[str], get_type_fn) -> Tuple[Set[str], Set[str]]:
69
  """Split tags into (character_tags, general_tags).
@@ -135,6 +151,12 @@ class SampleResult:
135
  why_counts: Dict[str, int] = field(default_factory=dict)
136
  # Tag implications
137
  implied_tags: Set[str] = field(default_factory=set) # tags added via implications (not LLM-selected)
 
 
 
 
 
 
138
  # Timing
139
  stage1_time: float = 0.0
140
  stage2_time: float = 0.0
@@ -179,7 +201,7 @@ def _process_one_sample(
179
  from psq_rag.llm.rewrite import llm_rewrite_prompt
180
  from psq_rag.retrieval.psq_retrieval import psq_candidates_from_rewrite_phrases
181
  from psq_rag.llm.select import llm_select_indices
182
- from psq_rag.retrieval.state import get_tag_type_name, expand_tags_via_implications
183
 
184
  def log(msg: str) -> None:
185
  if verbose:
@@ -273,13 +295,27 @@ def _process_one_sample(
273
  result.selected_tags = expanded
274
  log(f"Implications: +{len(implied_only)} tags")
275
 
276
- # Overall selection metrics
 
 
 
 
277
  p, r, f1 = _compute_metrics(result.selected_tags, gt_tags)
278
  result.selection_precision = p
279
  result.selection_recall = r
280
  result.selection_f1 = f1
281
 
282
- # New diagnostic metrics
 
 
 
 
 
 
 
 
 
 
283
  retrieved_and_gt = result.retrieved_tags & gt_tags
284
  selected_and_gt = result.selected_tags & gt_tags
285
  if result.retrieved_tags:
@@ -370,26 +406,41 @@ def run_eval(
370
  expand_implications: bool = False,
371
  ) -> List[SampleResult]:
372
 
373
- # Load eval samples
374
- if not EVAL_DATA_PATH.is_file():
375
- print(f"ERROR: Eval data not found: {EVAL_DATA_PATH}")
376
- sys.exit(1)
 
 
 
 
 
377
 
378
  all_samples = []
379
- with EVAL_DATA_PATH.open("r", encoding="utf-8") as f:
 
380
  for line in f:
381
  row = json.loads(line)
382
  caption = row.get(caption_field, "")
383
  if not caption or not caption.strip():
384
  continue
385
- gt_tags = _flatten_ground_truth_tags(row.get("tags_ground_truth_categorized", ""))
 
 
 
 
 
386
  if not gt_tags:
387
  continue
 
 
388
  all_samples.append({
389
  "id": row.get("id", row.get("row_id", len(all_samples))),
390
  "caption": caption.strip(),
391
  "gt_tags": gt_tags,
392
  })
 
 
393
 
394
  if shuffle:
395
  rng = random.Random(seed)
@@ -512,6 +563,21 @@ def print_summary(results: List[SampleResult]) -> None:
512
  if avg_implied > 0:
513
  print(f" Avg implied tags: {avg_implied:.1f} (added via tag implications)")
514
  print(f" Avg ground-truth tags:{avg_gt:.1f}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
515
  print()
516
  print("Diagnostic Metrics:")
517
  print(f" Retrieval precision: {avg_retrieval_precision:.4f} (|ret∩gt|/|ret|, noise level fed to Stage 3)")
@@ -761,6 +827,12 @@ def main(argv=None) -> int:
761
  "over_selection_ratio": round(r.over_selection_ratio, 2),
762
  "why_counts": r.why_counts,
763
  "implied_tags": sorted(r.implied_tags),
 
 
 
 
 
 
764
  # Timing
765
  "stage1_time": round(r.stage1_time, 3),
766
  "stage2_time": round(r.stage2_time, 3),
 
57
  sys.path.insert(0, str(_REPO_ROOT))
58
  os.chdir(_REPO_ROOT)
59
 
60
+ EVAL_DATA_PATH = _REPO_ROOT / "data" / "eval_samples" / "e621_sfw_sample_1000_seed123_buffer10000_expanded.jsonl"
61
+ EVAL_DATA_PATH_RAW = _REPO_ROOT / "data" / "eval_samples" / "e621_sfw_sample_1000_seed123_buffer10000.jsonl"
62
 
63
  # Character tag types that go through the alias filter pipeline
64
  _CHARACTER_TYPES = {"character"}
65
  # Copyright tags are filtered out entirely
66
  _COPYRIGHT_TYPES = {"copyright"}
67
 
68
+ # Tags excluded from evaluation metrics but NOT removed from the pipeline.
69
+ # These are tags that either: can't be inferred from a caption (resolution,
70
+ # art medium), describe structural properties better handled outside the
71
+ # retrieval pipeline (backgrounds), or are annotation artifacts.
72
+ _EVAL_EXCLUDED_TAGS = frozenset({
73
+ # Annotation artifacts
74
+ "invalid_tag", "invalid_background",
75
+ # Resolution / file meta — not inferrable from caption
76
+ "hi_res", "absurd_res", "low_res", "superabsurd_res",
77
+ # Structural background tags — better recommended independently
78
+ "simple_background", "abstract_background", "detailed_background",
79
+ "gradient_background", "blurred_background", "textured_background",
80
+ "transparent_background", "white_background",
81
+ })
82
+
83
 
84
  def _classify_tags(tags: Set[str], get_type_fn) -> Tuple[Set[str], Set[str]]:
85
  """Split tags into (character_tags, general_tags).
 
151
  why_counts: Dict[str, int] = field(default_factory=dict)
152
  # Tag implications
153
  implied_tags: Set[str] = field(default_factory=set) # tags added via implications (not LLM-selected)
154
+ # Leaf-only metrics (strips implied ancestors from both sides)
155
+ leaf_precision: float = 0.0
156
+ leaf_recall: float = 0.0
157
+ leaf_f1: float = 0.0
158
+ leaf_selected_count: int = 0
159
+ leaf_gt_count: int = 0
160
  # Timing
161
  stage1_time: float = 0.0
162
  stage2_time: float = 0.0
 
201
  from psq_rag.llm.rewrite import llm_rewrite_prompt
202
  from psq_rag.retrieval.psq_retrieval import psq_candidates_from_rewrite_phrases
203
  from psq_rag.llm.select import llm_select_indices
204
+ from psq_rag.retrieval.state import get_tag_type_name, expand_tags_via_implications, get_leaf_tags
205
 
206
  def log(msg: str) -> None:
207
  if verbose:
 
295
  result.selected_tags = expanded
296
  log(f"Implications: +{len(implied_only)} tags")
297
 
298
+ # Remove eval-excluded tags from predictions before scoring
299
+ result.selected_tags -= _EVAL_EXCLUDED_TAGS
300
+ result.retrieved_tags -= _EVAL_EXCLUDED_TAGS
301
+
302
+ # Overall selection metrics (expanded — both sides have full implication chains)
303
  p, r, f1 = _compute_metrics(result.selected_tags, gt_tags)
304
  result.selection_precision = p
305
  result.selection_recall = r
306
  result.selection_f1 = f1
307
 
308
+ # Leaf-only metrics (strips implied ancestors from both sides)
309
+ leaf_sel = get_leaf_tags(result.selected_tags)
310
+ leaf_gt = get_leaf_tags(gt_tags)
311
+ lp, lr, lf1 = _compute_metrics(leaf_sel, leaf_gt)
312
+ result.leaf_precision = lp
313
+ result.leaf_recall = lr
314
+ result.leaf_f1 = lf1
315
+ result.leaf_selected_count = len(leaf_sel)
316
+ result.leaf_gt_count = len(leaf_gt)
317
+
318
+ # Diagnostic metrics
319
  retrieved_and_gt = result.retrieved_tags & gt_tags
320
  selected_and_gt = result.selected_tags & gt_tags
321
  if result.retrieved_tags:
 
406
  expand_implications: bool = False,
407
  ) -> List[SampleResult]:
408
 
409
+ # Load eval samples — prefer expanded file, fall back to raw
410
+ eval_path = EVAL_DATA_PATH
411
+ if not eval_path.is_file():
412
+ eval_path = EVAL_DATA_PATH_RAW
413
+ if not eval_path.is_file():
414
+ print(f"ERROR: Eval data not found: {EVAL_DATA_PATH}")
415
+ sys.exit(1)
416
+ print(f"WARNING: Expanded eval data not found, falling back to raw: {eval_path}")
417
+ print(" Run: python scripts/preprocess_eval_data.py")
418
 
419
  all_samples = []
420
+ using_expanded = False
421
+ with eval_path.open("r", encoding="utf-8") as f:
422
  for line in f:
423
  row = json.loads(line)
424
  caption = row.get(caption_field, "")
425
  if not caption or not caption.strip():
426
  continue
427
+ # Prefer pre-expanded GT; fall back to flattening categorized
428
+ if "tags_ground_truth_expanded" in row:
429
+ gt_tags = set(row["tags_ground_truth_expanded"])
430
+ using_expanded = True
431
+ else:
432
+ gt_tags = _flatten_ground_truth_tags(row.get("tags_ground_truth_categorized", ""))
433
  if not gt_tags:
434
  continue
435
+ # Remove eval-excluded tags from GT
436
+ gt_tags -= _EVAL_EXCLUDED_TAGS
437
  all_samples.append({
438
  "id": row.get("id", row.get("row_id", len(all_samples))),
439
  "caption": caption.strip(),
440
  "gt_tags": gt_tags,
441
  })
442
+ if using_expanded:
443
+ print("Using implication-expanded ground truth")
444
 
445
  if shuffle:
446
  rng = random.Random(seed)
 
563
  if avg_implied > 0:
564
  print(f" Avg implied tags: {avg_implied:.1f} (added via tag implications)")
565
  print(f" Avg ground-truth tags:{avg_gt:.1f}")
566
+
567
+ # Leaf-only metrics
568
+ avg_leaf_p = _safe_avg([r.leaf_precision for r in valid])
569
+ avg_leaf_r = _safe_avg([r.leaf_recall for r in valid])
570
+ avg_leaf_f1 = _safe_avg([r.leaf_f1 for r in valid])
571
+ avg_leaf_sel = _safe_avg([r.leaf_selected_count for r in valid])
572
+ avg_leaf_gt = _safe_avg([r.leaf_gt_count for r in valid])
573
+ print()
574
+ print("Stage 3 - Selection (LEAF tags only — implied ancestors stripped):")
575
+ print(f" Avg precision: {avg_leaf_p:.4f}")
576
+ print(f" Avg recall: {avg_leaf_r:.4f}")
577
+ print(f" Avg F1: {avg_leaf_f1:.4f}")
578
+ print(f" Avg leaf selected: {avg_leaf_sel:.1f}")
579
+ print(f" Avg leaf ground-truth:{avg_leaf_gt:.1f}")
580
+
581
  print()
582
  print("Diagnostic Metrics:")
583
  print(f" Retrieval precision: {avg_retrieval_precision:.4f} (|ret∩gt|/|ret|, noise level fed to Stage 3)")
 
827
  "over_selection_ratio": round(r.over_selection_ratio, 2),
828
  "why_counts": r.why_counts,
829
  "implied_tags": sorted(r.implied_tags),
830
+ # Leaf metrics
831
+ "leaf_precision": round(r.leaf_precision, 4),
832
+ "leaf_recall": round(r.leaf_recall, 4),
833
+ "leaf_f1": round(r.leaf_f1, 4),
834
+ "leaf_selected_count": r.leaf_selected_count,
835
+ "leaf_gt_count": r.leaf_gt_count,
836
  # Timing
837
  "stage1_time": round(r.stage1_time, 3),
838
  "stage2_time": round(r.stage2_time, 3),
scripts/preprocess_eval_data.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Preprocess eval dataset: expand ground-truth tags through implication chains.
2
+
3
+ Reads the raw eval JSONL, expands each sample's GT tags via the e621 tag
4
+ implication graph, removes known garbage tags, and writes a new JSONL with
5
+ an additional `tags_ground_truth_expanded` field (flat sorted list).
6
+
7
+ The original `tags_ground_truth_categorized` field is preserved unchanged.
8
+
9
+ Usage:
10
+ python scripts/preprocess_eval_data.py
11
+
12
+ Input: data/eval_samples/e621_sfw_sample_1000_seed123_buffer10000.jsonl
13
+ Output: data/eval_samples/e621_sfw_sample_1000_seed123_buffer10000_expanded.jsonl
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ import json
19
+ import sys
20
+ from pathlib import Path
21
+
22
+ # Add project root to path so we can import psq_rag
23
+ _REPO_ROOT = Path(__file__).resolve().parent.parent
24
+ sys.path.insert(0, str(_REPO_ROOT))
25
+
26
+ from psq_rag.retrieval.state import expand_tags_via_implications, get_tag_implications
27
+
28
+ # Tags that are annotation artifacts, not real content tags
29
+ GARBAGE_TAGS = frozenset({
30
+ "invalid_tag",
31
+ "invalid_background",
32
+ })
33
+
34
+ INPUT_PATH = _REPO_ROOT / "data" / "eval_samples" / "e621_sfw_sample_1000_seed123_buffer10000.jsonl"
35
+ OUTPUT_PATH = INPUT_PATH.with_name(INPUT_PATH.stem + "_expanded.jsonl")
36
+
37
+
38
def flatten_ground_truth(tags_categorized_str: str) -> set[str]:
    """Parse the categorized ground-truth JSON into a flat set of tags."""
    if not tags_categorized_str:
        return set()
    categories = json.loads(tags_categorized_str)
    # Flatten every list-valued category; whitespace-trim each tag.
    # Non-list values (unexpected shapes) are skipped, as in the original.
    return {
        tag.strip()
        for values in categories.values()
        if isinstance(values, list)
        for tag in values
    }
49
+
50
+
51
def main() -> int:
    """Expand each eval sample's GT tags via the implication graph.

    Reads INPUT_PATH line-by-line (JSONL), strips GARBAGE_TAGS, expands the
    remaining tags with expand_tags_via_implications, and writes each row to
    OUTPUT_PATH with an added `tags_ground_truth_expanded` field (sorted list).
    The original `tags_ground_truth_categorized` field is preserved unchanged.

    Returns:
        Process exit code: 0 on success, 1 if the input file is missing.
    """
    if not INPUT_PATH.is_file():
        print(f"ERROR: Input not found: {INPUT_PATH}")
        return 1

    # Pre-warm the implication graph so load time isn't attributed to row 1.
    impl = get_tag_implications()
    print(f"Loaded {sum(len(v) for v in impl.values())} active implications")

    samples_read = 0
    samples_expanded = 0
    total_tags_added = 0
    total_garbage_removed = 0

    with INPUT_PATH.open("r", encoding="utf-8") as fin, \
         OUTPUT_PATH.open("w", encoding="utf-8") as fout:
        for line in fin:
            row = json.loads(line)
            samples_read += 1

            gt_raw = flatten_ground_truth(row.get("tags_ground_truth_categorized", ""))

            # Remove annotation-artifact tags before expansion
            garbage_found = gt_raw & GARBAGE_TAGS
            if garbage_found:
                total_garbage_removed += len(garbage_found)
                gt_raw -= garbage_found

            # Expand through implications
            gt_expanded, implied_only = expand_tags_via_implications(gt_raw)
            if implied_only:
                samples_expanded += 1
                total_tags_added += len(implied_only)

            # Store expanded flat list alongside original categorized field
            row["tags_ground_truth_expanded"] = sorted(gt_expanded)

            fout.write(json.dumps(row, ensure_ascii=False) + "\n")

    print(f"Processed {samples_read} samples")
    # Guard: an empty input file previously crashed with ZeroDivisionError here.
    if samples_read:
        print(f"  {samples_expanded} samples had missing implications ({samples_expanded}/{samples_read} = {100*samples_expanded/samples_read:.1f}%)")
        print(f"  {total_tags_added} implied tags added total (avg {total_tags_added/samples_read:.1f} per sample)")
    print(f"  {total_garbage_removed} garbage tags removed")
    print(f"Output: {OUTPUT_PATH}")
    return 0
96
+
97
+
98
if __name__ == "__main__":
    # SystemExit with main()'s return code — equivalent to sys.exit(main()).
    raise SystemExit(main())