Spaces:

FoodDesert
/

Prompt_Squirrel_RAG

Running

Claude committed on Feb 9

Commit

ea9e11c

1 Parent(s): c6be992

Expand alias filter tests with real CSV data and pipeline tests

- Real data tests: verify alias matching against actual fluffyrock_3m.csv
aliases (garfield, tails, pikachu_libre, sonic, copyright types)
- Pipeline tests: simulate full _split_candidates_by_type + alias filter
flow with mock Candidate objects, verifying copyright filtering,
entity/general split, and multi-character queries
- Fix pokemon copyright tag name (real tag is "pokemon" not "pokemon_(series)")
- 41 tests total (23 mock + 9 real data + 9 pipeline)

https://claude.ai/code/session_019PY5TEXTWGtToUbowunSRG

Files changed (1) hide show

scripts/test_alias_filter.py +220 -1

scripts/test_alias_filter.py CHANGED Viewed

@@ -7,10 +7,14 @@ Tests _character_matches_via_aliases() and related helper functions to ensure:
 - Fuzzy matching handles common typos
 - Generic descriptions (e.g. "orange cat") do NOT match character tags
 Usage:
     python scripts/test_alias_filter.py
-Requires: rapidfuzz (no CSV data files needed - uses mock alias data)
 """
 from __future__ import annotations
@@ -23,12 +27,16 @@ _REPO_ROOT = Path(__file__).resolve().parents[1]
 if str(_REPO_ROOT) not in sys.path:
     sys.path.insert(0, str(_REPO_ROOT))
 from psq_rag.llm.select import (
     _normalize_for_matching,
     _query_words,
     _alias_matches_query,
     _character_matches_via_aliases,
 )
 # ---------------------------------------------------------------------------
 # Mock alias data matching real e621 patterns
@@ -285,6 +293,217 @@ def run_tests() -> int:
         False,
     )
     # -----------------------------------------------------------------------
     # Summary
     # -----------------------------------------------------------------------

 - Fuzzy matching handles common typos
 - Generic descriptions (e.g. "orange cat") do NOT match character tags
+Also tests against real alias data from fluffyrock_3m.csv when available,
+and verifies the full candidate split + alias filter pipeline.
 Usage:
     python scripts/test_alias_filter.py
+Requires: rapidfuzz
+Optional: fluffyrock_3m.csv (for real-data tests; skipped if missing)
 """
 from __future__ import annotations
 if str(_REPO_ROOT) not in sys.path:
     sys.path.insert(0, str(_REPO_ROOT))
+import os
 from psq_rag.llm.select import (
     _normalize_for_matching,
     _query_words,
     _alias_matches_query,
     _character_matches_via_aliases,
+    _split_candidates_by_type,
 )
+from psq_rag.retrieval.psq_retrieval import Candidate
 # ---------------------------------------------------------------------------
 # Mock alias data matching real e621 patterns
         False,
     )
+    # ===================================================================
+    # REAL DATA TESTS (using fluffyrock_3m.csv if available)
+    # ===================================================================
+    csv_path = _REPO_ROOT / "fluffyrock_3m.csv"
+    if csv_path.is_file() and csv_path.stat().st_size > 1000:
+        print("\n=== Real CSV data tests (fluffyrock_3m.csv) ===")
+        os.chdir(_REPO_ROOT)  # state.py reads from cwd
+        from psq_rag.retrieval.state import get_tag2aliases, get_tag_type_name
+        real_t2a = get_tag2aliases()
+        # Garfield: real aliases include "garfield"
+        query = "garfield sleeping on a table"
+        qwords = _query_words(query)
+        qnorm = _normalize_for_matching(query)
+        check(
+            "[real] 'garfield sleeping' matches garfield_the_cat",
+            _character_matches_via_aliases("garfield_the_cat", query, real_t2a, qwords, qnorm),
+            True,
+        )
+        # Miles Prower: real aliases include "tails_(sonic)"
+        query = "tails flying through the sky"
+        qwords = _query_words(query)
+        qnorm = _normalize_for_matching(query)
+        check(
+            "[real] 'tails flying' matches miles_prower",
+            _character_matches_via_aliases("miles_prower", query, real_t2a, qwords, qnorm),
+            True,
+        )
+        # Pikachu libre should NOT match just "pikachu"
+        query = "pikachu with red cheeks"
+        qwords = _query_words(query)
+        qnorm = _normalize_for_matching(query)
+        check(
+            "[real] 'pikachu with red cheeks' does NOT match pikachu_libre",
+            _character_matches_via_aliases("pikachu_libre", query, real_t2a, qwords, qnorm),
+            False,
+        )
+        # Pikachu libre SHOULD match when variant is mentioned
+        query = "pikachu libre wrestling"
+        qwords = _query_words(query)
+        qnorm = _normalize_for_matching(query)
+        check(
+            "[real] 'pikachu libre wrestling' matches pikachu_libre",
+            _character_matches_via_aliases("pikachu_libre", query, real_t2a, qwords, qnorm),
+            True,
+        )
+        # Sonic: real aliases include "sonic_(character)"
+        query = "sonic running fast"
+        qwords = _query_words(query)
+        qnorm = _normalize_for_matching(query)
+        check(
+            "[real] 'sonic running fast' matches sonic_the_hedgehog",
+            _character_matches_via_aliases("sonic_the_hedgehog", query, real_t2a, qwords, qnorm),
+            True,
+        )
+        # Generic "orange cat" should not match garfield
+        query = "orange cat sitting outside"
+        qwords = _query_words(query)
+        qnorm = _normalize_for_matching(query)
+        check(
+            "[real] 'orange cat sitting outside' does NOT match garfield_the_cat",
+            _character_matches_via_aliases("garfield_the_cat", query, real_t2a, qwords, qnorm),
+            False,
+        )
+        # Verify pikachu is type "species" (goes through general pipeline, not entity)
+        check(
+            "[real] pikachu is type 'species' (handled by general pipeline, not entity)",
+            get_tag_type_name("pikachu") == "species",
+            True,
+        )
+        # Verify garfield_the_cat is type "character"
+        check(
+            "[real] garfield_the_cat is type 'character'",
+            get_tag_type_name("garfield_the_cat") == "character",
+            True,
+        )
+        # Verify copyright tags are filtered (real tag is "pokemon", not "pokemon_(series)")
+        check(
+            "[real] pokemon is type 'copyright' (would be filtered)",
+            get_tag_type_name("pokemon") == "copyright",
+            True,
+        )
+    else:
+        print("\n=== Skipping real CSV data tests (fluffyrock_3m.csv not found) ===")
+    # ===================================================================
+    # PIPELINE TEST: _split_candidates_by_type + alias filter
+    # Simulates what llm_select_indices does without needing an API key
+    # ===================================================================
+    print("\n=== Pipeline test: candidate split + alias filter ===")
+    def make_cand(tag: str) -> Candidate:
+        return Candidate(
+            tag=tag, score_combined=0.5, score_fasttext=None,
+            score_context=None, count=100, sources=["test"],
+        )
+    # Simulate a mixed candidate list like Stage 2 would produce
+    test_candidates = [
+        make_cand("sitting"),              # general (type 0)
+        make_cand("orange_body"),          # general (type 0)
+        make_cand("domestic_cat"),         # species (type 5)
+        make_cand("garfield_the_cat"),     # character (type 4)
+        make_cand("cat_busters"),          # copyright (type 3)
+        make_cand("pikachu_libre"),        # character (type 4)
+        make_cand("miles_prower"),         # character (type 4)
+    ]
+    # Split by type
+    general_with_idx, entity_with_idx = _split_candidates_by_type(test_candidates, log=None)
+    general_tags = {c.tag for _, c in general_with_idx}
+    entity_tags = {c.tag for _, c in entity_with_idx}
+    check(
+        "[pipeline] general tags include sitting, orange_body, domestic_cat",
+        {"sitting", "orange_body", "domestic_cat"}.issubset(general_tags),
+        True,
+    )
+    check(
+        "[pipeline] cat_busters (copyright) is filtered out of both lists",
+        "cat_busters" not in general_tags and "cat_busters" not in entity_tags,
+        True,
+    )
+    check(
+        "[pipeline] entity tags include garfield_the_cat, pikachu_libre, miles_prower",
+        {"garfield_the_cat", "pikachu_libre", "miles_prower"}.issubset(entity_tags),
+        True,
+    )
+    # Now simulate alias filtering on entity candidates with query "garfield sleeping"
+    query = "garfield sleeping"
+    qwords = _query_words(query)
+    qnorm = _normalize_for_matching(query)
+    filtered = []
+    rejected = []
+    for _, cand in entity_with_idx:
+        if _character_matches_via_aliases(cand.tag, query, MOCK_TAG2ALIASES, qwords, qnorm):
+            filtered.append(cand.tag)
+        else:
+            rejected.append(cand.tag)
+    check(
+        "[pipeline] 'garfield sleeping': garfield_the_cat survives alias filter",
+        "garfield_the_cat" in filtered,
+        True,
+    )
+    check(
+        "[pipeline] 'garfield sleeping': pikachu_libre rejected by alias filter",
+        "pikachu_libre" in rejected,
+        True,
+    )
+    check(
+        "[pipeline] 'garfield sleeping': miles_prower rejected by alias filter",
+        "miles_prower" in rejected,
+        True,
+    )
+    # Simulate with query "tails and garfield"
+    query = "tails and garfield"
+    qwords = _query_words(query)
+    qnorm = _normalize_for_matching(query)
+    filtered = []
+    for _, cand in entity_with_idx:
+        if _character_matches_via_aliases(cand.tag, query, MOCK_TAG2ALIASES, qwords, qnorm):
+            filtered.append(cand.tag)
+    check(
+        "[pipeline] 'tails and garfield': both garfield_the_cat and miles_prower survive",
+        "garfield_the_cat" in filtered and "miles_prower" in filtered,
+        True,
+    )
+    check(
+        "[pipeline] 'tails and garfield': pikachu_libre still rejected",
+        "pikachu_libre" not in filtered,
+        True,
+    )
+    # Simulate with generic query "orange cat sitting outside" — no characters should survive
+    query = "orange cat sitting outside"
+    qwords = _query_words(query)
+    qnorm = _normalize_for_matching(query)
+    filtered = []
+    for _, cand in entity_with_idx:
+        if _character_matches_via_aliases(cand.tag, query, MOCK_TAG2ALIASES, qwords, qnorm):
+            filtered.append(cand.tag)
+    check(
+        "[pipeline] 'orange cat sitting outside': NO character tags survive alias filter",
+        len(filtered) == 0,
+        True,
+    )
     # -----------------------------------------------------------------------
     # Summary
     # -----------------------------------------------------------------------