Spaces:
Running
Running
Claude committed on
Commit ·
ea9e11c
1
Parent(s): c6be992
Expand alias filter tests with real CSV data and pipeline tests
Browse files
- Real data tests: verify alias matching against actual fluffyrock_3m.csv
aliases (garfield, tails, pikachu_libre, sonic, copyright types)
- Pipeline tests: simulate full _split_candidates_by_type + alias filter
flow with mock Candidate objects, verifying copyright filtering,
entity/general split, and multi-character queries
- Fix pokemon copyright tag name (real tag is "pokemon" not "pokemon_(series)")
- 41 tests total (23 mock + 9 real data + 9 pipeline)
https://claude.ai/code/session_019PY5TEXTWGtToUbowunSRG
- scripts/test_alias_filter.py +220 -1
scripts/test_alias_filter.py
CHANGED
|
@@ -7,10 +7,14 @@ Tests _character_matches_via_aliases() and related helper functions to ensure:
|
|
| 7 |
- Fuzzy matching handles common typos
|
| 8 |
- Generic descriptions (e.g. "orange cat") do NOT match character tags
|
| 9 |
|
|
|
|
|
|
|
|
|
|
| 10 |
Usage:
|
| 11 |
python scripts/test_alias_filter.py
|
| 12 |
|
| 13 |
-
Requires: rapidfuzz
|
|
|
|
| 14 |
"""
|
| 15 |
|
| 16 |
from __future__ import annotations
|
|
@@ -23,12 +27,16 @@ _REPO_ROOT = Path(__file__).resolve().parents[1]
|
|
| 23 |
if str(_REPO_ROOT) not in sys.path:
|
| 24 |
sys.path.insert(0, str(_REPO_ROOT))
|
| 25 |
|
|
|
|
|
|
|
| 26 |
from psq_rag.llm.select import (
|
| 27 |
_normalize_for_matching,
|
| 28 |
_query_words,
|
| 29 |
_alias_matches_query,
|
| 30 |
_character_matches_via_aliases,
|
|
|
|
| 31 |
)
|
|
|
|
| 32 |
|
| 33 |
# ---------------------------------------------------------------------------
|
| 34 |
# Mock alias data matching real e621 patterns
|
|
@@ -285,6 +293,217 @@ def run_tests() -> int:
|
|
| 285 |
False,
|
| 286 |
)
|
| 287 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 288 |
# -----------------------------------------------------------------------
|
| 289 |
# Summary
|
| 290 |
# -----------------------------------------------------------------------
|
|
|
|
| 7 |
- Fuzzy matching handles common typos
|
| 8 |
- Generic descriptions (e.g. "orange cat") do NOT match character tags
|
| 9 |
|
| 10 |
+
Also tests against real alias data from fluffyrock_3m.csv when available,
|
| 11 |
+
and verifies the full candidate split + alias filter pipeline.
|
| 12 |
+
|
| 13 |
Usage:
|
| 14 |
python scripts/test_alias_filter.py
|
| 15 |
|
| 16 |
+
Requires: rapidfuzz
|
| 17 |
+
Optional: fluffyrock_3m.csv (for real-data tests; skipped if missing)
|
| 18 |
"""
|
| 19 |
|
| 20 |
from __future__ import annotations
|
|
|
|
| 27 |
if str(_REPO_ROOT) not in sys.path:
|
| 28 |
sys.path.insert(0, str(_REPO_ROOT))
|
| 29 |
|
| 30 |
+
import os
|
| 31 |
+
|
| 32 |
from psq_rag.llm.select import (
|
| 33 |
_normalize_for_matching,
|
| 34 |
_query_words,
|
| 35 |
_alias_matches_query,
|
| 36 |
_character_matches_via_aliases,
|
| 37 |
+
_split_candidates_by_type,
|
| 38 |
)
|
| 39 |
+
from psq_rag.retrieval.psq_retrieval import Candidate
|
| 40 |
|
| 41 |
# ---------------------------------------------------------------------------
|
| 42 |
# Mock alias data matching real e621 patterns
|
|
|
|
| 293 |
False,
|
| 294 |
)
|
| 295 |
|
| 296 |
+
# ===================================================================
|
| 297 |
+
# REAL DATA TESTS (using fluffyrock_3m.csv if available)
|
| 298 |
+
# ===================================================================
|
| 299 |
+
csv_path = _REPO_ROOT / "fluffyrock_3m.csv"
|
| 300 |
+
if csv_path.is_file() and csv_path.stat().st_size > 1000:
|
| 301 |
+
print("\n=== Real CSV data tests (fluffyrock_3m.csv) ===")
|
| 302 |
+
os.chdir(_REPO_ROOT) # state.py reads from cwd
|
| 303 |
+
from psq_rag.retrieval.state import get_tag2aliases, get_tag_type_name
|
| 304 |
+
|
| 305 |
+
real_t2a = get_tag2aliases()
|
| 306 |
+
|
| 307 |
+
# Garfield: real aliases include "garfield"
|
| 308 |
+
query = "garfield sleeping on a table"
|
| 309 |
+
qwords = _query_words(query)
|
| 310 |
+
qnorm = _normalize_for_matching(query)
|
| 311 |
+
|
| 312 |
+
check(
|
| 313 |
+
"[real] 'garfield sleeping' matches garfield_the_cat",
|
| 314 |
+
_character_matches_via_aliases("garfield_the_cat", query, real_t2a, qwords, qnorm),
|
| 315 |
+
True,
|
| 316 |
+
)
|
| 317 |
+
|
| 318 |
+
# Miles Prower: real aliases include "tails_(sonic)"
|
| 319 |
+
query = "tails flying through the sky"
|
| 320 |
+
qwords = _query_words(query)
|
| 321 |
+
qnorm = _normalize_for_matching(query)
|
| 322 |
+
|
| 323 |
+
check(
|
| 324 |
+
"[real] 'tails flying' matches miles_prower",
|
| 325 |
+
_character_matches_via_aliases("miles_prower", query, real_t2a, qwords, qnorm),
|
| 326 |
+
True,
|
| 327 |
+
)
|
| 328 |
+
|
| 329 |
+
# Pikachu libre should NOT match just "pikachu"
|
| 330 |
+
query = "pikachu with red cheeks"
|
| 331 |
+
qwords = _query_words(query)
|
| 332 |
+
qnorm = _normalize_for_matching(query)
|
| 333 |
+
|
| 334 |
+
check(
|
| 335 |
+
"[real] 'pikachu with red cheeks' does NOT match pikachu_libre",
|
| 336 |
+
_character_matches_via_aliases("pikachu_libre", query, real_t2a, qwords, qnorm),
|
| 337 |
+
False,
|
| 338 |
+
)
|
| 339 |
+
|
| 340 |
+
# Pikachu libre SHOULD match when variant is mentioned
|
| 341 |
+
query = "pikachu libre wrestling"
|
| 342 |
+
qwords = _query_words(query)
|
| 343 |
+
qnorm = _normalize_for_matching(query)
|
| 344 |
+
|
| 345 |
+
check(
|
| 346 |
+
"[real] 'pikachu libre wrestling' matches pikachu_libre",
|
| 347 |
+
_character_matches_via_aliases("pikachu_libre", query, real_t2a, qwords, qnorm),
|
| 348 |
+
True,
|
| 349 |
+
)
|
| 350 |
+
|
| 351 |
+
# Sonic: real aliases include "sonic_(character)"
|
| 352 |
+
query = "sonic running fast"
|
| 353 |
+
qwords = _query_words(query)
|
| 354 |
+
qnorm = _normalize_for_matching(query)
|
| 355 |
+
|
| 356 |
+
check(
|
| 357 |
+
"[real] 'sonic running fast' matches sonic_the_hedgehog",
|
| 358 |
+
_character_matches_via_aliases("sonic_the_hedgehog", query, real_t2a, qwords, qnorm),
|
| 359 |
+
True,
|
| 360 |
+
)
|
| 361 |
+
|
| 362 |
+
# Generic "orange cat" should not match garfield
|
| 363 |
+
query = "orange cat sitting outside"
|
| 364 |
+
qwords = _query_words(query)
|
| 365 |
+
qnorm = _normalize_for_matching(query)
|
| 366 |
+
|
| 367 |
+
check(
|
| 368 |
+
"[real] 'orange cat sitting outside' does NOT match garfield_the_cat",
|
| 369 |
+
_character_matches_via_aliases("garfield_the_cat", query, real_t2a, qwords, qnorm),
|
| 370 |
+
False,
|
| 371 |
+
)
|
| 372 |
+
|
| 373 |
+
# Verify pikachu is type "species" (goes through general pipeline, not entity)
|
| 374 |
+
check(
|
| 375 |
+
"[real] pikachu is type 'species' (handled by general pipeline, not entity)",
|
| 376 |
+
get_tag_type_name("pikachu") == "species",
|
| 377 |
+
True,
|
| 378 |
+
)
|
| 379 |
+
|
| 380 |
+
# Verify garfield_the_cat is type "character"
|
| 381 |
+
check(
|
| 382 |
+
"[real] garfield_the_cat is type 'character'",
|
| 383 |
+
get_tag_type_name("garfield_the_cat") == "character",
|
| 384 |
+
True,
|
| 385 |
+
)
|
| 386 |
+
|
| 387 |
+
# Verify copyright tags are filtered (real tag is "pokemon", not "pokemon_(series)")
|
| 388 |
+
check(
|
| 389 |
+
"[real] pokemon is type 'copyright' (would be filtered)",
|
| 390 |
+
get_tag_type_name("pokemon") == "copyright",
|
| 391 |
+
True,
|
| 392 |
+
)
|
| 393 |
+
|
| 394 |
+
else:
|
| 395 |
+
print("\n=== Skipping real CSV data tests (fluffyrock_3m.csv not found) ===")
|
| 396 |
+
|
| 397 |
+
# ===================================================================
|
| 398 |
+
# PIPELINE TEST: _split_candidates_by_type + alias filter
|
| 399 |
+
# Simulates what llm_select_indices does without needing an API key
|
| 400 |
+
# ===================================================================
|
| 401 |
+
print("\n=== Pipeline test: candidate split + alias filter ===")
|
| 402 |
+
|
| 403 |
+
def make_cand(tag: str) -> Candidate:
|
| 404 |
+
return Candidate(
|
| 405 |
+
tag=tag, score_combined=0.5, score_fasttext=None,
|
| 406 |
+
score_context=None, count=100, sources=["test"],
|
| 407 |
+
)
|
| 408 |
+
|
| 409 |
+
# Simulate a mixed candidate list like Stage 2 would produce
|
| 410 |
+
test_candidates = [
|
| 411 |
+
make_cand("sitting"), # general (type 0)
|
| 412 |
+
make_cand("orange_body"), # general (type 0)
|
| 413 |
+
make_cand("domestic_cat"), # species (type 5)
|
| 414 |
+
make_cand("garfield_the_cat"), # character (type 4)
|
| 415 |
+
make_cand("cat_busters"), # copyright (type 3)
|
| 416 |
+
make_cand("pikachu_libre"), # character (type 4)
|
| 417 |
+
make_cand("miles_prower"), # character (type 4)
|
| 418 |
+
]
|
| 419 |
+
|
| 420 |
+
# Split by type
|
| 421 |
+
general_with_idx, entity_with_idx = _split_candidates_by_type(test_candidates, log=None)
|
| 422 |
+
general_tags = {c.tag for _, c in general_with_idx}
|
| 423 |
+
entity_tags = {c.tag for _, c in entity_with_idx}
|
| 424 |
+
|
| 425 |
+
check(
|
| 426 |
+
"[pipeline] general tags include sitting, orange_body, domestic_cat",
|
| 427 |
+
{"sitting", "orange_body", "domestic_cat"}.issubset(general_tags),
|
| 428 |
+
True,
|
| 429 |
+
)
|
| 430 |
+
check(
|
| 431 |
+
"[pipeline] cat_busters (copyright) is filtered out of both lists",
|
| 432 |
+
"cat_busters" not in general_tags and "cat_busters" not in entity_tags,
|
| 433 |
+
True,
|
| 434 |
+
)
|
| 435 |
+
check(
|
| 436 |
+
"[pipeline] entity tags include garfield_the_cat, pikachu_libre, miles_prower",
|
| 437 |
+
{"garfield_the_cat", "pikachu_libre", "miles_prower"}.issubset(entity_tags),
|
| 438 |
+
True,
|
| 439 |
+
)
|
| 440 |
+
|
| 441 |
+
# Now simulate alias filtering on entity candidates with query "garfield sleeping"
|
| 442 |
+
query = "garfield sleeping"
|
| 443 |
+
qwords = _query_words(query)
|
| 444 |
+
qnorm = _normalize_for_matching(query)
|
| 445 |
+
|
| 446 |
+
filtered = []
|
| 447 |
+
rejected = []
|
| 448 |
+
for _, cand in entity_with_idx:
|
| 449 |
+
if _character_matches_via_aliases(cand.tag, query, MOCK_TAG2ALIASES, qwords, qnorm):
|
| 450 |
+
filtered.append(cand.tag)
|
| 451 |
+
else:
|
| 452 |
+
rejected.append(cand.tag)
|
| 453 |
+
|
| 454 |
+
check(
|
| 455 |
+
"[pipeline] 'garfield sleeping': garfield_the_cat survives alias filter",
|
| 456 |
+
"garfield_the_cat" in filtered,
|
| 457 |
+
True,
|
| 458 |
+
)
|
| 459 |
+
check(
|
| 460 |
+
"[pipeline] 'garfield sleeping': pikachu_libre rejected by alias filter",
|
| 461 |
+
"pikachu_libre" in rejected,
|
| 462 |
+
True,
|
| 463 |
+
)
|
| 464 |
+
check(
|
| 465 |
+
"[pipeline] 'garfield sleeping': miles_prower rejected by alias filter",
|
| 466 |
+
"miles_prower" in rejected,
|
| 467 |
+
True,
|
| 468 |
+
)
|
| 469 |
+
|
| 470 |
+
# Simulate with query "tails and garfield"
|
| 471 |
+
query = "tails and garfield"
|
| 472 |
+
qwords = _query_words(query)
|
| 473 |
+
qnorm = _normalize_for_matching(query)
|
| 474 |
+
|
| 475 |
+
filtered = []
|
| 476 |
+
for _, cand in entity_with_idx:
|
| 477 |
+
if _character_matches_via_aliases(cand.tag, query, MOCK_TAG2ALIASES, qwords, qnorm):
|
| 478 |
+
filtered.append(cand.tag)
|
| 479 |
+
|
| 480 |
+
check(
|
| 481 |
+
"[pipeline] 'tails and garfield': both garfield_the_cat and miles_prower survive",
|
| 482 |
+
"garfield_the_cat" in filtered and "miles_prower" in filtered,
|
| 483 |
+
True,
|
| 484 |
+
)
|
| 485 |
+
check(
|
| 486 |
+
"[pipeline] 'tails and garfield': pikachu_libre still rejected",
|
| 487 |
+
"pikachu_libre" not in filtered,
|
| 488 |
+
True,
|
| 489 |
+
)
|
| 490 |
+
|
| 491 |
+
# Simulate with generic query "orange cat sitting outside" — no characters should survive
|
| 492 |
+
query = "orange cat sitting outside"
|
| 493 |
+
qwords = _query_words(query)
|
| 494 |
+
qnorm = _normalize_for_matching(query)
|
| 495 |
+
|
| 496 |
+
filtered = []
|
| 497 |
+
for _, cand in entity_with_idx:
|
| 498 |
+
if _character_matches_via_aliases(cand.tag, query, MOCK_TAG2ALIASES, qwords, qnorm):
|
| 499 |
+
filtered.append(cand.tag)
|
| 500 |
+
|
| 501 |
+
check(
|
| 502 |
+
"[pipeline] 'orange cat sitting outside': NO character tags survive alias filter",
|
| 503 |
+
len(filtered) == 0,
|
| 504 |
+
True,
|
| 505 |
+
)
|
| 506 |
+
|
| 507 |
# -----------------------------------------------------------------------
|
| 508 |
# Summary
|
| 509 |
# -----------------------------------------------------------------------
|