Claude committed on
Commit
ea9e11c
·
1 Parent(s): c6be992

Expand alias filter tests with real CSV data and pipeline tests

Browse files

- Real data tests: verify alias matching against actual fluffyrock_3m.csv
aliases (garfield, tails, pikachu_libre, sonic, copyright types)
- Pipeline tests: simulate full _split_candidates_by_type + alias filter
flow with mock Candidate objects, verifying copyright filtering,
entity/general split, and multi-character queries
- Fix pokemon copyright tag name (real tag is "pokemon" not "pokemon_(series)")
- 41 tests total (23 mock + 9 real data + 9 pipeline)

https://claude.ai/code/session_019PY5TEXTWGtToUbowunSRG

Files changed (1) hide show
  1. scripts/test_alias_filter.py +220 -1
scripts/test_alias_filter.py CHANGED
@@ -7,10 +7,14 @@ Tests _character_matches_via_aliases() and related helper functions to ensure:
7
  - Fuzzy matching handles common typos
8
  - Generic descriptions (e.g. "orange cat") do NOT match character tags
9
 
 
 
 
10
  Usage:
11
  python scripts/test_alias_filter.py
12
 
13
- Requires: rapidfuzz (no CSV data files needed - uses mock alias data)
 
14
  """
15
 
16
  from __future__ import annotations
@@ -23,12 +27,16 @@ _REPO_ROOT = Path(__file__).resolve().parents[1]
23
  if str(_REPO_ROOT) not in sys.path:
24
  sys.path.insert(0, str(_REPO_ROOT))
25
 
 
 
26
  from psq_rag.llm.select import (
27
  _normalize_for_matching,
28
  _query_words,
29
  _alias_matches_query,
30
  _character_matches_via_aliases,
 
31
  )
 
32
 
33
  # ---------------------------------------------------------------------------
34
  # Mock alias data matching real e621 patterns
@@ -285,6 +293,217 @@ def run_tests() -> int:
285
  False,
286
  )
287
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
288
  # -----------------------------------------------------------------------
289
  # Summary
290
  # -----------------------------------------------------------------------
 
7
  - Fuzzy matching handles common typos
8
  - Generic descriptions (e.g. "orange cat") do NOT match character tags
9
 
10
+ Also tests against real alias data from fluffyrock_3m.csv when available,
11
+ and verifies the full candidate split + alias filter pipeline.
12
+
13
  Usage:
14
  python scripts/test_alias_filter.py
15
 
16
+ Requires: rapidfuzz
17
+ Optional: fluffyrock_3m.csv (for real-data tests; skipped if missing)
18
  """
19
 
20
  from __future__ import annotations
 
27
  if str(_REPO_ROOT) not in sys.path:
28
  sys.path.insert(0, str(_REPO_ROOT))
29
 
30
+ import os
31
+
32
  from psq_rag.llm.select import (
33
  _normalize_for_matching,
34
  _query_words,
35
  _alias_matches_query,
36
  _character_matches_via_aliases,
37
+ _split_candidates_by_type,
38
  )
39
+ from psq_rag.retrieval.psq_retrieval import Candidate
40
 
41
  # ---------------------------------------------------------------------------
42
  # Mock alias data matching real e621 patterns
 
293
  False,
294
  )
295
 
296
+ # ===================================================================
297
+ # REAL DATA TESTS (using fluffyrock_3m.csv if available)
298
+ # ===================================================================
299
+ csv_path = _REPO_ROOT / "fluffyrock_3m.csv"
300
+ if csv_path.is_file() and csv_path.stat().st_size > 1000:
301
+ print("\n=== Real CSV data tests (fluffyrock_3m.csv) ===")
302
+ os.chdir(_REPO_ROOT) # state.py reads from cwd
303
+ from psq_rag.retrieval.state import get_tag2aliases, get_tag_type_name
304
+
305
+ real_t2a = get_tag2aliases()
306
+
307
+ # Garfield: real aliases include "garfield"
308
+ query = "garfield sleeping on a table"
309
+ qwords = _query_words(query)
310
+ qnorm = _normalize_for_matching(query)
311
+
312
+ check(
313
+ "[real] 'garfield sleeping' matches garfield_the_cat",
314
+ _character_matches_via_aliases("garfield_the_cat", query, real_t2a, qwords, qnorm),
315
+ True,
316
+ )
317
+
318
+ # Miles Prower: real aliases include "tails_(sonic)"
319
+ query = "tails flying through the sky"
320
+ qwords = _query_words(query)
321
+ qnorm = _normalize_for_matching(query)
322
+
323
+ check(
324
+ "[real] 'tails flying' matches miles_prower",
325
+ _character_matches_via_aliases("miles_prower", query, real_t2a, qwords, qnorm),
326
+ True,
327
+ )
328
+
329
+ # Pikachu libre should NOT match just "pikachu"
330
+ query = "pikachu with red cheeks"
331
+ qwords = _query_words(query)
332
+ qnorm = _normalize_for_matching(query)
333
+
334
+ check(
335
+ "[real] 'pikachu with red cheeks' does NOT match pikachu_libre",
336
+ _character_matches_via_aliases("pikachu_libre", query, real_t2a, qwords, qnorm),
337
+ False,
338
+ )
339
+
340
+ # Pikachu libre SHOULD match when variant is mentioned
341
+ query = "pikachu libre wrestling"
342
+ qwords = _query_words(query)
343
+ qnorm = _normalize_for_matching(query)
344
+
345
+ check(
346
+ "[real] 'pikachu libre wrestling' matches pikachu_libre",
347
+ _character_matches_via_aliases("pikachu_libre", query, real_t2a, qwords, qnorm),
348
+ True,
349
+ )
350
+
351
+ # Sonic: real aliases include "sonic_(character)"
352
+ query = "sonic running fast"
353
+ qwords = _query_words(query)
354
+ qnorm = _normalize_for_matching(query)
355
+
356
+ check(
357
+ "[real] 'sonic running fast' matches sonic_the_hedgehog",
358
+ _character_matches_via_aliases("sonic_the_hedgehog", query, real_t2a, qwords, qnorm),
359
+ True,
360
+ )
361
+
362
+ # Generic "orange cat" should not match garfield
363
+ query = "orange cat sitting outside"
364
+ qwords = _query_words(query)
365
+ qnorm = _normalize_for_matching(query)
366
+
367
+ check(
368
+ "[real] 'orange cat sitting outside' does NOT match garfield_the_cat",
369
+ _character_matches_via_aliases("garfield_the_cat", query, real_t2a, qwords, qnorm),
370
+ False,
371
+ )
372
+
373
+ # Verify pikachu is type "species" (goes through general pipeline, not entity)
374
+ check(
375
+ "[real] pikachu is type 'species' (handled by general pipeline, not entity)",
376
+ get_tag_type_name("pikachu") == "species",
377
+ True,
378
+ )
379
+
380
+ # Verify garfield_the_cat is type "character"
381
+ check(
382
+ "[real] garfield_the_cat is type 'character'",
383
+ get_tag_type_name("garfield_the_cat") == "character",
384
+ True,
385
+ )
386
+
387
+ # Verify copyright tags are filtered (real tag is "pokemon", not "pokemon_(series)")
388
+ check(
389
+ "[real] pokemon is type 'copyright' (would be filtered)",
390
+ get_tag_type_name("pokemon") == "copyright",
391
+ True,
392
+ )
393
+
394
+ else:
395
+ print("\n=== Skipping real CSV data tests (fluffyrock_3m.csv not found) ===")
396
+
397
+ # ===================================================================
398
+ # PIPELINE TEST: _split_candidates_by_type + alias filter
399
+ # Simulates what llm_select_indices does without needing an API key
400
+ # ===================================================================
401
+ print("\n=== Pipeline test: candidate split + alias filter ===")
402
+
403
+ def make_cand(tag: str) -> Candidate:
404
+ return Candidate(
405
+ tag=tag, score_combined=0.5, score_fasttext=None,
406
+ score_context=None, count=100, sources=["test"],
407
+ )
408
+
409
+ # Simulate a mixed candidate list like Stage 2 would produce
410
+ test_candidates = [
411
+ make_cand("sitting"), # general (type 0)
412
+ make_cand("orange_body"), # general (type 0)
413
+ make_cand("domestic_cat"), # species (type 5)
414
+ make_cand("garfield_the_cat"), # character (type 4)
415
+ make_cand("cat_busters"), # copyright (type 3)
416
+ make_cand("pikachu_libre"), # character (type 4)
417
+ make_cand("miles_prower"), # character (type 4)
418
+ ]
419
+
420
+ # Split by type
421
+ general_with_idx, entity_with_idx = _split_candidates_by_type(test_candidates, log=None)
422
+ general_tags = {c.tag for _, c in general_with_idx}
423
+ entity_tags = {c.tag for _, c in entity_with_idx}
424
+
425
+ check(
426
+ "[pipeline] general tags include sitting, orange_body, domestic_cat",
427
+ {"sitting", "orange_body", "domestic_cat"}.issubset(general_tags),
428
+ True,
429
+ )
430
+ check(
431
+ "[pipeline] cat_busters (copyright) is filtered out of both lists",
432
+ "cat_busters" not in general_tags and "cat_busters" not in entity_tags,
433
+ True,
434
+ )
435
+ check(
436
+ "[pipeline] entity tags include garfield_the_cat, pikachu_libre, miles_prower",
437
+ {"garfield_the_cat", "pikachu_libre", "miles_prower"}.issubset(entity_tags),
438
+ True,
439
+ )
440
+
441
+ # Now simulate alias filtering on entity candidates with query "garfield sleeping"
442
+ query = "garfield sleeping"
443
+ qwords = _query_words(query)
444
+ qnorm = _normalize_for_matching(query)
445
+
446
+ filtered = []
447
+ rejected = []
448
+ for _, cand in entity_with_idx:
449
+ if _character_matches_via_aliases(cand.tag, query, MOCK_TAG2ALIASES, qwords, qnorm):
450
+ filtered.append(cand.tag)
451
+ else:
452
+ rejected.append(cand.tag)
453
+
454
+ check(
455
+ "[pipeline] 'garfield sleeping': garfield_the_cat survives alias filter",
456
+ "garfield_the_cat" in filtered,
457
+ True,
458
+ )
459
+ check(
460
+ "[pipeline] 'garfield sleeping': pikachu_libre rejected by alias filter",
461
+ "pikachu_libre" in rejected,
462
+ True,
463
+ )
464
+ check(
465
+ "[pipeline] 'garfield sleeping': miles_prower rejected by alias filter",
466
+ "miles_prower" in rejected,
467
+ True,
468
+ )
469
+
470
+ # Simulate with query "tails and garfield"
471
+ query = "tails and garfield"
472
+ qwords = _query_words(query)
473
+ qnorm = _normalize_for_matching(query)
474
+
475
+ filtered = []
476
+ for _, cand in entity_with_idx:
477
+ if _character_matches_via_aliases(cand.tag, query, MOCK_TAG2ALIASES, qwords, qnorm):
478
+ filtered.append(cand.tag)
479
+
480
+ check(
481
+ "[pipeline] 'tails and garfield': both garfield_the_cat and miles_prower survive",
482
+ "garfield_the_cat" in filtered and "miles_prower" in filtered,
483
+ True,
484
+ )
485
+ check(
486
+ "[pipeline] 'tails and garfield': pikachu_libre still rejected",
487
+ "pikachu_libre" not in filtered,
488
+ True,
489
+ )
490
+
491
+ # Simulate with generic query "orange cat sitting outside" — no characters should survive
492
+ query = "orange cat sitting outside"
493
+ qwords = _query_words(query)
494
+ qnorm = _normalize_for_matching(query)
495
+
496
+ filtered = []
497
+ for _, cand in entity_with_idx:
498
+ if _character_matches_via_aliases(cand.tag, query, MOCK_TAG2ALIASES, qwords, qnorm):
499
+ filtered.append(cand.tag)
500
+
501
+ check(
502
+ "[pipeline] 'orange cat sitting outside': NO character tags survive alias filter",
503
+ len(filtered) == 0,
504
+ True,
505
+ )
506
+
507
  # -----------------------------------------------------------------------
508
  # Summary
509
  # -----------------------------------------------------------------------