ResearchIT / tests /test_reranker_integration.py
siddhm11
Phase 6: LightGBM reranker integration (37-feature schema, 141-tree model)
63b37fd
"""
Phase 6: LightGBM Reranker Integration Tests
Tests:
1. Smoke test — load model, predict on dummy input
2. Feature computation — verify 37-feature vector shape and values
3. Heuristic fallback — verify scoring works without model
4. End-to-end — full pipeline with simulated user state
5. Latency benchmark — confirm < 1ms for 100 candidates
6. Backward compatibility — old call signature still works
7. LightGBM vs heuristic — compare scores and rankings on the same input
"""
import sys
import os
import time
import numpy as np
# Add project root to path
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
# ── Test 1: Smoke Test ───────────────────────────────────────────────────────
def test_smoke():
    """Load the LightGBM model directly and predict on dummy input.

    Checks that the production model file exists, that the booster reports
    37 input features, and that predictions on an all-zero matrix and on a
    seeded random matrix contain exactly one finite score per row.

    Raises:
        AssertionError: if the model file is missing, the feature count is
            not 37, or a prediction has the wrong shape or a non-finite value.
    """
    import lightgbm as lgb

    n_features = 37
    model_path = os.path.normpath(
        os.path.join(
            os.path.dirname(__file__), "..",
            "models", "reranker-phase6", "production_model", "reranker_v1.txt",
        )
    )
    assert os.path.isfile(model_path), f"Model file not found: {model_path}"
    model = lgb.Booster(model_file=model_path)

    # Verify model properties
    assert model.num_feature() == n_features, \
        f"Expected {n_features} features, got {model.num_feature()}"
    print(f" Model loaded: {model.num_trees()} trees, {model.num_feature()} features")

    # Predict on zeros
    dummy = np.zeros((5, n_features), dtype=np.float32)
    scores = model.predict(dummy)
    assert scores.shape == (5,), f"Expected (5,), got {scores.shape}"
    # isfinite rejects +/-inf as well as NaN
    assert np.all(np.isfinite(scores)), "Non-finite value in predictions"
    print(f" Zero-input scores: {scores}")

    # Predict on random input; a fixed seed keeps any failure reproducible
    rng = np.random.default_rng(0)
    random_input = rng.standard_normal((10, n_features)).astype(np.float32)
    scores = model.predict(random_input)
    assert scores.shape == (10,), f"Expected (10,), got {scores.shape}"
    assert np.all(np.isfinite(scores)), "Non-finite value in predictions"
    print(f" Random-input score range: [{scores.min():.4f}, {scores.max():.4f}]")
    print(" ✅ Smoke test PASSED")
# ── Test 2: Feature Computation ──────────────────────────────────────────────
def test_feature_computation():
    """Check shape, dtype and selected columns of the compute_features output.

    Feeds five synthetic candidates (random 1024-d embeddings, fixed
    metadata, random profile vectors) through ``compute_features`` and
    asserts the values of the feature columns this test knows how to
    predict from its own inputs.
    """
    from app.recommend.reranker import compute_features, NUM_FEATURES

    n = 5
    embeddings = np.random.randn(n, 1024).astype(np.float32)
    metadata = [
        {
            "arxiv_id": f"2401.{k:05d}",
            "category": "cs.CL",
            "published": "2024-01-15",
            "citation_count": k * 100,
            "influential_citations": k * 10,
            "authors": '["Alice Smith", "Bob Jones"]',
        }
        for k in range(n)
    ]
    lt_vec = np.random.randn(1024).astype(np.float32)
    st_vec = np.random.randn(1024).astype(np.float32)
    neg_vec = np.random.randn(1024).astype(np.float32)
    qdrant_scores = [0.95 - k * 0.05 for k in range(n)]

    features = compute_features(
        embeddings,
        metadata,
        lt_vec,
        st_vec,
        neg_vec,
        qdrant_scores=qdrant_scores,
        cluster_importance=0.75,
        suppressed_categories={"cs.CR"},
        onboarding_categories={"cs.CL", "cs.LG"},
        user_total_saves=42,
        user_total_dismissals=8,
    )

    expected_shape = (n, NUM_FEATURES)
    assert features.shape == expected_shape, f"Expected ({n}, {NUM_FEATURES}), got {features.shape}"
    assert features.dtype == np.float32

    # Check specific feature values, one candidate row at a time
    for idx, row in enumerate(features):
        # Feature 0: qdrant_cosine_score
        assert abs(row[0] - qdrant_scores[idx]) < 1e-5, \
            f"Feature 0 mismatch: {row[0]} vs {qdrant_scores[idx]}"
        # Feature 1: position = idx
        assert row[1] == float(idx)
        # Feature 2: citation_count
        assert row[2] == float(idx * 100)
        # Feature 3: log_citations = log(100 * idx + 1)
        assert abs(row[3] - np.log(idx * 100 + 1)) < 1e-5
        # Feature 6: recency_score must be positive for a 2024-01-15 paper
        assert row[6] > 0, f"Recency should be > 0, got {row[6]}"
        # Feature 20: ewma_longterm is expected to be non-zero since profiles were passed
        assert row[20] != 0.0, "EWMA long-term should be computed"
        # Feature 23: cluster_importance
        assert row[23] == 0.75
        # Feature 25: suppressed = 0 (category is cs.CL, not cs.CR)
        assert row[25] == 0.0
        # Feature 26: onboarding = 1 (cs.CL is in the onboarding set)
        assert row[26] == 1.0
        # Feature 27: total_saves
        assert row[27] == 42.0
        # Feature 35: position_inverse = 1 / (idx + 1)
        assert abs(row[35] - 1.0 / (idx + 1)) < 1e-5

    # Check no NaN
    assert not np.any(np.isnan(features)), "NaN in features"

    nonzero_per_row = (features != 0).sum(axis=1)
    print(f" Feature matrix shape: {features.shape}")
    print(f" Feature value range: [{features.min():.4f}, {features.max():.4f}]")
    print(f" Non-zero features per row: {nonzero_per_row}")
    print(" βœ… Feature computation test PASSED")
# ── Test 3: Heuristic Fallback ───────────────────────────────────────────────
def test_heuristic_fallback():
    """Check that heuristic_score ranks a clearly better candidate higher.

    Builds a 10 x 37 feature matrix in which cosine score, recency and
    inverse position all decrease with the row index, then asserts the
    first row outscores the last one.
    """
    from app.recommend.reranker import heuristic_score

    n = 10
    features = np.zeros((n, 37), dtype=np.float32)

    # Fill only the columns this test varies; each row is a view into `features`
    for rank, row in enumerate(features):
        row[0] = 0.9 - rank * 0.05  # qdrant_cosine (decreasing)
        row[6] = np.exp(-0.002 * rank * 30)  # recency (decreasing)
        row[35] = 1.0 / (rank + 1)  # position_inverse

    scores = heuristic_score(features)
    assert scores.shape == (n,)
    assert not np.any(np.isnan(scores))

    # First candidate should score higher (better cosine, recency, position)
    top, bottom = scores[0], scores[-1]
    assert top > bottom, \
        f"First candidate ({top:.4f}) should score higher than last ({bottom:.4f})"

    print(f" Heuristic scores: [{top:.4f}, .., {bottom:.4f}]")
    print(" βœ… Heuristic fallback test PASSED")
# ── Test 4: End-to-End Pipeline ──────────────────────────────────────────────
def test_e2e_pipeline():
    """Full pipeline: feature computation → model prediction → ranking.

    Calls ``rerank_candidates`` on 50 synthetic candidates and asserts that
    it returns 50 ids, 50 scores in non-increasing order and a (50, 1024)
    embedding matrix. When ``_USE_LGB`` is truthy it additionally asserts
    that the returned order differs from the input order.
    """
    from app.recommend.reranker import rerank_candidates, _USE_LGB

    n = 50
    candidate_ids = [f"2401.{k:05d}" for k in range(n)]
    embeddings = np.random.randn(n, 1024).astype(np.float32)

    # Cycle through three categories and jitter the citation counts
    metadata = []
    for i, cid in enumerate(candidate_ids):
        subfield = ("CL", "LG", "CV")[i % 3]
        metadata.append(
            {
                "arxiv_id": cid,
                "category": f"cs.{subfield}",
                "published": f"2024-{1 + (i % 12):02d}-{1 + (i % 28):02d}",
                "citation_count": max(0, 500 - i * 10 + np.random.randint(-50, 50)),
                "influential_citations": max(0, 50 - i + np.random.randint(-5, 5)),
                "authors": '["Author A", "Author B"]',
            }
        )

    lt_vec = np.random.randn(1024).astype(np.float32)
    st_vec = np.random.randn(1024).astype(np.float32)
    neg_vec = np.random.randn(1024).astype(np.float32)
    qdrant_scores = [0.95 - k * 0.01 for k in range(n)]

    reranked = rerank_candidates(
        candidate_ids=candidate_ids,
        candidate_embeddings=embeddings,
        candidate_metadata=metadata,
        long_term_vec=lt_vec,
        short_term_vec=st_vec,
        negative_vec=neg_vec,
        qdrant_scores=qdrant_scores,
        cluster_importance=0.6,
        user_total_saves=25,
        user_total_dismissals=5,
    )
    sorted_ids, sorted_scores, sorted_embs = reranked

    assert len(sorted_ids) == n
    assert len(sorted_scores) == n
    assert sorted_embs.shape == (n, 1024)

    # Scores should be in descending order
    for nxt in range(1, len(sorted_scores)):
        i = nxt - 1
        assert sorted_scores[i] >= sorted_scores[nxt], \
            f"Scores not sorted at index {i}: {sorted_scores[i]} < {sorted_scores[nxt]}"

    # The order should differ from the input (reranking should change something)
    if not _USE_LGB:
        print(" Using: Heuristic fallback")
    else:
        assert sorted_ids != candidate_ids, "LightGBM reranking should change the order"
        print(" Using: LightGBM")

    print(f" Reranked {n} candidates")
    print(f" Score range: [{sorted_scores[-1]:.4f}, {sorted_scores[0]:.4f}]")
    print(f" Top-5 IDs: {sorted_ids[:5]}")
    print(" βœ… End-to-end pipeline test PASSED")
# ── Test 5: Latency Benchmark ───────────────────────────────────────────────
def test_latency():
    """Check that the mean LightGBM predict time for 100 candidates is below 1 ms.

    Runs 50 untimed warm-up predictions, then averages the wall-clock time
    of 1000 predictions on a random (100, 37) float32 matrix. Prints a
    notice and returns without asserting when ``_USE_LGB`` is falsy.
    """
    from app.recommend.reranker import _lgb_model, _USE_LGB

    if not _USE_LGB:
        print(" ⏭️ Skipping latency test (no LightGBM model loaded)")
        return

    features = np.random.randn(100, 37).astype(np.float32)

    # Warm up so one-time setup costs are not part of the measurement
    warmup_iters = 50
    for _ in range(warmup_iters):
        _lgb_model.predict(features)

    # Benchmark
    n_iters = 1000
    started = time.perf_counter()
    for _ in range(n_iters):
        _lgb_model.predict(features)
    total_seconds = time.perf_counter() - started
    elapsed_ms = total_seconds * 1000 / n_iters

    print(f" LightGBM predict latency: {elapsed_ms:.3f}ms per 100 candidates")
    assert elapsed_ms < 1.0, f"Too slow: {elapsed_ms:.3f}ms (target: <1ms)"
    print(" βœ… Latency test PASSED")
# ── Test 6: Backward Compatibility ──────────────────────────────────────────
def test_backward_compat():
    """Check that rerank_candidates still accepts the minimal legacy call.

    Passes only ids, embeddings and metadata (no qdrant scores, profile
    vectors or cluster parameters) and asserts the three returned values
    have the expected lengths and shape.
    """
    from app.recommend.reranker import rerank_candidates

    n = 10
    ids = [f"2401.{k:05d}" for k in range(n)]
    embs = np.random.randn(n, 1024).astype(np.float32)
    meta = []
    for cid in ids:
        meta.append({"arxiv_id": cid, "published": "2024-01-01", "category": "cs.CL"})

    # Old signature: only the three required arguments are supplied
    legacy_kwargs = {
        "candidate_ids": ids,
        "candidate_embeddings": embs,
        "candidate_metadata": meta,
    }
    sorted_ids, sorted_scores, sorted_embs = rerank_candidates(**legacy_kwargs)

    assert len(sorted_ids) == n
    assert len(sorted_scores) == n
    assert sorted_embs.shape == (n, 1024)
    print(" βœ… Backward compatibility test PASSED")
# ── Test 7: LightGBM vs Heuristic Comparison ───────────────────────────────
def test_lgb_vs_heuristic():
    """Compare LightGBM and heuristic scores on the same input.

    Builds one feature matrix with ``compute_features``, scores it with
    both ``heuristic_score`` and the loaded LightGBM model, and prints the
    score ranges, the top-5 overlap and Kendall's tau between the two
    scorings. The comparison is informational only: nothing about the
    level of agreement is asserted. Prints a notice and returns early when
    ``_USE_LGB`` is falsy.
    """
    from app.recommend.reranker import compute_features, heuristic_score, _lgb_model, _USE_LGB

    if not _USE_LGB:
        print(" ⏭️ Skipping comparison (no LightGBM model)")
        return

    n = 20
    embeddings = np.random.randn(n, 1024).astype(np.float32)
    metadata = [
        {
            "arxiv_id": f"2401.{i:05d}",
            "category": "cs.CL",
            "published": f"2024-{1 + i % 12:02d}-15",
            "citation_count": i * 50,
            "influential_citations": i * 5,
            "authors": '["Author A"]',
        }
        for i in range(n)
    ]
    qdrant_scores = [0.9 - i * 0.02 for i in range(n)]

    features = compute_features(
        embeddings, metadata,
        qdrant_scores=qdrant_scores,
        user_total_saves=10,
    )
    heur_scores = heuristic_score(features)
    lgb_scores = _lgb_model.predict(features)

    # Candidate indices ordered best-first under each scorer
    heur_order = np.argsort(-heur_scores)
    lgb_order = np.argsort(-lgb_scores)
    overlap_top5 = len(set(heur_order[:5]) & set(lgb_order[:5]))

    print(f" Heuristic score range: [{heur_scores.min():.4f}, {heur_scores.max():.4f}]")
    print(f" LightGBM score range: [{lgb_scores.min():.4f}, {lgb_scores.max():.4f}]")
    print(f" Top-5 overlap: {overlap_top5}/5")
    print(f" Heuristic top-5 positions: {heur_order[:5]}")
    print(f" LightGBM top-5 positions: {lgb_order[:5]}")

    # Kendall's tau - rank correlation.
    # It must be computed on the paired per-candidate scores: element k of
    # both arrays refers to the same candidate. The argsort outputs are
    # lists of candidate indices by position, so correlating them does not
    # measure agreement between the two rankings.
    from scipy.stats import kendalltau
    tau, _ = kendalltau(heur_scores, lgb_scores)
    print(f" Kendall's tau (rank correlation): {tau:.4f}")
    print(" ✅ LGB vs Heuristic comparison PASSED")
# ── Run All Tests ────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # Stand-alone runner: executes every test in order, prints a summary and
    # exits with a non-zero status when at least one test failed.
    import traceback

    tests = [
        ("Smoke Test", test_smoke),
        ("Feature Computation", test_feature_computation),
        ("Heuristic Fallback", test_heuristic_fallback),
        ("End-to-End Pipeline", test_e2e_pipeline),
        ("Latency Benchmark", test_latency),
        ("Backward Compatibility", test_backward_compat),
        ("LGB vs Heuristic", test_lgb_vs_heuristic),
    ]

    print("=" * 60)
    print("Phase 6: LightGBM Reranker Integration Tests")
    print("=" * 60)

    passed = 0
    failed = 0
    for name, test_fn in tests:
        print(f"\n─── {name} ───")
        try:
            test_fn()
        except Exception as e:
            # Top-level boundary: report the failure and keep running the
            # remaining tests instead of aborting the whole run.
            print(f" ❌ FAILED: {e}")
            traceback.print_exc()
            failed += 1
        else:
            passed += 1

    print(f"\n{'=' * 60}")
    print(f"Results: {passed} passed, {failed} failed out of {len(tests)} tests")
    if failed == 0:
        print("✅ ALL TESTS PASSED")
    else:
        print("❌ SOME TESTS FAILED")
    print("=" * 60)

    # Propagate failure through the exit status so shells and CI notice it.
    if failed:
        sys.exit(1)