Spaces:
Running
Running
| """ | |
| Phase 6: LightGBM Reranker Integration Tests | |
| Tests: | |
| 1. Smoke test — load model, predict on dummy input | |
| 2. Feature computation — verify 37-feature vector shape and values | |
| 3. Heuristic fallback — verify scoring works without model | |
| 4. End-to-end — full pipeline with simulated user state | |
| 5. Latency benchmark — confirm < 1ms for 100 candidates | |
| 6. Backward compatibility — old call signature still works | |
| 7. LightGBM vs heuristic — compare score ranges and rankings on the same input | |
| """ | |
| import sys | |
| import os | |
| import time | |
| import numpy as np | |
| # Add project root to path | |
| sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) | |
| # ── Test 1: Smoke Test ─────────────────────────────────────────────────────── | |
def test_smoke():
    """Load the production LightGBM booster from disk and score dummy batches.

    Checks that the model file exists next to the project root, that the
    booster reports 37 input features, and that predictions on an all-zero
    batch and on a random batch have the expected shape and contain no NaN.
    """
    import lightgbm as lgb

    # The model lives under <project root>/models/..., one level above this file.
    relative_parts = (
        "..", "models", "reranker-phase6", "production_model", "reranker_v1.txt",
    )
    model_path = os.path.normpath(
        os.path.join(os.path.dirname(__file__), *relative_parts)
    )
    assert os.path.isfile(model_path), f"Model file not found: {model_path}"

    model = lgb.Booster(model_file=model_path)

    # The booster must accept exactly the 37-column feature layout.
    assert model.num_feature() == 37, f"Expected 37 features, got {model.num_feature()}"
    print(f" Model loaded: {model.num_trees()} trees, {model.num_feature()} features")

    # All-zero batch: five rows in, five finite scores out.
    zero_batch = np.zeros((5, 37), dtype=np.float32)
    scores = model.predict(zero_batch)
    assert scores.shape == (5,), f"Expected (5,), got {scores.shape}"
    assert not np.isnan(scores).any(), "NaN in predictions"
    print(f" Zero-input scores: {scores}")

    # Random batch: ten rows in, ten finite scores out.
    noise_batch = np.random.randn(10, 37).astype(np.float32)
    scores = model.predict(noise_batch)
    assert scores.shape == (10,)
    assert not np.isnan(scores).any()
    print(f" Random-input score range: [{scores.min():.4f}, {scores.max():.4f}]")

    print(" β Smoke test PASSED")
| # ── Test 2: Feature Computation ────────────────────────────────────────────── | |
def test_feature_computation():
    """Check the matrix returned by ``compute_features`` for five fake papers.

    The result must have shape ``(n, NUM_FEATURES)`` and dtype float32,
    contain no NaN, and carry the expected values in the spot-checked
    columns (0, 1, 2, 3, 6, 20, 23, 25, 26, 27 and 35).
    """
    from app.recommend.reranker import compute_features, NUM_FEATURES

    n = 5
    dim = 1024

    def _paper(idx):
        # One synthetic candidate; citation counts scale with the index.
        return {
            "arxiv_id": f"2401.{idx:05d}",
            "category": "cs.CL",
            "published": "2024-01-15",
            "citation_count": idx * 100,
            "influential_citations": idx * 10,
            "authors": '["Alice Smith", "Bob Jones"]',
        }

    embeddings = np.random.randn(n, dim).astype(np.float32)
    metadata = [_paper(idx) for idx in range(n)]
    lt_vec = np.random.randn(dim).astype(np.float32)
    st_vec = np.random.randn(dim).astype(np.float32)
    neg_vec = np.random.randn(dim).astype(np.float32)
    qdrant_scores = [0.95 - idx * 0.05 for idx in range(n)]

    features = compute_features(
        embeddings, metadata, lt_vec, st_vec, neg_vec,
        qdrant_scores=qdrant_scores,
        cluster_importance=0.75,
        suppressed_categories={"cs.CR"},
        onboarding_categories={"cs.CL", "cs.LG"},
        user_total_saves=42,
        user_total_dismissals=8,
    )

    assert features.shape == (n, NUM_FEATURES), f"Expected ({n}, {NUM_FEATURES}), got {features.shape}"
    assert features.dtype == np.float32

    # Spot-check individual columns row by row.
    for row, cosine in enumerate(qdrant_scores):
        # Column 0: qdrant cosine score passed in by the caller.
        assert abs(features[row, 0] - cosine) < 1e-5, \
            f"Feature 0 mismatch: {features[row, 0]} vs {cosine}"
        # Column 1: position of the candidate in the input list.
        assert features[row, 1] == float(row)
        # Column 2: raw citation count.
        assert features[row, 2] == float(row * 100)
        # Column 3: log(citation_count + 1).
        assert abs(features[row, 3] - np.log(row * 100 + 1)) < 1e-5
        # Column 6: recency score, expected positive for a 2024-01-15 paper.
        assert features[row, 6] > 0, f"Recency should be > 0, got {features[row, 6]}"
        # Column 20: long-term EWMA feature; profiles were supplied, so non-zero.
        assert features[row, 20] != 0.0, "EWMA long-term should be computed"
        # Column 23: cluster importance passed in by the caller.
        assert features[row, 23] == 0.75
        # Column 25: suppressed flag; cs.CL is not in the suppressed set {cs.CR}.
        assert features[row, 25] == 0.0
        # Column 26: onboarding flag; cs.CL is in the onboarding set.
        assert features[row, 26] == 1.0
        # Column 27: the user's total saves.
        assert features[row, 27] == 42.0
        # Column 35: inverse position, 1 / (position + 1).
        assert abs(features[row, 35] - 1.0 / (row + 1)) < 1e-5

    # The whole matrix must be NaN-free.
    assert not np.isnan(features).any(), "NaN in features"

    print(f" Feature matrix shape: {features.shape}")
    print(f" Feature value range: [{features.min():.4f}, {features.max():.4f}]")
    print(f" Non-zero features per row: {(features != 0).sum(axis=1)}")
    print(" β Feature computation test PASSED")
| # ── Test 3: Heuristic Fallback ─────────────────────────────────────────────── | |
def test_heuristic_fallback():
    """Check ``heuristic_score`` on a hand-built 10x37 feature matrix.

    Columns 0 (cosine), 6 (recency) and 35 (inverse position) are filled
    with values that decrease with the row index; every other column is
    zero. The first row must therefore out-score the last one.
    """
    from app.recommend.reranker import heuristic_score

    n = 10
    features = np.zeros((n, 37), dtype=np.float32)

    # Each row is a view into ``features``, so the writes land in the matrix.
    for rank, row in enumerate(features):
        row[0] = 0.9 - rank * 0.05               # qdrant cosine, decreasing
        row[6] = np.exp(-0.002 * rank * 30)      # recency, decaying with rank
        row[35] = 1.0 / (rank + 1)               # inverse position

    scores = heuristic_score(features)

    assert scores.shape == (n,)
    assert not np.isnan(scores).any()

    # Best cosine, recency and position all belong to row 0.
    best, worst = scores[0], scores[-1]
    assert best > worst, \
        f"First candidate ({best:.4f}) should score higher than last ({worst:.4f})"

    print(f" Heuristic scores: [{best:.4f}, .., {worst:.4f}]")
    print(" β Heuristic fallback test PASSED")
| # ── Test 4: End-to-End Pipeline ────────────────────────────────────────────── | |
def test_e2e_pipeline():
    """Run ``rerank_candidates`` on 50 synthetic candidates and check its output.

    The call must return ids, scores and embeddings for all candidates,
    with scores in non-increasing order. When the LightGBM model is active
    (``_USE_LGB``), the returned order must differ from the input order.
    """
    from app.recommend.reranker import rerank_candidates, _USE_LGB

    n = 50
    dim = 1024
    subfields = ("CL", "LG", "CV")

    candidate_ids = [f"2401.{i:05d}" for i in range(n)]
    embeddings = np.random.randn(n, dim).astype(np.float32)

    # Citation counts trend downward with the index, with some random jitter.
    metadata = []
    for i, cid in enumerate(candidate_ids):
        month = 1 + (i % 12)
        day = 1 + (i % 28)
        citations = max(0, 500 - i * 10 + np.random.randint(-50, 50))
        influential = max(0, 50 - i + np.random.randint(-5, 5))
        metadata.append({
            "arxiv_id": cid,
            "category": f"cs.{subfields[i % 3]}",
            "published": f"2024-{month:02d}-{day:02d}",
            "citation_count": citations,
            "influential_citations": influential,
            "authors": '["Author A", "Author B"]',
        })

    lt_vec = np.random.randn(dim).astype(np.float32)
    st_vec = np.random.randn(dim).astype(np.float32)
    neg_vec = np.random.randn(dim).astype(np.float32)
    qdrant_scores = [0.95 - i * 0.01 for i in range(n)]

    sorted_ids, sorted_scores, sorted_embs = rerank_candidates(
        candidate_ids=candidate_ids,
        candidate_embeddings=embeddings,
        candidate_metadata=metadata,
        long_term_vec=lt_vec,
        short_term_vec=st_vec,
        negative_vec=neg_vec,
        qdrant_scores=qdrant_scores,
        cluster_importance=0.6,
        user_total_saves=25,
        user_total_dismissals=5,
    )

    assert len(sorted_ids) == n
    assert len(sorted_scores) == n
    assert sorted_embs.shape == (n, dim)

    # Every score must be at least as large as the one that follows it.
    for i, (current, following) in enumerate(zip(sorted_scores, sorted_scores[1:])):
        assert current >= following, \
            f"Scores not sorted at index {i}: {current} < {following}"

    # With the learned model active, reranking is expected to reorder the input.
    if _USE_LGB:
        assert sorted_ids != candidate_ids, "LightGBM reranking should change the order"
        print(f" Using: LightGBM")
    else:
        print(f" Using: Heuristic fallback")

    print(f" Reranked {n} candidates")
    print(f" Score range: [{sorted_scores[-1]:.4f}, {sorted_scores[0]:.4f}]")
    print(f" Top-5 IDs: {sorted_ids[:5]}")
    print(" β End-to-end pipeline test PASSED")
| # ── Test 5: Latency Benchmark ──────────────────────────────────────────────── | |
def test_latency():
    """Benchmark ``_lgb_model.predict`` on a 100x37 batch; require < 1 ms per call.

    Skipped (with a message) when no LightGBM model is loaded. The mean
    latency is measured over 1000 calls after 50 warm-up calls.
    """
    from app.recommend.reranker import _lgb_model, _USE_LGB

    if not _USE_LGB:
        print(" βοΈ Skipping latency test (no LightGBM model loaded)")
        return

    warmup_iters = 50
    n_iters = 1000
    features = np.random.randn(100, 37).astype(np.float32)

    # Warm-up calls are excluded from the measurement.
    for _ in range(warmup_iters):
        _lgb_model.predict(features)

    # Timed section: mean wall-clock time per predict() call.
    started = time.perf_counter()
    for _ in range(n_iters):
        _lgb_model.predict(features)
    total_seconds = time.perf_counter() - started
    elapsed_ms = total_seconds * 1000 / n_iters

    print(f" LightGBM predict latency: {elapsed_ms:.3f}ms per 100 candidates")
    assert elapsed_ms < 1.0, f"Too slow: {elapsed_ms:.3f}ms (target: <1ms)"
    print(" β Latency test PASSED")
| # ── Test 6: Backward Compatibility ─────────────────────────────────────────── | |
def test_backward_compat():
    """Call ``rerank_candidates`` with only ids, embeddings and metadata.

    No qdrant scores, profile vectors or cluster/user parameters are
    passed; the call must still return ids, scores and embeddings for all
    ten candidates.
    """
    from app.recommend.reranker import rerank_candidates

    n = 10
    dim = 1024
    ids = [f"2401.{i:05d}" for i in range(n)]
    embs = np.random.randn(n, dim).astype(np.float32)

    # Minimal metadata: only the three keys used by the old call sites.
    meta = []
    for cid in ids:
        meta.append({"arxiv_id": cid, "published": "2024-01-01", "category": "cs.CL"})

    # Legacy call shape: every newer keyword argument is left at its default.
    result = rerank_candidates(
        candidate_ids=ids,
        candidate_embeddings=embs,
        candidate_metadata=meta,
    )
    sorted_ids, sorted_scores, sorted_embs = result

    assert len(sorted_ids) == n
    assert len(sorted_scores) == n
    assert sorted_embs.shape == (n, dim)
    print(" β Backward compatibility test PASSED")
| # ── Test 7: LightGBM vs Heuristic Comparison ───────────────────────────────── | |
def test_lgb_vs_heuristic():
    """Compare LightGBM and heuristic scores computed from the same features.

    Builds 20 synthetic candidates, scores them with ``heuristic_score`` and
    with ``_lgb_model.predict``, and prints the score ranges, the top-5
    overlap and Kendall's tau between the two scorings. This test is
    informational: it reports the comparison and asserts nothing about it.
    Skipped (with a message) when no LightGBM model is loaded.
    """
    from app.recommend.reranker import compute_features, heuristic_score, _lgb_model, _USE_LGB

    if not _USE_LGB:
        print(" βοΈ Skipping comparison (no LightGBM model)")
        return

    n = 20
    embeddings = np.random.randn(n, 1024).astype(np.float32)
    metadata = [
        {
            "arxiv_id": f"2401.{i:05d}",
            "category": "cs.CL",
            "published": f"2024-{1 + i % 12:02d}-15",
            "citation_count": i * 50,
            "influential_citations": i * 5,
            "authors": '["Author A"]',
        }
        for i in range(n)
    ]
    qdrant_scores = [0.9 - i * 0.02 for i in range(n)]

    features = compute_features(
        embeddings, metadata,
        qdrant_scores=qdrant_scores,
        user_total_saves=10,
    )
    heur_scores = heuristic_score(features)
    lgb_scores = _lgb_model.predict(features)

    # Candidate indices ordered best-first under each scorer (reported only).
    heur_order = np.argsort(-heur_scores)
    lgb_order = np.argsort(-lgb_scores)
    overlap_top5 = len(set(heur_order[:5]) & set(lgb_order[:5]))

    print(f" Heuristic score range: [{heur_scores.min():.4f}, {heur_scores.max():.4f}]")
    print(f" LightGBM score range: [{lgb_scores.min():.4f}, {lgb_scores.max():.4f}]")
    print(f" Top-5 overlap: {overlap_top5}/5")
    print(f" Heuristic top-5 positions: {heur_order[:5]}")
    print(f" LightGBM top-5 positions: {lgb_order[:5]}")

    # Kendall's tau must be computed on per-candidate values (scores or
    # ranks), paired by candidate. The argsort outputs above are instead
    # indexed by rank position, so correlating them pairs unrelated items and
    # does not measure agreement between the two rankings. Tau depends only
    # on ordering, so the raw scores can be passed directly.
    from scipy.stats import kendalltau
    tau, _ = kendalltau(heur_scores, lgb_scores)
    print(f" Kendall's tau (rank correlation): {tau:.4f}")
    print(" β LGB vs Heuristic comparison PASSED")
| # ── Run All Tests ──────────────────────────────────────────────────────────── | |
if __name__ == "__main__":
    # Script entry point: run every test in order, print a summary, and exit
    # with a non-zero status if any test failed so that a shell or CI job can
    # detect the failure.
    import traceback

    tests = [
        ("Smoke Test", test_smoke),
        ("Feature Computation", test_feature_computation),
        ("Heuristic Fallback", test_heuristic_fallback),
        ("End-to-End Pipeline", test_e2e_pipeline),
        ("Latency Benchmark", test_latency),
        ("Backward Compatibility", test_backward_compat),
        ("LGB vs Heuristic", test_lgb_vs_heuristic),
    ]

    print("=" * 60)
    print("Phase 6: LightGBM Reranker Integration Tests")
    print("=" * 60)

    passed = 0
    failed = 0
    for name, test_fn in tests:
        print(f"\nβββ {name} βββ")
        try:
            test_fn()
        except Exception as e:
            # Broad catch is deliberate at this top-level boundary: one failing
            # test must not stop the remaining tests from running.
            print(f" β FAILED: {e}")
            traceback.print_exc()
            failed += 1
        else:
            passed += 1

    print(f"\n{'=' * 60}")
    print(f"Results: {passed} passed, {failed} failed out of {len(tests)} tests")
    if failed == 0:
        print("β ALL TESTS PASSED")
    else:
        print("β SOME TESTS FAILED")
    print("=" * 60)

    # Without this the process would report success (status 0) even when
    # tests failed.
    if failed:
        sys.exit(1)