"""
Phase 6: LightGBM Reranker Integration Tests

Tests:
  1. Smoke test — load model, predict on dummy input
  2. Feature computation — verify 37-feature vector shape and values
  3. Heuristic fallback — verify scoring works without model
  4. End-to-end — full pipeline with simulated user state
  5. Latency benchmark — confirm < 1ms for 100 candidates
  6. Backward compatibility — old call signature still works
  7. LightGBM vs heuristic — compare scores and rankings on the same input
"""
import sys
import os
import time
import numpy as np

# Add project root to path
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))

# ── Test 1: Smoke Test ───────────────────────────────────────────────────────

def test_smoke():
    """Smoke-check the serialized reranker by loading it straight from disk.

    The model file is resolved relative to the parent directory of this file
    and opened with ``lightgbm.Booster`` (no application code involved). The
    booster is then scored on an all-zero batch and on a random batch; each
    time the output shape is checked and NaN predictions are rejected.
    """
    import lightgbm as lgb

    here = os.path.dirname(__file__)
    model_path = os.path.normpath(
        os.path.join(
            here, "..",
            "models", "reranker-phase6", "production_model", "reranker_v1.txt",
        )
    )
    assert os.path.isfile(model_path), f"Model file not found: {model_path}"

    booster = lgb.Booster(model_file=model_path)

    # The booster must have been trained on the 37-column feature layout.
    assert booster.num_feature() == 37, \
        f"Expected 37 features, got {booster.num_feature()}"
    print(f"  Model loaded: {booster.num_trees()} trees, {booster.num_feature()} features")

    # Batch 1: every feature set to zero.
    zero_batch = np.zeros((5, 37), dtype=np.float32)
    zero_scores = booster.predict(zero_batch)
    assert zero_scores.shape == (5,), f"Expected (5,), got {zero_scores.shape}"
    assert not np.isnan(zero_scores).any(), "NaN in predictions"
    print(f"  Zero-input scores: {zero_scores}")

    # Batch 2: standard-normal noise in every feature.
    noise_batch = np.random.randn(10, 37).astype(np.float32)
    noise_scores = booster.predict(noise_batch)
    assert noise_scores.shape == (10,)
    assert not np.isnan(noise_scores).any()
    print(f"  Random-input score range: [{noise_scores.min():.4f}, {noise_scores.max():.4f}]")

    print("  ✅ Smoke test PASSED")


# ── Test 2: Feature Computation ──────────────────────────────────────────────

def test_feature_computation():
    """Check shape, dtype and selected columns of ``compute_features`` output.

    Builds five synthetic candidates (random 1024-d embeddings, fixed
    category/date, citation counts that grow with the row index) plus random
    long-term, short-term and negative profile vectors, then verifies the
    columns this test knows how to predict from its own inputs.
    """
    from app.recommend.reranker import compute_features, NUM_FEATURES

    count = 5
    dim = 1024

    embeddings = np.random.randn(count, dim).astype(np.float32)

    metadata = []
    for k in range(count):
        metadata.append({
            "arxiv_id": f"2401.{k:05d}",
            "category": "cs.CL",
            "published": "2024-01-15",
            "citation_count": k * 100,
            "influential_citations": k * 10,
            "authors": '["Alice Smith", "Bob Jones"]',
        })

    lt_vec = np.random.randn(dim).astype(np.float32)
    st_vec = np.random.randn(dim).astype(np.float32)
    neg_vec = np.random.randn(dim).astype(np.float32)
    qdrant_scores = [0.95 - k * 0.05 for k in range(count)]

    features = compute_features(
        embeddings, metadata, lt_vec, st_vec, neg_vec,
        qdrant_scores=qdrant_scores,
        cluster_importance=0.75,
        suppressed_categories={"cs.CR"},
        onboarding_categories={"cs.CL", "cs.LG"},
        user_total_saves=42,
        user_total_dismissals=8,
    )

    expected_shape = (count, NUM_FEATURES)
    assert features.shape == expected_shape, \
        f"Expected ({count}, {NUM_FEATURES}), got {features.shape}"
    assert features.dtype == np.float32

    # Row-by-row checks of the columns whose values follow from the inputs.
    for i, cosine in enumerate(qdrant_scores):
        row = features[i]

        # Column 0: qdrant_cosine_score
        assert abs(row[0] - cosine) < 1e-5, \
            f"Feature 0 mismatch: {row[0]} vs {cosine}"

        # Column 1: position in the candidate list
        assert row[1] == float(i)

        # Column 2: raw citation_count
        assert row[2] == float(i * 100)

        # Column 3: log_citations = log(citation_count + 1)
        assert abs(row[3] - np.log(i * 100 + 1)) < 1e-5

        # Column 6: recency_score must be positive for the 2024-01-15 date
        assert row[6] > 0, f"Recency should be > 0, got {row[6]}"

        # Column 20: ewma_longterm, non-zero because profiles were supplied
        assert row[20] != 0.0, "EWMA long-term should be computed"

        # Column 23: cluster_importance is passed through
        assert row[23] == 0.75

        # Column 25: suppressed flag is off (cs.CL is not in {"cs.CR"})
        assert row[25] == 0.0

        # Column 26: onboarding flag is on (cs.CL is in the onboarding set)
        assert row[26] == 1.0

        # Column 27: total_saves
        assert row[27] == 42.0

        # Column 35: position_inverse = 1 / (position + 1)
        assert abs(row[35] - 1.0 / (i + 1)) < 1e-5

    # No column may contain NaN.
    assert not np.isnan(features).any(), "NaN in features"

    print(f"  Feature matrix shape: {features.shape}")
    print(f"  Feature value range: [{features.min():.4f}, {features.max():.4f}]")
    print(f"  Non-zero features per row: {(features != 0).sum(axis=1)}")
    print("  ✅ Feature computation test PASSED")


# ── Test 3: Heuristic Fallback ───────────────────────────────────────────────

def test_heuristic_fallback():
    """Verify that ``heuristic_score`` ranks an obviously better row higher.

    A zero feature matrix is filled in three columns only (cosine score,
    recency, inverse position), each strictly decreasing with the row index,
    so row 0 is the best candidate on every populated signal and the last
    row is the worst.
    """
    # NUM_FEATURES keeps the fixture width in sync with the reranker module
    # instead of repeating the literal feature count here.
    from app.recommend.reranker import heuristic_score, NUM_FEATURES

    n = 10
    features = np.zeros((n, NUM_FEATURES), dtype=np.float32)

    # Populate only the columns this test drives; all others stay zero.
    idx = np.arange(n)
    features[:, 0] = 0.9 - idx * 0.05            # qdrant_cosine (decreasing)
    features[:, 6] = np.exp(-0.002 * idx * 30)   # recency decays as age grows by 30 per row
    features[:, 35] = 1.0 / (idx + 1)            # position_inverse

    scores = heuristic_score(features)

    assert scores.shape == (n,)
    assert not np.any(np.isnan(scores))
    # First candidate should score higher (better cosine, recency, position)
    assert scores[0] > scores[-1], \
        f"First candidate ({scores[0]:.4f}) should score higher than last ({scores[-1]:.4f})"

    print(f"  Heuristic scores: [{scores[0]:.4f}, .., {scores[-1]:.4f}]")
    print("  ✅ Heuristic fallback test PASSED")


# ── Test 4: End-to-End Pipeline ──────────────────────────────────────────────

def test_e2e_pipeline():
    """Run ``rerank_candidates`` end to end on 50 synthetic candidates.

    Checks that the three returned sequences keep the input size, that the
    scores come back in non-increasing order and, when the reranker module
    reports LightGBM is in use, that the order differs from the input order.
    """
    from app.recommend.reranker import rerank_candidates, _USE_LGB

    n = 50
    dim = 1024
    subfields = ("CL", "LG", "CV")  # cycled through by index modulo 3

    candidate_ids = [f"2401.{i:05d}" for i in range(n)]
    embeddings = np.random.randn(n, dim).astype(np.float32)

    metadata = []
    for i, cid in enumerate(candidate_ids):
        # Citation counts trend downwards with the index, with random jitter,
        # and are clamped at zero.
        citations = max(0, 500 - i * 10 + np.random.randint(-50, 50))
        influential = max(0, 50 - i + np.random.randint(-5, 5))
        metadata.append({
            "arxiv_id": cid,
            "category": f"cs.{subfields[i % 3]}",
            "published": f"2024-{1 + (i % 12):02d}-{1 + (i % 28):02d}",
            "citation_count": citations,
            "influential_citations": influential,
            "authors": '["Author A", "Author B"]',
        })

    lt_vec = np.random.randn(dim).astype(np.float32)
    st_vec = np.random.randn(dim).astype(np.float32)
    neg_vec = np.random.randn(dim).astype(np.float32)
    qdrant_scores = [0.95 - i * 0.01 for i in range(n)]

    ranked_ids, ranked_scores, ranked_embs = rerank_candidates(
        candidate_ids=candidate_ids,
        candidate_embeddings=embeddings,
        candidate_metadata=metadata,
        long_term_vec=lt_vec,
        short_term_vec=st_vec,
        negative_vec=neg_vec,
        qdrant_scores=qdrant_scores,
        cluster_importance=0.6,
        user_total_saves=25,
        user_total_dismissals=5,
    )

    # Nothing may be dropped or added by the reranker.
    assert len(ranked_ids) == n
    assert len(ranked_scores) == n
    assert ranked_embs.shape == (n, dim)

    # Every adjacent pair must be in non-increasing score order.
    for i, (current, following) in enumerate(zip(ranked_scores, ranked_scores[1:])):
        assert current >= following, \
            f"Scores not sorted at index {i}: {current} < {following}"

    # With the learned model active, the input order is expected to change.
    if _USE_LGB:
        assert ranked_ids != candidate_ids, "LightGBM reranking should change the order"
        print("  Using: LightGBM")
    else:
        print("  Using: Heuristic fallback")

    print(f"  Reranked {n} candidates")
    print(f"  Score range: [{ranked_scores[-1]:.4f}, {ranked_scores[0]:.4f}]")
    print(f"  Top-5 IDs: {ranked_ids[:5]}")
    print("  ✅ End-to-end pipeline test PASSED")


# ── Test 5: Latency Benchmark ───────────────────────────────────────────────

def test_latency():
    """Benchmark LightGBM inference on a 100-row batch against a 1 ms budget.

    When the reranker module reports that LightGBM is not in use, a notice is
    printed and the function returns without measuring anything. Otherwise
    the model is warmed up with 50 calls and the mean wall-clock time of
    1000 ``predict`` calls must stay below 1 ms.
    """
    # NUM_FEATURES keeps the batch width in sync with the reranker module
    # instead of repeating the literal feature count here.
    from app.recommend.reranker import _lgb_model, _USE_LGB, NUM_FEATURES

    if not _USE_LGB:
        print("  ⏭️ Skipping latency test (no LightGBM model loaded)")
        return

    features = np.random.randn(100, NUM_FEATURES).astype(np.float32)

    # Warm up so one-off initialisation cost is excluded from the timing.
    for _ in range(50):
        _lgb_model.predict(features)

    # Benchmark: mean latency per call, in milliseconds.
    n_iters = 1000
    t0 = time.perf_counter()
    for _ in range(n_iters):
        _lgb_model.predict(features)
    elapsed_ms = (time.perf_counter() - t0) * 1000 / n_iters

    print(f"  LightGBM predict latency: {elapsed_ms:.3f}ms per 100 candidates")
    assert elapsed_ms < 1.0, f"Too slow: {elapsed_ms:.3f}ms (target: <1ms)"
    print("  ✅ Latency test PASSED")


# ── Test 6: Backward Compatibility ──────────────────────────────────────────

def test_backward_compat():
    """Call ``rerank_candidates`` with only ids, embeddings and metadata.

    No qdrant scores, profile vectors or cluster/user parameters are passed;
    the call must still return three results sized like the input.
    """
    from app.recommend.reranker import rerank_candidates

    count = 10
    dim = 1024

    ids = [f"2401.{k:05d}" for k in range(count)]
    embs = np.random.randn(count, dim).astype(np.float32)
    meta = []
    for cid in ids:
        meta.append({"arxiv_id": cid, "published": "2024-01-01", "category": "cs.CL"})

    # Minimal call: every optional argument is left at its default.
    result = rerank_candidates(
        candidate_ids=ids,
        candidate_embeddings=embs,
        candidate_metadata=meta,
    )
    out_ids, out_scores, out_embs = result

    assert len(out_ids) == count
    assert len(out_scores) == count
    assert out_embs.shape == (count, dim)
    print("  ✅ Backward compatibility test PASSED")


# ── Test 7: LightGBM vs Heuristic Comparison ───────────────────────────────

def test_lgb_vs_heuristic():
    """Compare LightGBM and heuristic scores computed from the same features.

    This is a reporting test: it prints both score ranges, the overlap of the
    two top-5 sets and Kendall's tau between the two scorings. When the
    reranker module reports that LightGBM is not in use, a notice is printed
    and the function returns.
    """
    from app.recommend.reranker import compute_features, heuristic_score, _lgb_model, _USE_LGB

    if not _USE_LGB:
        print("  ⏭️ Skipping comparison (no LightGBM model)")
        return

    n = 20
    embeddings = np.random.randn(n, 1024).astype(np.float32)
    metadata = [
        {
            "arxiv_id": f"2401.{i:05d}",
            "category": "cs.CL",
            "published": f"2024-{1 + i % 12:02d}-15",
            "citation_count": i * 50,
            "influential_citations": i * 5,
            "authors": '["Author A"]',
        }
        for i in range(n)
    ]
    qdrant_scores = [0.9 - i * 0.02 for i in range(n)]

    features = compute_features(
        embeddings, metadata,
        qdrant_scores=qdrant_scores,
        user_total_saves=10,
    )

    heur_scores = heuristic_score(features)
    lgb_scores = _lgb_model.predict(features)

    # Candidate indices ordered best-first under each scorer (for reporting).
    heur_order = np.argsort(-heur_scores)
    lgb_order = np.argsort(-lgb_scores)

    overlap_top5 = len(set(heur_order[:5]) & set(lgb_order[:5]))

    print(f"  Heuristic score range: [{heur_scores.min():.4f}, {heur_scores.max():.4f}]")
    print(f"  LightGBM score range:  [{lgb_scores.min():.4f}, {lgb_scores.max():.4f}]")
    print(f"  Top-5 overlap: {overlap_top5}/5")
    print(f"  Heuristic top-5 positions: {heur_order[:5]}")
    print(f"  LightGBM  top-5 positions: {lgb_order[:5]}")

    # Kendall's tau - rank correlation.
    # It must be fed the per-candidate scores (element k of both arrays refers
    # to candidate k); kendalltau ranks them internally. Passing the argsort
    # outputs instead would pair up "the item at position k" of each ordering,
    # which is not the rank correlation between the two scorers.
    from scipy.stats import kendalltau
    tau, _ = kendalltau(heur_scores, lgb_scores)
    print(f"  Kendall's tau (rank correlation): {tau:.4f}")
    print("  ✅ LGB vs Heuristic comparison PASSED")


# ── Run All Tests ────────────────────────────────────────────────────────────

if __name__ == "__main__":
    # Script entry point: run every test in order, print a summary, and exit
    # with a non-zero status if anything failed so that shells and CI jobs
    # can detect the failure.
    import traceback

    tests = [
        ("Smoke Test", test_smoke),
        ("Feature Computation", test_feature_computation),
        ("Heuristic Fallback", test_heuristic_fallback),
        ("End-to-End Pipeline", test_e2e_pipeline),
        ("Latency Benchmark", test_latency),
        ("Backward Compatibility", test_backward_compat),
        ("LGB vs Heuristic", test_lgb_vs_heuristic),
    ]

    print("=" * 60)
    print("Phase 6: LightGBM Reranker Integration Tests")
    print("=" * 60)

    passed = 0
    failed = 0
    for name, test_fn in tests:
        print(f"\n─── {name} ───")
        try:
            test_fn()
        except Exception as e:
            # Top-level boundary: report the failure (assertion errors
            # included) and carry on with the remaining tests.
            print(f"  ❌ FAILED: {e}")
            traceback.print_exc()
            failed += 1
        else:
            passed += 1

    print(f"\n{'=' * 60}")
    print(f"Results: {passed} passed, {failed} failed out of {len(tests)} tests")
    if failed == 0:
        print("✅ ALL TESTS PASSED")
    else:
        print("❌ SOME TESTS FAILED")
    print("=" * 60)

    # Without this the process would exit 0 even when tests failed.
    if failed:
        sys.exit(1)