File size: 2,315 Bytes
fec1e1a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
"""
Quick-start: Load and use the ResearchIT Phase 6 reranker.

Usage:
    python load_model.py

Or import in your code:
    from load_model import load_reranker, predict_scores
"""
import numpy as np

def load_reranker(model_path: str = "production_model/reranker_v1.txt"):
    """Load the LightGBM reranker model.

    Args:
        model_path: Path of the saved model file, passed to
            ``lgb.Booster(model_file=...)``.

    Returns:
        The loaded ``lgb.Booster``.

    Raises:
        ValueError: if the loaded model does not report exactly 37 features.
    """
    # Imported inside the function so that importing this module does not
    # itself require lightgbm.
    import lightgbm as lgb

    model = lgb.Booster(model_file=model_path)
    n_features = model.num_feature()
    # Explicit raise rather than `assert`: assert statements are removed when
    # Python runs with -O, which would let a mismatched model through.
    if n_features != 37:
        raise ValueError(f"Expected 37 features, got {n_features}")
    return model

def predict_scores(model, features: np.ndarray) -> np.ndarray:
    """
    Predict reranking scores for candidates.

    Args:
        model: LightGBM Booster (only its ``predict(features)`` method is used)
        features: (N, 37) float32 array — see feature_schema.json for column order

    Returns:
        (N,) float64 array — higher score = more relevant

    Raises:
        ValueError: if ``features`` is not 2-D or does not have 37 columns.
    """
    # Explicit checks rather than `assert`: assert statements are removed when
    # Python runs with -O, and indexing shape[1] on a 1-D array would surface
    # as an opaque IndexError instead of a clear message.
    if features.ndim != 2:
        raise ValueError(
            f"Expected a 2-D (N, 37) feature array, got shape {features.shape}"
        )
    n_columns = features.shape[1]
    if n_columns != 37:
        raise ValueError(f"Expected 37 features, got {n_columns}")
    return model.predict(features)

# Feature schema: column names in the exact order the feature matrix must use.
# The trailing comment on each entry is its zero-based column index.
FEATURE_SCHEMA = [
    "qdrant_cosine_score",              # 0
    "candidate_position",               # 1
    "candidate_citation_count",         # 2
    "candidate_log_citations",          # 3
    "candidate_influential_citations",  # 4
    "candidate_age_days",               # 5
    "candidate_recency_score",          # 6
    "query_citation_count",             # 7
    "query_age_days",                   # 8
    "year_diff",                        # 9
    "same_primary_category",            # 10
    "co_citation_count",                # 11
    "shared_author_count",              # 12
    "candidate_is_newer",               # 13
    "query_log_citations",              # 14
    "citation_count_ratio",             # 15
    "age_ratio",                        # 16
    "candidate_citations_per_year",     # 17
    "query_num_references",             # 18
    "candidate_num_cited_by",           # 19
    "ewma_longterm_similarity",         # 20
    "ewma_shortterm_similarity",        # 21
    "ewma_negative_similarity",         # 22
    "cluster_importance",               # 23
    "cluster_distance_to_medoid",       # 24
    "is_suppressed_category",           # 25
    "onboarding_category_match",        # 26
    "user_total_saves",                 # 27
    "user_total_dismissals",            # 28
    "user_days_since_last_save",        # 29
    "user_session_save_count",          # 30
    "cosine_x_recency",                 # 31
    "cosine_x_citations",               # 32
    "category_x_recency",               # 33
    "cosine_x_cocitation",              # 34
    "position_inverse",                 # 35
    "citations_x_recency",              # 36
]

if __name__ == "__main__":
    # Smoke test: load the default model file and report its size.
    model = load_reranker()
    print(f"Model loaded: {model.num_trees()} trees, {model.num_feature()} features")

    # Score an all-zero batch of 10 rows x 37 columns to confirm that
    # prediction runs end to end; only the first five scores are printed.
    batch_shape = (10, 37)
    zero_batch = np.zeros(batch_shape, dtype=np.float32)
    scores = predict_scores(model, zero_batch)
    print(f"Dummy scores: {scores[:5]}")
    print("✅ Model works!")