Add 03_train_lightgbm.py

41fda29 verified about 1 month ago

22 kB

	"""
	Step 3: Train LightGBM lambdarank reranker + compare against heuristic baseline.

	Produces:
	- reranker_v1.txt — trained LightGBM model (~100KB)
	- eval_metrics.json — baseline + LightGBM metrics (nDCG@5/10/20, Recall@10/50, HR@10, MRR), comparison, training config, latency benchmark, feature importance
	- feature_importance.csv — ranked feature importance
	- baseline_comparison.json — LightGBM vs heuristic scorer on same eval set

	Usage:
	python 03_train_lightgbm.py \
	--train-file ltr_dataset/train.parquet \
	--eval-file ltr_dataset/eval.parquet \
	--output-dir ./model_output \
	--num-boost-round 500 \
	--learning-rate 0.05

	Prerequisites:
	- train.parquet + eval.parquet from Step 2
	- pip install lightgbm pyarrow numpy

	The heuristic baseline replicates the EXACT scoring logic from
	app/recommend/reranker.py → heuristic_score():
	score = 0.40 × lt_sim + 0.25 × st_sim + 0.15 × recency
	+ 0.10 × rrf_conf - 0.15 × neg_penalty

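	Since pseudo-label training has no user profiles (features 20-30 = 0),
	the st_sim and neg_penalty terms drop out. As implemented in
	heuristic_baseline_score(), qdrant_cosine_score stands in for lt_sim, so
	the heuristic baseline for pseudo-labels is:
	score = 0.40 × qdrant_cosine_score + 0.15 × recency
	+ 0.10 × (1 - position/(max_position + 1))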

	This is the fair baseline: both models see the same zero-filled user features.

	Author: ResearchIT ML Pipeline — Phase 6, Step 3
	"""
	from __future__ import annotations

	import argparse
	import json
	import os
	import time
	from collections import defaultdict
	from pathlib import Path

	import lightgbm as lgb
	import numpy as np
	import pyarrow.parquet as pq


	# ── Feature schema (must match Step 2) ───────────────────────────────────────

	# Names of the model input columns, in feature-matrix column order.
	# The order is significant: heuristic_baseline_score() reads columns by
	# position, and the list must stay in sync with the Step 2 dataset.
	FEATURE_SCHEMA = [
	    # 0-19: query / candidate features
	    "qdrant_cosine_score",
	    "candidate_position",
	    "candidate_citation_count",
	    "candidate_log_citations",
	    "candidate_influential_citations",
	    "candidate_age_days",
	    "candidate_recency_score",
	    "query_citation_count",
	    "query_age_days",
	    "year_diff",
	    "same_primary_category",
	    "co_citation_count",
	    "shared_author_count",
	    "candidate_is_newer",
	    "query_log_citations",
	    "citation_count_ratio",
	    "age_ratio",
	    "candidate_citations_per_year",
	    "query_num_references",
	    "candidate_num_cited_by",
	    # 20-30: user-profile features (zero-filled for pseudo-label data)
	    "ewma_longterm_similarity",
	    "ewma_shortterm_similarity",
	    "ewma_negative_similarity",
	    "cluster_importance",
	    "cluster_distance_to_medoid",
	    "is_suppressed_category",
	    "onboarding_category_match",
	    "user_total_saves",
	    "user_total_dismissals",
	    "user_days_since_last_save",
	    "user_session_save_count",
	    # 31-36: crossed features
	    "cosine_x_recency",
	    "cosine_x_citations",
	    "category_x_recency",
	    "cosine_x_cocitation",
	    "position_inverse",
	    "citations_x_recency",
	]

	# Width of the feature matrix (37), derived from the schema itself.
	NUM_FEATURES = len(FEATURE_SCHEMA)


	# ── Data Loading ─────────────────────────────────────────────────────────────

	def load_ltr_data(parquet_path: str) -> tuple[np.ndarray, np.ndarray, list[int], list[str]]:
	    """
	    Load a parquet file into LightGBM-ready format.

	    Rows belonging to the same query are expected to be stored contiguously:
	    a new group is started every time ``query_arxiv_id`` changes from one
	    row to the next.

	    Args:
	        parquet_path: path to a parquet file containing ``query_arxiv_id``,
	            ``label`` and every column named in FEATURE_SCHEMA.

	    Returns:
	        features: (N, NUM_FEATURES) float32 matrix, columns in FEATURE_SCHEMA order
	        labels: (N,) int32 array
	        groups: list of group sizes (candidates per query), summing to N
	        query_ids: list of query arXiv IDs (one per row, for analysis)

	    Raises:
	        ValueError: if the group sizes or the feature matrix shape are
	            inconsistent with the number of rows.
	    """
	    table = pq.read_table(parquet_path)

	    query_ids = table.column("query_arxiv_id").to_pylist()
	    labels = np.array(table.column("label").to_pylist(), dtype=np.int32)

	    # One column per schema entry, stacked in schema order.
	    features = np.column_stack(
	        [table.column(fname).to_pylist() for fname in FEATURE_SCHEMA]
	    ).astype(np.float32)

	    # Run-length encode consecutive query IDs into group sizes. A private
	    # sentinel (rather than None) marks "no previous row", so a null query
	    # ID in the first rows is still counted as its own group.
	    groups: list[int] = []
	    previous_qid: object = object()
	    for qid in query_ids:
	        if groups and qid == previous_qid:
	            groups[-1] += 1
	        else:
	            groups.append(1)
	            previous_qid = qid

	    # More runs than distinct IDs means some query's rows are scattered, and
	    # each fragment would be ranked as if it were a separate query.
	    if len(set(query_ids)) != len(groups):
	        print(
	            f" WARNING: {parquet_path}: rows are not contiguous per query "
	            f"({len(groups)} groups for {len(set(query_ids))} distinct query IDs)"
	        )

	    # Explicit checks instead of assert, so they survive `python -O`.
	    if sum(groups) != len(labels):
	        raise ValueError(f"Group sum {sum(groups)} != {len(labels)} rows")
	    if features.shape != (len(labels), NUM_FEATURES):
	        raise ValueError(
	            f"Feature shape mismatch: got {features.shape}, "
	            f"expected {(len(labels), NUM_FEATURES)}"
	        )

	    return features, labels, groups, query_ids


	# ── Heuristic Baseline ──────────────────────────────────────────────────────

	def heuristic_baseline_score(features: np.ndarray) -> np.ndarray:
	    """
	    Score candidates with the pseudo-label version of the heuristic reranker.

	    The production heuristic (app/recommend/reranker.py → heuristic_score())
	    is documented in this file as:
	        score = 0.40 × lt_sim + 0.25 × st_sim + 0.15 × recency
	                + 0.10 × rrf_conf - 0.15 × neg_penalty

	    Pseudo-label data has no user profile, so the EWMA-based terms
	    (st_sim, neg_penalty) are zero and are dropped. qdrant_cosine_score
	    (col 0) is used as the closest available proxy for lt_sim, giving:
	        score = 0.40 × qdrant_cosine_score
	                + 0.15 × exp(-0.002 × candidate_age_days)
	                + 0.10 × (1 - candidate_position / (max_position + 1))

	    Note that max_position is taken over the whole matrix, not per query.

	    Args:
	        features: (N, NUM_FEATURES) matrix with columns in FEATURE_SCHEMA
	            order; only columns 0, 1 and 5 are read.

	    Returns:
	        (N,) array of heuristic scores; empty when ``features`` has no rows.
	    """
	    qdrant_cosine = features[:, 0]  # qdrant_cosine_score
	    position = features[:, 1]       # candidate_position
	    age_days = features[:, 5]       # candidate_age_days

	    # Recency decay with the same constant the docstring attributes to reranker.py.
	    recency = np.exp(-0.002 * age_days)

	    # RRF confidence: normalised inverse of the retrieval position.
	    # Guard the reduction so an empty matrix yields empty scores instead of
	    # raising "zero-size array to reduction operation maximum".
	    max_pos = (position.max() if position.size else 0.0) + 1
	    rrf_conf = 1.0 - (position / max_pos)

	    return 0.40 * qdrant_cosine + 0.15 * recency + 0.10 * rrf_conf


	# ── Evaluation Metrics ───────────────────────────────────────────────────────

	def ndcg_at_k(labels: np.ndarray, scores: np.ndarray, groups: list[int], k: int = 10) -> float:
	    """Return nDCG@k averaged over the queries that have a non-zero ideal DCG.

	    ``groups`` holds consecutive group sizes; ``labels`` and ``scores`` are
	    sliced in that order. Gains are ``2**label - 1`` with a ``log2(rank + 1)``
	    discount. Queries whose ideal DCG is zero are left out of the mean, and
	    0.0 is returned when no query qualifies.
	    """

	    def _dcg(ranked_labels: np.ndarray) -> float:
	        # DCG over the first k entries of an already-ranked label vector.
	        head = ranked_labels[:k]
	        gains = (2.0 ** head) - 1.0
	        discounts = np.log2(np.arange(len(head)) + 2.0)
	        return np.sum(gains / discounts)

	    per_query = []
	    start = 0
	    for size in groups:
	        stop = start + size
	        relevance = labels[start:stop]
	        predicted = scores[start:stop]
	        start = stop

	        # Ranking induced by the model (highest score first) vs. the ideal one.
	        achieved = _dcg(relevance[np.argsort(-predicted)])
	        ideal = _dcg(relevance[np.argsort(-relevance)])

	        if ideal > 0:
	            per_query.append(achieved / ideal)

	    if not per_query:
	        return 0.0
	    return float(np.mean(per_query))


	def recall_at_k(labels: np.ndarray, scores: np.ndarray, groups: list[int], k: int = 50) -> float:
	    """Return Recall@k averaged over the queries that have at least one positive.

	    A row is positive when its label is greater than zero. For each query the
	    recall is the share of its positives ranked within the top k by score.
	    Queries without positives are ignored; 0.0 is returned if none remain.
	    """
	    per_query = []
	    start = 0
	    for size in groups:
	        stop = start + size
	        is_positive = labels[start:stop] > 0
	        predicted = scores[start:stop]
	        start = stop

	        n_positive = np.sum(is_positive)
	        if n_positive == 0:
	            continue

	        # Positives among the k highest-scored rows of this query.
	        ranking = np.argsort(-predicted)
	        n_found = np.sum(is_positive[ranking][:k])
	        per_query.append(n_found / n_positive)

	    if not per_query:
	        return 0.0
	    return float(np.mean(per_query))


	def hit_rate_at_k(labels: np.ndarray, scores: np.ndarray, groups: list[int], k: int = 10) -> float:
	    """Return HR@k: the share of queries with a positive among their top-k rows.

	    Only queries that contain at least one positive (label > 0) are counted
	    in the denominator; 0.0 is returned when there is no such query.
	    """
	    n_hits = 0
	    n_evaluated = 0
	    start = 0
	    for size in groups:
	        stop = start + size
	        is_positive = labels[start:stop] > 0
	        predicted = scores[start:stop]
	        start = stop

	        if not is_positive.any():
	            continue
	        n_evaluated += 1

	        # Indices of the k highest-scored rows of this query.
	        top_rows = np.argsort(-predicted)[:k]
	        if is_positive[top_rows].any():
	            n_hits += 1

	    if n_evaluated == 0:
	        return 0.0
	    return n_hits / n_evaluated


	def mean_reciprocal_rank(labels: np.ndarray, scores: np.ndarray, groups: list[int]) -> float:
	    """Return MRR: the mean of 1/rank of the first positive row per query.

	    Rows are ranked by descending score within each query and ranks start
	    at 1. Queries without any positive (label > 0) are skipped; 0.0 is
	    returned when no query contributes.
	    """
	    reciprocal_ranks = []
	    start = 0
	    for size in groups:
	        stop = start + size
	        is_positive = labels[start:stop] > 0
	        predicted = scores[start:stop]
	        start = stop

	        if not is_positive.any():
	            continue

	        # Zero-based ranked positions of all positives; the first is the best.
	        ranked_hits = np.flatnonzero(is_positive[np.argsort(-predicted)])
	        first_rank = int(ranked_hits[0]) + 1
	        reciprocal_ranks.append(1.0 / first_rank)

	    if not reciprocal_ranks:
	        return 0.0
	    return float(np.mean(reciprocal_ranks))


	def evaluate_model(
	    name: str,
	    labels: np.ndarray,
	    scores: np.ndarray,
	    groups: list[int],
	) -> dict:
	    """Compute every ranking metric for one scorer and return them in a dict.

	    The result maps "model" to ``name`` followed by nDCG at 5/10/20, Recall
	    at 10/50, HR@10 and MRR, in that key order.
	    """
	    metrics: dict = {"model": name}

	    for cutoff in (5, 10, 20):
	        metrics[f"ndcg@{cutoff}"] = ndcg_at_k(labels, scores, groups, k=cutoff)

	    for cutoff in (10, 50):
	        metrics[f"recall@{cutoff}"] = recall_at_k(labels, scores, groups, k=cutoff)

	    metrics["hr@10"] = hit_rate_at_k(labels, scores, groups, k=10)
	    metrics["mrr"] = mean_reciprocal_rank(labels, scores, groups)
	    return metrics


	# ── Main Training Pipeline ───────────────────────────────────────────────────

	def _pct_change(baseline: float, candidate: float) -> float | None:
	    """Return the change of ``candidate`` over ``baseline`` in percent.

	    Returns None when the baseline is not positive, because the relative
	    change is undefined there (and inf/NaN would not be valid JSON).
	    """
	    if baseline > 0:
	        return (candidate - baseline) / baseline * 100
	    return None


	def main():
	    """Train the lambdarank reranker, compare it to the heuristic, save artifacts.

	    Steps: parse CLI arguments, load the train/eval parquet files, score the
	    eval set with the heuristic baseline, train LightGBM with early stopping,
	    evaluate it on the eval set, print a comparison table and the feature
	    importances, benchmark prediction latency on up to 100 rows, and write
	    reranker_v1.txt, eval_metrics.json, feature_importance.csv and
	    baseline_comparison.json into --output-dir.
	    """
	    parser = argparse.ArgumentParser(
	        description="Train LightGBM lambdarank reranker for ResearchIT"
	    )
	    parser.add_argument("--train-file", required=True, help="train.parquet from Step 2")
	    parser.add_argument("--eval-file", required=True, help="eval.parquet from Step 2")
	    parser.add_argument("--output-dir", default="./model_output")
	    parser.add_argument("--num-boost-round", type=int, default=500)
	    parser.add_argument("--learning-rate", type=float, default=0.05)
	    parser.add_argument("--num-leaves", type=int, default=63)
	    parser.add_argument("--min-data-in-leaf", type=int, default=50)
	    parser.add_argument("--feature-fraction", type=float, default=0.8)
	    parser.add_argument("--early-stopping-rounds", type=int, default=50)

	    args = parser.parse_args()

	    output_dir = Path(args.output_dir)
	    output_dir.mkdir(parents=True, exist_ok=True)

	    # ── Load data ────────────────────────────────────────────────────────
	    print("=" * 60)
	    print("Loading training data...")
	    train_features, train_labels, train_groups, train_qids = load_ltr_data(args.train_file)
	    print(f" Train: {len(train_labels)} rows, {len(train_groups)} queries")
	    print(f" Label distribution: 0={np.sum(train_labels==0)}, 1={np.sum(train_labels==1)}, 2={np.sum(train_labels==2)}")

	    print("\nLoading eval data...")
	    eval_features, eval_labels, eval_groups, eval_qids = load_ltr_data(args.eval_file)
	    print(f" Eval: {len(eval_labels)} rows, {len(eval_groups)} queries")
	    print(f" Label distribution: 0={np.sum(eval_labels==0)}, 1={np.sum(eval_labels==1)}, 2={np.sum(eval_labels==2)}")

	    # Verify the split: no query ID may appear in both train and eval.
	    overlap = set(train_qids) & set(eval_qids)
	    if overlap:
	        print(f" WARNING: {len(overlap)} query IDs appear in both splits!")
	    else:
	        print(f" ✅ No query overlap between train/eval splits")

	    # ── Baseline: heuristic scorer ───────────────────────────────────────
	    print("\n" + "=" * 60)
	    print("Evaluating heuristic baseline...")

	    baseline_scores = heuristic_baseline_score(eval_features)
	    baseline_metrics = evaluate_model("heuristic_baseline", eval_labels, baseline_scores, eval_groups)

	    print(f"\n Heuristic Baseline Results:")
	    for k, v in baseline_metrics.items():
	        if k != "model":
	            print(f" {k}: {v:.4f}")

	    # ── Train LightGBM ───────────────────────────────────────────────────
	    print("\n" + "=" * 60)
	    print("Training LightGBM lambdarank...")

	    train_dataset = lgb.Dataset(
	        train_features,
	        label=train_labels,
	        group=train_groups,
	        feature_name=FEATURE_SCHEMA,
	        free_raw_data=False,
	    )

	    eval_dataset = lgb.Dataset(
	        eval_features,
	        label=eval_labels,
	        group=eval_groups,
	        feature_name=FEATURE_SCHEMA,
	        reference=train_dataset,
	        free_raw_data=False,
	    )

	    params = {
	        "objective": "lambdarank",
	        "metric": "ndcg",
	        "eval_at": [5, 10, 20],
	        "num_leaves": args.num_leaves,
	        "learning_rate": args.learning_rate,
	        "min_data_in_leaf": args.min_data_in_leaf,
	        "feature_fraction": args.feature_fraction,
	        "bagging_fraction": 0.8,
	        "bagging_freq": 5,
	        "lambdarank_truncation_level": 20,
	        "verbose": 1,
	        "seed": 42,
	        "num_threads": os.cpu_count() or 4,
	    }

	    print(f"\n Parameters:")
	    for k, v in params.items():
	        print(f" {k}: {v}")

	    callbacks = [
	        lgb.log_evaluation(period=50),
	        lgb.early_stopping(stopping_rounds=args.early_stopping_rounds),
	    ]

	    # NOTE: the eval set drives early stopping AND the final comparison below,
	    # so the reported LightGBM metrics are selected on the data they are
	    # measured on and may be somewhat optimistic.
	    # perf_counter is monotonic and high-resolution, unlike time.time().
	    t0 = time.perf_counter()
	    model = lgb.train(
	        params,
	        train_dataset,
	        num_boost_round=args.num_boost_round,
	        valid_sets=[eval_dataset],
	        valid_names=["eval"],
	        callbacks=callbacks,
	    )
	    train_time = time.perf_counter() - t0

	    print(f"\n Training completed in {train_time:.1f}s")
	    print(f" Best iteration: {model.best_iteration}")
	    print(f" Best nDCG@10: {model.best_score.get('eval', {}).get('ndcg@10', 'N/A')}")

	    # ── Evaluate LightGBM ────────────────────────────────────────────────
	    print("\n" + "=" * 60)
	    print("Evaluating LightGBM on eval set...")

	    lgb_scores = model.predict(eval_features)
	    lgb_metrics = evaluate_model("lightgbm_lambdarank", eval_labels, lgb_scores, eval_groups)

	    print(f"\n LightGBM Results:")
	    for k, v in lgb_metrics.items():
	        if k != "model":
	            print(f" {k}: {v:.4f}")

	    # ── Comparison ───────────────────────────────────────────────────────
	    print("\n" + "=" * 60)
	    print("COMPARISON: LightGBM vs Heuristic Baseline")
	    print("-" * 50)
	    print(f" {'Metric':<15} {'Heuristic':>12} {'LightGBM':>12} {'Δ':>10} {'%Δ':>8}")
	    print("-" * 50)

	    comparison = {}
	    for metric_key in ["ndcg@5", "ndcg@10", "ndcg@20", "recall@10", "recall@50", "hr@10", "mrr"]:
	        baseline_value = baseline_metrics[metric_key]
	        lgb_value = lgb_metrics[metric_key]
	        delta = lgb_value - baseline_value
	        pct = _pct_change(baseline_value, lgb_value)
	        comparison[metric_key] = {
	            "heuristic": round(baseline_value, 4),
	            "lightgbm": round(lgb_value, 4),
	            "delta": round(delta, 4),
	            # None (JSON null) when the baseline is 0; float('inf') would be
	            # written as the non-standard token `Infinity`.
	            "pct_improvement": round(pct, 2) if pct is not None else None,
	        }
	        marker = "✅" if delta > 0 else "⚠️" if delta == 0 else "❌"
	        pct_text = f"{pct:>+7.1f}%" if pct is not None else f"{'n/a':>8}"
	        print(f" {metric_key:<15} {baseline_value:>12.4f} {lgb_value:>12.4f} {delta:>+10.4f} {pct_text} {marker}")

	    print("-" * 50)

	    # ── Feature Importance ───────────────────────────────────────────────
	    print("\n" + "=" * 60)
	    print("Feature Importance (top 20):")

	    importance = model.feature_importance(importance_type="gain")
	    importance_pairs = sorted(
	        zip(FEATURE_SCHEMA, importance),
	        key=lambda x: x[1],
	        reverse=True,
	    )
	    # Loop-invariant scale for the text bar chart.
	    max_importance = max(importance, default=0)

	    print(f" {'Rank':<6} {'Feature':<35} {'Importance':>12}")
	    print("-" * 55)
	    for rank, (fname, imp) in enumerate(importance_pairs[:20], 1):
	        bar = "█" * int(imp / max_importance * 30) if max_importance > 0 else ""
	        print(f" {rank:<6} {fname:<35} {imp:>12.1f} {bar}")

	    # Zero-importance features (expected: user behavior features 20-30)
	    zero_features = [fname for fname, imp in importance_pairs if imp == 0]
	    if zero_features:
	        print(f"\n Zero-importance features ({len(zero_features)}):")
	        for fname in zero_features:
	            print(f" - {fname}")

	    # ── Inference latency benchmark ──────────────────────────────────────
	    print("\n" + "=" * 60)
	    print("Inference Latency Benchmark:")

	    # Simulate production: 100 candidates per query (fewer if eval is smaller).
	    test_batch = eval_features[:100]

	    # Warm up
	    for _ in range(10):
	        model.predict(test_batch)

	    # Benchmark
	    n_iters = 1000
	    t0 = time.perf_counter()
	    for _ in range(n_iters):
	        model.predict(test_batch)
	    total_ms = (time.perf_counter() - t0) * 1000
	    per_call_ms = total_ms / n_iters

	    print(f" {len(test_batch)} candidates × {n_iters} iterations")
	    print(f" Total: {total_ms:.1f}ms")
	    print(f" Per call: {per_call_ms:.3f}ms")
	    print(f" Target: <1ms for 100 candidates → {'✅ PASS' if per_call_ms < 1.0 else '⚠️ SLOW'}")

	    # ── Save outputs ─────────────────────────────────────────────────────
	    print("\n" + "=" * 60)
	    print("Saving outputs...")

	    # Model
	    model_path = output_dir / "reranker_v1.txt"
	    model.save_model(str(model_path))
	    model_size_kb = os.path.getsize(model_path) / 1024
	    print(f" Model: {model_path} ({model_size_kb:.1f} KB)")

	    # Eval metrics
	    metrics_path = output_dir / "eval_metrics.json"
	    with open(metrics_path, "w", encoding="utf-8") as f:
	        json.dump({
	            "baseline": baseline_metrics,
	            "lightgbm": lgb_metrics,
	            "comparison": comparison,
	            "training": {
	                "num_boost_round": args.num_boost_round,
	                "best_iteration": model.best_iteration,
	                "training_time_seconds": round(train_time, 1),
	                "train_rows": len(train_labels),
	                "train_queries": len(train_groups),
	                "eval_rows": len(eval_labels),
	                "eval_queries": len(eval_groups),
	                "params": params,
	            },
	            "latency": {
	                "candidates": len(test_batch),
	                "per_call_ms": round(per_call_ms, 3),
	                "target_ms": 1.0,
	                "pass": per_call_ms < 1.0,
	            },
	            "feature_importance": [
	                {"feature": fname, "importance": float(imp)}
	                for fname, imp in importance_pairs
	            ],
	        }, f, indent=2)
	    print(f" Metrics: {metrics_path}")

	    # Feature importance CSV
	    fi_path = output_dir / "feature_importance.csv"
	    with open(fi_path, "w", encoding="utf-8") as f:
	        f.write("rank,feature,importance\n")
	        for rank, (fname, imp) in enumerate(importance_pairs, 1):
	            f.write(f"{rank},{fname},{imp}\n")
	    print(f" Feature importance: {fi_path}")

	    # Baseline comparison
	    comp_path = output_dir / "baseline_comparison.json"
	    with open(comp_path, "w", encoding="utf-8") as f:
	        json.dump(comparison, f, indent=2)
	    print(f" Comparison: {comp_path}")

	    # ── Summary ──────────────────────────────────────────────────────────
	    print("\n" + "=" * 60)
	    primary_metric = "ndcg@10"
	    baseline_value = baseline_metrics[primary_metric]
	    lgb_value = lgb_metrics[primary_metric]
	    delta = lgb_value - baseline_value
	    pct = _pct_change(baseline_value, lgb_value)
	    pct_text = f"{pct:+.1f}%" if pct is not None else "n/a"

	    if delta > 0.03:
	        verdict = "✅ STRONG IMPROVEMENT — deploy LightGBM"
	    elif delta > 0:
	        verdict = "⚠️ MARGINAL IMPROVEMENT — consider if complexity is worth it"
	    else:
	        verdict = "❌ NO IMPROVEMENT — keep heuristic, investigate features"

	    print(f"PRIMARY METRIC: nDCG@10")
	    print(f" Heuristic: {baseline_value:.4f}")
	    print(f" LightGBM: {lgb_value:.4f} ({delta:+.4f}, {pct_text})")
	    print(f" Verdict: {verdict}")
	    print(f"\nModel file: {model_path}")
	    print(f"Model size: {model_size_kb:.1f} KB")
	    print(f"Latency: {per_call_ms:.3f}ms per 100 candidates")

	    print("\n✅ Done!")


	# Script entry point: run the training pipeline only when this file is
	# executed directly, not when it is imported as a module.
	if __name__ == "__main__":
	main()