# scripts/train_complexity_model.py — part of the LLMOpt framework
"""
train_complexity_model.py
Trains a supervised regression model to predict query complexity scores.
Uses the labeled data in data/complexity_training_data.json.
This replaces the heuristic ComplexityEstimator with a trained model.
The model is saved to data/complexity_model.pkl and auto-detected by
ComplexityEstimator at startup if present.
Usage:
python scripts/train_complexity_model.py
Requirements:
pip install scikit-learn numpy pandas joblib
"""
from __future__ import annotations
import json
import sys
from pathlib import Path
# Make the project root importable so sibling packages resolve regardless
# of the directory the script is launched from.
ROOT = Path(__file__).parent.parent
sys.path.insert(0, str(ROOT))

# Labeled training data (input) and the trained model artifact (output,
# auto-detected by ComplexityEstimator at startup).
DATA_PATH = ROOT / "data" / "complexity_training_data.json"
MODEL_PATH = ROOT / "data" / "complexity_model.pkl"
# ---------------------------------------------------------------------------
# Feature extraction (mirrors QueryAnalyzer logic as numeric vector)
# ---------------------------------------------------------------------------
import re
import numpy as np
# Keyword vocabularies used as binary domain indicators. Their contents
# feed the trained model, so they must not drift from the vocabulary the
# heuristic estimator uses.
_CODE_KEYWORDS = {
    "code", "function", "algorithm", "implement", "debug", "python",
    "javascript", "java", "sql", "class", "method", "api", "sort",
    "search", "complexity", "data structure", "tree", "graph",
    "recursion", "regex", "binary",
}
_MATH_KEYWORDS = {
    "prove", "theorem", "equation", "integral", "derivative", "matrix",
    "probability", "calculus", "algebra", "geometry", "gradient",
    "bayesian", "stochastic", "markov", "eigenvalue",
}
_REASONING_KEYWORDS = {
    "analyze", "evaluate", "compare", "contrast", "argue", "debate",
    "design", "architecture", "strategy", "pros and cons", "versus",
}
_SCIENCE_KEYWORDS = {
    "physics", "chemistry", "biology", "quantum", "neural", "transformer",
    "genetics", "molecule",
}
_CREATIVE_KEYWORDS = {
    "write a story", "poem", "creative", "fiction", "narrative",
}


def _kw_hit(text: str, keywords: set) -> int:
    """Return 1 if any keyword occurs as a substring of *text*, else 0."""
    for kw in keywords:
        if kw in text:
            return 1
    return 0


def extract_features(query: str) -> np.ndarray:
    """
    Converts a raw query string into a numeric feature vector.
    Must stay in sync with the features the heuristic estimator uses.
    """
    lowered = query.lower()
    n_tokens = len(query.split())
    # NOTE: a trailing terminator yields one empty split piece, so this
    # slightly over-counts; kept as-is to stay in sync with training.
    n_sentences = len(re.split(r"[.!?]+", query))

    def _has(pattern: str, text: str = query, flags: int = 0) -> int:
        # 1 if the regex matches anywhere in the text, else 0.
        return int(bool(re.search(pattern, text, flags)))

    def _any_of(*phrases: str) -> int:
        # 1 if any phrase occurs (case-insensitively) in the query, else 0.
        return int(any(p in lowered for p in phrases))

    # Length features (roughly normalized to [0, 1]).
    length_feats = [
        n_tokens / 100.0,
        min(n_tokens / 200.0, 1.0),
        n_sentences / 10.0,
    ]
    # Structural markers: code fences, math notation, questions, lists.
    structural_feats = [
        _has(r"```|`[^`]+`"),
        _has(r"\$.*?\$|\\[.*?\\]|[∫∑∏√∞∂]"),
        int("?" in query),
        _has(r"^\s*\d+[\.\)]", flags=re.MULTILINE),
        _has(r"^\s*[-*•]", flags=re.MULTILINE),
    ]
    # Domain indicators, one per keyword vocabulary above.
    domain_feats = [
        _kw_hit(lowered, _CODE_KEYWORDS),
        _kw_hit(lowered, _MATH_KEYWORDS),
        _kw_hit(lowered, _REASONING_KEYWORDS),
        _kw_hit(lowered, _SCIENCE_KEYWORDS),
        _kw_hit(lowered, _CREATIVE_KEYWORDS),
    ]
    # Task-complexity signals (multi-step phrasing, comparison, etc.).
    signal_feats = [
        _has(r"\bstep[s]?\b|\bfirst\b.{0,60}\bthen\b|\bfinally\b", lowered),
        _any_of("compare", "contrast", "vs", "versus", "difference"),
        _any_of("write", "generate", "create", "implement", "build"),
        _any_of("explain", "how does", "why does"),
        _any_of("analyze", "evaluate", "assess", "critique"),
        _any_of("debate", "argue", "advocate", "defend"),
        _any_of("comprehensive", "in depth", "detailed", "tutorial"),
        _any_of("from scratch", "from first principles"),
    ]
    return np.array(
        length_feats + structural_feats + domain_feats + signal_feats,
        dtype=float,
    )
# ---------------------------------------------------------------------------
# Training
# ---------------------------------------------------------------------------
def load_data():
    """Load the labeled training set and vectorize it.

    Returns:
        X: (n_samples, n_features) float feature matrix from extract_features.
        y: (n_samples,) array of complexity labels (presumably in [0, 1] —
           matches the clamp in predict(); verify against the data file).
        records: the raw list of {"query": ..., "complexity": ...} dicts.

    Raises:
        ValueError: if the data file contains no examples.
    """
    # Explicit encoding so behavior does not depend on the platform's
    # locale default (e.g. cp1252 on Windows).
    with open(DATA_PATH, encoding="utf-8") as f:
        records = json.load(f)
    if not records:
        raise ValueError(f"No training examples found in {DATA_PATH}")
    X = np.array([extract_features(r["query"]) for r in records])
    y = np.array([r["complexity"] for r in records], dtype=float)
    return X, y, records
def train():
    """Train the complexity regressor and persist it to MODEL_PATH.

    Fits a StandardScaler + GradientBoostingRegressor pipeline on the
    labeled data, reports hold-out R², 5-fold CV R², and hold-out MAE,
    saves a {model, feature_extractor} bundle via joblib, and prints a
    few sample predictions as a sanity check.
    """
    # Heavy third-party imports are kept function-local so importing this
    # module (e.g. for load_model/predict) does not require sklearn.
    from sklearn.ensemble import GradientBoostingRegressor
    from sklearn.metrics import mean_absolute_error
    from sklearn.model_selection import cross_val_score, train_test_split
    from sklearn.preprocessing import StandardScaler
    from sklearn.pipeline import Pipeline
    import joblib

    print("Loading training data...")
    X, y, records = load_data()
    print(f" {len(records)} labeled examples loaded")

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.20, random_state=42
    )
    # Scaling is not required for tree ensembles, but keeping it in the
    # pipeline makes the saved artifact self-contained and swap-friendly.
    model = Pipeline([
        ("scaler", StandardScaler()),
        ("gbr", GradientBoostingRegressor(
            n_estimators=200,
            max_depth=4,
            learning_rate=0.05,
            subsample=0.8,
            random_state=42,
        )),
    ])
    print("Training GradientBoostingRegressor...")
    model.fit(X_train, y_train)

    # Evaluation: hold-out R², cross-validated R², and hold-out MAE.
    train_score = model.score(X_train, y_train)
    test_score = model.score(X_test, y_test)
    cv_scores = cross_val_score(model, X, y, cv=5, scoring="r2")
    print(f"\n Train R² : {train_score:.4f}")
    print(f" Test R² : {test_score:.4f}")
    print(f" CV R² mean: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    print(f" MAE : {mae:.4f}")

    # Save the model together with its feature extractor so inference
    # applies the identical featurization.
    # NOTE: pickling extract_features stores only a module reference —
    # loading the bundle requires this module to be importable.
    MODEL_PATH.parent.mkdir(parents=True, exist_ok=True)  # data/ may not exist yet
    joblib.dump({"model": model, "feature_extractor": extract_features}, MODEL_PATH)
    print(f"\nModel saved to: {MODEL_PATH}")

    # Sample predictions on the first few records (quick sanity check).
    print("\nSample predictions:")
    print(f" {'Query':<55} {'True':>6} {'Pred':>6}")
    print(" " + "-" * 70)
    for r in records[:10]:
        feat = extract_features(r["query"]).reshape(1, -1)
        pred = model.predict(feat)[0]
        print(f" {r['query'][:55]:<55} {r['complexity']:>6.3f} {pred:>6.3f}")
# ---------------------------------------------------------------------------
# Inference helper (used by ComplexityEstimator when model file exists)
# ---------------------------------------------------------------------------
def load_model():
    """Load the trained model bundle if available.

    Returns a (model, feature_extractor) tuple, or None when no trained
    model file exists yet.
    """
    import joblib

    if MODEL_PATH.exists():
        bundle = joblib.load(MODEL_PATH)
        return bundle["model"], bundle["feature_extractor"]
    return None
def predict(query: str, model=None, feature_extractor=None) -> float:
    """Predict complexity score for a single query."""
    # Lazily load the persisted bundle unless the caller supplied both parts.
    if model is None or feature_extractor is None:
        bundle = load_model()
        if bundle is None:
            raise FileNotFoundError(f"No trained model at {MODEL_PATH}")
        model, feature_extractor = bundle
    vec = feature_extractor(query).reshape(1, -1)
    raw = float(model.predict(vec)[0])
    # Clamp into the valid [0, 1] complexity range.
    return min(1.0, max(0.0, raw))
# Script entry point: python scripts/train_complexity_model.py
if __name__ == "__main__":
    train()