# scripts/train_complexity_model.py — part of the LLMOpt framework
"""
train_complexity_model.py
Trains a supervised regression model to predict query complexity scores.
Uses the labeled data in data/complexity_training_data.json.
This replaces the heuristic ComplexityEstimator with a trained model.
The model is saved to data/complexity_model.pkl and auto-detected by
ComplexityEstimator at startup if present.
Usage:
python scripts/train_complexity_model.py
Requirements:
pip install scikit-learn numpy pandas joblib
"""
from __future__ import annotations
import json
import sys
from pathlib import Path
# Make the project root importable so sibling packages resolve regardless
# of the directory the script is launched from.
ROOT = Path(__file__).parent.parent
sys.path.insert(0, str(ROOT))

# Labeled training data (input) and the trained model artifact (output,
# auto-detected by ComplexityEstimator at startup).
DATA_PATH = ROOT / "data" / "complexity_training_data.json"
MODEL_PATH = ROOT / "data" / "complexity_model.pkl"
# ---------------------------------------------------------------------------
# Feature extraction (mirrors QueryAnalyzer logic as numeric vector)
# ---------------------------------------------------------------------------
import re
import numpy as np
# Keyword vocabularies used as binary domain indicators. Their contents
# feed the trained model, so they must not drift from the vocabulary the
# heuristic estimator uses.
_CODE_KEYWORDS = {
    "code", "function", "algorithm", "implement", "debug", "python",
    "javascript", "java", "sql", "class", "method", "api", "sort",
    "search", "complexity", "data structure", "tree", "graph",
    "recursion", "regex", "binary",
}
_MATH_KEYWORDS = {
    "prove", "theorem", "equation", "integral", "derivative", "matrix",
    "probability", "calculus", "algebra", "geometry", "gradient",
    "bayesian", "stochastic", "markov", "eigenvalue",
}
_REASONING_KEYWORDS = {
    "analyze", "evaluate", "compare", "contrast", "argue", "debate",
    "design", "architecture", "strategy", "pros and cons", "versus",
}
_SCIENCE_KEYWORDS = {
    "physics", "chemistry", "biology", "quantum", "neural", "transformer",
    "genetics", "molecule",
}
_CREATIVE_KEYWORDS = {
    "write a story", "poem", "creative", "fiction", "narrative",
}


def _kw_hit(text: str, keywords: set) -> int:
    """Return 1 if any keyword occurs as a substring of *text*, else 0."""
    for kw in keywords:
        if kw in text:
            return 1
    return 0


def extract_features(query: str) -> np.ndarray:
    """
    Converts a raw query string into a numeric feature vector.
    Must stay in sync with the features the heuristic estimator uses.
    """
    lowered = query.lower()
    n_tokens = len(query.split())
    # NOTE: a trailing terminator yields one empty split piece, so this
    # slightly over-counts; kept as-is to stay in sync with training.
    n_sentences = len(re.split(r"[.!?]+", query))

    def _has(pattern: str, text: str = query, flags: int = 0) -> int:
        # 1 if the regex matches anywhere in the text, else 0.
        return int(bool(re.search(pattern, text, flags)))

    def _any_of(*phrases: str) -> int:
        # 1 if any phrase occurs (case-insensitively) in the query, else 0.
        return int(any(p in lowered for p in phrases))

    # Length features (roughly normalized to [0, 1]).
    length_feats = [
        n_tokens / 100.0,
        min(n_tokens / 200.0, 1.0),
        n_sentences / 10.0,
    ]
    # Structural markers: code fences, math notation, questions, lists.
    structural_feats = [
        _has(r"```|`[^`]+`"),
        _has(r"\$.*?\$|\\[.*?\\]|[∫∑∏√∞∂]"),
        int("?" in query),
        _has(r"^\s*\d+[\.\)]", flags=re.MULTILINE),
        _has(r"^\s*[-*•]", flags=re.MULTILINE),
    ]
    # Domain indicators, one per keyword vocabulary above.
    domain_feats = [
        _kw_hit(lowered, _CODE_KEYWORDS),
        _kw_hit(lowered, _MATH_KEYWORDS),
        _kw_hit(lowered, _REASONING_KEYWORDS),
        _kw_hit(lowered, _SCIENCE_KEYWORDS),
        _kw_hit(lowered, _CREATIVE_KEYWORDS),
    ]
    # Task-complexity signals (multi-step phrasing, comparison, etc.).
    signal_feats = [
        _has(r"\bstep[s]?\b|\bfirst\b.{0,60}\bthen\b|\bfinally\b", lowered),
        _any_of("compare", "contrast", "vs", "versus", "difference"),
        _any_of("write", "generate", "create", "implement", "build"),
        _any_of("explain", "how does", "why does"),
        _any_of("analyze", "evaluate", "assess", "critique"),
        _any_of("debate", "argue", "advocate", "defend"),
        _any_of("comprehensive", "in depth", "detailed", "tutorial"),
        _any_of("from scratch", "from first principles"),
    ]
    return np.array(
        length_feats + structural_feats + domain_feats + signal_feats,
        dtype=float,
    )
# ---------------------------------------------------------------------------
# Training
# ---------------------------------------------------------------------------
def load_data():
    """Load the labeled training set and vectorize it.

    Returns:
        X: (n_samples, n_features) float feature matrix from extract_features.
        y: (n_samples,) array of complexity labels (presumably in [0, 1] —
           matches the clamp in predict(); verify against the data file).
        records: the raw list of {"query": ..., "complexity": ...} dicts.

    Raises:
        ValueError: if the data file contains no examples.
    """
    # Explicit encoding so behavior does not depend on the platform's
    # locale default (e.g. cp1252 on Windows).
    with open(DATA_PATH, encoding="utf-8") as f:
        records = json.load(f)
    if not records:
        raise ValueError(f"No training examples found in {DATA_PATH}")
    X = np.array([extract_features(r["query"]) for r in records])
    y = np.array([r["complexity"] for r in records], dtype=float)
    return X, y, records
def train():
    """Train the complexity regressor and persist it to MODEL_PATH.

    Fits a StandardScaler + GradientBoostingRegressor pipeline on the
    labeled data, reports hold-out R², 5-fold CV R², and hold-out MAE,
    saves a {model, feature_extractor} bundle via joblib, and prints a
    few sample predictions as a sanity check.
    """
    # Heavy third-party imports are kept function-local so importing this
    # module (e.g. for load_model/predict) does not require sklearn.
    from sklearn.ensemble import GradientBoostingRegressor
    from sklearn.metrics import mean_absolute_error
    from sklearn.model_selection import cross_val_score, train_test_split
    from sklearn.preprocessing import StandardScaler
    from sklearn.pipeline import Pipeline
    import joblib

    print("Loading training data...")
    X, y, records = load_data()
    print(f" {len(records)} labeled examples loaded")

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.20, random_state=42
    )
    # Scaling is not required for tree ensembles, but keeping it in the
    # pipeline makes the saved artifact self-contained and swap-friendly.
    model = Pipeline([
        ("scaler", StandardScaler()),
        ("gbr", GradientBoostingRegressor(
            n_estimators=200,
            max_depth=4,
            learning_rate=0.05,
            subsample=0.8,
            random_state=42,
        )),
    ])
    print("Training GradientBoostingRegressor...")
    model.fit(X_train, y_train)

    # Evaluation: hold-out R², cross-validated R², and hold-out MAE.
    train_score = model.score(X_train, y_train)
    test_score = model.score(X_test, y_test)
    cv_scores = cross_val_score(model, X, y, cv=5, scoring="r2")
    print(f"\n Train R² : {train_score:.4f}")
    print(f" Test R² : {test_score:.4f}")
    print(f" CV R² mean: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    print(f" MAE : {mae:.4f}")

    # Save the model together with its feature extractor so inference
    # applies the identical featurization.
    # NOTE: pickling extract_features stores only a module reference —
    # loading the bundle requires this module to be importable.
    MODEL_PATH.parent.mkdir(parents=True, exist_ok=True)  # data/ may not exist yet
    joblib.dump({"model": model, "feature_extractor": extract_features}, MODEL_PATH)
    print(f"\nModel saved to: {MODEL_PATH}")

    # Sample predictions on the first few records (quick sanity check).
    print("\nSample predictions:")
    print(f" {'Query':<55} {'True':>6} {'Pred':>6}")
    print(" " + "-" * 70)
    for r in records[:10]:
        feat = extract_features(r["query"]).reshape(1, -1)
        pred = model.predict(feat)[0]
        print(f" {r['query'][:55]:<55} {r['complexity']:>6.3f} {pred:>6.3f}")
# ---------------------------------------------------------------------------
# Inference helper (used by ComplexityEstimator when model file exists)
# ---------------------------------------------------------------------------
def load_model():
    """Load the trained model bundle if available.

    Returns a (model, feature_extractor) tuple, or None when no trained
    model file exists yet.
    """
    import joblib

    if MODEL_PATH.exists():
        bundle = joblib.load(MODEL_PATH)
        return bundle["model"], bundle["feature_extractor"]
    return None
def predict(query: str, model=None, feature_extractor=None) -> float:
    """Predict complexity score for a single query."""
    # Lazily load the persisted bundle unless the caller supplied both parts.
    if model is None or feature_extractor is None:
        bundle = load_model()
        if bundle is None:
            raise FileNotFoundError(f"No trained model at {MODEL_PATH}")
        model, feature_extractor = bundle
    vec = feature_extractor(query).reshape(1, -1)
    raw = float(model.predict(vec)[0])
    # Clamp into the valid [0, 1] complexity range.
    return min(1.0, max(0.0, raw))
# Script entry point: python scripts/train_complexity_model.py
if __name__ == "__main__":
    train()