"""
train_complexity_model.py

Trains a supervised regression model to predict query complexity scores.
Uses the labeled data in data/complexity_training_data.json.

This replaces the heuristic ComplexityEstimator with a trained model.
The model is saved to data/complexity_model.pkl and auto-detected by
ComplexityEstimator at startup if present.

Usage:
    python scripts/train_complexity_model.py

Requirements:
    pip install scikit-learn numpy pandas joblib
"""

from __future__ import annotations

import json
import re
import sys
from collections.abc import Iterable
from pathlib import Path

ROOT = Path(__file__).parent.parent
sys.path.insert(0, str(ROOT))

# Imported after the sys.path tweak above, as in the original layout, so
# module resolution order is unchanged.
import numpy as np  # noqa: E402

DATA_PATH = ROOT / "data" / "complexity_training_data.json"
MODEL_PATH = ROOT / "data" / "complexity_model.pkl"


# ---------------------------------------------------------------------------
# Feature extraction (mirrors QueryAnalyzer logic as numeric vector)
# ---------------------------------------------------------------------------

_CODE_KEYWORDS = {
    "code", "function", "algorithm", "implement", "debug", "python",
    "javascript", "java", "sql", "class", "method", "api", "sort", "search",
    "complexity", "data structure", "tree", "graph", "recursion", "regex",
    "binary",
}
_MATH_KEYWORDS = {
    "prove", "theorem", "equation", "integral", "derivative", "matrix",
    "probability", "calculus", "algebra", "geometry", "gradient", "bayesian",
    "stochastic", "markov", "eigenvalue",
}
_REASONING_KEYWORDS = {
    "analyze", "evaluate", "compare", "contrast", "argue", "debate",
    "design", "architecture", "strategy", "pros and cons", "versus",
}
_SCIENCE_KEYWORDS = {
    "physics", "chemistry", "biology", "quantum", "neural", "transformer",
    "genetics", "molecule",
}
_CREATIVE_KEYWORDS = {
    "write a story", "poem", "creative", "fiction", "narrative",
}

# Order matters: each entry becomes one column of the feature vector.
_DOMAIN_KEYWORD_SETS = (
    _CODE_KEYWORDS,
    _MATH_KEYWORDS,
    _REASONING_KEYWORDS,
    _SCIENCE_KEYWORDS,
    _CREATIVE_KEYWORDS,
)

# Task-type signals; one feature column per group, in this order.
_SIGNAL_KEYWORD_GROUPS = (
    ("compare", "contrast", "vs", "versus", "difference"),
    ("write", "generate", "create", "implement", "build"),
    ("explain", "how does", "why does"),
    ("analyze", "evaluate", "assess", "critique"),
    ("debate", "argue", "advocate", "defend"),
    ("comprehensive", "in depth", "detailed", "tutorial"),
    ("from scratch", "from first principles"),
)

# Precompiled once at import time instead of on every call.
_SENTENCE_SPLIT_RE = re.compile(r"[.!?]+")
_CODE_SPAN_RE = re.compile(r"```|`[^`]+`")
# Inline $...$, display \[...\], or a bare math symbol.  The brackets must be
# escaped: an unescaped "[.*?\\]" is a character class, not the LaTeX
# "\[ ... \]" delimiters, and would never match display math.
_MATH_RE = re.compile(r"\$.*?\$|\\\[.*?\\\]|[∫∑∏√∞∂]")
_NUMBERED_LIST_RE = re.compile(r"^\s*\d+[\.\)]", re.MULTILINE)
_BULLET_LIST_RE = re.compile(r"^\s*[-*•]", re.MULTILINE)
_STEPWISE_RE = re.compile(r"\bstep[s]?\b|\bfirst\b.{0,60}\bthen\b|\bfinally\b")


def _kw_hit(text: str, keywords: Iterable[str]) -> int:
    """Return 1 if any keyword occurs as a substring of *text*, else 0."""
    return int(any(kw in text for kw in keywords))


def extract_features(query: str) -> np.ndarray:
    """
    Converts a raw query string into a numeric feature vector.

    Must stay in sync with the features the heuristic estimator uses.

    The vector has 21 float entries, in this order:
      0-2    length: tokens/100, tokens/200 capped at 1.0, sentences/10
      3-7    structure: code span, math notation, question mark,
             numbered list, bullet list
      8-12   domain keyword hits: code, math, reasoning, science, creative
      13     step-by-step phrasing
      14-20  task-type signal keyword hits (see _SIGNAL_KEYWORD_GROUPS)

    Keyword checks are substring matches against the lower-cased query.
    """
    ql = query.lower()
    token_count = len(query.split())
    # Counts the pieces produced by splitting on ., ! and ?; a trailing
    # terminator therefore contributes one extra (empty) piece.
    sentence_count = len(_SENTENCE_SPLIT_RE.split(query))

    features = [
        # Length features
        token_count / 100.0,
        min(token_count / 200.0, 1.0),
        sentence_count / 10.0,
        # Structural
        int(bool(_CODE_SPAN_RE.search(query))),
        int(bool(_MATH_RE.search(query))),
        int("?" in query),
        int(bool(_NUMBERED_LIST_RE.search(query))),
        int(bool(_BULLET_LIST_RE.search(query))),
    ]
    # Domain
    features.extend(_kw_hit(ql, kws) for kws in _DOMAIN_KEYWORD_SETS)
    # Complexity signals
    features.append(int(bool(_STEPWISE_RE.search(ql))))
    features.extend(_kw_hit(ql, kws) for kws in _SIGNAL_KEYWORD_GROUPS)

    return np.array(features, dtype=float)


# ---------------------------------------------------------------------------
# Training
# ---------------------------------------------------------------------------

def load_data() -> tuple[np.ndarray, np.ndarray, list]:
    """
    Load the labeled examples from DATA_PATH.

    Each record must provide a "query" string and a "complexity" value.

    Returns:
        (X, y, records): the feature matrix (one row per record), the
        target vector, and the raw list of records as loaded from JSON.
    """
    # JSON text is UTF-8; do not depend on the platform's locale encoding.
    with open(DATA_PATH, encoding="utf-8") as f:
        records = json.load(f)
    X = np.array([extract_features(r["query"]) for r in records])
    y = np.array([r["complexity"] for r in records])
    return X, y, records


def train() -> None:
    """
    Train the complexity regressor, print evaluation metrics, and save it.

    Fits a StandardScaler + GradientBoostingRegressor pipeline on an 80/20
    train/test split, reports train/test R², 5-fold CV R² and test MAE,
    writes the model bundle to MODEL_PATH, and prints predictions for the
    first few records.
    """
    # Heavy third-party imports stay local so importing this module for
    # inference does not require scikit-learn up front.
    from sklearn.ensemble import GradientBoostingRegressor
    from sklearn.metrics import mean_absolute_error
    from sklearn.model_selection import cross_val_score, train_test_split
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler
    import joblib

    print("Loading training data...")
    X, y, records = load_data()
    print(f" {len(records)} labeled examples loaded")

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.20, random_state=42
    )

    model = Pipeline([
        ("scaler", StandardScaler()),
        ("gbr", GradientBoostingRegressor(
            n_estimators=200,
            max_depth=4,
            learning_rate=0.05,
            subsample=0.8,
            random_state=42,
        )),
    ])

    print("Training GradientBoostingRegressor...")
    model.fit(X_train, y_train)

    # Evaluation
    train_score = model.score(X_train, y_train)
    test_score = model.score(X_test, y_test)
    cv_scores = cross_val_score(model, X, y, cv=5, scoring="r2")

    print(f"\n Train R² : {train_score:.4f}")
    print(f" Test R² : {test_score:.4f}")
    print(f" CV R² mean: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

    # Error on the held-out test set
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    print(f" MAE : {mae:.4f}")

    # Save
    # NOTE: the function is pickled by reference (module + name), not by
    # value. When this file is run as a script that reference is
    # "__main__.extract_features", so whatever loads the bundle must be able
    # to resolve that name.
    joblib.dump({"model": model, "feature_extractor": extract_features}, MODEL_PATH)
    print(f"\nModel saved to: {MODEL_PATH}")

    # Sample predictions
    print("\nSample predictions:")
    print(f" {'Query':<55} {'True':>6} {'Pred':>6}")
    print(" " + "-" * 70)
    # First records of the full dataset (not specifically the held-out split).
    sample_records = records[:10]
    for r in sample_records:
        feat = extract_features(r["query"]).reshape(1, -1)
        pred = model.predict(feat)[0]
        print(f" {r['query'][:55]:<55} {r['complexity']:>6.3f} {pred:>6.3f}")


# ---------------------------------------------------------------------------
# Inference helper (used by ComplexityEstimator when model file exists)
# ---------------------------------------------------------------------------

def load_model() -> tuple | None:
    """
    Load the trained model if available.

    Returns:
        A (model, feature_extractor) tuple, or None when MODEL_PATH does
        not exist.
    """
    import joblib

    if not MODEL_PATH.exists():
        return None
    # SECURITY: joblib.load unpickles arbitrary objects; only load model
    # files that this project produced.
    bundle = joblib.load(MODEL_PATH)
    return bundle["model"], bundle["feature_extractor"]


def predict(query: str, model=None, feature_extractor=None) -> float:
    """
    Predict complexity score for a single query.

    Args:
        query: Raw query text.
        model: Fitted estimator exposing ``predict``; loaded from
            MODEL_PATH when omitted.
        feature_extractor: Callable mapping a query to a 1-D feature array;
            loaded from MODEL_PATH when omitted.

    Returns:
        The predicted score, clamped to the range [0.0, 1.0].

    Raises:
        FileNotFoundError: If a model or extractor is missing and no
            trained model file exists at MODEL_PATH.
    """
    if model is None or feature_extractor is None:
        loaded = load_model()
        if loaded is None:
            raise FileNotFoundError(f"No trained model at {MODEL_PATH}")
        model, feature_extractor = loaded

    feat = feature_extractor(query).reshape(1, -1)
    score = float(model.predict(feat)[0])
    return max(0.0, min(score, 1.0))


if __name__ == "__main__":
    train()