Spaces:
Running
Running
| """ | |
| train_complexity_model.py | |
| Trains a supervised regression model to predict query complexity scores. | |
| Uses the labeled data in data/complexity_training_data.json. | |
| This replaces the heuristic ComplexityEstimator with a trained model. | |
| The model is saved to data/complexity_model.pkl and auto-detected by | |
| ComplexityEstimator at startup if present. | |
| Usage: | |
| python scripts/train_complexity_model.py | |
| Requirements: | |
| pip install scikit-learn numpy pandas joblib | |
| """ | |
| from __future__ import annotations | |
| import json | |
| import sys | |
| from pathlib import Path | |
# Project root: two directory levels above this file.
ROOT = Path(__file__).parent.parent
# Put the project root at the front of the import search path.
sys.path.insert(0, str(ROOT))

# Labeled training examples (input) and the serialized model bundle (output)
# both live in the project's "data" directory.
DATA_PATH = ROOT.joinpath("data", "complexity_training_data.json")
MODEL_PATH = ROOT.joinpath("data", "complexity_model.pkl")
| # --------------------------------------------------------------------------- | |
| # Feature extraction (mirrors QueryAnalyzer logic as numeric vector) | |
| # --------------------------------------------------------------------------- | |
| import re | |
| import numpy as np | |
| _CODE_KEYWORDS = { | |
| "code", "function", "algorithm", "implement", "debug", "python", | |
| "javascript", "java", "sql", "class", "method", "api", "sort", | |
| "search", "complexity", "data structure", "tree", "graph", | |
| "recursion", "regex", "binary", | |
| } | |
| _MATH_KEYWORDS = { | |
| "prove", "theorem", "equation", "integral", "derivative", "matrix", | |
| "probability", "calculus", "algebra", "geometry", "gradient", | |
| "bayesian", "stochastic", "markov", "eigenvalue", | |
| } | |
| _REASONING_KEYWORDS = { | |
| "analyze", "evaluate", "compare", "contrast", "argue", "debate", | |
| "design", "architecture", "strategy", "pros and cons", "versus", | |
| } | |
| _SCIENCE_KEYWORDS = { | |
| "physics", "chemistry", "biology", "quantum", "neural", "transformer", | |
| "genetics", "molecule", | |
| } | |
| _CREATIVE_KEYWORDS = { | |
| "write a story", "poem", "creative", "fiction", "narrative", | |
| } | |
| def _kw_hit(text: str, keywords: set) -> int: | |
| return int(any(kw in text for kw in keywords)) | |
def extract_features(query: str) -> np.ndarray:
    """
    Convert a raw query string into a fixed-length numeric feature vector.

    The result is a 1-D float array with 21 entries, in this order:
    3 length features, 5 structural flags, 5 domain-keyword flags and
    8 complexity-signal flags. Every flag is 0.0 or 1.0.

    Must stay in sync with the features the heuristic estimator uses.
    """
    ql = query.lower()
    token_count = len(query.split())
    # re.split also returns the segment after the last terminator (possibly
    # empty), so "Hi." yields 2. Left as-is on purpose: changing it would
    # shift this feature for nearly every query.
    sentence_count = len(re.split(r"[.!?]+", query))
    features = [
        # Length features
        token_count / 100.0,
        min(token_count / 200.0, 1.0),
        sentence_count / 10.0,
        # Structural
        # Fenced code block or inline `code` span.
        int(bool(re.search(r"```|`[^`]+`", query))),
        # Math notation: $...$, LaTeX display math \[...\], or a math symbol.
        # The brackets must be escaped (\\\[ and \\\]); unescaped, "[.*?\\]"
        # is parsed as a character class and \[...\] is never matched.
        int(bool(re.search(r"\$.*?\$|\\\[.*?\\\]|[∫∑∏√∞∂]", query))),
        int("?" in query),
        # A line starting with "1." / "1)" (numbered list).
        int(bool(re.search(r"^\s*\d+[\.\)]", query, re.MULTILINE))),
        # A line starting with a bullet marker.
        int(bool(re.search(r"^\s*[-*•]", query, re.MULTILINE))),
        # Domain
        _kw_hit(ql, _CODE_KEYWORDS),
        _kw_hit(ql, _MATH_KEYWORDS),
        _kw_hit(ql, _REASONING_KEYWORDS),
        _kw_hit(ql, _SCIENCE_KEYWORDS),
        _kw_hit(ql, _CREATIVE_KEYWORDS),
        # Complexity signals (substring matches on the lower-cased query)
        # Multi-step wording: "step(s)", "first ... then", "finally".
        int(bool(re.search(r"\bstep[s]?\b|\bfirst\b.{0,60}\bthen\b|\bfinally\b", ql))),
        int(any(kw in ql for kw in ["compare", "contrast", "vs", "versus", "difference"])),
        int(any(kw in ql for kw in ["write", "generate", "create", "implement", "build"])),
        int(any(kw in ql for kw in ["explain", "how does", "why does"])),
        int(any(kw in ql for kw in ["analyze", "evaluate", "assess", "critique"])),
        int(any(kw in ql for kw in ["debate", "argue", "advocate", "defend"])),
        int(any(kw in ql for kw in ["comprehensive", "in depth", "detailed", "tutorial"])),
        int(any(kw in ql for kw in ["from scratch", "from first principles"])),
    ]
    return np.array(features, dtype=float)
| # --------------------------------------------------------------------------- | |
| # Training | |
| # --------------------------------------------------------------------------- | |
def load_data():
    """Load the labeled examples from DATA_PATH and build the training arrays.

    Each record is indexed by the keys ``"query"`` and ``"complexity"``.

    Returns:
        A tuple ``(X, y, records)``: ``X`` stacks one ``extract_features``
        vector per record, ``y`` holds the matching ``"complexity"`` labels,
        and ``records`` is the parsed JSON content as loaded.
    """
    # Read explicitly as UTF-8 instead of relying on the platform's default
    # text encoding, which is locale-dependent.
    with open(DATA_PATH, encoding="utf-8") as f:
        records = json.load(f)
    X = np.array([extract_features(r["query"]) for r in records])
    y = np.array([r["complexity"] for r in records])
    return X, y, records
def _importable_feature_extractor():
    """Return ``extract_features`` bound to a module name pickle can re-import.

    pickle (and therefore joblib) stores a function as a reference to its
    module and qualified name. When this file is executed as a script the
    function belongs to ``__main__``, so a bundle written from here could not
    be unpickled by any other process. In that case the file is re-imported
    as ``<parent dir>.<file stem>`` and that module's function is returned.
    If the re-import is not possible, the original function is returned.
    """
    if extract_features.__module__ != "__main__":
        return extract_features

    import importlib

    script = Path(__file__)
    package = script.parent.name
    if not package:
        return extract_features
    try:
        return importlib.import_module(f"{package}.{script.stem}").extract_features
    except (ImportError, AttributeError):
        return extract_features


def train():
    """Train the complexity regressor, report metrics and save it to MODEL_PATH.

    Loads the labeled data, holds out 20% as a test split, and fits a
    StandardScaler + GradientBoostingRegressor pipeline on the rest. Prints
    train/test R², 5-fold cross-validated R² and test-set MAE, then writes
    ``{"model": ..., "feature_extractor": ...}`` to MODEL_PATH with joblib.
    Finally prints predictions for up to the first ten records.
    """
    # Imported here rather than at module level, so importing this module
    # does not by itself import scikit-learn or joblib.
    from sklearn.ensemble import GradientBoostingRegressor
    from sklearn.metrics import mean_absolute_error
    from sklearn.model_selection import cross_val_score, train_test_split
    from sklearn.preprocessing import StandardScaler
    from sklearn.pipeline import Pipeline
    import joblib

    print("Loading training data...")
    X, y, records = load_data()
    print(f" {len(records)} labeled examples loaded")

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.20, random_state=42
    )

    model = Pipeline([
        ("scaler", StandardScaler()),
        ("gbr", GradientBoostingRegressor(
            n_estimators=200,
            max_depth=4,
            learning_rate=0.05,
            subsample=0.8,
            random_state=42,
        )),
    ])

    print("Training GradientBoostingRegressor...")
    model.fit(X_train, y_train)

    # Evaluation
    train_score = model.score(X_train, y_train)
    test_score = model.score(X_test, y_test)
    # cross_val_score fits clones of the pipeline on folds of the full data;
    # the fitted `model` above is not modified by it.
    cv_scores = cross_val_score(model, X, y, cv=5, scoring="r2")
    print(f"\n Train R² : {train_score:.4f}")
    print(f" Test R² : {test_score:.4f}")
    print(f" CV R² mean: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

    # Mean absolute error on the held-out split
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    print(f" MAE : {mae:.4f}")

    # Save. The extractor is stored via an importable module reference so the
    # bundle can be loaded outside of this script's own process.
    joblib.dump(
        {"model": model, "feature_extractor": _importable_feature_extractor()},
        MODEL_PATH,
    )
    print(f"\nModel saved to: {MODEL_PATH}")

    # Sample predictions: the first records of the dataset, which may include
    # examples the model was trained on.
    print("\nSample predictions:")
    print(f" {'Query':<55} {'True':>6} {'Pred':>6}")
    print(" " + "-" * 70)
    for r in records[:10]:
        feat = extract_features(r["query"]).reshape(1, -1)
        pred = model.predict(feat)[0]
        print(f" {r['query'][:55]:<55} {r['complexity']:>6.3f} {pred:>6.3f}")
| # --------------------------------------------------------------------------- | |
| # Inference helper (used by ComplexityEstimator when model file exists) | |
| # --------------------------------------------------------------------------- | |
def load_model():
    """Load the trained model if available.

    Returns:
        ``(model, feature_extractor)`` read from the bundle stored at
        MODEL_PATH, or ``None`` when that file does not exist.
    """
    import joblib

    if MODEL_PATH.exists():
        # NOTE: joblib.load unpickles the file, which can execute arbitrary
        # code. Only load bundles from a trusted source.
        saved = joblib.load(MODEL_PATH)
        return saved["model"], saved["feature_extractor"]
    return None
def predict(query: str, model=None, feature_extractor=None) -> float:
    """Predict the complexity score for a single query.

    Args:
        query: Raw query text.
        model: Object with a ``predict(features)`` method. Loaded from
            MODEL_PATH when omitted.
        feature_extractor: Callable mapping the query to a feature array.
            Loaded from MODEL_PATH when omitted.

    Returns:
        The first predicted value, clamped to the range [0.0, 1.0].

    Raises:
        FileNotFoundError: If something has to be loaded and no trained
            model exists at MODEL_PATH.
    """
    if model is None or feature_extractor is None:
        bundle = load_model()
        if bundle is None:
            raise FileNotFoundError(f"No trained model at {MODEL_PATH}")
        saved_model, saved_extractor = bundle
        # Fill in only what the caller left out, so an explicitly supplied
        # model or extractor is never replaced by the one on disk.
        if model is None:
            model = saved_model
        if feature_extractor is None:
            feature_extractor = saved_extractor
    # The model is given a single-row 2-D array.
    feat = feature_extractor(query).reshape(1, -1)
    score = float(model.predict(feat)[0])
    return max(0.0, min(score, 1.0))
# Script entry point: when this file is executed directly, run the training
# routine. Importing the module does not trigger training.
if __name__ == "__main__":
    train()