Spaces:

Shrot102
/

llmopt-server

Running

File size: 7,038 Bytes

3c1db6c

"""
train_complexity_model.py

Trains a supervised regression model to predict query complexity scores.
Uses the labeled data in data/complexity_training_data.json.

This replaces the heuristic ComplexityEstimator with a trained model.
The model is saved to data/complexity_model.pkl and auto-detected by
ComplexityEstimator at startup if present.

Usage:
    python scripts/train_complexity_model.py

Requirements:
    pip install scikit-learn numpy pandas joblib
"""

from __future__ import annotations

import json
import sys
from pathlib import Path

# Two directory levels above this file. Going by the usage line in the module
# docstring (scripts/train_complexity_model.py) this would be the project
# root — TODO confirm if the file is ever moved.
ROOT = Path(__file__).parent.parent
# Put ROOT at the front of the import search path; presumably so project
# packages resolve when this file is executed directly — verify against callers.
sys.path.insert(0, str(ROOT))

# JSON file of labeled examples read by load_data().
DATA_PATH  = ROOT / "data" / "complexity_training_data.json"
# joblib bundle written by train() and read back by load_model().
MODEL_PATH = ROOT / "data" / "complexity_model.pkl"


# ---------------------------------------------------------------------------
# Feature extraction (mirrors QueryAnalyzer logic as numeric vector)
# ---------------------------------------------------------------------------

import re
import numpy as np

# Domain keyword sets. extract_features() passes each set to _kw_hit() together
# with the lower-cased query, and _kw_hit() uses plain substring containment.
# Consequences worth knowing when editing these sets:
#   * entries must be lower-case to ever match;
#   * multi-word entries (e.g. "data structure") match only as that exact phrase;
#   * short entries also match inside longer words (e.g. "java" in "javascript").

# Programming / software-engineering vocabulary.
_CODE_KEYWORDS = {
    "code", "function", "algorithm", "implement", "debug", "python",
    "javascript", "java", "sql", "class", "method", "api", "sort",
    "search", "complexity", "data structure", "tree", "graph",
    "recursion", "regex", "binary",
}
# Mathematics vocabulary.
_MATH_KEYWORDS = {
    "prove", "theorem", "equation", "integral", "derivative", "matrix",
    "probability", "calculus", "algebra", "geometry", "gradient",
    "bayesian", "stochastic", "markov", "eigenvalue",
}
# Analytical / argumentative vocabulary.
_REASONING_KEYWORDS = {
    "analyze", "evaluate", "compare", "contrast", "argue", "debate",
    "design", "architecture", "strategy", "pros and cons", "versus",
}
# Natural-science and ML vocabulary.
_SCIENCE_KEYWORDS = {
    "physics", "chemistry", "biology", "quantum", "neural", "transformer",
    "genetics", "molecule",
}
# Creative-writing vocabulary.
_CREATIVE_KEYWORDS = {
    "write a story", "poem", "creative", "fiction", "narrative",
}


def _kw_hit(text: str, keywords: set) -> int:
    return int(any(kw in text for kw in keywords))


def extract_features(query: str) -> np.ndarray:
    """
    Converts a raw query string into a numeric feature vector.
    Must stay in sync with the features the heuristic estimator uses.

    The returned 1-D float array has 21 entries, in this order:

      0-2   length:      tokens/100, tokens/200 capped at 1.0, sentences/10
      3-7   structural:  code span, math notation, question mark,
                         numbered list item, bulleted list item
      8-12  domain:      code, math, reasoning, science, creative keyword hit
      13-20 complexity:  step-by-step phrasing, comparison, generation,
                         explanation, analysis, argumentation, depth request,
                         "from scratch" style request

    All entries from index 3 onwards are 0/1 flags.

    Any change to a feature definition changes the model's inputs, so a model
    saved before the change should be retrained.
    """
    ql = query.lower()
    token_count = len(query.split())
    # re.split leaves an empty trailing piece when the query ends in
    # punctuation, so "Why?" counts as 2. Kept as-is on purpose: the value is
    # only a relative length signal and has to match the existing definition.
    sentence_count = len(re.split(r"[.!?]+", query))

    features = [
        # Length features
        token_count / 100.0,
        min(token_count / 200.0, 1.0),
        sentence_count / 10.0,

        # Structural
        # Fenced code block marker or an inline `code` span.
        int(bool(re.search(r"```|`[^`]+`", query))),
        # Math notation: $...$, LaTeX display math (backslash-bracket pairs),
        # or a common math symbol. The bracket delimiters need their own
        # escapes; without them the regex engine reads "[...]" as a character
        # class and the display-math form is never detected.
        int(bool(re.search(r"\$.*?\$|\\\[.*?\\\]|[∫∑∏√∞∂]", query))),
        int("?" in query),
        # A line starting with "1." / "1)" style numbering.
        int(bool(re.search(r"^\s*\d+[\.\)]", query, re.MULTILINE))),
        # A line starting with a bullet character.
        int(bool(re.search(r"^\s*[-*•]", query, re.MULTILINE))),

        # Domain
        _kw_hit(ql, _CODE_KEYWORDS),
        _kw_hit(ql, _MATH_KEYWORDS),
        _kw_hit(ql, _REASONING_KEYWORDS),
        _kw_hit(ql, _SCIENCE_KEYWORDS),
        _kw_hit(ql, _CREATIVE_KEYWORDS),

        # Complexity signals (substring checks against the lower-cased query)
        int(bool(re.search(r"\bstep[s]?\b|\bfirst\b.{0,60}\bthen\b|\bfinally\b", ql))),
        int(any(kw in ql for kw in ["compare", "contrast", "vs", "versus", "difference"])),
        int(any(kw in ql for kw in ["write", "generate", "create", "implement", "build"])),
        int(any(kw in ql for kw in ["explain", "how does", "why does"])),
        int(any(kw in ql for kw in ["analyze", "evaluate", "assess", "critique"])),
        int(any(kw in ql for kw in ["debate", "argue", "advocate", "defend"])),
        int(any(kw in ql for kw in ["comprehensive", "in depth", "detailed", "tutorial"])),
        int(any(kw in ql for kw in ["from scratch", "from first principles"])),
    ]

    return np.array(features, dtype=float)


# ---------------------------------------------------------------------------
# Training
# ---------------------------------------------------------------------------

def load_data():
    """Load the labeled examples and build the training matrices.

    Reads ``DATA_PATH`` as JSON. The file is expected to hold a list of
    objects, each with a ``"query"`` string and a ``"complexity"`` label.

    Returns:
        A tuple ``(X, y, records)`` where ``X`` stacks one
        ``extract_features`` vector per record, ``y`` holds the matching
        ``"complexity"`` labels, and ``records`` is the parsed JSON itself.

    Raises:
        FileNotFoundError: if ``DATA_PATH`` does not exist.
        KeyError: if a record lacks ``"query"`` or ``"complexity"``.
    """
    # Decode explicitly as UTF-8 rather than the platform default, so queries
    # containing non-ASCII characters (math symbols, bullets, accents) load
    # the same way on every OS.
    with open(DATA_PATH, encoding="utf-8") as f:
        records = json.load(f)

    X = np.array([extract_features(r["query"]) for r in records])
    y = np.array([r["complexity"] for r in records])
    return X, y, records


def train():
    """Train, evaluate and save the complexity regression pipeline.

    Steps:
      1. Load the labeled examples with ``load_data``.
      2. Hold out 20% of them as a test split (fixed seed, so repeatable).
      3. Fit a ``StandardScaler`` + ``GradientBoostingRegressor`` pipeline on
         the training split.
      4. Print train/test R², 5-fold cross-validated R² and test-set MAE.
      5. Save ``{"model", "feature_extractor"}`` to ``MODEL_PATH`` via joblib.
      6. Print up to ten held-out queries with their true and predicted scores.

    Returns nothing; results go to stdout and the bundle is written to disk.
    """
    from sklearn.ensemble import GradientBoostingRegressor
    from sklearn.metrics import mean_absolute_error
    from sklearn.model_selection import cross_val_score, train_test_split
    from sklearn.preprocessing import StandardScaler
    from sklearn.pipeline import Pipeline
    import joblib

    print("Loading training data...")
    X, y, records = load_data()
    print(f"  {len(records)} labeled examples loaded")

    # Split the row indices together with X and y so that held-out rows can be
    # traced back to their original records for the sample table below. The
    # extra array does not alter which rows land in each split.
    row_ids = np.arange(len(records))
    X_train, X_test, y_train, y_test, _, test_ids = train_test_split(
        X, y, row_ids, test_size=0.20, random_state=42
    )

    model = Pipeline([
        ("scaler", StandardScaler()),
        ("gbr", GradientBoostingRegressor(
            n_estimators=200,
            max_depth=4,
            learning_rate=0.05,
            subsample=0.8,
            random_state=42,
        )),
    ])

    print("Training GradientBoostingRegressor...")
    model.fit(X_train, y_train)

    # Evaluation. cross_val_score fits fresh clones of the pipeline on the
    # full dataset, so the model fitted above is left untouched.
    train_score = model.score(X_train, y_train)
    test_score  = model.score(X_test, y_test)
    cv_scores   = cross_val_score(model, X, y, cv=5, scoring="r2")

    print(f"\n  Train R²  : {train_score:.4f}")
    print(f"  Test  R²  : {test_score:.4f}")
    print(f"  CV R² mean: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

    # Per-example predictions on test set
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    print(f"  MAE       : {mae:.4f}")

    # Save
    # NOTE(review): the feature extractor is pickled by reference (module name
    # plus function name). When this file is executed as a script that module
    # name is "__main__", so a different process loading the bundle may fail
    # to resolve it — verify how ComplexityEstimator loads this file.
    joblib.dump({"model": model, "feature_extractor": extract_features}, MODEL_PATH)
    print(f"\nModel saved to: {MODEL_PATH}")

    # Sample predictions, taken from the held-out split so the table reflects
    # performance on unseen queries rather than on rows the model was fitted to.
    print("\nSample predictions:")
    print(f"  {'Query':<55} {'True':>6} {'Pred':>6}")
    print("  " + "-" * 70)
    for row_id, pred in zip(test_ids[:10], y_pred[:10]):
        r = records[row_id]
        print(f"  {r['query'][:55]:<55} {r['complexity']:>6.3f} {pred:>6.3f}")


# ---------------------------------------------------------------------------
# Inference helper (used by ComplexityEstimator when model file exists)
# ---------------------------------------------------------------------------

def load_model():
    """Return the saved ``(model, feature_extractor)`` pair, or ``None``.

    ``None`` means no file exists at ``MODEL_PATH``. Otherwise the joblib
    bundle is read and its ``"model"`` and ``"feature_extractor"`` entries
    are returned as a 2-tuple.
    """
    import joblib

    if MODEL_PATH.exists():
        # joblib.load unpickles the file, so only point MODEL_PATH at bundles
        # produced by a trusted training run.
        saved = joblib.load(MODEL_PATH)
        return saved["model"], saved["feature_extractor"]
    return None


def predict(query: str, model=None, feature_extractor=None) -> float:
    """Predict complexity score for a single query.

    Args:
        query: Raw query text.
        model: Object with a ``predict(features)`` method. When omitted, the
            saved model is loaded from ``MODEL_PATH``.
        feature_extractor: Callable mapping a query string to a 1-D numpy
            array. When omitted, the extractor saved alongside the loaded
            model is used, or this module's ``extract_features`` if the caller
            supplied their own model.

    Returns:
        The model output clamped to the range [0.0, 1.0].

    Raises:
        FileNotFoundError: if no model was passed and none exists on disk.
    """
    # Fill in only what the caller left out. Previously, passing just one of
    # the two arguments caused both to be replaced by the bundle on disk,
    # silently discarding the supplied object.
    if model is None:
        bundle = load_model()
        if bundle is None:
            raise FileNotFoundError(f"No trained model at {MODEL_PATH}")
        model, saved_extractor = bundle
        if feature_extractor is None:
            feature_extractor = saved_extractor
    elif feature_extractor is None:
        feature_extractor = extract_features

    # The model expects a 2-D (n_samples, n_features) input, hence the
    # reshape into a single row.
    feat = feature_extractor(query).reshape(1, -1)
    score = float(model.predict(feat)[0])
    # A regressor's output is unbounded; clamp to the documented score range.
    return max(0.0, min(score, 1.0))


# Script entry point: running this file directly starts training; importing
# it (e.g. for load_model / predict) does not.
if __name__ == "__main__":
    train()