Tabular Classification
Scikit-learn
Joblib
postgresql
sql
query-cache
plan-cache
redis
database
tabular-regression
Instructions to use nilenpatel/pg-plan-cache-models with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Scikit-learn
How to use nilenpatel/pg-plan-cache-models with Scikit-learn:
from huggingface_hub import hf_hub_download import joblib model = joblib.load( hf_hub_download("nilenpatel/pg-plan-cache-models", "sklearn_model.joblib") ) # only load pickle files from sources you trust # read more about it here https://skops.readthedocs.io/en/stable/persistence.html - Notebooks
- Google Colab
- Kaggle
File size: 6,582 Bytes
406cec4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 | #!/usr/bin/env python3
"""
Train all three pg_plan_cache models:
1. SQL Cache Advisor (classification: low / medium / high)
2. Cache TTL Recommender (regression: seconds)
3. Query Complexity Estimator (regression: 1-100 score)
Saves trained models as joblib files in the ./trained/ directory.
"""
import os
import json
import numpy as np
from sklearn.ensemble import RandomForestClassifier, GradientBoostingRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, mean_absolute_error, r2_score
from sklearn.preprocessing import LabelEncoder
import joblib
from features import extract_features, FEATURE_NAMES
from dataset import generate_dataset
OUTPUT_DIR = os.path.join(os.path.dirname(__file__), "trained")
def train():
print("=" * 60)
print(" pg_plan_cache β Model Training")
print("=" * 60)
# ββ Generate data βββββββββββββββββββββββββββββββββββββββββ
print("\n[1/5] Generating synthetic training data...")
queries, benefits, ttls, complexities = generate_dataset(n=8000, seed=42)
print(f" Generated {len(queries)} samples")
# ββ Extract features ββββββββββββββββββββββββββββββββββββββ
print("\n[2/5] Extracting features...")
X = np.array([extract_features(q) for q in queries])
print(f" Feature matrix: {X.shape}")
# ββ Encode labels βββββββββββββββββββββββββββββββββββββββββ
le = LabelEncoder()
y_benefit = le.fit_transform(benefits) # low=1, medium=2, high=0
y_ttl = np.array(ttls, dtype=float)
y_complexity = np.array(complexities, dtype=float)
# ββ Split βββββββββββββββββββββββββββββββββββββββββββββββββ
X_train, X_test, yb_train, yb_test, yt_train, yt_test, yc_train, yc_test = \
train_test_split(X, y_benefit, y_ttl, y_complexity, test_size=0.2, random_state=42)
print(f" Train: {len(X_train)}, Test: {len(X_test)}")
# ββ Model 1: Cache Advisor (classification) βββββββββββββββ
print("\n[3/5] Training SQL Cache Advisor...")
clf = RandomForestClassifier(
n_estimators=200,
max_depth=15,
min_samples_split=5,
min_samples_leaf=2,
random_state=42,
n_jobs=-1,
)
clf.fit(X_train, yb_train)
yb_pred = clf.predict(X_test)
print("\n Classification Report:")
report = classification_report(yb_test, yb_pred, target_names=le.classes_)
print(" " + report.replace("\n", "\n "))
cv_scores = cross_val_score(clf, X, y_benefit, cv=5, scoring="accuracy")
print(f" Cross-val accuracy: {cv_scores.mean():.3f} (+/- {cv_scores.std():.3f})")
# ββ Model 2: TTL Recommender (regression) βββββββββββββββββ
print("\n[4/5] Training Cache TTL Recommender...")
reg_ttl = GradientBoostingRegressor(
n_estimators=200,
max_depth=8,
learning_rate=0.1,
min_samples_split=5,
random_state=42,
)
reg_ttl.fit(X_train, yt_train)
yt_pred = reg_ttl.predict(X_test)
mae_ttl = mean_absolute_error(yt_test, yt_pred)
r2_ttl = r2_score(yt_test, yt_pred)
print(f" MAE: {mae_ttl:.1f} seconds")
print(f" R2: {r2_ttl:.3f}")
# ββ Model 3: Complexity Estimator (regression) ββββββββββββ
print("\n[5/5] Training Query Complexity Estimator...")
reg_cplx = GradientBoostingRegressor(
n_estimators=200,
max_depth=8,
learning_rate=0.1,
min_samples_split=5,
random_state=42,
)
reg_cplx.fit(X_train, yc_train)
yc_pred = reg_cplx.predict(X_test)
mae_cplx = mean_absolute_error(yc_test, yc_pred)
r2_cplx = r2_score(yc_test, yc_pred)
print(f" MAE: {mae_cplx:.1f} points")
print(f" R2: {r2_cplx:.3f}")
# ββ Save models βββββββββββββββββββββββββββββββββββββββββββ
os.makedirs(OUTPUT_DIR, exist_ok=True)
joblib.dump(clf, os.path.join(OUTPUT_DIR, "cache_advisor.joblib"))
joblib.dump(reg_ttl, os.path.join(OUTPUT_DIR, "ttl_recommender.joblib"))
joblib.dump(reg_cplx, os.path.join(OUTPUT_DIR, "complexity_estimator.joblib"))
joblib.dump(le, os.path.join(OUTPUT_DIR, "label_encoder.joblib"))
# Feature importances
importances = {
"cache_advisor": dict(zip(FEATURE_NAMES, clf.feature_importances_.tolist())),
"ttl_recommender": dict(zip(FEATURE_NAMES, reg_ttl.feature_importances_.tolist())),
"complexity_estimator": dict(zip(FEATURE_NAMES, reg_cplx.feature_importances_.tolist())),
}
with open(os.path.join(OUTPUT_DIR, "feature_importances.json"), "w") as f:
json.dump(importances, f, indent=2)
# Model metadata
metadata = {
"models": {
"cache_advisor": {
"type": "RandomForestClassifier",
"task": "classification",
"classes": le.classes_.tolist(),
"accuracy_cv5": round(float(cv_scores.mean()), 4),
},
"ttl_recommender": {
"type": "GradientBoostingRegressor",
"task": "regression",
"unit": "seconds",
"mae": round(float(mae_ttl), 2),
"r2": round(float(r2_ttl), 4),
},
"complexity_estimator": {
"type": "GradientBoostingRegressor",
"task": "regression",
"unit": "score (1-100)",
"mae": round(float(mae_cplx), 2),
"r2": round(float(r2_cplx), 4),
},
},
"features": FEATURE_NAMES,
"n_features": len(FEATURE_NAMES),
"training_samples": len(queries),
"test_samples": len(X_test),
}
with open(os.path.join(OUTPUT_DIR, "metadata.json"), "w") as f:
json.dump(metadata, f, indent=2)
print(f"\n Models saved to {OUTPUT_DIR}/")
print(" Files: cache_advisor.joblib, ttl_recommender.joblib,")
print(" complexity_estimator.joblib, label_encoder.joblib,")
print(" feature_importances.json, metadata.json")
print("\nDone.")
if __name__ == "__main__":
train()
|