"""Difficulty Model training pipeline.
Trains a RandomForestRegressor on question features for difficulty estimation.
Target: difficulty_score (continuous [0, 1]).
Features: bloom_score, grade, subject (encoded), question_type (encoded).
Primary metric: MAE.
"""
import logging
from datetime import datetime, timezone
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import OrdinalEncoder
from app.core.config import settings
from app.core.exceptions import TrainingError
from training.base_trainer import BaseTrainer, TrainingResult
logger = logging.getLogger(__name__)
# Model input columns, listed in the same order as the columns of the feature
# matrix built below (numeric columns first, then the encoded categoricals).
FEATURE_COLUMNS = ["bloom_score", "grade", "subject", "question_type"]
# Columns that are passed through the OrdinalEncoder before reaching the model.
CATEGORICAL_COLUMNS = ["subject", "question_type"]
# Columns that are fed to the model as-is, without any encoding.
NUMERIC_COLUMNS = ["bloom_score", "grade"]
# Regression target column.
TARGET_COLUMN = "difficulty_score"
class DifficultyModelTrainer(BaseTrainer):
    """RandomForestRegressor for question difficulty estimation.

    Target: difficulty_score (continuous [0, 1])
    Features: bloom_score, grade, subject (encoded), question_type (encoded)
    Primary metric: MAE

    Relies on ``self._loader`` and ``self._seed``, which are not defined in
    this class (presumably provided by ``BaseTrainer`` — confirm there).
    """

    # Single source of truth for values that are used during training AND
    # reported in the generated artifacts, so the two cannot drift apart.
    _N_ESTIMATORS = 100
    _BASELINE_MAE_THRESHOLD = 0.5

    @property
    def model_name(self) -> str:
        """Short identifier of this model."""
        return "difficulty_model"

    @property
    def model_version(self) -> str:
        """Version string recorded in the metrics, config and model card."""
        return "difficulty_model_v2_baseline_001"

    @property
    def table_name(self) -> str:
        """Name of the table this trainer is built on."""
        return "training_lo_tagging"

    def _load_with_question_type(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return ``df`` with a fully populated ``question_type`` column.

        The training_lo_tagging table does not include question_type, but the
        design requires it as a feature, so it is joined in from the
        ``questions`` table on ``question_id``.

        Args:
            df: Frame to enrich. Must contain ``question_id`` unless it
                already carries a ``question_type`` column.

        Returns:
            A new frame (the input is not modified) in which every row has a
            ``question_type``; rows without a match get ``'unknown'``.
        """
        # Joining onto a frame that already has the column would produce
        # question_type_x / question_type_y and break the lookups below.
        if "question_type" not in df.columns:
            questions_df = self._loader.load_table("questions")
            # De-duplicate on question_id alone: a question listed with two
            # different types would otherwise multiply rows in the left join.
            question_type_map = questions_df[
                ["question_id", "question_type"]
            ].drop_duplicates(subset="question_id")
            df = df.merge(question_type_map, on="question_id", how="left")

        missing_count = int(df["question_type"].isna().sum())
        if missing_count:
            logger.warning(
                "Found %d rows with missing question_type after join; filling with 'unknown'",
                missing_count,
            )
            # assign() returns a new frame, so the caller's data is untouched.
            df = df.assign(question_type=df["question_type"].fillna("unknown"))
        return df

    @staticmethod
    def _build_feature_matrix(df: pd.DataFrame, encoder: OrdinalEncoder) -> np.ndarray:
        """Stack numeric columns and encoded categorical columns side by side.

        The resulting column order is NUMERIC_COLUMNS followed by
        CATEGORICAL_COLUMNS; training and evaluation must share it.
        """
        numeric_part = df[NUMERIC_COLUMNS].values
        categorical_part = encoder.transform(df[CATEGORICAL_COLUMNS])
        return np.hstack([numeric_part, categorical_part])

    def train(self, train_df: pd.DataFrame, val_df: pd.DataFrame) -> dict:
        """Train RandomForestRegressor on question features.

        Algorithm:
        1. Join question_type from questions table
        2. Encode categorical columns (subject, question_type) with OrdinalEncoder
        3. Build numeric feature matrix: [bloom_score, grade, subject_encoded, question_type_encoded]
        4. Target: difficulty_score
        5. Fit RandomForestRegressor(n_estimators=100, random_state=seed)
        6. Return {"model": rf, "encoder": ordinal_enc, "feature_columns.json": feature_list}

        Args:
            train_df: Training split; needs the numeric feature columns,
                ``subject``, ``question_id`` and the target column.
            val_df: Accepted for interface compatibility only; this trainer
                does no tuning or early stopping, so it is not used here.

        Returns:
            Dict with the fitted model, the fitted encoder and the feature
            column list.
        """
        # Only the training split is needed; evaluate() performs its own join.
        train_df = self._load_with_question_type(train_df)

        # Categories unseen at fit time are mapped to -1 instead of raising.
        ordinal_enc = OrdinalEncoder(
            handle_unknown="use_encoded_value",
            unknown_value=-1,
        )
        ordinal_enc.fit(train_df[CATEGORICAL_COLUMNS])

        X_train = self._build_feature_matrix(train_df, ordinal_enc)
        y_train = train_df[TARGET_COLUMN].values

        rf = RandomForestRegressor(
            n_estimators=self._N_ESTIMATORS,
            random_state=self._seed,
        )
        rf.fit(X_train, y_train)

        logger.info(
            "Difficulty model trained — %d samples, %d features",
            X_train.shape[0],
            X_train.shape[1],
        )
        return {
            "model": rf,
            "encoder": ordinal_enc,
            "feature_columns.json": FEATURE_COLUMNS,
        }

    def evaluate(self, artifacts: dict, df: pd.DataFrame, split_name: str) -> dict:
        """Evaluate model on a split.

        Computes: MAE, R-squared, per-bucket MAE (easy/medium/hard based on
        difficulty column).

        Args:
            artifacts: Dict returned by ``train`` (uses ``"model"`` and
                ``"encoder"``).
            df: Split to evaluate; same column requirements as in ``train``.
                An optional ``difficulty`` column enables per-bucket MAE.
            split_name: Label used in the log line only.

        Returns:
            ``{"mae": float, "r_squared": float, "per_bucket_mae": dict}``
            with values rounded to 4 decimals. Bucket keys are the
            lower-cased difficulty labels.
        """
        model = artifacts["model"]
        encoder = artifacts["encoder"]

        df = self._load_with_question_type(df)
        X = self._build_feature_matrix(df, encoder)
        y_true = df[TARGET_COLUMN].values
        y_pred = model.predict(X)

        mae = mean_absolute_error(y_true, y_pred)
        r2 = r2_score(y_true, y_pred)

        per_bucket_mae = {}
        if "difficulty" in df.columns:
            raw_labels = df["difficulty"]
            # Rows without a label belong to no bucket.
            has_label = raw_labels.notna().to_numpy()
            # Normalise BEFORE grouping so "Easy" and "easy" land in one
            # bucket rather than overwriting each other, and so non-string
            # labels do not break the lower-casing.
            labels = raw_labels.astype(str).str.lower().to_numpy()
            for bucket in pd.unique(labels[has_label]):
                mask = has_label & (labels == bucket)
                bucket_mae = mean_absolute_error(y_true[mask], y_pred[mask])
                per_bucket_mae[bucket] = round(float(bucket_mae), 4)

        metrics = {
            "mae": round(float(mae), 4),
            "r_squared": round(float(r2), 4),
            "per_bucket_mae": per_bucket_mae,
        }
        logger.info(
            "%s metrics — MAE: %.4f, R²: %.4f",
            split_name, mae, r2,
        )
        return metrics

    def _check_baseline(self, metrics: dict) -> None:
        """Verify MAE < 0.5 (very lenient baseline for synthetic data).

        Uses the test MAE, falling back to the validation MAE when the test
        value is absent.

        Args:
            metrics: Structure produced by ``_build_metrics``.

        Raises:
            TrainingError: If no MAE is available, or the MAE is not strictly
                below the threshold (this includes a NaN MAE).
        """
        per_split = metrics.get("metrics", {})
        mae = per_split.get("test", {}).get("mae")
        if mae is None:
            mae = per_split.get("validation", {}).get("mae")
        if mae is None:
            raise TrainingError(
                "Cannot compute baseline: MAE not found in metrics.",
                model_name=self.model_name,
            )

        threshold = self._BASELINE_MAE_THRESHOLD
        # Written as "not below" rather than ">=": NaN compares False against
        # everything, so ">=" would let a NaN MAE pass the gate.
        if not mae < threshold:
            raise TrainingError(
                f"MAE ({mae:.4f}) does not meet baseline threshold (< {threshold}). "
                f"Model performance is insufficient.",
                model_name=self.model_name,
            )
        logger.info("Baseline check passed — MAE %.4f < %s", mae, threshold)

    def _build_metrics(
        self,
        val_metrics: dict,
        test_metrics: dict,
        train_df: pd.DataFrame,
        val_df: pd.DataFrame,
        test_df: pd.DataFrame,
    ) -> dict:
        """Assemble full metrics.json content.

        Args:
            val_metrics: Output of ``evaluate`` on the validation split.
            test_metrics: Output of ``evaluate`` on the test split.
            train_df: Training split (only its row count is used).
            val_df: Validation split (only its row count is used).
            test_df: Test split (only its row count is used).

        Returns:
            Dict with model identity, timestamp (UTC, ISO 8601), seed, split
            sizes, per-split metrics and known limitations.
        """
        return {
            "model_name": self.model_name,
            "model_version": self.model_version,
            # NOTE(review): the service version is reported as the dataset
            # version here — confirm this is intended.
            "dataset_version": settings.ai_service_version,
            "trained_at": datetime.now(timezone.utc).isoformat(),
            "seed": self._seed,
            "split_counts": {
                "train": len(train_df),
                "validation": len(val_df),
                "test": len(test_df),
            },
            "metrics": {
                "validation": val_metrics,
                "test": test_metrics,
            },
            "limitations": [
                "Trained on synthetic data only.",
                "difficulty_score distribution may not reflect real-world difficulty.",
                "OrdinalEncoder assumes an ordering that may not be meaningful for subject/question_type.",
                "Per-bucket MAE depends on the quality of the difficulty string labels.",
            ],
        }

    def _build_training_config(
        self,
        train_df: pd.DataFrame,
        val_df: pd.DataFrame,
        test_df: pd.DataFrame,
    ) -> dict:
        """Build training_config.json with hyperparameters.

        Args:
            train_df: Training split (only its row count is used).
            val_df: Validation split (only its row count is used).
            test_df: Test split (only its row count is used).

        Returns:
            Dict describing model identity, seed, split sizes,
            hyperparameters and the column roles used for training.
        """
        algorithm = "RandomForestRegressor"
        return {
            "model_name": self.model_name,
            "model_version": self.model_version,
            # NOTE(review): the service version is reported as the dataset
            # version here — confirm this is intended.
            "dataset_version": settings.ai_service_version,
            "seed": self._seed,
            "split_counts": {
                "train": len(train_df),
                "validation": len(val_df),
                "test": len(test_df),
            },
            "hyperparameters": {
                "n_estimators": self._N_ESTIMATORS,
                "random_state": self._seed,
                "algorithm": algorithm,
                "encoder": "OrdinalEncoder",
            },
            "feature_columns": FEATURE_COLUMNS,
            "categorical_columns": CATEGORICAL_COLUMNS,
            "numeric_columns": NUMERIC_COLUMNS,
            "target_column": TARGET_COLUMN,
            # Kept at the top level as well as under "hyperparameters" so
            # existing readers of either key continue to work.
            "algorithm": algorithm,
        }

    def _build_model_card(self, metrics: dict) -> str:
        """Generate model_card.md content.

        Args:
            metrics: Structure produced by ``_build_metrics``; any missing
                entry is rendered as ``N/A``.

        Returns:
            The model card as a Markdown string.
        """
        per_split = metrics.get("metrics", {})
        val_metrics = per_split.get("validation", {})
        test_metrics = per_split.get("test", {})
        split_counts = metrics.get("split_counts", {})
        # The template lines stay at column 0 on purpose: any indentation
        # inside the literal would end up in the generated Markdown.
        card = f"""# Model Card: Difficulty Model
## Model Details
- **Model Name:** {self.model_name}
- **Model Version:** {self.model_version}
- **Algorithm:** RandomForestRegressor
- **Framework:** scikit-learn
- **Trained At:** {metrics.get("trained_at", "N/A")}
- **Seed:** {self._seed}
## Intended Use
Estimate question difficulty as a continuous score in [0, 1] based on
question features (bloom_score, grade, subject, question_type). Used in
the difficulty estimation endpoint to predict how hard a question is for
a given grade level.
## Training Data
- **Source:** training_lo_tagging.csv + questions.csv (for question_type)
- **Split Counts:** train={split_counts.get("train", "N/A")}, \
validation={split_counts.get("validation", "N/A")}, \
test={split_counts.get("test", "N/A")}
- **Features:** bloom_score (numeric), grade (numeric), subject (OrdinalEncoded), \
question_type (OrdinalEncoded)
- **Target:** difficulty_score (continuous [0, 1])
## Metrics
### Validation Set
- MAE: {val_metrics.get("mae", "N/A")}
- R-squared: {val_metrics.get("r_squared", "N/A")}
- Per-bucket MAE: {val_metrics.get("per_bucket_mae", "N/A")}
### Test Set
- MAE: {test_metrics.get("mae", "N/A")}
- R-squared: {test_metrics.get("r_squared", "N/A")}
- Per-bucket MAE: {test_metrics.get("per_bucket_mae", "N/A")}
## Known Limitations
- Trained on synthetic data only — performance on real questions is unknown.
- difficulty_score distribution may not reflect real-world difficulty.
- OrdinalEncoder assumes an ordering that may not be meaningful for subject/question_type.
- Per-bucket MAE depends on the quality of the difficulty string labels.
- Limited feature set (4 features); text-based features could improve performance.
## Fallback Behavior
When the model is not loaded or confidence is below threshold, the system
falls back to a rule-based difficulty estimation using bloom_score and
grade-level heuristics.
"""
        return card
|