"""Bloom Classifier training pipeline. Trains a TF-IDF + LogisticRegression model for Bloom taxonomy classification. Target: bloom_level (6 classes: Remember, Understand, Apply, Analyze, Evaluate, Create). Features: question_text (TF-IDF). Primary metric: macro F1. """ import logging from datetime import datetime, timezone import numpy as np import pandas as pd from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.linear_model import LogisticRegression from sklearn.metrics import ( classification_report, confusion_matrix, f1_score, ) from sklearn.preprocessing import LabelEncoder from app.core.config import settings from app.core.exceptions import TrainingError from training.base_trainer import BaseTrainer, TrainingResult logger = logging.getLogger(__name__) class BloomClassifierTrainer(BaseTrainer): """TF-IDF + LogisticRegression for Bloom taxonomy classification. Target: bloom_level (6 classes: Remember, Understand, Apply, Analyze, Evaluate, Create) Features: question_text (TF-IDF) Primary metric: macro F1 """ @property def model_name(self) -> str: return "bloom_classifier" @property def model_version(self) -> str: return "bloom_classifier_v2_baseline_001" @property def table_name(self) -> str: return "training_bloom_classification" def train(self, train_df: pd.DataFrame, val_df: pd.DataFrame) -> dict: """Train TF-IDF + LogisticRegression(multinomial). Algorithm: 1. Fit TF-IDF vectorizer on train question_text 2. Encode bloom_level labels with LabelEncoder 3. Fit LogisticRegression(multi_class="multinomial", C=1.0, solver="lbfgs", max_iter=1000, random_state=seed) 4. Return {"model": logreg, "vectorizer": tfidf, "label_encoder": le} """ tfidf = TfidfVectorizer( max_features=8000, ngram_range=(1, 2), sublinear_tf=True, ) X_train = tfidf.fit_transform(train_df["question_text"]) le = LabelEncoder() y_train = le.fit_transform(train_df["bloom_level"]) logreg = LogisticRegression( multi_class="multinomial", C=1.0, solver="lbfgs", max_iter=1000, random_state=self._seed, ) logreg.fit(X_train, y_train) logger.info( "Bloom Classifier trained — %d features, %d classes", X_train.shape[1], len(le.classes_), ) return {"model": logreg, "vectorizer": tfidf, "label_encoder": le} def evaluate(self, artifacts: dict, df: pd.DataFrame, split_name: str) -> dict: """Evaluate model on a split. Computes: macro F1, weighted F1, per-class precision/recall/f1, confusion matrix. """ model = artifacts["model"] tfidf = artifacts["vectorizer"] le = artifacts["label_encoder"] X = tfidf.transform(df["question_text"]) y_true = le.transform(df["bloom_level"]) y_pred = model.predict(X) # F1 scores macro_f1 = f1_score(y_true, y_pred, average="macro", zero_division=0) weighted_f1 = f1_score(y_true, y_pred, average="weighted", zero_division=0) # Per-class metrics report = classification_report( y_true, y_pred, target_names=le.classes_, output_dict=True, zero_division=0, ) per_class = {} for class_name in le.classes_: if class_name in report: per_class[class_name] = { "precision": round(report[class_name]["precision"], 4), "recall": round(report[class_name]["recall"], 4), "f1": round(report[class_name]["f1-score"], 4), "support": int(report[class_name]["support"]), } # Confusion matrix cm = confusion_matrix(y_true, y_pred).tolist() metrics = { "macro_f1": round(macro_f1, 4), "weighted_f1": round(weighted_f1, 4), "per_class": per_class, "confusion_matrix": cm, } logger.info( "%s metrics — macro_f1: %.4f, weighted_f1: %.4f", split_name, macro_f1, weighted_f1, ) return metrics def _check_baseline(self, metrics: dict) -> None: """Verify macro F1 > (1 / num_classes). Raise TrainingError if not met.""" test_metrics = metrics.get("metrics", {}).get("test", {}) macro_f1 = test_metrics.get("macro_f1", 0.0) num_classes = len(test_metrics.get("per_class", {})) # Fallback: use validation metrics if test not available if num_classes == 0: val_metrics = metrics.get("metrics", {}).get("validation", {}) macro_f1 = val_metrics.get("macro_f1", 0.0) num_classes = len(val_metrics.get("per_class", {})) if num_classes == 0: raise TrainingError( "Cannot compute baseline: no classes found in metrics.", model_name=self.model_name, ) baseline = 1.0 / num_classes if macro_f1 <= baseline: raise TrainingError( f"Macro F1 ({macro_f1:.4f}) does not exceed " f"random baseline ({baseline:.4f} = 1/{num_classes}). " f"Model is not better than random.", model_name=self.model_name, ) logger.info( "Baseline check passed — macro F1 %.4f > baseline %.4f", macro_f1, baseline, ) def _build_metrics( self, val_metrics: dict, test_metrics: dict, train_df: pd.DataFrame, val_df: pd.DataFrame, test_df: pd.DataFrame, ) -> dict: """Assemble full metrics.json content.""" return { "model_name": self.model_name, "model_version": self.model_version, "dataset_version": settings.ai_service_version, "trained_at": datetime.now(timezone.utc).isoformat(), "seed": self._seed, "split_counts": { "train": len(train_df), "validation": len(val_df), "test": len(test_df), }, "metrics": { "validation": val_metrics, "test": test_metrics, }, "limitations": [ "Trained on synthetic data only.", "6 classes with imbalanced distribution — Create (~2%) and Evaluate (~4%) are rare.", "Macro F1 is the primary metric; per-class recall may be low for rare classes.", "TF-IDF features do not capture semantic similarity beyond n-gram overlap.", ], } def _build_training_config( self, train_df: pd.DataFrame, val_df: pd.DataFrame, test_df: pd.DataFrame, ) -> dict: """Build training_config.json with hyperparameters.""" return { "model_name": self.model_name, "model_version": self.model_version, "dataset_version": settings.ai_service_version, "seed": self._seed, "split_counts": { "train": len(train_df), "validation": len(val_df), "test": len(test_df), }, "hyperparameters": { "tfidf_max_features": 8000, "ngram_range": [1, 2], "sublinear_tf": True, "logreg_C": 1.0, "logreg_solver": "lbfgs", "logreg_max_iter": 1000, "logreg_multi_class": "multinomial", }, "feature_columns": ["question_text"], "target_column": "bloom_level", "algorithm": "LogisticRegression(multinomial)", } def _build_model_card(self, metrics: dict) -> str: """Generate model_card.md content.""" val_metrics = metrics.get("metrics", {}).get("validation", {}) test_metrics = metrics.get("metrics", {}).get("test", {}) card = f"""# Model Card: Bloom Classifier ## Model Details - **Model Name:** {self.model_name} - **Model Version:** {self.model_version} - **Algorithm:** TF-IDF + LogisticRegression (multinomial) - **Framework:** scikit-learn - **Trained At:** {metrics.get("trained_at", "N/A")} - **Seed:** {self._seed} ## Intended Use Automatically classify questions by Bloom's taxonomy cognitive level. Used in the Bloom classification endpoint to predict one of 6 levels: Remember, Understand, Apply, Analyze, Evaluate, Create. ## Training Data - **Source:** training_bloom_classification.csv (synthetic dataset v2) - **Split Counts:** train={metrics.get("split_counts", {}).get("train", "N/A")}, \ validation={metrics.get("split_counts", {}).get("validation", "N/A")}, \ test={metrics.get("split_counts", {}).get("test", "N/A")} - **Feature:** question_text (TF-IDF vectorized, max_features=8000, ngram_range=(1,2)) - **Target:** bloom_level (6 classes) ## Metrics ### Validation Set - Macro F1: {val_metrics.get("macro_f1", "N/A")} - Weighted F1: {val_metrics.get("weighted_f1", "N/A")} ### Test Set - Macro F1: {test_metrics.get("macro_f1", "N/A")} - Weighted F1: {test_metrics.get("weighted_f1", "N/A")} ## Known Limitations - Trained on synthetic data only — performance on real classroom questions is unknown. - Class imbalance: Create (~2%) and Evaluate (~4%) are rare; recall on these classes may be low. - TF-IDF features do not capture semantic similarity beyond n-gram overlap. - Macro F1 is the primary metric; accuracy alone would mask poor performance on rare classes. ## Fallback Behavior When the model is not loaded or confidence is below the threshold (0.55), the system falls back to keyword heuristic classification: define/list → Remember; explain → Understand; calculate/use → Apply; compare/contrast → Analyze; justify → Evaluate; design → Create. """ return card