Spaces:
Runtime error
Runtime error
| import os | |
| import json | |
| import joblib | |
| import numpy as np | |
| import pandas as pd | |
| import shap | |
| import matplotlib.pyplot as plt | |
| import scipy.sparse | |
| import polars as pl | |
| import re | |
| import gcsfs | |
| from sklearn.base import BaseEstimator, TransformerMixin | |
| from sklearn.pipeline import Pipeline as SKPipeline | |
| from sklearn.compose import ColumnTransformer | |
| from sklearn.preprocessing import OneHotEncoder, StandardScaler, MultiLabelBinarizer | |
| from sklearn.impute import SimpleImputer | |
| from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold | |
| from sklearn.feature_selection import SelectKBest, f_classif, VarianceThreshold | |
| from sklearn.metrics import classification_report, ConfusionMatrixDisplay, f1_score, make_scorer | |
| from sklearn.decomposition import TruncatedSVD | |
| from sklearn.calibration import CalibratedClassifierCV | |
| from sklearn.ensemble import IsolationForest | |
| from imblearn.pipeline import Pipeline as ImbPipeline | |
| from imblearn.over_sampling import ADASYN | |
| from sentence_transformers import SentenceTransformer | |
| from xgboost import XGBClassifier | |
| from evidently import Report | |
| from evidently.presets import DataDriftPreset | |
| import optuna | |
| # --- Custom Transformers --- | |
| # Transformer for binarizing multi-label columns (lists of categories per row) | |
class MultiLabelBinarizerTransformer(BaseEstimator, TransformerMixin):
    """Binarize a multi-label column (each cell is a list of strings).

    Intended for use inside a ColumnTransformer with a *single column name*
    (a string, not a list), so ``fit``/``transform`` receive a pd.Series.

    Attributes:
        col (str): Name of the column seen during ``fit``.
        mlb (MultiLabelBinarizer): The fitted binarizer.

    Note:
        The previous ``get_params``/``set_params`` stubs were removed: this
        estimator has no ``__init__`` parameters, so ``BaseEstimator``
        already provides correct implementations, and the stub
        ``set_params`` silently ignored invalid parameters instead of
        raising as sklearn's contract requires.
    """

    def fit(self, X, y=None):
        """Learn the label vocabulary from a Series of label lists.

        Args:
            X (pd.Series): Series whose cells are lists of labels.
            y: Ignored; present for sklearn API compatibility.

        Returns:
            self
        """
        self.col = X.name
        self.mlb = MultiLabelBinarizer()
        self.mlb.fit(X)
        return self

    def transform(self, X):
        """Binarize a Series of label lists into a 0/1 indicator matrix.

        Labels unseen during ``fit`` are dropped (sklearn emits a warning).

        Args:
            X (pd.Series): Series of lists to transform.

        Returns:
            np.ndarray: Binary matrix, one column per label seen in ``fit``.
        """
        return self.mlb.transform(X)

    def get_feature_names_out(self, input_features=None):
        """Return output feature names as ``"<column>_<label>"``.

        Args:
            input_features: Ignored; present for sklearn API compatibility.

        Returns:
            list: One name per binarized label column.
        """
        return [f"{self.col}_{cls}" for cls in self.mlb.classes_]
| # Adds anomaly score from IsolationForest as a feature (for noise/outlier detection) | |
class AnomalyScoreTransformer(BaseEstimator, TransformerMixin):
    """Append an IsolationForest anomaly score as one extra feature column.

    The score is the negated ``decision_function`` output, so larger values
    mean more anomalous rows.
    """

    def __init__(self):
        # Fixed seed for reproducibility, consistent with the rest of the file.
        self.model = IsolationForest(n_estimators=200, contamination=0.1, random_state=42)

    def fit(self, X, y=None):
        """Fit the IsolationForest on the input feature matrix.

        Args:
            X (array-like): Input features.
            y: Ignored; present for sklearn API compatibility.

        Returns:
            self
        """
        self.model.fit(X)
        return self

    def transform(self, X):
        """Return ``X`` with an anomaly-score column appended on the right.

        NOTE(review): ``np.hstack`` assumes a dense array; this holds here
        because the upstream ColumnTransformer uses sparse_threshold=0.0.

        Args:
            X (array-like): Input features.

        Returns:
            np.ndarray: ``X`` plus one trailing anomaly-score column.
        """
        scores = -self.model.decision_function(X)
        return np.hstack([X, scores.reshape(-1, 1)])

    def get_feature_names_out(self, input_features=None):
        """Output names: the input feature names plus ``'anomaly_score'``.

        Added so feature names can be traced through the pipeline; the
        transformer previously had no name propagation at all.

        Args:
            input_features (list of str, optional): Names of input columns.

        Returns:
            np.ndarray: Input names (if given) followed by 'anomaly_score'.
        """
        base = [] if input_features is None else list(input_features)
        return np.array(base + ["anomaly_score"])
| # --- Step 1: Data Preparation --- | |
def prepare_data(df, is_train=True, model_dir="model_artifacts"):
    """Clean and feature-engineer the raw input DataFrame for modeling.

    Steps:
      - When training, keep only CLOSED/TERMINATED rows and map them to a
        binary ``label`` (CLOSED -> 0, TERMINATED -> 1).
      - Normalize the multi-label list columns (drop Nones, upper-case).
      - Derive ``euroSciVoc_intermediate`` ancestors from the ontology paths.
      - Split comma-separated language entries into separate labels.
      - Fill text columns and add partner/country/SME count features.

    Args:
        df (pd.DataFrame): Raw data.
        is_train (bool): Whether the data is for training (adds ``label``).
        model_dir (str): Kept for interface compatibility; unused here.

    Returns:
        pd.DataFrame: Cleaned and feature-engineered copy of ``df``.

    Raises:
        ValueError: If label mapping produced NaNs (replaces the old
            ``assert``, which disappears under ``python -O``).
    """
    df = df.copy()
    if is_train:
        # Filter to the two labeled classes and map them to 0/1.
        df['status'] = df['status'].astype(str).str.upper()
        # .copy() so later column assignments hit a real frame, not a view
        # (avoids pandas SettingWithCopyWarning / silently lost writes).
        df = df[df['status'].isin(['CLOSED', 'TERMINATED'])].copy()
        df['label'] = df['status'].map({'CLOSED': 0, 'TERMINATED': 1})
        if not df['label'].notna().all():
            raise ValueError("Label column still has NaNs!")
    # Fields whose cells are lists of values (multi-label columns).
    multilabel_fields = [
        'list_country', 'list_activityType', 'list_deliverableType',
        'list_availableLanguages', 'list_euroSciVocTitle'
    ]

    def extract_intermediate_levels(paths):
        # Every path component except the leaf -> the ontology ancestors.
        tokens = []
        if isinstance(paths, list):
            for p in paths:
                parts = p.strip('/').split('/')
                tokens.extend(parts[:-1])
        return list(set(tokens))

    df['euroSciVoc_intermediate'] = df['list_euroSciVocPath'].apply(extract_intermediate_levels)
    multilabel_fields.append('euroSciVoc_intermediate')

    def normalize_multilabel(x):
        # Single pass instead of four .apply() sweeps per column:
        # coerce to list, drop Nones, upper-case every entry.
        if x is None:
            return []
        if hasattr(x, 'tolist'):  # e.g. numpy array coming from parquet
            x = x.tolist()
        elif not isinstance(x, list):
            x = list(x)
        return [str(item).upper() for item in x if item is not None]

    for col in multilabel_fields:
        df[col] = df[col].apply(normalize_multilabel)

    def split_languages(lang_list):
        # "EN,FR" entries become two separate labels.
        if not isinstance(lang_list, list):
            return []
        result = []
        for entry in lang_list:
            if isinstance(entry, str):
                result.extend(entry.split(","))
        return result

    df["list_availableLanguages"] = df["list_availableLanguages"].apply(split_languages)
    # Fill NA and coerce to string for the free-text columns.
    for col in ['title', 'objective']:
        df[col] = df[col].fillna("").astype(str)

    def safe_len(x):
        # Length of a list-like cell; 0 for None / non-list scalars.
        if x is not None and hasattr(x, 'tolist'):
            return len(x.tolist())
        return len(x) if isinstance(x, list) else 0

    # Count features: partners, countries, and SMEs per project.
    df['n_partners'] = df['list_name'].apply(safe_len)
    df['n_country'] = df['list_country'].apply(safe_len)
    df['n_sme'] = df['list_SME'].apply(
        lambda x: sum(1 for i in (x.tolist() if hasattr(x, 'tolist') else x) if i is True)
        if x is not None and (hasattr(x, 'tolist') or isinstance(x, list)) else 0
    )
    return df
| # --- Step 2: Text Embedding --- | |
def compute_embeddings(df, text_columns, model_name='sentence-transformers/LaBSE', svd_dim=50,
                       embeddings_dir="/content/drive/MyDrive/embeddings",
                       artifacts_dir="/content/drive/MyDrive/model_artifacts"):
    """Embed text columns with SBERT, reduce per column with TruncatedSVD,
    and append the reduced components as ``{col}_embed_{i}`` columns.

    Raw embeddings are cached on disk and reused on later runs; the SVD is
    (re)fitted on every call and pickled for use at scoring time.

    Args:
        df (pd.DataFrame): DataFrame containing the text columns.
        text_columns (list of str): Columns to embed.
        model_name (str): SentenceTransformer / HuggingFace model name.
        svd_dim (int): Number of SVD components kept per column.
        embeddings_dir (str): Cache directory for raw embeddings
            (previously hard-coded; default preserves old behavior).
        artifacts_dir (str): Directory for the fitted SVD pickles
            (previously hard-coded; default preserves old behavior).

    Returns:
        pd.DataFrame: Input frame with the embedding columns appended.
    """
    model = SentenceTransformer(model_name)
    os.makedirs(artifacts_dir, exist_ok=True)
    os.makedirs(embeddings_dir, exist_ok=True)
    for col in text_columns:
        embedding_file = os.path.join(embeddings_dir, f"{col}_embeddings.npy")
        svd_file = os.path.join(artifacts_dir, f"{col}_svd.pkl")
        if os.path.exists(embedding_file):
            # NOTE(review): cache is keyed only by column name, so it goes
            # stale if the underlying data changes — delete the .npy files
            # to force a refresh.
            print(f"Loading saved embeddings for column '{col}'...")
            embeddings = np.load(embedding_file)
        else:
            print(f"Computing embeddings for column '{col}'...")
            embeddings = model.encode(df[col].tolist(), show_progress_bar=True)
            np.save(embedding_file, embeddings)
        print(f"Fitting SVD for column '{col}'...")
        svd = TruncatedSVD(n_components=svd_dim, random_state=42)
        svd.fit(embeddings)
        joblib.dump(svd, svd_file)
        reduced = svd.transform(embeddings)
        embed_df = pd.DataFrame(reduced, columns=[f'{col}_embed_{i}' for i in range(reduced.shape[1])])
        embed_df.index = df.index  # keep index aligned for the concat
        df = pd.concat([df, embed_df], axis=1)
    return df
| # --- Step 3: Build Preprocessor --- | |
def build_preprocessor(numeric_features, categorical_features, multilabel_fields):
    """Assemble the ColumnTransformer for numeric, categorical, and
    multi-label inputs.

    Args:
        numeric_features (list of str): Numeric column names
            (median-imputed, then standardized).
        categorical_features (list of str): Categorical column names
            (constant-imputed, then one-hot encoded, unknowns ignored).
        multilabel_fields (list of str): Multi-label column names, each
            handled by its own MultiLabelBinarizerTransformer.

    Returns:
        ColumnTransformer: Dense-output transformer (sparse_threshold=0.0).
    """
    num_steps = [
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
    cat_steps = [
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
    ]
    transformers = [
        ('num', SKPipeline(num_steps, memory="cache_dir"), numeric_features),
        ('cat', SKPipeline(cat_steps, memory="cache_dir"), categorical_features),
    ]
    # One binarizer per multi-label field; passing the bare column name
    # (not a list) makes the transformer receive a Series.
    for field in multilabel_fields:
        transformers.append((f'mlb_{field}', MultiLabelBinarizerTransformer(), field))
    return ColumnTransformer(transformers, sparse_threshold=0.0)
| # --- Step 4: Build Pipeline --- | |
def build_pipeline(preprocessor, base_model, k=250):
    """Build the full ML pipeline: preprocessing, anomaly feature,
    ADASYN resampling, feature selection, and a calibrated classifier.

    Args:
        preprocessor (ColumnTransformer): Preprocessing transformer.
        base_model (sklearn estimator): Base classifier (e.g. XGBClassifier).
        k (int): Number of features kept by SelectKBest.

    Returns:
        ImbPipeline: Configured imbalanced-learn pipeline.
    """
    return ImbPipeline(steps=[
        ('preprocessor', preprocessor),
        ('anomaly', AnomalyScoreTransformer()),
        # Fixed seed so resampling (and hence the whole run) is
        # reproducible, consistent with random_state=42 used elsewhere.
        ('resample', ADASYN(random_state=42)),
        ("variance_filter", VarianceThreshold(threshold=0.0)),
        ('feature_select', SelectKBest(score_func=f_classif, k=k)),
        ('classifier', CalibratedClassifierCV(estimator=base_model, method='isotonic', cv=3)),
    ])
| # --- Step 5: Drift Monitoring --- | |
def monitor_drift(reference, current, feature_names, output_html='drift_report.html'):
    """Run an Evidently data-drift comparison and save it as an HTML report.

    Args:
        reference (np.ndarray or pd.DataFrame): Reference feature matrix.
        current (np.ndarray or pd.DataFrame): Current feature matrix.
        feature_names (list of str): Column names for both matrices.
        output_html (str): Destination path for the HTML report.

    Returns:
        None
    """
    reference_frame = pd.DataFrame(reference, columns=feature_names)
    current_frame = pd.DataFrame(current, columns=feature_names)
    drift_report = Report(metrics=[DataDriftPreset()])
    result = drift_report.run(reference_data=reference_frame, current_data=current_frame)
    result.save_html(output_html)
    print(f"✅ Drift report saved to {output_html}")
| # --- Step 6: Evaluation + SHAP --- | |
def evaluate_model(model, X_train, X_test, y_train, y_test, feature_names):
    """Print evaluation metrics, show a confusion matrix, and plot a SHAP
    summary for the selected features.

    Args:
        model (ImbPipeline): Fitted pipeline (preprocessor -> anomaly ->
            resample -> variance_filter -> feature_select -> classifier).
        X_train: Training features (unused; kept for interface stability).
        X_test (np.ndarray or pd.DataFrame): Test features.
        y_train: Training labels (unused; kept for interface stability).
        y_test (np.ndarray or pd.Series): Test labels.
        feature_names (list of str): Names of the *selected* features.

    Returns:
        None
    """
    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred))
    ConfusionMatrixDisplay.from_predictions(y_test, y_pred)
    plt.title("Evaluation")
    plt.tight_layout()
    plt.show()
    # Re-apply every transform step the classifier actually saw.
    # BUG FIX: the old code fed the preprocessor output straight into
    # 'feature_select', skipping 'anomaly' and 'variance_filter'; the
    # feature counts then no longer matched what SelectKBest was fitted
    # on and it raised at runtime.
    X_proc = model.named_steps['preprocessor'].transform(X_test)
    if scipy.sparse.issparse(X_proc):
        X_proc = X_proc.toarray()
    X_proc = model.named_steps['anomaly'].transform(X_proc)
    X_proc = model.named_steps['variance_filter'].transform(X_proc)
    X_selected = model.named_steps['feature_select'].transform(X_proc)
    # Explain the calibrated classifier's underlying tree model directly.
    explainer = shap.TreeExplainer(
        model.named_steps['classifier'].calibrated_classifiers_[0].estimator,
        feature_names=feature_names,
    )
    shap_values = explainer(X_selected)
    shap.summary_plot(shap_values, X_selected)
| # --- Final Orchestration --- | |
def status_prediction_model(df):
    """Orchestrate end-to-end training: data prep, embeddings, Optuna tuning,
    final fit, evaluation, drift monitoring, and artifact persistence.

    Args:
        df (pd.DataFrame): Raw input data.

    Returns:
        None. Side effects: writes model/preprocessor pickles, training
        CSVs, and feature_config.json under "model_artifacts/".
    """
    os.makedirs("model_artifacts", exist_ok=True)
    print("🧹 Preparing data...")
    df = prepare_data(df, is_train=True)
    print("💡 Embedding text...")
    df = compute_embeddings(df, ['title', 'objective'])
    # Feature lists for the pipeline
    text_embed_cols = [col for col in df.columns if '_embed_' in col]
    numeric_features = ['durationDays', 'ecMaxContribution', 'totalCost',
                        'n_partners', 'n_country', 'n_sme'] + text_embed_cols
    categorical_features = ['fundingScheme', 'legalBasis', 'nature']
    multilabel_fields = ['list_country', 'list_activityType', 'list_deliverableType',
                         'list_availableLanguages', 'list_euroSciVocTitle', 'euroSciVoc_intermediate']
    # Restrict to used features + label
    df = df[numeric_features + categorical_features + multilabel_fields + ['label']]
    X = df.drop(columns='label')
    y = df['label']
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)
    print("🧱 Building pipeline...")
    preprocessor = build_preprocessor(numeric_features, categorical_features, multilabel_fields)
    print("🎯 Training model with Optuna...")

    def objective(trial):
        """One Optuna trial: 3-fold CV F1 of a pipeline with sampled params."""
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 100, 300),
            'max_depth': trial.suggest_int('max_depth', 3, 10),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
            'scale_pos_weight': trial.suggest_float('scale_pos_weight', 2.0, 10.0)
        }
        # BUG FIX: build a *fresh* classifier per trial. The old code mutated
        # one shared XGBClassifier via set_params while running 6 trials in
        # parallel, so concurrent trials raced on each other's parameters.
        trial_model = XGBClassifier(eval_metric='logloss', n_jobs=-1, **params)
        pipeline = build_pipeline(preprocessor, trial_model)
        scores = cross_val_score(pipeline, X_train, y_train,
                                 cv=StratifiedKFold(3, shuffle=True, random_state=42),
                                 scoring=make_scorer(f1_score, pos_label=1), n_jobs=-1)
        return scores.mean()

    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=24, n_jobs=6)
    # Fresh classifier carrying the best hyperparameters.
    base_model = XGBClassifier(eval_metric='logloss', n_jobs=-1, **study.best_trial.params)
    print("✅ Training final model and evaluating...")
    final_pipeline = build_pipeline(preprocessor, base_model)
    final_pipeline.fit(X_train, y_train)
    # BUG FIX: feature names must be traced through *every* transform step.
    # The old code applied SelectKBest's mask directly to the preprocessor's
    # names, ignoring the appended anomaly-score column and the
    # variance_filter mask, so indices no longer matched the space
    # SelectKBest was fitted on.
    pre = final_pipeline.named_steps['preprocessor']
    anomaly = final_pipeline.named_steps['anomaly']
    vfilter = final_pipeline.named_steps['variance_filter']
    selector = final_pipeline.named_steps['feature_select']
    names = np.array(list(pre.get_feature_names_out()) + ['anomaly_score'])
    names = names[vfilter.get_support()]
    feature_names = names[selector.get_support()] if hasattr(selector, 'get_support') else names
    evaluate_model(final_pipeline, X_train, X_test, y_train, y_test, feature_names)
    # Drift monitoring (train vs test data)
    print("📊 Monitoring drift...")

    def transform_to_selected(X_raw):
        """Apply preprocessor -> anomaly -> variance filter -> SelectKBest."""
        Xp = pre.transform(X_raw)
        if scipy.sparse.issparse(Xp):
            Xp = Xp.toarray()
        Xp = anomaly.transform(Xp)
        Xp = vfilter.transform(Xp)
        return selector.transform(Xp)

    X_train_sel = transform_to_selected(X_train)
    X_test_sel = transform_to_selected(X_test)
    monitor_drift(X_train_sel, X_test_sel, feature_names)
    print("💾 Saving model and artifacts...")
    joblib.dump(final_pipeline, "model_artifacts/model.pkl")
    joblib.dump(preprocessor, "model_artifacts/preprocessor.pkl")
    X_train.to_csv("model_artifacts/X_train_processed.csv", index=False)
    y_train.to_csv("model_artifacts/y_train.csv", index=False)
    feature_config = {
        "numeric_features": numeric_features,
        "categorical_features": categorical_features,
        "multilabel_fields": multilabel_fields
    }
    # BUG FIX: use a context manager so the JSON file handle is closed and
    # flushed (the old bare open() in json.dump leaked it).
    with open("model_artifacts/feature_config.json", "w") as fh:
        json.dump(feature_config, fh)
    print("✅ Training complete. Model artifacts saved.")
def score(new_df, model_dir="model_artifacts"):
    """Score new/unseen data with the trained pipeline and explain each
    prediction with its top-6 SHAP features.

    Steps:
      - Load model and feature config.
      - Prepare data and embed text exactly as in training (cached SVDs).
      - Predict label and probability.
      - Compute SHAP values over the *selected* feature space and record the
        top-6 absolute contributors per row.

    Args:
        new_df (pd.DataFrame): Data to score.
        model_dir (str): Path to saved model artifacts.

    Returns:
        pd.DataFrame: Input data plus predictions, probabilities, and
        top-feature / top-SHAP columns.
    """
    # 1) Load trained model and config (context manager closes the handle).
    pipe = joblib.load(os.path.join(model_dir, "model.pkl"))
    with open(os.path.join(model_dir, "feature_config.json")) as fh:
        config = json.load(fh)
    # 2) Prepare & embed exactly as in training
    df = prepare_data(new_df.copy(), is_train=False)
    text_cols = ['title', 'objective']
    sbert = SentenceTransformer('sentence-transformers/LaBSE')
    os.makedirs("/content/drive/MyDrive/embeddings_test", exist_ok=True)
    for col in text_cols:
        embedding_file = f"/content/drive/MyDrive/embeddings_test/{col}_embeddings.npy"
        # Load the SVD fitted during training so dimensions match the model.
        svd = joblib.load(os.path.join(model_dir, f"{col}_svd.pkl"))
        if os.path.exists(embedding_file):
            print(f"Loading saved embeddings for column '{col}'...")
            embeddings = np.load(embedding_file)
        else:
            print(f"Computing embeddings for column '{col}'...")
            embeddings = sbert.encode(df[col].tolist(), show_progress_bar=True)
            np.save(embedding_file, embeddings)
        reduced = svd.transform(embeddings)
        emb_df = pd.DataFrame(reduced,
                              columns=[f"{col}_embed_{i}" for i in range(reduced.shape[1])],
                              index=df.index)
        df = pd.concat([df, emb_df], axis=1)
    # 3) Build the final feature set in training order.
    X = df[config["numeric_features"]
           + config["categorical_features"]
           + config["multilabel_fields"]]
    # 4) Predict & attach to DataFrame
    preds = pipe.predict(X)
    probs = pipe.predict_proba(X)[:, 1]  # assume binary, positive class = 1
    df["predicted_label"] = preds
    df["predicted_prob"] = probs
    # 5) SHAP explanations on the selected features.
    # BUG FIX: re-apply EVERY transform step before 'feature_select'. The
    # old code skipped 'anomaly' and 'variance_filter', so the matrix fed
    # to SelectKBest had the wrong number of columns and failed at runtime.
    preproc = pipe.named_steps["preprocessor"]
    anomaly = pipe.named_steps["anomaly"]
    vfilter = pipe.named_steps["variance_filter"]
    select = pipe.named_steps["feature_select"]
    clf = pipe.named_steps["classifier"].calibrated_classifiers_[0].estimator
    X_proc = preproc.transform(X)
    if scipy.sparse.issparse(X_proc):
        X_proc = X_proc.toarray()
    X_proc = anomaly.transform(X_proc)
    X_proc = vfilter.transform(X_proc)
    X_sel = select.transform(X_proc)
    # BUG FIX: names must pass through the same masks as the data —
    # preprocessor names + anomaly column, then variance filter, then kbest.
    names = np.array(list(preproc.get_feature_names_out()) + ["anomaly_score"])
    names = names[vfilter.get_support()]
    feature_names = names[select.get_support()]
    explainer = shap.Explainer(clf, X_sel, feature_names=feature_names)
    shap_vals = explainer(X_sel)
    # 6) For each row, pick top-6 absolute contributors.
    shap_df = pd.DataFrame(shap_vals.values, columns=feature_names, index=df.index)
    abs_shap = shap_df.abs()
    top_feats = abs_shap.apply(lambda row: row.nlargest(6).index.tolist(), axis=1)
    feat_cols = [f"top{i}_feature" for i in range(1, 7)]
    df[feat_cols] = pd.DataFrame(top_feats.tolist(), index=df.index)
    # Look up the signed SHAP value for each of the six named features.
    top_vals = [
        [shap_df.loc[idx, feat] for feat in feats]
        for idx, feats in top_feats.items()
    ]
    val_cols = [f"top{i}_shap" for i in range(1, 7)]
    df[val_cols] = pd.DataFrame(top_vals, index=df.index)
    return df
def clean_feature_name(raw: str) -> str:
    """Turn a raw pipeline feature name into a human-readable label.

    - ``"cat__col__val"``            -> ``"Col: Val"``
    - ``"num__n_xxx"``               -> ``"Number of xxx"`` (leading prefix only)
    - ``"mlb_list_x__list_x_VAL"``   -> ``"List X: Val"``
    - anything else                  -> title-cased with separators expanded

    Args:
        raw: Raw feature name as produced by the ColumnTransformer.

    Returns:
        Human-readable feature name ("" for falsy input).
    """
    if not raw:
        return ""
    # 1) one-hot categorical features
    if raw.startswith("cat__"):
        s = raw[len("cat__"):]
        col, val = (s.split("__", 1) + [None])[:2]
        col_c = col.replace("_", " ").title()
        if val:
            return f"{col_c}: {val.replace('_', ' ').title()}"
        return col_c
    # 2) numeric features
    if raw.startswith("num__"):
        s = raw[len("num__"):].replace("_", " ")
        # BUG FIX: the old blanket s.replace('n ', 'Number of ') corrupted
        # any name merely *containing* "n " (e.g. "mean value" became
        # "meaNumber of value"); only a leading "n " counter prefix
        # (n_partners, n_country, n_sme) should be expanded.
        if s.startswith("n "):
            return "Number of " + s[len("n "):]
        return s
    # 3) multi-label binarized features
    if raw.startswith("mlb_"):
        s = raw[len("mlb_"):]
        col_part, val_part = (s.split("__", 1) + [None])[:2]
        # drop leading "list_" on the column
        col_inner = col_part[len("list_"):] if col_part.startswith("list_") else col_part
        col_c = "List " + col_inner.replace("_", " ").title()
        if val_part:
            # drop "list_{col_inner}_" or a bare leading "list_"
            prefix = f"list_{col_inner}_"
            if val_part.startswith(prefix):
                val_inner = val_part[len(prefix):]
            elif val_part.startswith("list_"):
                val_inner = val_part[len("list_"):]
            else:
                val_inner = val_part
            return f"{col_c}: {val_inner.replace('_', ' ').title()}"
        return col_c
    # fallback: "__" -> ": ", "_" -> " ", title-case
    return raw.replace("__", ": ").replace("_", " ").title()
def preprocess_feature_names(df: pl.DataFrame) -> pl.DataFrame:
    """Tidy the scored output: human-readable top-feature names, rounded
    numeric columns, and no raw embedding columns.

    Args:
        df (pl.DataFrame): Output of ``score`` converted to polars.

    Returns:
        pl.DataFrame: Cleaned frame.
    """
    transforms = []
    # clean and round the top-6 feature names & SHAP values
    for i in range(1, 7):
        fcol = f"top{i}_feature"
        scol = f"top{i}_shap"
        if fcol in df.columns:
            transforms.append(
                pl.col(fcol)
                .map_elements(clean_feature_name, return_dtype=pl.Utf8)
                .alias(fcol)
            )
        if scol in df.columns:
            transforms.append(pl.col(scol).round(4).alias(scol))
    # round the overall predicted probability
    if "predicted_prob" in df.columns:
        transforms.append(pl.col("predicted_prob").round(4).alias("predicted_prob"))
    # Drop every SBERT/SVD embedding column. Generalized from the old
    # hard-coded range(50): a non-default svd_dim now still gets cleaned up
    # (previously embed indices >= 50 leaked through).
    embed_pat = re.compile(r"^(?:title|objective)_embed_\d+$")
    df = df.drop([c for c in df.columns if embed_pat.match(c)])
    return df.with_columns(transforms)
if __name__ == "__main__":
    # Entry point: load the consolidated dataset from Google Cloud Storage,
    # train the model and write artifacts, then score the same data as a
    # smoke test and show the first cleaned rows.
    bucket = "mda_eu_project"
    path = "data/consolidated_clean.parquet"
    uri = f"gs://{bucket}/{path}"
    fs = gcsfs.GCSFileSystem()
    with fs.open(uri, "rb") as f:
        df = pl.read_parquet(f).to_pandas()
    status_prediction_model(df)
    df_clean = preprocess_feature_names(pl.from_pandas(score(df)))
    # BUG FIX: a bare .head(10) expression is a no-op outside a notebook —
    # print it so the smoke test actually shows output.
    print(df_clean.head(10))