"""
server/evaluator.py — Train/holdout evaluator (v0.5).
Trains on agent's modified data, tests on frozen holdout.
Also returns feature importance and regression explanations.
"""
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
class Evaluator:
    """Fit a classifier on candidate training data and score it on a holdout.

    The holdout frame is cleaned once at construction time: rows containing
    any missing value are dropped, and the numeric feature columns plus the
    ``label`` column are cached as arrays. Each evaluation refits a
    ``StandardScaler`` + ``LogisticRegression`` pipeline on the supplied
    training frame and reports accuracy on the cached holdout arrays,
    together with a coefficient-based feature-importance summary.
    """

    # Minimum number of complete rows needed for a meaningful evaluation.
    _MIN_HOLDOUT_ROWS = 5
    _MIN_TRAIN_ROWS = 20
    # Number of features listed per sign in the importance summary.
    _TOP_K = 4
    # Accuracy drop (absolute) below which no regression is reported.
    _REGRESSION_TOLERANCE = 0.005

    def __init__(self, holdout_df: pd.DataFrame):
        """Store the holdout frame, build the pipeline and cache holdout arrays.

        Args:
            holdout_df: Holdout data. A ``label`` column is required for the
                evaluator to be usable; without it every evaluation returns
                an accuracy of 0.0.
        """
        self.holdout_df = holdout_df
        self._feature_cols: "list | None" = None
        self._holdout_X: "np.ndarray | None" = None
        self._holdout_y: "np.ndarray | None" = None
        self._pipeline = Pipeline([
            ("scaler", StandardScaler()),
            ("clf", LogisticRegression(max_iter=500, random_state=42, n_jobs=1)),
        ])
        self._last_feature_importance: dict = {}
        self._last_error = None
        self._prepare_holdout()

    @staticmethod
    def _zero_result() -> dict:
        """Return a fresh "could not evaluate" result."""
        return {"accuracy": 0.0, "feature_importance": {}}

    @staticmethod
    def _is_feature_column(df: pd.DataFrame, col) -> bool:
        """Tell whether ``col`` is a usable numeric feature column of ``df``.

        The label column and columns whose name starts with ``_`` are
        excluded. The dtype must be numeric: a plain ``dtype != object`` test
        lets through non-numeric dtypes such as pandas ``string``, which then
        fail in ``astype(float)``.
        """
        return (
            col != "label"
            and not str(col).startswith("_")
            and pd.api.types.is_numeric_dtype(df[col])
        )

    def _prepare_holdout(self):
        """Cache the feature names and the holdout feature/label arrays.

        Leaves ``_holdout_X`` and ``_holdout_y`` as ``None`` when the holdout
        has no ``label`` column, no usable feature column, or fewer than
        ``_MIN_HOLDOUT_ROWS`` complete rows.
        """
        self._holdout_X = None
        self._holdout_y = None
        df = self.holdout_df.dropna()
        if "label" not in df.columns:
            self._feature_cols = []
            return
        self._feature_cols = [
            c for c in df.columns if self._is_feature_column(df, c)
        ]
        if not self._feature_cols or len(df) < self._MIN_HOLDOUT_ROWS:
            return
        self._holdout_X = df[self._feature_cols].values.astype(float)
        self._holdout_y = df["label"].values

    def evaluate(self, train_df: pd.DataFrame) -> float:
        """Return the holdout accuracy of a model trained on ``train_df``."""
        return self._run(train_df)["accuracy"]

    def evaluate_with_details(self, train_df: pd.DataFrame, prev_accuracy: float = None) -> dict:
        """Evaluate ``train_df`` and explain the result.

        Args:
            train_df: Candidate training data.
            prev_accuracy: Accuracy of the previous step, or ``None`` to skip
                the regression check.

        Returns:
            A dict with ``accuracy``, ``feature_importance`` and
            ``regression_explanation``. The explanation is ``None`` unless
            accuracy fell more than ``_REGRESSION_TOLERANCE`` below
            ``prev_accuracy``.
        """
        result = self._run(train_df)
        acc = result["accuracy"]
        explanation = None
        if prev_accuracy is not None and acc < prev_accuracy - self._REGRESSION_TOLERANCE:
            explanation = self._explain_regression(train_df, prev_accuracy, acc)
        return {
            "accuracy": acc,
            "feature_importance": result.get("feature_importance", {}),
            "regression_explanation": explanation,
        }

    def _run(self, train_df: pd.DataFrame) -> dict:
        """Fit the pipeline on ``train_df`` and score it on the holdout.

        Returns:
            ``{"accuracy": float, "feature_importance": dict}``. Accuracy is
            0.0 with an empty importance dict when the data is unusable: no
            holdout, no ``label`` column, fewer than ``_MIN_TRAIN_ROWS``
            complete rows, no shared numeric feature, a single class on
            either side, or a failure while fitting or scoring.
        """
        self._last_error = None
        if self._holdout_X is None or len(self._holdout_y) < self._MIN_HOLDOUT_ROWS:
            return self._zero_result()
        if "label" not in train_df.columns:
            return self._zero_result()

        train_clean = train_df.dropna()
        if len(train_clean) < self._MIN_TRAIN_ROWS:
            return self._zero_result()

        available_cols = [
            c for c in self._feature_cols
            if c in train_clean.columns
            and pd.api.types.is_numeric_dtype(train_clean[c])
        ]
        if not available_cols:
            return self._zero_result()

        y_train = train_clean["label"].values
        y_test = self._holdout_y
        if len(pd.unique(y_train)) < 2 or len(pd.unique(y_test)) < 2:
            return self._zero_result()

        X_train = train_clean[available_cols].values.astype(float)
        # Reuse the arrays cached by _prepare_holdout instead of cleaning the
        # holdout again; keep only the columns the training frame also has.
        positions = [self._feature_cols.index(c) for c in available_cols]
        X_test = self._holdout_X[:, positions]

        try:
            self._pipeline.fit(X_train, y_train)
            accuracy = float(self._pipeline.score(X_test, y_test))
            clf = self._pipeline.named_steps["clf"]
            # NOTE(review): for more than two classes the per-class
            # coefficients are averaged; they may largely cancel out, which
            # would make the ranking uninformative. Confirm intended.
            coefs = clf.coef_[0] if clf.coef_.shape[0] == 1 else clf.coef_.mean(axis=0)
        except Exception as exc:
            # Deliberate best-effort: a failed fit counts as zero accuracy.
            # The exception is kept so the failure can still be diagnosed.
            self._last_error = exc
            return self._zero_result()

        importance = {
            col: round(float(coef), 4) for col, coef in zip(available_cols, coefs)
        }
        ranked = sorted(importance.items(), key=lambda kv: abs(kv[1]), reverse=True)
        feature_importance = {
            "top_positive": [{"feature": k, "coef": v} for k, v in ranked if v > 0][:self._TOP_K],
            "top_negative": [{"feature": k, "coef": v} for k, v in ranked if v < 0][:self._TOP_K],
            "note": "Coefficients after StandardScaler — magnitude reflects relative importance.",
        }
        self._last_feature_importance = feature_importance
        return {"accuracy": accuracy, "feature_importance": feature_importance}

    def _explain_regression(self, train_df: pd.DataFrame, prev_acc: float, new_acc: float) -> dict:
        """Guess why accuracy dropped, from simple statistics of ``train_df``.

        The checks run in a fixed order and the first match wins: more than
        800 complete rows, a high share of missing cells, a minority class
        share below 0.25, fewer than 200 complete rows.

        Returns:
            A dict with ``accuracy_delta``, ``likely_cause``, ``suggestion``
            and ``training_stats``.
        """
        delta = round(new_acc - prev_acc, 4)
        n_rows = len(train_df.dropna())
        n_missing = int(train_df.isnull().sum().sum())
        if "label" in train_df.columns:
            label_counts = train_df["label"].value_counts()
        else:
            label_counts = pd.Series(dtype=float)
        # Share of the minority class; 1.0 when fewer than two classes exist.
        balance = float(label_counts.min() / label_counts.sum()) if len(label_counts) > 1 else 1.0

        likely_cause = "unknown"
        suggestion = "Try a different approach or rollback this step."
        # NOTE(review): the missing-cell ratio divides cells missing in the
        # whole frame by the cell count of the complete rows only, so it can
        # exceed 1. Kept as-is because the 0.15 threshold may rely on it.
        if n_rows > 800:
            likely_cause = "large_augmentation_overfitting"
            suggestion = (
                "Large augmentation overfits the training set — synthetic rows don't generalise to holdout. "
                "Try 'query_balancer' with undersample_majority instead, or rollback."
            )
        elif n_missing / max(n_rows * train_df.shape[1], 1) > 0.15:
            likely_cause = "high_residual_missing"
            suggestion = "Many missing values remain. Apply 'query_cleaner' again on remaining columns."
        elif balance < 0.25:
            likely_cause = "worsened_class_imbalance"
            suggestion = (
                "Class imbalance got worse. The classifier is biased toward majority on holdout. "
                "Apply 'query_balancer' or 'query_augmenter'."
            )
        elif n_rows < 200:
            likely_cause = "too_few_training_rows"
            suggestion = "Row deletion left too few examples. Prefer imputation over drop_rows, or rollback."

        return {
            "accuracy_delta": delta,
            "likely_cause": likely_cause,
            "suggestion": suggestion,
            "training_stats": {
                "n_rows": n_rows,
                "n_missing_cells": n_missing,
                "class_balance": round(balance, 4),
            },
        }

    def baseline_accuracy(self) -> float:
        """Return the accuracy of always predicting the holdout majority class.

        Labels are counted with ``value_counts`` rather than ``np.bincount``,
        so float, negative and string labels work too. Returns 0.0 when no
        usable holdout is available.
        """
        if self._holdout_y is None or len(self._holdout_y) == 0:
            return 0.0
        majority = pd.Series(self._holdout_y).value_counts().max()
        return round(float(majority) / len(self._holdout_y), 4)

    @property
    def last_feature_importance(self) -> dict:
        """Feature-importance summary of the most recent successful run."""
        return self._last_feature_importance

    @property
    def last_error(self):
        """Exception raised by the most recent run's fit or score, else ``None``."""
        return self._last_error
|