|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import os |
|
|
import pickle |
|
|
import argparse |
|
|
from dataclasses import dataclass |
|
|
from typing import List |
|
|
|
|
|
import numpy as np |
|
|
import matplotlib.pyplot as plt |
|
|
from sklearn.ensemble import RandomForestRegressor |
|
|
from sklearn.metrics import mean_absolute_error, mean_squared_error |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@dataclass
class Example:
    """One training record: a score distribution and its target threshold.

    Field order matters for positional construction and must stay as-is.
    """

    # Raw scores; fed to extract_distribution_features() during training.
    scores: np.ndarray
    # Regression target the random forest learns to predict.
    threshold: float
    # NOTE(review): not read anywhere in the visible code -- presumably the
    # bin count used when the example was generated; confirm with the producer.
    bins: int
    # NOTE(review): free-form metadata; not read anywhere in the visible code.
    meta: dict
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def load_dataset(path: str) -> List[Example]:
    """Unpickle the dataset stored at *path* and report how many items it holds.

    The object is returned exactly as unpickled; no validation is performed.

    SECURITY: ``pickle.load`` can execute arbitrary code embedded in the file.
    Only load datasets that come from a trusted source.

    Args:
        path: Location of the pickle file to read.

    Returns:
        The unpickled object (expected to be a list of ``Example``).
    """
    with open(path, "rb") as handle:
        dataset = pickle.load(handle)

    print(f"Loaded {len(dataset)} examples from {path}")
    return dataset
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def extract_distribution_features(scores: np.ndarray, n_bins: int = 100) -> np.ndarray:
    """Build a fixed-length feature vector describing a score distribution.

    The vector is laid out in this order:

    1. Seven summary statistics: mean, std, median, 25th percentile,
       75th percentile, min, max.
    2. ``n_bins`` histogram values over the range [0, 1], normalized to sum
       to ~1 (scores outside [0, 1] are not counted by the histogram).
    3. The same histogram smoothed with a 5-tap moving average.
    4. Skewness and excess kurtosis.
    5. Fraction of scores in five bands: ``< 0.2``, ``[0.2, 0.4)``,
       ``[0.4, 0.6)``, ``[0.6, 0.8)`` and ``>= 0.8``.
    6. The most negative slope of the smoothed histogram.

    Args:
        scores: Scores to summarize. Any array-like is accepted; it is
            flattened to 1-D before processing.
        n_bins: Number of histogram bins.

    Returns:
        A 1-D ``float32`` array. For ``n_bins >= 5`` its length is
        ``2 * n_bins + 15``.

    Raises:
        ValueError: If ``scores`` contains no values.
    """
    # Accept lists / nested arrays as well as 1-D ndarrays so that every
    # statistic below is computed over the same flat set of values.
    values = np.asarray(scores).ravel()
    if values.size == 0:
        raise ValueError("scores must contain at least one value")

    feats = [
        float(np.mean(values)),
        float(np.std(values)),
        float(np.median(values)),
        float(np.percentile(values, 25)),
        float(np.percentile(values, 75)),
        float(np.min(values)),
        float(np.max(values)),
    ]

    # Normalized histogram; the epsilon guards against division by zero when
    # every score falls outside [0, 1].
    hist, _ = np.histogram(values, bins=n_bins, range=(0.0, 1.0))
    hist = hist.astype(np.float32)
    hist = hist / (np.sum(hist) + 1e-8)
    feats.extend(hist.tolist())

    # Moving-average smoothing of the histogram.
    kernel_size = 5
    kernel = np.ones(kernel_size, dtype=np.float32) / kernel_size
    smooth_hist = np.convolve(hist, kernel, mode="same")
    feats.extend(smooth_hist.tolist())

    # Higher moments. Both values are computed *before* anything is appended,
    # so a failure can never leave a partially extended (mis-sized) vector.
    try:
        from scipy.stats import kurtosis, skew
    except ImportError:
        # scipy is optional: fall back to the plain moment formulas.
        standardized = (values - np.mean(values)) / (np.std(values) + 1e-8)
        skewness = float(np.mean(standardized ** 3))
        excess_kurtosis = float(np.mean(standardized ** 4) - 3.0)
    else:
        skewness = float(skew(values))
        excess_kurtosis = float(kurtosis(values))

    # scipy can return NaN for (near-)constant input; keep the feature vector
    # finite regardless of which implementation produced the moments.
    if not np.isfinite(skewness):
        skewness = 0.0
    if not np.isfinite(excess_kurtosis):
        excess_kurtosis = 0.0
    feats.extend((skewness, excess_kurtosis))

    # Fraction of scores in each coarse band.
    total = values.size
    band_masks = (
        values < 0.2,
        (values >= 0.2) & (values < 0.4),
        (values >= 0.4) & (values < 0.6),
        (values >= 0.6) & (values < 0.8),
        values >= 0.8,
    )
    for mask in band_masks:
        feats.append(float(np.sum(mask) / total))

    # Steepest drop in the smoothed histogram.
    hist_grad = np.gradient(smooth_hist)
    feats.append(float(np.min(hist_grad)))

    return np.asarray(feats, dtype=np.float32)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def create_evaluation_plots(targets: np.ndarray,
                            predictions: np.ndarray,
                            output_dir: str = "rf_evaluation") -> None:
    """Render a two-panel evaluation figure and save it as a PNG.

    Left panel: predicted vs. true thresholds with the identity line, both
    axes fixed to [0, 1]. Right panel: histogram of signed errors
    (``predictions - targets``) with markers at zero and at +/- MAE.

    Args:
        targets: Ground-truth thresholds.
        predictions: Model predictions, aligned with ``targets``.
        output_dir: Directory (created if missing) that receives
            ``rf_evaluation.png``.
    """
    os.makedirs(output_dir, exist_ok=True)

    figure, (scatter_ax, error_ax) = plt.subplots(1, 2, figsize=(14, 5))

    # --- Panel 1: predictions against ground truth -------------------------
    scatter_ax.scatter(targets, predictions, alpha=0.5, s=20)
    scatter_ax.plot([0, 1], [0, 1], 'r--', linewidth=2, label='Perfect prediction')
    scatter_ax.set(
        xlabel='True Threshold',
        ylabel='Predicted Threshold',
        title='Random Forest: Predictions vs Ground Truth',
    )
    scatter_ax.legend()
    scatter_ax.grid(True, alpha=0.3)
    scatter_ax.set_xlim(0, 1)
    scatter_ax.set_ylim(0, 1)

    # --- Panel 2: distribution of signed errors ----------------------------
    errors = predictions - targets
    mae = np.abs(errors).mean()
    mae_line_style = dict(color='g', linestyle='--', linewidth=2)

    error_ax.hist(errors, bins=50, alpha=0.7, edgecolor='black')
    error_ax.axvline(0, color='r', linestyle='--', linewidth=2, label='Zero error')
    error_ax.axvline(mae, label=f'MAE={mae:.4f}', **mae_line_style)
    error_ax.axvline(-mae, **mae_line_style)
    error_ax.set(
        xlabel='Prediction Error',
        ylabel='Count',
        title='Error Distribution',
    )
    error_ax.legend()
    error_ax.grid(True, alpha=0.3)

    figure.tight_layout()
    out_path = os.path.join(output_dir, "rf_evaluation.png")
    figure.savefig(out_path, dpi=150)
    plt.close(figure)
    print(f"Saved visualizations to: {out_path}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _print_banner(title: str, leading_blank: bool = True) -> None:
    """Print *title* framed by two 70-character '=' rules."""
    rule = "=" * 70
    print("\n" + rule if leading_blank else rule)
    print(title)
    print(rule)


def _build_feature_matrix(examples, n_bins: int):
    """Return ``(features, targets)`` arrays for a non-empty list of examples."""
    features = np.stack(
        [extract_distribution_features(ex.scores, n_bins=n_bins) for ex in examples]
    )
    targets = np.array([ex.threshold for ex in examples], dtype=np.float32)
    return features, targets


def train_rf_threshold_model(
    dataset_path: str = "threshold_dataset.pkl",
    output_path: str = "threshold_model_rf.pkl",
    train_split: float = 0.9,
    n_estimators: int = 100,
    max_depth: int = 20,
    random_state: int = 42,
    n_bins: int = 100,
):
    """Train, evaluate and persist a random-forest threshold regressor.

    Steps: load the pickled dataset, split it positionally into train and
    validation parts, extract distribution features, fit a
    ``RandomForestRegressor``, print MAE/RMSE and tolerance statistics, print
    the ten most important features, pickle the model bundle to
    ``output_path`` and write evaluation plots to the ``rf_evaluation``
    directory.

    Args:
        dataset_path: Pickle file read via ``load_dataset``.
        output_path: Destination of the pickled model bundle.
        train_split: Fraction of examples (taken from the start of the
            dataset) used for training; the remainder is the validation set.
        n_estimators: Number of trees in the forest.
        max_depth: Maximum tree depth (``None`` for unlimited).
        random_state: Seed passed to the forest.
        n_bins: Histogram bin count used during feature extraction.

    Returns:
        ``(rf, model_data)`` -- the fitted regressor and the dict that was
        pickled to ``output_path``.

    Raises:
        ValueError: If the split leaves the training or validation set empty.
    """
    _print_banner("TRAINING RANDOM FOREST FOR THRESHOLD DETECTION", leading_blank=False)

    print("\nLoading dataset...")
    examples = load_dataset(dataset_path)

    # NOTE: the split is purely positional (no shuffling); the dataset is
    # expected to be stored in an order that makes this a fair split.
    n_train = int(len(examples) * train_split)
    train_examples = examples[:n_train]
    val_examples = examples[n_train:]
    print(f"Train: {len(train_examples)}, Val: {len(val_examples)}")

    # Fail early with a clear message instead of an opaque np.stack error.
    if len(train_examples) == 0 or len(val_examples) == 0:
        raise ValueError(
            f"train_split={train_split} leaves {len(train_examples)} training and "
            f"{len(val_examples)} validation examples; both sets must be non-empty"
        )

    print("\nExtracting features (this may take a minute)...")
    train_features, train_targets = _build_feature_matrix(train_examples, n_bins)
    val_features, val_targets = _build_feature_matrix(val_examples, n_bins)
    print(f"Feature dimension: {train_features.shape[1]}")

    _print_banner("Training Random Forest...")
    rf = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        random_state=random_state,
        n_jobs=-1,
        verbose=1,
    )
    rf.fit(train_features, train_targets)
    print("\nTraining complete!")

    _print_banner("EVALUATION")
    train_preds = rf.predict(train_features)
    val_preds = rf.predict(val_features)

    train_mae = mean_absolute_error(train_targets, train_preds)
    train_rmse = np.sqrt(mean_squared_error(train_targets, train_preds))
    val_mae = mean_absolute_error(val_targets, val_preds)
    val_rmse = np.sqrt(mean_squared_error(val_targets, val_preds))

    print("\nTraining Set:")
    print(f" MAE: {train_mae:.6f}")
    print(f" RMSE: {train_rmse:.6f}")

    print("\nValidation Set:")
    print(f" MAE: {val_mae:.6f}")
    print(f" RMSE: {val_rmse:.6f}")

    # Share of validation predictions whose absolute error is within each
    # tolerance (tolerances are absolute, printed as percentage points).
    errors = np.abs(val_preds - val_targets)
    for tol in (0.01, 0.02, 0.05, 0.10):
        pct = 100 * np.mean(errors <= tol)
        print(f" Within {int(tol*100)}%: {pct:.1f}%")

    _print_banner("TOP 10 MOST IMPORTANT FEATURES")
    fi = rf.feature_importances_
    top_idx = np.argsort(fi)[-10:][::-1]  # ten largest, most important first
    for i, idx in enumerate(top_idx, 1):
        print(f"{i:2d}. Feature {idx:3d}: {fi[idx]:.4f}")

    _print_banner("SAVING MODEL")
    n_features = int(train_features.shape[1])
    model_data = {
        "model": rf,
        "feature_names": [f"feature_{i}" for i in range(n_features)],
        "n_features": n_features,
        "val_mae": float(val_mae),
        "val_rmse": float(val_rmse),
        "training_info": {
            "n_estimators": int(n_estimators),
            "max_depth": int(max_depth) if max_depth is not None else None,
            "train_samples": int(len(train_examples)),
            "val_samples": int(len(val_examples)),
            "train_mae": float(train_mae),
            "val_mae": float(val_mae),
        },
        "n_bins": int(n_bins),
    }
    with open(output_path, "wb") as f:
        pickle.dump(model_data, f, protocol=pickle.HIGHEST_PROTOCOL)

    # Report the size of the file actually written rather than serializing
    # the whole bundle a second time in memory.
    size_mb = os.path.getsize(output_path) / 1024 / 1024
    print(f"\nModel saved to: {output_path}")
    print(f"Model size: {size_mb:.2f} MB")

    _print_banner("CREATING VISUALIZATIONS")
    create_evaluation_plots(val_targets, val_preds, output_dir="rf_evaluation")

    _print_banner("✅ TRAINING COMPLETE!")
    print("\nTo use this model:")
    print(f"1) Copy {output_path} next to your inference code")
    print("2) Load with pickle and pass features from extract_distribution_features(...)")
    return rf, model_data
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def parse_args(argv=None):
    """Parse the command-line options of the training script.

    Args:
        argv: Optional list of argument strings. When ``None`` (the default)
            argparse reads the process arguments from ``sys.argv[1:]``, which
            is the original behaviour. Passing a list allows programmatic
            use and testing.

    Returns:
        An ``argparse.Namespace`` with attributes ``dataset_path``,
        ``output_path``, ``train_split``, ``n_estimators``, ``max_depth``,
        ``random_state`` and ``n_bins``.
    """
    parser = argparse.ArgumentParser(
        description="Train a Random Forest threshold model (standalone)."
    )
    parser.add_argument("--dataset-path", type=str, default="threshold_dataset.pkl",
                        help="Path to the pickled dataset to load.")
    parser.add_argument("--output-path", type=str, default="threshold_model_rf.pkl",
                        help="Where to write the pickled model bundle.")
    parser.add_argument("--train-split", type=float, default=0.9,
                        help="Fraction of examples used for training.")
    parser.add_argument("--n-estimators", type=int, default=100,
                        help="Number of trees in the forest.")
    parser.add_argument("--max-depth", type=int, default=20,
                        help="Maximum depth of each tree.")
    parser.add_argument("--random-state", type=int, default=42,
                        help="Random seed passed to the forest.")
    parser.add_argument("--n-bins", type=int, default=100,
                        help="Histogram bins for feature extraction.")
    return parser.parse_args(argv)
|
|
|
|
|
|
|
|
def main():
    """Script entry point: parse the CLI options and launch training."""
    cli = parse_args()

    # Forward exactly the options the trainer accepts, by keyword.
    option_names = (
        "dataset_path",
        "output_path",
        "train_split",
        "n_estimators",
        "max_depth",
        "random_state",
        "n_bins",
    )
    training_kwargs = {name: getattr(cli, name) for name in option_names}
    train_rf_threshold_model(**training_kwargs)


# Run training only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
|
|