| """ |
| Evaluation utilities for comparing trained vs random agents. |
| """ |
|
|
| import numpy as np |
| import pandas as pd |
| from typing import List, Dict, Optional |
|
|
| from training.config import TrainingConfig |
| from training.train import train, run_random_baseline |
| from utils.visualization import ( |
| plot_reward_curve, |
| plot_grade_progression, |
| plot_comparison_table, |
| ) |
|
|
|
|
def _mean_metric(metrics: List[Dict], key: str):
    """Return the NumPy mean of ``key`` over a list of per-episode metric dicts."""
    return np.mean([episode[key] for episode in metrics])


def _print_comparison(baseline_metrics: List[Dict], trained_metrics: List[Dict]) -> None:
    """Print the random-vs-trained table of averaged metrics to stdout."""
    print(f"\n{'='*60}")
    print("EVALUATION RESULTS")
    print(f"{'='*60}")

    print(f"\n{'Metric':<20} {'Random':>12} {'Trained':>12} {'Improvement':>14}")
    print("-" * 60)

    rows = [
        ("total_reward", "Avg Reward"),
        ("final_grade", "Avg Grade"),
        ("pnl_pct", "Avg PnL %"),
        ("max_drawdown", "Avg Max DD"),
        ("sharpe_ratio", "Avg Sharpe"),
    ]
    for key, label in rows:
        random_avg = _mean_metric(baseline_metrics, key)
        trained_avg = _mean_metric(trained_metrics, key)
        # "Improvement" is simply trained minus random for every metric.
        delta = trained_avg - random_avg
        # Format the signed number first and pad afterwards, so the "+" sits
        # directly against the digits and every row has the same width.
        # Only strictly positive deltas get an explicit "+".
        delta_text = f"+{delta:.4f}" if delta > 0 else f"{delta:.4f}"
        print(f" {label:<18} {random_avg:>12.4f} {trained_avg:>12.4f} {delta_text:>14}")


def evaluate(
    config: Optional[TrainingConfig] = None,
    trained_metrics: Optional[List[Dict]] = None,
    baseline_episodes: int = 10,
    df: Optional[pd.DataFrame] = None,
) -> Dict:
    """
    Run full evaluation: train agent, run random baseline, compare, and plot.

    Each metrics entry (trained or baseline) is indexed by the keys
    "total_reward", "final_grade", "pnl_pct", "max_drawdown" and
    "sharpe_ratio"; the values are averaged per key for the printed table.

    Args:
        config: Training configuration (a default ``TrainingConfig()`` is
            created if None).
        trained_metrics: Pre-computed training metrics. When None, ``train``
            is called with ``config`` and ``df`` to produce them.
        baseline_episodes: Number of episodes passed to
            ``run_random_baseline`` as ``num_episodes``.
        df: Optional dataframe forwarded to both ``train`` and
            ``run_random_baseline``.

    Returns:
        Dict with keys:
            "trained_metrics": the trained-agent metrics list,
            "baseline_metrics": the random-baseline metrics list,
            "trained_avg_grade": mean "final_grade" of the trained metrics,
            "baseline_avg_grade": mean "final_grade" of the baseline metrics,
            "grade_improvement": trained average minus baseline average.
    """
    if config is None:
        config = TrainingConfig()

    # Training is skipped when the caller already supplies metrics.
    if trained_metrics is None:
        print("Running training...")
        trained_metrics = train(config, df=df)

    print(f"\nRunning random baseline ({baseline_episodes} episodes)...")
    baseline_metrics = run_random_baseline(config, df=df, num_episodes=baseline_episodes)

    _print_comparison(baseline_metrics, trained_metrics)

    print("\nGenerating plots...")
    plot_reward_curve(trained_metrics, baseline_metrics)
    plot_grade_progression(trained_metrics, baseline_metrics)
    plot_comparison_table(trained_metrics, baseline_metrics)

    # Compute each grade average once and reuse it for the difference.
    trained_avg_grade = _mean_metric(trained_metrics, "final_grade")
    baseline_avg_grade = _mean_metric(baseline_metrics, "final_grade")

    return {
        "trained_metrics": trained_metrics,
        "baseline_metrics": baseline_metrics,
        "trained_avg_grade": trained_avg_grade,
        "baseline_avg_grade": baseline_avg_grade,
        "grade_improvement": trained_avg_grade - baseline_avg_grade,
    }
|
|
|
|
if __name__ == "__main__":
    # Executed as a script: run the evaluation with all default arguments.
    evaluate()
|
|