| """
|
| AegisLM Benchmarking Module
|
|
|
| Provides benchmarking capabilities for evaluating LLM robustness:
|
| - Baseline evaluation mode
|
| - Adversarial evaluation mode
|
| - Delta robustness computation
|
| - Cross-model comparison
|
| - Statistical reporting
|
| - Benchmark artifact generation
|
| """
|
|
|
| from backend.benchmarking.comparison import (
|
| compare_models,
|
| find_most_robust_model,
|
| find_most_stable_model,
|
| find_most_vulnerable_model,
|
| generate_comparative_report,
|
| generate_vulnerability_heatmap,
|
| get_attack_type_vulnerability,
|
| rank_models,
|
| )
|
| from backend.benchmarking.engine import (
|
| BenchmarkEngine,
|
| BenchmarkEvent,
|
| get_benchmark_engine,
|
| )
|
| from backend.benchmarking.reporter import (
|
| DEFAULT_BENCHMARK_DIR,
|
| export_to_csv,
|
| generate_benchmark_artifact,
|
| generate_summary_report,
|
| generate_text_report,
|
| list_benchmarks,
|
| load_benchmark_artifact,
|
| )
|
| from backend.benchmarking.schemas import (
|
| BenchmarkMode,
|
| BenchmarkPerformance,
|
| BenchmarkResult,
|
| BenchmarkStatus,
|
| BenchmarkWeights,
|
| EvaluationResult,
|
| MetricDeltas,
|
| ModelBenchmarkResult,
|
| ModelMetrics,
|
| ModelRanking,
|
| StartBenchmarkRequest,
|
| StartBenchmarkResponse,
|
| VulnerabilityHeatmap,
|
| VulnerabilityHeatmapCell,
|
| )
|
| from backend.benchmarking.statistics import (
|
| MetricStatistics,
|
| calculate_confidence_interval,
|
| calculate_mean,
|
| calculate_mean_with_ci,
|
| calculate_paired_differences,
|
| calculate_standard_deviation,
|
| calculate_variance,
|
| calculate_sample_std,
|
| cohens_d,
|
| calculate_vulnerability_consistency,
|
| generate_summary_statistics,
|
| paired_t_test,
|
| )
|
|
|
|
|
# Public API of the benchmarking package. Entries are grouped by the
# submodule each name is imported from at the top of this file; the order
# of the entries is kept exactly as before.
__all__ = [
    # backend.benchmarking.comparison
    "compare_models",
    "find_most_robust_model",
    "find_most_stable_model",
    "find_most_vulnerable_model",
    "generate_comparative_report",
    "generate_vulnerability_heatmap",
    "get_attack_type_vulnerability",
    "rank_models",
    # backend.benchmarking.engine
    "BenchmarkEngine",
    "BenchmarkEvent",
    "get_benchmark_engine",
    # backend.benchmarking.reporter
    "DEFAULT_BENCHMARK_DIR",
    "export_to_csv",
    "generate_benchmark_artifact",
    "generate_summary_report",
    "generate_text_report",
    "list_benchmarks",
    "load_benchmark_artifact",
    # backend.benchmarking.schemas
    "BenchmarkMode",
    "BenchmarkPerformance",
    "BenchmarkResult",
    "BenchmarkStatus",
    "BenchmarkWeights",
    "EvaluationResult",
    "MetricDeltas",
    "ModelBenchmarkResult",
    "ModelMetrics",
    "ModelRanking",
    "StartBenchmarkRequest",
    "StartBenchmarkResponse",
    "VulnerabilityHeatmap",
    "VulnerabilityHeatmapCell",
    # backend.benchmarking.statistics
    "MetricStatistics",
    "calculate_confidence_interval",
    "calculate_mean",
    "calculate_mean_with_ci",
    "calculate_paired_differences",
    "calculate_standard_deviation",
    "calculate_variance",
    "calculate_sample_std",
    "cohens_d",
    "calculate_vulnerability_consistency",
    "generate_summary_statistics",
    "paired_t_test",
]
|
|
|