aegislm / backend /benchmarking /__init__.py
ACA050's picture
Upload 50 files
1a4aa87 verified
"""
AegisLM Benchmarking Module
Provides benchmarking capabilities for evaluating LLM robustness:
- Baseline evaluation mode
- Adversarial evaluation mode
- Delta robustness computation
- Cross-model comparison
- Statistical reporting
- Benchmark artifact generation
"""
from backend.benchmarking.comparison import (
compare_models,
find_most_robust_model,
find_most_stable_model,
find_most_vulnerable_model,
generate_comparative_report,
generate_vulnerability_heatmap,
get_attack_type_vulnerability,
rank_models,
)
from backend.benchmarking.engine import (
BenchmarkEngine,
BenchmarkEvent,
get_benchmark_engine,
)
from backend.benchmarking.reporter import (
DEFAULT_BENCHMARK_DIR,
export_to_csv,
generate_benchmark_artifact,
generate_summary_report,
generate_text_report,
list_benchmarks,
load_benchmark_artifact,
)
from backend.benchmarking.schemas import (
BenchmarkMode,
BenchmarkPerformance,
BenchmarkResult,
BenchmarkStatus,
BenchmarkWeights,
EvaluationResult,
MetricDeltas,
ModelBenchmarkResult,
ModelMetrics,
ModelRanking,
StartBenchmarkRequest,
StartBenchmarkResponse,
VulnerabilityHeatmap,
VulnerabilityHeatmapCell,
)
from backend.benchmarking.statistics import (
MetricStatistics,
calculate_confidence_interval,
calculate_mean,
calculate_mean_with_ci,
calculate_paired_differences,
calculate_standard_deviation,
calculate_variance,
calculate_sample_std,
cohens_d,
calculate_vulnerability_consistency,
generate_summary_statistics,
paired_t_test,
)
# Public API of the benchmarking package.
#
# The export list is assembled one submodule at a time so each segment sits
# next to the name of the module it re-exports from. The resulting order is
# the same as the order of the import groups at the top of this file.

# Re-exported from backend.benchmarking.comparison
__all__ = [
    "compare_models",
    "find_most_robust_model",
    "find_most_stable_model",
    "find_most_vulnerable_model",
    "generate_comparative_report",
    "generate_vulnerability_heatmap",
    "get_attack_type_vulnerability",
    "rank_models",
]

# Re-exported from backend.benchmarking.engine
__all__ += [
    "BenchmarkEngine",
    "BenchmarkEvent",
    "get_benchmark_engine",
]

# Re-exported from backend.benchmarking.reporter
__all__ += [
    "DEFAULT_BENCHMARK_DIR",
    "export_to_csv",
    "generate_benchmark_artifact",
    "generate_summary_report",
    "generate_text_report",
    "list_benchmarks",
    "load_benchmark_artifact",
]

# Re-exported from backend.benchmarking.schemas
__all__ += [
    "BenchmarkMode",
    "BenchmarkPerformance",
    "BenchmarkResult",
    "BenchmarkStatus",
    "BenchmarkWeights",
    "EvaluationResult",
    "MetricDeltas",
    "ModelBenchmarkResult",
    "ModelMetrics",
    "ModelRanking",
    "StartBenchmarkRequest",
    "StartBenchmarkResponse",
    "VulnerabilityHeatmap",
    "VulnerabilityHeatmapCell",
]

# Re-exported from backend.benchmarking.statistics
__all__ += [
    "MetricStatistics",
    "calculate_confidence_interval",
    "calculate_mean",
    "calculate_mean_with_ci",
    "calculate_paired_differences",
    "calculate_standard_deviation",
    "calculate_variance",
    "calculate_sample_std",
    "cohens_d",
    "calculate_vulnerability_consistency",
    "generate_summary_statistics",
    "paired_t_test",
]