File size: 6,538 Bytes
8ef2d83 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 |
#!/bin/bash
#
# HAT Benchmark Reproducibility Suite
# ===================================
#
# This script runs all benchmarks from the HAT paper and generates
# a comprehensive results report.
#
# Usage:
# ./run_all_benchmarks.sh [--quick]
#
# Options:
# --quick Run abbreviated benchmarks (faster, less thorough)
#
# Requirements:
# - Rust toolchain (cargo)
# - Python 3.8+ with venv
# - ~2GB free disk space
# - ~10 minutes for full suite, ~2 minutes for quick
# Strict mode: abort on errors, on unset variables, and on failures in
# any stage of a pipeline (plain `set -e` would let `cmd | tail` hide a
# failing `cmd`, because the pipeline's status is the last stage's).
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
RESULTS_DIR="$SCRIPT_DIR/results"
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
RESULTS_FILE="$RESULTS_DIR/benchmark_results_$TIMESTAMP.txt"

# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color

# Parse arguments; the ${1:-} default keeps `set -u` happy when the
# script is invoked with no arguments.
QUICK_MODE=false
if [[ "${1:-}" == "--quick" ]]; then
QUICK_MODE=true
echo -e "${YELLOW}Running in quick mode (abbreviated benchmarks)${NC}"
fi

# Create results directory
mkdir -p "$RESULTS_DIR"

echo "========================================================================"
echo " HAT Benchmark Reproducibility Suite"
echo " $(date)"
echo "========================================================================"
echo ""
echo "Project directory: $PROJECT_DIR"
echo "Results will be saved to: $RESULTS_FILE"
echo ""

# Initialize results file. Guard rustc: if the toolchain is missing we
# still want a readable header rather than an empty field.
cat > "$RESULTS_FILE" << EOF
HAT Benchmark Results
=====================
Date: $(date)
Host: $(hostname)
Rust: $(rustc --version 2>/dev/null || echo "not found")
Quick mode: $QUICK_MODE
EOF

cd "$PROJECT_DIR"
# Function to run a test and capture results
#######################################
# Run one cargo integration-test suite and log it to the report.
# Globals:   RESULTS_FILE (appended), BLUE/GREEN/RED/NC (read)
# Arguments: $1 - human-readable benchmark name for headers
#            $2 - cargo test target (cargo test --test <name>)
# Outputs:   streams test output to stdout and the results file;
#            records "FAILED" in the report when the suite fails.
#######################################
run_benchmark() {
    local name="$1"
    local test_name="$2"
    echo -e "${BLUE}[$name]${NC} Running..."
    {
        echo ""
        echo "=== $name ==="
        echo ""
    } >> "$RESULTS_FILE"
    # Run in a subshell with pipefail: otherwise the pipeline's status is
    # tee's (always 0), so a failing test suite would always print PASSED.
    if (set -o pipefail; cargo test --test "$test_name" -- --nocapture 2>&1 | tee -a "$RESULTS_FILE"); then
        echo -e "${GREEN}[$name]${NC} PASSED"
    else
        echo -e "${RED}[$name]${NC} FAILED"
        echo "FAILED" >> "$RESULTS_FILE"
    fi
    echo ""
}
echo "========================================================================"
echo " Phase 1: Building Project"
echo "========================================================================"

# A `cargo build | tail` pipeline normally reports tail's exit status, so
# a broken build would slip through silently. Run each build under
# pipefail in a subshell so cargo's own status surfaces and set -e
# (enabled at the top of the script) aborts the suite.
echo "Building release version..."
(set -o pipefail; cargo build --release 2>&1 | tail -5)

echo "Building test suite..."
(set -o pipefail; cargo build --tests 2>&1 | tail -5)
echo ""

echo "========================================================================"
echo " Phase 2: Running Core Benchmarks"
echo "========================================================================"

# Phase 3.1: HAT vs HNSW
echo ""
echo "--- Phase 3.1: HAT vs HNSW Comparative Benchmark ---"
run_benchmark "HAT vs HNSW" "phase31_hat_vs_hnsw"

# Phase 3.2: Real Embeddings
echo ""
echo "--- Phase 3.2: Real Embedding Dimensions ---"
run_benchmark "Real Embeddings" "phase32_real_embeddings"

# Phase 3.3: Persistence
echo ""
echo "--- Phase 3.3: Persistence Layer ---"
run_benchmark "Persistence" "phase33_persistence"

# Phase 4.2: Attention State
echo ""
echo "--- Phase 4.2: Attention State Format ---"
run_benchmark "Attention State" "phase42_attention_state"

echo ""
echo "========================================================================"
echo " Phase 3: Python Integration Tests"
echo "========================================================================"
# Check for Python venv.
# NOTE(review): a fixed path under /tmp is predictable and shared across
# users/runs -- consider mktemp -d if isolation matters. TODO confirm intent.
VENV_DIR="/tmp/arms-hat-bench-venv"
if [[ ! -d "$VENV_DIR" ]]; then
    echo "Creating Python virtual environment..."
    python3 -m venv "$VENV_DIR"
fi
source "$VENV_DIR/bin/activate"

# Install dependencies (best effort: failures are tolerated deliberately,
# maturin/pytest may already be present in the venv).
echo "Installing Python dependencies..."
pip install -q maturin pytest 2>/dev/null || true

# Build Python extension. The pipeline's status is normally tail's, so a
# maturin failure would go unnoticed; run under pipefail in a subshell
# and warn (but keep going, matching the suite's best-effort behavior).
echo "Building Python extension..."
if ! (set -o pipefail; maturin develop --features python 2>&1 | tail -3); then
    echo -e "${RED}maturin develop failed; Python tests may use a stale extension${NC}"
fi

# Run Python tests
echo ""
echo "--- Python Binding Tests ---"
{
    echo ""
    echo "=== Python Binding Tests ==="
    echo ""
} >> "$RESULTS_FILE"
# pipefail subshell: make the if reflect pytest's status, not tee's --
# otherwise failing bindings would always be reported as PASSED.
if (set -o pipefail; python -m pytest "$PROJECT_DIR/python/tests/" -v 2>&1 | tee -a "$RESULTS_FILE"); then
    echo -e "${GREEN}[Python Tests]${NC} PASSED"
else
    echo -e "${RED}[Python Tests]${NC} FAILED"
fi
echo ""
echo "========================================================================"
echo " Phase 4: End-to-End Demo"
echo "========================================================================"
{
    echo ""
    echo "=== End-to-End Demo ==="
    echo ""
} >> "$RESULTS_FILE"

# Launch the demo script, mirroring its output into the results file.
run_demo() {
    python "$PROJECT_DIR/examples/demo_hat_memory.py" 2>&1 | tee -a "$RESULTS_FILE"
}

# True when sentence-transformers is importable from the active venv.
have_sentence_transformers() {
    pip show sentence-transformers >/dev/null 2>&1
}

# The availability check only decides which message we print -- the demo
# runs in every case (it degrades to pseudo-embeddings on its own).
if have_sentence_transformers; then
    echo "Running end-to-end demo with real embeddings..."
    run_demo
else
    echo "Installing sentence-transformers for full demo..."
    pip install -q sentence-transformers 2>/dev/null || true
    if ! have_sentence_transformers; then
        echo "Running demo with pseudo-embeddings (sentence-transformers not available)..."
    fi
    run_demo
fi

deactivate
echo ""
echo "========================================================================"
echo " Summary"
echo "========================================================================"

# Extract key metrics from results
{
    echo ""
    echo "=== Summary ==="
    echo ""
} >> "$RESULTS_FILE"

# Count passed tests. grep -c prints "0" itself when there are no
# matches (while exiting non-zero), so the old `|| echo "0"` fallback
# produced a two-line "0\n0" value; only default when grep wrote nothing
# at all (e.g. the results file is missing).
RUST_PASSED=$(grep -c "test .* ok" "$RESULTS_FILE" 2>/dev/null || true)
RUST_PASSED=${RUST_PASSED:-0}
PYTHON_PASSED=$(grep -c "PASSED" "$RESULTS_FILE" 2>/dev/null || true)
PYTHON_PASSED=${PYTHON_PASSED:-0}

echo "Results saved to: $RESULTS_FILE"
echo ""
echo "Key Results:"
echo " - Rust tests passed: ~$RUST_PASSED"
echo " - Python tests passed: ~$PYTHON_PASSED"
echo ""

# Extract recall metrics if available
if grep -q "HAT enables 100% recall" "$RESULTS_FILE" 2>/dev/null; then
    echo -e "${GREEN}Core claim validated: 100% recall achieved${NC}"
fi
if grep -q "Average retrieval latency" "$RESULTS_FILE" 2>/dev/null; then
    # Guard the extraction: if the latency line does not match the
    # N.NNms pattern, the failing command substitution would otherwise
    # abort the whole script under set -e.
    LATENCY=$(grep "Average retrieval latency" "$RESULTS_FILE" | tail -1 | grep -oE '[0-9]+\.[0-9]+ms' || true)
    echo " - Retrieval latency: $LATENCY"
fi

echo ""
echo "========================================================================"
echo " Benchmark Complete"
echo "========================================================================"
echo ""
echo "Full results: $RESULTS_FILE"
echo ""
|