Spaces:
Running
Running
File size: 5,960 Bytes
39028c9 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 | import requests
import time
import json
from statistics import mean, stdev
from typing import List, Dict
BASE_URL = "http://localhost:8000"
class PerformanceBenchmark:
def __init__(self):
self.results = {}
def run_benchmark(self, name: str, payload: Dict, iterations: int = 5) -> Dict:
times = []
print(f"\n{'='*60}")
print(f"🔍 Benchmarking: {name}")
print(f"{'='*60}")
print(f"Iterations: {iterations}")
for i in range(iterations):
start = time.time()
try:
response = requests.post(f"{BASE_URL}/summarize", json=payload, timeout=30)
elapsed = (time.time() - start) * 1000
if response.status_code == 200:
times.append(elapsed)
print(f" Iteration {i+1}: {elapsed:.2f}ms ✅")
else:
print(f" Iteration {i+1}: ERROR (Status {response.status_code}) ❌")
except Exception as e:
print(f" Iteration {i+1}: ERROR ({str(e)}) ❌")
if times:
result = {
'name': name,
'iterations': len(times),
'min_ms': min(times),
'max_ms': max(times),
'avg_ms': mean(times),
'stdev_ms': stdev(times) if len(times) > 1 else 0,
'success_rate': (len(times) / iterations) * 100
}
print(f"\n📊 Results:")
print(f" Min: {result['min_ms']:.2f}ms")
print(f" Max: {result['max_ms']:.2f}ms")
print(f" Avg: {result['avg_ms']:.2f}ms")
print(f" Std Dev: {result['stdev_ms']:.2f}ms")
print(f" Success Rate: {result['success_rate']:.1f}%")
self.results[name] = result
return result
else:
print(f"❌ All iterations failed")
return None
def print_summary(self):
"""Print benchmark summary"""
print(f"\n{'='*60}")
print("📈 BENCHMARK SUMMARY")
print(f"{'='*60}")
if not self.results:
print("No successful benchmarks")
return
# Sort by average time
sorted_results = sorted(self.results.items(), key=lambda x: x[1]['avg_ms'])
for name, result in sorted_results:
print(f"\n{name}:")
print(f" Average: {result['avg_ms']:.2f}ms")
print(f" Range: {result['min_ms']:.2f}ms - {result['max_ms']:.2f}ms")
print(f" Success: {result['success_rate']:.1f}%")
def main():
print("\n" + "="*60)
print("🚀 DOCUMENT SUMMARIZER PERFORMANCE BENCHMARK")
print("="*60)
benchmark = PerformanceBenchmark()
# Test 1: Short document with speed mode
print("\n[1/6] Speed Mode - Short Document")
payload_speed_short = {
"document": "Machine learning is AI. It learns from data.",
"quality_preference": "speed"
}
benchmark.run_benchmark("Speed Mode (Short Doc)", payload_speed_short, iterations=5)
# Test 2: Medium document with balanced mode
print("\n[2/6] Balanced Mode - Medium Document")
payload_balanced_med = {
"document": """Deep learning is a subset of machine learning using neural networks.
Each layer learns different features. It's used for image recognition, NLP, and speech.
Training requires GPU acceleration. Popular frameworks include PyTorch and TensorFlow.""",
"quality_preference": "balanced"
}
benchmark.run_benchmark("Balanced Mode (Medium Doc)", payload_balanced_med, iterations=5)
# Test 3: Quality mode
print("\n[3/6] Quality Mode - Medium Document")
payload_quality = {
"document": """Transformers have revolutionized NLP. The attention mechanism allows focusing
on relevant sequence parts. BERT and GPT are transformer-based. Self-attention enables learning
long-range dependencies. Position encoding preserves sequence information. Transformers scale
to billions of parameters like GPT-3.""",
"quality_preference": "quality"
}
benchmark.run_benchmark("Quality Mode (Medium Doc)", payload_quality, iterations=3)
# Test 4: Different intent
print("\n[4/6] Technical Overview Intent")
payload_intent = {
"document": "Convolutional neural networks use filters for feature extraction. ReLU activations add non-linearity. Pooling reduces dimensionality. CNNs excel at image tasks.",
"intent": "technical_overview"
}
benchmark.run_benchmark("Technical Overview Intent", payload_intent, iterations=5)
# Test 5: Methodology intent
print("\n[5/6] Methodology Intent")
payload_methodology = {
"document": """We collected 10000 samples. Training used SGD with learning rate 0.01.
Batch size was 32. We trained for 100 epochs. Cross-entropy was the loss function.
We achieved 95% accuracy on test set.""",
"intent": "methodology"
}
benchmark.run_benchmark("Methodology Intent", payload_methodology, iterations=5)
# Test 6: Spanish language
print("\n[6/6] Multilingual - Spanish")
payload_spanish = {
"document": "El aprendizaje automático es una rama de la inteligencia artificial importante.",
"language": "spanish",
"quality_preference": "speed"
}
benchmark.run_benchmark("Spanish Language (Speed)", payload_spanish, iterations=5)
# Print summary
benchmark.print_summary()
# Save results to file
with open('benchmark_results.json', 'w') as f:
json.dump(benchmark.results, f, indent=2)
print(f"\n✅ Results saved to benchmark_results.json")
print("\n" + "="*60)
print("✅ BENCHMARK COMPLETE")
print("="*60 + "\n")
if __name__ == "__main__":
main()
|