"""
YOFO Benchmark Script.
This script runs a rigorous comparison between YOFO and standard baselines.
It measures:
1. Latency (Time per example)
2. Token Usage (Input + Output tokens)
3. Extrapolated Cost (Based on GPT-4 pricing)
Baselines:
- YOFO (Ours): Single forward pass
- N-Call Judge: 12 separate API calls (one per requirement)
- CoT Judge: 1 call generating detailed reasoning
"""
import os
import sys
import time

import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Make `src` importable when the script is run from the repo root.
sys.path.append(os.getcwd())
from src.data.template import YOFOTemplateBuilder, YOFO_REQS, REQ_QUESTIONS
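# Assumed interface, inferred from usage below: `build_template(prompt, response)`
# returns an unbatched encoding with `input_ids` and `attention_mask` tensors;
# `YOFO_REQS` lists the 12 requirement keys and `REQ_QUESTIONS` maps each key
# to its Yes/No question string.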
# Pricing constants in USD per 1K tokens (GPT-4 Turbo rates, Nov 2024).
PRICE_INPUT_1K = 0.01
PRICE_OUTPUT_1K = 0.03
class Benchmark:
    def __init__(self, model_id="Qwen/Qwen2.5-1.5B-Instruct"):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Initializing benchmark on {self.device}...")
        self.tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
        self.model = AutoModelForCausalLM.from_pretrained(
            model_id,
            torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
            device_map=self.device,
            trust_remote_code=True,
        )
        self.model.eval()
        self.builder = YOFOTemplateBuilder(self.tokenizer)

    def _count_tokens(self, text):
        return len(self.tokenizer.encode(text, add_special_tokens=False))

    def _sync(self):
        # CUDA kernels launch asynchronously, so synchronize before reading
        # the wall clock; otherwise latencies are under-reported.
        if self.device == "cuda":
            torch.cuda.synchronize()
    def benchmark_yofo(self, prompt, response, n_repeats=5):
        """Measure YOFO performance (single forward pass)."""
        # Prepare input.
        yofo_input = self.builder.build_template(prompt, response)
        # Count actual non-pad tokens.
        actual_tokens = yofo_input.attention_mask.sum().item()
        print(f"DEBUG: YOFO actual tokens: {actual_tokens}")
        input_ids = yofo_input.input_ids.unsqueeze(0).to(self.device)
        # Warmup pass (excluded from timing).
        with torch.no_grad():
            self.model(input_ids)
        # Timing.
        latencies = []
        for _ in range(n_repeats):
            self._sync()
            start = time.time()
            with torch.no_grad():
                self.model(input_ids)
            self._sync()
            latencies.append(time.time() - start)
        avg_latency = sum(latencies) / len(latencies)
        return {
            "method": "YOFO (Ours)",
            "latency_ms": avg_latency * 1000,
            "input_tokens": actual_tokens,
            "output_tokens": 0,
            "calls": 1,
        }
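
    # Note: YOFO charges zero output tokens because the judgment comes from a
    # single forward pass over the filled template; no autoregressive
    # generation (and hence no per-token output cost) is involved.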
    def benchmark_n_call(self, prompt, response, n_repeats=1):
        """Measure N-Call baseline (12 separate calls)."""
        total_input_tokens = 0
        total_output_tokens = 12  # 12 calls x 1 Yes/No token each
        base_context = f"User: {prompt}\nResponse: {response}\n"
        base_tokens = self._count_tokens(base_context)
        print(f"DEBUG: Base context tokens: {base_tokens}")
        # The shared context is re-sent with every per-requirement question.
        for req in YOFO_REQS:
            question_text = f"Question: {REQ_QUESTIONS[req]}\nAnswer (Yes/No):"
            q_tokens = self._count_tokens(question_text)
            total_input_tokens += base_tokens + q_tokens
        print(f"DEBUG: N-Call total input tokens: {total_input_tokens}")
        # Timing: measure one representative call and multiply by 12.
        sample_text = f"{base_context}Question: Violence?\nAnswer (Yes/No):"
        inputs = self.tokenizer(sample_text, return_tensors="pt").to(self.device)
        self._sync()
        start = time.time()
        with torch.no_grad():
            self.model.generate(**inputs, max_new_tokens=1)
        self._sync()
        one_call_time = time.time() - start
        return {
            "method": "N-Call Baseline",
            "latency_ms": one_call_time * 12 * 1000,
            "input_tokens": total_input_tokens,
            "output_tokens": total_output_tokens,
            "calls": 12,
        }
    def benchmark_cot(self, prompt, response):
        """Measure CoT baseline (generate detailed reasoning)."""
        # Prompt asking for free-form reasoning.
        text = (
            f"User: {prompt}\nResponse: {response}\n"
            "Analyze safety for all 12 requirements and explain your reasoning:"
        )
        input_tokens = self._count_tokens(text)
        inputs = self.tokenizer(text, return_tensors="pt").to(self.device)
        # Measure generation time for ~100 tokens of reasoning.
        self._sync()
        start = time.time()
        with torch.no_grad():
            output = self.model.generate(**inputs, max_new_tokens=100)
        self._sync()
        latency = time.time() - start
        output_tokens = len(output[0]) - len(inputs["input_ids"][0])
        return {
            "method": "Chain-of-Thought",
            "latency_ms": latency * 1000,
            "input_tokens": input_tokens,
            "output_tokens": output_tokens,
            "calls": 1,
        }
def calculate_cost(row):
    """Calculate cost per 1k evaluations based on tokens."""
    cost_in = (row["input_tokens"] / 1000) * PRICE_INPUT_1K
    cost_out = (row["output_tokens"] / 1000) * PRICE_OUTPUT_1K
    total_cost_per_eval = cost_in + cost_out
    return total_cost_per_eval * 1000  # per 1k evals
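
# Illustrative arithmetic (hypothetical numbers, not measured results): an
# evaluation that sends 4,800 input tokens (a ~400-token context repeated
# across 12 calls) and 12 output tokens costs
#   (4800 / 1000) * 0.01 + (12 / 1000) * 0.03 = $0.04836 per eval,
# i.e. about $48.36 per 1k evaluations.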
def run_benchmark():
    bench = Benchmark()

    # LONG CONTEXT example (realistic chat): mimics a typical user query plus
    # a long model explanation.
    prompt = "Explain the history of the Roman Empire and its fall. " * 10  # ~100 tokens
    response = "The Roman Empire was one of the largest... " * 20  # ~200 tokens
    # Total context is roughly 300-400 tokens, which exposes the penalty of
    # repeating it 12 times.
    print("\nRunning benchmarks with Long Context (~400 tokens)...")

    results = []
    # 1. Run YOFO.
    print("1. Benchmarking YOFO...")
    results.append(bench.benchmark_yofo(prompt, response))
    # 2. Run N-Call.
    print("2. Benchmarking N-Call Baseline...")
    results.append(bench.benchmark_n_call(prompt, response))
    # 3. Run CoT.
    print("3. Benchmarking CoT Baseline...")
    results.append(bench.benchmark_cot(prompt, response))

    # Analysis.
    df = pd.DataFrame(results)
    df["cost_per_1k_evals"] = df.apply(calculate_cost, axis=1)
    # Row 1 is the N-Call baseline (second result appended above).
    df["speedup_vs_baseline"] = df.loc[1, "latency_ms"] / df["latency_ms"]

    print("\n" + "=" * 80)
    print("BENCHMARK RESULTS (Extrapolated to GPT-4 Pricing)")
    print("=" * 80)
    print(df[["method", "input_tokens", "latency_ms", "cost_per_1k_evals",
              "speedup_vs_baseline"]].to_string(index=False))

    # Save results.
    df.to_csv("benchmark_results.csv", index=False)
    print("\nSaved results to benchmark_results.csv")


if __name__ == "__main__":
    run_benchmark()