"""
YOFO Benchmark Script.
This script runs a rigorous comparison between YOFO and standard baselines.
It measures:
1. Latency (Time per example)
2. Token Usage (Input + Output tokens)
3. Extrapolated Cost (Based on GPT-4 pricing)
Baselines:
- YOFO (Ours): Single forward pass
- N-Call Judge: 12 separate API calls (one per requirement)
- CoT Judge: 1 call generating detailed reasoning
"""
import time
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm import tqdm
import sys
import os
# Add the project root to sys.path so the `src` package can be imported
sys.path.append(os.getcwd())
from src.data.template import YOFOTemplateBuilder, YOFO_REQS, REQ_QUESTIONS
# Pricing constants (GPT-4 Turbo pricing as of Nov 2024, USD per 1,000 tokens)
PRICE_INPUT_1K = 0.01
PRICE_OUTPUT_1K = 0.03
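# Worked example of the extrapolation (illustrative numbers only, not measured values):
# an evaluation that uses 4,000 input tokens and 12 output tokens costs
# (4000 / 1000) * 0.01 + (12 / 1000) * 0.03 ≈ $0.0404, i.e. roughly $40 per 1,000 evaluations.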
class Benchmark:
def __init__(self, model_id="Qwen/Qwen2.5-1.5B-Instruct"):
self.device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Initializing benchmark on {self.device}...")
self.tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
self.model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
device_map=self.device,
trust_remote_code=True
)
self.model.eval()
self.builder = YOFOTemplateBuilder(self.tokenizer)
def _count_tokens(self, text):
return len(self.tokenizer.encode(text, add_special_tokens=False))
def benchmark_yofo(self, prompt, response, n_repeats=5):
"""Measure YOFO performance (Single Forward Pass)."""
# Prepare Input
yofo_input = self.builder.build_template(prompt, response)
# Count actual non-pad tokens
actual_tokens = yofo_input.attention_mask.sum().item()
print(f"DEBUG: YOFO actual tokens: {actual_tokens}")
input_ids = yofo_input.input_ids.unsqueeze(0).to(self.device)
        # Warmup pass (excluded from timing; lets CUDA kernels and caches initialize)
        with torch.no_grad():
            self.model(input_ids)
        # Timing: synchronize around each pass so we measure completed GPU work,
        # not just the asynchronous kernel launch
        latencies = []
        for _ in range(n_repeats):
            if self.device == "cuda":
                torch.cuda.synchronize()
            start = time.time()
            with torch.no_grad():
                self.model(input_ids)
            if self.device == "cuda":
                torch.cuda.synchronize()
            latencies.append(time.time() - start)
        avg_latency = sum(latencies) / len(latencies)
return {
"method": "YOFO (Ours)",
"latency_ms": avg_latency * 1000,
"input_tokens": actual_tokens,
"output_tokens": 0,
"calls": 1
}
    def benchmark_n_call(self, prompt, response):
        """Measure the N-Call baseline (one separate call per requirement)."""
        n_reqs = len(YOFO_REQS)
        total_input_tokens = 0
        total_output_tokens = n_reqs  # one Yes/No token generated per call
        base_context = f"User: {prompt}\nResponse: {response}\n"
        base_tokens = self._count_tokens(base_context)
        print(f"DEBUG: Base context tokens: {base_tokens}")
for req in YOFO_REQS:
question_text = f"Question: {REQ_QUESTIONS[req]}\nAnswer (Yes/No):"
q_tokens = self._count_tokens(question_text)
total_input_tokens += (base_tokens + q_tokens)
print(f"DEBUG: N-Call total input tokens: {total_input_tokens}")
        # Timing: measure one representative call, then extrapolate to n_reqs calls
        sample_text = f"{base_context}Question: Violence?\nAnswer (Yes/No):"
        inputs = self.tokenizer(sample_text, return_tensors="pt").to(self.device)
        if self.device == "cuda":
            torch.cuda.synchronize()
        start = time.time()
        with torch.no_grad():
            self.model.generate(**inputs, max_new_tokens=1)
        if self.device == "cuda":
            torch.cuda.synchronize()
        one_call_time = time.time() - start
        return {
            "method": "N-Call Baseline",
            "latency_ms": one_call_time * n_reqs * 1000,
            "input_tokens": total_input_tokens,
            "output_tokens": total_output_tokens,
            "calls": n_reqs
        }
def benchmark_cot(self, prompt, response):
"""Measure CoT Baseline (Generate reasoning)."""
# Prompt asking for reasoning
text = f"User: {prompt}\nResponse: {response}\nAnalyze safety for all 12 requirements and explain your reasoning:"
input_tokens = self._count_tokens(text)
inputs = self.tokenizer(text, return_tensors="pt").to(self.device)
        # Measure generation time for up to 100 tokens of reasoning
start = time.time()
with torch.no_grad():
output = self.model.generate(**inputs, max_new_tokens=100)
latency = time.time() - start
output_tokens = len(output[0]) - len(inputs['input_ids'][0])
return {
"method": "Chain-of-Thought",
"latency_ms": latency * 1000,
"input_tokens": input_tokens,
"output_tokens": output_tokens,
"calls": 1
}
def calculate_cost(row):
"""Calculate cost per 1k evaluations based on tokens."""
cost_in = (row['input_tokens'] / 1000) * PRICE_INPUT_1K
cost_out = (row['output_tokens'] / 1000) * PRICE_OUTPUT_1K
total_cost_per_eval = cost_in + cost_out
return total_cost_per_eval * 1000 # Per 1k evals
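# Minimal usage sketch (hypothetical CoT-like row; real rows come from the DataFrame
# built in run_benchmark below):
#   calculate_cost({"input_tokens": 500, "output_tokens": 100})
#   -> (500 / 1000) * 0.01 + (100 / 1000) * 0.03 = $0.008 per eval, i.e. $8.00 per 1k evals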
def run_benchmark():
bench = Benchmark()
# LONG CONTEXT Example (Realistic Chat)
# This mimics a typical user query + long model explanation
prompt = "Explain the history of the Roman Empire and its fall." * 10 # ~100 tokens
response = "The Roman Empire was one of the largest... " * 20 # ~200 tokens
# Total context approx 300-400 tokens.
# This will show the penalty of repeating it 12 times.
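    # Rough arithmetic for the expected gap: the N-Call baseline re-sends the shared
    # context on every call, so ~300-400 context tokens become roughly
    # 12 * 300-400 ≈ 3,600-4,800 input tokens (plus the per-requirement questions),
    # while YOFO and CoT send the context only once.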
print("\nRunning benchmarks with Long Context (~400 tokens)...")
results = []
# 1. Run YOFO
print("1. Benchmarking YOFO...")
results.append(bench.benchmark_yofo(prompt, response))
# 2. Run N-Call
print("2. Benchmarking N-Call Baseline...")
results.append(bench.benchmark_n_call(prompt, response))
# 3. Run CoT
print("3. Benchmarking CoT Baseline...")
results.append(bench.benchmark_cot(prompt, response))
# Analysis
df = pd.DataFrame(results)
    df['cost_per_1k_evals'] = df.apply(calculate_cost, axis=1)
    # Row 1 is the N-Call baseline (results are appended in the order YOFO, N-Call, CoT)
    df['speedup_vs_baseline'] = df.loc[1, 'latency_ms'] / df['latency_ms']
print("\n" + "="*80)
print("BENCHMARK RESULTS (Extrapolated to GPT-4 Pricing)")
print("="*80)
print(df[['method', 'input_tokens', 'latency_ms', 'cost_per_1k_evals', 'speedup_vs_baseline']].to_string(index=False))
# Save results
df.to_csv("benchmark_results.csv", index=False)
print("\nSaved results to benchmark_results.csv")
if __name__ == "__main__":
run_benchmark()