""" YOFO Benchmark Script. This script runs a rigorous comparison between YOFO and standard baselines. It measures: 1. Latency (Time per example) 2. Token Usage (Input + Output tokens) 3. Extrapolated Cost (Based on GPT-4 pricing) Baselines: - YOFO (Ours): Single forward pass - N-Call Judge: 12 separate API calls (one per requirement) - CoT Judge: 1 call generating detailed reasoning """ import time import torch import pandas as pd from transformers import AutoTokenizer, AutoModelForCausalLM from tqdm import tqdm import sys import os # Add src to path sys.path.append(os.getcwd()) from src.data.template import YOFOTemplateBuilder, YOFO_REQS, REQ_QUESTIONS # Pricing constants (GPT-4 Turbo Pricing - Nov 2024) PRICE_INPUT_1K = 0.01 PRICE_OUTPUT_1K = 0.03 class Benchmark: def __init__(self, model_id="Qwen/Qwen2.5-1.5B-Instruct"): self.device = "cuda" if torch.cuda.is_available() else "cpu" print(f"Initializing benchmark on {self.device}...") self.tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) self.model = AutoModelForCausalLM.from_pretrained( model_id, torch_dtype=torch.float16 if self.device == "cuda" else torch.float32, device_map=self.device, trust_remote_code=True ) self.model.eval() self.builder = YOFOTemplateBuilder(self.tokenizer) def _count_tokens(self, text): return len(self.tokenizer.encode(text, add_special_tokens=False)) def benchmark_yofo(self, prompt, response, n_repeats=5): """Measure YOFO performance (Single Forward Pass).""" # Prepare Input yofo_input = self.builder.build_template(prompt, response) # Count actual non-pad tokens actual_tokens = yofo_input.attention_mask.sum().item() print(f"DEBUG: YOFO actual tokens: {actual_tokens}") input_ids = yofo_input.input_ids.unsqueeze(0).to(self.device) # Warmup with torch.no_grad(): self.model(input_ids) # Timing latencies = [] for _ in range(n_repeats): start = time.time() with torch.no_grad(): self.model(input_ids) latencies.append(time.time() - start) avg_latency = sum(latencies) / len(latencies) return { "method": "YOFO (Ours)", "latency_ms": avg_latency * 1000, "input_tokens": actual_tokens, "output_tokens": 0, "calls": 1 } def benchmark_n_call(self, prompt, response, n_repeats=1): """Measure N-Call Baseline (12 separate calls).""" total_input_tokens = 0 total_output_tokens = 12 base_context = f"User: {prompt}\nResponse: {response}\n" base_tokens = self._count_tokens(base_context) print(f"DEBUG: Base context tokens: {base_tokens}") for req in YOFO_REQS: question_text = f"Question: {REQ_QUESTIONS[req]}\nAnswer (Yes/No):" q_tokens = self._count_tokens(question_text) total_input_tokens += (base_tokens + q_tokens) print(f"DEBUG: N-Call total input tokens: {total_input_tokens}") # Timing (Simulate 1 call * 12) sample_text = f"{base_context}Question: Violence?\nAnswer (Yes/No):" inputs = self.tokenizer(sample_text, return_tensors="pt").to(self.device) start = time.time() with torch.no_grad(): self.model.generate(**inputs, max_new_tokens=1) one_call_time = time.time() - start return { "method": "N-Call Baseline", "latency_ms": one_call_time * 12 * 1000, "input_tokens": total_input_tokens, "output_tokens": total_output_tokens, "calls": 12 } def benchmark_cot(self, prompt, response): """Measure CoT Baseline (Generate reasoning).""" # Prompt asking for reasoning text = f"User: {prompt}\nResponse: {response}\nAnalyze safety for all 12 requirements and explain your reasoning:" input_tokens = self._count_tokens(text) inputs = self.tokenizer(text, return_tensors="pt").to(self.device) # Measure generation time for 
~100 tokens of reasoning start = time.time() with torch.no_grad(): output = self.model.generate(**inputs, max_new_tokens=100) latency = time.time() - start output_tokens = len(output[0]) - len(inputs['input_ids'][0]) return { "method": "Chain-of-Thought", "latency_ms": latency * 1000, "input_tokens": input_tokens, "output_tokens": output_tokens, "calls": 1 } def calculate_cost(row): """Calculate cost per 1k evaluations based on tokens.""" cost_in = (row['input_tokens'] / 1000) * PRICE_INPUT_1K cost_out = (row['output_tokens'] / 1000) * PRICE_OUTPUT_1K total_cost_per_eval = cost_in + cost_out return total_cost_per_eval * 1000 # Per 1k evals def run_benchmark(): bench = Benchmark() # LONG CONTEXT Example (Realistic Chat) # This mimics a typical user query + long model explanation prompt = "Explain the history of the Roman Empire and its fall." * 10 # ~100 tokens response = "The Roman Empire was one of the largest... " * 20 # ~200 tokens # Total context approx 300-400 tokens. # This will show the penalty of repeating it 12 times. print("\nRunning benchmarks with Long Context (~400 tokens)...") results = [] # 1. Run YOFO print("1. Benchmarking YOFO...") results.append(bench.benchmark_yofo(prompt, response)) # 2. Run N-Call print("2. Benchmarking N-Call Baseline...") results.append(bench.benchmark_n_call(prompt, response)) # 3. Run CoT print("3. Benchmarking CoT Baseline...") results.append(bench.benchmark_cot(prompt, response)) # Analysis df = pd.DataFrame(results) df['cost_per_1k_evals'] = df.apply(calculate_cost, axis=1) df['speedup_vs_baseline'] = df.loc[1, 'latency_ms'] / df['latency_ms'] print("\n" + "="*80) print("BENCHMARK RESULTS (Extrapolated to GPT-4 Pricing)") print("="*80) print(df[['method', 'input_tokens', 'latency_ms', 'cost_per_1k_evals', 'speedup_vs_baseline']].to_string(index=False)) # Save results df.to_csv("benchmark_results.csv", index=False) print("\nSaved results to benchmark_results.csv") if __name__ == "__main__": run_benchmark()
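
# ---------------------------------------------------------------------------
# Worked example of the cost extrapolation (illustrative, hypothetical numbers
# only, not measured results). Assuming an N-Call-style row with roughly 4,800
# input tokens (e.g. a ~400-token context repeated across 12 calls) and 12
# output tokens, the GPT-4 Turbo prices above give:
#
#   (4800 / 1000) * 0.01 + (12 / 1000) * 0.03 = 0.04836 USD per evaluation
#
# i.e. about $48.36 per 1k evaluations, which is what calculate_cost returns
# for such a row (it accepts any mapping with these two keys):
#
#   >>> calculate_cost({"input_tokens": 4800, "output_tokens": 12})
#   48.36
# ---------------------------------------------------------------------------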