"""
YOFO Benchmark Script.
This script runs a rigorous comparison between YOFO and standard baselines.
It measures:
1. Latency (Time per example)
2. Token Usage (Input + Output tokens)
3. Extrapolated Cost (Based on GPT-4 pricing)
Baselines:
- YOFO (Ours): Single forward pass
- N-Call Judge: 12 separate API calls (one per requirement)
- CoT Judge: 1 call generating detailed reasoning
"""
import time
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm import tqdm
import sys
import os
# Add the project root to sys.path so the `src` package can be imported
sys.path.append(os.getcwd())
from src.data.template import YOFOTemplateBuilder, YOFO_REQS, REQ_QUESTIONS
# Pricing constants (GPT-4 Turbo pricing as of Nov 2024, USD per 1,000 tokens)
PRICE_INPUT_1K = 0.01
PRICE_OUTPUT_1K = 0.03
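# Worked example of the extrapolation (illustrative numbers only, not measured values):
# an evaluation that uses 4,000 input tokens and 12 output tokens costs
# (4000 / 1000) * 0.01 + (12 / 1000) * 0.03 ≈ $0.0404, i.e. roughly $40 per 1,000 evaluations.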
class Benchmark:
def __init__(self, model_id="Qwen/Qwen2.5-1.5B-Instruct"):
self.device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Initializing benchmark on {self.device}...")
self.tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
self.model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
device_map=self.device,
trust_remote_code=True
)
self.model.eval()
self.builder = YOFOTemplateBuilder(self.tokenizer)
def _count_tokens(self, text):
return len(self.tokenizer.encode(text, add_special_tokens=False))
def benchmark_yofo(self, prompt, response, n_repeats=5):
"""Measure YOFO performance (Single Forward Pass)."""
# Prepare Input
yofo_input = self.builder.build_template(prompt, response)
# Count actual non-pad tokens
actual_tokens = yofo_input.attention_mask.sum().item()
print(f"DEBUG: YOFO actual tokens: {actual_tokens}")
input_ids = yofo_input.input_ids.unsqueeze(0).to(self.device)
        # Warmup pass (excluded from timing; lets CUDA kernels and caches initialize)
        with torch.no_grad():
            self.model(input_ids)
        # Timing: synchronize around each pass so we measure completed GPU work,
        # not just the asynchronous kernel launch
        latencies = []
        for _ in range(n_repeats):
            if self.device == "cuda":
                torch.cuda.synchronize()
            start = time.time()
            with torch.no_grad():
                self.model(input_ids)
            if self.device == "cuda":
                torch.cuda.synchronize()
            latencies.append(time.time() - start)
        avg_latency = sum(latencies) / len(latencies)
return {
"method": "YOFO (Ours)",
"latency_ms": avg_latency * 1000,
"input_tokens": actual_tokens,
"output_tokens": 0,
"calls": 1
}
    def benchmark_n_call(self, prompt, response):
        """Measure the N-Call baseline (one separate call per requirement)."""
        n_reqs = len(YOFO_REQS)
        total_input_tokens = 0
        total_output_tokens = n_reqs  # one Yes/No token generated per call
        base_context = f"User: {prompt}\nResponse: {response}\n"
        base_tokens = self._count_tokens(base_context)
        print(f"DEBUG: Base context tokens: {base_tokens}")
for req in YOFO_REQS:
question_text = f"Question: {REQ_QUESTIONS[req]}\nAnswer (Yes/No):"
q_tokens = self._count_tokens(question_text)
total_input_tokens += (base_tokens + q_tokens)
print(f"DEBUG: N-Call total input tokens: {total_input_tokens}")
        # Timing: measure one representative call, then extrapolate to n_reqs calls
        sample_text = f"{base_context}Question: Violence?\nAnswer (Yes/No):"
        inputs = self.tokenizer(sample_text, return_tensors="pt").to(self.device)
        if self.device == "cuda":
            torch.cuda.synchronize()
        start = time.time()
        with torch.no_grad():
            self.model.generate(**inputs, max_new_tokens=1)
        if self.device == "cuda":
            torch.cuda.synchronize()
        one_call_time = time.time() - start
        return {
            "method": "N-Call Baseline",
            "latency_ms": one_call_time * n_reqs * 1000,
            "input_tokens": total_input_tokens,
            "output_tokens": total_output_tokens,
            "calls": n_reqs
        }
def benchmark_cot(self, prompt, response):
"""Measure CoT Baseline (Generate reasoning)."""
# Prompt asking for reasoning
text = f"User: {prompt}\nResponse: {response}\nAnalyze safety for all 12 requirements and explain your reasoning:"
input_tokens = self._count_tokens(text)
inputs = self.tokenizer(text, return_tensors="pt").to(self.device)
        # Measure generation time for up to 100 tokens of reasoning
start = time.time()
with torch.no_grad():
output = self.model.generate(**inputs, max_new_tokens=100)
latency = time.time() - start
output_tokens = len(output[0]) - len(inputs['input_ids'][0])
return {
"method": "Chain-of-Thought",
"latency_ms": latency * 1000,
"input_tokens": input_tokens,
"output_tokens": output_tokens,
"calls": 1
}
def calculate_cost(row):
"""Calculate cost per 1k evaluations based on tokens."""
cost_in = (row['input_tokens'] / 1000) * PRICE_INPUT_1K
cost_out = (row['output_tokens'] / 1000) * PRICE_OUTPUT_1K
total_cost_per_eval = cost_in + cost_out
return total_cost_per_eval * 1000 # Per 1k evals
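# Minimal usage sketch (hypothetical CoT-like row; real rows come from the DataFrame
# built in run_benchmark below):
#   calculate_cost({"input_tokens": 500, "output_tokens": 100})
#   -> (500 / 1000) * 0.01 + (100 / 1000) * 0.03 = $0.008 per eval, i.e. $8.00 per 1k evals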
def run_benchmark():
bench = Benchmark()
# LONG CONTEXT Example (Realistic Chat)
# This mimics a typical user query + long model explanation
prompt = "Explain the history of the Roman Empire and its fall." * 10 # ~100 tokens
response = "The Roman Empire was one of the largest... " * 20 # ~200 tokens
# Total context approx 300-400 tokens.
# This will show the penalty of repeating it 12 times.
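    # Rough arithmetic for the expected gap: the N-Call baseline re-sends the shared
    # context on every call, so ~300-400 context tokens become roughly
    # 12 * 300-400 ≈ 3,600-4,800 input tokens (plus the per-requirement questions),
    # while YOFO and CoT send the context only once.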
print("\nRunning benchmarks with Long Context (~400 tokens)...")
results = []
# 1. Run YOFO
print("1. Benchmarking YOFO...")
results.append(bench.benchmark_yofo(prompt, response))
# 2. Run N-Call
print("2. Benchmarking N-Call Baseline...")
results.append(bench.benchmark_n_call(prompt, response))
# 3. Run CoT
print("3. Benchmarking CoT Baseline...")
results.append(bench.benchmark_cot(prompt, response))
# Analysis
df = pd.DataFrame(results)
    df['cost_per_1k_evals'] = df.apply(calculate_cost, axis=1)
    # Row 1 is the N-Call baseline (results are appended in the order YOFO, N-Call, CoT)
    df['speedup_vs_baseline'] = df.loc[1, 'latency_ms'] / df['latency_ms']
print("\n" + "="*80)
print("BENCHMARK RESULTS (Extrapolated to GPT-4 Pricing)")
print("="*80)
print(df[['method', 'input_tokens', 'latency_ms', 'cost_per_1k_evals', 'speedup_vs_baseline']].to_string(index=False))
# Save results
df.to_csv("benchmark_results.csv", index=False)
print("\nSaved results to benchmark_results.csv")
if __name__ == "__main__":
run_benchmark()