"""
YOFO Benchmark Script.

This script runs a rigorous comparison between YOFO and standard baselines.
It measures:
1. Latency (Time per example)
2. Token Usage (Input + Output tokens)
3. Extrapolated cost (based on GPT-4 Turbo pricing)

Methods compared:
- YOFO (Ours): Single forward pass, no generation
- N-Call Judge: 12 separate calls (one per requirement)
- CoT Judge: 1 call generating detailed reasoning
"""

import time
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm import tqdm
import sys
import os

# Add src to path
sys.path.append(os.getcwd())
from src.data.template import YOFOTemplateBuilder, YOFO_REQS, REQ_QUESTIONS
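# NOTE: YOFOTemplateBuilder.build_template() is assumed to return a single
# (unbatched) encoding exposing .input_ids and .attention_mask tensors;
# benchmark_yofo() below relies on that interface.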

# Pricing constants (GPT-4 Turbo pricing, Nov 2024), USD per 1K tokens
PRICE_INPUT_1K = 0.01
PRICE_OUTPUT_1K = 0.03

class Benchmark:
    def __init__(self, model_id="Qwen/Qwen2.5-1.5B-Instruct"):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Initializing benchmark on {self.device}...")
        
        self.tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
        self.model = AutoModelForCausalLM.from_pretrained(
            model_id,
            torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
            device_map=self.device,
            trust_remote_code=True
        )
        self.model.eval()
        self.builder = YOFOTemplateBuilder(self.tokenizer)

    def _count_tokens(self, text):
        return len(self.tokenizer.encode(text, add_special_tokens=False))

    def benchmark_yofo(self, prompt, response, n_repeats=5):
        """Measure YOFO performance (Single Forward Pass)."""
        
        # Prepare Input
        yofo_input = self.builder.build_template(prompt, response)
        
        # Count actual non-pad tokens
        actual_tokens = yofo_input.attention_mask.sum().item()
        print(f"DEBUG: YOFO actual tokens: {actual_tokens}")
        
        input_ids = yofo_input.input_ids.unsqueeze(0).to(self.device)
        
        # Warmup pass (triggers CUDA kernel compilation / cache allocation)
        with torch.no_grad():
            self.model(input_ids)

        # Timing: synchronize around each pass so async CUDA kernels are
        # actually included in the wall-clock measurement.
        latencies = []
        for _ in range(n_repeats):
            if self.device == "cuda":
                torch.cuda.synchronize()
            start = time.time()
            with torch.no_grad():
                self.model(input_ids)
            if self.device == "cuda":
                torch.cuda.synchronize()
            latencies.append(time.time() - start)

        avg_latency = sum(latencies) / len(latencies)
        
        return {
            "method": "YOFO (Ours)",
            "latency_ms": avg_latency * 1000,
            "input_tokens": actual_tokens,
            "output_tokens": 0,
            "calls": 1
        }

    def benchmark_n_call(self, prompt, response):
        """Measure N-Call Baseline (12 separate calls)."""
        total_input_tokens = 0
        total_output_tokens = 12  # one Yes/No token generated per requirement
        
        base_context = f"User: {prompt}\nResponse: {response}\n"
        base_tokens = self._count_tokens(base_context)
        print(f"DEBUG: Base context tokens: {base_tokens}")
        
        for req in YOFO_REQS:
            question_text = f"Question: {REQ_QUESTIONS[req]}\nAnswer (Yes/No):"
            q_tokens = self._count_tokens(question_text)
            total_input_tokens += (base_tokens + q_tokens)
            
        print(f"DEBUG: N-Call total input tokens: {total_input_tokens}")
            
        # Timing (Simulate 1 call * 12)
        sample_text = f"{base_context}Question: Violence?\nAnswer (Yes/No):"
        inputs = self.tokenizer(sample_text, return_tensors="pt").to(self.device)
        
        start = time.time()
        with torch.no_grad():
             self.model.generate(**inputs, max_new_tokens=1)
        one_call_time = time.time() - start
        
        return {
            "method": "N-Call Baseline",
            "latency_ms": one_call_time * 12 * 1000,
            "input_tokens": total_input_tokens,
            "output_tokens": total_output_tokens,
            "calls": 12
        }

    def benchmark_cot(self, prompt, response):
        """Measure CoT Baseline (Generate reasoning)."""
        # Prompt asking for reasoning
        text = f"User: {prompt}\nResponse: {response}\nAnalyze safety for all 12 requirements and explain your reasoning:"
        input_tokens = self._count_tokens(text)
        
        inputs = self.tokenizer(text, return_tensors="pt").to(self.device)
        
        # Measure generation time for ~100 tokens of reasoning
        start = time.time()
        with torch.no_grad():
            output = self.model.generate(**inputs, max_new_tokens=100)
        latency = time.time() - start
        
        output_tokens = len(output[0]) - len(inputs['input_ids'][0])
        
        return {
            "method": "Chain-of-Thought",
            "latency_ms": latency * 1000,
            "input_tokens": input_tokens,
            "output_tokens": output_tokens,
            "calls": 1
        }

def calculate_cost(row):
    """Calculate cost per 1k evaluations based on tokens."""
    cost_in = (row['input_tokens'] / 1000) * PRICE_INPUT_1K
    cost_out = (row['output_tokens'] / 1000) * PRICE_OUTPUT_1K
    total_cost_per_eval = cost_in + cost_out
    return total_cost_per_eval * 1000 # Per 1k evals

def run_benchmark():
    bench = Benchmark()
    
    # LONG CONTEXT Example (Realistic Chat)
    # This mimics a typical user query + long model explanation
    prompt = "Explain the history of the Roman Empire and its fall." * 10  # ~100 tokens
    response = "The Roman Empire was one of the largest... " * 20  # ~200 tokens
    # Total context approx 300-400 tokens.
    # This will show the penalty of repeating it 12 times.
    
    print("\nRunning benchmarks with Long Context (~400 tokens)...")
    results = []
    
    # 1. Run YOFO
    print("1. Benchmarking YOFO...")
    results.append(bench.benchmark_yofo(prompt, response))
    
    # 2. Run N-Call
    print("2. Benchmarking N-Call Baseline...")
    results.append(bench.benchmark_n_call(prompt, response))
    
    # 3. Run CoT
    print("3. Benchmarking CoT Baseline...")
    results.append(bench.benchmark_cot(prompt, response))
    
    # Analysis
    df = pd.DataFrame(results)
    df['cost_per_1k_evals'] = df.apply(calculate_cost, axis=1)
    df['speedup_vs_baseline'] = df.loc[1, 'latency_ms'] / df['latency_ms']  # row 1 = N-Call Baseline
    
    print("\n" + "="*80)
    print("BENCHMARK RESULTS (Extrapolated to GPT-4 Pricing)")
    print("="*80)
    print(df[['method', 'input_tokens', 'latency_ms', 'cost_per_1k_evals', 'speedup_vs_baseline']].to_string(index=False))
    
    # Save results
    df.to_csv("benchmark_results.csv", index=False)
    print("\nSaved results to benchmark_results.csv")

if __name__ == "__main__":
    run_benchmark()