#!/usr/bin/env python3
"""
Simple comparison of V1 vs V2 model generation quality
"""
import math
import sys
from pathlib import Path

import torch
from peft import PeftModel
from transformers import AutoTokenizer, AutoModelForCausalLM, StoppingCriteria, StoppingCriteriaList

sys.path.insert(0, str(Path(__file__).parent.parent))
from classes.expression import Expression
class ExpressionStoppingCriteria(StoppingCriteria):
    """Stop generation as soon as the output ends with any stop sequence."""

    def __init__(self, tokenizer, stop_sequences):
        self.tokenizer = tokenizer
        # Pre-encode every stop sequence into token ids once, up front.
        self.stop_ids = []
        for seq in stop_sequences:
            self.stop_ids.append(tokenizer.encode(seq, add_special_tokens=False))

    def __call__(self, input_ids, scores, **kwargs):
        # input_ids is (batch, seq); only the first sequence is inspected.
        generated = input_ids[0]
        for candidate in self.stop_ids:
            n = len(candidate)
            if n and len(generated) >= n and generated[-n:].tolist() == candidate:
                return True
        return False
def load_model(model_name, model_label):
    """Load base GPT-2, attach the adapter at *model_name*, and merge it in.

    Returns a ``(model, tokenizer)`` pair ready for inference; the model is
    put into eval mode and the adapter weights are merged into the base.
    """
    banner = '=' * 60
    print(f"\n{banner}")
    print(f"Loading {model_label}: {model_name}")
    print(banner)

    # Base checkpoint, half precision, auto device placement.
    print("Loading base GPT-2...")
    base = AutoModelForCausalLM.from_pretrained(
        "gpt2",
        torch_dtype=torch.float16,
        device_map="auto"
    )

    # Tokenizer with the two expression-delimiter specials added.
    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    tokenizer.add_special_tokens(
        {"additional_special_tokens": ["<|startofex|>", "<|endofex|>"]}
    )

    # New special tokens require a larger embedding table.
    base.resize_token_embeddings(len(tokenizer))

    # Attach the LoRA adapter and fold it into the base weights.
    print(f"Loading adapter from {model_name}...")
    merged = PeftModel.from_pretrained(base, model_name)
    print("Merging adapter...")
    merged = merged.merge_and_unload()
    merged.eval()

    print(f"✓ {model_label} loaded successfully")
    return merged, tokenizer
# Words that indicate the model drifted into natural-language garbage.
_GARBAGE_WORDS = ["Buyable", "Instore", "Online", "Muslims", "crash", "Berman",
                  "vars:", "oper:", "expressed", "fluent", "Avenger", "repositories"]

# Letters that may legally appear in an expression (drawn from x_1, x_2, C, sin, cos).
_ALLOWED_LETTERS = "xCsinco_"


def _extract_expression(text):
    """Return the expression substring of a decoded generation, or the raw text."""
    if "expr:" in text:
        expr_str = text.split("expr:")[-1].strip()
        return expr_str.split("<|endofex|>")[0].strip()
    return text


def _is_valid_expression(expr_str):
    """True if *expr_str* parses and evaluates to finite values at x=(1.0, 2.0)."""
    try:
        expr = Expression(expr_str, is_prefix=False)
        values = expr.evaluate([[1.0, 2.0]])  # single simple test point
        # math.isfinite rejects inf, -inf and NaN in one call.
        return len(values) > 0 and all(math.isfinite(v) for v in values)
    except Exception:  # was a bare `except:`; narrow to Exception so Ctrl-C works
        return False


def _has_correct_symbols(expr_str):
    """True if *expr_str* uses only the expected symbols and no garbage words."""
    compact = expr_str.replace(" ", "")
    # Any alphabetic char outside the allowed alphabet marks the sample dirty.
    if any(ch.isalpha() and ch not in _ALLOWED_LETTERS for ch in compact):
        return False
    return not any(word in expr_str for word in _GARBAGE_WORDS)


def test_model(model, tokenizer, model_label, n_samples=20):
    """Generate *n_samples* expressions with *model* and score each one.

    Args:
        model: merged causal LM returned by ``load_model``.
        tokenizer: matching tokenizer (with the expression special tokens).
        model_label: "V1" or "V2"; selects the per-model sampling config.
        n_samples: number of generations to score.

    Returns:
        dict with "valid_count", "correct_symbols_count" and an
        "expressions" list of per-sample records.
    """
    banner = '=' * 60
    print(f"\n{banner}")
    print(f"Testing {model_label} - {n_samples} generations")
    print(banner)

    # Same prompt for both models
    prompt = """vars: x_1, x_2
oper: *, +, -, sin, cos
cons: C
expr:"""
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Stop on the end-of-expression marker or the start of a new record.
    stopping_criteria = StoppingCriteriaList([
        ExpressionStoppingCriteria(tokenizer, ["<|endofex|>", "\n\nvars:"])
    ])

    # Use OPTIMAL config for each model (from FINAL_RESULTS_V1_VS_V2.md)
    if model_label == "V1":
        # V1 optimal: 83.3% valid rate
        gen_config = {
            "temperature": 0.5,
            "top_k": 40,
            "top_p": 0.9,
            "repetition_penalty": 1.15,
            "max_new_tokens": 100,
            "do_sample": True,
            "pad_token_id": tokenizer.eos_token_id,
        }
        print("Using V1 optimal config: temp=0.5, top_k=40, rep_penalty=1.15")
    else:  # V2
        # V2 optimal: 90% valid rate
        gen_config = {
            "temperature": 0.7,
            "top_k": 0,
            "top_p": 0.8,
            "repetition_penalty": 1.0,
            "max_new_tokens": 128,
            "do_sample": True,
            "pad_token_id": tokenizer.eos_token_id,
        }
        print("Using V2 optimal config: temp=0.7, top_p=0.8 (nucleus sampling)")

    results = {
        "valid_count": 0,
        "correct_symbols_count": 0,
        "expressions": []
    }

    print(f"\nGenerating {n_samples} expressions...\n")
    for i in range(n_samples):
        output = model.generate(
            **inputs,
            **gen_config,
            stopping_criteria=stopping_criteria
        )
        text = tokenizer.decode(output[0], skip_special_tokens=False)
        expr_str = _extract_expression(text)

        is_valid = _is_valid_expression(expr_str)
        if is_valid:
            results["valid_count"] += 1

        has_correct_symbols = _has_correct_symbols(expr_str)
        if has_correct_symbols:
            results["correct_symbols_count"] += 1

        results["expressions"].append({
            "index": i + 1,
            "expression": expr_str[:80],  # Limit display length
            "valid": is_valid,
            "correct_symbols": has_correct_symbols
        })

        # Show first 5 samples
        if i < 5:
            status = "✓ Valid" if is_valid else "✗ Invalid"
            symbols = "✓ Clean" if has_correct_symbols else "✗ Garbage"
            print(f" [{i+1:2d}] {status:10s} {symbols:10s} | {expr_str[:60]}")

    # Guard the percentage maths so n_samples=0 cannot raise ZeroDivisionError.
    valid_pct = results['valid_count'] / n_samples * 100 if n_samples else 0.0
    clean_pct = results['correct_symbols_count'] / n_samples * 100 if n_samples else 0.0
    print(f"\n{'-'*60}")
    print(f"RESULTS FOR {model_label}:")
    print(f" Valid expressions: {results['valid_count']:2d}/{n_samples} ({valid_pct:.1f}%)")
    print(f" Correct symbols only: {results['correct_symbols_count']:2d}/{n_samples} ({clean_pct:.1f}%)")
    print(f"{'-'*60}")
    return results
def _winner(v1_score, v2_score):
    """Return "V1"/"V2"/"TIE" depending on which score is strictly higher."""
    if v1_score > v2_score:
        return "V1"
    if v2_score > v1_score:
        return "V2"
    return "TIE"


def main():
    """Run the V1-vs-V2 comparison end to end and print a summary table."""
    # Single sample budget shared by generation and the rate maths below
    # (the original hard-coded /20 in the rates, which could silently drift).
    n_samples = 20

    print("\n" + "="*60)
    print("V1 vs V2 MODEL COMPARISON")
    print("="*60)
    print("Testing same prompt on both models")
    print("Measuring: valid expressions + symbol correctness\n")

    # Test V1
    v1_model, v1_tokenizer = load_model("augustocsc/Se124M_700K_infix", "V1")
    v1_results = test_model(v1_model, v1_tokenizer, "V1", n_samples=n_samples)

    # Clean up V1 from memory so both models need not coexist on the GPU.
    del v1_model
    torch.cuda.empty_cache()

    # Test V2
    v2_model, v2_tokenizer = load_model("augustocsc/Se124M_700K_infix_v2", "V2")
    v2_results = test_model(v2_model, v2_tokenizer, "V2", n_samples=n_samples)

    # Final comparison
    print("\n" + "="*60)
    print("FINAL COMPARISON")
    print("="*60)
    print(f"\n{'Metric':<30s} {'V1':>10s} {'V2':>10s} {'Winner':>10s}")
    print("-"*60)

    v1_valid = v1_results["valid_count"]
    v2_valid = v2_results["valid_count"]
    print(f"{'Valid Expressions':<30s} {v1_valid:>10d} {v2_valid:>10d} {_winner(v1_valid, v2_valid):>10s}")

    v1_clean = v1_results["correct_symbols_count"]
    v2_clean = v2_results["correct_symbols_count"]
    print(f"{'Correct Symbols Only':<30s} {v1_clean:>10d} {v2_clean:>10d} {_winner(v1_clean, v2_clean):>10s}")

    print("-"*60)
    print(f"{'Valid Rate':<30s} {v1_valid/n_samples*100:>9.1f}% {v2_valid/n_samples*100:>9.1f}%")
    print(f"{'Clean Symbol Rate':<30s} {v1_clean/n_samples*100:>9.1f}% {v2_clean/n_samples*100:>9.1f}%")
    print("="*60)

    # Conclusion
    print("\nConclusion:")
    if v1_valid > v2_valid and v1_clean > v2_clean:
        print(" → V1 is better on both metrics")
    elif v2_valid > v1_valid and v2_clean > v1_clean:
        print(" → V2 is better on both metrics")
    else:
        print(" → Mixed results - models have different strengths")


if __name__ == "__main__":
    main()
|