File size: 8,021 Bytes
3742716
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
#!/usr/bin/env python3
"""
Simple comparison of V1 vs V2 model generation quality
"""

import sys
import torch
from pathlib import Path
from transformers import AutoTokenizer, AutoModelForCausalLM, StoppingCriteria, StoppingCriteriaList
from peft import PeftModel

sys.path.insert(0, str(Path(__file__).parent.parent))
from classes.expression import Expression


class ExpressionStoppingCriteria(StoppingCriteria):
    """Stop generation as soon as the output ends with any stop sequence."""

    def __init__(self, tokenizer, stop_sequences):
        self.tokenizer = tokenizer
        # Tokenize every stop sequence once up front so __call__ stays cheap.
        self.stop_ids = [
            tokenizer.encode(seq, add_special_tokens=False)
            for seq in stop_sequences
        ]

    def __call__(self, input_ids, scores, **kwargs):
        """Return True when the first sequence ends with a stop-id pattern."""
        generated = input_ids[0]
        for ids in self.stop_ids:
            n = len(ids)
            if n > 0 and len(generated) >= n and generated[-n:].tolist() == ids:
                return True
        return False


def load_model(model_name, model_label):
    """Load base GPT-2, extend its vocabulary, and merge a PEFT adapter.

    Args:
        model_name: Hub/path identifier of the adapter to merge.
        model_label: Human-readable label used only for progress output.

    Returns:
        (model, tokenizer) — the merged model in eval mode plus its tokenizer.
    """
    print(f"\n{'='*60}")
    print(f"Loading {model_label}: {model_name}")
    print('='*60)

    # Base model in half precision, auto-placed on the available device(s).
    print("Loading base GPT-2...")
    base = AutoModelForCausalLM.from_pretrained(
        "gpt2", torch_dtype=torch.float16, device_map="auto"
    )

    # The tokenizer gets the expression delimiter tokens used in fine-tuning.
    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    tokenizer.add_special_tokens(
        {"additional_special_tokens": ["<|startofex|>", "<|endofex|>"]}
    )

    # Embedding table must grow to cover the two freshly added tokens.
    base.resize_token_embeddings(len(tokenizer))

    # Attach the LoRA adapter, then fold its weights into the base model.
    print(f"Loading adapter from {model_name}...")
    peft_model = PeftModel.from_pretrained(base, model_name)
    print("Merging adapter...")
    merged = peft_model.merge_and_unload()
    merged.eval()

    print(f"✓ {model_label} loaded successfully")
    return merged, tokenizer


# Words that indicate the model drifted into natural-language text instead of
# a symbolic expression; any occurrence marks a generation as garbage.
_GARBAGE_WORDS = ("Buyable", "Instore", "Online", "Muslims", "crash", "Berman",
                  "vars:", "oper:", "expressed", "fluent", "Avenger", "repositories")

# Letters permitted inside a clean expression: x_1/x_2, the constant C, and
# the function names sin/cos (underscore included for the variable names).
_ALLOWED_LETTERS = frozenset("xCsinco_")


def _extract_expression(text):
    """Return the expression portion of a decoded generation.

    Takes everything after the last 'expr:' marker and strips a trailing
    <|endofex|> terminator; falls back to the raw text when no marker exists.
    """
    if "expr:" in text:
        expr_str = text.split("expr:")[-1].strip()
        return expr_str.split("<|endofex|>")[0].strip()
    return text


def _is_valid_expression(expr_str):
    """True when expr_str parses and evaluates to finite values at a probe point."""
    import math
    try:
        values = Expression(expr_str, is_prefix=False).evaluate([[1.0, 2.0]])
        # Reject empty results, NaN and ±inf in one pass.
        return len(values) > 0 and all(math.isfinite(v) for v in values)
    except Exception:
        # Parse/eval failure simply means "not a valid expression" — but we no
        # longer swallow KeyboardInterrupt/SystemExit like the old bare except.
        return False


def _has_correct_symbols(expr_str):
    """True when expr_str uses only allowed letters and no known garbage words."""
    compact = expr_str.replace(" ", "")
    if any(ch.isalpha() and ch not in _ALLOWED_LETTERS for ch in compact):
        return False
    return not any(word in expr_str for word in _GARBAGE_WORDS)


def test_model(model, tokenizer, model_label, n_samples=20):
    """Generate n_samples expressions from `model` and score them.

    Uses the per-model optimal sampling config (from FINAL_RESULTS_V1_VS_V2.md),
    counts how many generations are valid (parse + finite evaluation) and how
    many use only the expected symbol set, and prints a summary.

    Args:
        model: Merged causal LM ready for generation.
        tokenizer: Tokenizer matching `model`.
        model_label: "V1" selects the V1 config; anything else selects V2.
        n_samples: Number of expressions to generate.

    Returns:
        Dict with "valid_count", "correct_symbols_count" and per-sample
        "expressions" records.
    """
    print(f"\n{'='*60}")
    print(f"Testing {model_label} - {n_samples} generations")
    print('='*60)

    # Same prompt for both models
    prompt = """vars: x_1, x_2
oper: *, +, -, sin, cos
cons: C
expr:"""

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Stop on the expression terminator or the start of a new prompt block.
    stopping_criteria = StoppingCriteriaList([
        ExpressionStoppingCriteria(tokenizer, ["<|endofex|>", "\n\nvars:"])
    ])

    # Use OPTIMAL config for each model (from FINAL_RESULTS_V1_VS_V2.md)
    if model_label == "V1":
        # V1 optimal: 83.3% valid rate
        gen_config = {
            "temperature": 0.5,
            "top_k": 40,
            "top_p": 0.9,
            "repetition_penalty": 1.15,
            "max_new_tokens": 100,
            "do_sample": True,
            "pad_token_id": tokenizer.eos_token_id,
        }
        print("Using V1 optimal config: temp=0.5, top_k=40, rep_penalty=1.15")
    else:  # V2
        # V2 optimal: 90% valid rate
        gen_config = {
            "temperature": 0.7,
            "top_k": 0,
            "top_p": 0.8,
            "repetition_penalty": 1.0,
            "max_new_tokens": 128,
            "do_sample": True,
            "pad_token_id": tokenizer.eos_token_id,
        }
        print("Using V2 optimal config: temp=0.7, top_p=0.8 (nucleus sampling)")

    results = {
        "valid_count": 0,
        "correct_symbols_count": 0,
        "expressions": []
    }

    print(f"\nGenerating {n_samples} expressions...\n")

    for i in range(n_samples):
        output = model.generate(
            **inputs,
            **gen_config,
            stopping_criteria=stopping_criteria
        )
        text = tokenizer.decode(output[0], skip_special_tokens=False)

        expr_str = _extract_expression(text)

        is_valid = _is_valid_expression(expr_str)
        if is_valid:
            results["valid_count"] += 1

        has_correct_symbols = _has_correct_symbols(expr_str)
        if has_correct_symbols:
            results["correct_symbols_count"] += 1

        results["expressions"].append({
            "index": i + 1,
            "expression": expr_str[:80],  # Limit display length
            "valid": is_valid,
            "correct_symbols": has_correct_symbols
        })

        # Show first 5 samples
        if i < 5:
            status = "✓ Valid" if is_valid else "✗ Invalid"
            symbols = "✓ Clean" if has_correct_symbols else "✗ Garbage"
            print(f"  [{i+1:2d}] {status:10s} {symbols:10s} | {expr_str[:60]}")

    print(f"\n{'-'*60}")
    print(f"RESULTS FOR {model_label}:")
    print(f"  Valid expressions:    {results['valid_count']:2d}/{n_samples} ({results['valid_count']/n_samples*100:.1f}%)")
    print(f"  Correct symbols only: {results['correct_symbols_count']:2d}/{n_samples} ({results['correct_symbols_count']/n_samples*100:.1f}%)")
    print(f"{'-'*60}")

    return results


def main():
    """Run the V1-vs-V2 comparison end to end and print a summary table."""
    bar = "=" * 60
    print("\n" + bar)
    print("V1 vs V2 MODEL COMPARISON")
    print(bar)
    print("Testing same prompt on both models")
    print("Measuring: valid expressions + symbol correctness\n")

    # Test V1 first, then release it before V2 so both fit in GPU memory.
    v1_model, v1_tokenizer = load_model("augustocsc/Se124M_700K_infix", "V1")
    v1_results = test_model(v1_model, v1_tokenizer, "V1", n_samples=20)

    del v1_model
    torch.cuda.empty_cache()

    v2_model, v2_tokenizer = load_model("augustocsc/Se124M_700K_infix_v2", "V2")
    v2_results = test_model(v2_model, v2_tokenizer, "V2", n_samples=20)

    def pick_winner(a, b):
        # Larger count wins; equal counts are a tie.
        if a > b:
            return "V1"
        if b > a:
            return "V2"
        return "TIE"

    # Side-by-side comparison table.
    print("\n" + bar)
    print("FINAL COMPARISON")
    print(bar)
    print(f"\n{'Metric':<30s} {'V1':>10s} {'V2':>10s} {'Winner':>10s}")
    print("-" * 60)

    v1_valid = v1_results["valid_count"]
    v2_valid = v2_results["valid_count"]
    print(f"{'Valid Expressions':<30s} {v1_valid:>10d} {v2_valid:>10d} {pick_winner(v1_valid, v2_valid):>10s}")

    v1_clean = v1_results["correct_symbols_count"]
    v2_clean = v2_results["correct_symbols_count"]
    print(f"{'Correct Symbols Only':<30s} {v1_clean:>10d} {v2_clean:>10d} {pick_winner(v1_clean, v2_clean):>10s}")

    print("-" * 60)
    print(f"{'Valid Rate':<30s} {v1_valid/20*100:>9.1f}% {v2_valid/20*100:>9.1f}%")
    print(f"{'Clean Symbol Rate':<30s} {v1_clean/20*100:>9.1f}% {v2_clean/20*100:>9.1f}%")
    print(bar)

    # Overall verdict: only declare a winner when it leads on both metrics.
    print("\nConclusion:")
    if v1_valid > v2_valid and v1_clean > v2_clean:
        print("  → V1 is better on both metrics")
    elif v2_valid > v1_valid and v2_clean > v1_clean:
        print("  → V2 is better on both metrics")
    else:
        print("  → Mixed results - models have different strengths")


# Run the comparison only when executed as a script, not on import.
if __name__ == "__main__":
    main()