#!/usr/bin/env python3
"""
Compare two model variants to see if they have different configurations.
Usage:
  export CBORG_API_KEY=...
  python compare_model_configs.py openai/o:latest openai/o3
"""
import os
import sys
from openai import OpenAI

def test_model_detailed(client, model_id):
    """Test a model and return detailed response information."""
    try:
        response = client.chat.completions.create(
            model=model_id,
            messages=[{"role": "user", "content": "What is 2+2?"}],
            max_tokens=10,
            temperature=1.0,  # Explicitly set
            top_p=1.0,        # Explicitly set
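            # Assumption: the target endpoint accepts these standard Chat
            # Completions sampling parameters; some reasoning models (e.g. the
            # OpenAI o-series) restrict temperature/max_tokens and may require
            # dropping or adapting them.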
        )
        
        # Extract all available information
        info = {
            'model': response.model,
            'id': response.id,
            'created': response.created,
            'object': response.object,
            'system_fingerprint': getattr(response, 'system_fingerprint', None),
            'usage': {
                'prompt_tokens': response.usage.prompt_tokens,
                'completion_tokens': response.usage.completion_tokens,
                'total_tokens': response.usage.total_tokens,
            },
            'response_content': response.choices[0].message.content,
            'finish_reason': response.choices[0].finish_reason,
        }
        
        # Try to get any additional metadata
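        # With the v1 SDK, response.model_dump_json() would capture the full
        # payload as JSON; str(response) is kept so the verdict section can
        # slice it as plain text.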
        try:
            info['raw_response'] = str(response)
        except Exception:
            pass
            
        return info, None
    except Exception as e:
        return None, str(e)

def main():
    if len(sys.argv) < 3:
        print("Usage: python compare_model_configs.py <model1> <model2>")
        print("Example: python compare_model_configs.py openai/o:latest openai/o3")
        sys.exit(1)
    
    model1 = sys.argv[1]
    model2 = sys.argv[2]
    
    api_key = os.environ.get('CBORG_API_KEY')
    if not api_key:
        print("Error: CBORG_API_KEY environment variable not set.")
        sys.exit(1)

    client = OpenAI(
        api_key=api_key,
        base_url="https://api.cborg.lbl.gov"
    )
    
    print("=" * 100)
    print(f"COMPARING: {model1} vs {model2}")
    print("=" * 100)
    print()
    
    # Test model 1
    print(f"Testing {model1}...")
    info1, error1 = test_model_detailed(client, model1)
    
    if error1:
        print(f"❌ Error: {error1}")
        sys.exit(1)
    
    # Test model 2
    print(f"Testing {model2}...")
    info2, error2 = test_model_detailed(client, model2)
    
    if error2:
        print(f"❌ Error: {error2}")
        sys.exit(1)
    
    print()
    print("=" * 100)
    print("COMPARISON RESULTS")
    print("=" * 100)
    print()
    
    # Compare underlying models
    print("1. UNDERLYING MODEL:")
    print(f"   {model1:<30}{info1['model']}")
    print(f"   {model2:<30}{info2['model']}")
    if info1['model'] == info2['model']:
        print("   ✓ SAME underlying model")
    else:
        print("   ⚠️  DIFFERENT underlying models!")
    print()
    
    # Compare system fingerprints (if available)
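    # Per the OpenAI API docs, system_fingerprint identifies the backend
    # configuration that served the request; matching values suggest the same
    # deployment, though proxies may omit the field.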
    print("2. SYSTEM FINGERPRINT:")
    print(f"   {model1:<30}{info1['system_fingerprint']}")
    print(f"   {model2:<30}{info2['system_fingerprint']}")
    if info1['system_fingerprint'] is None or info2['system_fingerprint'] is None:
        print("   ⚠️  System fingerprint not available")
    elif info1['system_fingerprint'] == info2['system_fingerprint']:
        print("   ✓ SAME system fingerprint")
    else:
        print("   ⚠️  DIFFERENT system fingerprints!")
    print()
    
    # Compare token usage patterns
    print("3. TOKEN USAGE (for same prompt):")
    print(f"   {model1:<30} prompt={info1['usage']['prompt_tokens']}, completion={info1['usage']['completion_tokens']}")
    print(f"   {model2:<30} prompt={info2['usage']['prompt_tokens']}, completion={info2['usage']['completion_tokens']}")
    if info1['usage'] == info2['usage']:
        print("   ✓ IDENTICAL token usage")
    else:
        print("   ⚠️  Different token usage (could indicate different behavior)")
    print()
    
    # Compare responses
    print("4. RESPONSE CONTENT:")
    print(f"   {model1}: \"{info1['response_content']}\"")
    print(f"   {model2}: \"{info2['response_content']}\"")
    if info1['response_content'] == info2['response_content']:
        print("   ✓ IDENTICAL responses")
    else:
        print("   ⚠️  Different responses")
    print()
    
    # Show raw response if available
    if 'raw_response' in info1 and 'raw_response' in info2:
        print("5. RAW RESPONSE MODEL 1:")
        print(f"   {info1['raw_response'][:500]}")
        print()
        print("6. RAW RESPONSE MODEL 2:")
        print(f"   {info2['raw_response'][:500]}")
        print()
    
    # Final verdict
    print("=" * 100)
    print("VERDICT:")
    print("=" * 100)
    
    same_count = 0
    total_count = 4
    
    if info1['model'] == info2['model']:
        same_count += 1
    if info1['system_fingerprint'] == info2['system_fingerprint']:
        same_count += 1
    if info1['usage'] == info2['usage']:
        same_count += 1
    if info1['response_content'] == info2['response_content']:
        same_count += 1
    
    print(f"Similarity: {same_count}/{total_count} metrics match")
    print()
    
    if same_count == total_count:
        print("✓ Models appear to be IDENTICAL")
        print("  → Same underlying model, same configuration")
        print("  → Likely just different aliases for the same deployment")
    elif info1['model'] == info2['model'] and same_count >= 2:
        print("⚠️  Models use the SAME base model but show some differences")
        print("  → Could be due to:")
        print("    - Different deployment instances")
        print("    - Randomness in generation")
        print("    - Different routing/load balancing")
    else:
        print("⚠️  Models appear to be DIFFERENT")
        print("  → Different configurations or versions")
    
    print()
    print("NOTE: In your dataset, these models have different performance because")
    print("      they represent different experimental runs, not necessarily different")
    print("      model configurations.")
    print("=" * 100)

if __name__ == '__main__':
    main()