File size: 6,376 Bytes
cfcbbc8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 |
#!/usr/bin/env python3
"""
Compare two model variants to see if they have different configurations.
Usage:
export CBORG_API_KEY=...
python compare_model_configs.py openai/o:latest openai/o3
"""
import os
import sys
from openai import OpenAI
import json
def test_model_detailed(client, model_id):
"""Test a model and return detailed response information."""
try:
response = client.chat.completions.create(
model=model_id,
messages=[{"role": "user", "content": "What is 2+2?"}],
max_tokens=10,
temperature=1.0, # Explicitly set
top_p=1.0, # Explicitly set
)
# Extract all available information
info = {
'model': response.model,
'id': response.id,
'created': response.created,
'object': response.object,
'system_fingerprint': getattr(response, 'system_fingerprint', None),
'usage': {
'prompt_tokens': response.usage.prompt_tokens,
'completion_tokens': response.usage.completion_tokens,
'total_tokens': response.usage.total_tokens,
},
'response_content': response.choices[0].message.content,
'finish_reason': response.choices[0].finish_reason,
}
# Try to get any additional metadata
try:
info['raw_response'] = str(response)
except:
pass
return info, None
except Exception as e:
return None, str(e)
def _parse_args():
    """Return the two model names from argv, or exit with usage help."""
    if len(sys.argv) < 3:
        print("Usage: python compare_model_configs.py <model1> <model2>")
        print("Example: python compare_model_configs.py openai/o:latest openai/o3")
        sys.exit(1)
    return sys.argv[1], sys.argv[2]


def _make_client():
    """Build a client for the CBorg OpenAI-compatible gateway; exit if no API key."""
    api_key = os.environ.get('CBORG_API_KEY')
    if not api_key:
        print("Error: CBORG_API_KEY environment variable not set.")
        sys.exit(1)
    return OpenAI(
        api_key=api_key,
        base_url="https://api.cborg.lbl.gov"
    )


def _probe(client, model):
    """Run the detailed probe for one model; exit on failure, return its info dict."""
    print(f"Testing {model}...")
    info, error = test_model_detailed(client, model)
    if error:
        print(f"❌ Error: {error}")
        sys.exit(1)
    return info


def _similarity_score(info1, info2):
    """Count how many of the four comparison metrics agree (0-4).

    Note: ``None == None`` is True, so two absent system fingerprints
    count as a match (the original's extra both-None clause was redundant).
    """
    return sum((
        info1['model'] == info2['model'],
        info1['system_fingerprint'] == info2['system_fingerprint'],
        info1['usage'] == info2['usage'],
        info1['response_content'] == info2['response_content'],
    ))


def main():
    """Probe two model aliases via live API calls, print a section-by-section
    comparison, and finish with a similarity verdict."""
    model1, model2 = _parse_args()
    client = _make_client()

    print("=" * 100)
    print(f"COMPARING: {model1} vs {model2}")
    print("=" * 100)
    print()

    info1 = _probe(client, model1)
    info2 = _probe(client, model2)

    print()
    print("=" * 100)
    print("COMPARISON RESULTS")
    print("=" * 100)
    print()

    # 1. Underlying model identifier reported by the gateway.
    print("1. UNDERLYING MODEL:")
    print(f" {model1:<30} → {info1['model']}")
    print(f" {model2:<30} → {info2['model']}")
    if info1['model'] == info2['model']:
        print(" ✓ SAME underlying model")
    else:
        print(" ⚠️ DIFFERENT underlying models!")
    print()

    # 2. System fingerprints (may be absent on some gateways).
    print("2. SYSTEM FINGERPRINT:")
    print(f" {model1:<30} → {info1['system_fingerprint']}")
    print(f" {model2:<30} → {info2['system_fingerprint']}")
    if info1['system_fingerprint'] == info2['system_fingerprint']:
        print(" ✓ SAME system fingerprint")
    elif info1['system_fingerprint'] is None or info2['system_fingerprint'] is None:
        print(" ⚠️ System fingerprint not available")
    else:
        print(" ⚠️ DIFFERENT system fingerprints!")
    print()

    # 3. Token accounting for the identical prompt.
    print("3. TOKEN USAGE (for same prompt):")
    print(f" {model1:<30} prompt={info1['usage']['prompt_tokens']}, completion={info1['usage']['completion_tokens']}")
    print(f" {model2:<30} prompt={info2['usage']['prompt_tokens']}, completion={info2['usage']['completion_tokens']}")
    if info1['usage'] == info2['usage']:
        print(" ✓ IDENTICAL token usage")
    else:
        print(" ⚠️ Different token usage (could indicate different behavior)")
    print()

    # 4. The actual completions.
    print("4. RESPONSE CONTENT:")
    print(f" {model1}: \"{info1['response_content']}\"")
    print(f" {model2}: \"{info2['response_content']}\"")
    if info1['response_content'] == info2['response_content']:
        print(" ✓ IDENTICAL responses")
    else:
        print(" ⚠️ Different responses")
    print()

    # Raw payloads, truncated. Guard each dict separately: the original code
    # indexed info2['raw_response'] under info1's guard, a latent KeyError.
    if 'raw_response' in info1:
        print("5. RAW RESPONSE MODEL 1:")
        print(f" {info1['raw_response'][:500]}")
        print()
    if 'raw_response' in info2:
        print("6. RAW RESPONSE MODEL 2:")
        print(f" {info2['raw_response'][:500]}")
        print()

    # Final verdict
    print("=" * 100)
    print("VERDICT:")
    print("=" * 100)
    same_count = _similarity_score(info1, info2)
    total_count = 4
    print(f"Similarity: {same_count}/{total_count} metrics match")
    print()
    if same_count == total_count:
        print("✓ Models appear to be IDENTICAL")
        print(" → Same underlying model, same configuration")
        print(" → Likely just different aliases for the same deployment")
    elif info1['model'] == info2['model'] and same_count >= 2:
        print("⚠️ Models use the SAME base model but show some differences")
        print(" → Could be due to:")
        print(" - Different deployment instances")
        print(" - Randomness in generation")
        print(" - Different routing/load balancing")
    else:
        print("⚠️ Models appear to be DIFFERENT")
        print(" → Different configurations or versions")
    print()
    print("NOTE: In your dataset, these models have different performance because")
    print(" they represent different experimental runs, not necessarily different")
    print(" model configurations.")
    print("=" * 100)


if __name__ == '__main__':
    main()
|