#!/usr/bin/env python3
"""
Compare two model variants to see if they have different configurations.
Usage:
export CBORG_API_KEY=...
python compare_model_configs.py openai/o:latest openai/o3
"""
import json
import os
import sys

from openai import OpenAI

def test_model_detailed(client, model_id):
    """Test a model and return detailed response information."""
    try:
        response = client.chat.completions.create(
            model=model_id,
            messages=[{"role": "user", "content": "What is 2+2?"}],
            max_tokens=10,  # NB: some reasoning models expect max_completion_tokens instead
            temperature=1.0,  # explicitly set (o-series models may reject non-default values)
            top_p=1.0,  # explicitly set
        )
        # Extract all available information
        info = {
            'model': response.model,
            'id': response.id,
            'created': response.created,
            'object': response.object,
            'system_fingerprint': getattr(response, 'system_fingerprint', None),
            'usage': {
                'prompt_tokens': response.usage.prompt_tokens,
                'completion_tokens': response.usage.completion_tokens,
                'total_tokens': response.usage.total_tokens,
            },
            'response_content': response.choices[0].message.content,
            'finish_reason': response.choices[0].finish_reason,
        }
        # Keep the stringified response for raw inspection. (On openai>=1.0 the
        # response is a Pydantic model, so response.model_dump_json() would give
        # structured JSON instead of this repr, if preferred.)
        try:
            info['raw_response'] = str(response)
        except Exception:
            pass
        return info, None
    except Exception as e:
        return None, str(e)
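

# Optional variant (a sketch, not called by main()): the chat-completions API
# also accepts a `seed` parameter for best-effort reproducible sampling, which
# pairs naturally with the system_fingerprint comparison below. Whether the
# CBorg proxy forwards `seed` to the backend is an assumption, so treat
# matching outputs as suggestive rather than conclusive.
def test_model_seeded(client, model_id, seed=42):
    """Like test_model_detailed, but pins a sampling seed for repeatability."""
    response = client.chat.completions.create(
        model=model_id,
        messages=[{"role": "user", "content": "What is 2+2?"}],
        max_tokens=10,
        temperature=1.0,
        seed=seed,  # same seed + same system_fingerprint => outputs should usually match
    )
    return (
        response.choices[0].message.content,
        getattr(response, 'system_fingerprint', None),
    )
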
def main():
    if len(sys.argv) < 3:
        print("Usage: python compare_model_configs.py <model1> <model2>")
        print("Example: python compare_model_configs.py openai/o:latest openai/o3")
        sys.exit(1)
    model1 = sys.argv[1]
    model2 = sys.argv[2]

    api_key = os.environ.get('CBORG_API_KEY')
    if not api_key:
        print("Error: CBORG_API_KEY environment variable not set.")
        sys.exit(1)

    client = OpenAI(
        api_key=api_key,
        base_url="https://api.cborg.lbl.gov",
    )
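
    # Optional sanity check (a sketch; assumes the CBorg endpoint exposes the
    # OpenAI-compatible /models route, which is common for such proxies but not
    # guaranteed): verify both IDs are actually served before comparing.
    # available = {m.id for m in client.models.list().data}
    # for m in (model1, model2):
    #     if m not in available:
    #         print(f"warning: {m} not in /models listing")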
print("=" * 100)
print(f"COMPARING: {model1} vs {model2}")
print("=" * 100)
print()
# Test model 1
print(f"Testing {model1}...")
info1, error1 = test_model_detailed(client, model1)
if error1:
print(f"❌ Error: {error1}")
sys.exit(1)
# Test model 2
print(f"Testing {model2}...")
info2, error2 = test_model_detailed(client, model2)
if error2:
print(f"❌ Error: {error2}")
sys.exit(1)
print()
print("=" * 100)
print("COMPARISON RESULTS")
print("=" * 100)
print()
    # Compare underlying models
    print("1. UNDERLYING MODEL:")
    print(f"   {model1:<30}{info1['model']}")
    print(f"   {model2:<30}{info2['model']}")
    if info1['model'] == info2['model']:
        print("   ✓ SAME underlying model")
    else:
        print("   ⚠️  DIFFERENT underlying models!")
    print()

    # Compare system fingerprints, if available. The fingerprint identifies the
    # backend configuration a model ran with, so a mismatch suggests different
    # deployments; check for None first so two missing values are reported as
    # "not available" rather than as a match.
    print("2. SYSTEM FINGERPRINT:")
    print(f"   {model1:<30}{info1['system_fingerprint']}")
    print(f"   {model2:<30}{info2['system_fingerprint']}")
    if info1['system_fingerprint'] is None or info2['system_fingerprint'] is None:
        print("   ⚠️  System fingerprint not available")
    elif info1['system_fingerprint'] == info2['system_fingerprint']:
        print("   ✓ SAME system fingerprint")
    else:
        print("   ⚠️  DIFFERENT system fingerprints!")
    print()
    # Compare token usage patterns
    print("3. TOKEN USAGE (for same prompt):")
    print(f"   {model1:<30} prompt={info1['usage']['prompt_tokens']}, completion={info1['usage']['completion_tokens']}")
    print(f"   {model2:<30} prompt={info2['usage']['prompt_tokens']}, completion={info2['usage']['completion_tokens']}")
    if info1['usage'] == info2['usage']:
        print("   ✓ IDENTICAL token usage")
    else:
        print("   ⚠️  Different token usage (could indicate different behavior)")
    print()

    # Compare responses
    print("4. RESPONSE CONTENT:")
    print(f"   {model1}: \"{info1['response_content']}\"")
    print(f"   {model2}: \"{info2['response_content']}\"")
    if info1['response_content'] == info2['response_content']:
        print("   ✓ IDENTICAL responses")
    else:
        print("   ⚠️  Different responses")
    print()

    # Show raw responses if available (check both dicts to avoid a KeyError)
    if 'raw_response' in info1 and 'raw_response' in info2:
        print("5. RAW RESPONSE MODEL 1:")
        print(f"   {info1['raw_response'][:500]}")
        print()
        print("6. RAW RESPONSE MODEL 2:")
        print(f"   {info2['raw_response'][:500]}")
        print()
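
    # One exchange per model is weak evidence, since sampling at temperature=1.0
    # is stochastic. A sketch of a stronger check (hypothetical helper, not
    # called by this script): repeat the probe and measure agreement.
    # def agreement_rate(client, a, b, n=5):
    #     hits = 0
    #     for _ in range(n):
    #         ra, _ = test_model_detailed(client, a)
    #         rb, _ = test_model_detailed(client, b)
    #         if ra and rb and ra['response_content'] == rb['response_content']:
    #             hits += 1
    #     return hits / n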
    # Final verdict
    print("=" * 100)
    print("VERDICT:")
    print("=" * 100)

    same_count = 0
    total_count = 4
    if info1['model'] == info2['model']:
        same_count += 1
    if (info1['system_fingerprint'] == info2['system_fingerprint']
            or (info1['system_fingerprint'] is None and info2['system_fingerprint'] is None)):
        same_count += 1
    if info1['usage'] == info2['usage']:
        same_count += 1
    if info1['response_content'] == info2['response_content']:
        same_count += 1

    print(f"Similarity: {same_count}/{total_count} metrics match")
    print()
    if same_count == total_count:
        print("✓ Models appear to be IDENTICAL")
        print("  → Same underlying model, same configuration")
        print("  → Likely just different aliases for the same deployment")
    elif info1['model'] == info2['model'] and same_count >= 2:
        print("⚠️  Models use the SAME base model but show some differences")
        print("  → Could be due to:")
        print("     - Different deployment instances")
        print("     - Randomness in generation")
        print("     - Different routing/load balancing")
    else:
        print("⚠️  Models appear to be DIFFERENT")
        print("  → Different configurations or versions")
    print()
    print("NOTE: In your dataset, these models have different performance because")
    print("      they represent different experimental runs, not necessarily different")
    print("      model configurations.")
    print("=" * 100)

if __name__ == '__main__':
    main()