""" |
|
|
Compare two model variants to see if they have different configurations. |
|
|
Usage: |
|
|
export CBORG_API_KEY=... |
|
|
python compare_model_configs.py openai/o:latest openai/o3 |
|
|
""" |
|
|

import os
import sys

from openai import OpenAI
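

# Both aliases receive the exact same probe request (prompt, max_tokens,
# temperature, top_p), so any difference in the returned metadata should
# reflect the deployment rather than the request. Note that temperature=1.0
# still allows sampling variation between calls, so content can differ by chance.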
def test_model_detailed(client, model_id):
    """Test a model and return detailed response information."""
    try:
        response = client.chat.completions.create(
            model=model_id,
            messages=[{"role": "user", "content": "What is 2+2?"}],
            max_tokens=10,
            temperature=1.0,
            top_p=1.0,
        )

        info = {
            'model': response.model,
            'id': response.id,
            'created': response.created,
            'object': response.object,
            # Not every backend returns a system fingerprint; default to None.
            'system_fingerprint': getattr(response, 'system_fingerprint', None),
            'usage': {
                'prompt_tokens': response.usage.prompt_tokens,
                'completion_tokens': response.usage.completion_tokens,
                'total_tokens': response.usage.total_tokens,
            },
            'response_content': response.choices[0].message.content,
            'finish_reason': response.choices[0].finish_reason,
        }

        # Keep the stringified raw response for manual inspection; skip it if
        # the object cannot be rendered.
        try:
            info['raw_response'] = str(response)
        except Exception:
            pass

        return info, None
    except Exception as e:
        return None, str(e)


def main():
    """Parse CLI arguments, probe both models once, and print a field-by-field comparison."""
    if len(sys.argv) < 3:
        print("Usage: python compare_model_configs.py <model1> <model2>")
        print("Example: python compare_model_configs.py openai/o:latest openai/o3")
        sys.exit(1)

    model1 = sys.argv[1]
    model2 = sys.argv[2]

    api_key = os.environ.get('CBORG_API_KEY')
    if not api_key:
        print("Error: CBORG_API_KEY environment variable not set.")
        sys.exit(1)
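
    # The CBorg gateway (api.cborg.lbl.gov) serves an OpenAI-compatible API,
    # so the standard OpenAI client works once base_url points at it.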
    client = OpenAI(
        api_key=api_key,
        base_url="https://api.cborg.lbl.gov",
    )
print("=" * 100) |
|
|
print(f"COMPARING: {model1} vs {model2}") |
|
|
print("=" * 100) |
|
|
print() |
|
|
|
|
|
|
|
|
print(f"Testing {model1}...") |
|
|
info1, error1 = test_model_detailed(client, model1) |
|
|
|
|
|
if error1: |
|
|
print(f"❌ Error: {error1}") |
|
|
sys.exit(1) |
|
|
|
|
|
|
|
|
print(f"Testing {model2}...") |
|
|
info2, error2 = test_model_detailed(client, model2) |
|
|
|
|
|
if error2: |
|
|
print(f"❌ Error: {error2}") |
|
|
sys.exit(1) |
|
|
|
|
|
print() |
|
|
print("=" * 100) |
|
|
print("COMPARISON RESULTS") |
|
|
print("=" * 100) |
|
|
print() |
|
|
|
|
|
|
|
|
print("1. UNDERLYING MODEL:") |
|
|
print(f" {model1:<30} → {info1['model']}") |
|
|
print(f" {model2:<30} → {info2['model']}") |
|
|
if info1['model'] == info2['model']: |
|
|
print(" ✓ SAME underlying model") |
|
|
else: |
|
|
print(" ⚠️ DIFFERENT underlying models!") |
|
|
print() |
|
|
|
|
|
|
|
|
print("2. SYSTEM FINGERPRINT:") |
|
|
print(f" {model1:<30} → {info1['system_fingerprint']}") |
|
|
print(f" {model2:<30} → {info2['system_fingerprint']}") |
|
|
if info1['system_fingerprint'] == info2['system_fingerprint']: |
|
|
print(" ✓ SAME system fingerprint") |
|
|
elif info1['system_fingerprint'] is None or info2['system_fingerprint'] is None: |
|
|
print(" ⚠️ System fingerprint not available") |
|
|
else: |
|
|
print(" ⚠️ DIFFERENT system fingerprints!") |
|
|
print() |
|
|
|
|
|
|
|
|
print("3. TOKEN USAGE (for same prompt):") |
|
|
print(f" {model1:<30} prompt={info1['usage']['prompt_tokens']}, completion={info1['usage']['completion_tokens']}") |
|
|
print(f" {model2:<30} prompt={info2['usage']['prompt_tokens']}, completion={info2['usage']['completion_tokens']}") |
|
|
if info1['usage'] == info2['usage']: |
|
|
print(" ✓ IDENTICAL token usage") |
|
|
else: |
|
|
print(" ⚠️ Different token usage (could indicate different behavior)") |
|
|
print() |
|
|
|
|
|
|
|
|
print("4. RESPONSE CONTENT:") |
|
|
print(f" {model1}: \"{info1['response_content']}\"") |
|
|
print(f" {model2}: \"{info2['response_content']}\"") |
|
|
if info1['response_content'] == info2['response_content']: |
|
|
print(" ✓ IDENTICAL responses") |
|
|
else: |
|
|
print(" ⚠️ Different responses") |
|
|
print() |
|
|
|
|
|
|
|
|

    # Raw responses are only captured on success; require both before printing
    # so a missing entry cannot raise a KeyError.
    if 'raw_response' in info1 and 'raw_response' in info2:
        print("5. RAW RESPONSE MODEL 1:")
        print(f" {info1['raw_response'][:500]}")
        print()
        print("6. RAW RESPONSE MODEL 2:")
        print(f" {info2['raw_response'][:500]}")
        print()
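
    # Simple similarity tally over the four checks above. A single sampled
    # completion can differ even between identical deployments, so treat the
    # score as indicative rather than conclusive.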
print("=" * 100) |
|
|
print("VERDICT:") |
|
|
print("=" * 100) |
|
|
|
|
|
same_count = 0 |
|
|
total_count = 4 |
|
|
|
|
|
if info1['model'] == info2['model']: |
|
|
same_count += 1 |
|
|
if info1['system_fingerprint'] == info2['system_fingerprint'] or \ |
|
|
(info1['system_fingerprint'] is None and info2['system_fingerprint'] is None): |
|
|
same_count += 1 |
|
|
if info1['usage'] == info2['usage']: |
|
|
same_count += 1 |
|
|
if info1['response_content'] == info2['response_content']: |
|
|
same_count += 1 |
|
|
|
|
|
print(f"Similarity: {same_count}/{total_count} metrics match") |
|
|
print() |
|
|
|
|
|
if same_count == total_count: |
|
|
print("✓ Models appear to be IDENTICAL") |
|
|
print(" → Same underlying model, same configuration") |
|
|
print(" → Likely just different aliases for the same deployment") |
|
|
elif info1['model'] == info2['model'] and same_count >= 2: |
|
|
print("⚠️ Models use the SAME base model but show some differences") |
|
|
print(" → Could be due to:") |
|
|
print(" - Different deployment instances") |
|
|
print(" - Randomness in generation") |
|
|
print(" - Different routing/load balancing") |
|
|
else: |
|
|
print("⚠️ Models appear to be DIFFERENT") |
|
|
print(" → Different configurations or versions") |
|
|
|
|
|
print() |
|
|
print("NOTE: In your dataset, these models have different performance because") |
|
|
print(" they represent different experimental runs, not necessarily different") |
|
|
print(" model configurations.") |
|
|
print("=" * 100) |
|
|
|
|
|
if __name__ == '__main__': |
|
|
main() |
|
|
|