#!/usr/bin/env python3
"""
Compare two model variants to see if they have different configurations.
Usage:
  export CBORG_API_KEY=...
  python compare_model_configs.py openai/o:latest openai/o3
"""
import os
import sys
from openai import OpenAI

def test_model_detailed(client, model_id):
    """Test a model and return detailed response information."""
    try:
        response = client.chat.completions.create(
            model=model_id,
            messages=[{"role": "user", "content": "What is 2+2?"}],
            max_tokens=10,
            temperature=1.0,  # Explicitly set
            top_p=1.0,        # Explicitly set
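            # Assumption: the target endpoint accepts these standard Chat
            # Completions sampling parameters; some reasoning models (e.g. the
            # OpenAI o-series) restrict temperature/max_tokens and may require
            # dropping or adapting them.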
        )
        
        # Extract all available information
        info = {
            'model': response.model,
            'id': response.id,
            'created': response.created,
            'object': response.object,
            'system_fingerprint': getattr(response, 'system_fingerprint', None),
            'usage': {
                'prompt_tokens': response.usage.prompt_tokens,
                'completion_tokens': response.usage.completion_tokens,
                'total_tokens': response.usage.total_tokens,
            },
            'response_content': response.choices[0].message.content,
            'finish_reason': response.choices[0].finish_reason,
        }
        
        # Try to get any additional metadata
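        # With the v1 SDK, response.model_dump_json() would capture the full
        # payload as JSON; str(response) is kept so the verdict section can
        # slice it as plain text.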
        try:
            info['raw_response'] = str(response)
        except Exception:
            pass
            
        return info, None
    except Exception as e:
        return None, str(e)

def main():
    if len(sys.argv) < 3:
        print("Usage: python compare_model_configs.py <model1> <model2>")
        print("Example: python compare_model_configs.py openai/o:latest openai/o3")
        sys.exit(1)
    
    model1 = sys.argv[1]
    model2 = sys.argv[2]
    
    api_key = os.environ.get('CBORG_API_KEY')
    if not api_key:
        print("Error: CBORG_API_KEY environment variable not set.")
        sys.exit(1)

    client = OpenAI(
        api_key=api_key,
        base_url="https://api.cborg.lbl.gov"
    )
    
    print("=" * 100)
    print(f"COMPARING: {model1} vs {model2}")
    print("=" * 100)
    print()
    
    # Test model 1
    print(f"Testing {model1}...")
    info1, error1 = test_model_detailed(client, model1)
    
    if error1:
        print(f"❌ Error: {error1}")
        sys.exit(1)
    
    # Test model 2
    print(f"Testing {model2}...")
    info2, error2 = test_model_detailed(client, model2)
    
    if error2:
        print(f"❌ Error: {error2}")
        sys.exit(1)
    
    print()
    print("=" * 100)
    print("COMPARISON RESULTS")
    print("=" * 100)
    print()
    
    # Compare underlying models
    print("1. UNDERLYING MODEL:")
    print(f"   {model1:<30}{info1['model']}")
    print(f"   {model2:<30}{info2['model']}")
    if info1['model'] == info2['model']:
        print("   ✓ SAME underlying model")
    else:
        print("   ⚠️  DIFFERENT underlying models!")
    print()
    
    # Compare system fingerprints (if available)
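    # Per the OpenAI API docs, system_fingerprint identifies the backend
    # configuration that served the request; matching values suggest the same
    # deployment, though proxies may omit the field.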
    print("2. SYSTEM FINGERPRINT:")
    print(f"   {model1:<30}{info1['system_fingerprint']}")
    print(f"   {model2:<30}{info2['system_fingerprint']}")
    if info1['system_fingerprint'] is None or info2['system_fingerprint'] is None:
        print("   ⚠️  System fingerprint not available")
    elif info1['system_fingerprint'] == info2['system_fingerprint']:
        print("   ✓ SAME system fingerprint")
    else:
        print("   ⚠️  DIFFERENT system fingerprints!")
    print()
    
    # Compare token usage patterns
    print("3. TOKEN USAGE (for same prompt):")
    print(f"   {model1:<30} prompt={info1['usage']['prompt_tokens']}, completion={info1['usage']['completion_tokens']}")
    print(f"   {model2:<30} prompt={info2['usage']['prompt_tokens']}, completion={info2['usage']['completion_tokens']}")
    if info1['usage'] == info2['usage']:
        print("   ✓ IDENTICAL token usage")
    else:
        print("   ⚠️  Different token usage (could indicate different behavior)")
    print()
    
    # Compare responses
    print("4. RESPONSE CONTENT:")
    print(f"   {model1}: \"{info1['response_content']}\"")
    print(f"   {model2}: \"{info2['response_content']}\"")
    if info1['response_content'] == info2['response_content']:
        print("   ✓ IDENTICAL responses")
    else:
        print("   ⚠️  Different responses")
    print()
    
    # Show raw response if available
    if 'raw_response' in info1 and 'raw_response' in info2:
        print("5. RAW RESPONSE MODEL 1:")
        print(f"   {info1['raw_response'][:500]}")
        print()
        print("6. RAW RESPONSE MODEL 2:")
        print(f"   {info2['raw_response'][:500]}")
        print()
    
    # Final verdict
    print("=" * 100)
    print("VERDICT:")
    print("=" * 100)
    
    same_count = 0
    total_count = 4
    
    if info1['model'] == info2['model']:
        same_count += 1
    if info1['system_fingerprint'] == info2['system_fingerprint']:
        same_count += 1
    if info1['usage'] == info2['usage']:
        same_count += 1
    if info1['response_content'] == info2['response_content']:
        same_count += 1
    
    print(f"Similarity: {same_count}/{total_count} metrics match")
    print()
    
    if same_count == total_count:
        print("✓ Models appear to be IDENTICAL")
        print("  → Same underlying model, same configuration")
        print("  → Likely just different aliases for the same deployment")
    elif info1['model'] == info2['model'] and same_count >= 2:
        print("⚠️  Models use the SAME base model but show some differences")
        print("  → Could be due to:")
        print("    - Different deployment instances")
        print("    - Randomness in generation")
        print("    - Different routing/load balancing")
    else:
        print("⚠️  Models appear to be DIFFERENT")
        print("  → Different configurations or versions")
    
    print()
    print("NOTE: In your dataset, these models have different performance because")
    print("      they represent different experimental runs, not necessarily different")
    print("      model configurations.")
    print("=" * 100)

if __name__ == '__main__':
    main()