LLM4HEP / get_all_model_versions.py
ho22joshua's picture
initial commit
cfcbbc8
#!/usr/bin/env python3
"""
Script to get version information for all models in the dataset.
Usage:
export CBORG_API_KEY=...
python get_all_model_versions.py
"""
import os
import sys
import pandas as pd
from openai import OpenAI
def test_model_version(client, model_id):
    """Send a minimal chat request to ``model_id`` and report which model answered.

    Args:
        client: Object exposing ``chat.completions.create(...)``; ``main``
            passes an ``OpenAI`` client.
        model_id: Model identifier to put in the request.

    Returns:
        The ``model`` attribute of the completion response. If anything in
        the request raises, a string made of ``"ERROR: "`` followed by the
        first 150 characters of the exception text is returned instead, so
        that one failing model does not abort the caller's loop.
    """
    probe_messages = [{"role": "user", "content": "Hi"}]
    try:
        # A tiny prompt and token budget: only the response metadata matters.
        reply = client.chat.completions.create(
            model=model_id,
            messages=probe_messages,
            max_tokens=5,
        )
        return reply.model
    except Exception as exc:
        truncated = str(exc)[:150]
        return "ERROR: " + truncated
def main():
    """Probe every model named in the results dataset and record what it resolves to.

    Steps:
      1. Read the API key from the ``CBORG_API_KEY`` environment variable.
      2. Load the results summary CSV and collect the unique values of its
         ``supervisor`` and ``coder`` columns (rows missing either are dropped).
      3. Call ``test_model_version`` once per model, printing a pass/fail mark.
      4. Print a summary table of alias -> underlying model.
      5. Write the successful mappings to ``model_version_mappings.txt`` in
         the current working directory.

    Exits with status 1 (via ``sys.exit``) when ``CBORG_API_KEY`` is unset
    or empty.
    """
    # Local import so this function brings its own dependency into scope.
    from datetime import date

    api_key = os.environ.get('CBORG_API_KEY')
    if not api_key:
        print("Error: CBORG_API_KEY environment variable not set.")
        sys.exit(1)

    client = OpenAI(
        api_key=api_key,
        base_url="https://api.cborg.lbl.gov",
    )

    # Load the dataset to get all unique models
    df = pd.read_csv(
        '/global/cfs/projectdirs/atlas/joshua/llm4hep/results_summary.csv',
        comment='#',
    )
    df = df.dropna(subset=['supervisor', 'coder'])

    # Get all unique models (a model may appear in either role)
    all_models = sorted(set(df['supervisor'].unique()) | set(df['coder'].unique()))
    total = len(all_models)
    rule = "=" * 100

    print(rule)
    print("TESTING ALL MODELS IN DATASET FOR VERSION INFORMATION")
    print(rule)
    print(f"\nFound {total} unique models in the dataset")
    print()

    results = {}
    for idx, model in enumerate(all_models, 1):
        # flush=True so the model name is visible while the request is in flight
        print(f"[{idx}/{total}] Testing {model:<45}", end=" ", flush=True)
        underlying = test_model_version(client, model)
        results[model] = underlying
        # test_model_version signals failure with an "ERROR: ..." string
        print("❌" if underlying.startswith('ERROR') else "✓")

    # Print results
    print("\n" + rule)
    print("RESULTS: MODEL MAPPINGS")
    print(rule)
    for model in sorted(results):
        underlying = results[model]
        if underlying.startswith('ERROR'):
            print(f"❌ {model:<45} {underlying[:50]}")
        elif model == underlying:
            print(f" {model:<45} (no alias)")
        else:
            print(f" {model:<45} → {underlying}")

    # Save to file
    output_file = 'model_version_mappings.txt'
    today = date.today()
    # Explicit UTF-8: the arrow below is non-ASCII and would raise
    # UnicodeEncodeError under a non-UTF-8 default locale encoding.
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("MODEL VERSION MAPPINGS\n")
        f.write(rule + "\n")
        # Record the actual run date rather than a fixed one.
        f.write(f"Discovered on: {today:%B} {today.day}, {today.year}\n")
        f.write(f"Total models tested: {len(results)}\n\n")
        for model in sorted(results):
            underlying = results[model]
            if underlying.startswith('ERROR'):
                # Failed probes are counted above but not listed in the file.
                continue
            if model == underlying:
                f.write(f"{model} (no alias)\n")
            else:
                f.write(f"{model} → {underlying}\n")

    print(f"\n✓ Results saved to {output_file}")
    print(rule)
# Run the sweep only when executed as a script, not when imported.
if __name__ == "__main__":
    main()