βš–οΈ AI Model Comparison with Crazyrouter

Compare GPT-4o vs Claude vs Gemini vs DeepSeek β€” same prompt, same API, side by side.

One of the biggest advantages of Crazyrouter is the ability to test multiple models instantly. No separate accounts, no different SDKs. Just change the model name.


Quick Comparison Script

from openai import OpenAI
import time

# One client for every provider: Crazyrouter exposes an OpenAI-compatible
# endpoint, so switching models is just a string change.
client = OpenAI(
    base_url="https://crazyrouter.com/v1",
    api_key="sk-your-crazyrouter-key"
)

# Models compared side by side with the same prompt and the same API.
MODELS = [
    "gpt-4o",
    "gpt-4o-mini",
    "claude-sonnet-4-20250514",
    "claude-haiku-3.5",
    "gemini-2.0-flash",
    "deepseek-chat",
    "deepseek-reasoner",
]

PROMPT = "Explain the difference between TCP and UDP in exactly 3 sentences."

print(f"Prompt: {PROMPT}\n")
print("=" * 60)

for model in MODELS:
    try:
        # perf_counter is the right clock for elapsed-time measurement:
        # it is monotonic, whereas time.time() can jump if the system
        # clock is adjusted mid-request.
        start = time.perf_counter()
        response = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": PROMPT}],
            max_tokens=200
        )
        elapsed = time.perf_counter() - start
        content = response.choices[0].message.content
        # NOTE(review): assumes the router always populates `usage`;
        # if a provider omits it, the except branch below reports it.
        tokens = response.usage.total_tokens

        print(f"\nπŸ€– {model}")
        print(f"⏱️  {elapsed:.2f}s | πŸ“Š {tokens} tokens")
        print(f"πŸ’¬ {content}")
        print("-" * 60)
    except Exception as e:
        # Best-effort comparison: one failing model (bad name, rate limit)
        # should not abort the whole run, so report and continue.
        print(f"\n❌ {model}: {e}")
        print("-" * 60)

Benchmark: Speed Test

import time
from openai import OpenAI

client = OpenAI(
    base_url="https://crazyrouter.com/v1",
    api_key="sk-your-crazyrouter-key"
)

def benchmark(model, prompt, runs=3):
    """Return the mean wall-clock latency in seconds over `runs` completions.

    Each run sends `prompt` to `model` and waits for the full response, so
    network transit and provider queueing time are included in the figure.
    """
    times = []
    for _ in range(runs):
        # Monotonic clock: unlike time.time(), perf_counter cannot go
        # backwards if the system clock is adjusted during a request.
        start = time.perf_counter()
        client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            max_tokens=100
        )
        times.append(time.perf_counter() - start)
    return sum(times) / len(times)

models = ["gpt-4o-mini", "claude-haiku-3.5", "gemini-2.0-flash", "deepseek-chat"]
prompt = "What is 2+2? Reply with just the number."

print("Speed Benchmark (avg of 3 runs)")
print("=" * 40)
for m in models:
    avg = benchmark(m, prompt)
    print(f"{m:30s} {avg:.2f}s")

Coding Comparison

# Multi-line spec prompt: concrete, verifiable requirements make it easier
# to compare code quality across models.
CODING_PROMPT = """Write a Python function that:
1. Takes a list of integers
2. Returns the longest increasing subsequence
3. Include type hints and a docstring
"""

CODING_MODELS = [
    "gpt-4o",
    "claude-sonnet-4-20250514",
    "deepseek-chat",
    "gemini-2.0-flash",
]

for model in CODING_MODELS:
    # Same error handling as the quick-comparison loop: one failing model
    # (bad name, rate limit) should not abort the whole comparison.
    try:
        response = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": CODING_PROMPT}],
            max_tokens=500
        )
    except Exception as e:
        print(f"\n❌ {model}: {e}")
        continue
    print(f"\n{'='*60}")
    print(f"πŸ€– {model}")
    print(f"{'='*60}")
    print(response.choices[0].message.content)

Reasoning Comparison

Test models that support chain-of-thought reasoning:

# Classic trick question: probes whether the model actually reasons
# rather than pattern-matching "17 - 9".
REASONING_PROMPT = """A farmer has 17 sheep. All but 9 die. How many sheep are left?
Think step by step."""

REASONING_MODELS = [
    "gpt-4o",
    "o3-mini",
    "deepseek-reasoner",
    "claude-sonnet-4-20250514",
]

# NOTE(review): assumes the router accepts `max_tokens` for reasoning models
# such as o3-mini as well — confirm, since some providers expect
# `max_completion_tokens` for that family.
for model in REASONING_MODELS:
    reply = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": REASONING_PROMPT}],
        max_tokens=300,
    )
    preview = reply.choices[0].message.content[:200]
    print(f"\nπŸ€– {model}: {preview}")

Cost Comparison

# Approximate pricing per 1M tokens (input/output)
PRICING = {
    "gpt-4o": {"input": 2.50, "output": 10.00},
    "gpt-4o-mini": {"input": 0.15, "output": 0.60},
    "claude-sonnet-4-20250514": {"input": 3.00, "output": 15.00},
    "claude-haiku-3.5": {"input": 0.80, "output": 4.00},
    "gemini-2.0-flash": {"input": 0.10, "output": 0.40},
    "deepseek-chat": {"input": 0.14, "output": 0.28},
}

def estimate_cost(model, input_tokens, output_tokens):
    """Return the estimated USD cost of one request against `model`.

    Models missing from PRICING fall back to zero-cost rates, so the
    result is 0.0 rather than a KeyError.
    """
    rates = PRICING.get(model, {"input": 0, "output": 0})
    dollars_per_million = input_tokens * rates["input"] + output_tokens * rates["output"]
    return dollars_per_million / 1_000_000

# Example: 1000 requests, avg 500 input + 200 output tokens each
requests = 1000
input_tok = 500
output_tok = 200

print(f"Cost estimate for {requests} requests ({input_tok} in / {output_tok} out tokens each):\n")
# Iterate keys only: the per-token rates are looked up inside estimate_cost,
# so the previous `.items()` value was unused.
for model in PRICING:
    cost = requests * estimate_cost(model, input_tok, output_tok)
    print(f"  {model:30s} ${cost:.4f}")

When to Use Which Model

| Use Case | Recommended Model | Why |
|---|---|---|
| General chat | gpt-4o-mini | Fast, cheap, good quality |
| Complex analysis | gpt-4o or claude-sonnet-4-20250514 | Best reasoning |
| Coding | deepseek-chat or claude-sonnet-4-20250514 | Strong code generation |
| Long documents | gemini-2.0-flash | 1M token context |
| Math/Logic | deepseek-reasoner or o3-mini | Chain-of-thought |
| Budget tasks | deepseek-chat | $0.14/1M input |
| Speed critical | gemini-2.0-flash | Fastest response |

Try It Live

πŸ‘‰ Crazyrouter Demo on Hugging Face β€” switch models in real-time


Links

Downloads last month

-

Downloads are not tracked for this model. How to track
Inference Providers NEW
This model isn't deployed by any Inference Provider. πŸ™‹ Ask for provider support