Spaces:

ujjwalpardeshi
/

pytorch-training-debugger

Sleeping

File size: 3,917 Bytes

7336adb

#!/usr/bin/env python3
"""Run heuristic + multiple LLM baselines and show comparison table.

Usage:
    python3 run_all_baselines.py
"""

from __future__ import annotations

import json
import os
import sys
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path

# Load .env
_env_path = Path(__file__).parent / ".env"
if _env_path.exists():
    for line in _env_path.read_text().splitlines():
        line = line.strip()
        if line and not line.startswith("#") and "=" in line:
            key, _, value = line.partition("=")
            os.environ.setdefault(key.strip(), value.strip())

from baseline_heuristic import ALL_TASKS
from baseline_heuristic import run_heuristic_episode
from baseline_inference import PROVIDERS, run_llm_episode

try:
    from openai import OpenAI
except ImportError:
    print("Error: pip install openai")
    sys.exit(1)


def run_heuristic() -> dict[str, float]:
    scores = {}
    for task_id in ALL_TASKS:
        scores[task_id] = round(run_heuristic_episode(task_id), 4)
    return scores


def run_llm_provider(provider_name: str, model: str | None = None) -> dict[str, float]:
    prov = PROVIDERS[provider_name]
    api_key = os.environ.get(prov["env_key"])
    if not api_key:
        return {t: -1.0 for t in ALL_TASKS}  # -1 = no key

    model_name = model or prov["default_model"]
    client_kwargs: dict = {"api_key": api_key}
    if prov["base_url"]:
        client_kwargs["base_url"] = prov["base_url"]
    client = OpenAI(**client_kwargs)

    scores: dict[str, float] = {}
    for task_id in ALL_TASKS:
        try:
            score = run_llm_episode(task_id, client, model_name)
            scores[task_id] = round(score, 4)
            print(f"  [{provider_name}/{model_name}] {task_id}: {score:.4f}", file=sys.stderr)
        except Exception as e:
            err_str = str(e)[:80]
            print(f"  [{provider_name}/{model_name}] {task_id}: ERROR — {err_str}", file=sys.stderr)
            scores[task_id] = 0.0
    return scores


def main() -> None:
    print("Running all baselines...\n", file=sys.stderr)

    results: dict[str, dict[str, float]] = {}

    # Run heuristic first (fast, deterministic)
    print("--- Heuristic baseline ---", file=sys.stderr)
    results["Heuristic"] = run_heuristic()
    print(f"  Done: {json.dumps(results['Heuristic'])}", file=sys.stderr)

    # Run LLM providers sequentially (avoids thread hang issues)
    llm_runs = [
        ("Cerebras/Llama-3.1-8B", "cerebras", "llama3.1-8b"),
        ("Groq/Llama-3.1-8B", "groq", "llama-3.1-8b-instant"),
    ]

    for label, provider, model in llm_runs:
        print(f"\n--- {label} ---", file=sys.stderr)
        try:
            results[label] = run_llm_provider(provider, model)
        except Exception as e:
            print(f"  {label}: FAILED — {e}", file=sys.stderr)
            results[label] = {t: 0.0 for t in ALL_TASKS}

    # Print comparison table
    print("\n" + "=" * 80)
    print("BASELINE COMPARISON TABLE")
    print("=" * 80)

    headers = list(results.keys())
    print(f"\n{'Task':<12}", end="")
    for h in headers:
        print(f"{h:>25}", end="")
    print()
    print("-" * (12 + 25 * len(headers)))

    for task_id in ALL_TASKS:
        print(f"{task_id:<12}", end="")
        for h in headers:
            score = results[h].get(task_id, 0.0)
            if score < 0:
                print(f"{'no key':>25}", end="")
            else:
                print(f"{score:>25.4f}", end="")
        print()

    print("-" * (12 + 25 * len(headers)))

    # Averages
    print(f"{'AVERAGE':<12}", end="")
    for h in headers:
        valid = [v for v in results[h].values() if v >= 0]
        avg = sum(valid) / len(valid) if valid else 0
        print(f"{avg:>25.4f}", end="")
    print()

    # Save JSON
    print(json.dumps(results, indent=2))


if __name__ == "__main__":
    main()