openenv-hackathon / baseline.py
Vittal-M's picture
Upload baseline.py with huggingface_hub
325052f verified
"""Baseline inference script for the Scheduling Optimisation Environment.
Runs GPT-4o-mini (or falls back to deterministic mock responses) against all
three tasks and prints a structured score report.
Usage:
OPENAI_API_KEY=sk-... python baseline.py
"""
from __future__ import annotations
import json
import os
import sys
from typing import Any
from environment import INSTANCE_BANK
from graders.grader_classification import ConflictGrader
from graders.grader_detection import FeasibilityGrader
from graders.grader_fix import RepairGrader
from models import Action
def _get_openai_client():
"""Return an OpenAI client, or None if unavailable."""
api_key = os.environ.get("OPENAI_API_KEY", "")
if not api_key:
return None
try:
from openai import OpenAI
return OpenAI(api_key=api_key)
except Exception:
return None
def _llm_response(client, system_prompt: str, user_prompt: str) -> str:
"""Call GPT-4o-mini and return the response text."""
try:
resp = client.chat.completions.create(
model="gpt-4o-mini",
messages=[
{"role": "system", "content": system_prompt},
{"role": "user", "content": user_prompt},
],
max_tokens=1024,
temperature=0.0,
)
return resp.choices[0].message.content.strip()
except Exception as e:
print(f" [LLM error: {e}]")
return ""
# ---------------------------------------------------------------------------
# Mock fallback responses (used when no API key is available)
# ---------------------------------------------------------------------------
# Ground-truth feasibility labels — index aligns with INSTANCE_BANK
_MOCK_FEASIBILITY: dict[int, str] = {
0: "infeasible", 1: "infeasible", 2: "infeasible", 3: "infeasible",
4: "infeasible", 5: "infeasible", 6: "infeasible", 7: "infeasible",
8: "infeasible", 9: "infeasible", 10: "feasible", 11: "feasible",
}
# Ground-truth violation types for infeasible instances
_MOCK_CLASSIFICATION: dict[int, str] = {
0: "resource_overload",
1: "deadline_violation",
2: "precedence_violation",
3: "availability_conflict",
4: "capacity_exceeded",
5: "resource_overload",
6: "deadline_violation",
7: "precedence_violation",
8: "availability_conflict",
9: "capacity_exceeded",
}
def _mock_repair(instance_idx: int) -> str:
"""Return the known optimal schedule JSON for mock mode."""
entry = INSTANCE_BANK[instance_idx]
optimal = entry.get("optimal_schedule", {})
if not optimal:
# Return the proposed schedule unchanged as a safe fallback
optimal = entry["instance"].get("proposed_schedule", {})
return json.dumps(optimal)
# ---------------------------------------------------------------------------
# Baseline runner
# ---------------------------------------------------------------------------
def run_baseline() -> dict[str, Any]:
"""Execute the baseline across all three tasks and return scores."""
client = _get_openai_client()
use_llm = client is not None
mode = "GPT-4o-mini" if use_llm else "mock (no API key — oracle responses)"
print(f"\n{'='*65}")
print(f" SchedulingOptEnv — Baseline Evaluation ({mode})")
print(f"{'='*65}\n")
results: dict[str, Any] = {"mode": mode, "tasks": {}}
# ----- Task 1: Feasibility Check -----
feas_grader = FeasibilityGrader()
feas_scores: list[float] = []
print("Task 1: Feasibility Check (easy)")
for i, entry in enumerate(INSTANCE_BANK):
instance_str = json.dumps(entry["instance"], indent=2)
if use_llm:
resp = _llm_response(
client,
(
"You are a scheduling expert. Determine if the proposed schedule "
"satisfies all constraints. Reply with ONLY 'feasible' or 'infeasible'."
),
instance_str,
)
else:
resp = _MOCK_FEASIBILITY.get(i, "infeasible")
action = Action(response=resp, task_id="feasibility_check")
score = feas_grader.grade(action, entry)
feas_scores.append(score)
status = "CORRECT" if score >= 0.95 else "wrong"
expected = "feasible" if entry["is_feasible"] else "infeasible"
print(
f" Instance {i:2d}: {status:7s} (score={score:.2f}) "
f"expected={expected} [{entry['description'][:45]}]"
)
avg_feas = sum(feas_scores) / len(feas_scores) if feas_scores else 0.0
results["tasks"]["feasibility_check"] = {
"average_score": round(avg_feas, 4),
"num_instances": len(feas_scores),
"scores": feas_scores,
}
print(f" >> Average: {avg_feas:.3f}\n")
# ----- Task 2: Conflict Classification -----
conf_grader = ConflictGrader()
conf_scores: list[float] = []
infeasible_entries = [(i, e) for i, e in enumerate(INSTANCE_BANK) if not e["is_feasible"]]
print("Task 2: Conflict Classification (medium)")
for i, entry in infeasible_entries:
instance_str = json.dumps(entry["instance"], indent=2)
if use_llm:
resp = _llm_response(
client,
(
"You are a scheduling expert. Identify the constraint violation type. "
"Reply with ONLY one of: resource_overload, deadline_violation, "
"precedence_violation, availability_conflict, capacity_exceeded."
),
instance_str,
)
else:
resp = _MOCK_CLASSIFICATION.get(i, "resource_overload")
action = Action(response=resp, task_id="conflict_classification")
score = conf_grader.grade(action, entry)
conf_scores.append(score)
status = "EXACT" if score >= 0.95 else ("partial" if score >= 0.45 else "wrong")
print(
f" Instance {i:2d}: {status:7s} (score={score:.2f}) "
f"expected={entry['violation_type']}"
)
avg_conf = sum(conf_scores) / len(conf_scores) if conf_scores else 0.0
results["tasks"]["conflict_classification"] = {
"average_score": round(avg_conf, 4),
"num_instances": len(conf_scores),
"scores": conf_scores,
}
print(f" >> Average: {avg_conf:.3f}\n")
# ----- Task 3: Schedule Repair -----
repair_grader = RepairGrader()
repair_scores: list[float] = []
repairable = [
(i, e) for i, e in enumerate(INSTANCE_BANK)
if not e["is_feasible"] and e.get("optimal_schedule")
]
print("Task 3: Schedule Repair (hard)")
for i, entry in repairable:
instance_str = json.dumps(entry["instance"], indent=2)
if use_llm:
resp = _llm_response(
client,
(
"You are a scheduling expert. Repair the infeasible schedule by "
"returning a JSON object with key 'assignments': a list of "
'{"job_id", "machine_id", "start_time"} dicts that satisfies all '
"constraints and minimises makespan. Return ONLY valid JSON."
),
instance_str,
)
else:
resp = _mock_repair(i)
action = Action(response=resp, task_id="schedule_repair")
score = repair_grader.grade(action, entry)
repair_scores.append(score)
print(
f" Instance {i:2d}: score={score:.2f} "
f"optimal_makespan={entry['optimal_makespan']} "
f"[{entry['description'][:45]}]"
)
avg_repair = sum(repair_scores) / len(repair_scores) if repair_scores else 0.0
results["tasks"]["schedule_repair"] = {
"average_score": round(avg_repair, 4),
"num_instances": len(repair_scores),
"scores": repair_scores,
}
print(f" >> Average: {avg_repair:.3f}\n")
# ----- Summary -----
overall = (avg_feas + avg_conf + avg_repair) / 3
results["overall_average"] = round(overall, 4)
print(f"{'='*65}")
print(f" Overall Average Score: {overall:.3f}")
print(f"{'='*65}\n")
return results
if __name__ == "__main__":
try:
run_baseline()
except Exception as e:
print(f"Baseline failed: {e}", file=sys.stderr)
sys.exit(1)