| """Baseline inference script for the Scheduling Optimisation Environment. |
| |
| Runs GPT-4o-mini (or falls back to deterministic mock responses) against all |
| three tasks and prints a structured score report. |
| |
| Usage: |
| OPENAI_API_KEY=sk-... python baseline.py |
| """ |
|
|
| from __future__ import annotations |
|
|
| import json |
| import os |
| import sys |
| from typing import Any |
|
|
| from environment import INSTANCE_BANK |
| from graders.grader_classification import ConflictGrader |
| from graders.grader_detection import FeasibilityGrader |
| from graders.grader_fix import RepairGrader |
| from models import Action |
|
|
|
|
| def _get_openai_client(): |
| """Return an OpenAI client, or None if unavailable.""" |
| api_key = os.environ.get("OPENAI_API_KEY", "") |
| if not api_key: |
| return None |
| try: |
| from openai import OpenAI |
| return OpenAI(api_key=api_key) |
| except Exception: |
| return None |
|
|
|
|
| def _llm_response(client, system_prompt: str, user_prompt: str) -> str: |
| """Call GPT-4o-mini and return the response text.""" |
| try: |
| resp = client.chat.completions.create( |
| model="gpt-4o-mini", |
| messages=[ |
| {"role": "system", "content": system_prompt}, |
| {"role": "user", "content": user_prompt}, |
| ], |
| max_tokens=1024, |
| temperature=0.0, |
| ) |
| return resp.choices[0].message.content.strip() |
| except Exception as e: |
| print(f" [LLM error: {e}]") |
| return "" |
|
|
|
|
| |
| |
| |
|
|
| |
| _MOCK_FEASIBILITY: dict[int, str] = { |
| 0: "infeasible", 1: "infeasible", 2: "infeasible", 3: "infeasible", |
| 4: "infeasible", 5: "infeasible", 6: "infeasible", 7: "infeasible", |
| 8: "infeasible", 9: "infeasible", 10: "feasible", 11: "feasible", |
| } |
|
|
| |
| _MOCK_CLASSIFICATION: dict[int, str] = { |
| 0: "resource_overload", |
| 1: "deadline_violation", |
| 2: "precedence_violation", |
| 3: "availability_conflict", |
| 4: "capacity_exceeded", |
| 5: "resource_overload", |
| 6: "deadline_violation", |
| 7: "precedence_violation", |
| 8: "availability_conflict", |
| 9: "capacity_exceeded", |
| } |
|
|
|
|
def _mock_repair(instance_idx: int) -> str:
    """Serialise the stored repair answer for one bank entry (mock mode).

    Args:
        instance_idx: Index of the entry in ``INSTANCE_BANK``.

    Returns:
        JSON text of the entry's ``optimal_schedule``. When that key is
        missing or its value is empty, the instance's ``proposed_schedule``
        is used instead (``{}`` if that is missing as well).
    """
    bank_entry = INSTANCE_BANK[instance_idx]
    schedule = bank_entry.get("optimal_schedule", {}) or bank_entry["instance"].get(
        "proposed_schedule", {}
    )
    return json.dumps(schedule)
|
|
|
|
| |
| |
| |
|
|
|
|
def run_baseline() -> dict[str, Any]:
    """Score the baseline agent on all three tasks and return the results.

    Answers come from GPT-4o-mini when ``_get_openai_client`` yields a client;
    otherwise from the mock tables and ``_mock_repair``. A line per instance,
    a per-task average and the overall average are printed along the way.

    Returns:
        A dict with ``"mode"``, ``"tasks"`` (one summary per task id holding
        ``average_score``, ``num_instances`` and ``scores``) and
        ``"overall_average"``, the unweighted mean of the three task averages
        rounded to four decimals.
    """
    llm = _get_openai_client()
    llm_enabled = llm is not None
    mode = "GPT-4o-mini" if llm_enabled else "mock (no API key — oracle responses)"
    rule = "=" * 65
    print(f"\n{rule}")
    print(f" SchedulingOptEnv — Baseline Evaluation ({mode})")
    print(f"{rule}\n")

    results: dict[str, Any] = {"mode": mode, "tasks": {}}

    def _close_task(task_id: str, values: list[float]) -> float:
        """Store the summary for one task, print its average, and return it."""
        mean = sum(values) / len(values) if values else 0.0
        results["tasks"][task_id] = {
            "average_score": round(mean, 4),
            "num_instances": len(values),
            "scores": values,
        }
        print(f" >> Average: {mean:.3f}\n")
        return mean

    # System prompts sent to the model in LLM mode, one per task.
    feasibility_prompt = (
        "You are a scheduling expert. Determine if the proposed schedule "
        "satisfies all constraints. Reply with ONLY 'feasible' or 'infeasible'."
    )
    classification_prompt = (
        "You are a scheduling expert. Identify the constraint violation type. "
        "Reply with ONLY one of: resource_overload, deadline_violation, "
        "precedence_violation, availability_conflict, capacity_exceeded."
    )
    repair_prompt = (
        "You are a scheduling expert. Repair the infeasible schedule by "
        "returning a JSON object with key 'assignments': a list of "
        '{"job_id", "machine_id", "start_time"} dicts that satisfies all '
        "constraints and minimises makespan. Return ONLY valid JSON."
    )

    # ---- Task 1: every instance in the bank ---------------------------------
    feas_grader = FeasibilityGrader()
    feas_scores: list[float] = []
    print("Task 1: Feasibility Check (easy)")
    for idx, item in enumerate(INSTANCE_BANK):
        payload = json.dumps(item["instance"], indent=2)
        if llm_enabled:
            answer = _llm_response(llm, feasibility_prompt, payload)
        else:
            answer = _MOCK_FEASIBILITY.get(idx, "infeasible")
        points = feas_grader.grade(
            Action(response=answer, task_id="feasibility_check"), item
        )
        feas_scores.append(points)
        if points >= 0.95:
            status = "CORRECT"
        else:
            status = "wrong"
        expected = "feasible" if item["is_feasible"] else "infeasible"
        print(
            f" Instance {idx:2d}: {status:7s} (score={points:.2f}) "
            f"expected={expected} [{item['description'][:45]}]"
        )
    avg_feas = _close_task("feasibility_check", feas_scores)

    # ---- Task 2: infeasible instances only ----------------------------------
    conf_grader = ConflictGrader()
    conf_scores: list[float] = []
    infeasible_items = [
        (idx, item) for idx, item in enumerate(INSTANCE_BANK) if not item["is_feasible"]
    ]
    print("Task 2: Conflict Classification (medium)")
    for idx, item in infeasible_items:
        payload = json.dumps(item["instance"], indent=2)
        if llm_enabled:
            answer = _llm_response(llm, classification_prompt, payload)
        else:
            answer = _MOCK_CLASSIFICATION.get(idx, "resource_overload")
        points = conf_grader.grade(
            Action(response=answer, task_id="conflict_classification"), item
        )
        conf_scores.append(points)
        if points >= 0.95:
            status = "EXACT"
        elif points >= 0.45:
            status = "partial"
        else:
            status = "wrong"
        print(
            f" Instance {idx:2d}: {status:7s} (score={points:.2f}) "
            f"expected={item['violation_type']}"
        )
    avg_conf = _close_task("conflict_classification", conf_scores)

    # ---- Task 3: infeasible instances that have a stored optimal schedule ---
    repair_grader = RepairGrader()
    repair_scores: list[float] = []
    repairable_items = [
        (idx, item)
        for idx, item in enumerate(INSTANCE_BANK)
        if not item["is_feasible"] and item.get("optimal_schedule")
    ]
    print("Task 3: Schedule Repair (hard)")
    for idx, item in repairable_items:
        payload = json.dumps(item["instance"], indent=2)
        if llm_enabled:
            answer = _llm_response(llm, repair_prompt, payload)
        else:
            answer = _mock_repair(idx)
        points = repair_grader.grade(
            Action(response=answer, task_id="schedule_repair"), item
        )
        repair_scores.append(points)
        print(
            f" Instance {idx:2d}: score={points:.2f} "
            f"optimal_makespan={item['optimal_makespan']} "
            f"[{item['description'][:45]}]"
        )
    avg_repair = _close_task("schedule_repair", repair_scores)

    # ---- Overall: unweighted mean of the three task averages ----------------
    overall = (avg_feas + avg_conf + avg_repair) / 3
    results["overall_average"] = round(overall, 4)
    print(f"{rule}")
    print(f" Overall Average Score: {overall:.3f}")
    print(f"{rule}\n")

    return results
|
|
|
|
if __name__ == "__main__":
    # Script entry point: run the full evaluation; on any failure, report it
    # on stderr and exit with status 1 instead of showing a traceback.
    try:
        run_baseline()
    except Exception as exc:
        print(f"Baseline failed: {exc}", file=sys.stderr)
        sys.exit(1)
|
|