import argparse
import json
import os
import sys

from openai import OpenAI

from core.environment import EmailOpsEnv
from core.models import Action

# Mandatory environment variables with defaults per OpenEnv spec.
API_BASE_URL = os.getenv("API_BASE_URL", "https://api.openai.com/v1")
MODEL_NAME = os.getenv("MODEL_NAME", "gpt-4o-mini")
HF_TOKEN = os.getenv("HF_TOKEN")  # No default for token; mandatory for submissions.

# Upper bound on agent steps per task before the episode is abandoned.
MAX_STEPS = 15


def _build_system_prompt(goal: str) -> str:
    """Return the system prompt describing the goal and the available actions."""
    return (
        "You are an intelligent email operations agent. "
        f"Your current goal is: {goal}\n"
        "You must perform actions to achieve this goal. "
        "Once you are finished, output the 'submit' action.\n"
        "Available action types:\n"
        " - open_email (requires email_id)\n"
        " - close_email\n"
        " - move_email (requires email_id, folder_name)\n"
        " - reply (requires email_id, reply_body)\n"
        " - delete_email (requires email_id)\n"
        " - flag_email (requires email_id)\n"
        " - submit"
    )


def _run_task(client: OpenAI, env: EmailOpsEnv, model_name: str, task_name: str) -> dict:
    """Run one task episode and return its graded result summary.

    Emits the START/STEP structured log lines used by OpenEnv automated grading.
    """
    # START: Structured logging for OpenEnv automated grading
    print(f"START: {task_name}")
    obs = env.reset(task_name)
    # The goal is fixed for the whole episode, so build the prompt once
    # (it was previously rebuilt on every loop iteration).
    system_prompt = _build_system_prompt(env.task.description)

    step_count = 0
    is_done = False
    total_reward = 0.0

    while not is_done and step_count < MAX_STEPS:
        try:
            response = client.beta.chat.completions.parse(
                model=model_name,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {
                        "role": "user",
                        "content": (
                            f"Current Observation:\n{obs.model_dump_json(indent=2)}\n"
                            "What is your next action?"
                        ),
                    },
                ],
                response_format=Action,
                temperature=0.1,
            )
            action = response.choices[0].message.parsed
            if not action:
                break
            # STEP: Structured logging for OpenEnv automated grading
            print(f"STEP: {action.model_dump_json()}")
            obs, reward, is_done, _metrics = env.step(action)
            # Accumulate per-step rewards. The original overwrote this value,
            # keeping only the last step's reward despite the variable's name.
            # NOTE(review): assumes env.step returns a per-step (not cumulative)
            # reward — confirm against EmailOpsEnv.
            total_reward += reward
            # Count every step actually executed, including the final 'submit'
            # (the original `break` skipped the increment for that step).
            step_count += 1
            if action.action_type == "submit":
                break
        except Exception as e:
            # Boundary handler: log the failure and abort this episode only,
            # so the remaining tasks still run.
            print(f"Error during inference: {e}")
            break

    return {
        "task": task_name,
        "steps": step_count,
        "reward": total_reward,
        "metrics": env.metrics,
    }


def run_baseline(api_key: str, model_name: str, base_url: str) -> None:
    """Run the baseline agent over all tasks, printing structured grading logs.

    Args:
        api_key: Credential for the OpenAI-compatible endpoint (HF_TOKEN).
        model_name: Model identifier passed to the chat-completions API.
        base_url: Base URL of the OpenAI-compatible API.
    """
    client = OpenAI(api_key=api_key, base_url=base_url)
    env = EmailOpsEnv()
    tasks = ["easy", "medium", "hard"]

    print(f"Running baseline on model: {model_name}")
    print("=" * 40)

    for task_name in tasks:
        result = _run_task(client, env, model_name, task_name)
        # END: Structured logging for OpenEnv automated grading
        print(f"END: {json.dumps(result)}")
        print("-" * 40)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # Prioritizing environment variables as per requirements
    parser.add_argument("--api-key", type=str, default=HF_TOKEN)
    parser.add_argument("--model", type=str, default=MODEL_NAME)
    parser.add_argument("--base-url", type=str, default=API_BASE_URL)
    args = parser.parse_args()

    # HF_TOKEN is mandatory for automated submissions
    if not args.api_key:
        print("Please set HF_TOKEN environment variable.")
        # sys.exit over the site-injected builtin exit() in script code.
        sys.exit(1)

    run_baseline(args.api_key, args.model, args.base_url)