#!/usr/bin/env python3
"""
Baseline inference script for Customer Support Environment.

Uses OpenAI-compatible API to run a baseline agent on all three tasks and
report scores.

Environment variables:
    HF_TOKEN (or API_KEY)  API token; required.
    API_BASE_URL           Optional; defaults to https://router.huggingface.co/v1
    MODEL_NAME             Optional; defaults to meta-llama/Llama-3.3-70B-Instruct
                           (overridden by --model).

Usage:
    export HF_TOKEN="your-token-here"
    export API_BASE_URL="https://router.huggingface.co/v1"
    export MODEL_NAME="meta-llama/Llama-3.3-70B-Instruct"
    python baseline.py --task easy --episodes 100
    python baseline.py --task all --episodes 50
"""

import argparse
import json
import os
import sys
from typing import Dict, List

from openai import OpenAI

# Import environment components
from server.customer_support_env_environment import CustomerSupportEnvironment
from models import CustomerSupportAction, CustomerSupportObservation


class OpenAIBaselineAgent:
    """Baseline agent using OpenAI-compatible API for ticket handling"""

    # Per-task instruction line placed at the top of the prompt. Any task id
    # that is not listed here is treated as the "hard" task.
    _TASK_INSTRUCTIONS = {
        "easy": (
            "Categorize this support ticket into one of: billing, technical, "
            "account, shipping, general."
        ),
        "medium": (
            "Categorize the ticket, assign a priority (low/medium/high/critical), "
            "and route to the appropriate team "
            "(tier1/tier2/billing/technical/management)."
        ),
    }
    _HARD_INSTRUCTIONS = (
        "Fully handle this ticket: categorize, prioritize, route to the right "
        "team, and draft a professional response."
    )

    # Values used when the model's answer is unusable, either entirely
    # (API/parse failure) or for an individual missing field.
    _DEFAULT_CATEGORY = "general"
    _DEFAULT_PRIORITY = "medium"
    _DEFAULT_TEAM = "tier1"
    _DEFAULT_RESPONSE = (
        "Thank you for contacting support. We'll review your request and get "
        "back to you shortly."
    )

    def __init__(
        self,
        api_key: str,
        base_url: str = "https://router.huggingface.co/v1",
        model: str = "meta-llama/Llama-3.3-70B-Instruct",
    ):
        """
        Create the API client.

        Args:
            api_key: Token passed to the OpenAI-compatible client
            base_url: Base URL of the OpenAI-compatible endpoint
            model: Model name sent with every chat completion request
        """
        self.client = OpenAI(api_key=api_key, base_url=base_url)
        self.model = model

    def _build_prompt(self, observation: CustomerSupportObservation, task_id: str) -> str:
        """Render the user prompt for one ticket observation."""
        task_instructions = self._TASK_INSTRUCTIONS.get(task_id, self._HARD_INSTRUCTIONS)
        return f"""You are a customer support AI assistant.

{task_instructions}

TICKET INFORMATION:
- ID: {observation.ticket_id}
- Channel: {observation.channel}
- Timestamp: {observation.timestamp}

CUSTOMER MESSAGE:
{observation.customer_message}

CUSTOMER HISTORY:
- Account Age: {observation.account_age_days} days
- Total Tickets: {observation.total_tickets}
- Resolved Tickets: {observation.resolved_tickets}
- Satisfaction Score: {observation.satisfaction_score}/5.0
- Premium Customer: {'Yes' if observation.is_premium else 'No'}
- Lifetime Value: ${observation.lifetime_value:.2f}

Based on this information, provide your response in JSON format with these fields:
{{
    "category": "billing" | "technical" | "account" | "shipping" | "general",
    "priority": "low" | "medium" | "high" | "critical",
    "assigned_team": "tier1" | "tier2" | "billing" | "technical" | "management",
    "response_draft": "Your professional response to the customer (minimum 20 characters)",
    "escalate": true | false
}}

Respond with ONLY the JSON, no additional text."""

    @staticmethod
    def _strip_code_fence(content: str) -> str:
        """Return *content* without a surrounding markdown code fence, if any."""
        text = content.strip()
        if text.startswith("```"):
            # Element 1 of the split is whatever sits between the opening
            # fence and the next fence (or the end of the text).
            text = text.split("```")[1]
            if text.startswith("json"):
                text = text[4:]
            text = text.strip()
        return text

    def get_action(self, observation: CustomerSupportObservation, task_id: str) -> CustomerSupportAction:
        """
        Get agent action using OpenAI API.

        Any failure (API error, empty or non-JSON reply, action construction
        error) is printed and answered with a default action instead of being
        raised. A reply that parses but lacks individual fields keeps the
        fields it does have; only the missing ones fall back to defaults.

        Args:
            observation: Current observation from environment
            task_id: Task difficulty level

        Returns:
            CustomerSupportAction based on LLM response
        """
        prompt = self._build_prompt(observation, task_id)

        # Pre-initialised so the error path can report it without inspecting
        # locals().
        content = None
        try:
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[
                    {
                        "role": "system",
                        "content": "You are a customer support expert. Always respond with valid JSON.",
                    },
                    {"role": "user", "content": prompt},
                ],
                temperature=0.3,  # Low temperature for consistent decisions
                max_tokens=500,
            )

            # message.content may be None; treat that as an empty reply so the
            # failure surfaces as a JSON parse error rather than AttributeError.
            content = self._strip_code_fence(response.choices[0].message.content or "")
            action_dict = json.loads(content)

            # `or` also covers explicit nulls returned by the model.
            return CustomerSupportAction(
                category=action_dict.get("category") or self._DEFAULT_CATEGORY,
                priority=action_dict.get("priority") or self._DEFAULT_PRIORITY,
                assigned_team=action_dict.get("assigned_team") or self._DEFAULT_TEAM,
                response_draft=action_dict.get("response_draft") or self._DEFAULT_RESPONSE,
                internal_notes=None,
                escalate=action_dict.get("escalate", False),
            )

        except Exception as e:
            # Deliberately broad: a single bad reply must not abort an
            # evaluation run of many episodes.
            print(f"Error calling OpenAI API: {e}")
            print(f"Response content: {content if content is not None else 'N/A'}")

            # Return a reasonable default action
            return CustomerSupportAction(
                category=self._DEFAULT_CATEGORY,
                priority=self._DEFAULT_PRIORITY,
                assigned_team=self._DEFAULT_TEAM,
                response_draft=self._DEFAULT_RESPONSE,
                escalate=False,
            )


def run_episode(env: CustomerSupportEnvironment, agent: OpenAIBaselineAgent, task_id: str) -> Dict:
    """
    Run a single episode.

    An episode is one reset, one agent action and one environment step.

    Args:
        env: Environment instance
        agent: Agent instance
        task_id: Task difficulty

    Returns:
        Dict with episode results
    """
    obs = env.reset()
    action = agent.get_action(obs, task_id)
    obs = env.step(action)

    return {
        "reward": obs.reward,
        "grader_score": obs.metadata["grader_score"],
        "cumulative_reward": obs.metadata["cumulative_reward"],
        "ground_truth": obs.metadata["ground_truth"],
        "agent_action": obs.metadata["agent_action"],
    }


def evaluate_task(task_id: str, num_episodes: int, agent: OpenAIBaselineAgent) -> Dict:
    """
    Evaluate agent on a specific task.

    Args:
        task_id: Task difficulty
        num_episodes: Number of episodes to run
        agent: Agent instance

    Returns:
        Dict with evaluation results

    Raises:
        ValueError: If num_episodes is less than 1 (no statistics can be
            computed from zero episodes).
    """
    if num_episodes < 1:
        raise ValueError(f"num_episodes must be at least 1, got {num_episodes}")

    print(f"\n{'='*70}")
    print(f"Evaluating Task: {task_id.upper()}")
    print(f"{'='*70}")

    env = CustomerSupportEnvironment(task_id=task_id, seed=42)
    success_threshold = env.task_configs[task_id]["success_threshold"]

    results = []
    score_total = 0
    for episode in range(num_episodes):
        result = run_episode(env, agent, task_id)
        results.append(result)
        score_total += result["grader_score"]

        # Progress line every 10 episodes with the running average.
        if (episode + 1) % 10 == 0:
            print(f"Episode {episode + 1}/{num_episodes} - Avg Score: {score_total / len(results):.3f}")

    # Calculate statistics
    scores = [r["grader_score"] for r in results]
    rewards = [r["reward"] for r in results]

    avg_score = sum(scores) / len(scores)
    avg_reward = sum(rewards) / len(rewards)
    success_rate = sum(1 for s in scores if s >= success_threshold) / len(scores)
    min_score = min(scores)
    max_score = max(scores)

    print(f"\n{'-'*70}")
    print(f"Results for {task_id.upper()} task:")
    print(f" Average Grader Score: {avg_score:.3f}")
    print(f" Average Reward: {avg_reward:.3f}")
    print(f" Success Rate: {success_rate:.1%} (threshold: {success_threshold})")
    print(f" Min Score: {min_score:.3f}")
    print(f" Max Score: {max_score:.3f}")
    print(f"{'-'*70}")

    return {
        "task_id": task_id,
        "num_episodes": num_episodes,
        "avg_score": avg_score,
        "avg_reward": avg_reward,
        "success_rate": success_rate,
        "min_score": min_score,
        "max_score": max_score,
        "all_results": results,
    }


def main():
    """Parse CLI arguments, evaluate the requested tasks and save the results as JSON."""
    parser = argparse.ArgumentParser(description="Run baseline inference on Customer Support Environment")
    parser.add_argument(
        "--task",
        type=str,
        default="all",
        choices=["easy", "medium", "hard", "all"],
        help="Task difficulty to evaluate (default: all)",
    )
    parser.add_argument(
        "--episodes", type=int, default=50, help="Number of episodes per task (default: 50)"
    )
    parser.add_argument(
        "--model", type=str, default=None, help="Model to use (default: MODEL_NAME env var)"
    )
    parser.add_argument(
        "--output",
        type=str,
        default="baseline_results.json",
        help="Output file for results (default: baseline_results.json)",
    )

    args = parser.parse_args()

    # Reject this up front: averaging over zero episodes would otherwise fail
    # only after the agent has been initialised.
    if args.episodes < 1:
        parser.error("--episodes must be at least 1")

    # Check for API key
    api_key = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
    if not api_key:
        print("Error: HF_TOKEN or API_KEY environment variable not set.")
        print("Please set it with: export HF_TOKEN='your-token-here'")
        sys.exit(1)

    api_base_url = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
    model_name = args.model or os.getenv("MODEL_NAME", "meta-llama/Llama-3.3-70B-Instruct")

    # Initialize agent
    print(f"Initializing baseline agent (model: {model_name})...")
    print(f"API Base URL: {api_base_url}")
    agent = OpenAIBaselineAgent(api_key=api_key, base_url=api_base_url, model=model_name)

    # Determine which tasks to run
    tasks = ["easy", "medium", "hard"] if args.task == "all" else [args.task]

    # Run evaluations
    all_results = {}
    for task in tasks:
        all_results[task] = evaluate_task(task, args.episodes, agent)

    # Print summary
    print(f"\n{'='*70}")
    print("SUMMARY")
    print(f"{'='*70}")
    for task, result in all_results.items():
        print(f"{task.upper():8s} | Score: {result['avg_score']:.3f} | Success: {result['success_rate']:.1%}")
    print(f"{'='*70}")

    # Save results
    # NOTE(review): "outputs" is created as before, but the results file goes
    # to args.output, which need not live under it - confirm whether this
    # directory is still needed.
    os.makedirs("outputs", exist_ok=True)
    # Make sure the directory that will actually receive the file exists.
    output_dir = os.path.dirname(args.output)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(args.output, "w", encoding="utf-8") as f:
        # default=str stringifies any value json cannot serialise natively.
        json.dump(all_results, f, indent=2, default=str)
    print(f"\nResults saved to: {args.output}")


if __name__ == "__main__":
    main()