Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| """ | |
| Baseline inference script for Customer Support Environment. | |
| Uses OpenAI-compatible API to run a baseline agent on all three tasks and report scores. | |
| Requires the HF_TOKEN (or API_KEY) environment variable. API_BASE_URL and MODEL_NAME are optional and fall back to the values shown in the usage example below. | |
| Usage: | |
| export HF_TOKEN="your-token-here" | |
| export API_BASE_URL="https://router.huggingface.co/v1" | |
| export MODEL_NAME="meta-llama/Llama-3.3-70B-Instruct" | |
| python baseline.py --task easy --episodes 100 | |
| python baseline.py --task all --episodes 50 | |
| """ | |
| import argparse | |
| import os | |
| import sys | |
| from typing import Dict, List | |
| import json | |
| from openai import OpenAI | |
| # Import environment components | |
| from server.customer_support_env_environment import CustomerSupportEnvironment | |
| from models import CustomerSupportAction, CustomerSupportObservation | |
class OpenAIBaselineAgent:
    """Baseline agent using OpenAI-compatible API for ticket handling.

    The agent sends one chat-completion request per ticket, asks the model
    for a JSON object describing the action, and converts that object into a
    ``CustomerSupportAction``. Any failure (API error, unparsable reply,
    rejected field values) results in a fixed default action so that an
    evaluation run is never aborted by a single bad reply.
    """

    # Values used when the model's JSON omits a field (or sets it to null),
    # and for the fallback action returned on errors.
    _DEFAULT_CATEGORY = "general"
    _DEFAULT_PRIORITY = "medium"
    _DEFAULT_TEAM = "tier1"
    _DEFAULT_RESPONSE = "Thank you for contacting support. We'll review your request and get back to you shortly."

    # Instruction sentence per task; any unknown task id is treated as "hard".
    _TASK_INSTRUCTIONS = {
        "easy": "Categorize this support ticket into one of: billing, technical, account, shipping, general.",
        "medium": "Categorize the ticket, assign a priority (low/medium/high/critical), and route to the appropriate team (tier1/tier2/billing/technical/management).",
    }
    _HARD_INSTRUCTIONS = "Fully handle this ticket: categorize, prioritize, route to the right team, and draft a professional response."

    def __init__(self, api_key: str, base_url: str = "https://router.huggingface.co/v1", model: str = "meta-llama/Llama-3.3-70B-Instruct"):
        """
        Create the API client.

        Args:
            api_key: Token passed to the OpenAI-compatible client
            base_url: Base URL of the OpenAI-compatible endpoint
            model: Model identifier sent with every request
        """
        self.client = OpenAI(api_key=api_key, base_url=base_url)
        self.model = model

    def _build_prompt(self, observation, task_id: str) -> str:
        """Render the user prompt for one ticket at the given task difficulty."""
        task_instructions = self._TASK_INSTRUCTIONS.get(task_id, self._HARD_INSTRUCTIONS)
        return f"""You are a customer support AI assistant. {task_instructions}
TICKET INFORMATION:
- ID: {observation.ticket_id}
- Channel: {observation.channel}
- Timestamp: {observation.timestamp}
CUSTOMER MESSAGE:
{observation.customer_message}
CUSTOMER HISTORY:
- Account Age: {observation.account_age_days} days
- Total Tickets: {observation.total_tickets}
- Resolved Tickets: {observation.resolved_tickets}
- Satisfaction Score: {observation.satisfaction_score}/5.0
- Premium Customer: {"Yes" if observation.is_premium else "No"}
- Lifetime Value: ${observation.lifetime_value:.2f}
Based on this information, provide your response in JSON format with these fields:
{{
"category": "billing" | "technical" | "account" | "shipping" | "general",
"priority": "low" | "medium" | "high" | "critical",
"assigned_team": "tier1" | "tier2" | "billing" | "technical" | "management",
"response_draft": "Your professional response to the customer (minimum 20 characters)",
"escalate": true | false
}}
Respond with ONLY the JSON, no additional text."""

    @staticmethod
    def _parse_json_object(content) -> dict:
        """
        Extract the first JSON object from a model reply.

        Tolerates markdown code fences (with any language tag), prose before
        the object, and trailing text after it.

        Raises:
            ValueError: If the reply is empty, contains no ``{``, or the text
                starting at the first ``{`` is not valid JSON
                (``json.JSONDecodeError`` is a ``ValueError`` subclass).
        """
        if not content:
            raise ValueError("model returned an empty response")
        start = content.find("{")
        if start == -1:
            raise ValueError("no JSON object found in model response")
        # raw_decode parses exactly one JSON value and ignores whatever
        # follows it (closing fence, explanations, ...).
        parsed, _end = json.JSONDecoder().raw_decode(content[start:])
        return parsed

    def get_action(self, observation: CustomerSupportObservation, task_id: str) -> CustomerSupportAction:
        """
        Get agent action using OpenAI API.

        Args:
            observation: Current observation from environment
            task_id: Task difficulty level

        Returns:
            CustomerSupportAction based on LLM response. Fields missing from
            (or null in) the model's JSON are filled with defaults; if the
            request, the parsing, or the action construction fails, a fully
            default action is returned instead.
        """
        prompt = self._build_prompt(observation, task_id)

        # Bound before the try so the error path can always report it.
        content = None
        try:
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[
                    {
                        "role": "system",
                        "content": "You are a customer support expert. Always respond with valid JSON.",
                    },
                    {"role": "user", "content": prompt},
                ],
                temperature=0.3,  # Low temperature for consistent decisions
                max_tokens=500,
            )
            # content may be None (e.g. refusals / empty completions);
            # _parse_json_object turns that into a ValueError.
            content = response.choices[0].message.content
            action_dict = self._parse_json_object(content)

            # Keep whatever the model did provide instead of discarding the
            # whole answer because one field is absent.
            return CustomerSupportAction(
                category=action_dict.get("category") or self._DEFAULT_CATEGORY,
                priority=action_dict.get("priority") or self._DEFAULT_PRIORITY,
                assigned_team=action_dict.get("assigned_team") or self._DEFAULT_TEAM,
                response_draft=action_dict.get("response_draft") or self._DEFAULT_RESPONSE,
                internal_notes=None,
                escalate=action_dict.get("escalate") or False,
            )
        except Exception as e:
            # Deliberately broad: this is the best-effort boundary of the
            # baseline, so network errors, bad JSON and rejected field values
            # all degrade to the default action rather than stopping the run.
            print(f"Error calling OpenAI API: {e}")
            print(f"Response content: {content if content is not None else 'N/A'}")
            # Return a reasonable default action
            return CustomerSupportAction(
                category=self._DEFAULT_CATEGORY,
                priority=self._DEFAULT_PRIORITY,
                assigned_team=self._DEFAULT_TEAM,
                response_draft=self._DEFAULT_RESPONSE,
                escalate=False,
            )
def run_episode(env: CustomerSupportEnvironment, agent: OpenAIBaselineAgent, task_id: str) -> Dict:
    """
    Play one single-step episode and collect its outcome.

    The environment is reset, the agent is asked for one action on the
    initial observation, and that action is applied with a single step.

    Args:
        env: Environment instance
        agent: Agent instance
        task_id: Task difficulty, forwarded to the agent

    Returns:
        Dict with the step's reward plus the "grader_score",
        "cumulative_reward", "ground_truth" and "agent_action" entries taken
        from the resulting observation's metadata
    """
    initial_obs = env.reset()
    chosen_action = agent.get_action(initial_obs, task_id)
    final_obs = env.step(chosen_action)

    episode_result = {"reward": final_obs.reward}
    # Copy the metadata entries in the same order the result dict exposes them.
    for key in ("grader_score", "cumulative_reward", "ground_truth", "agent_action"):
        episode_result[key] = final_obs.metadata[key]
    return episode_result
def evaluate_task(task_id: str, num_episodes: int, agent: OpenAIBaselineAgent) -> Dict:
    """
    Evaluate agent on a specific task.

    Builds one environment for the task (seed 42), runs ``num_episodes``
    episodes on it, prints progress every 10 episodes, and prints and returns
    aggregate statistics.

    Args:
        task_id: Task difficulty
        num_episodes: Number of episodes to run (must be at least 1)
        agent: Agent instance

    Returns:
        Dict with evaluation results: task id, episode count, average grader
        score, average reward, success rate, min/max score and the list of
        per-episode results

    Raises:
        ValueError: If ``num_episodes`` is less than 1
    """
    # Without at least one episode the averages and min/max are undefined;
    # fail early with a clear message instead of a ZeroDivisionError later.
    if num_episodes < 1:
        raise ValueError(f"num_episodes must be at least 1, got {num_episodes}")

    print(f"\n{'='*70}")
    print(f"Evaluating Task: {task_id.upper()}")
    print(f"{'='*70}")

    env = CustomerSupportEnvironment(task_id=task_id, seed=42)
    results = []
    for episode in range(num_episodes):
        results.append(run_episode(env, agent, task_id))
        if (episode + 1) % 10 == 0:
            avg_score = sum(r["grader_score"] for r in results) / len(results)
            print(f"Episode {episode + 1}/{num_episodes} - Avg Score: {avg_score:.3f}")

    # Calculate statistics
    scores = [r["grader_score"] for r in results]
    rewards = [r["reward"] for r in results]
    avg_score = sum(scores) / len(scores)
    avg_reward = sum(rewards) / len(rewards)
    min_score = min(scores)
    max_score = max(scores)
    # A score counts as a success when it reaches the task's threshold.
    success_threshold = env.task_configs[task_id]["success_threshold"]
    success_rate = sum(1 for s in scores if s >= success_threshold) / len(scores)

    print(f"\n{'-'*70}")
    print(f"Results for {task_id.upper()} task:")
    print(f" Average Grader Score: {avg_score:.3f}")
    print(f" Average Reward: {avg_reward:.3f}")
    print(f" Success Rate: {success_rate:.1%} (threshold: {success_threshold})")
    print(f" Min Score: {min_score:.3f}")
    print(f" Max Score: {max_score:.3f}")
    print(f"{'-'*70}")

    return {
        "task_id": task_id,
        "num_episodes": num_episodes,
        "avg_score": avg_score,
        "avg_reward": avg_reward,
        "success_rate": success_rate,
        "min_score": min_score,
        "max_score": max_score,
        "all_results": results,
    }
def main():
    """
    Command-line entry point: evaluate the baseline agent and save results.

    Reads the API token from HF_TOKEN (or API_KEY), the endpoint from
    API_BASE_URL and the model from --model or MODEL_NAME, runs the selected
    task(s), prints a summary and writes all results as JSON to --output.
    Exits with status 1 when no API token is set.
    """
    parser = argparse.ArgumentParser(description="Run baseline inference on Customer Support Environment")
    parser.add_argument(
        "--task",
        type=str,
        default="all",
        choices=["easy", "medium", "hard", "all"],
        help="Task difficulty to evaluate (default: all)",
    )
    parser.add_argument(
        "--episodes", type=int, default=50, help="Number of episodes per task (default: 50)"
    )
    parser.add_argument(
        "--model", type=str, default=None, help="Model to use (default: MODEL_NAME env var)"
    )
    parser.add_argument(
        "--output", type=str, default="baseline_results.json", help="Output file for results (default: baseline_results.json)"
    )
    args = parser.parse_args()

    # Averages over zero episodes are undefined; reject this at the CLI
    # boundary with a usage error rather than crashing mid-run.
    if args.episodes < 1:
        parser.error("--episodes must be at least 1")

    # Check for API key
    api_key = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
    if not api_key:
        print("Error: HF_TOKEN or API_KEY environment variable not set.")
        print("Please set it with: export HF_TOKEN='your-token-here'")
        sys.exit(1)

    api_base_url = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
    model_name = args.model or os.getenv("MODEL_NAME", "meta-llama/Llama-3.3-70B-Instruct")

    # Initialize agent
    print(f"Initializing baseline agent (model: {model_name})...")
    print(f"API Base URL: {api_base_url}")
    agent = OpenAIBaselineAgent(api_key=api_key, base_url=api_base_url, model=model_name)

    # Determine which tasks to run
    tasks = ["easy", "medium", "hard"] if args.task == "all" else [args.task]

    # Run evaluations
    all_results = {}
    for task in tasks:
        all_results[task] = evaluate_task(task, args.episodes, agent)

    # Print summary
    print(f"\n{'='*70}")
    print("SUMMARY")
    print(f"{'='*70}")
    for task, result in all_results.items():
        print(f"{task.upper():8s} | Score: {result['avg_score']:.3f} | Success: {result['success_rate']:.1%}")
    print(f"{'='*70}")

    # Save results
    # NOTE(review): the "outputs" directory is not where results are written
    # (they go to --output); it is still created here to preserve existing
    # behavior - confirm whether anything relies on it.
    os.makedirs("outputs", exist_ok=True)
    # Make sure the directory of the requested output file exists, so paths
    # such as "results/run1.json" work.
    output_dir = os.path.dirname(args.output)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(args.output, "w", encoding="utf-8") as f:
        # default=str stringifies any value json cannot serialize natively.
        json.dump(all_results, f, indent=2, default=str)
    print(f"\nResults saved to: {args.output}")


# Run the evaluation only when executed as a script, not when imported.
if __name__ == "__main__":
    main()