customer_support_env / baseline.py
ravindrakapse's picture
Upload folder using huggingface_hub
385ccc1 verified
#!/usr/bin/env python3
"""
Baseline inference script for Customer Support Environment.
Uses OpenAI-compatible API to run a baseline agent on all three tasks and report scores.
Requires the HF_TOKEN (or API_KEY) environment variable. API_BASE_URL and MODEL_NAME
are optional and default to the values shown in the usage example below.
Usage:
export HF_TOKEN="your-token-here"
export API_BASE_URL="https://router.huggingface.co/v1"
export MODEL_NAME="meta-llama/Llama-3.3-70B-Instruct"
python baseline.py --task easy --episodes 100
python baseline.py --task all --episodes 50
"""
import argparse
import os
import sys
from typing import Dict, List
import json
from openai import OpenAI
# Import environment components
from server.customer_support_env_environment import CustomerSupportEnvironment
from models import CustomerSupportAction, CustomerSupportObservation
class OpenAIBaselineAgent:
    """Baseline agent using OpenAI-compatible API for ticket handling.

    The agent sends one chat-completion request per ticket, asks the model
    for a JSON object describing the action, and converts that object into a
    ``CustomerSupportAction``. If the request or the parsing fails, a generic
    fallback action is returned so an evaluation run is never aborted by a
    single bad model response.
    """

    # Instruction sentence per task difficulty. Any unknown task id uses the
    # "hard" wording.
    _TASK_INSTRUCTIONS = {
        "easy": "Categorize this support ticket into one of: billing, technical, account, shipping, general.",
        "medium": "Categorize the ticket, assign a priority (low/medium/high/critical), and route to the appropriate team (tier1/tier2/billing/technical/management).",
        "hard": "Fully handle this ticket: categorize, prioritize, route to the right team, and draft a professional response.",
    }

    # Generic action used when the model output is unusable. The same values
    # fill in individual fields the model left out.
    _FALLBACK_FIELDS = {
        "category": "general",
        "priority": "medium",
        "assigned_team": "tier1",
        "response_draft": "Thank you for contacting support. We'll review your request and get back to you shortly.",
        "escalate": False,
    }

    def __init__(self, api_key: str, base_url: str = "https://router.huggingface.co/v1", model: str = "meta-llama/Llama-3.3-70B-Instruct"):
        """
        Create the API client.

        Args:
            api_key: Token passed to the OpenAI-compatible client
            base_url: Base URL of the OpenAI-compatible endpoint
            model: Model identifier sent with every request
        """
        self.client = OpenAI(api_key=api_key, base_url=base_url)
        self.model = model

    @staticmethod
    def _extract_json(content: str) -> Dict:
        """
        Decode the JSON object contained in a raw model reply.

        Handles a reply wrapped in a Markdown code fence (optionally tagged
        ``json``) and, if direct decoding fails, retries on the outermost
        ``{...}`` span so that surrounding prose does not break parsing.

        Args:
            content: Raw text returned by the model
        Returns:
            The decoded JSON object as a dict
        Raises:
            ValueError: If the reply is empty, is not valid JSON, or decodes
                to something other than a JSON object
        """
        text = content.strip()
        if not text:
            raise ValueError("model returned an empty response")
        # Remove markdown code blocks if present
        if text.startswith("```"):
            text = text.split("```")[1]
            if text.startswith("json"):
                text = text[4:]
            text = text.strip()
        try:
            payload = json.loads(text)
        except json.JSONDecodeError:
            # Retry on the outermost braces in case the object is embedded
            # in explanatory text.
            start, end = text.find("{"), text.rfind("}")
            if start == -1 or end <= start:
                raise
            payload = json.loads(text[start:end + 1])
        if not isinstance(payload, dict):
            raise ValueError("model response is not a JSON object")
        return payload

    @classmethod
    def _build_prompt(cls, observation: "CustomerSupportObservation", task_id: str) -> str:
        """Render the user prompt for one ticket at the given task difficulty."""
        task_instructions = cls._TASK_INSTRUCTIONS.get(task_id, cls._TASK_INSTRUCTIONS["hard"])
        return f"""You are a customer support AI assistant. {task_instructions}
TICKET INFORMATION:
- ID: {observation.ticket_id}
- Channel: {observation.channel}
- Timestamp: {observation.timestamp}
CUSTOMER MESSAGE:
{observation.customer_message}
CUSTOMER HISTORY:
- Account Age: {observation.account_age_days} days
- Total Tickets: {observation.total_tickets}
- Resolved Tickets: {observation.resolved_tickets}
- Satisfaction Score: {observation.satisfaction_score}/5.0
- Premium Customer: {"Yes" if observation.is_premium else "No"}
- Lifetime Value: ${observation.lifetime_value:.2f}
Based on this information, provide your response in JSON format with these fields:
{{
"category": "billing" | "technical" | "account" | "shipping" | "general",
"priority": "low" | "medium" | "high" | "critical",
"assigned_team": "tier1" | "tier2" | "billing" | "technical" | "management",
"response_draft": "Your professional response to the customer (minimum 20 characters)",
"escalate": true | false
}}
Respond with ONLY the JSON, no additional text."""

    def get_action(self, observation: "CustomerSupportObservation", task_id: str) -> "CustomerSupportAction":
        """
        Get agent action using OpenAI API.

        Args:
            observation: Current observation from environment
            task_id: Task difficulty level
        Returns:
            CustomerSupportAction based on LLM response. Fields the model
            omitted are filled with defaults; if the request or parsing fails
            entirely, a generic fallback action is returned instead.
        """
        prompt = self._build_prompt(observation, task_id)
        # Stays None until the API returns, so the error report below can
        # tell "no reply" apart from "unparseable reply".
        content = None
        try:
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[
                    {
                        "role": "system",
                        "content": "You are a customer support expert. Always respond with valid JSON.",
                    },
                    {"role": "user", "content": prompt},
                ],
                temperature=0.3,  # Low temperature for consistent decisions
                max_tokens=500,
            )
            # The content field may be None; treat that like an empty reply.
            content = response.choices[0].message.content
            action_dict = self._extract_json(content or "")

            # Keep whatever the model did provide instead of discarding the
            # whole answer because one field is missing.
            missing = [
                key
                for key in ("category", "priority", "assigned_team", "response_draft")
                if action_dict.get(key) is None
            ]
            if missing:
                print(f"Warning: model response missing fields {missing}; using defaults")
            fields = dict(self._FALLBACK_FIELDS)
            for key in fields:
                if action_dict.get(key) is not None:
                    fields[key] = action_dict[key]

            # Create action object
            return CustomerSupportAction(internal_notes=None, **fields)
        except Exception as e:
            # Deliberate best-effort boundary: one bad reply must not abort
            # the whole evaluation run.
            print(f"Error calling OpenAI API: {e}")
            print(f"Response content: {content if content is not None else 'N/A'}")
            # Return a reasonable default action
            return CustomerSupportAction(**self._FALLBACK_FIELDS)
def run_episode(env: CustomerSupportEnvironment, agent: OpenAIBaselineAgent, task_id: str) -> Dict:
    """
    Play one reset -> act -> step cycle and summarise the outcome.

    Args:
        env: Environment instance to reset and step
        agent: Agent asked for a single action on the initial observation
        task_id: Task difficulty forwarded to the agent
    Returns:
        Dict holding the step reward plus the grader score, cumulative
        reward, ground truth and agent action read from the step metadata
    """
    first_observation = env.reset()
    chosen_action = agent.get_action(first_observation, task_id)
    outcome = env.step(chosen_action)

    summary = {"reward": outcome.reward}
    # Copy the metadata entries in the order the report expects.
    for key in ("grader_score", "cumulative_reward", "ground_truth", "agent_action"):
        summary[key] = outcome.metadata[key]
    return summary
def evaluate_task(task_id: str, num_episodes: int, agent: "OpenAIBaselineAgent") -> Dict:
    """
    Evaluate agent on a specific task.

    Runs ``num_episodes`` episodes on an environment seeded with 42, prints a
    running average every 10 episodes, then prints and returns summary
    statistics.

    Args:
        task_id: Task difficulty
        num_episodes: Number of episodes to run (must be positive)
        agent: Agent instance
    Returns:
        Dict with evaluation results: task id, episode count, average
        score/reward, success rate, min/max score and the per-episode results
    Raises:
        ValueError: If num_episodes is not positive (the statistics would
            otherwise divide by zero)
    """
    # Fail before building the environment or calling the API.
    if num_episodes <= 0:
        raise ValueError(f"num_episodes must be a positive integer, got {num_episodes}")

    print(f"\n{'='*70}")
    print(f"Evaluating Task: {task_id.upper()}")
    print(f"{'='*70}")

    env = CustomerSupportEnvironment(task_id=task_id, seed=42)
    # Looked up once; used for the success rate and in the report line.
    success_threshold = env.task_configs[task_id]["success_threshold"]

    results = []
    running_score = 0.0
    for episode in range(1, num_episodes + 1):
        result = run_episode(env, agent, task_id)
        results.append(result)
        running_score += result["grader_score"]
        if episode % 10 == 0:
            print(f"Episode {episode}/{num_episodes} - Avg Score: {running_score / episode:.3f}")

    # Calculate statistics
    scores = [r["grader_score"] for r in results]
    rewards = [r["reward"] for r in results]
    avg_score = sum(scores) / len(scores)
    avg_reward = sum(rewards) / len(rewards)
    success_rate = sum(1 for s in scores if s >= success_threshold) / len(scores)
    min_score = min(scores)
    max_score = max(scores)

    print(f"\n{'-'*70}")
    print(f"Results for {task_id.upper()} task:")
    print(f" Average Grader Score: {avg_score:.3f}")
    print(f" Average Reward: {avg_reward:.3f}")
    print(f" Success Rate: {success_rate:.1%} (threshold: {success_threshold})")
    print(f" Min Score: {min_score:.3f}")
    print(f" Max Score: {max_score:.3f}")
    print(f"{'-'*70}")

    return {
        "task_id": task_id,
        "num_episodes": num_episodes,
        "avg_score": avg_score,
        "avg_reward": avg_reward,
        "success_rate": success_rate,
        "min_score": min_score,
        "max_score": max_score,
        "all_results": results,
    }
def main():
    """
    Command-line entry point: evaluate the baseline agent and save results.

    Parses ``--task``, ``--episodes``, ``--model`` and ``--output``, reads the
    API token from HF_TOKEN (or API_KEY), the endpoint from API_BASE_URL and
    the model from ``--model`` or MODEL_NAME, evaluates each selected task,
    prints a summary table and writes all results as JSON to ``--output``.

    Exits with status 2 on invalid arguments and status 1 when no API token
    is configured.
    """
    parser = argparse.ArgumentParser(description="Run baseline inference on Customer Support Environment")
    parser.add_argument(
        "--task",
        type=str,
        default="all",
        choices=["easy", "medium", "hard", "all"],
        help="Task difficulty to evaluate (default: all)",
    )
    parser.add_argument(
        "--episodes", type=int, default=50, help="Number of episodes per task (default: 50)"
    )
    parser.add_argument(
        "--model", type=str, default=None, help="Model to use (default: MODEL_NAME env var)"
    )
    parser.add_argument(
        "--output", type=str, default="baseline_results.json", help="Output file for results (default: baseline_results.json)"
    )
    args = parser.parse_args()

    # Averages over zero episodes are undefined; reject before any API call.
    if args.episodes <= 0:
        parser.error("--episodes must be a positive integer")

    # Check for API key
    api_key = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
    if not api_key:
        print("Error: HF_TOKEN or API_KEY environment variable not set.", file=sys.stderr)
        print("Please set it with: export HF_TOKEN='your-token-here'", file=sys.stderr)
        sys.exit(1)

    api_base_url = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
    model_name = args.model or os.getenv("MODEL_NAME", "meta-llama/Llama-3.3-70B-Instruct")

    # Initialize agent
    print(f"Initializing baseline agent (model: {model_name})...")
    print(f"API Base URL: {api_base_url}")
    agent = OpenAIBaselineAgent(api_key=api_key, base_url=api_base_url, model=model_name)

    # Determine which tasks to run
    tasks = ["easy", "medium", "hard"] if args.task == "all" else [args.task]

    # Run evaluations
    all_results = {task: evaluate_task(task, args.episodes, agent) for task in tasks}

    # Print summary
    print(f"\n{'='*70}")
    print("SUMMARY")
    print(f"{'='*70}")
    for task, result in all_results.items():
        print(f"{task.upper():8s} | Score: {result['avg_score']:.3f} | Success: {result['success_rate']:.1%}")
    print(f"{'='*70}")

    # Save results. Create the directory the output file actually lives in,
    # so a path such as "outputs/run1.json" works on a fresh checkout.
    output_dir = os.path.dirname(args.output)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(args.output, "w", encoding="utf-8") as f:
        # default=str stringifies any value json cannot encode natively.
        json.dump(all_results, f, indent=2, default=str)
    print(f"\nResults saved to: {args.output}")
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()