Spaces:

ravindrakapse
/

customer_support_env

Sleeping

App Files Files Community

customer_support_env / baseline.py

ravindrakapse

Upload folder using huggingface_hub

385ccc1 verified about 2 months ago

raw

history blame contribute delete

9.81 kB

	#!/usr/bin/env python3
	"""
	Baseline inference script for Customer Support Environment.

	Uses OpenAI-compatible API to run a baseline agent on all three tasks and report scores.
	Requires the HF_TOKEN (or API_KEY) environment variable; API_BASE_URL and MODEL_NAME are optional and fall back to built-in defaults.

	Usage:
	export HF_TOKEN="your-token-here"
	export API_BASE_URL="https://router.huggingface.co/v1"
	export MODEL_NAME="meta-llama/Llama-3.3-70B-Instruct"
	python baseline.py --task easy --episodes 100
	python baseline.py --task all --episodes 50
	"""

	import argparse
	import os
	import sys
	from typing import Dict, List
	import json
	from openai import OpenAI

	# Import environment components
	from server.customer_support_env_environment import CustomerSupportEnvironment
	from models import CustomerSupportAction, CustomerSupportObservation


	class OpenAIBaselineAgent:
	    """Baseline agent that asks an OpenAI-compatible chat model to handle a ticket.

	    The agent renders the observation into a text prompt, requests a JSON
	    answer from the model and converts that answer into a
	    ``CustomerSupportAction``. Any failure along the way (API error, missing
	    content, malformed JSON, missing keys, action construction error) is
	    reported on stdout and replaced by a generic fallback action.
	    """

	    # Instruction sentence injected into the prompt for each task id.
	    # Any task id other than "easy"/"medium" uses the "hard" wording.
	    _TASK_INSTRUCTIONS = {
	        "easy": "Categorize this support ticket into one of: billing, technical, account, shipping, general.",
	        "medium": "Categorize the ticket, assign a priority (low/medium/high/critical), and route to the appropriate team (tier1/tier2/billing/technical/management).",
	        "hard": "Fully handle this ticket: categorize, prioritize, route to the right team, and draft a professional response.",
	    }

	    def __init__(self, api_key: str, base_url: str = "https://router.huggingface.co/v1", model: str = "meta-llama/Llama-3.3-70B-Instruct"):
	        """
	        Create the API client.

	        Args:
	            api_key: Token passed to the OpenAI client.
	            base_url: Base URL of the OpenAI-compatible endpoint.
	            model: Model name sent with every chat completion request.
	        """
	        self.client = OpenAI(api_key=api_key, base_url=base_url)
	        self.model = model

	    def _build_prompt(self, observation: CustomerSupportObservation, task_id: str) -> str:
	        """Render the observation and the task instructions into the user prompt."""
	        task_instructions = self._TASK_INSTRUCTIONS.get(task_id, self._TASK_INSTRUCTIONS["hard"])
	        premium = "Yes" if observation.is_premium else "No"

	        # The prompt is assembled line by line so its text does not depend on
	        # how this source file is indented. The enum alternatives use a plain
	        # "|" separator (a backslash before it is an invalid escape sequence).
	        lines = [
	            f"You are a customer support AI assistant. {task_instructions}",
	            "",
	            "TICKET INFORMATION:",
	            f"- ID: {observation.ticket_id}",
	            f"- Channel: {observation.channel}",
	            f"- Timestamp: {observation.timestamp}",
	            "",
	            "CUSTOMER MESSAGE:",
	            f"{observation.customer_message}",
	            "",
	            "CUSTOMER HISTORY:",
	            f"- Account Age: {observation.account_age_days} days",
	            f"- Total Tickets: {observation.total_tickets}",
	            f"- Resolved Tickets: {observation.resolved_tickets}",
	            f"- Satisfaction Score: {observation.satisfaction_score}/5.0",
	            f"- Premium Customer: {premium}",
	            f"- Lifetime Value: ${observation.lifetime_value:.2f}",
	            "",
	            "Based on this information, provide your response in JSON format with these fields:",
	            "{",
	            '"category": "billing" | "technical" | "account" | "shipping" | "general",',
	            '"priority": "low" | "medium" | "high" | "critical",',
	            '"assigned_team": "tier1" | "tier2" | "billing" | "technical" | "management",',
	            '"response_draft": "Your professional response to the customer (minimum 20 characters)",',
	            '"escalate": true | false',
	            "}",
	            "",
	            "Respond with ONLY the JSON, no additional text.",
	        ]
	        return "\n".join(lines)

	    @staticmethod
	    def _extract_json(content) -> Dict:
	        """
	        Parse the model's reply into a dict, tolerating a Markdown code fence.

	        Raises:
	            ValueError: If the reply is missing, is not valid JSON
	                (``json.JSONDecodeError`` is a ``ValueError`` subclass) or
	                does not decode to a JSON object.
	        """
	        if content is None:
	            raise ValueError("model returned no message content")

	        text = content.strip()
	        # Remove markdown code blocks if present (``` or ```json fences).
	        if text.startswith("```"):
	            text = text.split("```")[1]
	            if text.startswith("json"):
	                text = text[4:]
	            text = text.strip()

	        parsed = json.loads(text)
	        if not isinstance(parsed, dict):
	            raise ValueError(f"expected a JSON object, got {type(parsed).__name__}")
	        return parsed

	    def get_action(self, observation: CustomerSupportObservation, task_id: str) -> CustomerSupportAction:
	        """
	        Get agent action using OpenAI API.

	        Args:
	            observation: Current observation from environment
	            task_id: Task difficulty level

	        Returns:
	            CustomerSupportAction built from the model's JSON reply, or a
	            generic default action if the request or the parsing fails.
	        """
	        prompt = self._build_prompt(observation, task_id)

	        # Kept outside the try block so the error report can show whatever
	        # the model returned, if the failure happened after the API call.
	        content = None
	        try:
	            response = self.client.chat.completions.create(
	                model=self.model,
	                messages=[
	                    {
	                        "role": "system",
	                        "content": "You are a customer support expert. Always respond with valid JSON.",
	                    },
	                    {"role": "user", "content": prompt},
	                ],
	                temperature=0.3,  # Low temperature for consistent decisions
	                max_tokens=500,
	            )
	            content = response.choices[0].message.content
	            action_dict = self._extract_json(content)

	            # Missing required keys raise KeyError and fall through to the
	            # default action below; only "escalate" is optional.
	            return CustomerSupportAction(
	                category=action_dict["category"],
	                priority=action_dict["priority"],
	                assigned_team=action_dict["assigned_team"],
	                response_draft=action_dict["response_draft"],
	                internal_notes=None,
	                escalate=action_dict.get("escalate", False),
	            )
	        except Exception as e:
	            # Deliberately broad: a single bad reply must not abort a whole
	            # evaluation run, so every failure degrades to the default action.
	            print(f"Error calling OpenAI API: {e}")
	            print(f"Response content: {content if content is not None else 'N/A'}")
	            # Return a reasonable default action
	            return CustomerSupportAction(
	                category="general",
	                priority="medium",
	                assigned_team="tier1",
	                response_draft="Thank you for contacting support. We'll review your request and get back to you shortly.",
	                escalate=False,
	            )


	def run_episode(env: CustomerSupportEnvironment, agent: OpenAIBaselineAgent, task_id: str) -> Dict:
	    """
	    Play one reset/act/step cycle and collect its outcome.

	    Args:
	        env: Environment instance; it is reset and then stepped once
	        agent: Agent instance whose ``get_action`` chooses the action
	        task_id: Task difficulty forwarded to the agent

	    Returns:
	        Dict holding the step's reward plus the ``grader_score``,
	        ``cumulative_reward``, ``ground_truth`` and ``agent_action``
	        entries of the resulting observation's metadata
	    """
	    first_observation = env.reset()
	    chosen_action = agent.get_action(first_observation, task_id)
	    final_observation = env.step(chosen_action)

	    summary = {"reward": final_observation.reward}
	    # Copy the metadata entries in the same order the report expects them.
	    for field in ("grader_score", "cumulative_reward", "ground_truth", "agent_action"):
	        summary[field] = final_observation.metadata[field]
	    return summary


	def evaluate_task(task_id: str, num_episodes: int, agent: OpenAIBaselineAgent) -> Dict:
	    """
	    Evaluate agent on a specific task.

	    Builds a fresh environment for the task (constructed with seed=42), runs
	    the requested number of single-step episodes and prints progress every
	    10 episodes followed by a summary.

	    Args:
	        task_id: Task difficulty
	        num_episodes: Number of episodes to run; must be at least 1
	        agent: Agent instance

	    Returns:
	        Dict with the task id, episode count, average score, average reward,
	        success rate, min/max score and the per-episode results

	    Raises:
	        ValueError: If num_episodes is smaller than 1 (the averages and
	            min/max below are undefined for an empty run)
	    """
	    # Fail before creating the environment instead of hitting a
	    # ZeroDivisionError once the (empty) statistics are computed.
	    if num_episodes < 1:
	        raise ValueError(f"num_episodes must be at least 1, got {num_episodes}")

	    print(f"\n{'='*70}")
	    print(f"Evaluating Task: {task_id.upper()}")
	    print(f"{'='*70}")

	    env = CustomerSupportEnvironment(task_id=task_id, seed=42)
	    results = []

	    for episode in range(num_episodes):
	        result = run_episode(env, agent, task_id)
	        results.append(result)

	        if (episode + 1) % 10 == 0:
	            avg_score = sum(r["grader_score"] for r in results) / len(results)
	            print(f"Episode {episode + 1}/{num_episodes} - Avg Score: {avg_score:.3f}")

	    # Calculate statistics
	    scores = [r["grader_score"] for r in results]
	    rewards = [r["reward"] for r in results]
	    # Looked up once; used both for the success rate and for the report.
	    threshold = env.task_configs[task_id]["success_threshold"]

	    avg_score = sum(scores) / len(scores)
	    avg_reward = sum(rewards) / len(rewards)
	    success_rate = sum(1 for s in scores if s >= threshold) / len(scores)
	    min_score = min(scores)
	    max_score = max(scores)

	    print(f"\n{'-'*70}")
	    print(f"Results for {task_id.upper()} task:")
	    print(f" Average Grader Score: {avg_score:.3f}")
	    print(f" Average Reward: {avg_reward:.3f}")
	    print(f" Success Rate: {success_rate:.1%} (threshold: {threshold})")
	    print(f" Min Score: {min_score:.3f}")
	    print(f" Max Score: {max_score:.3f}")
	    print(f"{'-'*70}")

	    return {
	        "task_id": task_id,
	        "num_episodes": num_episodes,
	        "avg_score": avg_score,
	        "avg_reward": avg_reward,
	        "success_rate": success_rate,
	        "min_score": min_score,
	        "max_score": max_score,
	        "all_results": results,
	    }


	def main():
	    """
	    Command-line entry point: evaluate the baseline agent and save the results.

	    Parses ``--task``, ``--episodes``, ``--model`` and ``--output``, reads the
	    API token from HF_TOKEN (or API_KEY) and the endpoint/model from
	    API_BASE_URL / MODEL_NAME, evaluates each selected task, prints a summary
	    table and writes all results as JSON to the ``--output`` path.

	    Exits via argparse's error path when ``--episodes`` is smaller than 1,
	    and with status 1 when no API token is set.
	    """
	    parser = argparse.ArgumentParser(description="Run baseline inference on Customer Support Environment")
	    parser.add_argument(
	        "--task",
	        type=str,
	        default="all",
	        choices=["easy", "medium", "hard", "all"],
	        help="Task difficulty to evaluate (default: all)",
	    )
	    parser.add_argument(
	        "--episodes", type=int, default=50, help="Number of episodes per task (default: 50)"
	    )
	    parser.add_argument(
	        "--model", type=str, default=None, help="Model to use (default: MODEL_NAME env var)"
	    )
	    parser.add_argument(
	        "--output", type=str, default="baseline_results.json", help="Output file for results (default: baseline_results.json)"
	    )

	    args = parser.parse_args()

	    # Averages over zero episodes are undefined; reject the value up front
	    # with a usage error instead of failing mid-run.
	    if args.episodes < 1:
	        parser.error("--episodes must be at least 1")

	    # Check for API key
	    api_key = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
	    if not api_key:
	        print("Error: HF_TOKEN or API_KEY environment variable not set.")
	        print("Please set it with: export HF_TOKEN='your-token-here'")
	        sys.exit(1)

	    api_base_url = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
	    # An explicit --model wins over the MODEL_NAME environment variable.
	    model_name = args.model or os.getenv("MODEL_NAME", "meta-llama/Llama-3.3-70B-Instruct")

	    # Initialize agent
	    print(f"Initializing baseline agent (model: {model_name})...")
	    print(f"API Base URL: {api_base_url}")
	    agent = OpenAIBaselineAgent(api_key=api_key, base_url=api_base_url, model=model_name)

	    # Determine which tasks to run
	    tasks = ["easy", "medium", "hard"] if args.task == "all" else [args.task]

	    # Run evaluations
	    all_results = {task: evaluate_task(task, args.episodes, agent) for task in tasks}

	    # Print summary
	    print(f"\n{'='*70}")
	    print("SUMMARY")
	    print(f"{'='*70}")
	    for task, result in all_results.items():
	        print(f"{task.upper():8s} | Score: {result['avg_score']:.3f} | Success: {result['success_rate']:.1%}")
	    print(f"{'='*70}")

	    # Save results. Create the directory the output file actually lives in
	    # (if the path has one) so a nested --output path does not fail.
	    output_dir = os.path.dirname(args.output)
	    if output_dir:
	        os.makedirs(output_dir, exist_ok=True)
	    with open(args.output, "w", encoding="utf-8") as f:
	        # default=str stringifies any value json cannot encode natively.
	        json.dump(all_results, f, indent=2, default=str)
	    print(f"\nResults saved to: {args.output}")


	# Run the CLI only when this file is executed as a script, not on import.
	if __name__ == "__main__":
	    main()