Spaces:

ravindrakapse
/

customer_support_env

Sleeping

File size: 9,809 Bytes

#!/usr/bin/env python3
"""
Baseline inference script for Customer Support Environment.

Uses OpenAI-compatible API to run a baseline agent on all three tasks and report scores.
Requires the HF_TOKEN (or API_KEY) environment variable. API_BASE_URL and MODEL_NAME
are optional and default to the values shown below; --model overrides MODEL_NAME.

Usage:
    export HF_TOKEN="your-token-here"
    export API_BASE_URL="https://router.huggingface.co/v1"
    export MODEL_NAME="meta-llama/Llama-3.3-70B-Instruct"
    python baseline.py --task easy --episodes 100
    python baseline.py --task all --episodes 50
"""

import argparse
import os
import sys
from typing import Dict, List
import json
from openai import OpenAI

# Import environment components
from server.customer_support_env_environment import CustomerSupportEnvironment
from models import CustomerSupportAction, CustomerSupportObservation


class OpenAIBaselineAgent:
    """Baseline agent that asks an OpenAI-compatible chat model to handle a ticket.

    The agent builds a prompt from the observation, requests a JSON reply and
    turns it into a ``CustomerSupportAction``. Any failure while calling the
    API, parsing the reply or building the action falls back to a neutral
    default action, so one bad reply never aborts an evaluation run.
    """

    # Values used for fields the model left out of its reply, and for the
    # all-default fallback action.
    _FALLBACK_CATEGORY = "general"
    _FALLBACK_PRIORITY = "medium"
    _FALLBACK_TEAM = "tier1"
    _FALLBACK_RESPONSE = "Thank you for contacting support. We'll review your request and get back to you shortly."

    def __init__(self, api_key: str, base_url: str = "https://router.huggingface.co/v1", model: str = "meta-llama/Llama-3.3-70B-Instruct"):
        """
        Create the API client.

        Args:
            api_key: Token passed to the OpenAI client
            base_url: Base URL of the OpenAI-compatible endpoint
            model: Model name sent with every chat completion request
        """
        self.client = OpenAI(api_key=api_key, base_url=base_url)
        self.model = model

    @staticmethod
    def _task_instructions(task_id: str) -> str:
        """Return the instruction sentence for a task; unknown ids are treated as "hard"."""
        instructions = {
            "easy": "Categorize this support ticket into one of: billing, technical, account, shipping, general.",
            "medium": "Categorize the ticket, assign a priority (low/medium/high/critical), and route to the appropriate team (tier1/tier2/billing/technical/management).",
        }
        hard = "Fully handle this ticket: categorize, prioritize, route to the right team, and draft a professional response."
        return instructions.get(task_id, hard)

    def _build_prompt(self, observation: CustomerSupportObservation, task_id: str) -> str:
        """Render the user prompt describing the ticket and the expected JSON reply."""
        task_instructions = self._task_instructions(task_id)
        return f"""You are a customer support AI assistant. {task_instructions}

TICKET INFORMATION:
- ID: {observation.ticket_id}
- Channel: {observation.channel}
- Timestamp: {observation.timestamp}

CUSTOMER MESSAGE:
{observation.customer_message}

CUSTOMER HISTORY:
- Account Age: {observation.account_age_days} days
- Total Tickets: {observation.total_tickets}
- Resolved Tickets: {observation.resolved_tickets}
- Satisfaction Score: {observation.satisfaction_score}/5.0
- Premium Customer: {"Yes" if observation.is_premium else "No"}
- Lifetime Value: ${observation.lifetime_value:.2f}

Based on this information, provide your response in JSON format with these fields:
{{
  "category": "billing" | "technical" | "account" | "shipping" | "general",
  "priority": "low" | "medium" | "high" | "critical",
  "assigned_team": "tier1" | "tier2" | "billing" | "technical" | "management",
  "response_draft": "Your professional response to the customer (minimum 20 characters)",
  "escalate": true | false
}}

Respond with ONLY the JSON, no additional text."""

    @staticmethod
    def _extract_json(content) -> Dict:
        """
        Parse the JSON object contained in a model reply.

        Tolerates a leading markdown code fence and, when direct parsing
        fails, prose around the object (the outermost ``{...}`` is parsed).

        Args:
            content: Raw reply text; ``None`` is treated as an empty reply

        Returns:
            The decoded JSON object

        Raises:
            ValueError: If no JSON object can be decoded (``json.JSONDecodeError``
                is a subclass) or the decoded value is not an object
        """
        text = (content or "").strip()

        # Remove markdown code blocks if present
        if text.startswith("```"):
            text = text.split("```")[1]
            if text.startswith("json"):
                text = text[4:]
            text = text.strip()

        try:
            parsed = json.loads(text)
        except json.JSONDecodeError:
            # Models sometimes add prose despite the instructions; retry on
            # the outermost brace-delimited span before giving up.
            start, end = text.find("{"), text.rfind("}")
            if start == -1 or end <= start:
                raise
            parsed = json.loads(text[start:end + 1])

        if not isinstance(parsed, dict):
            raise ValueError(f"Expected a JSON object, got {type(parsed).__name__}")
        return parsed

    def _fallback_action(self) -> CustomerSupportAction:
        """Return the neutral default action used when no usable reply is available."""
        return CustomerSupportAction(
            category=self._FALLBACK_CATEGORY,
            priority=self._FALLBACK_PRIORITY,
            assigned_team=self._FALLBACK_TEAM,
            response_draft=self._FALLBACK_RESPONSE,
            escalate=False,
        )

    def get_action(self, observation: CustomerSupportObservation, task_id: str) -> CustomerSupportAction:
        """
        Get agent action using OpenAI API.

        Fields missing from the model's reply are filled with the fallback
        values instead of discarding the whole reply.

        Args:
            observation: Current observation from environment
            task_id: Task difficulty level

        Returns:
            CustomerSupportAction based on LLM response, or the default
            action if the request, parsing or action construction fails
        """
        prompt = self._build_prompt(observation, task_id)

        content = None  # kept so the error report can show what was received
        try:
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[
                    {
                        "role": "system",
                        "content": "You are a customer support expert. Always respond with valid JSON.",
                    },
                    {"role": "user", "content": prompt},
                ],
                temperature=0.3,  # Low temperature for consistent decisions
                max_tokens=500,
            )
            content = response.choices[0].message.content
            action_dict = self._extract_json(content)

            return CustomerSupportAction(
                category=action_dict.get("category", self._FALLBACK_CATEGORY),
                priority=action_dict.get("priority", self._FALLBACK_PRIORITY),
                assigned_team=action_dict.get("assigned_team", self._FALLBACK_TEAM),
                response_draft=action_dict.get("response_draft", self._FALLBACK_RESPONSE),
                internal_notes=None,
                escalate=action_dict.get("escalate", False),
            )
        except Exception as e:
            # Deliberately broad: this is a best-effort baseline, so any
            # failure degrades to the default action rather than crashing.
            print(f"Error calling OpenAI API: {e}")
            print(f"Response content: {content if content is not None else 'N/A'}")
            # Return a reasonable default action
            return self._fallback_action()


def run_episode(env: CustomerSupportEnvironment, agent: OpenAIBaselineAgent, task_id: str) -> Dict:
    """
    Play one episode: reset, ask the agent for one action, step once.

    Args:
        env: Environment to reset and step
        agent: Agent whose ``get_action`` chooses the action
        task_id: Task difficulty, forwarded to the agent

    Returns:
        Dict with the step's reward plus the ``grader_score``,
        ``cumulative_reward``, ``ground_truth`` and ``agent_action`` entries
        copied from the resulting observation's metadata
    """
    first_obs = env.reset()
    final_obs = env.step(agent.get_action(first_obs, task_id))

    summary = {"reward": final_obs.reward}
    metadata = final_obs.metadata
    # Copy the graded fields through in a fixed order.
    for key in ("grader_score", "cumulative_reward", "ground_truth", "agent_action"):
        summary[key] = metadata[key]
    return summary


def evaluate_task(task_id: str, num_episodes: int, agent: OpenAIBaselineAgent) -> Dict:
    """
    Evaluate agent on a specific task.

    Builds a fresh environment (seed 42), runs ``num_episodes`` single-step
    episodes, prints progress every 10 episodes and a final summary.

    Args:
        task_id: Task difficulty
        num_episodes: Number of episodes to run (must be at least 1)
        agent: Agent instance

    Returns:
        Dict with evaluation results: averages, success rate, min/max score
        and the per-episode results under ``all_results``

    Raises:
        ValueError: If ``num_episodes`` is less than 1 (the statistics are
            undefined for an empty run)
    """
    # Fail fast instead of dividing by zero after the environment is built.
    if num_episodes < 1:
        raise ValueError(f"num_episodes must be at least 1, got {num_episodes}")

    print(f"\n{'='*70}")
    print(f"Evaluating Task: {task_id.upper()}")
    print(f"{'='*70}")

    env = CustomerSupportEnvironment(task_id=task_id, seed=42)
    # Looked up once, before any API calls are spent on episodes.
    success_threshold = env.task_configs[task_id]["success_threshold"]

    results = []
    score_total = 0
    for episode in range(num_episodes):
        result = run_episode(env, agent, task_id)
        results.append(result)
        score_total += result["grader_score"]

        if (episode + 1) % 10 == 0:
            print(f"Episode {episode + 1}/{num_episodes} - Avg Score: {score_total / len(results):.3f}")

    # Calculate statistics
    scores = [r["grader_score"] for r in results]
    rewards = [r["reward"] for r in results]

    avg_score = sum(scores) / len(scores)
    avg_reward = sum(rewards) / len(rewards)
    success_rate = sum(1 for s in scores if s >= success_threshold) / len(scores)
    min_score = min(scores)
    max_score = max(scores)

    print(f"\n{'-'*70}")
    print(f"Results for {task_id.upper()} task:")
    print(f"  Average Grader Score: {avg_score:.3f}")
    print(f"  Average Reward: {avg_reward:.3f}")
    print(f"  Success Rate: {success_rate:.1%} (threshold: {success_threshold})")
    print(f"  Min Score: {min_score:.3f}")
    print(f"  Max Score: {max_score:.3f}")
    print(f"{'-'*70}")

    return {
        "task_id": task_id,
        "num_episodes": num_episodes,
        "avg_score": avg_score,
        "avg_reward": avg_reward,
        "success_rate": success_rate,
        "min_score": min_score,
        "max_score": max_score,
        "all_results": results,
    }


def main():
    """
    Command-line entry point: evaluate the baseline agent and save the results.

    Reads the API token from HF_TOKEN (or API_KEY), the endpoint from
    API_BASE_URL and the model from --model or MODEL_NAME, runs the selected
    task(s), prints a summary and writes all results as JSON to --output.

    Exits with status 1 when no API token is set, and with status 2 (via
    argparse) when --episodes is less than 1.
    """
    parser = argparse.ArgumentParser(description="Run baseline inference on Customer Support Environment")
    parser.add_argument(
        "--task",
        type=str,
        default="all",
        choices=["easy", "medium", "hard", "all"],
        help="Task difficulty to evaluate (default: all)",
    )
    parser.add_argument(
        "--episodes", type=int, default=50, help="Number of episodes per task (default: 50)"
    )
    parser.add_argument(
        "--model", type=str, default=None, help="Model to use (default: MODEL_NAME env var)"
    )
    parser.add_argument(
        "--output", type=str, default="baseline_results.json", help="Output file for results (default: baseline_results.json)"
    )

    args = parser.parse_args()

    # Averages are undefined for an empty run; reject it before doing any work.
    if args.episodes < 1:
        parser.error("--episodes must be at least 1")

    # Check for API key
    api_key = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
    if not api_key:
        print("Error: HF_TOKEN or API_KEY environment variable not set.")
        print("Please set it with: export HF_TOKEN='your-token-here'")
        sys.exit(1)

    api_base_url = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
    model_name = args.model or os.getenv("MODEL_NAME", "meta-llama/Llama-3.3-70B-Instruct")

    # Initialize agent
    print(f"Initializing baseline agent (model: {model_name})...")
    print(f"API Base URL: {api_base_url}")
    agent = OpenAIBaselineAgent(api_key=api_key, base_url=api_base_url, model=model_name)

    # Determine which tasks to run
    tasks = ["easy", "medium", "hard"] if args.task == "all" else [args.task]

    # Run evaluations
    all_results = {task: evaluate_task(task, args.episodes, agent) for task in tasks}

    # Print summary
    print(f"\n{'='*70}")
    print("SUMMARY")
    print(f"{'='*70}")
    for task, result in all_results.items():
        print(f"{task.upper():8s} | Score: {result['avg_score']:.3f} | Success: {result['success_rate']:.1%}")
    print(f"{'='*70}")

    # Save results. Create the directory the output file actually lives in
    # (if any) so a path such as "outputs/run.json" works on a fresh checkout.
    output_dir = os.path.dirname(args.output)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(args.output, "w", encoding="utf-8") as f:
        # default=str stringifies any value json cannot encode natively.
        json.dump(all_results, f, indent=2, default=str)
    print(f"\nResults saved to: {args.output}")


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()