Spaces:

aibyml
/

Chatbot_Audit_Assistant

Running

File size: 7,375 Bytes

b92688d

import gradio as gr
import numpy as np
import matplotlib.pyplot as plt
from openai import OpenAI
import random
import time
from dotenv import load_dotenv
import os

# Load environment variables from a local .env file (if present) before the
# OpenAI client is constructed below.
load_dotenv()

# Initialize the OpenAI client shared by every request in this module.
# No credentials are passed explicitly here.
# NOTE(review): presumably the client picks up its API key from the
# environment populated by load_dotenv() above - confirm.
client = OpenAI()

# Human-readable names for the two prompt variants under comparison.
# NOTE(review): these two constants are not referenced by the code visible in
# this file.
PROMPT_A = "Benchmark Human-like Template"
PROMPT_B = "Custom Template"

# Benchmark (group "A") chat template: a fixed "human-like" system prompt
# followed by a user message whose "{question}" placeholder is filled in by
# format_messages().
template_messages_A = [
    {
        "role": "system",
        "content": "You are a helpful assistant that always answers questions. Keep it short. Answer like you are a real human. For example, you can use emotions, metaphors and proverbs. Try to always be positive, and help the user with their questions, doubts and problems. Don't be pessimistic."
    },
    {
        "role": "user",
        "content": "{question}"
    }
]

def format_messages(template, question):
    """Return a copy of ``template`` with the question placeholder filled in.

    Only the literal ``{question}`` marker is substituted. Every other brace
    in a message is left untouched, so prompts that contain JSON snippets or
    other ``{...}`` text (for example a user-supplied custom system prompt)
    are passed through verbatim instead of being parsed as format fields.

    Args:
        template: Iterable of chat messages, each a mapping with ``"role"``
            and ``"content"`` keys. It is not modified.
        question: Text inserted wherever ``{question}`` appears. Non-string
            values are converted with ``str()``.

    Returns:
        A new list of ``{"role": ..., "content": ...}`` dicts.
    """
    question_text = str(question)
    return [
        {
            "role": msg["role"],
            # Plain replacement rather than str.format(): format() would raise
            # KeyError/ValueError on any unrelated brace in the prompt and
            # would evaluate attribute/index lookups written by the user.
            "content": msg["content"].replace("{question}", question_text),
        }
        for msg in template
    ]

def run_agent(question: str, group: str, custom_template: str):
    """Ask the chat model one question using the template of the given group.

    Group ``"A"`` uses the module-level benchmark template; any other group
    value uses a template whose system prompt is ``custom_template``.

    Args:
        question: The user question to send.
        group: ``"A"`` for the benchmark template, anything else for custom.
        custom_template: System prompt used when the group is not ``"A"``.

    Returns:
        The content of the first choice in the model's completion.
    """
    if group != "A":
        # Custom variant: caller-provided system prompt + question slot.
        template = [
            {"role": "system", "content": custom_template},
            {"role": "user", "content": "{question}"},
        ]
    else:
        template = template_messages_A

    chat_messages = format_messages(template, question)

    # Single, non-streaming request to the chat model.
    reply = client.chat.completions.create(model="gpt-4o", messages=chat_messages)
    first_choice = reply.choices[0]
    return first_choice.message.content

def analyze_response(text):
    """Score how human-like ``text`` reads, using the chat model as a judge.

    The judge is asked to answer with a single integer from 1 (poor) to
    5 (excellent). The request is limited to one output token at
    temperature 0.

    Args:
        text: The chatbot answer to evaluate.

    Returns:
        The judge's score as an ``int`` in the range 1-5.

    Raises:
        ValueError: If the judge's reply is missing, is not an integer, or
            falls outside 1-5.
    """
    # NOTE: the allowed-value set is written with doubled braces because this
    # is an f-string; single braces would be evaluated as a Python expression
    # and rendered as a tuple "(1, 2, 3, 4, 5)" in the prompt.
    messages = [
        {"role": "system", "content": "You are trained to analyze and detect the sentiment of given text."},
        {"role": "user", "content": f"""Analyze the following recommendation and determine if the output is human-like. Check if there are emotions used, and metaphors and figure of speech. 
                                    Assign a score: Based on your evaluation assign a score to the agent's performans using the following scale:
                                    - 1 (Poor): The agent is very machine like, doesn't use emotions, methaphors and figure of speech.
                                    - 2 (Fair): The agent is some human-likeness, some emotions, methaphors and figure of speech are used
                                    - 3 (Good): The agent is is human-like, uses enough emotions, methaphors and figure of speech.
                                    - 4 (Very Good): The agent very human-like, uses multiple emotions, methaphors and figure of speech.
                                    - 5 (Excellent): You almost cannot distinguish between the machine and the human, a lot emotions, methaphors and figure of speech are used.

                                    After evaluating the conversation based on the criteria above, provide your score as an integer between 1 and 5. Only answer with a single character in the following value {{1, 2, 3, 4, 5}}.
                                    Don't provide explanations, only the single integer value.

                                    Text to evaluate: 
                                    {text}

                                    Scoring Output:
                                    """}
    ]

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=messages,
        max_tokens=1,
        n=1,
        stop=None,
        temperature=0
    )

    raw_score = response.choices[0].message.content

    # The reply is free-form model output: it may be None or a non-numeric
    # token. Report that clearly instead of failing with an opaque error.
    try:
        score = int(raw_score)
    except (TypeError, ValueError) as err:
        raise ValueError(
            f"Judge model returned a non-integer score: {raw_score!r}"
        ) from err

    # Guard the documented 1-5 scale so bad values never reach the averages
    # or the plot.
    if not 1 <= score <= 5:
        raise ValueError(f"Judge model returned an out-of-range score: {score}")

    return score

def create_plot(scores_A, scores_B):
    """Draw side-by-side box plots of the two score samples.

    Args:
        scores_A: Scores collected for the benchmark template.
        scores_B: Scores collected for the custom template.

    Returns:
        The matplotlib figure containing the plot.
    """
    fig, ax = plt.subplots()

    # Fixed y-range for the score axis.
    ax.set_ylabel('Human-like score')
    ax.set_ylim([0, 5])

    boxes = ax.boxplot(
        [scores_A, scores_B],
        tick_labels=['Benchmark', 'Custom'],
        patch_artist=True,  # required so the boxes can be filled with color
    )['boxes']

    # One fill color per group, in the same order as the data above.
    for box, fill in zip(boxes, ('#2DD4BF', '#F43F5E')):
        box.set_facecolor(fill)

    return fig

def run_experiment(questions, custom_template):
    """Run the A/B experiment over ``questions``.

    Each question is randomly assigned (50/50) to exactly one group, answered
    with that group's template, and then scored by ``analyze_response``.

    Args:
        questions: Iterable of question strings.
        custom_template: System prompt used for group "B".

    Returns:
        A tuple ``(scores_A, scores_B, all_responses, fig)`` where
        ``all_responses`` is a list of dicts with the keys ``question``,
        ``group``, ``response`` and ``score``, and ``fig`` is the figure
        returned by ``create_plot``.
    """
    scores_by_group = {"A": [], "B": []}
    display_name = {"A": "Benchmark", "B": "Custom"}
    log = []

    for question in questions:
        # Coin flip decides which template answers this question.
        group = "B" if not random.random() < 0.5 else "A"

        answer = run_agent(question, group, custom_template)
        rating = analyze_response(answer)

        scores_by_group[group].append(rating)
        log.append({
            "question": question,
            "group": display_name[group],
            "response": answer,
            "score": rating,
        })

    results_A = scores_by_group["A"]
    results_B = scores_by_group["B"]
    return results_A, results_B, log, create_plot(results_A, results_B)

def gradio_interface(questions, custom_template):
    """Gradio callback: run the experiment and format its outputs.

    Args:
        questions: Newline-separated questions; blank lines are ignored.
        custom_template: System prompt for the custom (group "B") variant.

    Returns:
        A tuple ``(fig, summary, detailed_results)``: the box-plot figure, a
        text summary with per-template averages and counts, and a text log
        with one entry per question.
    """
    # One question per non-empty line, surrounding whitespace removed.
    stripped_lines = (line.strip() for line in questions.split('\n'))
    question_list = [line for line in stripped_lines if line]

    scores_A, scores_B, responses, fig = run_experiment(question_list, custom_template)

    # Per-question log; entries are separated by a 50-dash rule.
    rule = "-" * 50 + "\n"
    detailed_results = "".join(
        f"Question: {entry['question']}\n"
        f"Template: {entry['group']}\n"
        f"Response: {entry['response']}\n"
        f"Score: {entry['score']}\n"
        + rule
        for entry in responses
    )

    def mean_or_zero(values):
        # An empty group reports 0 rather than dividing by zero.
        if not values:
            return 0
        return sum(values) / len(values)

    avg_A = mean_or_zero(scores_A)
    avg_B = mean_or_zero(scores_B)

    summary = f"""
    Summary:
    Benchmark Template - Average Score: {avg_A:.2f}
    Custom Template - Average Score: {avg_B:.2f}
    
    Number of responses:
    Benchmark Template: {len(scores_A)}
    Custom Template: {len(scores_B)}
    """

    return fig, summary, detailed_results

# Create the Gradio interface: two text inputs (questions and the custom
# system prompt) wired to gradio_interface(), which returns a plot, a summary
# string and a detailed per-question log, in that order.
iface = gr.Interface(
    fn=gradio_interface,
    inputs=[
        # One question per line; gradio_interface() splits on newlines.
        gr.Textbox(
            lines=5,
            placeholder="Enter questions (one per line)...",
            label="Questions"
        ),
        # Custom system prompt compared against the benchmark template.
        gr.Textbox(
            lines=3,
            placeholder="Enter your custom template prompt design...",
            label="Check How Human Your Template Prompt (different GPTs could have different scores)",
            value="You are a helpful assistant that always answers questions. Keep it short."
        )
    ],
    # Output order must match the tuple returned by gradio_interface().
    outputs=[
        gr.Plot(label="Results Visualization"),
        gr.Textbox(label="Summary", lines=6),
        gr.Textbox(label="Detailed Results", lines=10)
    ],
    title="A/B Testing Prompt Template Design Analysis",
    description="Compare prompt template design of your chatbot against a benchmark human-like template design and analyze your chatbot human-likeness scores.",
    # A single example row: [questions, custom template].
    examples=[
        ["What should I do when I feel sad?\nWhat do you think about falling in love?\nWhat do you think about getting divorced?\nWhat should I do when I feel happy?",
         "You are a helpful assistant that always answers questions. Keep it short and professional."]
    ]
)

# Launch the app only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()