Spaces:
Running
Running
File size: 7,375 Bytes
b92688d | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 | import gradio as gr
import numpy as np
import matplotlib.pyplot as plt
from openai import OpenAI
import random
import time
from dotenv import load_dotenv
import os
# Load environment variables (via python-dotenv) before anything that may
# depend on them is created; this call has to stay ahead of the client below.
load_dotenv()
# Initialize the single OpenAI client shared by run_agent and analyze_response.
# NOTE(review): presumably the API key comes from the environment loaded
# above (e.g. OPENAI_API_KEY) — confirm against the deployment config.
client = OpenAI()
# Labels for the two prompt variants under comparison.
PROMPT_A = "Benchmark Human-like Template"
PROMPT_B = "Custom Template"

# Benchmark (group A) chat template: a fixed human-like system prompt plus a
# user message holding the "{question}" placeholder that is filled in later.
template_messages_A = [
    dict(
        role="system",
        content=(
            "You are a helpful assistant that always answers questions. "
            "Keep it short. "
            "Answer like you are a real human. "
            "For example, you can use emotions, metaphors and proverbs. "
            "Try to always be positive, and help the user with their "
            "questions, doubts and problems. "
            "Don't be pessimistic."
        ),
    ),
    dict(role="user", content="{question}"),
]
def format_messages(template, question):
    """Build a chat message list by filling the question into a template.

    Args:
        template: Iterable of message dicts, each with "role" and "content"
            keys.
        question: Value substituted for every literal "{question}"
            placeholder found in a message's content.

    Returns:
        A new list of {"role": ..., "content": ...} dicts. The template
        itself is not modified.
    """
    question_text = str(question)
    # Use a literal placeholder replacement rather than str.format: message
    # content can be user-supplied (the custom system prompt) and may contain
    # arbitrary braces such as a JSON example or "{name}", which str.format
    # rejects with KeyError / ValueError / IndexError.
    return [
        {
            "role": msg["role"],
            "content": msg["content"].replace("{question}", question_text),
        }
        for msg in template
    ]
def run_agent(question: str, group: str, custom_template: str):
    """Ask gpt-4o a question using the template that belongs to *group*.

    Group "A" uses the benchmark template (template_messages_A); any other
    group value uses a template whose system message is *custom_template*.

    Returns:
        The content of the first completion choice.
    """
    if group == "A":
        template = template_messages_A
    else:
        # Group B: same two-message shape, with the caller's system prompt.
        template = [
            {"role": "system", "content": custom_template},
            {"role": "user", "content": "{question}"},
        ]

    reply = client.chat.completions.create(
        messages=format_messages(template, question),
        model="gpt-4o",
    )
    return reply.choices[0].message.content
def analyze_response(text):
    """Score how human-like *text* reads, using gpt-4o as the judge.

    Args:
        text: The agent response to evaluate.

    Returns:
        The judge's score as an int from 1 to 5.

    Raises:
        ValueError: If the judge's reply is not one of "1".."5" (this
            includes an empty or missing reply).
    """
    messages = [
        {"role": "system", "content": "You are trained to analyze and detect the sentiment of given text."},
        # The braces around the allowed values are doubled on purpose: in an
        # f-string a single-braced "{1, 2, 3, 4, 5}" is evaluated as a tuple
        # expression and rendered as "(1, 2, 3, 4, 5)".
        {"role": "user", "content": f"""Analyze the following recommendation and determine if the output is human-like. Check if there are emotions used, and metaphors and figure of speech.
Assign a score: Based on your evaluation assign a score to the agent's performans using the following scale:
- 1 (Poor): The agent is very machine like, doesn't use emotions, methaphors and figure of speech.
- 2 (Fair): The agent is some human-likeness, some emotions, methaphors and figure of speech are used
- 3 (Good): The agent is is human-like, uses enough emotions, methaphors and figure of speech.
- 4 (Very Good): The agent very human-like, uses multiple emotions, methaphors and figure of speech.
- 5 (Excellent): You almost cannot distinguish between the machine and the human, a lot emotions, methaphors and figure of speech are used.
After evaluating the conversation based on the criteria above, provide your score as an integer between 1 and 5. Only answer with a single character in the following value {{1, 2, 3, 4, 5}}.
Don't provide explanations, only the single integer value.
Text to evaluate:
{text}
Scoring Output:
"""},
    ]
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=messages,
        max_tokens=1,  # a single token is enough for a one-digit score
        n=1,
        stop=None,
        temperature=0,
    )
    raw = response.choices[0].message.content
    # The reply is model-generated text, so validate it instead of trusting
    # int(): content may be None, non-numeric, or a number outside the scale.
    candidate = (raw or "").strip()
    if candidate not in {"1", "2", "3", "4", "5"}:
        raise ValueError(
            f"Expected a human-likeness score between 1 and 5, got {raw!r}"
        )
    return int(candidate)
def create_plot(scores_A, scores_B):
    """Return a figure with one box plot per group of scores.

    The first box (labelled "Benchmark") shows scores_A and the second
    (labelled "Custom") shows scores_B; the y-axis is fixed to 0..5.
    """
    figure, axes = plt.subplots()
    axes.set_ylim([0, 5])
    axes.set_ylabel('Human-like score')

    drawn = axes.boxplot(
        [scores_A, scores_B],
        tick_labels=['Benchmark', 'Custom'],
        patch_artist=True,  # boxes become patches so they can be filled
    )
    # One fill colour per box, in the same order as the data.
    for box, fill in zip(drawn['boxes'], ('#2DD4BF', '#F43F5E')):
        box.set_facecolor(fill)
    return figure
def run_experiment(questions, custom_template):
    """Run the A/B experiment over *questions*.

    Each question is assigned at random (50/50) to group A (benchmark
    template) or group B (custom template), answered by run_agent and scored
    by analyze_response.

    Returns:
        A tuple (scores for A, scores for B, per-question records, figure),
        where each record is a dict with "question", "group", "response" and
        "score" keys.
    """
    scores_by_group = {"A": [], "B": []}
    records = []

    for question in questions:
        group = "A" if random.random() < 0.5 else "B"
        answer = run_agent(question, group, custom_template)
        rating = analyze_response(answer)

        scores_by_group[group].append(rating)
        records.append({
            "question": question,
            "group": "Benchmark" if group == "A" else "Custom",
            "response": answer,
            "score": rating,
        })

    benchmark_scores = scores_by_group["A"]
    custom_scores = scores_by_group["B"]
    chart = create_plot(benchmark_scores, custom_scores)
    return benchmark_scores, custom_scores, records, chart
def gradio_interface(questions, custom_template):
    """Gradio callback: run the experiment and format its results.

    Args:
        questions: Newline-separated questions; blank lines are ignored.
        custom_template: System prompt used for the custom (group B) template.

    Returns:
        A tuple (figure, summary text, detailed per-question text).
    """
    # One question per non-empty line, surrounding whitespace removed.
    trimmed = (line.strip() for line in questions.split('\n'))
    question_list = [line for line in trimmed if line]

    scores_A, scores_B, responses, fig = run_experiment(question_list, custom_template)

    # One report section per response, each closed by a 50-dash divider.
    divider = "-" * 50 + "\n"
    detailed_results = "".join(
        f"Question: {r['question']}\n"
        f"Template: {r['group']}\n"
        f"Response: {r['response']}\n"
        f"Score: {r['score']}\n"
        + divider
        for r in responses
    )

    def _average(values):
        # An empty group reports 0 rather than dividing by zero.
        if not values:
            return 0
        return sum(values) / len(values)

    avg_A = _average(scores_A)
    avg_B = _average(scores_B)
    summary = f"""
Summary:
Benchmark Template - Average Score: {avg_A:.2f}
Custom Template - Average Score: {avg_B:.2f}
Number of responses:
Benchmark Template: {len(scores_A)}
Custom Template: {len(scores_B)}
"""
    return fig, summary, detailed_results
# Gradio UI wiring: two text inputs feed gradio_interface, which returns the
# plot, the summary and the detailed per-question report.
_inputs = [
    gr.Textbox(
        label="Questions",
        placeholder="Enter questions (one per line)...",
        lines=5,
    ),
    gr.Textbox(
        label="Check How Human Your Template Prompt (different GPTs could have different scores)",
        placeholder="Enter your custom template prompt design...",
        value="You are a helpful assistant that always answers questions. Keep it short.",
        lines=3,
    ),
]
_outputs = [
    gr.Plot(label="Results Visualization"),
    gr.Textbox(label="Summary", lines=6),
    gr.Textbox(label="Detailed Results", lines=10),
]
# One pre-filled example: four questions plus a custom system prompt.
_examples = [
    [
        "What should I do when I feel sad?\n"
        "What do you think about falling in love?\n"
        "What do you think about getting divorced?\n"
        "What should I do when I feel happy?",
        "You are a helpful assistant that always answers questions. Keep it short and professional.",
    ],
]

iface = gr.Interface(
    fn=gradio_interface,
    inputs=_inputs,
    outputs=_outputs,
    title="A/B Testing Prompt Template Design Analysis",
    description="Compare prompt template design of your chatbot against a benchmark human-like template design and analyze your chatbot human-likeness scores.",
    examples=_examples,
)

# Start the web app only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()
|