aibyml commited on
Commit
b92688d
·
verified ·
1 Parent(s): d6af622

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +209 -0
  2. requirements.txt +5 -0
app.py ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import numpy as np
3
+ import matplotlib.pyplot as plt
4
+ from openai import OpenAI
5
+ import random
6
+ import time
7
+ from dotenv import load_dotenv
8
+ import os
9
+
10
# Load environment variables from a local .env file (if present) before the
# OpenAI client is constructed.
load_dotenv()

# Initialize OpenAI client
# NOTE(review): presumably picks up OPENAI_API_KEY from the environment
# populated by load_dotenv() above — confirm deployment config.
client = OpenAI()

# Define benchmark prompt
# NOTE(review): PROMPT_A / PROMPT_B are never referenced elsewhere in this
# file — the UI uses the literals 'Benchmark' / 'Custom' instead.
PROMPT_A = "Benchmark Human-like Template"
PROMPT_B = "Custom Template"

# Group A ("benchmark") chat template: a system prompt asking for short,
# positive, human-sounding answers, plus a user slot holding the literal
# placeholder "{question}" that format_messages() fills in per question.
template_messages_A = [
{
"role": "system",
"content": "You are a helpful assistant that always answers questions. Keep it short. Answer like you are a real human. For example, you can use emotions, metaphors and proverbs. Try to always be positive, and help the user with their questions, doubts and problems. Don't be pessimistic."
},
{
"role": "user",
"content": "{question}"
}
]
30
+
31
def format_messages(template, question):
    """Fill the ``{question}`` placeholder in a chat message template.

    Args:
        template: List of ``{"role": ..., "content": ...}`` dicts; any
            content may contain the literal placeholder ``{question}``.
        question: Text substituted for every ``{question}`` occurrence.

    Returns:
        A new list of message dicts with the placeholder replaced.

    Note:
        BUG FIX: uses ``str.replace`` instead of ``str.format`` so that
        templates containing any other braces (JSON examples, user-typed
        ``{...}`` in the custom template box) no longer raise
        ``KeyError``/``IndexError``.
    """
    return [
        {
            "role": msg["role"],
            "content": msg["content"].replace("{question}", question),
        }
        for msg in template
    ]
39
+
40
def run_agent(question: str, group: str, custom_template: str):
    """Answer *question* with GPT-4o using the template selected by *group*.

    Group "A" uses the benchmark human-like template
    (``template_messages_A``); any other value wraps *custom_template* as
    the system prompt in the same two-message shape.

    Returns the model's reply text.
    """
    if group == "A":
        template = template_messages_A
    else:
        # Group B: the user-provided system prompt, same message shape as A.
        template = [
            {"role": "system", "content": custom_template},
            {"role": "user", "content": "{question}"},
        ]

    # Run GPT
    completion = client.chat.completions.create(
        model="gpt-4o",
        messages=format_messages(template, question),
    )
    return completion.choices[0].message.content
64
+
65
def analyze_response(text):
    """Score *text* for human-likeness on a 1-5 scale using a GPT-4o judge.

    Args:
        text: The agent reply to evaluate.

    Returns:
        int: Score in [1, 5].

    Raises:
        ValueError: If the judge's reply is not an integer in [1, 5].
    """
    messages = [
        {"role": "system", "content": "You are trained to analyze and detect the sentiment of given text."},
        # BUG FIX: "{1, 2, 3, 4, 5}" was unescaped inside the f-string, so
        # Python evaluated it as the tuple (1, 2, 3, 4, 5) and the prompt
        # rendered with parentheses; "{{...}}" keeps the literal braces.
        {"role": "user", "content": f"""Analyze the following recommendation and determine if the output is human-like. Check if there are emotions used, and metaphors and figure of speech.
Assign a score: Based on your evaluation assign a score to the agent's performans using the following scale:
- 1 (Poor): The agent is very machine like, doesn't use emotions, methaphors and figure of speech.
- 2 (Fair): The agent is some human-likeness, some emotions, methaphors and figure of speech are used
- 3 (Good): The agent is is human-like, uses enough emotions, methaphors and figure of speech.
- 4 (Very Good): The agent very human-like, uses multiple emotions, methaphors and figure of speech.
- 5 (Excellent): You almost cannot distinguish between the machine and the human, a lot emotions, methaphors and figure of speech are used.

After evaluating the conversation based on the criteria above, provide your score as an integer between 1 and 5. Only answer with a single character in the following value {{1, 2, 3, 4, 5}}.
Don't provide explanations, only the single integer value.

Text to evaluate:
{text}

Scoring Output:
"""}
    ]

    # max_tokens=1 / temperature=0 force a single, deterministic token reply.
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=messages,
        max_tokens=1,
        n=1,
        stop=None,
        temperature=0
    )

    # BUG FIX: int() on the raw content crashed on surrounding whitespace;
    # strip first and range-check so bad judge output fails loudly instead of
    # silently corrupting the experiment statistics.
    raw = response.choices[0].message.content.strip()
    score = int(raw)
    if not 1 <= score <= 5:
        raise ValueError(f"Judge returned an out-of-range score: {raw!r}")
    return score
96
+
97
+ def create_plot(scores_A, scores_B):
98
+ labels = ['Benchmark', 'Custom']
99
+ colors = ['#2DD4BF', '#F43F5E']
100
+
101
+ fig, ax = plt.subplots()
102
+ ax.set_ylabel('Human-like score')
103
+ ax.set_ylim([0, 5])
104
+
105
+ bplot = ax.boxplot([scores_A, scores_B],
106
+ patch_artist=True,
107
+ tick_labels=labels)
108
+
109
+ for patch, color in zip(bplot['boxes'], colors):
110
+ patch.set_facecolor(color)
111
+
112
+ return fig
113
+
114
def run_experiment(questions, custom_template):
    """Run the A/B test over *questions* and collect scores plus a plot.

    Each question is routed to the benchmark template ("A") or the custom
    one ("B") by a fair coin flip, answered by the agent, and the reply is
    scored for human-likeness.

    Returns:
        Tuple ``(scores_A, scores_B, all_responses, figure)`` where
        ``all_responses`` is a list of per-question dicts with the keys
        ``question``, ``group``, ``response`` and ``score``.
    """
    scores_by_group = {"A": [], "B": []}
    details = []

    for question in questions:
        # Fair coin flip decides which template handles this question.
        group = "A" if random.random() < 0.5 else "B"

        answer = run_agent(question, group, custom_template)
        score = analyze_response(answer)

        scores_by_group[group].append(score)
        details.append({
            "question": question,
            "group": "Benchmark" if group == "A" else "Custom",
            "response": answer,
            "score": score,
        })

    # Create visualization
    figure = create_plot(scores_by_group["A"], scores_by_group["B"])

    return scores_by_group["A"], scores_by_group["B"], details, figure
146
+
147
def gradio_interface(questions, custom_template):
    """Gradio callback: parse the question box, run the experiment, format output.

    Args:
        questions: Newline-separated questions from the input textbox.
        custom_template: System prompt used for the custom (group B) template.

    Returns:
        Tuple ``(figure, summary, detailed_results)`` matching the three
        Gradio output components (plot, summary textbox, details textbox).
    """
    # One question per non-blank line.
    question_list = [q.strip() for q in questions.split('\n') if q.strip()]

    # Run experiment
    scores_A, scores_B, responses, fig = run_experiment(question_list, custom_template)

    # IDIOM FIX: build the report with join() instead of repeated string
    # concatenation, which is quadratic in the number of responses.
    sections = []
    for r in responses:
        sections.append(
            f"Question: {r['question']}\n"
            f"Template: {r['group']}\n"
            f"Response: {r['response']}\n"
            f"Score: {r['score']}\n"
            + "-" * 50 + "\n"
        )
    detailed_results = "".join(sections)

    # Guard against an empty group (coin flips may assign nothing to A or B)
    # so the averages never divide by zero.
    avg_A = sum(scores_A) / len(scores_A) if scores_A else 0
    avg_B = sum(scores_B) / len(scores_B) if scores_B else 0

    summary = f"""
Summary:
Benchmark Template - Average Score: {avg_A:.2f}
Custom Template - Average Score: {avg_B:.2f}

Number of responses:
Benchmark Template: {len(scores_A)}
Custom Template: {len(scores_B)}
"""

    return fig, summary, detailed_results
178
+
179
# Create Gradio interface: two text inputs (questions, custom system prompt)
# mapped through gradio_interface() to a plot plus two text outputs.
iface = gr.Interface(
fn=gradio_interface,
inputs=[
# Questions to test, one per line.
gr.Textbox(
lines=5,
placeholder="Enter questions (one per line)...",
label="Questions"
),
# The custom (group B) system prompt; pre-filled with a plain,
# non-human-like template so the benchmark has something to beat.
gr.Textbox(
lines=3,
placeholder="Enter your custom template prompt design...",
label="Check How Human Your Template Prompt (different GPTs could have different scores)",
value="You are a helpful assistant that always answers questions. Keep it short."
)
],
outputs=[
gr.Plot(label="Results Visualization"),
gr.Textbox(label="Summary", lines=6),
gr.Textbox(label="Detailed Results", lines=10)
],
title="A/B Testing Prompt Template Design Analysis",
description="Compare prompt template design of your chatbot against a benchmark human-like template design and analyze your chatbot human-likeness scores.",
# Pre-baked example: four emotional questions plus a deliberately
# "professional" custom template.
examples=[
["What should I do when I feel sad?\nWhat do you think about falling in love?\nWhat do you think about getting divorced?\nWhat should I do when I feel happy?",
"You are a helpful assistant that always answers questions. Keep it short and professional."]
]
)
207
+
208
if __name__ == "__main__":
    # Start the Gradio server only when executed as a script (not on import).
    iface.launch()
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ gradio
2
+ openai
3
+ python-dotenv
4
+ matplotlib
5
+ numpy