"""A/B testing app: compare a user-supplied chat prompt template against a
benchmark "human-like" template, score each response with a GPT-based judge,
and visualize the score distributions in a Gradio UI."""

import gradio as gr
import numpy as np
import matplotlib.pyplot as plt
from openai import OpenAI
import random
import time
from dotenv import load_dotenv
import os

# Load environment variables (expects OPENAI_API_KEY in .env or the environment).
load_dotenv()

# Initialize OpenAI client (reads OPENAI_API_KEY from the environment).
client = OpenAI()

# Display labels for the two experiment arms (A = benchmark, B = custom).
PROMPT_A = "Benchmark Human-like Template"
PROMPT_B = "Custom Template"

# Benchmark "human-like" chat template; the "{question}" placeholder in the
# user message is substituted per question by format_messages().
template_messages_A = [
    {
        "role": "system",
        "content": "You are a helpful assistant that always answers questions. Keep it short. Answer like you are a real human. For example, you can use emotions, metaphors and proverbs. Try to always be positive, and help the user with their questions, doubts and problems. Don't be pessimistic."
    },
    {
        "role": "user",
        "content": "{question}"
    }
]


def format_messages(template, question):
    """Return a copy of *template* with the "{question}" placeholder filled in.

    Uses str.replace rather than str.format so that stray braces in a
    user-supplied system prompt (group B's custom template) cannot raise
    KeyError/IndexError/ValueError. Behavior is identical for templates
    whose only placeholder is "{question}".

    Args:
        template: list of {"role": ..., "content": ...} message dicts.
        question: text substituted for every "{question}" occurrence.

    Returns:
        A new list of message dicts; the input template is not mutated.
    """
    return [
        {
            "role": msg["role"],
            "content": msg["content"].replace("{question}", question),
        }
        for msg in template
    ]


def run_agent(question: str, group: str, custom_template: str):
    """Answer *question* with gpt-4o using the template for *group*.

    Args:
        question: the user's question.
        group: "A" for the benchmark template, anything else for the custom one.
        custom_template: system prompt text used when group is not "A".

    Returns:
        The assistant's reply text.
    """
    if group == "A":
        messages = format_messages(template_messages_A, question)
    else:
        # Group B: wrap the user-supplied system prompt in the same
        # two-message structure as the benchmark template.
        template_messages_B = [
            {"role": "system", "content": custom_template},
            {"role": "user", "content": "{question}"},
        ]
        messages = format_messages(template_messages_B, question)

    # Run GPT.
    completion = client.chat.completions.create(
        model="gpt-4o",
        messages=messages,
    )
    return completion.choices[0].message.content


def analyze_response(text):
    """Score *text* for human-likeness on a 1-5 scale using a GPT judge.

    The judge prompt instructs the model to reply with a single digit;
    max_tokens=1 and temperature=0 enforce a short deterministic answer.

    Returns:
        int score in 1..5.

    Raises:
        ValueError: if the model's reply is not a parseable integer.
    """
    # NOTE(review): the judge prompt contains typos ("performans",
    # "methaphors") kept verbatim so judged scores stay comparable with
    # earlier runs — confirm before correcting the prompt wording.
    # The {{1, 2, 3, 4, 5}} is escaped so the model sees the literal set
    # notation; the original unescaped form rendered as a Python tuple.
    messages = [
        {
            "role": "system",
            "content": "You are trained to analyze and detect the sentiment of given text.",
        },
        {
            "role": "user",
            "content": f"""Analyze the following recommendation and determine if the output is human-like. Check if there are emotions used, and metaphors and figure of speech.
Assign a score: Based on your evaluation assign a score to the agent's performans using the following scale:
- 1 (Poor): The agent is very machine like, doesn't use emotions, methaphors and figure of speech.
- 2 (Fair): The agent is some human-likeness, some emotions, methaphors and figure of speech are used
- 3 (Good): The agent is is human-like, uses enough emotions, methaphors and figure of speech.
- 4 (Very Good): The agent very human-like, uses multiple emotions, methaphors and figure of speech.
- 5 (Excellent): You almost cannot distinguish between the machine and the human, a lot emotions, methaphors and figure of speech are used.
After evaluating the conversation based on the criteria above, provide your score as an integer between 1 and 5. Only answer with a single character in the following value {{1, 2, 3, 4, 5}}. Don't provide explanations, only the single integer value.
Text to evaluate: {text}
Scoring Output: """,
        },
    ]
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=messages,
        max_tokens=1,
        n=1,
        stop=None,
        temperature=0,
    )
    # strip() guards against surrounding whitespace in the 1-token reply.
    return int(response.choices[0].message.content.strip())


def create_plot(scores_A, scores_B):
    """Build a box plot comparing benchmark vs. custom score distributions.

    Args:
        scores_A: list of int scores for the benchmark template.
        scores_B: list of int scores for the custom template.

    Returns:
        The matplotlib Figure (consumed by gr.Plot).
    """
    labels = ['Benchmark', 'Custom']
    colors = ['#2DD4BF', '#F43F5E']
    fig, ax = plt.subplots()
    ax.set_ylabel('Human-like score')
    ax.set_ylim([0, 5])
    # tick_labels requires matplotlib >= 3.9 (renamed from `labels`).
    bplot = ax.boxplot([scores_A, scores_B], patch_artist=True, tick_labels=labels)
    for patch, color in zip(bplot['boxes'], colors):
        patch.set_facecolor(color)
    return fig


def run_experiment(questions, custom_template):
    """Run the A/B experiment over *questions*.

    Each question is randomly assigned (50/50) to the benchmark template (A)
    or the custom template (B), answered by run_agent, and scored by
    analyze_response.

    Returns:
        (scores_A, scores_B, all_responses, fig) where all_responses is a
        list of per-question dicts and fig is the summary box plot.
    """
    results_A = []
    results_B = []
    all_responses = []

    for question in questions:
        # Randomly assign group.
        group = "A" if random.random() < 0.5 else "B"

        # Get response.
        response = run_agent(question, group, custom_template)

        # Analyze response.
        score = analyze_response(response)

        # Store results.
        if group == "A":
            results_A.append(score)
        else:
            results_B.append(score)

        all_responses.append({
            "question": question,
            "group": "Benchmark" if group == "A" else "Custom",
            "response": response,
            "score": score,
        })

    # Create visualization.
    fig = create_plot(results_A, results_B)
    return results_A, results_B, all_responses, fig


def gradio_interface(questions, custom_template):
    """Gradio callback: parse the question textbox, run the experiment,
    and format the three outputs (plot, summary text, detailed log).

    Args:
        questions: newline-separated questions from the UI textbox.
        custom_template: system prompt text for the custom (B) arm.

    Returns:
        (fig, summary, detailed_results) matching the Interface outputs.
    """
    # Split questions into a list, dropping blank lines.
    question_list = [q.strip() for q in questions.split('\n') if q.strip()]

    # Run experiment.
    scores_A, scores_B, responses, fig = run_experiment(question_list, custom_template)

    # Format detailed results.
    detailed_results = ""
    for r in responses:
        detailed_results += f"Question: {r['question']}\n"
        detailed_results += f"Template: {r['group']}\n"
        detailed_results += f"Response: {r['response']}\n"
        detailed_results += f"Score: {r['score']}\n"
        detailed_results += "-" * 50 + "\n"

    # Calculate averages; guard against an arm that received no questions.
    avg_A = sum(scores_A) / len(scores_A) if scores_A else 0
    avg_B = sum(scores_B) / len(scores_B) if scores_B else 0

    summary = f"""
Summary:
Benchmark Template - Average Score: {avg_A:.2f}
Custom Template - Average Score: {avg_B:.2f}

Number of responses:
Benchmark Template: {len(scores_A)}
Custom Template: {len(scores_B)}
"""
    return fig, summary, detailed_results


# Create Gradio interface.
iface = gr.Interface(
    fn=gradio_interface,
    inputs=[
        gr.Textbox(
            lines=5,
            placeholder="Enter questions (one per line)...",
            label="Questions"
        ),
        gr.Textbox(
            lines=3,
            placeholder="Enter your custom template prompt design...",
            label="Check How Human Your Template Prompt (different GPTs could have different scores)",
            value="You are a helpful assistant that always answers questions. Keep it short."
        )
    ],
    outputs=[
        gr.Plot(label="Results Visualization"),
        gr.Textbox(label="Summary", lines=6),
        gr.Textbox(label="Detailed Results", lines=10)
    ],
    title="A/B Testing Prompt Template Design Analysis",
    description="Compare prompt template design of your chatbot against a benchmark human-like template design and analyze your chatbot human-likeness scores.",
    examples=[
        [
            "What should I do when I feel sad?\nWhat do you think about falling in love?\nWhat do you think about getting divorced?\nWhat should I do when I feel happy?",
            "You are a helpful assistant that always answers questions. Keep it short and professional."
        ]
    ]
)

if __name__ == "__main__":
    iface.launch()