Spaces:
Running
Running
| import gradio as gr | |
| import numpy as np | |
| import matplotlib.pyplot as plt | |
| from openai import OpenAI | |
| import random | |
| import time | |
| from dotenv import load_dotenv | |
| import os | |
# Read settings from a local .env file into the process environment
# before any client is constructed.
load_dotenv()

# Single module-level OpenAI client shared by every request below.
# NOTE(review): presumably picks up its API key from the environment
# populated above -- confirm against the deployment configuration.
client = OpenAI()
# Human-readable names for the two prompt variants under comparison.
PROMPT_A = "Benchmark Human-like Template"
PROMPT_B = "Custom Template"

# Benchmark (group A) conversation template. The user message carries a
# "{question}" placeholder that is filled in by format_messages().
template_messages_A = [
    dict(
        role="system",
        content=(
            "You are a helpful assistant that always answers questions. "
            "Keep it short. "
            "Answer like you are a real human. "
            "For example, you can use emotions, metaphors and proverbs. "
            "Try to always be positive, and help the user with their "
            "questions, doubts and problems. "
            "Don't be pessimistic."
        ),
    ),
    dict(role="user", content="{question}"),
]
def format_messages(template, question):
    """Return a copy of *template* with ``{question}`` filled in.

    Args:
        template: Iterable of chat messages, each a dict with ``"role"``
            and ``"content"`` keys.
        question: Text substituted for every ``{question}`` placeholder.

    Returns:
        A new list of ``{"role", "content"}`` dicts; *template* itself is
        not modified.

    Only the exact ``{question}`` placeholder is replaced. Plain
    substitution is used instead of ``str.format`` because message
    content can be user-supplied (the custom system prompt) and may
    contain literal braces, e.g. a JSON example, which ``str.format``
    would reject with KeyError/ValueError.
    """
    replacement = str(question)
    return [
        {
            "role": msg["role"],
            "content": msg["content"].replace("{question}", replacement),
        }
        for msg in template
    ]
def run_agent(question: str, group: str, custom_template: str):
    """Answer *question* with GPT-4o using the prompt template of *group*.

    Group ``"A"`` uses the benchmark template; any other value uses a
    template whose system prompt is *custom_template*. Returns the text
    content of the first completion choice.
    """
    if group == "A":
        template = template_messages_A
    else:
        # Group B: same conversation shape, caller-provided system prompt.
        template = [
            {"role": "system", "content": custom_template},
            {"role": "user", "content": "{question}"},
        ]

    chat_messages = format_messages(template, question)

    # Query the model and hand back only the reply text.
    reply = client.chat.completions.create(
        model="gpt-4o",
        messages=chat_messages,
    )
    return reply.choices[0].message.content
def analyze_response(text):
    """Ask GPT-4o to rate how human-like *text* is on a 1-5 scale.

    Args:
        text: The agent response to evaluate.

    Returns:
        The score as an int, clamped to the documented 1..5 range.

    Raises:
        ValueError: If the judge model's reply is empty or not an integer.
    """
    # Literal braces in the prompt are doubled ({{ }}) so the f-string
    # emits "{1, 2, 3, 4, 5}" verbatim instead of evaluating it as a
    # tuple expression; only {text} is interpolated.
    messages = [
        {"role": "system", "content": "You are trained to analyze and detect the sentiment of given text."},
        {"role": "user", "content": f"""Analyze the following recommendation and determine if the output is human-like. Check if there are emotions used, and metaphors and figure of speech.
Assign a score: Based on your evaluation assign a score to the agent's performans using the following scale:
- 1 (Poor): The agent is very machine like, doesn't use emotions, methaphors and figure of speech.
- 2 (Fair): The agent is some human-likeness, some emotions, methaphors and figure of speech are used
- 3 (Good): The agent is is human-like, uses enough emotions, methaphors and figure of speech.
- 4 (Very Good): The agent very human-like, uses multiple emotions, methaphors and figure of speech.
- 5 (Excellent): You almost cannot distinguish between the machine and the human, a lot emotions, methaphors and figure of speech are used.
After evaluating the conversation based on the criteria above, provide your score as an integer between 1 and 5. Only answer with a single character in the following value {{1, 2, 3, 4, 5}}.
Don't provide explanations, only the single integer value.
Text to evaluate:
{text}
Scoring Output:
"""},
    ]
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=messages,
        max_tokens=1,
        n=1,
        stop=None,
        temperature=0,
    )

    raw = response.choices[0].message.content
    try:
        # `raw` may be None or non-numeric; surface that as one clear error.
        score = int((raw or "").strip())
    except ValueError:
        raise ValueError(
            f"Judge model returned a non-numeric score: {raw!r}"
        ) from None

    # Keep out-of-scale replies inside the range the plot and averages assume.
    return min(5, max(1, score))
def create_plot(scores_A, scores_B):
    """Draw side-by-side box plots of the two groups' scores.

    Args:
        scores_A: Scores collected for the benchmark template.
        scores_B: Scores collected for the custom template.

    Returns:
        The Matplotlib figure containing the plot.
    """
    labels = ['Benchmark', 'Custom']
    colors = ['#2DD4BF', '#F43F5E']

    fig, ax = plt.subplots()
    ax.set_ylabel('Human-like score')
    ax.set_ylim([0, 5])

    # NOTE(review): `tick_labels` is the newer spelling of boxplot's
    # `labels` keyword -- confirm the deployed Matplotlib supports it.
    bplot = ax.boxplot([scores_A, scores_B],
                       patch_artist=True,
                       tick_labels=labels)
    for patch, color in zip(bplot['boxes'], colors):
        patch.set_facecolor(color)

    # Unregister the figure from pyplot's global state so repeated requests
    # in a long-running server do not accumulate open figures. The Figure
    # object itself stays valid for the caller to render.
    plt.close(fig)
    return fig
def run_experiment(questions, custom_template):
    """Run the A/B experiment over *questions*.

    Each question is randomly assigned to the benchmark (A) or custom (B)
    template, answered by run_agent(), and scored by analyze_response().

    Returns:
        A tuple ``(results_A, results_B, all_responses, fig)``: the score
        lists per group, one record dict per question, and the figure
        produced by create_plot().
    """
    records = []
    for prompt in questions:
        # Coin flip decides which template answers this question.
        if random.random() < 0.5:
            arm = "A"
        else:
            arm = "B"

        answer = run_agent(prompt, arm, custom_template)
        rating = analyze_response(answer)

        records.append({
            "question": prompt,
            "group": "Benchmark" if arm == "A" else "Custom",
            "response": answer,
            "score": rating,
        })

    # Split the scores per group, preserving question order.
    benchmark_scores = [r["score"] for r in records if r["group"] == "Benchmark"]
    custom_scores = [r["score"] for r in records if r["group"] == "Custom"]

    figure = create_plot(benchmark_scores, custom_scores)
    return benchmark_scores, custom_scores, records, figure
def gradio_interface(questions, custom_template):
    """Gradio callback: run the experiment and format its results.

    Args:
        questions: Newline-separated questions; blank lines are ignored.
        custom_template: System prompt used for the custom (B) group.

    Returns:
        A tuple ``(fig, summary, detailed_results)`` matching the three
        output components of the interface.
    """
    # One question per non-empty line, surrounding whitespace removed.
    stripped_lines = (line.strip() for line in questions.split('\n'))
    question_list = [line for line in stripped_lines if line]

    scores_A, scores_B, responses, fig = run_experiment(question_list, custom_template)

    # Per-question report, each entry followed by a 50-dash separator.
    separator = "-" * 50 + "\n"
    detailed_results = "".join(
        f"Question: {entry['question']}\n"
        f"Template: {entry['group']}\n"
        f"Response: {entry['response']}\n"
        f"Score: {entry['score']}\n"
        + separator
        for entry in responses
    )

    def mean_or_zero(values):
        # A group that received no questions reports an average of 0.
        if not values:
            return 0
        return sum(values) / len(values)

    avg_A = mean_or_zero(scores_A)
    avg_B = mean_or_zero(scores_B)

    summary = (
        "\n"
        "Summary:\n"
        f"Benchmark Template - Average Score: {avg_A:.2f}\n"
        f"Custom Template - Average Score: {avg_B:.2f}\n"
        "Number of responses:\n"
        f"Benchmark Template: {len(scores_A)}\n"
        f"Custom Template: {len(scores_B)}\n"
    )
    return fig, summary, detailed_results
# Gradio UI: two text inputs (questions, custom system prompt) wired to
# gradio_interface(), which yields a plot, a summary and a detailed report.
iface = gr.Interface(
    fn=gradio_interface,
    title="A/B Testing Prompt Template Design Analysis",
    description="Compare prompt template design of your chatbot against a benchmark human-like template design and analyze your chatbot human-likeness scores.",
    inputs=[
        gr.Textbox(
            label="Questions",
            placeholder="Enter questions (one per line)...",
            lines=5,
        ),
        gr.Textbox(
            label="Check How Human Your Template Prompt (different GPTs could have different scores)",
            value="You are a helpful assistant that always answers questions. Keep it short.",
            placeholder="Enter your custom template prompt design...",
            lines=3,
        ),
    ],
    outputs=[
        gr.Plot(label="Results Visualization"),
        gr.Textbox(label="Summary", lines=6),
        gr.Textbox(label="Detailed Results", lines=10),
    ],
    examples=[
        [
            # Four sample questions, one per line.
            "What should I do when I feel sad?\n"
            "What do you think about falling in love?\n"
            "What do you think about getting divorced?\n"
            "What should I do when I feel happy?",
            "You are a helpful assistant that always answers questions. Keep it short and professional.",
        ],
    ],
)

# Start the web app only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()