import gradio as gr
import numpy as np
import matplotlib.pyplot as plt
from openai import OpenAI
import random
import time
from dotenv import load_dotenv
import os
# Load environment variables
load_dotenv()
# Initialize OpenAI client
client = OpenAI()
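# The client reads OPENAI_API_KEY from the environment (loaded above via load_dotenv).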
# Define template names and the benchmark prompt
PROMPT_A = "Benchmark Human-like Template"
PROMPT_B = "Custom Template"
template_messages_A = [
    {
        "role": "system",
        "content": "You are a helpful assistant that always answers questions. Keep it short. Answer like you are a real human. For example, you can use emotions, metaphors and proverbs. Try to always be positive, and help the user with their questions, doubts and problems. Don't be pessimistic."
    },
    {
        "role": "user",
        "content": "{question}"
    }
]
def format_messages(template, question):
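    """Fill the {question} placeholder in each message of a prompt template."""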
    return [
        {
            "role": msg["role"],
            "content": msg["content"].format(question=question)
        }
        for msg in template
    ]
def run_agent(question: str, group: str, custom_template: str):
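    """Answer a question with GPT-4o using either the benchmark (A) or the custom (B) template."""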
    if group == "A":
        messages = format_messages(template_messages_A, question)
    else:
        # Use custom template for group B
        template_messages_B = [
            {
                "role": "system",
                "content": custom_template
            },
            {
                "role": "user",
                "content": "{question}"
            }
        ]
        messages = format_messages(template_messages_B, question)
    # Run GPT
    completion = client.chat.completions.create(
        model="gpt-4o",
        messages=messages
    )
    return completion.choices[0].message.content
def analyze_response(text):
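    """Ask GPT-4o to rate how human-like a response is and return the score as an int (1-5)."""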
    messages = [
        {"role": "system", "content": "You are trained to analyze and detect the sentiment of given text."},
        {"role": "user", "content": f"""Analyze the following response and determine if the output is human-like. Check whether emotions, metaphors and figures of speech are used.
Assign a score: based on your evaluation, rate the agent's performance using the following scale:
- 1 (Poor): The agent is very machine-like; it doesn't use emotions, metaphors or figures of speech.
- 2 (Fair): The agent shows some human-likeness; some emotions, metaphors and figures of speech are used.
- 3 (Good): The agent is human-like and uses enough emotions, metaphors and figures of speech.
- 4 (Very Good): The agent is very human-like and uses multiple emotions, metaphors and figures of speech.
- 5 (Excellent): You almost cannot distinguish the machine from a human; many emotions, metaphors and figures of speech are used.
After evaluating the conversation based on the criteria above, provide your score as an integer between 1 and 5. Only answer with a single character from the set {{1, 2, 3, 4, 5}}.
Don't provide explanations, only the single integer value.
Text to evaluate:
{text}
Scoring Output:
"""}
    ]
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=messages,
        max_tokens=1,
        n=1,
        stop=None,
        temperature=0
    )
    return int(response.choices[0].message.content.strip())
def create_plot(scores_A, scores_B):
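    """Draw side-by-side box plots of the benchmark and custom scores.

    Note: boxplot's `tick_labels` keyword requires Matplotlib >= 3.9; on older
    versions the equivalent keyword is `labels`.
    """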
    labels = ['Benchmark', 'Custom']
    colors = ['#2DD4BF', '#F43F5E']
    fig, ax = plt.subplots()
    ax.set_ylabel('Human-like score')
    ax.set_ylim([0, 5])
    bplot = ax.boxplot([scores_A, scores_B],
                       patch_artist=True,
                       tick_labels=labels)
    for patch, color in zip(bplot['boxes'], colors):
        patch.set_facecolor(color)
    return fig
def run_experiment(questions, custom_template):
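    """Randomly assign each question to the benchmark (A) or custom (B) template, then collect and score the responses."""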
    results_A = []
    results_B = []
    all_responses = []
    for question in questions:
        # Randomly assign group
        group = "A" if random.random() < 0.5 else "B"
        # Get response
        response = run_agent(question, group, custom_template)
        # Analyze response
        score = analyze_response(response)
        # Store results
        if group == "A":
            results_A.append(score)
        else:
            results_B.append(score)
        all_responses.append({
            "question": question,
            "group": "Benchmark" if group == "A" else "Custom",
            "response": response,
            "score": score
        })
    # Create visualization
    fig = create_plot(results_A, results_B)
    return results_A, results_B, all_responses, fig
def gradio_interface(questions, custom_template):
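    """Gradio callback: parse the question list, run the A/B experiment, and format the outputs."""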
    # Split questions into list
    question_list = [q.strip() for q in questions.split('\n') if q.strip()]
    # Run experiment
    scores_A, scores_B, responses, fig = run_experiment(question_list, custom_template)
    # Format detailed results
    detailed_results = ""
    for r in responses:
        detailed_results += f"Question: {r['question']}\n"
        detailed_results += f"Template: {r['group']}\n"
        detailed_results += f"Response: {r['response']}\n"
        detailed_results += f"Score: {r['score']}\n"
        detailed_results += "-" * 50 + "\n"
    # Calculate averages
    avg_A = sum(scores_A) / len(scores_A) if scores_A else 0
    avg_B = sum(scores_B) / len(scores_B) if scores_B else 0
    summary = f"""
Summary:
Benchmark Template - Average Score: {avg_A:.2f}
Custom Template - Average Score: {avg_B:.2f}
Number of responses:
Benchmark Template: {len(scores_A)}
Custom Template: {len(scores_B)}
"""
    return fig, summary, detailed_results
# Create Gradio interface
iface = gr.Interface(
    fn=gradio_interface,
    inputs=[
        gr.Textbox(
            lines=5,
            placeholder="Enter questions (one per line)...",
            label="Questions"
        ),
        gr.Textbox(
            lines=3,
            placeholder="Enter your custom template prompt design...",
            label="Check How Human-like Your Template Prompt Is (different GPT models may give different scores)",
            value="You are a helpful assistant that always answers questions. Keep it short."
        )
    ],
    outputs=[
        gr.Plot(label="Results Visualization"),
        gr.Textbox(label="Summary", lines=6),
        gr.Textbox(label="Detailed Results", lines=10)
    ],
    title="A/B Testing Prompt Template Design Analysis",
    description="Compare your chatbot's prompt template design against a benchmark human-like template and analyze your chatbot's human-likeness scores.",
    examples=[
        ["What should I do when I feel sad?\nWhat do you think about falling in love?\nWhat do you think about getting divorced?\nWhat should I do when I feel happy?",
         "You are a helpful assistant that always answers questions. Keep it short and professional."]
    ]
)
if __name__ == "__main__":
    iface.launch()