Spaces:

aibyml
/

Chatbot_Audit_Assistant

Running

App Files Files Community

aibyml committed on May 27, 2025

Commit

b92688d

verified ·

1 Parent(s): d6af622

Upload 2 files

Browse files

Files changed (2) hide show

app.py +209 -0
requirements.txt +5 -0

app.py ADDED Viewed

	@@ -0,0 +1,209 @@

+import gradio as gr
+import numpy as np
+import matplotlib.pyplot as plt
+from openai import OpenAI
+import random
+import time
+from dotenv import load_dotenv
+import os
+# Load environment variables (python-dotenv) before the OpenAI client is built.
+load_dotenv()
+# Shared OpenAI client used by run_agent() and analyze_response().
+# NOTE(review): presumably picks up its API key from the environment loaded
+# above — confirm against the deployment configuration.
+client = OpenAI()
+# Display names for the two prompt variants of the A/B test.
+# NOTE(review): neither constant is referenced in the visible code.
+PROMPT_A = "Benchmark Human-like Template"
+PROMPT_B = "Custom Template"
+# Benchmark (group "A") chat template: a system prompt asking for short,
+# positive, human-sounding answers, followed by a user message whose
+# "{question}" placeholder is filled in by format_messages().
+template_messages_A = [
+    {
+        "role": "system",
+        "content": "You are a helpful assistant that always answers questions. Keep it short. Answer like you are a real human. For example, you can use emotions, metaphors and proverbs. Try to always be positive, and help the user with their questions, doubts and problems. Don't be pessimistic."
+    },
+    {
+        "role": "user",
+        "content": "{question}"
+    }
+]
+def format_messages(template, question):
+    return [
+        {
+            "role": msg["role"],
+            "content": msg["content"].format(question=question)
+        }
+        for msg in template
+    ]
+def run_agent(question: str, group: str, custom_template: str):
+    if group == "A":
+        messages = format_messages(template_messages_A, question)
+    else:
+        # Use custom template for group B
+        template_messages_B = [
+            {
+                "role": "system",
+                "content": custom_template
+            },
+            {
+                "role": "user",
+                "content": "{question}"
+            }
+        ]
+        messages = format_messages(template_messages_B, question)
+    # Run GPT
+    completion = client.chat.completions.create(
+        model="gpt-4o",
+        messages=messages
+    )
+    return completion.choices[0].message.content
+def analyze_response(text):
+    messages = [
+        {"role": "system", "content": "You are trained to analyze and detect the sentiment of given text."},
+        {"role": "user", "content": f"""Analyze the following recommendation and determine if the output is human-like. Check if there are emotions used, and metaphors and figure of speech.
+                                    Assign a score: Based on your evaluation assign a score to the agent's performans using the following scale:
+                                    - 1 (Poor): The agent is very machine like, doesn't use emotions, methaphors and figure of speech.
+                                    - 2 (Fair): The agent is some human-likeness, some emotions, methaphors and figure of speech are used
+                                    - 3 (Good): The agent is is human-like, uses enough emotions, methaphors and figure of speech.
+                                    - 4 (Very Good): The agent very human-like, uses multiple emotions, methaphors and figure of speech.
+                                    - 5 (Excellent): You almost cannot distinguish between the machine and the human, a lot emotions, methaphors and figure of speech are used.
+                                    After evaluating the conversation based on the criteria above, provide your score as an integer between 1 and 5. Only answer with a single character in the following value {1, 2, 3, 4, 5}.
+                                    Don't provide explanations, only the single integer value.
+                                    Text to evaluate:
+                                    {text}
+                                    Scoring Output:
+                                    """}
+    ]
+    response = client.chat.completions.create(
+        model="gpt-4o",
+        messages=messages,
+        max_tokens=1,
+        n=1,
+        stop=None,
+        temperature=0
+    )
+    return int(response.choices[0].message.content)
+def create_plot(scores_A, scores_B):
+    labels = ['Benchmark', 'Custom']
+    colors = ['#2DD4BF', '#F43F5E']
+    fig, ax = plt.subplots()
+    ax.set_ylabel('Human-like score')
+    ax.set_ylim([0, 5])
+    bplot = ax.boxplot([scores_A, scores_B],
+                       patch_artist=True,
+                       tick_labels=labels)
+    for patch, color in zip(bplot['boxes'], colors):
+        patch.set_facecolor(color)
+    return fig
+def run_experiment(questions, custom_template):
+    results_A = []
+    results_B = []
+    all_responses = []
+    for question in questions:
+        # Randomly assign group
+        group = "A" if random.random() < 0.5 else "B"
+        # Get response
+        response = run_agent(question, group, custom_template)
+        # Analyze response
+        score = analyze_response(response)
+        # Store results
+        if group == "A":
+            results_A.append(score)
+        else:
+            results_B.append(score)
+        all_responses.append({
+            "question": question,
+            "group": "Benchmark" if group == "A" else "Custom",
+            "response": response,
+            "score": score
+        })
+    # Create visualization
+    fig = create_plot(results_A, results_B)
+    return results_A, results_B, all_responses, fig
+def gradio_interface(questions, custom_template):
+    # Split questions into list
+    question_list = [q.strip() for q in questions.split('\n') if q.strip()]
+    # Run experiment
+    scores_A, scores_B, responses, fig = run_experiment(question_list, custom_template)
+    # Format detailed results
+    detailed_results = ""
+    for r in responses:
+        detailed_results += f"Question: {r['question']}\n"
+        detailed_results += f"Template: {r['group']}\n"
+        detailed_results += f"Response: {r['response']}\n"
+        detailed_results += f"Score: {r['score']}\n"
+        detailed_results += "-" * 50 + "\n"
+    # Calculate averages
+    avg_A = sum(scores_A) / len(scores_A) if scores_A else 0
+    avg_B = sum(scores_B) / len(scores_B) if scores_B else 0
+    summary = f"""
+    Summary:
+    Benchmark Template - Average Score: {avg_A:.2f}
+    Custom Template - Average Score: {avg_B:.2f}
+    Number of responses:
+    Benchmark Template: {len(scores_A)}
+    Custom Template: {len(scores_B)}
+    """
+    return fig, summary, detailed_results
+# Create the Gradio interface: two text inputs (questions and the custom
+# system prompt) are passed to gradio_interface(), whose three return values
+# fill the plot, summary and detailed-results outputs in that order.
+iface = gr.Interface(
+    fn=gradio_interface,
+    inputs=[
+        # Questions, one per line; gradio_interface() splits on newlines.
+        gr.Textbox(
+            lines=5,
+            placeholder="Enter questions (one per line)...",
+            label="Questions"
+        ),
+        # Custom system prompt compared against the benchmark template.
+        gr.Textbox(
+            lines=3,
+            placeholder="Enter your custom template prompt design...",
+            label="Check How Human Your Template Prompt (different GPTs could have different scores)",
+            value="You are a helpful assistant that always answers questions. Keep it short."
+        )
+    ],
+    outputs=[
+        gr.Plot(label="Results Visualization"),
+        gr.Textbox(label="Summary", lines=6),
+        gr.Textbox(label="Detailed Results", lines=10)
+    ],
+    title="A/B Testing Prompt Template Design Analysis",
+    description="Compare prompt template design of your chatbot against a benchmark human-like template design and analyze your chatbot human-likeness scores.",
+    # One example row: [questions text, custom template text].
+    examples=[
+        ["What should I do when I feel sad?\nWhat do you think about falling in love?\nWhat do you think about getting divorced?\nWhat should I do when I feel happy?",
+         "You are a helpful assistant that always answers questions. Keep it short and professional."]
+    ]
+)
+# Start the app only when this file is executed as a script.
+if __name__ == "__main__":
+    iface.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+gradio
+openai
+python-dotenv
+matplotlib
+numpy