Spaces:

Fola-AI
/

ChatBot_with_evaluator

Sleeping

App Files Files Community

ChatBot_with_evaluator / app.py

Fola-AI

Upload app.py

ad9ebf7 verified 3 months ago

raw

history blame contribute delete

21.1 kB

	#!/usr/bin/env python
	# coding: utf-8

	# # Chatbot Program
	#
	# #### Chatbot with Evaluator - Hugging Face Deployment Ready
	# - Primary Agent: Google Gemini (called through Google's OpenAI-compatible endpoint using the OpenAI client)
	# - Evaluator: Groq Llama 3.3 70B
	# - Fast API-based inference (no local models)

	# In[ ]:


	# imports

	import os
	import gradio as gr
	from openai import OpenAI
	import time
	from typing import Tuple, Optional
	import json
	from dotenv import load_dotenv


	# In[ ]:


	load_dotenv(override=True)


	# In[ ]:


	# Read both provider keys from the environment once, at import time.
	GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
	GROQ_API_KEY = os.getenv("GROQ_API_KEY")

	# Report whether each key is present, echoing only a short prefix of it.
	# The f-string branch is evaluated only when the key is truthy.
	print(
	    f"Google API Key exists and begins {GOOGLE_API_KEY[:2]}"
	    if GOOGLE_API_KEY
	    else "Google API Key not set (and this is optional)"
	)
	print(
	    f"Groq API Key exists and begins {GROQ_API_KEY[:4]}"
	    if GROQ_API_KEY
	    else "Groq API Key not set (and this is optional)"
	)


	# In[ ]:


	# Agent (response generator) models offered in the UI, keyed by display name.
	# Each entry carries the provider model id, a short description and the
	# upper bound applied to the requested max_tokens.
	# Only "Gemini 1.5 flash" is enabled; "gemini-pro" and "gemini-1.5-pro"
	# entries were listed here previously but are switched off.
	AGENT_MODELS = {
	    "Gemini 1.5 flash": dict(
	        model="gemini-1.5-flash",
	        description="Fast Gemini model",
	        max_tokens=2048,
	    ),
	}

	# Evaluator models offered in the UI, keyed by display name.
	# Only "Llama 3.3 70B" is enabled; "llama-3.1-70b-versatile" and
	# "mixtral-8x7b-32768" entries were listed here previously but are switched off.
	EVALUATOR_MODELS = {
	    "Llama 3.3 70B": dict(
	        model="llama-3.3-70b-versatile",
	        description="Groq's Llama 3.3 70B - Fast & Powerful",
	    ),
	}


	# In[ ]:


	# ===========================
	# API Client Management Class
	# ===========================

	class APIClientManager:
	    """Hold the Gemini (agent) and Groq (evaluator) clients and call them.

	    Both providers are reached with the ``OpenAI`` client class pointed at a
	    provider-specific ``base_url``. Setup problems are appended to
	    ``self.errors`` instead of being raised. The two request methods catch
	    every exception and return a ``(text, status, seconds)`` tuple whose
	    ``status`` is the literal ``"Error"`` on failure.
	    """

	    # OpenAI-compatible endpoints of the two providers.
	    GEMINI_BASE_URL = "https://generativelanguage.googleapis.com/v1beta/openai/"
	    GROQ_BASE_URL = "https://api.groq.com/openai/v1"

	    def __init__(self):
	        """Start with no clients and no errors, then try to build both clients."""
	        self.gemini_client = None
	        self.groq_client = None
	        self.errors = []
	        self.initialize_clients()

	    def initialize_clients(self):
	        """Create a client for each provider whose API key is set.

	        Keys are read from ``GOOGLE_API_KEY`` and ``GROQ_API_KEY``. A missing
	        key or a constructor failure leaves the corresponding client
	        attribute untouched and records a message in ``self.errors``.
	        """
	        gemini = self._build_client("Gemini", "GOOGLE_API_KEY", self.GEMINI_BASE_URL)
	        if gemini is not None:
	            self.gemini_client = gemini

	        groq = self._build_client("Groq", "GROQ_API_KEY", self.GROQ_BASE_URL)
	        if groq is not None:
	            self.groq_client = groq

	    def _build_client(self, label: str, env_var: str, base_url: str):
	        """Return an ``OpenAI`` client for one provider, or ``None`` on failure.

	        Failures (missing key, constructor exception) are recorded in
	        ``self.errors`` with the same wording for both providers.
	        """
	        api_key = os.getenv(env_var)
	        if not api_key:
	            self.errors.append(f"{env_var} not found in environment variables")
	            return None
	        try:
	            client = OpenAI(api_key=api_key, base_url=base_url)
	        except Exception as e:
	            self.errors.append(f"{label} initialization error: {e}")
	            return None
	        print(f"✅ {label} API client initialized")
	        return client

	    @staticmethod
	    def _classify_verdict(evaluation: str) -> str:
	        """Return ``"acceptable"``, ``"unacceptable"`` or ``"unknown"``.

	        The first occurrence of the word decides, because the evaluator is
	        told to *start* with its verdict. "UNACCEPTABLE" contains
	        "ACCEPTABLE", so a plain substring test for "ACCEPTABLE" would
	        classify every rejection as an approval; the "UN" prefix is checked
	        explicitly instead.
	        """
	        text = evaluation.upper()
	        pos = text.find("ACCEPTABLE")
	        if pos == -1:
	            return "unknown"
	        return "unacceptable" if text[:pos].endswith("UN") else "acceptable"

	    def create_evaluator_prompt(self, user_input: str, agent_response: str) -> str:
	        """Build the user-role prompt sent to the evaluator model.

	        Args:
	            user_input: The question the user asked.
	            agent_response: The agent's answer to be judged.

	        Returns:
	            Instructions, the question/answer pair, and the required
	            four-point answer format, as one string.
	        """
	        evaluator_prompt = (
	            "You are an evaluator that decides whether a response to a question is acceptable. "
	            "You are provided with a conversation between a User and an Agent. "
	            "Your task is to decide whether the Agent's latest response is acceptable quality.\n\n"
	            f"User Question: {user_input}\n\n"
	            f"Agent Response: {agent_response}\n\n"
	            "With this context, please evaluate the latest response, replying with whether the response is acceptable and your feedback.\n\n"
	            "Format your evaluation as follows:\n"
	            "1. Start with either 'ACCEPTABLE ✅' or 'UNACCEPTABLE ❌'\n"
	            "2. Provide a brief quality score (1-10)\n"
	            "3. List 2-3 specific strengths or issues\n"
	            "4. Suggest one improvement if needed"
	        )
	        return evaluator_prompt

	    def generate_agent_response(
	        self,
	        user_input: str,
	        model_name: str = "Gemini 1.5 flash",
	        temperature: float = 0.7,
	        max_tokens: int = 500
	    ) -> Tuple[str, str, float]:
	        """Ask the agent model to answer ``user_input``.

	        Args:
	            user_input: The user's question.
	            model_name: Key into ``AGENT_MODELS``; unknown names fall back to
	                "Gemini 1.5 flash".
	            temperature: Sampling temperature passed to the API.
	            max_tokens: Requested completion length, capped at the model's
	                configured ``max_tokens``.

	        Returns:
	            ``(response_text, status, elapsed_seconds)``. On any failure the
	            text is a user-facing error message, ``status`` is ``"Error"``
	            and the elapsed time is ``0.0``.
	        """
	        if not self.gemini_client:
	            return "❌ Gemini API not initialized. Please set GOOGLE_API_KEY environment variable.", "Error", 0.0

	        try:
	            model_config = AGENT_MODELS.get(model_name, AGENT_MODELS["Gemini 1.5 flash"])
	            model_id = model_config["model"]

	            # Time only the API round trip.
	            start_time = time.time()
	            response = self.gemini_client.chat.completions.create(
	                model=model_id,
	                messages=[
	                    {"role": "system", "content": "You are a helpful AI assistant. Provide clear, accurate, and helpful responses."},
	                    {"role": "user", "content": user_input}
	                ],
	                temperature=temperature,
	                max_tokens=min(max_tokens, model_config["max_tokens"]),
	                top_p=0.9
	            )
	            elapsed_time = time.time() - start_time

	            # The message content can be None; hand callers a string regardless.
	            agent_response = response.choices[0].message.content or ""
	            status = f"✅ {model_name} responded in {elapsed_time:.2f}s"

	            return agent_response, status, elapsed_time

	        except Exception as e:
	            detail = str(e)
	            lowered = detail.lower()
	            error_msg = f"❌ Gemini API error: {detail}"
	            print(error_msg)

	            # Replace the raw error with a friendlier message for common causes.
	            if "API key" in detail:
	                error_msg = "❌ Invalid Google API key. Please check GOOGLE_API_KEY."
	            elif "quota" in lowered:
	                error_msg = "❌ API quota exceeded. Please try again later."
	            elif "model" in lowered:
	                error_msg = f"❌ Model '{model_name}' not available. Try another model."

	            return error_msg, "Error", 0.0

	    def evaluate_response(
	        self,
	        user_input: str,
	        agent_response: str,
	        evaluator_model: str = "Llama 3.3 70B",
	        temperature: float = 0.3
	    ) -> Tuple[str, str, float]:
	        """Ask the evaluator model to judge ``agent_response``.

	        Args:
	            user_input: The user's original question.
	            agent_response: The agent's answer under evaluation.
	            evaluator_model: Key into ``EVALUATOR_MODELS``; unknown names
	                fall back to "Llama 3.3 70B".
	            temperature: Sampling temperature passed to the API.

	        Returns:
	            ``(evaluation_text, status, elapsed_seconds)``. ``status`` states
	            the detected verdict; on any failure the text is a user-facing
	            error message, ``status`` is ``"Error"`` and the elapsed time is
	            ``0.0``.
	        """
	        if not self.groq_client:
	            return "❌ Groq API not initialized. Please set GROQ_API_KEY environment variable.", "Error", 0.0

	        try:
	            model_config = EVALUATOR_MODELS.get(evaluator_model, EVALUATOR_MODELS["Llama 3.3 70B"])
	            model_id = model_config["model"]

	            eval_prompt = self.create_evaluator_prompt(user_input, agent_response)

	            # Time only the API round trip.
	            start_time = time.time()
	            response = self.groq_client.chat.completions.create(
	                model=model_id,
	                messages=[
	                    {"role": "system", "content": "You are a critical evaluator. Be honest but constructive in your feedback."},
	                    {"role": "user", "content": eval_prompt}
	                ],
	                temperature=temperature,
	                max_tokens=300,
	                top_p=0.9
	            )
	            elapsed_time = time.time() - start_time

	            # The message content can be None; treat that as an empty evaluation.
	            evaluation = response.choices[0].message.content or ""

	            verdict = self._classify_verdict(evaluation)
	            if verdict == "unacceptable":
	                status = f"❌ Evaluation: Needs Improvement | {evaluator_model} ({elapsed_time:.2f}s)"
	            elif verdict == "acceptable":
	                status = f"✅ Evaluation: Acceptable | {evaluator_model} ({elapsed_time:.2f}s)"
	            else:
	                status = f"🔍 Evaluation Complete | {evaluator_model} ({elapsed_time:.2f}s)"

	            return evaluation, status, elapsed_time

	        except Exception as e:
	            detail = str(e)
	            lowered = detail.lower()
	            error_msg = f"❌ Groq API error: {detail}"
	            print(error_msg)

	            # Replace the raw error with a friendlier message for common causes.
	            if "API key" in detail:
	                error_msg = "❌ Invalid Groq API key. Please check GROQ_API_KEY."
	            elif "rate" in lowered:
	                error_msg = "❌ Rate limit exceeded. Please wait a moment and try again."
	            elif "model" in lowered:
	                error_msg = f"❌ Model '{evaluator_model}' not available."

	            return error_msg, "Error", 0.0


	# In[ ]:


	# ===========================
	# Initialize Global Client Manager
	# ===========================

	# Module-level manager used by process_with_evaluation() and create_interface().
	# Constructing it runs initialize_clients(), so the clients are built (and any
	# setup problems are recorded in api_manager.errors) as soon as this module is
	# imported.
	api_manager = APIClientManager()


	# In[ ]:


	# ===========================
	# Main Processing Function
	# ===========================

	def _evaluation_verdict(evaluation: str) -> str:
	    """Classify evaluator text as "acceptable", "unacceptable" or "unknown".

	    The first occurrence of the word decides. "UNACCEPTABLE" contains
	    "ACCEPTABLE", so the "UN" prefix has to be checked explicitly; a plain
	    substring test would report every rejection as an approval.
	    """
	    text = evaluation.upper()
	    pos = text.find("ACCEPTABLE")
	    if pos == -1:
	        return "unknown"
	    return "unacceptable" if text[:pos].endswith("UN") else "acceptable"


	def process_with_evaluation(
	    user_input: str,
	    agent_model: str,
	    evaluator_model: str,
	    temperature: float,
	    max_tokens: int,
	    enable_evaluation: bool
	) -> Tuple[str, str, str, str]:
	    """Run the user's question through the agent and, optionally, the evaluator.

	    Args:
	        user_input: The question typed by the user; ``None``, empty or
	            whitespace-only input is rejected without calling any API.
	        agent_model: Display name of the agent model.
	        evaluator_model: Display name of the evaluator model.
	        temperature: Sampling temperature for the agent (the evaluator
	            always runs at 0.3).
	        max_tokens: Requested maximum length of the agent's answer.
	        enable_evaluation: Whether to call the evaluator at all.

	    Returns:
	        ``(agent_response, evaluation, status_line, quality_summary)``, in
	        the order the Gradio outputs are wired.
	    """
	    if not user_input or not user_input.strip():
	        return "Please enter a message.", "", "No input provided", ""

	    # Step 1: generate the agent response.
	    agent_response, agent_status, agent_time = api_manager.generate_agent_response(
	        user_input,
	        agent_model,
	        temperature,
	        max_tokens
	    )

	    # Step 2 is skipped when evaluation is off or the agent call failed.
	    if not enable_evaluation or "Error" in agent_status:
	        evaluation = "Evaluation disabled or skipped due to error" if not enable_evaluation else "Skipped due to agent error"
	        return agent_response, evaluation, agent_status, "🔕 No evaluation performed"

	    # Step 2: evaluate the response.
	    evaluation, eval_status, eval_time = api_manager.evaluate_response(
	        user_input,
	        agent_response,
	        evaluator_model,
	        temperature=0.3  # Lower temp for evaluation
	    )

	    total_time = agent_time + eval_time
	    combined_status = f"Agent: {agent_model} ({agent_time:.2f}s) | Evaluator: {evaluator_model} ({eval_time:.2f}s) | Total: {total_time:.2f}s"

	    if eval_status == "Error":
	        # The evaluator call failed; `evaluation` holds its error message,
	        # so do not present it as a completed evaluation.
	        eval_summary = "⚠️ Evaluation failed"
	    else:
	        verdict = _evaluation_verdict(evaluation or "")
	        if verdict == "unacceptable":
	            eval_summary = "❌ Response Quality: NEEDS IMPROVEMENT"
	        elif verdict == "acceptable":
	            eval_summary = "✅ Response Quality: ACCEPTABLE"
	        else:
	            eval_summary = "🔍 Evaluation Complete"

	    return agent_response, evaluation, combined_status, eval_summary


	# In[ ]:


	# ===========================
	# Gradio Interface
	# ===========================

	def create_interface():
	    """Build and return the Gradio ``Blocks`` app (not yet launched).

	    Layout: a header, an API status section driven by ``api_manager.errors``,
	    a left column with model selection, the question box, generation
	    settings and action buttons, a right column with the quality indicator,
	    agent response, evaluation and timing outputs, then example questions
	    and an explanatory accordion.

	    The generate button and pressing Enter in the question box both run
	    ``process_with_evaluation``. "Clear All" empties the question and every
	    output field, including the quality indicator.
	    """

	    css = """
	.gradio-container { max-width: 1400px !important; margin: auto; }
	.response-box { background: #f0f9ff; border-left: 4px solid #3b82f6; padding: 12px; border-radius: 8px; }
	.evaluation-box { background: #fef3c7; border-left: 4px solid #f59e0b; padding: 12px; border-radius: 8px; }
	.status-box { font-family: monospace; font-size: 12px; color: #6b7280; }
	.error-box { background: #fee2e2; border-left: 4px solid #ef4444; padding: 12px; border-radius: 8px; }
	.success-indicator { color: #10b981; font-weight: bold; }
	.warning-indicator { color: #f59e0b; font-weight: bold; }
	"""

	    with gr.Blocks(
	        title="AI Chatbot with Cross-Model Evaluator",
	        theme=gr.themes.Soft(),
	        css=css
	    ) as demo:

	        # Header
	        gr.Markdown("""
	# 🤖 AI Chatbot with Cross-Model Evaluator
	### Agent: Google Gemini 1.5 flash | Evaluator: Groq Llama 3.3 70B

	This system uses two different AI models:
	1. Gemini generates responses to your questions
	2. Llama 70B evaluates the quality of those responses
	""")

	        # API status: list setup problems collected by the client manager.
	        if api_manager.errors:
	            with gr.Group():
	                gr.Markdown("### ⚠️ Setup Issues:")
	                for error in api_manager.errors:
	                    gr.Markdown(f"- {error}")
	                gr.Markdown("""
	To fix:
	```bash
	export GOOGLE_API_KEY="your-google-api-key"
	export GROQ_API_KEY="your-groq-api-key"
	```
	Get keys from:
	- [Google AI Studio](https://makersuite.google.com/app/apikey)
	- [Groq Console](https://console.groq.com/keys)
	""")
	        else:
	            gr.Markdown("✅ All API clients initialized successfully")

	        with gr.Row():
	            # Left column - input controls
	            with gr.Column(scale=2):
	                # Model selection
	                with gr.Group():
	                    gr.Markdown("### 🎯 Model Selection")
	                    agent_model = gr.Dropdown(
	                        choices=list(AGENT_MODELS.keys()),
	                        value="Gemini 1.5 flash",
	                        label="Agent Model (Response Generator)",
	                        info="Google Gemini model for generating responses"
	                    )

	                    evaluator_model = gr.Dropdown(
	                        choices=list(EVALUATOR_MODELS.keys()),
	                        value="Llama 3.3 70B",
	                        label="Evaluator Model",
	                        info="Groq model for evaluating response quality"
	                    )

	                # User input
	                user_input = gr.Textbox(
	                    lines=4,
	                    placeholder="Ask me anything... For example: 'Explain quantum computing in simple terms'",
	                    label="💬 Your Question",
	                    max_lines=8
	                )

	                # Settings
	                with gr.Group():
	                    gr.Markdown("### ⚙️ Generation Settings")
	                    with gr.Row():
	                        temperature = gr.Slider(
	                            minimum=0.1,
	                            maximum=1.0,
	                            value=0.7,
	                            step=0.1,
	                            label="Temperature (Creativity)",
	                            info="Higher = more creative, Lower = more focused"
	                        )
	                        max_tokens = gr.Slider(
	                            minimum=50,
	                            maximum=1000,
	                            value=500,
	                            step=50,
	                            label="Max Tokens",
	                            info="Maximum response length"
	                        )

	                    enable_evaluation = gr.Checkbox(
	                        value=True,
	                        label="🔍 Enable Cross-Model Evaluation",
	                        info="Let Llama 70B evaluate Gemini's response"
	                    )

	                # Action buttons
	                with gr.Row():
	                    generate_btn = gr.Button(
	                        "🚀 Generate & Evaluate",
	                        variant="primary",
	                        size="lg"
	                    )
	                    clear_btn = gr.Button("🗑️ Clear All", size="lg")

	            # Right column - outputs
	            with gr.Column(scale=3):
	                # Quality indicator
	                quality_indicator = gr.Textbox(
	                    label="📊 Response Quality",
	                    interactive=False,
	                    lines=1
	                )

	                # Agent response
	                with gr.Group():
	                    gr.Markdown("### 🤖 Agent Response")
	                    agent_output = gr.Textbox(
	                        lines=10,
	                        label="Gemini's Response",
	                        show_copy_button=True,
	                        interactive=False,
	                        elem_classes=["response-box"]
	                    )

	                # Evaluation
	                with gr.Group():
	                    gr.Markdown("### 🔍 Evaluation Result")
	                    evaluation_output = gr.Textbox(
	                        lines=8,
	                        label="Llama's Evaluation",
	                        show_copy_button=True,
	                        interactive=False,
	                        elem_classes=["evaluation-box"]
	                    )

	                # Status
	                status_output = gr.Textbox(
	                    lines=2,
	                    label="⏱️ Performance Metrics",
	                    interactive=False,
	                    elem_classes=["status-box"]
	                )

	        # Examples
	        with gr.Row():
	            gr.Examples(
	                examples=[
	                    ["What is the difference between machine learning and deep learning?"],
	                    ["Write a Python function to calculate the factorial of a number"],
	                    ["Explain the theory of relativity in simple terms"],
	                    ["What are the main causes of climate change?"],
	                    ["How does blockchain technology work?"],
	                    ["What are the benefits and risks of artificial intelligence?"]
	                ],
	                inputs=user_input,
	                label="💡 Example Questions"
	            )

	        # How it works
	        with gr.Accordion("ℹ️ How Cross-Model Evaluation Works", open=False):
	            gr.Markdown("""
	### The Two-Stage Process:

	1. Response Generation (Gemini)
	- Receives your question
	- Generates a comprehensive response
	- Optimized for helpfulness and accuracy

	2. Quality Evaluation (Llama 70B)
	- Analyzes the response for:
	- Accuracy and completeness
	- Clarity and coherence
	- Potential issues or biases
	- Provides feedback and improvement suggestions

	### Benefits:
	- ✅ Quality Assurance: Second model checks for errors
	- ✅ Bias Detection: Different model perspectives
	- ✅ Improvement Insights: Specific feedback on responses
	- ✅ Fast Processing: API-based, no local model loading

	### API Requirements:
	- Google API Key for Gemini (free tier available)
	- Groq API Key for Llama (free tier available)
	""")

	        # Event handlers. The button and the Enter key share one wiring so
	        # the two cannot drift apart.
	        run_inputs = [user_input, agent_model, evaluator_model, temperature, max_tokens, enable_evaluation]
	        run_outputs = [agent_output, evaluation_output, status_output, quality_indicator]

	        generate_btn.click(
	            fn=process_with_evaluation,
	            inputs=run_inputs,
	            outputs=run_outputs
	        )

	        # Clear the question and every output, including the quality
	        # indicator, so no stale verdict stays on screen.
	        cleared_fields = [user_input, agent_output, evaluation_output, status_output, quality_indicator]
	        clear_btn.click(
	            fn=lambda: ("",) * len(cleared_fields),
	            outputs=cleared_fields
	        )

	        user_input.submit(
	            fn=process_with_evaluation,
	            inputs=run_inputs,
	            outputs=run_outputs
	        )

	    return demo


	# In[ ]:


	# ===========================
	# Main Execution
	# ===========================

	if __name__ == "__main__":
	    # Script entry point: print a startup banner, report which API keys are
	    # configured, then build and launch the Gradio app.
	    print("=" * 60)
	    print("🚀 AI Chatbot with Cross-Model Evaluator")
	    print("=" * 60)

	    # Check API keys
	    google_key = os.getenv("GOOGLE_API_KEY")
	    groq_key = os.getenv("GROQ_API_KEY")

	    # Only presence is reported: echoing the leading characters of a secret
	    # key would put part of the credential into stdout and hosting logs.
	    if not google_key:
	        print("⚠️ Warning: GOOGLE_API_KEY not found")
	        print(" Set it with: export GOOGLE_API_KEY='your-key-here'")
	    else:
	        print("✅ Google API Key detected")

	    if not groq_key:
	        print("⚠️ Warning: GROQ_API_KEY not found")
	        print(" Set it with: export GROQ_API_KEY='your-key-here'")
	    else:
	        print("✅ Groq API Key detected")

	    print("=" * 60)
	    print("📝 Starting Gradio interface...")
	    print("📌 Interface will be available at: http://localhost:7860")
	    print("=" * 60)

	    # Create and launch interface
	    demo = create_interface()
	    demo.launch()


	# In[ ]: