Build an evaluator model to assess the output of the chat model:
1. Be able to ask an LLM to evaluate an answer
2. Be able to rerun the request if the answer fails the evaluation
3. Be able to incorporate this into a workflow

In [4]:
from dotenv import load_dotenv
import os
from pypdf import PdfReader
import google.generativeai as genai
import gradio as gr
from pydantic import BaseModel
import json
# Load variables from a local .env file into the process environment;
# override=True asks python-dotenv to replace values that are already set.
load_dotenv(override=True)
# Configure the Gemini client with the key stored under GEMINI_API
# (os.getenv returns None when that variable is not defined).
genai.configure(api_key=os.getenv("GEMINI_API"))

In [2]:
# Read the LinkedIn PDF export and the plain-text summary that ground both prompts.
reader = PdfReader("../Week_1/Data_w1/linkedin.pdf")
# Join once instead of growing a string in a loop; `or ""` guards against a
# page for which no text could be extracted.
linkedin = "".join(page.extract_text() or "" for page in reader.pages)

# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default locale encoding (assumes the summary file is UTF-8 - confirm).
with open("../Week_1/Data_w1/summary.txt", "r", encoding="utf-8") as f:
    summary = f.read()

In [3]:
# System prompt for the chat model: persona instructions, followed by the
# summary and LinkedIn text as grounding context, then a stay-in-character reminder.
initial_system_prompt = (
    "You are acting as Ed Donner. You are answering questions on Ed Donner's website, "
    "particularly questions related to Ed Donner's career, background, skills and experience. "
    "Your responsibility is to represent Ed Donner for interactions on the website as faithfully as possible. "
    "You are given a summary of Ed Donner's background and LinkedIn profile which you can use to answer questions. "
    "Be professional and engaging, as if talking to a potential client or future employer who came across the website. "
    "If you don't know the answer, say so."
    f"\n\n## Summary:\n{summary}\n\n## LinkedIn Profile:\n{linkedin}\n\n"
    "With this context, please chat with the user, always staying in character as Ed Donner."
)

# Module-level placeholder; chat() creates its own local session of the same name.
chat_session = None

In [5]:
# System prompt for the evaluator model: describes the judging task, supplies
# the same grounding context the Agent received, and pins the reply format.
evaluator_system_prompt = (
    "You are an evaluator that decides whether a response to a question is acceptable. "
    "You are provided with a conversation between a User and an Agent. Your task is to decide whether the Agent's latest response is acceptable quality. "
    "The Agent is playing the role of Ed Donner and is representing Ed Donner on their website. "
    "The Agent has been instructed to be professional and engaging, as if talking to a potential client or future employer who came across the website. "
    "The Agent has been provided with context on Ed Donner in the form of their summary and LinkedIn details. Here's the information:"
)

evaluator_system_prompt += f"\n\n## Summary:\n{summary}\n\n## LinkedIn Profile:\n{linkedin}\n\n"
evaluator_system_prompt += (
    "With this context, please evaluate the latest response, replying with whether the response is acceptable and your feedback. "
    # The reply is parsed with json.loads and read through the keys
    # "is_acceptable" and "response", so the model has to be told to emit them.
    'Reply with only a JSON object of the form {"is_acceptable": true or false, "response": "<your feedback>"}, '
    "with no Markdown code fences and no text outside the JSON object."
)

def evaluator_user_prompt(reply, message, history):
    """Assemble the user-turn prompt that is sent to the evaluator model.

    Args:
        reply: The Agent's latest response, i.e. the text being judged.
        message: The latest message from the User.
        history: The earlier conversation; it is interpolated via its string form.

    Returns:
        One string containing the conversation, the latest user message, the
        latest agent response and a closing request to evaluate, in that order.
    """
    sections = (
        f"Here's the conversation between the User and the Agent: \n\n{history}\n\n",
        f"Here's the latest message from the User: \n\n{message}\n\n",
        f"Here's the latest response from the Agent: \n\n{reply}\n\n",
        "Please evaluate the response, replying with whether it is acceptable and your feedback.",
    )
    return "".join(sections)

In [6]:
class Evaluation(BaseModel):
    """Verdict of the evaluator model for a single Agent reply."""

    # True when the reply passed evaluation (chat() then returns the reply unchanged).
    is_acceptable: bool
    # Feedback text from the evaluator, or an error note when evaluation itself failed.
    response: str



In [7]:
# Evaluator model: a separate Gemini instance whose system instruction is the
# evaluator prompt, so it judges replies instead of producing them.
model_evaluator = genai.GenerativeModel(
    model_name='gemini-2.0-flash-exp',
    system_instruction=evaluator_system_prompt,
)

In [8]:
def evaluate_response(reply, message, history) -> Evaluation:
    """Ask the evaluator model whether the Agent's latest reply is acceptable.

    The evaluator's output is parsed as a JSON object with the keys
    "is_acceptable" and "response". If it is not a JSON object, the verdict is
    inferred from the free text instead.

    Args:
        reply: The Agent's latest response to judge.
        message: The latest message from the User.
        history: The conversation so far, passed through to the prompt builder.

    Returns:
        An Evaluation. This function never raises: on any error it fails open
        and returns is_acceptable=True with the error text in `response`, so an
        evaluator outage does not block the chat.
    """
    try:
        eval_prompt = evaluator_user_prompt(reply, message, history)
        raw_text = model_evaluator.generate_content(eval_prompt).text

        # The model may wrap its JSON in a Markdown code fence (```json ... ```);
        # unwrap it so json.loads sees only the object.
        candidate = raw_text.strip()
        if candidate.startswith("```"):
            candidate = candidate.strip("`").strip()
            if candidate[:4].lower() == "json":
                candidate = candidate[4:].strip()

        try:
            eval_data = json.loads(candidate)
        except json.JSONDecodeError:
            eval_data = None

        # Only a JSON object carries the expected keys; a bare scalar or list
        # is handled by the free-text fallback below.
        if isinstance(eval_data, dict):
            return Evaluation(
                is_acceptable=eval_data.get("is_acceptable", True),
                response=eval_data.get("response", "No response provided."),
            )

        # Free-text fallback. Negative phrasings are checked first because
        # "unacceptable" and "not acceptable" both contain "acceptable" and
        # would otherwise be counted as a pass.
        lowered = raw_text.lower()
        rejection_markers = (
            "unacceptable",
            "not acceptable",
            "not an acceptable",
            "isn't acceptable",
            'is_acceptable": false',
            'is_acceptable":false',
            "is_acceptable: false",
        )
        rejected = any(marker in lowered for marker in rejection_markers)
        approved = "true" in lowered or "acceptable" in lowered
        return Evaluation(
            is_acceptable=approved and not rejected,
            response=raw_text,
        )
    except Exception as e:
        # Deliberate best-effort: API errors, blocked responses and validation
        # errors all end here and are reported as an accepted evaluation.
        return Evaluation(
            is_acceptable=True,
            response=f"Evaluation failed: {str(e)}"
        )

In [13]:
# Create the main chat
def _to_gemini_history(history):
    """Convert Gradio 'messages'-style history into the role/parts dicts given to start_chat."""
    role_map = {"user": "user", "assistant": "model"}  # Gemini uses "model" instead of "assistant"
    return [
        {"role": role_map[msg["role"]], "parts": [msg["content"]]}
        for msg in history
        if msg["role"] in role_map  # entries with any other role are skipped
    ]


def chat(message, history, system_prompt=initial_system_prompt):
    """Answer a user message, retrying when the evaluator rejects the reply.

    Args:
        message: The user's latest message.
        history: Earlier turns as a list of dicts with "role" and "content" keys.
        system_prompt: System instruction for the chat model; an empty value is
            passed on as None, i.e. no system instruction.

    Returns:
        The first reply that passes evaluation. If the last attempt is still
        rejected, that reply is returned with the evaluator's feedback appended
        as a note. If the last attempt raises, an error string is returned.
    """
    max_tries = 3
    model = genai.GenerativeModel(
        'gemini-2.0-flash',
        # An empty string (e.g. a cleared textbox) is replaced by None so that
        # no empty system instruction reaches the model constructor.
        system_instruction=system_prompt or None
    )
    gemini_history = _to_gemini_history(history)

    # `message` stays the user's original text; only `prompt` is rewritten on
    # retries, so feedback from earlier attempts does not pile up and the
    # evaluator always judges against what the user actually asked.
    prompt = message
    for try_count in range(max_tries):
        is_last_try = try_count == max_tries - 1
        try:
            # Fresh session per attempt, so a rejected or failed attempt is
            # never part of the context of the next one.
            chat_session = model.start_chat(history=gemini_history)
            response = chat_session.send_message(prompt).text

            evaluation = evaluate_response(response, message, history)
            if evaluation.is_acceptable:
                print("Passed evaluation - returning reply")
                return response

            print("Failed evaluation - retrying")
            if is_last_try:
                return f"{response}\n\n*[Note: Response may need improvement - {evaluation.response}]*"
            prompt = f"{message}\n\nPlease provide a better response. Previous attempt had issues: {evaluation.response}"
        except Exception as e:
            if is_last_try:
                return f"Error: {str(e)} after {max_tries} tries"
            # Make the swallowed error visible instead of retrying silently.
            print(f"Attempt {try_count + 1} failed: {e}")
    # Safety net; every path of the final iteration above already returns.
    return "Failed to generate acceptable response after maximum retries."


In [15]:
# Create interface with additional inputs:
# a Blocks layout holding a heading, an editable system-prompt textbox and the chat UI.
with gr.Blocks() as demo:
    gr.Markdown("# Chat with Google Gemini")

    # Pre-filled with the default persona prompt; its current value is handed
    # to chat() through additional_inputs below.
    system_prompt = gr.Textbox(
        value=initial_system_prompt,
        label="System Prompt",
        placeholder="Enter system instructions for the AI...",
        lines=2
    )

    # type='messages' matches the history format chat() reads: dicts accessed
    # through their "role" and "content" keys.
    chat_interface = gr.ChatInterface(
        fn=chat,
        additional_inputs=[system_prompt],
        title="",
        cache_examples=False,
        type='messages'

    )

In [None]:
# Launch the interface: starts the Gradio server for `demo`
# (the cell output reports the local URL it is served on).
demo.launch()

* Running on local URL:  http://127.0.0.1:7862
* To create a public link, set `share=True` in `launch()`.




Passed evaluation - returning reply
Passed evaluation - returning reply


In [17]:
# Shut down the Gradio server that demo.launch() started.
demo.close()

Closing server running on port: 7862
