# exambot / app.py — created by siba4442 (commit 15a2a84)
import re
import time
from typing import List, Dict, Any

import gradio as gr
from groq import Groq
from pydantic import BaseModel, Field
# Real Groq API call
def call_llm(model: str, api_key: str, question: str) -> str:
    """Send *question* to a single Groq-hosted model and return its reply.

    Any failure (bad key, network, unknown model) is reported as an
    "Error calling ..." string so the UI can display it inline instead
    of crashing the stream.
    """
    try:
        reply = (
            Groq(api_key=api_key)
            .chat.completions.create(
                model=model,
                messages=[{"role": "user", "content": question}],
                temperature=0.7,
                max_tokens=1024,
            )
            .choices[0]
            .message.content
        )
        return reply
    except Exception as e:
        # Fold the failure into the answer slot rather than raising.
        return f"Error calling {model}: {str(e)}"
def reevaluate_answer(model: str, api_key: str, answer: str) -> str:
    """Have *model* review its own earlier *answer* and return the revision.

    Uses a lower temperature (0.5) than the first pass for a more
    conservative edit. Failures come back as an error string, never raise.
    """
    prompt = f"Please review and improve this answer if needed: {answer}"
    try:
        api = Groq(api_key=api_key)
        result = api.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.5,
            max_tokens=1024,
        )
    except Exception as e:
        return f"Error re-evaluating with {model}: {str(e)}"
    return result.choices[0].message.content
def master_review(model: str, api_key: str, all_answers: List[Dict]) -> str:
    """Ask the master *model* to pick the best of *all_answers*.

    all_answers: list of {"model": str, "answer": str} dicts.
    Returns the 1-based index of the chosen answer as a digit string.
    Defaults to "1" on any API failure or when no number can be parsed.
    """
    try:
        client = Groq(api_key=api_key)
        answers_text = "\n\n".join(
            f"Answer {i + 1} from {ans['model']}: {ans['answer']}"
            for i, ans in enumerate(all_answers)
        )
        completion = client.chat.completions.create(
            model=model,
            messages=[
                {
                    "role": "user",
                    "content": "Review these answers and select the best one. "
                    "Only return the number (1, 2, 3, etc.) of the best answer:"
                    f"\n\n{answers_text}",
                }
            ],
            temperature=0.3,
            max_tokens=50,
        )
        reply = completion.choices[0].message.content.strip()
        # Models often answer with prose ("The best answer is 2."), which the
        # caller's int() cannot parse; pull out the first integer instead of
        # silently losing the vote.
        match = re.search(r"\d+", reply)
        return match.group(0) if match else "1"
    except Exception:
        return "1"  # Default to first answer if error
# Groq model IDs selectable in the UI; "openai/gpt-oss-120b" also serves as the master reviewer.
models_list = ["qwen/qwen3-32b", "openai/gpt-oss-120b", "openai/gpt-oss-20b", "moonshotai/kimi-k2-instruct", "llama-3.1-8b-instant", "meta-llama/llama-guard-4-12b", "groq/compound", "groq/compound-mini"]
def process_question(api_key, selected_models, question):
    """Generator that streams evaluation progress into the 13 output boxes.

    Slot layout (matches run_btn.click outputs): each of up to 4 models owns
    three consecutive slots — initial answer, re-evaluated answer, vote box —
    so slots 0-11 belong to the models and slot 12 is the final best answer.
    A fresh 13-tuple is yielded after every state change so Gradio can
    render progress in real time.
    """
    outputs = [""] * 13  # Initialize all outputs
    if not api_key or not selected_models:
        # Original code used `return <tuple>` here; inside a generator that
        # value is swallowed by StopIteration and the message never reaches
        # the UI. Yield it instead, then stop.
        outputs[0] = "Please provide API key and select models"
        yield tuple(outputs)
        return

    answers = []
    # Process each selected model (limit to 4 — the UI only has 4 columns).
    for model_idx, model in enumerate(selected_models[:4]):
        base_idx = model_idx * 3
        # Initial answer
        outputs[base_idx] = f"🤔 Getting answer from {model}..."
        yield tuple(outputs)
        initial_ans = call_llm(model, api_key, question)
        outputs[base_idx] = initial_ans
        yield tuple(outputs)
        # Re-evaluate
        outputs[base_idx + 1] = "🔄 Re-evaluating..."
        yield tuple(outputs)
        reevaluated = reevaluate_answer(model, api_key, initial_ans)
        outputs[base_idx + 1] = reevaluated
        answers.append({"model": model, "answer": reevaluated})
        yield tuple(outputs)

    # Master review - only use openai/gpt-oss-120b as master.
    master_model = "openai/gpt-oss-120b"
    # Slot 2 is model 1's visible "Master LLM Decision" box. The original
    # wrote to slot 3, which is model 2's initial-answer box, clobbering it.
    outputs[2] = f"🏆 Master {master_model} reviewing all answers..."
    yield tuple(outputs)
    choice = master_review(master_model, api_key, answers)
    try:
        choice_idx = int(choice) - 1
    except ValueError:
        choice_idx = 0  # unparseable vote -> default to first answer
    if not 0 <= choice_idx < len(answers):
        choice_idx = 0  # out-of-range vote -> default to first answer
    chosen_model = answers[choice_idx]["model"]
    chosen_answer = answers[choice_idx]["answer"]

    outputs[2] = f"Master chose: {chosen_model}"
    # Clear the unused (hidden) vote boxes. These are slots 5, 8 and 11;
    # the original cleared 6 and 9, wiping model 3's and 4's initial answers.
    outputs[5] = ""
    outputs[8] = ""
    outputs[11] = ""
    # Final answer
    outputs[12] = f"🏆 Master LLM ({master_model}) selected:\n\n{chosen_answer}"
    yield tuple(outputs)
# UI layout: one input row, four model columns (3 slots each), and a final-answer
# box — 13 components total, matching the 13-tuples yielded by process_question.
with gr.Blocks() as demo:
    gr.Markdown("""
# Multi-LLM Master Evaluator with Real-time Streaming
Enter your Groq API key, select LLM models, and ask a question.
Watch real-time processing as each LLM answers, re-evaluates, and then the master LLM (openai/gpt-oss-120b) selects the best answer.
""")
    with gr.Row():
        api_key = gr.Textbox(label="🔑 Groq API Key", type="password", placeholder="Enter your API key...")
        models = gr.CheckboxGroup(models_list, label="🤖 Select Models", value=["llama-3.1-8b-instant", "openai/gpt-oss-20b"])
    question = gr.Textbox(label="❓ Your Question", placeholder="Ask your question...")
    run_btn = gr.Button("🚀 Start Evaluation")
    # Model outputs (4 models max, 3 columns each)
    with gr.Row():
        with gr.Column():
            gr.Markdown("### Model 1")
            model1_initial = gr.Textbox(label="Initial Answer", lines=3, interactive=False)
            model1_reevaluated = gr.Textbox(label="Re-evaluated", lines=3, interactive=False)
            # The only visible vote box; the master's decision is shown here.
            model1_vote = gr.Textbox(label="Master LLM Decision", lines=1, interactive=False)
        with gr.Column():
            gr.Markdown("### Model 2")
            model2_initial = gr.Textbox(label="Initial Answer", lines=3, interactive=False)
            model2_reevaluated = gr.Textbox(label="Re-evaluated", lines=3, interactive=False)
            # Hidden placeholder so the output tuple keeps a fixed 3-per-model shape.
            model2_vote = gr.Textbox(label="(Master uses Model 1 slot)", lines=1, interactive=False, visible=False)
    with gr.Row():
        with gr.Column():
            gr.Markdown("### Model 3")
            model3_initial = gr.Textbox(label="Initial Answer", lines=3, interactive=False)
            model3_reevaluated = gr.Textbox(label="Re-evaluated", lines=3, interactive=False)
            model3_vote = gr.Textbox(label="(Master uses Model 1 slot)", lines=1, interactive=False, visible=False)
        with gr.Column():
            gr.Markdown("### Model 4")
            model4_initial = gr.Textbox(label="Initial Answer", lines=3, interactive=False)
            model4_reevaluated = gr.Textbox(label="Re-evaluated", lines=3, interactive=False)
            model4_vote = gr.Textbox(label="(Master uses Model 1 slot)", lines=1, interactive=False, visible=False)
    final_answer = gr.Textbox(label="🏆 Final Best Answer", lines=6, interactive=False)
    # Wire the streaming generator: each yielded 13-tuple maps onto these
    # components positionally, in the order listed below.
    run_btn.click(
        process_question,
        inputs=[api_key, models, question],
        outputs=[model1_initial, model1_reevaluated, model1_vote,
                 model2_initial, model2_reevaluated, model2_vote,
                 model3_initial, model3_reevaluated, model3_vote,
                 model4_initial, model4_reevaluated, model4_vote,
                 final_answer]
    )
demo.launch()