| import gradio as gr |
| from pydantic import BaseModel, Field |
| from typing import List, Dict, Any |
| from groq import Groq |
| import time |
|
|
| |
def call_llm(model: str, api_key: str, question: str) -> str:
    """Send *question* to *model* through the Groq chat API and return the reply.

    Any failure (bad key, unreachable API, unknown model) is converted into an
    error string rather than raised, so the UI can display it directly.
    """
    try:
        chat_request = {
            "model": model,
            "messages": [{"role": "user", "content": question}],
            "temperature": 0.7,
            "max_tokens": 1024,
        }
        reply = Groq(api_key=api_key).chat.completions.create(**chat_request)
        return reply.choices[0].message.content
    except Exception as exc:
        return f"Error calling {model}: {str(exc)}"
|
|
def reevaluate_answer(model: str, api_key: str, answer: str) -> str:
    """Ask *model* to review and, if needed, improve *answer*; return the revision.

    Mirrors call_llm's contract: errors come back as display strings, never
    as exceptions.
    """
    try:
        review_prompt = f"Please review and improve this answer if needed: {answer}"
        completion = Groq(api_key=api_key).chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": review_prompt}],
            temperature=0.5,
            max_tokens=1024,
        )
        return completion.choices[0].message.content
    except Exception as exc:
        return f"Error re-evaluating with {model}: {str(exc)}"
|
|
def master_review(model: str, api_key: str, all_answers: List[Dict]) -> str:
    """Ask *model* to pick the best entry in *all_answers*; return its 1-based
    index as a string.

    Each entry of *all_answers* is a dict with "model" and "answer" keys.
    Any failure falls back to "1" so the caller always receives a choice.
    """
    try:
        numbered = []
        for position, entry in enumerate(all_answers, start=1):
            numbered.append(f"Answer {position} from {entry['model']}: {entry['answer']}")
        answers_text = "\n\n".join(numbered)

        completion = Groq(api_key=api_key).chat.completions.create(
            model=model,
            messages=[
                {"role": "user", "content": f"Review these answers and select the best one. Only return the number (1, 2, 3, etc.) of the best answer:\n\n{answers_text}"}
            ],
            temperature=0.3,
            max_tokens=50,
        )
        return completion.choices[0].message.content.strip()
    except Exception:
        # Deliberate best-effort fallback: default to the first answer.
        return "1"
|
|
| models_list = ["qwen/qwen3-32b", "openai/gpt-oss-120b", "openai/gpt-oss-20b", "moonshotai/kimi-k2-instruct", "llama-3.1-8b-instant", "meta-llama/llama-guard-4-12b", "groq/compound", "groq/compound-mini"] |
|
|
def process_question(api_key, selected_models, question):
    """Stream the multi-LLM evaluation pipeline to the Gradio UI.

    Yields 13-tuples matching the run_btn outputs: indices 0-11 are
    (initial, re-evaluated, vote) triples for models 1-4; index 12 is the
    final best answer. Only slot 2 (model1_vote, the visible "Master LLM
    Decision" box) carries the master verdict; slots 5/8/11 are hidden
    placeholders and stay empty.
    """
    if not api_key or not selected_models:
        # Bug fix: inside a generator, `return <value>` is swallowed
        # (it becomes the StopIteration value), so the error message never
        # reached the UI. It must be yielded.
        yield ("Please provide API key and select models",) + ("",) * 12
        return

    outputs = [""] * 13
    answers = []

    # The UI grid holds at most 4 models; each occupies a 3-slot triple.
    for model_idx, model in enumerate(selected_models[:4]):
        base_idx = model_idx * 3

        outputs[base_idx] = f"🤖 Getting answer from {model}..."
        yield tuple(outputs)

        initial_ans = call_llm(model, api_key, question)
        outputs[base_idx] = initial_ans
        yield tuple(outputs)

        outputs[base_idx + 1] = "🔄 Re-evaluating..."
        yield tuple(outputs)

        reevaluated = reevaluate_answer(model, api_key, initial_ans)
        outputs[base_idx + 1] = reevaluated
        answers.append({"model": model, "answer": reevaluated})
        yield tuple(outputs)

    # Bug fix: the master verdict belongs in slot 2 (model1_vote). The
    # previous code wrote it to slot 3, which is model2_initial in the UI
    # wiring, clobbering model 2's answer.
    master_model = "openai/gpt-oss-120b"
    outputs[2] = f"👑 Master {master_model} reviewing all answers..."
    yield tuple(outputs)

    choice = master_review(master_model, api_key, answers)
    try:
        choice_idx = int(choice) - 1
    except ValueError:
        # Master returned something that isn't a bare number; default to 1.
        choice_idx = 0
    if not 0 <= choice_idx < len(answers):
        choice_idx = 0
    chosen_model = answers[choice_idx]["model"]
    chosen_answer = answers[choice_idx]["answer"]

    outputs[2] = f"Master chose: {chosen_model}"

    # Bug fix: the previous version blanked slots 6 and 9 here — those are
    # model3_initial and model4_initial, so real answers were erased. The
    # hidden vote slots (5/8/11) are already empty; nothing needs clearing.
    outputs[12] = f"🏆 Master LLM ({master_model}) selected:\n\n{chosen_answer}"

    yield tuple(outputs)
|
|
# --- Gradio UI ------------------------------------------------------------
# Layout: a 2x2 grid of model columns, each with (initial answer,
# re-evaluated answer, vote) textboxes, plus a final-answer box. The order
# of the `outputs=` list below MUST match the 13-tuple yielded by
# process_question: indices 0-11 are the four (initial, reevaluated, vote)
# triples, index 12 is the final answer. Only model1_vote is visible; the
# other vote boxes are hidden placeholders kept solely to pad the tuple.
with gr.Blocks() as demo:
    gr.Markdown("""
    # Multi-LLM Master Evaluator with Real-time Streaming
    Enter your Groq API key, select LLM models, and ask a question.
    Watch real-time processing as each LLM answers, re-evaluates, and then the master LLM (openai/gpt-oss-120b) selects the best answer.
    """)

    with gr.Row():
        # API key is masked; models come from the module-level models_list.
        api_key = gr.Textbox(label="🔑 Groq API Key", type="password", placeholder="Enter your API key...")
        models = gr.CheckboxGroup(models_list, label="🤖 Select Models", value=["llama-3.1-8b-instant", "openai/gpt-oss-20b"])

    question = gr.Textbox(label="❓ Your Question", placeholder="Ask your question...")
    run_btn = gr.Button("🚀 Start Evaluation")

    with gr.Row():
        with gr.Column():
            gr.Markdown("### Model 1")
            model1_initial = gr.Textbox(label="Initial Answer", lines=3, interactive=False)
            model1_reevaluated = gr.Textbox(label="Re-evaluated", lines=3, interactive=False)
            # Visible slot for the master LLM's verdict (tuple index 2).
            model1_vote = gr.Textbox(label="Master LLM Decision", lines=1, interactive=False)

        with gr.Column():
            gr.Markdown("### Model 2")
            model2_initial = gr.Textbox(label="Initial Answer", lines=3, interactive=False)
            model2_reevaluated = gr.Textbox(label="Re-evaluated", lines=3, interactive=False)
            # Hidden placeholder — pads the output tuple only.
            model2_vote = gr.Textbox(label="(Master uses Model 1 slot)", lines=1, interactive=False, visible=False)

    with gr.Row():
        with gr.Column():
            gr.Markdown("### Model 3")
            model3_initial = gr.Textbox(label="Initial Answer", lines=3, interactive=False)
            model3_reevaluated = gr.Textbox(label="Re-evaluated", lines=3, interactive=False)
            # Hidden placeholder — pads the output tuple only.
            model3_vote = gr.Textbox(label="(Master uses Model 1 slot)", lines=1, interactive=False, visible=False)

        with gr.Column():
            gr.Markdown("### Model 4")
            model4_initial = gr.Textbox(label="Initial Answer", lines=3, interactive=False)
            model4_reevaluated = gr.Textbox(label="Re-evaluated", lines=3, interactive=False)
            # Hidden placeholder — pads the output tuple only.
            model4_vote = gr.Textbox(label="(Master uses Model 1 slot)", lines=1, interactive=False, visible=False)

    final_answer = gr.Textbox(label="🏆 Final Best Answer", lines=6, interactive=False)

    # process_question is a generator, so each yield streams a UI update.
    run_btn.click(
        process_question,
        inputs=[api_key, models, question],
        outputs=[model1_initial, model1_reevaluated, model1_vote,
                 model2_initial, model2_reevaluated, model2_vote,
                 model3_initial, model3_reevaluated, model3_vote,
                 model4_initial, model4_reevaluated, model4_vote,
                 final_answer]
    )


demo.launch()
|
|