Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import openai | |
| import anthropic | |
| import google.generativeai as genai | |
| import threading | |
| import json | |
| import time | |
| import os | |
# --- API credentials (read from the environment, never hard-coded) ---
# Export the variables below in your shell, or load them from a .env file
# with a helper such as 'python-dotenv', before starting the app.
# A provider whose variable is unset ends up with the value None.
API_KEYS = dict(
    openai_api_key=os.environ.get("OPENAI_API_KEY"),
    anthropic_api_key=os.environ.get("ANTHROPIC_API_KEY"),
    deepseek_api_key=os.environ.get("DEEPSEEK_API_KEY"),
    google_api_key=os.environ.get("GOOGLE_API_KEY"),
    groq_api_key=os.environ.get("GROQ_API_KEY"),
    # Fixed placeholder used for the local Ollama server.
    ollama_api_key="ollama",
)
# --- Model & API Configuration ---
# One entry per competitor, in display order. Each entry carries:
#   name       - model identifier sent to the provider
#   api_client - which code path get_model_response uses
#                ("openai", "anthropic", "gemini", "openai_compatible", "ollama")
#   key_name   - which API_KEYS entry holds the credential
#   base_url   - endpoint override (only on "openai_compatible" / "ollama" entries)
COMPETITOR_MODELS = [
    dict(
        name="gpt-4o-mini",
        api_client="openai",
        key_name="openai_api_key",
    ),
    dict(
        name="claude-3-5-sonnet-20240620",
        api_client="anthropic",
        key_name="anthropic_api_key",
    ),
    dict(
        name="deepseek-chat",
        api_client="openai_compatible",
        base_url="https://api.deepseek.com/v1",
        key_name="deepseek_api_key",
    ),
    dict(
        name="llama3-8b-8192",
        api_client="openai_compatible",
        base_url="https://api.groq.com/openai/v1",
        key_name="groq_api_key",
    ),
    dict(
        # Requires the model to be available locally: 'ollama pull llama3'.
        name="llama3",
        api_client="ollama",
        base_url="http://localhost:11434/v1",
        key_name="ollama_api_key",
    ),
    dict(
        # Gemini goes through its own client rather than an OpenAI-style one.
        name="gemini-1.5-flash-latest",
        api_client="gemini",
        key_name="google_api_key",
    ),
]
# --- UI Configuration ---
# Heading colour for each competitor, matched by position with
# COMPETITOR_MODELS; lookups wrap around with a modulo when indexing.
MODEL_COLORS = [
    "#FF6347",
    "#D2691E",
    "#32CD32",
    "#FFD700",
    "#6A5ACD",
    "#00CED1",
]
# Model that ranks the competitors' answers (queried through the OpenAI client).
JUDGE_MODEL = "gpt-4o-mini"
# --- Helper Function to Query APIs ---
def get_model_response(model_config, api_keys, prompt, results_list):
    """Query one LLM and append its answer (or an error message) to a list.

    The function reports every failure as text instead of raising, so that
    when it is used as a thread target the caller always finds exactly one
    entry for the model in ``results_list``.

    Args:
        model_config: Dict describing the model. ``"name"`` is required;
            ``"api_client"`` selects the code path (``"openai"``,
            ``"anthropic"``, ``"gemini"``, ``"openai_compatible"`` or
            ``"ollama"``); ``"key_name"`` names the entry of ``api_keys`` to
            use; ``"base_url"`` is read for the ``"openai_compatible"`` and
            ``"ollama"`` clients.
        api_keys: Mapping from key name to API key (values may be ``None``).
        prompt: The user question, sent as a single user message.
        results_list: List that receives
            ``{"model": <name>, "response": <text>}``.

    Returns:
        None. The result is delivered through ``results_list``.
    """
    model_name = model_config["name"]
    # Reported when the client type is unknown or the model returns no text.
    response_content = f"Error: Model {model_name} did not respond."
    try:
        # Config lookups live inside the try so that a malformed entry is
        # reported as an error result instead of escaping from the function.
        api_client_type = model_config.get("api_client")
        key_name = model_config.get("key_name")
        api_key = api_keys.get(key_name) if key_name is not None else None

        if api_client_type == "ollama":
            # No real credential is needed for Ollama, but the OpenAI client
            # still requires some api_key value, so fall back to a placeholder.
            api_key = api_key or "ollama"
        elif key_name is None:
            raise ValueError("Model configuration has no 'key_name'.")
        elif not api_key:
            raise ValueError(f"API key '{key_name}' is missing.")

        messages = [{"role": "user", "content": prompt}]
        content = None
        if api_client_type in ("openai", "openai_compatible", "ollama"):
            # Only the non-native backends are pointed at a custom endpoint.
            base_url = None if api_client_type == "openai" else model_config.get("base_url")
            client = openai.OpenAI(api_key=api_key, base_url=base_url)
            response = client.chat.completions.create(model=model_name, messages=messages)
            content = response.choices[0].message.content
        elif api_client_type == "anthropic":
            client = anthropic.Anthropic(api_key=api_key)
            response = client.messages.create(model=model_name, max_tokens=4096, messages=messages)
            content = response.content[0].text
        elif api_client_type == "gemini":
            # configure() sets the key on the google.generativeai module itself.
            genai.configure(api_key=api_key)
            model = genai.GenerativeModel(model_name)
            response = model.generate_content(prompt)
            content = response.text

        # Keep the default "did not respond" text when no content came back.
        if content is not None:
            response_content = content
    except Exception as e:
        # Boundary handler: any provider/network/config failure becomes text.
        response_content = f"Error for {model_name}: {str(e)}"
    results_list.append({"model": model_name, "response": response_content})
# --- Main Logic for the Arena (as a Generator) ---
def _parse_winner_index(results_json, num_competitors):
    """Return the zero-based index of the judge's top-ranked competitor.

    Args:
        results_json: JSON text from the judge, expected to look like
            ``{"results": ["2", "1", ...]}`` with 1-based competitor numbers
            given as strings or integers.
        num_competitors: How many competitors were shown to the judge.

    Returns:
        The winner's zero-based index, or ``None`` when the JSON contains no
        ranking at all.

    Raises:
        ValueError: If the text is not valid JSON, or the top entry is not a
            whole number between 1 and ``num_competitors``.
    """
    parsed = json.loads(results_json)
    ranking = parsed.get("results") if isinstance(parsed, dict) else None
    if not isinstance(ranking, list) or not ranking:
        return None
    top_entry = str(ranking[0]).strip()
    try:
        competitor_number = int(top_entry)
    except ValueError:
        raise ValueError(
            f"Judge returned a non-numeric competitor number: {top_entry!r}"
        ) from None
    # Without this check a "0" would become index -1 and silently select the
    # last model, and larger values would surface as a bare IndexError.
    if not 1 <= competitor_number <= num_competitors:
        raise ValueError(
            f"Judge returned competitor number {competitor_number}, "
            f"expected a value between 1 and {num_competitors}."
        )
    return competitor_number - 1


def run_competition(question, progress=gr.Progress(track_tqdm=True)):
    """Run the arena for one question, yielding UI updates at each stage.

    Args:
        question: Text of the question to send to every competitor model.
        progress: Gradio progress tracker; called with a fraction and a
            description at each stage.

    Yields:
        Lists of ``2 + len(COMPETITOR_MODELS)`` values: the run button state,
        the best-answer markdown, then one answer text per competitor in
        ``COMPETITOR_MODELS`` order.
    """
    num_models = len(COMPETITOR_MODELS)
    placeholder = "The winning answer will be displayed here..."

    # Nothing to ask: reset the UI right away, without first showing the
    # "running" state or spending API calls on a blank question.
    if not question or not question.strip():
        yield [gr.Button("Run Competition", interactive=True)] + [""] * (1 + num_models)
        return

    # Stage 1: Initial UI State
    running_button = gr.Button("⚔️ Running Competition...", interactive=False)
    yield [running_button, placeholder] + ["⏳ Thinking..."] * num_models

    # Stage 2: Get Competitor Responses Concurrently
    progress(0, desc="Querying Competitor Models...")
    competitor_responses = []
    threads = [
        threading.Thread(
            target=get_model_response,
            args=(model_config, API_KEYS, question, competitor_responses),
        )
        for model_config in COMPETITOR_MODELS
    ]
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()

    # Stage 3: Update UI with Competitor Responses
    progress(0.7, desc="All models responded. Awaiting judgment...")
    response_dict = {r['model']: r['response'] for r in competitor_responses}
    # Re-order by configuration so that position i always belongs to model i,
    # regardless of which thread finished first.
    answers = [
        response_dict.get(cfg['name'], f"Error: {cfg['name']} response not found.")
        for cfg in COMPETITOR_MODELS
    ]
    text_outputs = [placeholder] + answers
    responses_text_for_judge = "".join(
        f"# Response from competitor {number} ({cfg['name']})\n\n{answer}\n\n"
        for number, (cfg, answer) in enumerate(zip(COMPETITOR_MODELS, answers), start=1)
    )
    yield [gr.Button("⚖️ Judging...", interactive=False)] + text_outputs
    # NOTE(review): presumably a short pause so the intermediate state is
    # visible before the judge call starts - confirm it is still wanted.
    time.sleep(1)

    # Stage 4: Get the Judge's Ranking
    # The count uses the configured models because those are what is numbered
    # in the responses section of the prompt.
    judge_prompt = f"""You are a fair and impartial judge in a competition between {num_models} LLM assistants.
Each model was given this question:
---
{question}
---
Your task is to evaluate each response for clarity, accuracy, and depth of reasoning. Then, you must rank them in order from best to worst.
You must respond with JSON, and only JSON, with the following format:
{{"results": ["best competitor number", "second best competitor number", ...]}}
Here are the responses from each competitor:
---
{responses_text_for_judge}
---
Now, provide your judgment as a JSON object with the ranked order of the competitors. Do not include any other text, markdown formatting, or code blocks."""
    best_answer_text = "Error: Judge failed to provide a valid ranking."
    try:
        # The judge always goes through the OpenAI client.
        judge_key = API_KEYS.get("openai_api_key")
        if not judge_key:
            raise ValueError("OpenAI API key is missing for the judge model.")
        judge_client = openai.OpenAI(api_key=judge_key)
        response = judge_client.chat.completions.create(
            model=JUDGE_MODEL,
            messages=[{"role": "user", "content": judge_prompt}],
            response_format={"type": "json_object"},
        )
        winner_index = _parse_winner_index(response.choices[0].message.content, num_models)
        if winner_index is not None:
            best_model_name = COMPETITOR_MODELS[winner_index]['name']
            best_model_color = MODEL_COLORS[winner_index % len(MODEL_COLORS)]
            best_answer = answers[winner_index]
            best_answer_text = f"## 🏆 Best Answer (from <span style='color:{best_model_color}; font-weight:bold;'>{best_model_name}</span>)\n\n"
            best_answer_text += best_answer
    except Exception as e:
        # UI boundary: show any judging failure in the result panel.
        best_answer_text = f"## Error\n\nAn error occurred during judgment: {str(e)}"

    # Stage 5: Final UI Update
    progress(1, desc="Competition Complete!")
    text_outputs[0] = best_answer_text
    yield [gr.Button("Run Competition", interactive=True)] + text_outputs
# --- Gradio User Interface ---
# Layout: question input and run button on the left, winning answer on the
# right, then a grid of per-model answer boxes (three per row).
with gr.Blocks(theme=gr.themes.Soft(primary_hue="orange", secondary_hue="blue")) as demo:
    gr.Markdown("# Advanced Multi-Model LLM Arena")
    with gr.Row():
        with gr.Column(scale=1):
            question_box = gr.Textbox(
                label="Enter Your Question Here",
                lines=6,
                placeholder="e.g., Explain the concept of emergent properties in complex systems and provide three distinct examples."
            )
            run_button = gr.Button("Run Competition", variant="primary")
            # No progress widget is created here: gr.Progress is not a layout
            # component. Progress reporting comes from the `progress`
            # parameter in run_competition's signature.
        with gr.Column(scale=2):
            best_answer_box = gr.Markdown("The winning answer will be displayed here...")
    gr.Markdown("---")
    gr.Markdown("### Competitor Responses")

    # One read-only textbox per competitor, in COMPETITOR_MODELS order, so the
    # list lines up with the values yielded by run_competition.
    response_boxes = []
    for row_start in range(0, len(COMPETITOR_MODELS), 3):
        with gr.Row():
            for model_index in range(row_start, min(row_start + 3, len(COMPETITOR_MODELS))):
                with gr.Column():
                    model_name = COMPETITOR_MODELS[model_index]['name']
                    color = MODEL_COLORS[model_index % len(MODEL_COLORS)]
                    gr.Markdown(f"<h3 style='color:{color}; margin-bottom: -10px; text-align:center;'>{model_name}</h3>")
                    box = gr.Textbox(lines=10, interactive=False, container=False)
                    response_boxes.append(box)

    # Output order must match what run_competition yields:
    # button state, best answer, then each competitor's answer.
    all_outputs = [run_button, best_answer_box] + response_boxes
    run_button.click(
        fn=run_competition,
        inputs=[question_box],
        outputs=all_outputs
    )

if __name__ == "__main__":
    demo.launch(debug=True)