Spaces:
Sleeping
Sleeping
| """ | |
| AI Model Arena - Compare AI model outputs side by side. | |
| Test prompts across multiple models and vote for the best response. | |
| """ | |
| import gradio as gr | |
| from huggingface_hub import InferenceClient | |
| import random | |
| import time | |
# ---------------------------------------------------------------------------
# Model Configurations
# ---------------------------------------------------------------------------
# Registry of contenders. Keys are the display names shown in the Model A/B
# dropdowns (insertion order = dropdown order); each value holds the
# HuggingFace Hub repo id plus short blurbs rendered in the model info cards.
MODELS = {
    "Mistral-7B": {
        "id": "mistralai/Mistral-7B-Instruct-v0.3",
        "description": "Fast, efficient 7B parameter model from Mistral AI",
        "strengths": "Speed, reasoning, code",
    },
    "Llama-3.1-8B": {
        "id": "meta-llama/Llama-3.1-8B-Instruct",
        "description": "Meta's latest open LLM with strong capabilities",
        "strengths": "General knowledge, instruction following",
    },
    "Qwen2.5-7B": {
        "id": "Qwen/Qwen2.5-7B-Instruct",
        "description": "Alibaba's powerful multilingual model",
        "strengths": "Multilingual, math, coding",
    },
    "Phi-3-mini": {
        "id": "microsoft/Phi-3-mini-4k-instruct",
        "description": "Microsoft's compact but capable model",
        "strengths": "Efficiency, reasoning, small size",
    },
    "Gemma-2-9B": {
        "id": "google/gemma-2-9b-it",
        "description": "Google's instruction-tuned Gemma model",
        "strengths": "Quality, safety, general tasks",
    },
    "Zephyr-7B": {
        "id": "HuggingFaceH4/zephyr-7b-beta",
        "description": "Fine-tuned Mistral with DPO alignment",
        "strengths": "Helpfulness, alignment, chat",
    },
}
# Example prompts grouped by task type. "Get Example" draws a random prompt
# from the selected category; "Random Battle" draws from a random category.
CATEGORIES = {
    "Creative Writing": [
        "Write a haiku about artificial intelligence",
        "Create a short story opening about a robot discovering emotions",
        "Write a limerick about machine learning",
        "Compose a brief poem about the future of technology",
    ],
    "Coding": [
        "Write a Python function to check if a number is prime",
        "Create a JavaScript function to reverse a string",
        "Write a SQL query to find duplicate emails in a users table",
        "Implement a simple stack data structure in Python",
    ],
    "Reasoning": [
        "If all roses are flowers and some flowers fade quickly, can we conclude that some roses fade quickly?",
        "A bat and ball cost $1.10 total. The bat costs $1 more than the ball. How much does the ball cost?",
        "What comes next in the sequence: 2, 6, 12, 20, 30, ?",
        "If it takes 5 machines 5 minutes to make 5 widgets, how long would it take 100 machines to make 100 widgets?",
    ],
    "Knowledge": [
        "Explain quantum entanglement in simple terms",
        "What are the main differences between TCP and UDP?",
        "Briefly explain how transformers work in machine learning",
        "What is the difference between machine learning and deep learning?",
    ],
    "Summarization": [
        "Summarize the concept of blockchain technology in 2-3 sentences",
        "Explain the main idea behind reinforcement learning briefly",
        "Summarize what makes Python popular for data science",
        "Briefly explain the concept of transfer learning",
    ],
}
# ---------------------------------------------------------------------------
# State Management
# ---------------------------------------------------------------------------
# In-memory win/battle tally, one fresh counter dict per model (a dict
# comprehension, not dict.fromkeys, so the counters are NOT shared).
# Lives only for the process lifetime — it resets when the Space restarts.
vote_counts = {model_name: {"wins": 0, "battles": 0} for model_name in MODELS}
# ---------------------------------------------------------------------------
# Core Functions
# ---------------------------------------------------------------------------
def get_model_response(model_id: str, prompt: str, max_tokens: int = 500) -> tuple:
    """Send *prompt* to a hosted model and time the round trip.

    Args:
        model_id: HuggingFace Hub repo id of the model to query.
        prompt: User prompt, sent as a single chat message.
        max_tokens: Upper bound on generated tokens.

    Returns:
        A ``(content, elapsed_seconds, error)`` tuple. On success ``error``
        is ``None``; on failure ``content`` is ``None`` and ``error`` holds
        the exception text so the UI can display it.
    """
    client = InferenceClient(model_id)
    # perf_counter is monotonic — unlike time.time(), it cannot jump if the
    # wall clock is adjusted mid-request, so the reported latency is reliable.
    start_time = time.perf_counter()
    try:
        messages = [{"role": "user", "content": prompt}]
        response = client.chat_completion(
            messages=messages,
            max_tokens=max_tokens,
            temperature=0.7,
        )
        content = response.choices[0].message.content
    except Exception as e:  # broad on purpose: any API/network failure goes to the UI
        return None, time.perf_counter() - start_time, str(e)
    return content, time.perf_counter() - start_time, None
def battle(prompt: str, model1_name: str, model2_name: str) -> tuple:
    """Run the same prompt through two models and format both answers.

    Returns a 5-tuple matching the Gradio outputs: the two rendered
    responses, the two model info cards, and the prompt (stored in hidden
    state so voting can be gated on a completed battle).
    """
    if not prompt.strip():
        # Nothing to battle over; leave the stored prompt empty so votes stay blocked.
        return "Please enter a prompt.", "", "", "", ""

    def render(text, seconds, error):
        # Markdown pane body: either the error, or the answer plus a latency footer.
        if error:
            return f"**Error:** {error}"
        return f"{text}\n\n---\n*Response time: {seconds:.2f}s*"

    def card(name):
        # Short description card shown above each response pane.
        meta = MODELS[name]
        return f"**{name}**\n{meta['description']}\n*Strengths: {meta['strengths']}*"

    output1 = render(*get_model_response(MODELS[model1_name]["id"], prompt))
    output2 = render(*get_model_response(MODELS[model2_name]["id"], prompt))
    return output1, output2, card(model1_name), card(model2_name), prompt
def vote_model(winner: str, loser: str, prompt: str) -> str:
    """Tally one battle: +1 win for *winner*, +1 battle played for both sides.

    *prompt* is the hidden-state prompt of the last battle; an empty value
    means no battle has run yet, so the vote is refused.
    """
    if not prompt:
        return "Run a battle first before voting!"
    win_stats = vote_counts[winner]  # alias: mutations land in the shared tally
    win_stats["wins"] += 1
    win_stats["battles"] += 1
    vote_counts[loser]["battles"] += 1
    return f"Voted for **{winner}**! Total wins: {win_stats['wins']}/{win_stats['battles']}"
def get_leaderboard() -> str:
    """Render the current standings from ``vote_counts`` as a markdown table."""

    def win_rate(stats):
        # Guard against division by zero for models that never battled.
        return stats["wins"] / stats["battles"] * 100 if stats["battles"] else 0

    rows = [
        (name, stats["wins"], stats["battles"], win_rate(stats))
        for name, stats in vote_counts.items()
    ]
    # Best win rate first; total wins break ties (stable sort keeps dict order otherwise).
    rows.sort(key=lambda row: (row[3], row[1]), reverse=True)

    lines = [
        "## Leaderboard",
        "",
        "| Rank | Model | Wins | Battles | Win Rate |",
        "|------|-------|------|---------|----------|",
    ]
    for rank, (name, wins, battles, rate) in enumerate(rows, 1):
        if battles > 0:
            lines.append(f"| {rank} | {name} | {wins} | {battles} | {rate:.1f}% |")
        else:
            # No battles yet: show a dash instead of a meaningless 0.0%.
            lines.append(f"| {rank} | {name} | 0 | 0 | - |")
    lines.append("")
    lines.append("*Leaderboard resets when the Space restarts*")
    return "\n".join(lines)
def random_battle() -> tuple:
    """Pick two distinct models and a random example prompt for the dropdowns."""
    names = list(MODELS)
    first = random.choice(names)
    # Exclude the first pick so the battle is never a model against itself.
    second = random.choice([n for n in names if n != first])
    category = random.choice(list(CATEGORIES))
    return first, second, random.choice(CATEGORIES[category])
def get_example_prompt(category: str) -> str:
    """Return a random sample prompt for *category*, or "" if it is unknown."""
    prompts = CATEGORIES.get(category)
    return random.choice(prompts) if prompts else ""
# ---------------------------------------------------------------------------
# Gradio Interface
# ---------------------------------------------------------------------------
# NOTE: component creation order defines the page layout; keep it stable.
with gr.Blocks(title="AI Model Arena", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # AI Model Arena
    **Compare AI models head-to-head!**
    Test the same prompt across different models and vote for the best response.
    See which models excel at different tasks.
    *All models run via HuggingFace Inference API*
    """)
    # Hidden state: prompt of the last completed battle. vote_model() refuses
    # to count votes while this is empty.
    current_prompt = gr.State("")
    with gr.Row():
        with gr.Column(scale=2):
            prompt_input = gr.Textbox(
                label="Your Prompt",
                placeholder="Enter a prompt to test both models...",
                lines=3,
            )
            with gr.Row():
                category_dropdown = gr.Dropdown(
                    choices=list(CATEGORIES.keys()),
                    label="Example Category",
                    value="Creative Writing",
                )
                example_btn = gr.Button("📝 Get Example", size="sm")
        with gr.Column(scale=1):
            model1_dropdown = gr.Dropdown(
                choices=list(MODELS.keys()),
                value="Mistral-7B",
                label="Model A",
            )
            model2_dropdown = gr.Dropdown(
                choices=list(MODELS.keys()),
                value="Llama-3.1-8B",
                label="Model B",
            )
    with gr.Row():
        random_btn = gr.Button("🎲 Random Battle", variant="secondary")
        battle_btn = gr.Button("⚔️ Start Battle!", variant="primary", size="lg")
    # Side-by-side response panes, each with its own vote button.
    with gr.Row():
        with gr.Column():
            model1_info = gr.Markdown("**Model A**")
            model1_output = gr.Markdown(label="Model A Response")
            vote1_btn = gr.Button("👍 Vote for Model A", variant="secondary")
        with gr.Column():
            model2_info = gr.Markdown("**Model B**")
            model2_output = gr.Markdown(label="Model B Response")
            vote2_btn = gr.Button("👍 Vote for Model B", variant="secondary")
    vote_result = gr.Markdown("")
    with gr.Accordion("📊 Leaderboard", open=False):
        # Seeded once at build time; refreshed via the button and after votes.
        leaderboard_output = gr.Markdown(get_leaderboard())
        refresh_btn = gr.Button("🔄 Refresh Leaderboard")
    gr.Markdown("""
    ---
    ## Available Models
    | Model | Size | Strengths |
    |-------|------|-----------|
    | Mistral-7B | 7B | Speed, reasoning, code |
    | Llama-3.1-8B | 8B | General knowledge, instructions |
    | Qwen2.5-7B | 7B | Multilingual, math, coding |
    | Phi-3-mini | 3.8B | Efficiency, reasoning |
    | Gemma-2-9B | 9B | Quality, safety |
    | Zephyr-7B | 7B | Helpfulness, alignment |
    ---
    ## Test Categories
    - **Creative Writing** - Poetry, stories, creative tasks
    - **Coding** - Programming challenges
    - **Reasoning** - Logic puzzles, math
    - **Knowledge** - Explanations, facts
    - **Summarization** - Condensing information
    ---
    Built by [Lorenzo Scaturchio](https://huggingface.co/gr8monk3ys)
    """)

    # Event handlers
    # Battle: fan the prompt out to both models; the prompt is also written
    # into current_prompt so subsequent votes are accepted.
    battle_btn.click(
        fn=battle,
        inputs=[prompt_input, model1_dropdown, model2_dropdown],
        outputs=[model1_output, model2_output, model1_info, model2_info, current_prompt],
    )
    example_btn.click(
        fn=get_example_prompt,
        inputs=[category_dropdown],
        outputs=[prompt_input],
    )
    random_btn.click(
        fn=random_battle,
        outputs=[model1_dropdown, model2_dropdown, prompt_input],
    )
    # Voting: record the result, then refresh the leaderboard in a chained step.
    vote1_btn.click(
        fn=lambda m1, m2, p: vote_model(m1, m2, p),
        inputs=[model1_dropdown, model2_dropdown, current_prompt],
        outputs=[vote_result],
    ).then(
        fn=get_leaderboard,
        outputs=[leaderboard_output],
    )
    vote2_btn.click(
        fn=lambda m1, m2, p: vote_model(m2, m1, p),  # args swapped: Model B is the winner
        inputs=[model1_dropdown, model2_dropdown, current_prompt],
        outputs=[vote_result],
    ).then(
        fn=get_leaderboard,
        outputs=[leaderboard_output],
    )
    refresh_btn.click(
        fn=get_leaderboard,
        outputs=[leaderboard_output],
    )


if __name__ == "__main__":
    demo.launch()