# model-arena / app.py
# Uploaded via huggingface_hub by gr8monk3ys (commit 252cc7d, verified)
"""
AI Model Arena - Compare AI model outputs side by side.
Test prompts across multiple models and vote for the best response.
"""
import gradio as gr
from huggingface_hub import InferenceClient
import random
import time
# ---------------------------------------------------------------------------
# Model Configurations
# ---------------------------------------------------------------------------
# Display name -> Hub repo id plus the short blurbs rendered beside each
# response.  NOTE: dict insertion order is the order the model dropdowns
# list their choices in, so keep it stable.
MODELS = {
    "Mistral-7B": {
        "id": "mistralai/Mistral-7B-Instruct-v0.3",
        "description": "Fast, efficient 7B parameter model from Mistral AI",
        "strengths": "Speed, reasoning, code",
    },
    "Llama-3.1-8B": {
        "id": "meta-llama/Llama-3.1-8B-Instruct",
        "description": "Meta's latest open LLM with strong capabilities",
        "strengths": "General knowledge, instruction following",
    },
    "Qwen2.5-7B": {
        "id": "Qwen/Qwen2.5-7B-Instruct",
        "description": "Alibaba's powerful multilingual model",
        "strengths": "Multilingual, math, coding",
    },
    "Phi-3-mini": {
        "id": "microsoft/Phi-3-mini-4k-instruct",
        "description": "Microsoft's compact but capable model",
        "strengths": "Efficiency, reasoning, small size",
    },
    "Gemma-2-9B": {
        "id": "google/gemma-2-9b-it",
        "description": "Google's instruction-tuned Gemma model",
        "strengths": "Quality, safety, general tasks",
    },
    "Zephyr-7B": {
        "id": "HuggingFaceH4/zephyr-7b-beta",
        "description": "Fine-tuned Mistral with DPO alignment",
        "strengths": "Helpfulness, alignment, chat",
    },
}
# Example prompts, grouped by task category.  Each category feeds the
# "Get Example" button and the random-battle picker.
CATEGORIES = {
    "Creative Writing": [
        "Write a haiku about artificial intelligence",
        "Create a short story opening about a robot discovering emotions",
        "Write a limerick about machine learning",
        "Compose a brief poem about the future of technology",
    ],
    "Coding": [
        "Write a Python function to check if a number is prime",
        "Create a JavaScript function to reverse a string",
        "Write a SQL query to find duplicate emails in a users table",
        "Implement a simple stack data structure in Python",
    ],
    "Reasoning": [
        "If all roses are flowers and some flowers fade quickly, can we conclude that some roses fade quickly?",
        "A bat and ball cost $1.10 total. The bat costs $1 more than the ball. How much does the ball cost?",
        "What comes next in the sequence: 2, 6, 12, 20, 30, ?",
        "If it takes 5 machines 5 minutes to make 5 widgets, how long would it take 100 machines to make 100 widgets?",
    ],
    "Knowledge": [
        "Explain quantum entanglement in simple terms",
        "What are the main differences between TCP and UDP?",
        "Briefly explain how transformers work in machine learning",
        "What is the difference between machine learning and deep learning?",
    ],
    "Summarization": [
        "Summarize the concept of blockchain technology in 2-3 sentences",
        "Explain the main idea behind reinforcement learning briefly",
        "Summarize what makes Python popular for data science",
        "Briefly explain the concept of transfer learning",
    ],
}
# ---------------------------------------------------------------------------
# State Management
# ---------------------------------------------------------------------------
# Per-model vote tally keyed by display name.  Held in process memory only,
# so it resets whenever the Space restarts.
vote_counts = {name: {"wins": 0, "battles": 0} for name in MODELS}
# ---------------------------------------------------------------------------
# Core Functions
# ---------------------------------------------------------------------------
def get_model_response(model_id: str, prompt: str, max_tokens: int = 500) -> tuple:
    """Query one model through the HF Inference API and time the call.

    Returns a 3-tuple ``(text, seconds, error)``: the generated text on
    success (``error`` is None), or ``text`` None with ``error`` set to the
    exception message on failure.  ``seconds`` is wall-clock latency either
    way.
    """
    client = InferenceClient(model_id)
    started = time.time()
    try:
        result = client.chat_completion(
            messages=[{"role": "user", "content": prompt}],
            max_tokens=max_tokens,
            temperature=0.7,
        )
    except Exception as exc:  # surface API failures to the UI instead of crashing
        return None, time.time() - started, str(exc)
    return result.choices[0].message.content, time.time() - started, None
def battle(prompt: str, model1_name: str, model2_name: str) -> tuple:
    """Run a head-to-head battle between two models on the same prompt.

    Parameters
    ----------
    prompt : str
        User prompt sent to both models; blank/whitespace-only prompts
        short-circuit with an instructional message.
    model1_name, model2_name : str
        Display names, i.e. keys of ``MODELS``.

    Returns a 5-tuple of markdown strings
    ``(output1, output2, info1, info2, prompt)`` — the two responses, the
    two model blurbs, and the prompt itself (stored as Gradio state so the
    vote buttons know a battle actually ran).
    """
    if not prompt.strip():
        return "Please enter a prompt.", "", "", "", ""

    model1_id = MODELS[model1_name]["id"]
    model2_id = MODELS[model2_name]["id"]

    # The two API calls are independent blocking network requests: run them
    # concurrently so the user waits for the slower one, not for the sum.
    from concurrent.futures import ThreadPoolExecutor

    with ThreadPoolExecutor(max_workers=2) as pool:
        future1 = pool.submit(get_model_response, model1_id, prompt)
        future2 = pool.submit(get_model_response, model2_id, prompt)
        resp1, time1, err1 = future1.result()
        resp2, time2, err2 = future2.result()

    def _render(resp, elapsed, err):
        # Show the error prominently; otherwise append a latency footer.
        if err:
            return f"**Error:** {err}"
        return f"{resp}\n\n---\n*Response time: {elapsed:.2f}s*"

    output1 = _render(resp1, time1, err1)
    output2 = _render(resp2, time2, err2)

    info1 = f"**{model1_name}**\n{MODELS[model1_name]['description']}\n*Strengths: {MODELS[model1_name]['strengths']}*"
    info2 = f"**{model2_name}**\n{MODELS[model2_name]['description']}\n*Strengths: {MODELS[model2_name]['strengths']}*"
    return output1, output2, info1, info2, prompt
def vote_model(winner: str, loser: str, prompt: str) -> str:
    """Record one vote: the winner gains a win, both models gain a battle.

    Refuses to count anything when *prompt* is empty, which means no battle
    has been run yet.  Returns a markdown confirmation message.
    """
    if not prompt:
        return "Run a battle first before voting!"
    winner_stats = vote_counts[winner]
    winner_stats["wins"] += 1
    winner_stats["battles"] += 1
    vote_counts[loser]["battles"] += 1
    return (
        f"Voted for **{winner}**! "
        f"Total wins: {winner_stats['wins']}/{winner_stats['battles']}"
    )
def get_leaderboard() -> str:
    """Render the current vote tallies as a markdown table.

    Models are ranked by win rate (ties broken by total wins); a model with
    no battles yet shows '-' in the win-rate column.
    """
    rows = []
    for name, stats in vote_counts.items():
        wins, battles = stats["wins"], stats["battles"]
        rate = (wins / battles * 100) if battles else 0
        rows.append((name, wins, battles, rate))

    # Highest win rate first; more total wins breaks ties.
    rows.sort(key=lambda row: (row[3], row[1]), reverse=True)

    lines = [
        "## Leaderboard",
        "",
        "| Rank | Model | Wins | Battles | Win Rate |",
        "|------|-------|------|---------|----------|",
    ]
    for rank, (name, wins, battles, rate) in enumerate(rows, start=1):
        if battles:
            lines.append(f"| {rank} | {name} | {wins} | {battles} | {rate:.1f}% |")
        else:
            lines.append(f"| {rank} | {name} | 0 | 0 | - |")
    lines.append("")
    lines.append("*Leaderboard resets when the Space restarts*")
    return "\n".join(lines)
def random_battle() -> tuple:
    """Pick two distinct random models and a random example prompt.

    Returns ``(model_a, model_b, prompt)`` ready to populate the two
    dropdowns and the prompt box.
    """
    names = list(MODELS.keys())
    first = random.choice(names)
    # Draw the opponent from everyone except the first pick.
    second = random.choice([n for n in names if n != first])
    topic = random.choice(list(CATEGORIES.keys()))
    return first, second, random.choice(CATEGORIES[topic])
def get_example_prompt(category: str) -> str:
    """Return a random example prompt from *category*, or '' if unknown."""
    prompts = CATEGORIES.get(category)
    return random.choice(prompts) if prompts else ""
# ---------------------------------------------------------------------------
# Gradio Interface
# ---------------------------------------------------------------------------
with gr.Blocks(title="AI Model Arena", theme=gr.themes.Soft()) as demo:
gr.Markdown("""
# AI Model Arena
**Compare AI models head-to-head!**
Test the same prompt across different models and vote for the best response.
See which models excel at different tasks.
*All models run via HuggingFace Inference API*
""")
# Hidden state for current prompt
current_prompt = gr.State("")
with gr.Row():
with gr.Column(scale=2):
prompt_input = gr.Textbox(
label="Your Prompt",
placeholder="Enter a prompt to test both models...",
lines=3,
)
with gr.Row():
category_dropdown = gr.Dropdown(
choices=list(CATEGORIES.keys()),
label="Example Category",
value="Creative Writing",
)
example_btn = gr.Button("📝 Get Example", size="sm")
with gr.Column(scale=1):
model1_dropdown = gr.Dropdown(
choices=list(MODELS.keys()),
value="Mistral-7B",
label="Model A",
)
model2_dropdown = gr.Dropdown(
choices=list(MODELS.keys()),
value="Llama-3.1-8B",
label="Model B",
)
with gr.Row():
random_btn = gr.Button("🎲 Random Battle", variant="secondary")
battle_btn = gr.Button("⚔️ Start Battle!", variant="primary", size="lg")
with gr.Row():
with gr.Column():
model1_info = gr.Markdown("**Model A**")
model1_output = gr.Markdown(label="Model A Response")
vote1_btn = gr.Button("👍 Vote for Model A", variant="secondary")
with gr.Column():
model2_info = gr.Markdown("**Model B**")
model2_output = gr.Markdown(label="Model B Response")
vote2_btn = gr.Button("👍 Vote for Model B", variant="secondary")
vote_result = gr.Markdown("")
with gr.Accordion("📊 Leaderboard", open=False):
leaderboard_output = gr.Markdown(get_leaderboard())
refresh_btn = gr.Button("🔄 Refresh Leaderboard")
gr.Markdown("""
---
## Available Models
| Model | Size | Strengths |
|-------|------|-----------|
| Mistral-7B | 7B | Speed, reasoning, code |
| Llama-3.1-8B | 8B | General knowledge, instructions |
| Qwen2.5-7B | 7B | Multilingual, math, coding |
| Phi-3-mini | 3.8B | Efficiency, reasoning |
| Gemma-2-9B | 9B | Quality, safety |
| Zephyr-7B | 7B | Helpfulness, alignment |
---
## Test Categories
- **Creative Writing** - Poetry, stories, creative tasks
- **Coding** - Programming challenges
- **Reasoning** - Logic puzzles, math
- **Knowledge** - Explanations, facts
- **Summarization** - Condensing information
---
Built by [Lorenzo Scaturchio](https://huggingface.co/gr8monk3ys)
""")
# Event handlers
battle_btn.click(
fn=battle,
inputs=[prompt_input, model1_dropdown, model2_dropdown],
outputs=[model1_output, model2_output, model1_info, model2_info, current_prompt],
)
example_btn.click(
fn=get_example_prompt,
inputs=[category_dropdown],
outputs=[prompt_input],
)
random_btn.click(
fn=random_battle,
outputs=[model1_dropdown, model2_dropdown, prompt_input],
)
vote1_btn.click(
fn=lambda m1, m2, p: vote_model(m1, m2, p),
inputs=[model1_dropdown, model2_dropdown, current_prompt],
outputs=[vote_result],
).then(
fn=get_leaderboard,
outputs=[leaderboard_output],
)
vote2_btn.click(
fn=lambda m1, m2, p: vote_model(m2, m1, p),
inputs=[model1_dropdown, model2_dropdown, current_prompt],
outputs=[vote_result],
).then(
fn=get_leaderboard,
outputs=[leaderboard_output],
)
refresh_btn.click(
fn=get_leaderboard,
outputs=[leaderboard_output],
)
if __name__ == "__main__":
demo.launch()