import gradio as gr import pandas as pd import plotly.graph_objects as go from huggingface_hub import InferenceClient import time import json import re from io import StringIO # ============================================================================= # LLM Evaluation Dashboard # ============================================================================= MODELS = { "Mistral-7B": "mistralai/Mistral-7B-Instruct-v0.2", "Llama-3.2-3B": "meta-llama/Llama-3.2-3B-Instruct", "Llama-3.1-70B": "meta-llama/Llama-3.1-70B-Instruct", "Qwen2.5-72B": "Qwen/Qwen2.5-72B-Instruct", "Qwen2.5-Coder": "Qwen/Qwen2.5-Coder-32B-Instruct" } MODEL_INFO = { "Mistral-7B": {"params": "7B", "type": "General", "org": "Mistral AI"}, "Llama-3.2-3B": {"params": "3B", "type": "General", "org": "Meta"}, "Llama-3.1-70B": {"params": "70B", "type": "General", "org": "Meta"}, "Qwen2.5-72B": {"params": "72B", "type": "General", "org": "Alibaba"}, "Qwen2.5-Coder": {"params": "32B", "type": "Code", "org": "Alibaba"} } EVAL_TASKS = { "reasoning": { "name": "Reasoning (Math)", "description": "Tests logical and mathematical reasoning", "tasks": [ {"id": "math_1", "prompt": "A store sells apples for $2 each. If I buy 3 apples and pay with a $10 bill, how much change do I get? Answer with just the number.", "expected": "4", "check_type": "contains"}, {"id": "math_2", "prompt": "If a train travels at 60 mph for 2.5 hours, how many miles does it travel? Answer with just the number.", "expected": "150", "check_type": "contains"}, {"id": "math_3", "prompt": "A rectangle has length 8 and width 5. What is its area? Answer with just the number.", "expected": "40", "check_type": "contains"}, {"id": "logic_1", "prompt": "If all roses are flowers, and some flowers fade quickly, can we conclude that some roses fade quickly? Answer only yes or no.", "expected": "no", "check_type": "contains_lower"}, {"id": "logic_2", "prompt": "I have a brother. My brother has a brother. How many brothers minimum are in the family? Answer with just the number.", "expected": "2", "check_type": "contains"} ] }, "knowledge": { "name": "Knowledge (Facts)", "description": "Tests factual accuracy", "tasks": [ {"id": "fact_1", "prompt": "What is the chemical symbol for gold? Answer with just the symbol.", "expected": "Au", "check_type": "contains"}, {"id": "fact_2", "prompt": "In what year did World War 2 end? Answer with just the year.", "expected": "1945", "check_type": "contains"}, {"id": "fact_3", "prompt": "What planet is known as the Red Planet? Answer with just the name.", "expected": "Mars", "check_type": "contains_lower"}, {"id": "fact_4", "prompt": "How many sides does a hexagon have? Answer with just the number.", "expected": "6", "check_type": "contains"}, {"id": "fact_5", "prompt": "What is the capital of Japan? Answer with just the city name.", "expected": "Tokyo", "check_type": "contains_lower"} ] }, "instruction": { "name": "Instruction Following", "description": "Tests ability to follow format instructions", "tasks": [ {"id": "json_1", "prompt": "Return a JSON object with keys name and age for a 25 year old person named Alice. Return ONLY the JSON, no explanation.", "expected": "name", "check_type": "json_valid"}, {"id": "format_1", "prompt": "List exactly 3 colors, one per line, no numbers or bullets.", "expected": "3_lines", "check_type": "line_count"}, {"id": "format_2", "prompt": "Write a single sentence of exactly 5 words about cats.", "expected": "5", "check_type": "word_count"}, {"id": "constraint_1", "prompt": "Name a fruit. Your answer must start with the letter A. Answer with just the fruit name.", "expected": "a", "check_type": "starts_with_lower"}, {"id": "constraint_2", "prompt": "Give me a number between 1 and 10. Answer with ONLY the number, nothing else.", "expected": "single_digit", "check_type": "is_single_number"} ] } } def query_model(model_id: str, prompt: str, max_tokens: int = 256) -> dict: client = InferenceClient(model=model_id) messages = [{"role": "user", "content": prompt}] start_time = time.time() try: response = client.chat_completion(messages=messages, max_tokens=max_tokens, temperature=0.7) latency = time.time() - start_time return {"response": response.choices[0].message.content, "latency": latency, "error": None} except Exception as e: latency = time.time() - start_time return {"response": None, "latency": latency, "error": str(e)} def check_answer(response: str, expected: str, check_type: str) -> dict: if response is None: return {"score": 0, "explanation": "No response (error)"} response = response.strip() if check_type == "contains": passed = expected in response return {"score": 1 if passed else 0, "explanation": f"Looking for '{expected}' in response"} if check_type == "contains_lower": passed = expected.lower() in response.lower() return {"score": 1 if passed else 0, "explanation": f"Looking for '{expected}' (case-insensitive)"} if check_type == "starts_with_lower": first_word = response.split()[0] if response.split() else "" passed = first_word.lower().startswith(expected.lower()) return {"score": 1 if passed else 0, "explanation": f"Should start with '{expected}'"} if check_type == "json_valid": try: json_match = re.search(r'\{[^{}]*\}', response) passed = json_match is not None if passed: json.loads(json_match.group()) except: passed = False return {"score": 1 if passed else 0, "explanation": "Checking for valid JSON"} if check_type == "line_count": lines = [l for l in response.strip().split("\n") if l.strip()] expected_count = int(expected.split("_")[0]) passed = len(lines) == expected_count return {"score": 1 if passed else 0, "explanation": f"Expected {expected_count} lines, got {len(lines)}"} if check_type == "word_count": words = response.split() expected_count = int(expected) passed = len(words) == expected_count return {"score": 1 if passed else 0, "explanation": f"Expected {expected_count} words, got {len(words)}"} if check_type == "is_single_number": numbers = re.findall(r'\b([1-9]|10)\b', response) passed = len(numbers) >= 1 and len(response.strip()) <= 3 return {"score": 1 if passed else 0, "explanation": "Should be a single number 1-10"} return {"score": 0, "explanation": f"Unknown check type: {check_type}"} # Pre-computed results PRECOMPUTED_CSV = """model,category,category_name,task_id,score,latency,response Mistral-7B,reasoning,Reasoning (Math),math_1,1,0.4,4 Mistral-7B,reasoning,Reasoning (Math),math_2,1,0.2,150 Mistral-7B,reasoning,Reasoning (Math),math_3,1,0.2,40 Mistral-7B,reasoning,Reasoning (Math),logic_1,0,0.2,yes Mistral-7B,reasoning,Reasoning (Math),logic_2,0,0.2,3 Mistral-7B,knowledge,Knowledge (Facts),fact_1,1,0.3,Au Mistral-7B,knowledge,Knowledge (Facts),fact_2,1,0.8,1945 Mistral-7B,knowledge,Knowledge (Facts),fact_3,1,0.2,Mars Mistral-7B,knowledge,Knowledge (Facts),fact_4,1,0.2,6 Mistral-7B,knowledge,Knowledge (Facts),fact_5,1,0.2,Tokyo Mistral-7B,instruction,Instruction Following,json_1,1,1.9,valid json Mistral-7B,instruction,Instruction Following,format_1,1,0.3,3 lines Mistral-7B,instruction,Instruction Following,format_2,0,0.3,6 words Mistral-7B,instruction,Instruction Following,constraint_1,1,0.2,Apple Mistral-7B,instruction,Instruction Following,constraint_2,1,0.2,7 Llama-3.2-3B,reasoning,Reasoning (Math),math_1,1,2.2,4 Llama-3.2-3B,reasoning,Reasoning (Math),math_2,1,5.8,150 Llama-3.2-3B,reasoning,Reasoning (Math),math_3,1,3.5,40 Llama-3.2-3B,reasoning,Reasoning (Math),logic_1,0,0.9,yes Llama-3.2-3B,reasoning,Reasoning (Math),logic_2,0,4.8,3 Llama-3.2-3B,knowledge,Knowledge (Facts),fact_1,1,5.6,Au Llama-3.2-3B,knowledge,Knowledge (Facts),fact_2,1,2.9,1945 Llama-3.2-3B,knowledge,Knowledge (Facts),fact_3,1,0.8,Mars Llama-3.2-3B,knowledge,Knowledge (Facts),fact_4,1,2.7,6 Llama-3.2-3B,knowledge,Knowledge (Facts),fact_5,1,3.8,Tokyo Llama-3.2-3B,instruction,Instruction Following,json_1,0,0.1,error Llama-3.2-3B,instruction,Instruction Following,format_1,0,0.1,error Llama-3.2-3B,instruction,Instruction Following,format_2,0,0.1,error Llama-3.2-3B,instruction,Instruction Following,constraint_1,0,0.1,error Llama-3.2-3B,instruction,Instruction Following,constraint_2,0,0.1,error Qwen2.5-72B,reasoning,Reasoning (Math),math_1,0,0.1,error Qwen2.5-72B,reasoning,Reasoning (Math),math_2,0,0.1,error Qwen2.5-72B,reasoning,Reasoning (Math),math_3,0,0.1,error Qwen2.5-72B,reasoning,Reasoning (Math),logic_1,0,0.1,error Qwen2.5-72B,reasoning,Reasoning (Math),logic_2,0,0.1,error Qwen2.5-72B,knowledge,Knowledge (Facts),fact_1,0,0.1,error Qwen2.5-72B,knowledge,Knowledge (Facts),fact_2,1,0.9,1945 Qwen2.5-72B,knowledge,Knowledge (Facts),fact_3,1,1.0,Mars Qwen2.5-72B,knowledge,Knowledge (Facts),fact_4,1,0.5,6 Qwen2.5-72B,knowledge,Knowledge (Facts),fact_5,1,0.8,Tokyo Qwen2.5-72B,instruction,Instruction Following,json_1,1,1.2,valid json Qwen2.5-72B,instruction,Instruction Following,format_1,1,0.9,3 lines Qwen2.5-72B,instruction,Instruction Following,format_2,1,1.1,5 words Qwen2.5-72B,instruction,Instruction Following,constraint_1,1,0.7,Apple Qwen2.5-72B,instruction,Instruction Following,constraint_2,1,1.0,5 Qwen2.5-Coder,reasoning,Reasoning (Math),math_1,1,0.4,4 Qwen2.5-Coder,reasoning,Reasoning (Math),math_2,1,0.4,150 Qwen2.5-Coder,reasoning,Reasoning (Math),math_3,1,0.4,40 Qwen2.5-Coder,reasoning,Reasoning (Math),logic_1,1,0.4,no Qwen2.5-Coder,reasoning,Reasoning (Math),logic_2,0,0.5,3 Qwen2.5-Coder,knowledge,Knowledge (Facts),fact_1,1,0.4,Au Qwen2.5-Coder,knowledge,Knowledge (Facts),fact_2,1,0.4,1945 Qwen2.5-Coder,knowledge,Knowledge (Facts),fact_3,1,0.2,Mars Qwen2.5-Coder,knowledge,Knowledge (Facts),fact_4,1,0.4,6 Qwen2.5-Coder,knowledge,Knowledge (Facts),fact_5,1,0.2,Tokyo Qwen2.5-Coder,instruction,Instruction Following,json_1,0,0.1,error Qwen2.5-Coder,instruction,Instruction Following,format_1,0,0.1,error Qwen2.5-Coder,instruction,Instruction Following,format_2,0,0.1,error Qwen2.5-Coder,instruction,Instruction Following,constraint_1,0,0.1,error Qwen2.5-Coder,instruction,Instruction Following,constraint_2,0,0.1,error Llama-3.1-70B,reasoning,Reasoning (Math),math_1,0,0.04,error Llama-3.1-70B,reasoning,Reasoning (Math),math_2,0,0.04,error Llama-3.1-70B,reasoning,Reasoning (Math),math_3,0,0.04,error Llama-3.1-70B,reasoning,Reasoning (Math),logic_1,0,0.04,error Llama-3.1-70B,reasoning,Reasoning (Math),logic_2,0,0.04,error Llama-3.1-70B,knowledge,Knowledge (Facts),fact_1,0,0.04,error Llama-3.1-70B,knowledge,Knowledge (Facts),fact_2,0,0.04,error Llama-3.1-70B,knowledge,Knowledge (Facts),fact_3,0,0.04,error Llama-3.1-70B,knowledge,Knowledge (Facts),fact_4,0,0.04,error Llama-3.1-70B,knowledge,Knowledge (Facts),fact_5,0,0.04,error Llama-3.1-70B,instruction,Instruction Following,json_1,0,0.04,error Llama-3.1-70B,instruction,Instruction Following,format_1,0,0.04,error Llama-3.1-70B,instruction,Instruction Following,format_2,0,0.04,error Llama-3.1-70B,instruction,Instruction Following,constraint_1,0,0.04,error Llama-3.1-70B,instruction,Instruction Following,constraint_2,0,0.04,error""" EVAL_RESULTS = pd.read_csv(StringIO(PRECOMPUTED_CSV)) def get_summary_stats(): model_acc = EVAL_RESULTS.groupby('model')['score'].mean().sort_values(ascending=False) best_model = model_acc.index[0] best_acc = model_acc.values[0] * 100 html = f"""
{best_model}
{best_acc:.1f}% accuracy
15
Across 3 categories
{len(MODELS)}
3B to 72B parameters
Please enter a prompt.
" if not model_choices: return "Please select at least one model.
" results_html = "{response_text}