Spaces:

Nav772
/

llm-evaluation-dashboard

Sleeping

App Files Files Community

Nav772 committed on Feb 22

Commit

e998535

verified ·

1 Parent(s): 3f93d90

Upload app.py with huggingface_hub

Browse files

Files changed (1) hide show

app.py +51 -58

app.py CHANGED Viewed

@@ -5,15 +5,12 @@ from huggingface_hub import InferenceClient
 import time
 import json
 import re
 # =============================================================================
 # LLM Evaluation Dashboard
 # =============================================================================
-# Compares multiple LLMs across reasoning, knowledge, and instruction-following
-# Uses HuggingFace Inference API (free tier)
-# =============================================================================
-# Models to evaluate
 MODELS = {
     "Mistral-7B": "mistralai/Mistral-7B-Instruct-v0.2",
     "Llama-3.2-3B": "meta-llama/Llama-3.2-3B-Instruct",
@@ -30,7 +27,6 @@ MODEL_INFO = {
     "Qwen2.5-Coder": {"params": "32B", "type": "Code", "org": "Alibaba"}
 }
-# Evaluation tasks
 EVAL_TASKS = {
     "reasoning": {
         "name": "Reasoning (Math)",
@@ -39,7 +35,7 @@ EVAL_TASKS = {
             {"id": "math_1", "prompt": "A store sells apples for $2 each. If I buy 3 apples and pay with a $10 bill, how much change do I get? Answer with just the number.", "expected": "4", "check_type": "contains"},
             {"id": "math_2", "prompt": "If a train travels at 60 mph for 2.5 hours, how many miles does it travel? Answer with just the number.", "expected": "150", "check_type": "contains"},
             {"id": "math_3", "prompt": "A rectangle has length 8 and width 5. What is its area? Answer with just the number.", "expected": "40", "check_type": "contains"},
-            {"id": "logic_1", "prompt": "If all roses are flowers, and some flowers fade quickly, can we conclude that some roses fade quickly? Answer only 'yes' or 'no'.", "expected": "no", "check_type": "contains_lower"},
             {"id": "logic_2", "prompt": "I have a brother. My brother has a brother. How many brothers minimum are in the family? Answer with just the number.", "expected": "2", "check_type": "contains"}
         ]
     },
@@ -58,27 +54,22 @@ EVAL_TASKS = {
         "name": "Instruction Following",
         "description": "Tests ability to follow format instructions",
         "tasks": [
-            {"id": "json_1", "prompt": "Return a JSON object with keys 'name' and 'age' for a 25 year old person named Alice. Return ONLY the JSON, no explanation.", "expected": '{"name"', "check_type": "json_valid"},
             {"id": "format_1", "prompt": "List exactly 3 colors, one per line, no numbers or bullets.", "expected": "3_lines", "check_type": "line_count"},
             {"id": "format_2", "prompt": "Write a single sentence of exactly 5 words about cats.", "expected": "5", "check_type": "word_count"},
-            {"id": "constraint_1", "prompt": "Name a fruit. Your answer must start with the letter 'A'. Answer with just the fruit name.", "expected": "a", "check_type": "starts_with_lower"},
             {"id": "constraint_2", "prompt": "Give me a number between 1 and 10. Answer with ONLY the number, nothing else.", "expected": "single_digit", "check_type": "is_single_number"}
         ]
     }
 }
 def query_model(model_id: str, prompt: str, max_tokens: int = 256) -> dict:
-    """Query a model via HF Inference API."""
     client = InferenceClient(model=model_id)
     messages = [{"role": "user", "content": prompt}]
     start_time = time.time()
     try:
-        response = client.chat_completion(
-            messages=messages,
-            max_tokens=max_tokens,
-            temperature=0.7
-        )
         latency = time.time() - start_time
         return {"response": response.choices[0].message.content, "latency": latency, "error": None}
     except Exception as e:
@@ -86,7 +77,6 @@ def query_model(model_id: str, prompt: str, max_tokens: int = 256) -> dict:
         return {"response": None, "latency": latency, "error": str(e)}
 def check_answer(response: str, expected: str, check_type: str) -> dict:
-    """Check if response matches expected answer."""
     if response is None:
         return {"score": 0, "explanation": "No response (error)"}
@@ -108,7 +98,9 @@ def check_answer(response: str, expected: str, check_type: str) -> dict:
     if check_type == "json_valid":
         try:
             json_match = re.search(r'\{[^{}]*\}', response)
-            passed = json_match is not None and json.loads(json_match.group())
         except:
             passed = False
         return {"score": 1 if passed else 0, "explanation": "Checking for valid JSON"}
@@ -132,8 +124,8 @@ def check_answer(response: str, expected: str, check_type: str) -> dict:
     return {"score": 0, "explanation": f"Unknown check type: {check_type}"}
-# Pre-computed results (from our evaluation run)
-PRECOMPUTED_RESULTS = """model,category,category_name,task_id,score,latency,response
 Mistral-7B,reasoning,Reasoning (Math),math_1,1,0.4,4
 Mistral-7B,reasoning,Reasoning (Math),math_2,1,0.2,150
 Mistral-7B,reasoning,Reasoning (Math),math_3,1,0.2,40
@@ -144,7 +136,7 @@ Mistral-7B,knowledge,Knowledge (Facts),fact_2,1,0.8,1945
 Mistral-7B,knowledge,Knowledge (Facts),fact_3,1,0.2,Mars
 Mistral-7B,knowledge,Knowledge (Facts),fact_4,1,0.2,6
 Mistral-7B,knowledge,Knowledge (Facts),fact_5,1,0.2,Tokyo
-Mistral-7B,instruction,Instruction Following,json_1,1,1.9,valid
 Mistral-7B,instruction,Instruction Following,format_1,1,0.3,3 lines
 Mistral-7B,instruction,Instruction Following,format_2,0,0.3,6 words
 Mistral-7B,instruction,Instruction Following,constraint_1,1,0.2,Apple
@@ -174,7 +166,7 @@ Qwen2.5-72B,knowledge,Knowledge (Facts),fact_2,1,0.9,1945
 Qwen2.5-72B,knowledge,Knowledge (Facts),fact_3,1,1.0,Mars
 Qwen2.5-72B,knowledge,Knowledge (Facts),fact_4,1,0.5,6
 Qwen2.5-72B,knowledge,Knowledge (Facts),fact_5,1,0.8,Tokyo
-Qwen2.5-72B,instruction,Instruction Following,json_1,1,1.2,valid
 Qwen2.5-72B,instruction,Instruction Following,format_1,1,0.9,3 lines
 Qwen2.5-72B,instruction,Instruction Following,format_2,1,1.1,5 words
 Qwen2.5-72B,instruction,Instruction Following,constraint_1,1,0.7,Apple
@@ -210,18 +202,15 @@ Llama-3.1-70B,instruction,Instruction Following,format_2,0,0.04,error
 Llama-3.1-70B,instruction,Instruction Following,constraint_1,0,0.04,error
 Llama-3.1-70B,instruction,Instruction Following,constraint_2,0,0.04,error"""
-# Load pre-computed results
-from io import StringIO
-EVAL_RESULTS = pd.read_csv(StringIO(PRECOMPUTED_RESULTS))
 def get_summary_stats():
-    """Generate summary statistics HTML."""
     model_acc = EVAL_RESULTS.groupby('model')['score'].mean().sort_values(ascending=False)
     best_model = model_acc.index[0]
     best_acc = model_acc.values[0] * 100
     html = f"""
-    <div style="display: flex; gap: 20px; flex-wrap: wrap; justify-content: center;">
         <div style="background: linear-gradient(135deg, #e8f5e9, #c8e6c9); padding: 20px; border-radius: 12px; flex: 1; min-width: 180px; max-width: 250px; box-shadow: 0 2px 8px rgba(0,0,0,0.1);">
             <h3 style="margin: 0; color: #2e7d32; font-size: 14px;">🏆 Best Model</h3>
             <p style="font-size: 22px; margin: 10px 0; font-weight: bold; color: #1b5e20;">{best_model}</p>
@@ -242,7 +231,6 @@ def get_summary_stats():
     return html
 def get_accuracy_chart():
-    """Create overall accuracy bar chart."""
     model_accuracy = EVAL_RESULTS.groupby('model')['score'].mean().sort_values(ascending=True)
     fig = go.Figure(go.Bar(
@@ -255,7 +243,7 @@ def get_accuracy_chart():
         textfont=dict(color='white', size=14)
     ))
     fig.update_layout(
-        title="Overall Accuracy by Model",
         xaxis_title="Accuracy (%)",
         yaxis_title="",
         height=350,
@@ -265,34 +253,44 @@ def get_accuracy_chart():
     return fig
 def get_category_heatmap():
-    """Create accuracy heatmap by category."""
-    category_model_acc = EVAL_RESULTS.pivot_table(
         values='score',
         index='model',
         columns='category_name',
         aggfunc='mean'
-    ) * 100
     fig = go.Figure(data=go.Heatmap(
-        z=category_model_acc.values,
-        x=category_model_acc.columns,
-        y=category_model_acc.index,
         colorscale='RdYlGn',
-        text=[[f"{v:.0f}%" for v in row] for row in category_model_acc.values],
         texttemplate="%{text}",
         textfont={"size": 14},
         zmin=0,
-        zmax=100
     ))
     fig.update_layout(
-        title="Accuracy by Model and Task Category",
         height=350,
-        margin=dict(l=20, r=20, t=50, b=40)
     )
     return fig
 def get_latency_chart():
-    """Create latency comparison chart."""
     valid_latency = EVAL_RESULTS[EVAL_RESULTS['latency'] > 0.05]
     latency_by_model = valid_latency.groupby('model')['latency'].mean().sort_values()
@@ -304,7 +302,7 @@ def get_latency_chart():
         textposition='outside'
     ))
     fig.update_layout(
-        title="Average Response Latency",
         xaxis_title="",
         yaxis_title="Latency (seconds)",
         height=350,
@@ -313,7 +311,6 @@ def get_latency_chart():
     return fig
 def get_detailed_results(model_filter, category_filter):
-    """Get filtered detailed results."""
     df = EVAL_RESULTS.copy()
     if model_filter != "All":
@@ -329,12 +326,11 @@ def get_detailed_results(model_filter, category_filter):
     return display_df
 def run_live_comparison(prompt, model_choices):
-    """Run live comparison with custom prompt."""
     if not prompt.strip():
-        return "Please enter a prompt."
     if not model_choices:
-        return "Please select at least one model."
     results_html = "<div style='display: flex; flex-direction: column; gap: 15px;'>"
@@ -343,18 +339,20 @@ def run_live_comparison(prompt, model_choices):
             result = query_model(MODELS[model_name], prompt, max_tokens=200)
             if result["error"]:
-                response_text = f"❌ Error: {result['error'][:100]}"
                 color = "#ffebee"
                 border_color = "#c62828"
             else:
                 response_text = result["response"]
                 color = "#e8f5e9"
                 border_color = "#2e7d32"
             results_html += f"""
             <div style="background: {color}; padding: 15px; border-radius: 8px; border-left: 4px solid {border_color};">
-                <h4 style="margin: 0 0 10px 0;">{model_name} <span style="font-weight: normal; color: #666;">({result['latency']:.2f}s)</span></h4>
-                <p style="margin: 0; white-space: pre-wrap;">{response_text}</p>
             </div>
             """
@@ -371,16 +369,14 @@ with gr.Blocks(title="LLM Evaluation Dashboard", theme=gr.themes.Soft()) as demo
     gr.HTML(get_summary_stats())
-    gr.Markdown("---")
     with gr.Row():
         with gr.Column():
-            gr.Plot(get_accuracy_chart())
         with gr.Column():
-            gr.Plot(get_latency_chart())
     with gr.Row():
-        gr.Plot(get_category_heatmap())
     gr.Markdown("---")
     gr.Markdown("## 📋 Detailed Results")
@@ -396,7 +392,7 @@ with gr.Blocks(title="LLM Evaluation Dashboard", theme=gr.themes.Soft()) as demo
     gr.Markdown("---")
     gr.Markdown("## 🔄 Live Model Comparison")
-    gr.Markdown("Test the models yourself with custom prompts!")
     with gr.Row():
         with gr.Column(scale=2):
@@ -413,14 +409,11 @@ with gr.Blocks(title="LLM Evaluation Dashboard", theme=gr.themes.Soft()) as demo
     ---
     ### 📚 About This Evaluation
-    **Models Tested:** Mistral-7B, Llama-3.2-3B, Llama-3.1-70B, Qwen2.5-72B, Qwen2.5-Coder-32B
-    **Task Categories:**
-    - **Reasoning:** Math word problems and logic puzzles
-    - **Knowledge:** Factual questions (science, history, geography)
-    - **Instruction Following:** Format compliance (JSON, line count, constraints)
-    Built as part of an AI/ML Engineering portfolio project.
     """)
 if __name__ == "__main__":

 import time
 import json
 import re
+from io import StringIO
 # =============================================================================
 # LLM Evaluation Dashboard
 # =============================================================================
 MODELS = {
     "Mistral-7B": "mistralai/Mistral-7B-Instruct-v0.2",
     "Llama-3.2-3B": "meta-llama/Llama-3.2-3B-Instruct",
     "Qwen2.5-Coder": {"params": "32B", "type": "Code", "org": "Alibaba"}
 }
 EVAL_TASKS = {
     "reasoning": {
         "name": "Reasoning (Math)",
             {"id": "math_1", "prompt": "A store sells apples for $2 each. If I buy 3 apples and pay with a $10 bill, how much change do I get? Answer with just the number.", "expected": "4", "check_type": "contains"},
             {"id": "math_2", "prompt": "If a train travels at 60 mph for 2.5 hours, how many miles does it travel? Answer with just the number.", "expected": "150", "check_type": "contains"},
             {"id": "math_3", "prompt": "A rectangle has length 8 and width 5. What is its area? Answer with just the number.", "expected": "40", "check_type": "contains"},
+            {"id": "logic_1", "prompt": "If all roses are flowers, and some flowers fade quickly, can we conclude that some roses fade quickly? Answer only yes or no.", "expected": "no", "check_type": "contains_lower"},
             {"id": "logic_2", "prompt": "I have a brother. My brother has a brother. How many brothers minimum are in the family? Answer with just the number.", "expected": "2", "check_type": "contains"}
         ]
     },
         "name": "Instruction Following",
         "description": "Tests ability to follow format instructions",
         "tasks": [
+            {"id": "json_1", "prompt": "Return a JSON object with keys name and age for a 25 year old person named Alice. Return ONLY the JSON, no explanation.", "expected": "name", "check_type": "json_valid"},
             {"id": "format_1", "prompt": "List exactly 3 colors, one per line, no numbers or bullets.", "expected": "3_lines", "check_type": "line_count"},
             {"id": "format_2", "prompt": "Write a single sentence of exactly 5 words about cats.", "expected": "5", "check_type": "word_count"},
+            {"id": "constraint_1", "prompt": "Name a fruit. Your answer must start with the letter A. Answer with just the fruit name.", "expected": "a", "check_type": "starts_with_lower"},
             {"id": "constraint_2", "prompt": "Give me a number between 1 and 10. Answer with ONLY the number, nothing else.", "expected": "single_digit", "check_type": "is_single_number"}
         ]
     }
 }
 def query_model(model_id: str, prompt: str, max_tokens: int = 256) -> dict:
     client = InferenceClient(model=model_id)
     messages = [{"role": "user", "content": prompt}]
     start_time = time.time()
     try:
+        response = client.chat_completion(messages=messages, max_tokens=max_tokens, temperature=0.7)
         latency = time.time() - start_time
         return {"response": response.choices[0].message.content, "latency": latency, "error": None}
     except Exception as e:
         return {"response": None, "latency": latency, "error": str(e)}
 def check_answer(response: str, expected: str, check_type: str) -> dict:
     if response is None:
         return {"score": 0, "explanation": "No response (error)"}
     if check_type == "json_valid":
         try:
             json_match = re.search(r'\{[^{}]*\}', response)
+            passed = json_match is not None
+            if passed:
+                json.loads(json_match.group())
         except:
             passed = False
         return {"score": 1 if passed else 0, "explanation": "Checking for valid JSON"}
     return {"score": 0, "explanation": f"Unknown check type: {check_type}"}
+# Pre-computed results
+PRECOMPUTED_CSV = """model,category,category_name,task_id,score,latency,response
 Mistral-7B,reasoning,Reasoning (Math),math_1,1,0.4,4
 Mistral-7B,reasoning,Reasoning (Math),math_2,1,0.2,150
 Mistral-7B,reasoning,Reasoning (Math),math_3,1,0.2,40
 Mistral-7B,knowledge,Knowledge (Facts),fact_3,1,0.2,Mars
 Mistral-7B,knowledge,Knowledge (Facts),fact_4,1,0.2,6
 Mistral-7B,knowledge,Knowledge (Facts),fact_5,1,0.2,Tokyo
+Mistral-7B,instruction,Instruction Following,json_1,1,1.9,valid json
 Mistral-7B,instruction,Instruction Following,format_1,1,0.3,3 lines
 Mistral-7B,instruction,Instruction Following,format_2,0,0.3,6 words
 Mistral-7B,instruction,Instruction Following,constraint_1,1,0.2,Apple
 Qwen2.5-72B,knowledge,Knowledge (Facts),fact_3,1,1.0,Mars
 Qwen2.5-72B,knowledge,Knowledge (Facts),fact_4,1,0.5,6
 Qwen2.5-72B,knowledge,Knowledge (Facts),fact_5,1,0.8,Tokyo
+Qwen2.5-72B,instruction,Instruction Following,json_1,1,1.2,valid json
 Qwen2.5-72B,instruction,Instruction Following,format_1,1,0.9,3 lines
 Qwen2.5-72B,instruction,Instruction Following,format_2,1,1.1,5 words
 Qwen2.5-72B,instruction,Instruction Following,constraint_1,1,0.7,Apple
 Llama-3.1-70B,instruction,Instruction Following,constraint_1,0,0.04,error
 Llama-3.1-70B,instruction,Instruction Following,constraint_2,0,0.04,error"""
+EVAL_RESULTS = pd.read_csv(StringIO(PRECOMPUTED_CSV))
 def get_summary_stats():
     model_acc = EVAL_RESULTS.groupby('model')['score'].mean().sort_values(ascending=False)
     best_model = model_acc.index[0]
     best_acc = model_acc.values[0] * 100
     html = f"""
+    <div style="display: flex; gap: 20px; flex-wrap: wrap; justify-content: center; margin-bottom: 20px;">
         <div style="background: linear-gradient(135deg, #e8f5e9, #c8e6c9); padding: 20px; border-radius: 12px; flex: 1; min-width: 180px; max-width: 250px; box-shadow: 0 2px 8px rgba(0,0,0,0.1);">
             <h3 style="margin: 0; color: #2e7d32; font-size: 14px;">🏆 Best Model</h3>
             <p style="font-size: 22px; margin: 10px 0; font-weight: bold; color: #1b5e20;">{best_model}</p>
     return html
 def get_accuracy_chart():
     model_accuracy = EVAL_RESULTS.groupby('model')['score'].mean().sort_values(ascending=True)
     fig = go.Figure(go.Bar(
         textfont=dict(color='white', size=14)
     ))
     fig.update_layout(
+        title=dict(text="Overall Accuracy by Model", font=dict(size=16)),
         xaxis_title="Accuracy (%)",
         yaxis_title="",
         height=350,
     return fig
 def get_category_heatmap():
+    # Create pivot table
+    pivot = EVAL_RESULTS.pivot_table(
         values='score',
         index='model',
         columns='category_name',
         aggfunc='mean'
+    ).fillna(0) * 100
+    # Get data as lists
+    models = pivot.index.tolist()
+    categories = pivot.columns.tolist()
+    z_values = pivot.values.tolist()
+    # Create text annotations
+    text_values = [[f"{val:.0f}%" for val in row] for row in z_values]
     fig = go.Figure(data=go.Heatmap(
+        z=z_values,
+        x=categories,
+        y=models,
         colorscale='RdYlGn',
+        text=text_values,
         texttemplate="%{text}",
         textfont={"size": 14},
         zmin=0,
+        zmax=100,
+        showscale=True
     ))
     fig.update_layout(
+        title=dict(text="Accuracy by Model and Task Category", font=dict(size=16)),
         height=350,
+        margin=dict(l=20, r=20, t=50, b=40),
+        xaxis=dict(title="", tickangle=0),
+        yaxis=dict(title="")
     )
     return fig
 def get_latency_chart():
     valid_latency = EVAL_RESULTS[EVAL_RESULTS['latency'] > 0.05]
     latency_by_model = valid_latency.groupby('model')['latency'].mean().sort_values()
         textposition='outside'
     ))
     fig.update_layout(
+        title=dict(text="Average Response Latency", font=dict(size=16)),
         xaxis_title="",
         yaxis_title="Latency (seconds)",
         height=350,
     return fig
 def get_detailed_results(model_filter, category_filter):
     df = EVAL_RESULTS.copy()
     if model_filter != "All":
     return display_df
 def run_live_comparison(prompt, model_choices):
     if not prompt.strip():
+        return "<p style='color: #666;'>Please enter a prompt.</p>"
     if not model_choices:
+        return "<p style='color: #666;'>Please select at least one model.</p>"
     results_html = "<div style='display: flex; flex-direction: column; gap: 15px;'>"
             result = query_model(MODELS[model_name], prompt, max_tokens=200)
             if result["error"]:
+                response_text = f"Error: {result['error'][:100]}"
                 color = "#ffebee"
                 border_color = "#c62828"
+                icon = "❌"
             else:
                 response_text = result["response"]
                 color = "#e8f5e9"
                 border_color = "#2e7d32"
+                icon = "✅"
             results_html += f"""
             <div style="background: {color}; padding: 15px; border-radius: 8px; border-left: 4px solid {border_color};">
+                <h4 style="margin: 0 0 10px 0;">{icon} {model_name} <span style="font-weight: normal; color: #666;">({result['latency']:.2f}s)</span></h4>
+                <p style="margin: 0; white-space: pre-wrap; font-family: sans-serif;">{response_text}</p>
             </div>
             """
     gr.HTML(get_summary_stats())
     with gr.Row():
         with gr.Column():
+            gr.Plot(value=get_accuracy_chart(), label="Accuracy")
         with gr.Column():
+            gr.Plot(value=get_latency_chart(), label="Latency")
     with gr.Row():
+        gr.Plot(value=get_category_heatmap(), label="Category Breakdown")
     gr.Markdown("---")
     gr.Markdown("## 📋 Detailed Results")
     gr.Markdown("---")
     gr.Markdown("## 🔄 Live Model Comparison")
+    gr.Markdown("Test the models with your own prompts!")
     with gr.Row():
         with gr.Column(scale=2):
     ---
     ### 📚 About This Evaluation
+    **Models:** Mistral-7B, Llama-3.2-3B, Llama-3.1-70B, Qwen2.5-72B, Qwen2.5-Coder-32B
+    **Categories:** Reasoning (math/logic), Knowledge (facts), Instruction Following (format compliance)
+    *Built as part of an AI/ML Engineering portfolio project.*
     """)
 if __name__ == "__main__":