Nav772 committed on
Commit
e998535
·
verified ·
1 Parent(s): 3f93d90

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +51 -58
app.py CHANGED
@@ -5,15 +5,12 @@ from huggingface_hub import InferenceClient
5
  import time
6
  import json
7
  import re
 
8
 
9
  # =============================================================================
10
  # LLM Evaluation Dashboard
11
  # =============================================================================
12
- # Compares multiple LLMs across reasoning, knowledge, and instruction-following
13
- # Uses HuggingFace Inference API (free tier)
14
- # =============================================================================
15
 
16
- # Models to evaluate
17
  MODELS = {
18
  "Mistral-7B": "mistralai/Mistral-7B-Instruct-v0.2",
19
  "Llama-3.2-3B": "meta-llama/Llama-3.2-3B-Instruct",
@@ -30,7 +27,6 @@ MODEL_INFO = {
30
  "Qwen2.5-Coder": {"params": "32B", "type": "Code", "org": "Alibaba"}
31
  }
32
 
33
- # Evaluation tasks
34
  EVAL_TASKS = {
35
  "reasoning": {
36
  "name": "Reasoning (Math)",
@@ -39,7 +35,7 @@ EVAL_TASKS = {
39
  {"id": "math_1", "prompt": "A store sells apples for $2 each. If I buy 3 apples and pay with a $10 bill, how much change do I get? Answer with just the number.", "expected": "4", "check_type": "contains"},
40
  {"id": "math_2", "prompt": "If a train travels at 60 mph for 2.5 hours, how many miles does it travel? Answer with just the number.", "expected": "150", "check_type": "contains"},
41
  {"id": "math_3", "prompt": "A rectangle has length 8 and width 5. What is its area? Answer with just the number.", "expected": "40", "check_type": "contains"},
42
- {"id": "logic_1", "prompt": "If all roses are flowers, and some flowers fade quickly, can we conclude that some roses fade quickly? Answer only 'yes' or 'no'.", "expected": "no", "check_type": "contains_lower"},
43
  {"id": "logic_2", "prompt": "I have a brother. My brother has a brother. How many brothers minimum are in the family? Answer with just the number.", "expected": "2", "check_type": "contains"}
44
  ]
45
  },
@@ -58,27 +54,22 @@ EVAL_TASKS = {
58
  "name": "Instruction Following",
59
  "description": "Tests ability to follow format instructions",
60
  "tasks": [
61
- {"id": "json_1", "prompt": "Return a JSON object with keys 'name' and 'age' for a 25 year old person named Alice. Return ONLY the JSON, no explanation.", "expected": '{"name"', "check_type": "json_valid"},
62
  {"id": "format_1", "prompt": "List exactly 3 colors, one per line, no numbers or bullets.", "expected": "3_lines", "check_type": "line_count"},
63
  {"id": "format_2", "prompt": "Write a single sentence of exactly 5 words about cats.", "expected": "5", "check_type": "word_count"},
64
- {"id": "constraint_1", "prompt": "Name a fruit. Your answer must start with the letter 'A'. Answer with just the fruit name.", "expected": "a", "check_type": "starts_with_lower"},
65
  {"id": "constraint_2", "prompt": "Give me a number between 1 and 10. Answer with ONLY the number, nothing else.", "expected": "single_digit", "check_type": "is_single_number"}
66
  ]
67
  }
68
  }
69
 
70
  def query_model(model_id: str, prompt: str, max_tokens: int = 256) -> dict:
71
- """Query a model via HF Inference API."""
72
  client = InferenceClient(model=model_id)
73
  messages = [{"role": "user", "content": prompt}]
74
 
75
  start_time = time.time()
76
  try:
77
- response = client.chat_completion(
78
- messages=messages,
79
- max_tokens=max_tokens,
80
- temperature=0.7
81
- )
82
  latency = time.time() - start_time
83
  return {"response": response.choices[0].message.content, "latency": latency, "error": None}
84
  except Exception as e:
@@ -86,7 +77,6 @@ def query_model(model_id: str, prompt: str, max_tokens: int = 256) -> dict:
86
  return {"response": None, "latency": latency, "error": str(e)}
87
 
88
  def check_answer(response: str, expected: str, check_type: str) -> dict:
89
- """Check if response matches expected answer."""
90
  if response is None:
91
  return {"score": 0, "explanation": "No response (error)"}
92
 
@@ -108,7 +98,9 @@ def check_answer(response: str, expected: str, check_type: str) -> dict:
108
  if check_type == "json_valid":
109
  try:
110
  json_match = re.search(r'\{[^{}]*\}', response)
111
- passed = json_match is not None and json.loads(json_match.group())
 
 
112
  except:
113
  passed = False
114
  return {"score": 1 if passed else 0, "explanation": "Checking for valid JSON"}
@@ -132,8 +124,8 @@ def check_answer(response: str, expected: str, check_type: str) -> dict:
132
 
133
  return {"score": 0, "explanation": f"Unknown check type: {check_type}"}
134
 
135
- # Pre-computed results (from our evaluation run)
136
- PRECOMPUTED_RESULTS = """model,category,category_name,task_id,score,latency,response
137
  Mistral-7B,reasoning,Reasoning (Math),math_1,1,0.4,4
138
  Mistral-7B,reasoning,Reasoning (Math),math_2,1,0.2,150
139
  Mistral-7B,reasoning,Reasoning (Math),math_3,1,0.2,40
@@ -144,7 +136,7 @@ Mistral-7B,knowledge,Knowledge (Facts),fact_2,1,0.8,1945
144
  Mistral-7B,knowledge,Knowledge (Facts),fact_3,1,0.2,Mars
145
  Mistral-7B,knowledge,Knowledge (Facts),fact_4,1,0.2,6
146
  Mistral-7B,knowledge,Knowledge (Facts),fact_5,1,0.2,Tokyo
147
- Mistral-7B,instruction,Instruction Following,json_1,1,1.9,valid
148
  Mistral-7B,instruction,Instruction Following,format_1,1,0.3,3 lines
149
  Mistral-7B,instruction,Instruction Following,format_2,0,0.3,6 words
150
  Mistral-7B,instruction,Instruction Following,constraint_1,1,0.2,Apple
@@ -174,7 +166,7 @@ Qwen2.5-72B,knowledge,Knowledge (Facts),fact_2,1,0.9,1945
174
  Qwen2.5-72B,knowledge,Knowledge (Facts),fact_3,1,1.0,Mars
175
  Qwen2.5-72B,knowledge,Knowledge (Facts),fact_4,1,0.5,6
176
  Qwen2.5-72B,knowledge,Knowledge (Facts),fact_5,1,0.8,Tokyo
177
- Qwen2.5-72B,instruction,Instruction Following,json_1,1,1.2,valid
178
  Qwen2.5-72B,instruction,Instruction Following,format_1,1,0.9,3 lines
179
  Qwen2.5-72B,instruction,Instruction Following,format_2,1,1.1,5 words
180
  Qwen2.5-72B,instruction,Instruction Following,constraint_1,1,0.7,Apple
@@ -210,18 +202,15 @@ Llama-3.1-70B,instruction,Instruction Following,format_2,0,0.04,error
210
  Llama-3.1-70B,instruction,Instruction Following,constraint_1,0,0.04,error
211
  Llama-3.1-70B,instruction,Instruction Following,constraint_2,0,0.04,error"""
212
 
213
- # Load pre-computed results
214
- from io import StringIO
215
- EVAL_RESULTS = pd.read_csv(StringIO(PRECOMPUTED_RESULTS))
216
 
217
  def get_summary_stats():
218
- """Generate summary statistics HTML."""
219
  model_acc = EVAL_RESULTS.groupby('model')['score'].mean().sort_values(ascending=False)
220
  best_model = model_acc.index[0]
221
  best_acc = model_acc.values[0] * 100
222
 
223
  html = f"""
224
- <div style="display: flex; gap: 20px; flex-wrap: wrap; justify-content: center;">
225
  <div style="background: linear-gradient(135deg, #e8f5e9, #c8e6c9); padding: 20px; border-radius: 12px; flex: 1; min-width: 180px; max-width: 250px; box-shadow: 0 2px 8px rgba(0,0,0,0.1);">
226
  <h3 style="margin: 0; color: #2e7d32; font-size: 14px;">🏆 Best Model</h3>
227
  <p style="font-size: 22px; margin: 10px 0; font-weight: bold; color: #1b5e20;">{best_model}</p>
@@ -242,7 +231,6 @@ def get_summary_stats():
242
  return html
243
 
244
  def get_accuracy_chart():
245
- """Create overall accuracy bar chart."""
246
  model_accuracy = EVAL_RESULTS.groupby('model')['score'].mean().sort_values(ascending=True)
247
 
248
  fig = go.Figure(go.Bar(
@@ -255,7 +243,7 @@ def get_accuracy_chart():
255
  textfont=dict(color='white', size=14)
256
  ))
257
  fig.update_layout(
258
- title="Overall Accuracy by Model",
259
  xaxis_title="Accuracy (%)",
260
  yaxis_title="",
261
  height=350,
@@ -265,34 +253,44 @@ def get_accuracy_chart():
265
  return fig
266
 
267
  def get_category_heatmap():
268
- """Create accuracy heatmap by category."""
269
- category_model_acc = EVAL_RESULTS.pivot_table(
270
  values='score',
271
  index='model',
272
  columns='category_name',
273
  aggfunc='mean'
274
- ) * 100
 
 
 
 
 
 
 
 
275
 
276
  fig = go.Figure(data=go.Heatmap(
277
- z=category_model_acc.values,
278
- x=category_model_acc.columns,
279
- y=category_model_acc.index,
280
  colorscale='RdYlGn',
281
- text=[[f"{v:.0f}%" for v in row] for row in category_model_acc.values],
282
  texttemplate="%{text}",
283
  textfont={"size": 14},
284
  zmin=0,
285
- zmax=100
 
286
  ))
287
  fig.update_layout(
288
- title="Accuracy by Model and Task Category",
289
  height=350,
290
- margin=dict(l=20, r=20, t=50, b=40)
 
 
291
  )
292
  return fig
293
 
294
  def get_latency_chart():
295
- """Create latency comparison chart."""
296
  valid_latency = EVAL_RESULTS[EVAL_RESULTS['latency'] > 0.05]
297
  latency_by_model = valid_latency.groupby('model')['latency'].mean().sort_values()
298
 
@@ -304,7 +302,7 @@ def get_latency_chart():
304
  textposition='outside'
305
  ))
306
  fig.update_layout(
307
- title="Average Response Latency",
308
  xaxis_title="",
309
  yaxis_title="Latency (seconds)",
310
  height=350,
@@ -313,7 +311,6 @@ def get_latency_chart():
313
  return fig
314
 
315
  def get_detailed_results(model_filter, category_filter):
316
- """Get filtered detailed results."""
317
  df = EVAL_RESULTS.copy()
318
 
319
  if model_filter != "All":
@@ -329,12 +326,11 @@ def get_detailed_results(model_filter, category_filter):
329
  return display_df
330
 
331
  def run_live_comparison(prompt, model_choices):
332
- """Run live comparison with custom prompt."""
333
  if not prompt.strip():
334
- return "Please enter a prompt."
335
 
336
  if not model_choices:
337
- return "Please select at least one model."
338
 
339
  results_html = "<div style='display: flex; flex-direction: column; gap: 15px;'>"
340
 
@@ -343,18 +339,20 @@ def run_live_comparison(prompt, model_choices):
343
  result = query_model(MODELS[model_name], prompt, max_tokens=200)
344
 
345
  if result["error"]:
346
- response_text = f"Error: {result['error'][:100]}"
347
  color = "#ffebee"
348
  border_color = "#c62828"
 
349
  else:
350
  response_text = result["response"]
351
  color = "#e8f5e9"
352
  border_color = "#2e7d32"
 
353
 
354
  results_html += f"""
355
  <div style="background: {color}; padding: 15px; border-radius: 8px; border-left: 4px solid {border_color};">
356
- <h4 style="margin: 0 0 10px 0;">{model_name} <span style="font-weight: normal; color: #666;">({result['latency']:.2f}s)</span></h4>
357
- <p style="margin: 0; white-space: pre-wrap;">{response_text}</p>
358
  </div>
359
  """
360
 
@@ -371,16 +369,14 @@ with gr.Blocks(title="LLM Evaluation Dashboard", theme=gr.themes.Soft()) as demo
371
 
372
  gr.HTML(get_summary_stats())
373
 
374
- gr.Markdown("---")
375
-
376
  with gr.Row():
377
  with gr.Column():
378
- gr.Plot(get_accuracy_chart())
379
  with gr.Column():
380
- gr.Plot(get_latency_chart())
381
 
382
  with gr.Row():
383
- gr.Plot(get_category_heatmap())
384
 
385
  gr.Markdown("---")
386
  gr.Markdown("## 📋 Detailed Results")
@@ -396,7 +392,7 @@ with gr.Blocks(title="LLM Evaluation Dashboard", theme=gr.themes.Soft()) as demo
396
 
397
  gr.Markdown("---")
398
  gr.Markdown("## 🔄 Live Model Comparison")
399
- gr.Markdown("Test the models yourself with custom prompts!")
400
 
401
  with gr.Row():
402
  with gr.Column(scale=2):
@@ -413,14 +409,11 @@ with gr.Blocks(title="LLM Evaluation Dashboard", theme=gr.themes.Soft()) as demo
413
  ---
414
  ### 📚 About This Evaluation
415
 
416
- **Models Tested:** Mistral-7B, Llama-3.2-3B, Llama-3.1-70B, Qwen2.5-72B, Qwen2.5-Coder-32B
417
 
418
- **Task Categories:**
419
- - **Reasoning:** Math word problems and logic puzzles
420
- - **Knowledge:** Factual questions (science, history, geography)
421
- - **Instruction Following:** Format compliance (JSON, line count, constraints)
422
 
423
- Built as part of an AI/ML Engineering portfolio project.
424
  """)
425
 
426
  if __name__ == "__main__":
 
5
  import time
6
  import json
7
  import re
8
+ from io import StringIO
9
 
10
  # =============================================================================
11
  # LLM Evaluation Dashboard
12
  # =============================================================================
 
 
 
13
 
 
14
  MODELS = {
15
  "Mistral-7B": "mistralai/Mistral-7B-Instruct-v0.2",
16
  "Llama-3.2-3B": "meta-llama/Llama-3.2-3B-Instruct",
 
27
  "Qwen2.5-Coder": {"params": "32B", "type": "Code", "org": "Alibaba"}
28
  }
29
 
 
30
  EVAL_TASKS = {
31
  "reasoning": {
32
  "name": "Reasoning (Math)",
 
35
  {"id": "math_1", "prompt": "A store sells apples for $2 each. If I buy 3 apples and pay with a $10 bill, how much change do I get? Answer with just the number.", "expected": "4", "check_type": "contains"},
36
  {"id": "math_2", "prompt": "If a train travels at 60 mph for 2.5 hours, how many miles does it travel? Answer with just the number.", "expected": "150", "check_type": "contains"},
37
  {"id": "math_3", "prompt": "A rectangle has length 8 and width 5. What is its area? Answer with just the number.", "expected": "40", "check_type": "contains"},
38
+ {"id": "logic_1", "prompt": "If all roses are flowers, and some flowers fade quickly, can we conclude that some roses fade quickly? Answer only yes or no.", "expected": "no", "check_type": "contains_lower"},
39
  {"id": "logic_2", "prompt": "I have a brother. My brother has a brother. How many brothers minimum are in the family? Answer with just the number.", "expected": "2", "check_type": "contains"}
40
  ]
41
  },
 
54
  "name": "Instruction Following",
55
  "description": "Tests ability to follow format instructions",
56
  "tasks": [
57
+ {"id": "json_1", "prompt": "Return a JSON object with keys name and age for a 25 year old person named Alice. Return ONLY the JSON, no explanation.", "expected": "name", "check_type": "json_valid"},
58
  {"id": "format_1", "prompt": "List exactly 3 colors, one per line, no numbers or bullets.", "expected": "3_lines", "check_type": "line_count"},
59
  {"id": "format_2", "prompt": "Write a single sentence of exactly 5 words about cats.", "expected": "5", "check_type": "word_count"},
60
+ {"id": "constraint_1", "prompt": "Name a fruit. Your answer must start with the letter A. Answer with just the fruit name.", "expected": "a", "check_type": "starts_with_lower"},
61
  {"id": "constraint_2", "prompt": "Give me a number between 1 and 10. Answer with ONLY the number, nothing else.", "expected": "single_digit", "check_type": "is_single_number"}
62
  ]
63
  }
64
  }
65
 
66
def query_model(model_id: str, prompt: str, max_tokens: int = 256) -> dict:
    """Send a single-turn chat prompt to a model via the HF Inference API.

    Args:
        model_id: Hub repo id of the model to query.
        prompt: User message, sent as a one-message chat.
        max_tokens: Generation cap forwarded to the API.

    Returns:
        dict with keys:
            response: generated text, or None on failure.
            latency: wall-clock seconds for the call (measured on failure too).
            error: None on success, otherwise the exception message.
    """
    client = InferenceClient(model=model_id)
    messages = [{"role": "user", "content": prompt}]

    start_time = time.time()
    try:
        response = client.chat_completion(
            messages=messages,
            max_tokens=max_tokens,
            temperature=0.7,
        )
        # Measure latency once per outcome path; success path includes full generation time.
        latency = time.time() - start_time
        return {"response": response.choices[0].message.content, "latency": latency, "error": None}
    except Exception as e:
        # API/network errors are reported in the result dict, never raised to the UI.
        return {"response": None, "latency": time.time() - start_time, "error": str(e)}
78
 
79
  def check_answer(response: str, expected: str, check_type: str) -> dict:
 
80
  if response is None:
81
  return {"score": 0, "explanation": "No response (error)"}
82
 
 
98
  if check_type == "json_valid":
99
  try:
100
  json_match = re.search(r'\{[^{}]*\}', response)
101
+ passed = json_match is not None
102
+ if passed:
103
+ json.loads(json_match.group())
104
  except:
105
  passed = False
106
  return {"score": 1 if passed else 0, "explanation": "Checking for valid JSON"}
 
124
 
125
  return {"score": 0, "explanation": f"Unknown check type: {check_type}"}
126
 
127
+ # Pre-computed results
128
+ PRECOMPUTED_CSV = """model,category,category_name,task_id,score,latency,response
129
  Mistral-7B,reasoning,Reasoning (Math),math_1,1,0.4,4
130
  Mistral-7B,reasoning,Reasoning (Math),math_2,1,0.2,150
131
  Mistral-7B,reasoning,Reasoning (Math),math_3,1,0.2,40
 
136
  Mistral-7B,knowledge,Knowledge (Facts),fact_3,1,0.2,Mars
137
  Mistral-7B,knowledge,Knowledge (Facts),fact_4,1,0.2,6
138
  Mistral-7B,knowledge,Knowledge (Facts),fact_5,1,0.2,Tokyo
139
+ Mistral-7B,instruction,Instruction Following,json_1,1,1.9,valid json
140
  Mistral-7B,instruction,Instruction Following,format_1,1,0.3,3 lines
141
  Mistral-7B,instruction,Instruction Following,format_2,0,0.3,6 words
142
  Mistral-7B,instruction,Instruction Following,constraint_1,1,0.2,Apple
 
166
  Qwen2.5-72B,knowledge,Knowledge (Facts),fact_3,1,1.0,Mars
167
  Qwen2.5-72B,knowledge,Knowledge (Facts),fact_4,1,0.5,6
168
  Qwen2.5-72B,knowledge,Knowledge (Facts),fact_5,1,0.8,Tokyo
169
+ Qwen2.5-72B,instruction,Instruction Following,json_1,1,1.2,valid json
170
  Qwen2.5-72B,instruction,Instruction Following,format_1,1,0.9,3 lines
171
  Qwen2.5-72B,instruction,Instruction Following,format_2,1,1.1,5 words
172
  Qwen2.5-72B,instruction,Instruction Following,constraint_1,1,0.7,Apple
 
202
  Llama-3.1-70B,instruction,Instruction Following,constraint_1,0,0.04,error
203
  Llama-3.1-70B,instruction,Instruction Following,constraint_2,0,0.04,error"""
204
 
205
+ EVAL_RESULTS = pd.read_csv(StringIO(PRECOMPUTED_CSV))
 
 
206
 
207
  def get_summary_stats():
 
208
  model_acc = EVAL_RESULTS.groupby('model')['score'].mean().sort_values(ascending=False)
209
  best_model = model_acc.index[0]
210
  best_acc = model_acc.values[0] * 100
211
 
212
  html = f"""
213
+ <div style="display: flex; gap: 20px; flex-wrap: wrap; justify-content: center; margin-bottom: 20px;">
214
  <div style="background: linear-gradient(135deg, #e8f5e9, #c8e6c9); padding: 20px; border-radius: 12px; flex: 1; min-width: 180px; max-width: 250px; box-shadow: 0 2px 8px rgba(0,0,0,0.1);">
215
  <h3 style="margin: 0; color: #2e7d32; font-size: 14px;">🏆 Best Model</h3>
216
  <p style="font-size: 22px; margin: 10px 0; font-weight: bold; color: #1b5e20;">{best_model}</p>
 
231
  return html
232
 
233
  def get_accuracy_chart():
 
234
  model_accuracy = EVAL_RESULTS.groupby('model')['score'].mean().sort_values(ascending=True)
235
 
236
  fig = go.Figure(go.Bar(
 
243
  textfont=dict(color='white', size=14)
244
  ))
245
  fig.update_layout(
246
+ title=dict(text="Overall Accuracy by Model", font=dict(size=16)),
247
  xaxis_title="Accuracy (%)",
248
  yaxis_title="",
249
  height=350,
 
253
  return fig
254
 
255
def get_category_heatmap():
    """Heatmap of mean accuracy (%) for each model x task-category pair."""
    # Mean score per (model, category), scaled to percent.
    # Missing model/category combinations render as 0%.
    acc = EVAL_RESULTS.pivot_table(
        values='score',
        index='model',
        columns='category_name',
        aggfunc='mean',
    ).fillna(0) * 100

    z = acc.values.tolist()
    cell_labels = [[f"{v:.0f}%" for v in row] for row in z]

    fig = go.Figure(data=go.Heatmap(
        z=z,
        x=acc.columns.tolist(),
        y=acc.index.tolist(),
        colorscale='RdYlGn',
        text=cell_labels,
        texttemplate="%{text}",
        textfont={"size": 14},
        zmin=0,
        zmax=100,
        showscale=True,
    ))
    fig.update_layout(
        title=dict(text="Accuracy by Model and Task Category", font=dict(size=16)),
        height=350,
        margin=dict(l=20, r=20, t=50, b=40),
        xaxis=dict(title="", tickangle=0),
        yaxis=dict(title=""),
    )
    return fig
292
 
293
  def get_latency_chart():
 
294
  valid_latency = EVAL_RESULTS[EVAL_RESULTS['latency'] > 0.05]
295
  latency_by_model = valid_latency.groupby('model')['latency'].mean().sort_values()
296
 
 
302
  textposition='outside'
303
  ))
304
  fig.update_layout(
305
+ title=dict(text="Average Response Latency", font=dict(size=16)),
306
  xaxis_title="",
307
  yaxis_title="Latency (seconds)",
308
  height=350,
 
311
  return fig
312
 
313
  def get_detailed_results(model_filter, category_filter):
 
314
  df = EVAL_RESULTS.copy()
315
 
316
  if model_filter != "All":
 
326
  return display_df
327
 
328
  def run_live_comparison(prompt, model_choices):
 
329
  if not prompt.strip():
330
+ return "<p style='color: #666;'>Please enter a prompt.</p>"
331
 
332
  if not model_choices:
333
+ return "<p style='color: #666;'>Please select at least one model.</p>"
334
 
335
  results_html = "<div style='display: flex; flex-direction: column; gap: 15px;'>"
336
 
 
339
  result = query_model(MODELS[model_name], prompt, max_tokens=200)
340
 
341
  if result["error"]:
342
+ response_text = f"Error: {result['error'][:100]}"
343
  color = "#ffebee"
344
  border_color = "#c62828"
345
+ icon = "❌"
346
  else:
347
  response_text = result["response"]
348
  color = "#e8f5e9"
349
  border_color = "#2e7d32"
350
+ icon = "✅"
351
 
352
  results_html += f"""
353
  <div style="background: {color}; padding: 15px; border-radius: 8px; border-left: 4px solid {border_color};">
354
+ <h4 style="margin: 0 0 10px 0;">{icon} {model_name} <span style="font-weight: normal; color: #666;">({result['latency']:.2f}s)</span></h4>
355
+ <p style="margin: 0; white-space: pre-wrap; font-family: sans-serif;">{response_text}</p>
356
  </div>
357
  """
358
 
 
369
 
370
  gr.HTML(get_summary_stats())
371
 
 
 
372
  with gr.Row():
373
  with gr.Column():
374
+ gr.Plot(value=get_accuracy_chart(), label="Accuracy")
375
  with gr.Column():
376
+ gr.Plot(value=get_latency_chart(), label="Latency")
377
 
378
  with gr.Row():
379
+ gr.Plot(value=get_category_heatmap(), label="Category Breakdown")
380
 
381
  gr.Markdown("---")
382
  gr.Markdown("## 📋 Detailed Results")
 
392
 
393
  gr.Markdown("---")
394
  gr.Markdown("## 🔄 Live Model Comparison")
395
+ gr.Markdown("Test the models with your own prompts!")
396
 
397
  with gr.Row():
398
  with gr.Column(scale=2):
 
409
  ---
410
  ### 📚 About This Evaluation
411
 
412
+ **Models:** Mistral-7B, Llama-3.2-3B, Llama-3.1-70B, Qwen2.5-72B, Qwen2.5-Coder-32B
413
 
414
+ **Categories:** Reasoning (math/logic), Knowledge (facts), Instruction Following (format compliance)
 
 
 
415
 
416
+ *Built as part of an AI/ML Engineering portfolio project.*
417
  """)
418
 
419
  if __name__ == "__main__":