aeb56 committed
Commit 29f5263 · 1 Parent(s): 2f60fd7

Add Evaluation tab with ARC-Challenge, TruthfulQA, and Winogrande benchmarks

Files changed (2)
  1. app.py +182 -35
  2. requirements.txt +3 -0
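
The new Evaluation tab shells out to EleutherAI's lm-evaluation-harness CLI (`lm_eval`) rather than reusing the model already loaded in the app process, so the benchmark subprocess loads the checkpoint a second time and needs its own VRAM headroom. For reference, the run it launches is equivalent to this standalone sketch; the flags mirror the `cmd` list built in `run_evaluation`, and the output path here is illustrative:

```python
# Headless equivalent of what the new Evaluation tab runs; assumes
# lm-eval>=0.4.0 is installed (see the requirements.txt change below).
import subprocess

MODEL_NAME = "optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune"

subprocess.run(
    [
        "lm_eval",
        "--model", "hf",
        "--model_args", f"pretrained={MODEL_NAME},trust_remote_code=True,dtype=bfloat16",
        "--tasks", "arc_challenge,truthfulqa_mc2,winogrande",
        "--batch_size", "auto:4",
        "--output_path", "/tmp/eval_results_manual",  # illustrative path
        "--log_samples",
    ],
    check=True,  # raise CalledProcessError if lm_eval exits non-zero
)
```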
app.py CHANGED
@@ -2,6 +2,9 @@ import gradio as gr
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
 import os
+import subprocess
+import json
+from datetime import datetime
 
 # Set environment variable for flash-linear-attention
 os.environ["FLA_USE_TRITON"] = "1"
@@ -32,11 +35,11 @@ class ChatBot:
         self.model = AutoModelForCausalLM.from_pretrained(
             MODEL_NAME,
             torch_dtype=torch.bfloat16,
-            device_map="balanced",  # Distribute evenly
+            device_map="balanced",
             max_memory=max_memory,
             trust_remote_code=True,
             low_cpu_mem_usage=True,
-            attn_implementation="eager",  # Use eager attention instead of flash
+            attn_implementation="eager",
         )
 
         self.model.eval()
@@ -62,7 +65,7 @@ class ChatBot:
             else:
                 device_info = ""
 
-            yield f"✅ **Model loaded successfully!**{device_info}\n\nYou can now start chatting below."
+            yield f"✅ **Model loaded successfully!**{device_info}\n\nYou can now use Chat or Evaluation tabs."
 
         except Exception as e:
             self.loaded = False
@@ -70,7 +73,7 @@ class ChatBot:
 
     def chat(self, message, history, system_prompt, max_tokens, temperature, top_p):
        if not self.loaded:
-            return "❌ Please load the model first by clicking the 'Load Model' button."
+            return "❌ Please load the model first by clicking the 'Load Model' button in Controls."
 
        try:
            # Build prompt from history
@@ -92,7 +95,7 @@
            inputs = self.tokenizer(prompt, return_tensors="pt")
            inputs = {k: v.to(self.model.device) for k, v in inputs.items()}
 
-            # Generate with explicit attention settings
+            # Generate
            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
@@ -101,7 +104,7 @@
                    top_p=top_p,
                    do_sample=temperature > 0,
                    pad_token_id=self.tokenizer.eos_token_id,
-                    use_cache=True,  # Enable KV caching
+                    use_cache=True,
                )
 
            # Decode
@@ -115,17 +118,112 @@
 
        except Exception as e:
            return f"❌ Error: {str(e)}"
+
+    def run_evaluation(self, tasks_to_run):
+        """Run lm_eval on selected tasks"""
+        if not self.loaded:
+            yield "❌ Please load the model first!"
+            return
+
+        try:
+            # Map friendly names to lm_eval task names
+            task_map = {
+                "ARC-Challenge": "arc_challenge",
+                "TruthfulQA": "truthfulqa_mc2",
+                "Winogrande": "winogrande"
+            }
+
+            selected_tasks = [task_map[t] for t in tasks_to_run]
+            task_string = ",".join(selected_tasks)
+
+            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+            output_dir = f"/tmp/eval_results_{timestamp}"
+
+            yield f"🔄 **Starting evaluation...**\n\nTasks: {', '.join(tasks_to_run)}\n\nThis will take 30-60 minutes total.\n\n"
+
+            # Run lm_eval
+            cmd = [
+                "lm_eval",
+                "--model", "hf",
+                "--model_args", f"pretrained={MODEL_NAME},trust_remote_code=True,dtype=bfloat16",
+                "--tasks", task_string,
+                "--batch_size", "auto:4",
+                "--output_path", output_dir,
+                "--log_samples"
+            ]
+
+            yield f"🔄 **Running lm_eval...**\n\nCommand: `{' '.join(cmd)}`\n\nProgress will update below...\n\n"
+
+            # Run evaluation
+            process = subprocess.Popen(
+                cmd,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.STDOUT,
+                text=True,
+                bufsize=1
+            )
+
+            output_lines = []
+            for line in process.stdout:
+                output_lines.append(line)
+                # Show last 20 lines
+                recent = ''.join(output_lines[-20:])
+                yield f"🔄 **Running evaluation...**\n\n```\n{recent}\n```"
+
+            process.wait()
+
+            if process.returncode != 0:
+                yield f"❌ **Evaluation failed!**\n\nExit code: {process.returncode}\n\nLogs:\n```\n{''.join(output_lines[-50:])}\n```"
+                return
+
+            # Read results
+            results_file = os.path.join(output_dir, "results.json")
+            if os.path.exists(results_file):
+                with open(results_file, 'r') as f:
+                    results = json.load(f)
+
+                # Format results
+                result_text = "✅ **Evaluation Complete!**\n\n"
+                result_text += f"**Timestamp:** {timestamp}\n\n"
+                result_text += "## 📊 Results:\n\n"
+
+                for task in selected_tasks:
+                    if task in results['results']:
+                        task_results = results['results'][task]
+                        result_text += f"### {task}\n"
+                        for metric, value in task_results.items():
+                            if isinstance(value, float):
+                                result_text += f"- **{metric}:** {value:.4f}\n"
+                            else:
+                                result_text += f"- **{metric}:** {value}\n"
+                        result_text += "\n"
+
+                # Add summary if available
+                if 'summary' in results:
+                    result_text += "## 📈 Summary:\n\n"
+                    for metric, value in results['summary'].items():
+                        if isinstance(value, float):
+                            result_text += f"- **{metric}:** {value:.4f}\n"
+                        else:
+                            result_text += f"- **{metric}:** {value}\n"
+
+                result_text += f"\n\n**Full results saved to:** `{output_dir}`"
+
+                yield result_text
+            else:
+                yield f"⚠️ **Evaluation completed but results file not found.**\n\nOutput:\n```\n{''.join(output_lines[-30:])}\n```"
+
+        except Exception as e:
+            yield f"❌ **Evaluation error:**\n\n{str(e)}"
 
 # Initialize
 bot = ChatBot()
 
-# UI
+# UI with Tabs
 with gr.Blocks(theme=gr.themes.Soft(), title="Kimi 48B Fine-tuned") as demo:
     gr.Markdown("""
     # 🚀 Kimi Linear 48B A3B - Fine-tuned
 
-    Chat interface for the fine-tuned Kimi model.
-
     **Model:** `optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune`
     """)
 
@@ -136,35 +234,87 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Kimi 48B Fine-tuned") as demo:
     total_vram = sum(torch.cuda.get_device_properties(i).total_memory / 1024**3 for i in range(gpu_count))
     gr.Markdown(f"**Hardware:** {gpu_count}x {gpu_name} ({total_vram:.0f}GB total VRAM)")
 
-    with gr.Row():
-        with gr.Column(scale=1):
-            gr.Markdown("### 🎛️ Controls")
-
+    with gr.Tabs():
+        # Tab 1: Controls (always visible)
+        with gr.Tab("🎛️ Controls"):
+            gr.Markdown("### Load Model First")
             load_btn = gr.Button("🚀 Load Model", variant="primary", size="lg")
             status = gr.Markdown("**Status:** Model not loaded")
 
-            gr.Markdown("---")
-            gr.Markdown("### ⚙️ Settings")
-
-            system_prompt = gr.Textbox(
-                label="System Prompt",
-                placeholder="You are a helpful assistant...",
-                lines=2
-            )
-
-            max_tokens = gr.Slider(50, 2048, 512, label="Max Tokens", step=1)
-            temperature = gr.Slider(0, 2, 0.7, label="Temperature", step=0.1)
-            top_p = gr.Slider(0, 1, 0.9, label="Top P", step=0.05)
+            gr.Markdown("""
+            ### ℹ️ Instructions
+            1. **Click "Load Model"** - Takes 5-10 minutes
+            2. **Use Chat tab** - For conversations
+            3. **Use Evaluation tab** - To run benchmarks
+            """)
 
-        with gr.Column(scale=2):
-            gr.Markdown("### 💬 Chat")
-            chatbot = gr.Chatbot(height=500, show_copy_button=True)
+        # Tab 2: Chat
+        with gr.Tab("💬 Chat"):
+            with gr.Row():
+                with gr.Column(scale=1):
+                    gr.Markdown("### ⚙️ Settings")
+
+                    system_prompt = gr.Textbox(
+                        label="System Prompt",
+                        placeholder="You are a helpful assistant...",
+                        lines=2
+                    )
+
+                    max_tokens = gr.Slider(50, 2048, 512, label="Max Tokens", step=1)
+                    temperature = gr.Slider(0, 2, 0.7, label="Temperature", step=0.1)
+                    top_p = gr.Slider(0, 1, 0.9, label="Top P", step=0.05)
+
+                with gr.Column(scale=2):
+                    chatbot = gr.Chatbot(height=500, show_copy_button=True)
+
+                    with gr.Row():
+                        msg = gr.Textbox(label="Message", placeholder="Type here...", scale=4)
+                        send = gr.Button("Send", variant="primary", scale=1)
+
+                    clear = gr.Button("Clear Chat")
+
+        # Tab 3: Evaluation
+        with gr.Tab("📊 Evaluation"):
+            gr.Markdown("""
+            ### Run LM Evaluation Harness
+
+            Select benchmarks to evaluate your fine-tuned model. **Estimated time: 30-60 minutes total.**
+            """)
 
             with gr.Row():
-                msg = gr.Textbox(label="Message", placeholder="Type here...", scale=4)
-                send = gr.Button("Send", variant="primary", scale=1)
+                with gr.Column(scale=1):
+                    gr.Markdown("### Select Benchmarks")
+
+                    tasks = gr.CheckboxGroup(
+                        choices=["ARC-Challenge", "TruthfulQA", "Winogrande"],
+                        value=["ARC-Challenge", "TruthfulQA", "Winogrande"],
+                        label="Tasks to Run",
+                        info="Select one or more tasks"
+                    )
+
+                    eval_btn = gr.Button("🚀 Start Evaluation", variant="primary", size="lg")
+
+                    gr.Markdown("""
+                    ### ⏱️ Estimated Time:
+                    - **ARC-Challenge:** 15-30 min
+                    - **TruthfulQA:** 10-20 min
+                    - **Winogrande:** 15-30 min
+
+                    **Total:** ~40-80 minutes for all 3
+                    """)
+
+                with gr.Column(scale=2):
+                    eval_results = gr.Markdown("Results will appear here after evaluation completes.")
 
-            clear = gr.Button("Clear")
+            gr.Markdown("""
+            ---
+            **Note:** Evaluation requires the model to be loaded first. Results will be saved to `/tmp/eval_results_[timestamp]/`.
+            """)
+
+    gr.Markdown("""
+    ---
+    **Model:** [optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune](https://huggingface.co/optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune)
+    """)
 
     # Events
     load_btn.click(bot.load_model, outputs=status)
@@ -178,10 +328,7 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Kimi 48B Fine-tuned") as demo:
     send.click(respond, [msg, chatbot, system_prompt, max_tokens, temperature, top_p], [chatbot, msg])
     clear.click(lambda: None, None, chatbot)
 
-    gr.Markdown("""
-    ---
-    **Model:** [optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune](https://huggingface.co/optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune)
-    """)
+    eval_btn.click(bot.run_evaluation, inputs=tasks, outputs=eval_results)
 
 if __name__ == "__main__":
     demo.launch(server_name="0.0.0.0", server_port=7860, share=True)
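
A note on the event wiring above: `load_btn.click(bot.load_model, outputs=status)` and `eval_btn.click(bot.run_evaluation, inputs=tasks, outputs=eval_results)` both bind generator methods. Gradio treats a generator handler as a stream and re-renders the output component on every `yield`, which is what makes the load and evaluation progress messages update live. A minimal self-contained sketch of that pattern (names here are illustrative, not from the commit):

```python
# Minimal sketch of the generator-streaming pattern used by
# load_model and run_evaluation: each yield replaces the output text.
import time

import gradio as gr

def slow_job(steps):
    for i in range(int(steps)):
        time.sleep(1)  # stand-in for real work (model load, lm_eval run, ...)
        yield f"🔄 Step {i + 1}/{int(steps)} done"
    yield "✅ All steps finished"

with gr.Blocks() as demo:
    n = gr.Slider(1, 10, 3, step=1, label="Steps")
    out = gr.Markdown("Idle")
    gr.Button("Run").click(slow_job, inputs=n, outputs=out)

if __name__ == "__main__":
    demo.launch()
```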
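One caveat worth flagging in `run_evaluation`: it expects `results.json` directly under `--output_path`, but some lm-eval 0.4.x releases write a per-model subdirectory with a timestamped filename instead, which would route the run into the "results file not found" branch even after a successful evaluation. A defensive lookup, sketched under that assumption (the directory name is illustrative):

```python
# Hedged sketch: find the newest results JSON anywhere under the output
# directory, covering both DIR/results.json and a nested
# DIR/<model>/results_<timestamp>.json layout.
import glob
import json
import os

def find_results(output_dir):
    candidates = sorted(
        glob.glob(os.path.join(output_dir, "**", "*.json"), recursive=True),
        key=os.path.getmtime,
    )
    return candidates[-1] if candidates else None  # newest JSON, if any

path = find_results("/tmp/eval_results_manual")  # illustrative path
if path:
    with open(path) as f:
        print(sorted(json.load(f)["results"].keys()))
```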
requirements.txt CHANGED
@@ -10,6 +10,9 @@ triton>=3.0.0
 # Flash Linear Attention (required by Kimi model)
 git+https://github.com/sustcsonglin/flash-linear-attention.git@main
 
+# Evaluation
+lm-eval>=0.4.0
+
 # UI
 gradio==4.19.2
 
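
After installing the updated requirements, a quick sanity check that the new dependency imports and that the `lm_eval` console script invoked by `run_evaluation` is on `PATH` (a hedged sketch; the `__version__` attribute may not exist in every release, hence the fallback):

```python
# Verify the new lm-eval dependency from requirements.txt.
import shutil

import lm_eval  # provided by lm-eval>=0.4.0

print("lm-eval version:", getattr(lm_eval, "__version__", "unknown"))
print("lm_eval CLI found at:", shutil.which("lm_eval") or "NOT on PATH")
```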