aeb56 committed on
Commit
74f609c
·
1 Parent(s): 69cd0c5

Fix OOM: Unload model before evaluation to free VRAM for lm_eval

Browse files
Files changed (2) hide show
  1. README.md +11 -0
  2. app.py +35 -7
README.md CHANGED
@@ -56,6 +56,8 @@ Model evaluation Space for the fine-tuned Kimi-Linear-48B-A3B-Instruct model. **
56
 - Go to the "📊 Evaluation" tab
57
  - Select benchmarks to run (ARC-Challenge, TruthfulQA, Winogrande)
58
 - Click "🚀 Start Evaluation"
 
 
59
  - Wait 30-60 minutes for results
60
  - Results will be displayed and saved to `/tmp/eval_results_[timestamp]/`
61
 
@@ -78,6 +80,15 @@ The LM Evaluation Harness is a standard framework for evaluating language models
78
  - **Minimum:** 4x NVIDIA L4 (96GB VRAM)
79
  - **Model Size:** ~96GB in bfloat16
80
 
 
 
 
 
 
 
 
 
 
81
  ## Technical Details
82
 
83
  ### Fine-tuning Configuration
 
56
 - Go to the "📊 Evaluation" tab
57
  - Select benchmarks to run (ARC-Challenge, TruthfulQA, Winogrande)
58
 - Click "🚀 Start Evaluation"
59
+ - The model will be automatically unloaded to free VRAM
60
+ - lm_eval will load its own instance for evaluation
61
  - Wait 30-60 minutes for results
62
  - Results will be displayed and saved to `/tmp/eval_results_[timestamp]/`
63
 
 
80
  - **Minimum:** 4x NVIDIA L4 (96GB VRAM)
81
  - **Model Size:** ~96GB in bfloat16
82
 
83
+ ### Memory Management
84
+
85
+ This Space is optimized for limited VRAM:
86
+ - **Pre-loading:** Optional model loading to verify setup
87
+ - **Automatic Cleanup:** Model is unloaded before evaluation starts
88
+ - **Single Instance:** Only lm_eval's model instance runs during evaluation
89
+ - **Batch Size:** Set to 1 to minimize memory usage
90
+ - **Device Mapping:** Automatic distribution across available GPUs
91
+
92
  ## Technical Details
93
 
94
  ### Fine-tuning Configuration
app.py CHANGED
@@ -139,15 +139,37 @@ class ChatBot:
139
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
140
  output_dir = f"/tmp/eval_results_{timestamp}"
141
 
142
- yield f"πŸ”„ **Starting evaluation...**\n\nTasks: {', '.join(tasks_to_run)}\n\nThis will take 30-60 minutes total.\n\n"
143
 
144
- # Run lm_eval
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
145
  cmd = [
146
  "lm_eval",
147
  "--model", "hf",
148
- "--model_args", f"pretrained={MODEL_NAME},trust_remote_code=True,dtype=bfloat16",
149
  "--tasks", task_string,
150
- "--batch_size", "auto:4",
151
  "--output_path", output_dir,
152
  "--log_samples"
153
  ]
@@ -245,10 +267,12 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Kimi 48B Fine-tuned - Evaluation")
245
 
246
  gr.Markdown("""
247
  ### ℹ️ Instructions
248
- 1. **Click "Load Model"** - Takes 5-10 minutes
249
  2. **Use Evaluation tab** - To run benchmarks
250
 
251
- **Note:** Chat/inference functionality is currently disabled. This Space focuses on model evaluation only.
 
 
252
  """)
253
 
254
  # Tab 2: Chat - DISABLED
@@ -314,7 +338,11 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Kimi 48B Fine-tuned - Evaluation")
314
 
315
  gr.Markdown("""
316
  ---
317
- **Note:** Evaluation requires the model to be loaded first. Results will be saved to `/tmp/eval_results_[timestamp]/`.
 
 
 
 
318
  """)
319
 
320
  gr.Markdown("""
 
139
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
140
  output_dir = f"/tmp/eval_results_{timestamp}"
141
 
142
+ yield f"πŸ”„ **Preparing for evaluation...**\n\nTasks: {', '.join(tasks_to_run)}\n\n"
143
 
144
+ # IMPORTANT: Unload the model from memory to free VRAM for lm_eval
145
+ yield f"πŸ”„ **Unloading model to free VRAM...**\n\nThis is necessary because lm_eval will load its own instance.\n\n"
146
+
147
+ if self.model is not None:
148
+ del self.model
149
+ self.model = None
150
+ if self.tokenizer is not None:
151
+ del self.tokenizer
152
+ self.tokenizer = None
153
+
154
+ # Clear CUDA cache
155
+ if torch.cuda.is_available():
156
+ torch.cuda.empty_cache()
157
+ torch.cuda.synchronize()
158
+
159
+ import gc
160
+ gc.collect()
161
+
162
+ self.loaded = False
163
+
164
+ yield f"βœ… **Memory cleared! Starting evaluation...**\n\nThis will take 30-60 minutes total.\n\n"
165
+
166
+ # Run lm_eval with optimized memory settings
167
  cmd = [
168
  "lm_eval",
169
  "--model", "hf",
170
+ "--model_args", f"pretrained={MODEL_NAME},trust_remote_code=True,dtype=bfloat16,device_map=auto,low_cpu_mem_usage=True",
171
  "--tasks", task_string,
172
+ "--batch_size", "1", # Reduced to minimize memory usage
173
  "--output_path", output_dir,
174
  "--log_samples"
175
  ]
 
267
 
268
  gr.Markdown("""
269
  ### ℹ️ Instructions
270
+ 1. **Click "Load Model"** - Takes 5-10 minutes (verifies setup)
271
  2. **Use Evaluation tab** - To run benchmarks
272
 
273
+ **Note:**
274
+ - Chat/inference functionality is currently disabled. This Space focuses on model evaluation only.
275
+ - The model will be automatically unloaded before evaluation starts to free VRAM for lm_eval.
276
  """)
277
 
278
  # Tab 2: Chat - DISABLED
 
338
 
339
  gr.Markdown("""
340
  ---
341
+ **Note:**
342
+ - Click "Load Model" in Controls tab first to verify the setup
343
+ - The model will be automatically unloaded before evaluation to free VRAM
344
+ - lm_eval will load its own instance of the model for evaluation
345
+ - Results will be saved to `/tmp/eval_results_[timestamp]/`
346
  """)
347
 
348
  gr.Markdown("""