aeb56 committed
Commit b51ac87 · 1 Parent(s): 9bb160e

Optimize app.py for 48B model on 4xL40S GPUs with multi-GPU support

Files changed (1)
  1. app.py +93 -14
app.py CHANGED
@@ -7,11 +7,23 @@ import gc
 from huggingface_hub import login, snapshot_download
 import logging
 from datetime import datetime
+from accelerate import init_empty_weights, load_checkpoint_and_dispatch, infer_auto_device_map
 
 # Configure logging
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 logger = logging.getLogger(__name__)
 
+# Check GPU availability
+if torch.cuda.is_available():
+    num_gpus = torch.cuda.device_count()
+    logger.info(f"Found {num_gpus} GPUs available")
+    for i in range(num_gpus):
+        gpu_name = torch.cuda.get_device_name(i)
+        gpu_memory = torch.cuda.get_device_properties(i).total_memory / 1024**3
+        logger.info(f"GPU {i}: {gpu_name} with {gpu_memory:.2f} GB memory")
+else:
+    logger.warning("No GPUs found! This will likely fail for 48B model.")
+
 # Constants
 BASE_MODEL_NAME = "moonshotai/Kimi-Linear-48B-A3B-Instruct"
 LORA_MODEL_NAME = "Optivise/kimi-linear-48b-a3b-instruct-qlora-fine-tuned"
@@ -30,7 +42,14 @@ class ModelMerger:
         if self.merged_model is not None:
             del self.merged_model
         gc.collect()
-        torch.cuda.empty_cache()
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+            # Synchronize all GPUs
+            for i in range(torch.cuda.device_count()):
+                with torch.cuda.device(i):
+                    torch.cuda.empty_cache()
+                    torch.cuda.synchronize()
+        logger.info("Memory cleared successfully")
 
     def login_huggingface(self, token):
         """Login to Hugging Face"""
@@ -60,17 +79,46 @@ class ModelMerger:
         logger.info("Loading tokenizer...")
         self.tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME, trust_remote_code=True)
 
-        # Load base model
+        # Configure memory allocation for multi-GPU setup (4xL40S = 4x48GB = 192GB)
+        # Reserve some memory for CUDA overhead and operations
+        num_gpus = torch.cuda.device_count()
+        max_memory = {}
+        if num_gpus > 0:
+            # Allocate memory per GPU (leave ~2GB per GPU for overhead)
+            per_gpu_memory = "46GB"  # 48GB - 2GB overhead for L40S
+            for i in range(num_gpus):
+                max_memory[i] = per_gpu_memory
+            logger.info(f"Configured max_memory: {max_memory}")
+        else:
+            # Fallback for CPU-only (will be slow)
+            max_memory = {"cpu": "64GB"}
+            logger.warning("No GPUs detected, using CPU fallback")
+
+        # Load base model with explicit multi-GPU configuration
         progress(0.25, desc="Loading base model (this may take several minutes)...")
         logger.info(f"Loading base model: {BASE_MODEL_NAME}")
-        self.base_model = AutoModelForCausalLM.from_pretrained(
-            BASE_MODEL_NAME,
-            torch_dtype=torch.bfloat16,
-            device_map="auto",
-            trust_remote_code=True,
-            low_cpu_mem_usage=True,
-        )
-        logger.info("Base model loaded successfully")
+        logger.info(f"Using bfloat16 precision for memory efficiency")
+
+        try:
+            self.base_model = AutoModelForCausalLM.from_pretrained(
+                BASE_MODEL_NAME,
+                torch_dtype=torch.bfloat16,
+                device_map="auto",
+                max_memory=max_memory,
+                trust_remote_code=True,
+                low_cpu_mem_usage=True,
+                offload_folder="/tmp/offload",  # Fallback offload directory
+                offload_state_dict=True,  # Offload state dict when loading
+            )
+            logger.info("Base model loaded successfully")
+
+            # Log device map to see distribution
+            if hasattr(self.base_model, 'hf_device_map'):
+                logger.info(f"Model device map: {self.base_model.hf_device_map}")
+
+        except torch.cuda.OutOfMemoryError as e:
+            logger.error("Out of memory error! Try with quantization or smaller batch size")
+            raise Exception(f"GPU Out of Memory: {str(e)}. The 48B model requires ~96GB VRAM in bfloat16. Ensure 4xL40S GPUs are available.")
 
         # Load LoRA configuration
         progress(0.50, desc="Loading LoRA adapters...")
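Note that the hunk above imports `init_empty_weights`, `load_checkpoint_and_dispatch`, and `infer_auto_device_map` but never calls them; loading still relies on `device_map="auto"`. For reference, a minimal sketch (hypothetical, not in this commit) of how `infer_auto_device_map` could produce an explicit placement under the same 46GB-per-GPU budget:

```python
import torch
from transformers import AutoConfig, AutoModelForCausalLM
from accelerate import init_empty_weights, infer_auto_device_map

BASE_MODEL_NAME = "moonshotai/Kimi-Linear-48B-A3B-Instruct"

# Build the model skeleton on the meta device: no real memory is allocated.
config = AutoConfig.from_pretrained(BASE_MODEL_NAME, trust_remote_code=True)
with init_empty_weights():
    empty_model = AutoModelForCausalLM.from_config(config, trust_remote_code=True)

# Compute an explicit layer-to-GPU placement under the same per-GPU budget.
device_map = infer_auto_device_map(
    empty_model,
    max_memory={i: "46GB" for i in range(torch.cuda.device_count())},
    dtype=torch.bfloat16,
)
# The resulting dict can be passed to from_pretrained(..., device_map=device_map)
# in place of device_map="auto".
```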
@@ -108,6 +156,16 @@ class ModelMerger:
         total_params = sum(p.numel() for p in self.merged_model.parameters())
         trainable_params = sum(p.numel() for p in self.merged_model.parameters() if p.requires_grad)
 
+        # Get GPU memory usage
+        gpu_memory_info = ""
+        if torch.cuda.is_available():
+            gpu_memory_info = "\n**GPU Memory Usage:**\n"
+            for i in range(torch.cuda.device_count()):
+                allocated = torch.cuda.memory_allocated(i) / 1024**3
+                reserved = torch.cuda.memory_reserved(i) / 1024**3
+                total = torch.cuda.get_device_properties(i).total_memory / 1024**3
+                gpu_memory_info += f"- GPU {i}: {allocated:.2f}GB allocated / {reserved:.2f}GB reserved / {total:.2f}GB total\n"
+
         result_message = f"""
 ✅ **Merge Completed Successfully!**
 
@@ -117,8 +175,9 @@ class ModelMerger:
 - Output Directory: `{OUTPUT_DIR}`
 - Total Parameters: {total_params:,}
 - Trainable Parameters: {trainable_params:,}
+- Model Size (bfloat16): ~{(total_params * 2) / 1024**3:.2f} GB
 - Timestamp: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}
-
+{gpu_memory_info}
 **Next Steps:**
 1. The merged model is saved in the container at `/app/merged_model`
 2. You can now test the model using the inference tab
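A quick sanity check on the new size line: the f-string computes `(total_params * 2) / 1024**3`, i.e. GiB at 2 bytes per bfloat16 weight. For a 48B-parameter model:

```python
total_params = 48_000_000_000   # ~48B parameters
size_bytes = total_params * 2   # 2 bytes per bfloat16 weight
print(size_bytes / 1e9)         # ~96 GB (decimal) -- the figure quoted in the UI
print(size_bytes / 1024**3)     # ~89.4 GiB -- what the f-string actually reports
```

So the UI's "~96GB VRAM" claim and the f-string's value differ only by the usual GB/GiB factor; the two figures are consistent, just in different units.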
@@ -203,6 +262,21 @@ class ModelMerger:
 # Initialize merger
 merger = ModelMerger()
 
+# Get GPU info for display
+def get_gpu_info():
+    if not torch.cuda.is_available():
+        return "⚠️ **No GPUs detected!** This Space requires GPUs to run."
+
+    gpu_info = f"✅ **{torch.cuda.device_count()} GPU(s) detected:**\n\n"
+    total_memory = 0
+    for i in range(torch.cuda.device_count()):
+        name = torch.cuda.get_device_name(i)
+        memory = torch.cuda.get_device_properties(i).total_memory / 1024**3
+        total_memory += memory
+        gpu_info += f"- GPU {i}: {name} ({memory:.1f} GB)\n"
+    gpu_info += f"\n**Total VRAM:** {total_memory:.1f} GB"
+    return gpu_info
+
 # Create Gradio interface
 with gr.Blocks(theme=gr.themes.Soft(), title="LoRA Model Merger") as demo:
     gr.Markdown("""
@@ -213,10 +287,11 @@ with gr.Blocks(theme=gr.themes.Soft(), title="LoRA Model Merger") as demo:
     **Models:**
     - **Base Model:** `moonshotai/Kimi-Linear-48B-A3B-Instruct`
     - **LoRA Adapters:** `Optivise/kimi-linear-48b-a3b-instruct-qlora-fine-tuned`
-
-    **Hardware:** Running on 4xL40S GPUs
     """)
 
+    # Display GPU info
+    gr.Markdown(get_gpu_info())
+
     with gr.Tabs():
         # Tab 1: Merge Models
         with gr.Tab("🔄 Merge Models"):
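One caveat on the hunk above: `gr.Markdown(get_gpu_info())` runs once when the layout is built, so the panel shows GPU state at startup, not live usage. A hypothetical refresh control (not in this commit, reusing the `get_gpu_info` helper added above) could look like:

```python
import gradio as gr

with gr.Blocks() as sketch:
    gpu_md = gr.Markdown(get_gpu_info())     # initial snapshot at build time
    refresh = gr.Button("Refresh GPU info")
    # Re-run get_gpu_info() on click and write the result back into the panel.
    refresh.click(fn=get_gpu_info, inputs=None, outputs=gpu_md)
```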
@@ -228,7 +303,11 @@ with gr.Blocks(theme=gr.themes.Soft(), title="LoRA Model Merger") as demo:
             2. Merge the LoRA weights into the base model
             3. Save the merged model for inference
 
-            ⚠️ **Note:** This process may take 10-30 minutes depending on model size and network speed.
+            ⚠️ **Important Notes:**
+            - This process may take 10-30 minutes depending on model size and network speed
+            - The 48B parameter model requires **~96GB VRAM** in bfloat16 precision
+            - Recommended: 4x L40S GPUs (192GB total VRAM) for comfortable operation
+            - The model will be automatically distributed across all available GPUs
             """)
 
         with gr.Row():
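Finally, for step 2 of the "Next Steps" list, a minimal smoke test of the merged checkpoint might look like the following (a sketch assuming the merged model was saved to `/app/merged_model` and the same 4xL40S environment; not part of this commit):

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

MERGED_DIR = "/app/merged_model"  # the OUTPUT_DIR used by the merge step

tokenizer = AutoTokenizer.from_pretrained(MERGED_DIR, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    MERGED_DIR,
    torch_dtype=torch.bfloat16,
    device_map="auto",            # shard across all visible GPUs again
    trust_remote_code=True,
)

inputs = tokenizer("Hello, how are you?", return_tensors="pt").to(model.device)
with torch.no_grad():
    output = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```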
 