Spaces:
Paused
Paused
aeb56
committed on
Commit
·
b51ac87
1
Parent(s):
9bb160e
Optimize app.py for 48B model on 4xL40S GPUs with multi-GPU support
Browse files
app.py
CHANGED
|
@@ -7,11 +7,23 @@ import gc
|
|
| 7 |
from huggingface_hub import login, snapshot_download
|
| 8 |
import logging
|
| 9 |
from datetime import datetime
|
|
|
|
| 10 |
|
| 11 |
# Configure logging
|
| 12 |
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
| 13 |
logger = logging.getLogger(__name__)
|
| 14 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
# Constants
|
| 16 |
BASE_MODEL_NAME = "moonshotai/Kimi-Linear-48B-A3B-Instruct"
|
| 17 |
LORA_MODEL_NAME = "Optivise/kimi-linear-48b-a3b-instruct-qlora-fine-tuned"
|
|
@@ -30,7 +42,14 @@ class ModelMerger:
|
|
| 30 |
if self.merged_model is not None:
|
| 31 |
del self.merged_model
|
| 32 |
gc.collect()
|
| 33 |
-
torch.cuda.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
|
| 35 |
def login_huggingface(self, token):
|
| 36 |
"""Login to Hugging Face"""
|
|
@@ -60,17 +79,46 @@ class ModelMerger:
|
|
| 60 |
logger.info("Loading tokenizer...")
|
| 61 |
self.tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME, trust_remote_code=True)
|
| 62 |
|
| 63 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
progress(0.25, desc="Loading base model (this may take several minutes)...")
|
| 65 |
logger.info(f"Loading base model: {BASE_MODEL_NAME}")
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
|
| 75 |
# Load LoRA configuration
|
| 76 |
progress(0.50, desc="Loading LoRA adapters...")
|
|
@@ -108,6 +156,16 @@ class ModelMerger:
|
|
| 108 |
total_params = sum(p.numel() for p in self.merged_model.parameters())
|
| 109 |
trainable_params = sum(p.numel() for p in self.merged_model.parameters() if p.requires_grad)
|
| 110 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 111 |
result_message = f"""
|
| 112 |
✅ **Merge Completed Successfully!**
|
| 113 |
|
|
@@ -117,8 +175,9 @@ class ModelMerger:
|
|
| 117 |
- Output Directory: `{OUTPUT_DIR}`
|
| 118 |
- Total Parameters: {total_params:,}
|
| 119 |
- Trainable Parameters: {trainable_params:,}
|
|
|
|
| 120 |
- Timestamp: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}
|
| 121 |
-
|
| 122 |
**Next Steps:**
|
| 123 |
1. The merged model is saved in the container at `/app/merged_model`
|
| 124 |
2. You can now test the model using the inference tab
|
|
@@ -203,6 +262,21 @@ class ModelMerger:
|
|
| 203 |
# Initialize merger
|
| 204 |
merger = ModelMerger()
|
| 205 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 206 |
# Create Gradio interface
|
| 207 |
with gr.Blocks(theme=gr.themes.Soft(), title="LoRA Model Merger") as demo:
|
| 208 |
gr.Markdown("""
|
|
@@ -213,10 +287,11 @@ with gr.Blocks(theme=gr.themes.Soft(), title="LoRA Model Merger") as demo:
|
|
| 213 |
**Models:**
|
| 214 |
- **Base Model:** `moonshotai/Kimi-Linear-48B-A3B-Instruct`
|
| 215 |
- **LoRA Adapters:** `Optivise/kimi-linear-48b-a3b-instruct-qlora-fine-tuned`
|
| 216 |
-
|
| 217 |
-
**Hardware:** Running on 4xL40S GPUs
|
| 218 |
""")
|
| 219 |
|
|
|
|
|
|
|
|
|
|
| 220 |
with gr.Tabs():
|
| 221 |
# Tab 1: Merge Models
|
| 222 |
with gr.Tab("π Merge Models"):
|
|
@@ -228,7 +303,11 @@ with gr.Blocks(theme=gr.themes.Soft(), title="LoRA Model Merger") as demo:
|
|
| 228 |
2. Merge the LoRA weights into the base model
|
| 229 |
3. Save the merged model for inference
|
| 230 |
|
| 231 |
-
⚠️ **
|
|
|
|
|
|
|
|
|
|
|
|
|
| 232 |
""")
|
| 233 |
|
| 234 |
with gr.Row():
|
|
|
|
| 7 |
from huggingface_hub import login, snapshot_download
|
| 8 |
import logging
|
| 9 |
from datetime import datetime
|
| 10 |
+
from accelerate import init_empty_weights, load_checkpoint_and_dispatch, infer_auto_device_map
|
| 11 |
|
| 12 |
# Configure logging
|
| 13 |
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
| 14 |
logger = logging.getLogger(__name__)
|
| 15 |
|
| 16 |
+
# Log the CUDA devices visible to this process when the module is loaded:
# the device count, then each device's name and total memory in GiB.
# When CUDA is unavailable only a warning is logged and none of the
# module-level names below (num_gpus, gpu_name, gpu_memory) are bound.
if not torch.cuda.is_available():
    logger.warning("No GPUs found! This will likely fail for 48B model.")
else:
    num_gpus = torch.cuda.device_count()
    logger.info(f"Found {num_gpus} GPUs available")
    for i in range(num_gpus):
        gpu_name = torch.cuda.get_device_name(i)
        # total_memory is reported in bytes; 1024**3 converts to GiB.
        gpu_memory = torch.cuda.get_device_properties(i).total_memory / 1024**3
        logger.info(f"GPU {i}: {gpu_name} with {gpu_memory:.2f} GB memory")
|
| 26 |
+
|
| 27 |
# Constants
|
| 28 |
BASE_MODEL_NAME = "moonshotai/Kimi-Linear-48B-A3B-Instruct"
|
| 29 |
LORA_MODEL_NAME = "Optivise/kimi-linear-48b-a3b-instruct-qlora-fine-tuned"
|
|
|
|
| 42 |
if self.merged_model is not None:
|
| 43 |
del self.merged_model
|
| 44 |
gc.collect()
|
| 45 |
+
if torch.cuda.is_available():
|
| 46 |
+
torch.cuda.empty_cache()
|
| 47 |
+
# Synchronize all GPUs
|
| 48 |
+
for i in range(torch.cuda.device_count()):
|
| 49 |
+
with torch.cuda.device(i):
|
| 50 |
+
torch.cuda.empty_cache()
|
| 51 |
+
torch.cuda.synchronize()
|
| 52 |
+
logger.info("Memory cleared successfully")
|
| 53 |
|
| 54 |
def login_huggingface(self, token):
|
| 55 |
"""Login to Hugging Face"""
|
|
|
|
| 79 |
logger.info("Loading tokenizer...")
|
| 80 |
self.tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME, trust_remote_code=True)
|
| 81 |
|
| 82 |
+
# Configure memory allocation for multi-GPU setup (4xL40S = 4x48GB = 192GB)
|
| 83 |
+
# Reserve some memory for CUDA overhead and operations
|
| 84 |
+
num_gpus = torch.cuda.device_count()
|
| 85 |
+
max_memory = {}
|
| 86 |
+
if num_gpus > 0:
|
| 87 |
+
# Allocate memory per GPU (leave ~2GB per GPU for overhead)
|
| 88 |
+
per_gpu_memory = "46GB" # 48GB - 2GB overhead for L40S
|
| 89 |
+
for i in range(num_gpus):
|
| 90 |
+
max_memory[i] = per_gpu_memory
|
| 91 |
+
logger.info(f"Configured max_memory: {max_memory}")
|
| 92 |
+
else:
|
| 93 |
+
# Fallback for CPU-only (will be slow)
|
| 94 |
+
max_memory = {"cpu": "64GB"}
|
| 95 |
+
logger.warning("No GPUs detected, using CPU fallback")
|
| 96 |
+
|
| 97 |
+
# Load base model with explicit multi-GPU configuration
|
| 98 |
progress(0.25, desc="Loading base model (this may take several minutes)...")
|
| 99 |
logger.info(f"Loading base model: {BASE_MODEL_NAME}")
|
| 100 |
+
logger.info(f"Using bfloat16 precision for memory efficiency")
|
| 101 |
+
|
| 102 |
+
try:
|
| 103 |
+
self.base_model = AutoModelForCausalLM.from_pretrained(
|
| 104 |
+
BASE_MODEL_NAME,
|
| 105 |
+
torch_dtype=torch.bfloat16,
|
| 106 |
+
device_map="auto",
|
| 107 |
+
max_memory=max_memory,
|
| 108 |
+
trust_remote_code=True,
|
| 109 |
+
low_cpu_mem_usage=True,
|
| 110 |
+
offload_folder="/tmp/offload", # Fallback offload directory
|
| 111 |
+
offload_state_dict=True, # Offload state dict when loading
|
| 112 |
+
)
|
| 113 |
+
logger.info("Base model loaded successfully")
|
| 114 |
+
|
| 115 |
+
# Log device map to see distribution
|
| 116 |
+
if hasattr(self.base_model, 'hf_device_map'):
|
| 117 |
+
logger.info(f"Model device map: {self.base_model.hf_device_map}")
|
| 118 |
+
|
| 119 |
+
except torch.cuda.OutOfMemoryError as e:
|
| 120 |
+
logger.error("Out of memory error! Try with quantization or smaller batch size")
|
| 121 |
+
raise Exception(f"GPU Out of Memory: {str(e)}. The 48B model requires ~96GB VRAM in bfloat16. Ensure 4xL40S GPUs are available.")
|
| 122 |
|
| 123 |
# Load LoRA configuration
|
| 124 |
progress(0.50, desc="Loading LoRA adapters...")
|
|
|
|
| 156 |
total_params = sum(p.numel() for p in self.merged_model.parameters())
|
| 157 |
trainable_params = sum(p.numel() for p in self.merged_model.parameters() if p.requires_grad)
|
| 158 |
|
| 159 |
+
# Get GPU memory usage
|
| 160 |
+
gpu_memory_info = ""
|
| 161 |
+
if torch.cuda.is_available():
|
| 162 |
+
gpu_memory_info = "\n**GPU Memory Usage:**\n"
|
| 163 |
+
for i in range(torch.cuda.device_count()):
|
| 164 |
+
allocated = torch.cuda.memory_allocated(i) / 1024**3
|
| 165 |
+
reserved = torch.cuda.memory_reserved(i) / 1024**3
|
| 166 |
+
total = torch.cuda.get_device_properties(i).total_memory / 1024**3
|
| 167 |
+
gpu_memory_info += f"- GPU {i}: {allocated:.2f}GB allocated / {reserved:.2f}GB reserved / {total:.2f}GB total\n"
|
| 168 |
+
|
| 169 |
result_message = f"""
|
| 170 |
✅ **Merge Completed Successfully!**
|
| 171 |
|
|
|
|
| 175 |
- Output Directory: `{OUTPUT_DIR}`
|
| 176 |
- Total Parameters: {total_params:,}
|
| 177 |
- Trainable Parameters: {trainable_params:,}
|
| 178 |
+
- Model Size (bfloat16): ~{(total_params * 2) / 1024**3:.2f} GB
|
| 179 |
- Timestamp: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}
|
| 180 |
+
{gpu_memory_info}
|
| 181 |
**Next Steps:**
|
| 182 |
1. The merged model is saved in the container at `/app/merged_model`
|
| 183 |
2. You can now test the model using the inference tab
|
|
|
|
| 262 |
# Initialize merger
|
| 263 |
merger = ModelMerger()
|
| 264 |
|
| 265 |
+
# Get GPU info for display
|
| 266 |
+
def get_gpu_info():
    """Build the Markdown summary of the CUDA devices visible to the process.

    Returns:
        str: A single warning line when ``torch.cuda.is_available()`` is
        false. Otherwise a header with the device count, one bullet per
        device (name and total memory, bytes divided by 1024**3), and a
        closing "Total VRAM" line summing those per-device figures.
    """
    if not torch.cuda.is_available():
        return "⚠️ **No GPUs detected!** This Space requires GPUs to run."

    # Query the device count once so the header and the loop always agree.
    device_count = torch.cuda.device_count()

    # Collect the pieces and join once instead of growing a string with +=.
    parts = [f"✅ **{device_count} GPU(s) detected:**\n\n"]
    total_memory = 0
    for i in range(device_count):
        name = torch.cuda.get_device_name(i)
        memory = torch.cuda.get_device_properties(i).total_memory / 1024**3
        total_memory += memory
        parts.append(f"- GPU {i}: {name} ({memory:.1f} GB)\n")
    parts.append(f"\n**Total VRAM:** {total_memory:.1f} GB")
    return "".join(parts)
|
| 279 |
+
|
| 280 |
# Create Gradio interface
|
| 281 |
with gr.Blocks(theme=gr.themes.Soft(), title="LoRA Model Merger") as demo:
|
| 282 |
gr.Markdown("""
|
|
|
|
| 287 |
**Models:**
|
| 288 |
- **Base Model:** `moonshotai/Kimi-Linear-48B-A3B-Instruct`
|
| 289 |
- **LoRA Adapters:** `Optivise/kimi-linear-48b-a3b-instruct-qlora-fine-tuned`
|
|
|
|
|
|
|
| 290 |
""")
|
| 291 |
|
| 292 |
+
# Display GPU info
|
| 293 |
+
gr.Markdown(get_gpu_info())
|
| 294 |
+
|
| 295 |
with gr.Tabs():
|
| 296 |
# Tab 1: Merge Models
|
| 297 |
with gr.Tab("π Merge Models"):
|
|
|
|
| 303 |
2. Merge the LoRA weights into the base model
|
| 304 |
3. Save the merged model for inference
|
| 305 |
|
| 306 |
+
⚠️ **Important Notes:**
|
| 307 |
+
- This process may take 10-30 minutes depending on model size and network speed
|
| 308 |
+
- The 48B parameter model requires **~96GB VRAM** in bfloat16 precision
|
| 309 |
+
- Recommended: 4x L40S GPUs (192GB total VRAM) for comfortable operation
|
| 310 |
+
- The model will be automatically distributed across all available GPUs
|
| 311 |
""")
|
| 312 |
|
| 313 |
with gr.Row():
|