# app.py: merge QLoRA adapters into moonshotai/Kimi-Linear-48B-A3B-Instruct on 4x L40S GPUs (multi-GPU)
import os
import torch
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import gc
from huggingface_hub import login
import logging
from datetime import datetime
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
# Check GPU availability
if torch.cuda.is_available():
num_gpus = torch.cuda.device_count()
logger.info(f"Found {num_gpus} GPUs available")
for i in range(num_gpus):
gpu_name = torch.cuda.get_device_name(i)
gpu_memory = torch.cuda.get_device_properties(i).total_memory / 1024**3
logger.info(f"GPU {i}: {gpu_name} with {gpu_memory:.2f} GB memory")
else:
logger.warning("No GPUs found! This will likely fail for 48B model.")
# Constants
BASE_MODEL_NAME = "moonshotai/Kimi-Linear-48B-A3B-Instruct"
LORA_MODEL_NAME = "Optivise/kimi-linear-48b-a3b-instruct-qlora-fine-tuned"
OUTPUT_DIR = "/app/merged_model"
class ModelMerger:
def __init__(self):
self.base_model = None
self.tokenizer = None
self.merged_model = None
def clear_memory(self):
"""Clear GPU memory"""
# Drop references and reset to None so later checks and repeated calls do not raise AttributeError
self.base_model = None
self.merged_model = None
gc.collect()
if torch.cuda.is_available():
torch.cuda.empty_cache()
# Synchronize all GPUs
for i in range(torch.cuda.device_count()):
with torch.cuda.device(i):
torch.cuda.empty_cache()
torch.cuda.synchronize()
logger.info("Memory cleared successfully")
def login_huggingface(self, token):
"""Login to Hugging Face"""
try:
login(token=token)
logger.info("Successfully logged in to Hugging Face")
return "βœ… Successfully logged in to Hugging Face"
except Exception as e:
logger.error(f"Login failed: {str(e)}")
return f"❌ Login failed: {str(e)}"
def merge_models(self, hf_token, progress=gr.Progress()):
"""Merge LoRA adapters with base model"""
try:
# Login to HF
if hf_token:
progress(0.05, desc="Logging in to Hugging Face...")
login(token=hf_token)
logger.info("Logged in to Hugging Face")
# Clear any existing models from memory
progress(0.1, desc="Clearing GPU memory...")
self.clear_memory()
# Load tokenizer
progress(0.15, desc="Loading tokenizer...")
logger.info("Loading tokenizer...")
self.tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME, trust_remote_code=True)
# Configure memory allocation for multi-GPU setup (4xL40S = 4x48GB = 192GB)
# Reserve some memory for CUDA overhead and operations
num_gpus = torch.cuda.device_count()
max_memory = {}
if num_gpus > 0:
# Allocate memory per GPU (leave ~2GB per GPU for overhead)
per_gpu_memory = "46GB" # 48GB - 2GB overhead for L40S
for i in range(num_gpus):
max_memory[i] = per_gpu_memory
logger.info(f"Configured max_memory: {max_memory}")
else:
# Fallback for CPU-only (will be slow)
max_memory = {"cpu": "64GB"}
logger.warning("No GPUs detected, using CPU fallback")
# Load base model with explicit multi-GPU configuration
progress(0.25, desc="Loading base model (this may take several minutes)...")
logger.info(f"Loading base model: {BASE_MODEL_NAME}")
logger.info(f"Using bfloat16 precision for memory efficiency")
try:
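# device_map="auto" lets accelerate shard the layers across all visible GPUs within the max_memory budget above;
# weights that do not fit are offloaded to /tmp/offload.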
self.base_model = AutoModelForCausalLM.from_pretrained(
BASE_MODEL_NAME,
torch_dtype=torch.bfloat16,
device_map="auto",
max_memory=max_memory,
trust_remote_code=True,
low_cpu_mem_usage=True,
offload_folder="/tmp/offload", # Fallback offload directory
offload_state_dict=True, # Offload state dict when loading
)
logger.info("Base model loaded successfully")
# Log device map to see distribution
if hasattr(self.base_model, 'hf_device_map'):
logger.info(f"Model device map: {self.base_model.hf_device_map}")
except torch.cuda.OutOfMemoryError as e:
logger.error("Out of memory error! Try with quantization or smaller batch size")
raise Exception(f"GPU Out of Memory: {str(e)}. The 48B model requires ~96GB VRAM in bfloat16. Ensure 4xL40S GPUs are available.")
# Load LoRA configuration
progress(0.50, desc="Loading LoRA adapters...")
logger.info(f"Loading LoRA adapters from: {LORA_MODEL_NAME}")
# Merge LoRA weights
self.merged_model = PeftModel.from_pretrained(
self.base_model,
LORA_MODEL_NAME,
torch_dtype=torch.bfloat16,
)
logger.info("LoRA adapters loaded successfully")
progress(0.70, desc="Merging LoRA weights with base model...")
logger.info("Merging LoRA weights...")
self.merged_model = self.merged_model.merge_and_unload()
logger.info("Models merged successfully")
# Save merged model
progress(0.85, desc="Saving merged model...")
logger.info(f"Saving merged model to: {OUTPUT_DIR}")
os.makedirs(OUTPUT_DIR, exist_ok=True)
self.merged_model.save_pretrained(
OUTPUT_DIR,
safe_serialization=True,
max_shard_size="5GB"
)
self.tokenizer.save_pretrained(OUTPUT_DIR)
progress(1.0, desc="Complete!")
logger.info("Merge completed successfully")
# Get model info
total_params = sum(p.numel() for p in self.merged_model.parameters())
trainable_params = sum(p.numel() for p in self.merged_model.parameters() if p.requires_grad)
# Get GPU memory usage
gpu_memory_info = ""
if torch.cuda.is_available():
gpu_memory_info = "\n**GPU Memory Usage:**\n"
for i in range(torch.cuda.device_count()):
allocated = torch.cuda.memory_allocated(i) / 1024**3
reserved = torch.cuda.memory_reserved(i) / 1024**3
total = torch.cuda.get_device_properties(i).total_memory / 1024**3
gpu_memory_info += f"- GPU {i}: {allocated:.2f}GB allocated / {reserved:.2f}GB reserved / {total:.2f}GB total\n"
result_message = f"""
βœ… **Merge Completed Successfully!**
**Model Information:**
- Base Model: `{BASE_MODEL_NAME}`
- LoRA Adapters: `{LORA_MODEL_NAME}`
- Output Directory: `{OUTPUT_DIR}`
- Total Parameters: {total_params:,}
- Trainable Parameters: {trainable_params:,}
- Model Size (bfloat16): ~{(total_params * 2) / 1024**3:.2f} GB
- Timestamp: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}
{gpu_memory_info}
**Next Steps:**
1. The merged model is saved in the container at `/app/merged_model`
2. You can now test the model using the inference tab
3. To upload to Hugging Face, use the upload section
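For example, a minimal way to reload the merged checkpoint later (a sketch; adjust the dtype and device mapping to your hardware):
```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained(
    "/app/merged_model", torch_dtype=torch.bfloat16,
    device_map="auto", trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained("/app/merged_model")
```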
"""
return result_message
except Exception as e:
logger.error(f"Error during merge: {str(e)}", exc_info=True)
self.clear_memory()
return f"❌ **Error during merge:**\n\n{str(e)}\n\nPlease check the logs for more details."
def test_inference(self, prompt, max_length, temperature, top_p, progress=gr.Progress()):
"""Test the merged model with a prompt"""
try:
if self.merged_model is None:
return "❌ Please merge the models first before testing inference."
progress(0.3, desc="Tokenizing input...")
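# With device_map="auto" the model is sharded across GPUs; model.device reports the device of the first shard,
# and accelerate's hooks route activations between devices during generation.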
inputs = self.tokenizer(prompt, return_tensors="pt").to(self.merged_model.device)
progress(0.5, desc="Generating response...")
with torch.no_grad():
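# Note: max_length bounds prompt tokens plus generated tokens; use max_new_tokens instead to bound only the completion.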
outputs = self.merged_model.generate(
**inputs,
max_length=max_length,
temperature=temperature,
top_p=top_p,
do_sample=True,
pad_token_id=self.tokenizer.eos_token_id,
)
progress(0.9, desc="Decoding output...")
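# generate() returns the prompt tokens followed by the completion, so the decoded text includes the original prompt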
response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
progress(1.0, desc="Complete!")
return response
except Exception as e:
logger.error(f"Error during inference: {str(e)}", exc_info=True)
return f"❌ **Error during inference:**\n\n{str(e)}"
def upload_to_hub(self, repo_name, hf_token, private, progress=gr.Progress()):
"""Upload merged model to Hugging Face Hub"""
try:
if self.merged_model is None:
return "❌ Please merge the models first before uploading."
if not repo_name:
return "❌ Please provide a repository name."
if not hf_token:
return "❌ Please provide a Hugging Face token."
progress(0.1, desc="Logging in...")
login(token=hf_token)
progress(0.3, desc="Uploading model to Hugging Face Hub...")
logger.info(f"Uploading to: {repo_name}")
self.merged_model.push_to_hub(
repo_name,
private=private,
safe_serialization=True,
max_shard_size="5GB"
)
progress(0.8, desc="Uploading tokenizer...")
self.tokenizer.push_to_hub(repo_name, private=private)
progress(1.0, desc="Complete!")
logger.info("Upload completed successfully")
repo_url = f"https://huggingface.co/{repo_name}"
return f"βœ… **Successfully uploaded to Hugging Face Hub!**\n\nRepository: [{repo_name}]({repo_url})"
except Exception as e:
logger.error(f"Error during upload: {str(e)}", exc_info=True)
return f"❌ **Error during upload:**\n\n{str(e)}"
# Initialize merger
merger = ModelMerger()
# Get GPU info for display
def get_gpu_info():
if not torch.cuda.is_available():
return "⚠️ **No GPUs detected!** This Space requires GPUs to run."
gpu_info = f"βœ… **{torch.cuda.device_count()} GPU(s) detected:**\n\n"
total_memory = 0
for i in range(torch.cuda.device_count()):
name = torch.cuda.get_device_name(i)
memory = torch.cuda.get_device_properties(i).total_memory / 1024**3
total_memory += memory
gpu_info += f"- GPU {i}: {name} ({memory:.1f} GB)\n"
gpu_info += f"\n**Total VRAM:** {total_memory:.1f} GB"
return gpu_info
# Create Gradio interface
with gr.Blocks(theme=gr.themes.Soft(), title="LoRA Model Merger") as demo:
gr.Markdown("""
# πŸ”— LoRA Model Merger
Merge your fine-tuned LoRA adapters into the **Kimi-Linear-48B-A3B-Instruct** base model.
**Models:**
- **Base Model:** `moonshotai/Kimi-Linear-48B-A3B-Instruct`
- **LoRA Adapters:** `Optivise/kimi-linear-48b-a3b-instruct-qlora-fine-tuned`
""")
# Display GPU info
gr.Markdown(get_gpu_info())
with gr.Tabs():
# Tab 1: Merge Models
with gr.Tab("πŸ”„ Merge Models"):
gr.Markdown("""
### Step 1: Merge LoRA Adapters with Base Model
This process will:
1. Download the base model and LoRA adapters
2. Merge the LoRA weights into the base model
3. Save the merged model for inference
⚠️ **Important Notes:**
- This process may take 10-30 minutes depending on model size and network speed
- The 48B parameter model requires **~96GB VRAM** in bfloat16 precision
- Recommended: 4x L40S GPUs (192GB total VRAM) for comfortable operation
- The model will be automatically distributed across all available GPUs
""")
with gr.Row():
hf_token_merge = gr.Textbox(
label="Hugging Face Token",
placeholder="hf_...",
type="password",
info="Required for accessing private models or avoiding rate limits"
)
merge_button = gr.Button("πŸš€ Start Merge Process", variant="primary", size="lg")
merge_output = gr.Markdown(label="Merge Status")
merge_button.click(
fn=merger.merge_models,
inputs=[hf_token_merge],
outputs=merge_output
)
# Tab 2: Test Inference
with gr.Tab("πŸ§ͺ Test Inference"):
gr.Markdown("""
### Step 2: Test the Merged Model
Test the merged model with custom prompts to verify it's working correctly.
""")
with gr.Row():
with gr.Column():
test_prompt = gr.Textbox(
label="Test Prompt",
placeholder="Enter your test prompt here...",
lines=5,
value="Hello, how are you today?"
)
with gr.Row():
max_length = gr.Slider(
minimum=50,
maximum=2048,
value=512,
step=1,
label="Max Length"
)
temperature = gr.Slider(
minimum=0.1,
maximum=2.0,
value=0.7,
step=0.1,
label="Temperature"
)
top_p = gr.Slider(
minimum=0.1,
maximum=1.0,
value=0.9,
step=0.05,
label="Top P"
)
test_button = gr.Button("🎯 Generate", variant="primary")
with gr.Column():
test_output = gr.Textbox(
label="Model Output",
lines=15,
interactive=False
)
test_button.click(
fn=merger.test_inference,
inputs=[test_prompt, max_length, temperature, top_p],
outputs=test_output
)
# Tab 3: Upload to Hub
with gr.Tab("☁️ Upload to Hub"):
gr.Markdown("""
### Step 3: Upload Merged Model to Hugging Face Hub
Upload your merged model to Hugging Face Hub for easy sharing and deployment.
""")
with gr.Row():
with gr.Column():
repo_name = gr.Textbox(
label="Repository Name",
placeholder="username/model-name",
info="Format: username/model-name"
)
hf_token_upload = gr.Textbox(
label="Hugging Face Token (with write access)",
placeholder="hf_...",
type="password",
info="Token must have write permissions"
)
private_repo = gr.Checkbox(
label="Private Repository",
value=True,
info="Keep the model private"
)
upload_button = gr.Button("πŸ“€ Upload to Hub", variant="primary", size="lg")
with gr.Column():
upload_output = gr.Markdown(label="Upload Status")
upload_button.click(
fn=merger.upload_to_hub,
inputs=[repo_name, hf_token_upload, private_repo],
outputs=upload_output
)
# Tab 4: Info & Help
with gr.Tab("ℹ️ Info & Help"):
gr.Markdown("""
## About This Space
This Space allows you to merge LoRA (Low-Rank Adaptation) fine-tuned models with their base models.
### What is LoRA Merging?
LoRA is a parameter-efficient fine-tuning technique that trains small low-rank adapter matrices alongside a frozen pretrained model.
To use the fine-tuned model without the PEFT wrapper at inference time, you can merge these adapters into
the base weights, producing a single standalone model (see the sketch below).
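As a simplified sketch of what the Merge tab runs under the hood (error handling, progress reporting, and memory management omitted):
```python
import torch
from transformers import AutoModelForCausalLM
from peft import PeftModel

base = AutoModelForCausalLM.from_pretrained(
    "moonshotai/Kimi-Linear-48B-A3B-Instruct",
    torch_dtype=torch.bfloat16, device_map="auto", trust_remote_code=True,
)
model = PeftModel.from_pretrained(base, "Optivise/kimi-linear-48b-a3b-instruct-qlora-fine-tuned")
model = model.merge_and_unload()  # fold the LoRA deltas into the base weights
model.save_pretrained("/app/merged_model", safe_serialization=True)
```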
### Process Overview
1. **Merge:** Combines the LoRA adapters with the base model
2. **Test:** Verify the merged model works correctly with inference
3. **Upload:** Share your merged model on Hugging Face Hub
### Hardware Requirements
- **Current Setup:** 4x NVIDIA L40S GPUs (48GB VRAM each)
- **Model Size:** ~48B parameters
- **Memory Usage:** ~96-120GB VRAM during merge
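For reference, 48B parameters at 2 bytes each (bfloat16) is roughly 96 GB of weights alone; the extra headroom covers activations, the LoRA adapters, and CUDA overhead.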
### Tips
- The merge process can take 10-30 minutes
- Make sure you have a valid Hugging Face token with appropriate permissions
- Test the model thoroughly before uploading to Hub
- Consider keeping the uploaded model private initially
### Troubleshooting
**Out of Memory Errors:**
- The model is very large (48B parameters)
- Try restarting the Space to clear memory
**Authentication Errors:**
- Ensure your HF token has read access to the base model
- For private models, token must have appropriate permissions
**Slow Download/Upload:**
- Large models take time to transfer
- Network speed affects download/upload times
### Support
For issues or questions, please check:
- [PEFT Documentation](https://huggingface.co/docs/peft)
- [Transformers Documentation](https://huggingface.co/docs/transformers)
""")
gr.Markdown("""
---
**Note:** This Space requires significant computational resources. Ensure you have appropriate GPU allocation.
""")
# Launch the app
if __name__ == "__main__":
demo.queue(max_size=5)
demo.launch(
server_name="0.0.0.0",
server_port=7860,
share=False,
show_error=True
)