Commit 5f0bb6b · committed by Joash
Parent(s): 1f37a6a
Optimize memory usage with 4-bit quantization and Docker settings
Files changed:
- Dockerfile +7 -4
- src/model_manager.py +11 -3
Dockerfile CHANGED

@@ -4,10 +4,11 @@ FROM python:3.11-slim
 # Set working directory
 WORKDIR /app
 
-# Install system dependencies
+# Install system dependencies including cuda-toolkit for bitsandbytes
 RUN apt-get update && apt-get install -y \
     build-essential \
     curl \
+    git \
     && rm -rf /var/lib/apt/lists/*
 
 # Create necessary directories with proper permissions
@@ -28,6 +29,8 @@ ENV HF_HOME=/home/user/.cache/huggingface
 ENV TRANSFORMERS_CACHE=/home/user/.cache/huggingface
 # Set logging to stdout
 ENV LOG_FILE=/dev/stdout
+# Reduce memory usage
+ENV MALLOC_ARENA_MAX=2
 
 # Switch to non-root user
 USER user
@@ -39,7 +42,7 @@ RUN pip install --user --no-cache-dir "numpy<2.0.0"
 # Copy requirements first to leverage Docker cache
 COPY --chown=user:user requirements.txt .
 
-# Install Python dependencies
+# Install Python dependencies with reduced memory usage
 RUN pip install --user --no-cache-dir -r requirements.txt
 
 # Copy application code
@@ -48,5 +51,5 @@ COPY --chown=user:user . .
 # Expose port for Hugging Face Spaces
 EXPOSE 7860
 
-# Run the application with
-CMD ["python", "-u", "-m", "uvicorn", "src.api:app", "--host", "0.0.0.0", "--port", "7860", "--log-level", "debug"]
+# Run the application with reduced memory usage
+CMD ["python", "-u", "-m", "uvicorn", "src.api:app", "--host", "0.0.0.0", "--port", "7860", "--log-level", "debug", "--workers", "1"]
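The memory-related Dockerfile changes are small but deliberate: MALLOC_ARENA_MAX=2 caps the number of glibc malloc arenas, trading a little allocator concurrency for less heap fragmentation in a multithreaded Python process, and --workers 1 pins uvicorn to a single worker so only one copy of the model is ever resident. A minimal Python sketch of what that CMD line does, assuming src/api.py exposes an ASGI app named app (as the import string implies):

import uvicorn

if __name__ == "__main__":
    # Programmatic equivalent of the Dockerfile CMD above.
    uvicorn.run(
        "src.api:app",   # import string, same target as the CMD
        host="0.0.0.0",  # listen on all interfaces inside the container
        port=7860,       # the port Hugging Face Spaces routes traffic to
        log_level="debug",
        workers=1,       # one worker process = one model copy in memory
    )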
src/model_manager.py CHANGED

@@ -1,5 +1,5 @@
 import logging
-from transformers import AutoTokenizer, AutoModelForCausalLM
+from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
 import torch
 from huggingface_hub import login
 from .config import Config
@@ -54,11 +54,19 @@ class ModelManager:
         logger.info(f"Loading model: {self.model_name}")
         logger.info(f"Using device: {self.device}")
 
-        #
+        # Configure 4-bit quantization
+        quantization_config = BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_compute_dtype=torch.float16,
+            bnb_4bit_use_double_quant=True,
+            bnb_4bit_quant_type="nf4"
+        )
+
+        # Load model with 4-bit quantization
         self.model = AutoModelForCausalLM.from_pretrained(
             self.model_name,
             device_map={"": self.device},
-
+            quantization_config=quantization_config,
             token=Config.HUGGING_FACE_TOKEN,
             low_cpu_mem_usage=True
        )
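For reference, a self-contained sketch of the loading path this diff creates; MODEL_NAME is a placeholder here, where the real code takes the model id and token from .config.Config:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

MODEL_NAME = "mistralai/Mistral-7B-v0.1"  # placeholder model id

# Same 4-bit setup as the diff: NF4 weight storage, fp16 compute,
# and double quantization of the quantization constants.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,                     # store weights as 4-bit values
    bnb_4bit_compute_dtype=torch.float16,  # dequantize to fp16 for matmuls
    bnb_4bit_use_double_quant=True,        # quantize the quantization constants too
    bnb_4bit_quant_type="nf4",             # NormalFloat4 suits normally distributed weights
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",                     # let accelerate place the quantized layers
    quantization_config=quantization_config,
    low_cpu_mem_usage=True,                # stream weights instead of a full fp32 copy
)

Back-of-envelope: NF4 stores roughly half a byte per parameter, so a 7B-parameter model's weights shrink from about 14 GB in fp16 to roughly 3.5-4 GB. Note that bitsandbytes' 4-bit kernels generally require a CUDA GPU, so this load path will fail on a CPU-only Space.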