Commit 7cd0e22
Parent(s): 24c7b48
Fix: Update transformers to 4.45.2 for Phi-3 LongRoPE support
Files changed:
- Dockerfile +7 -19
- app/model_loader.py +37 -15
- requirements.txt +13 -11
Dockerfile
CHANGED
@@ -1,16 +1,13 @@
 FROM python:3.10-slim
 
-# Set working directory
 WORKDIR /app
 
-# Set environment variables
 ENV PYTHONUNBUFFERED=1 \
     PYTHONDONTWRITEBYTECODE=1 \
     PIP_NO_CACHE_DIR=1 \
     PIP_DISABLE_PIP_VERSION_CHECK=1 \
     TRANSFORMERS_CACHE=/app/.cache/transformers \
-    HF_HOME=/app/.cache/huggingface
-    DEBIAN_FRONTEND=noninteractive
+    HF_HOME=/app/.cache/huggingface
 
 # Install system dependencies
 RUN apt-get update && apt-get install -y --no-install-recommends \
@@ -23,36 +20,27 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
     && rm -rf /var/lib/apt/lists/* \
     && apt-get clean
 
-# Upgrade pip
-RUN pip install --no-cache-dir --upgrade
-    pip==24.0 \
-    setuptools==69.5.1 \
-    wheel==0.43.0
+# Upgrade pip
+RUN pip install --no-cache-dir --upgrade pip==24.2
 
-# Copy
+# Copy and install requirements
 COPY requirements.txt .
-
-# Install Python dependencies
 RUN pip install --no-cache-dir -r requirements.txt
 
-# Copy application
+# Copy application
 COPY app/ ./app/
 
-# Create
+# Create directories
 RUN mkdir -p /app/offload /app/.cache/transformers /app/.cache/huggingface && \
     chmod -R 777 /app/offload /app/.cache
 
-# Expose port 7860 (HuggingFace Spaces standard)
 EXPOSE 7860
 
-# Health check - more lenient for model loading
 HEALTHCHECK --interval=30s --timeout=20s --start-period=300s --retries=5 \
     CMD curl -f http://localhost:7860/health || exit 1
 
-# Run the application with increased timeouts
 CMD ["uvicorn", "app.api:app", \
      "--host", "0.0.0.0", \
      "--port", "7860", \
     "--timeout-keep-alive", "300", \
-     "--workers", "1"
-     "--log-level", "info"]
+     "--workers", "1"]
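A note on the unchanged HEALTHCHECK: curl -f exits non-zero on any HTTP error status, so the container is only reported healthy once the app answers on /health (the 300s start period leaves room for model loading). A minimal sketch of such an endpoint, assuming FastAPI; the Space's actual app/api.py is not part of this commit, so the body shown is hypothetical:

```python
# Hypothetical sketch of the /health endpoint the Dockerfile HEALTHCHECK polls.
# The real app/api.py is not shown in this commit; only the path is known.
from fastapi import FastAPI

app = FastAPI()

@app.get("/health")
def health() -> dict:
    # Any 200 response satisfies `curl -f`, marking the container healthy.
    return {"status": "ok"}
```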
app/model_loader.py
CHANGED
@@ -16,8 +16,8 @@ MODEL_NAME = "RayyanAhmed9477/med-coding"
 
 def load_model_and_tokenizer():
     """
-    Loads Phi-3 model with
-
+    Loads Phi-3 model with LongRoPE support.
+    Requires transformers>=4.43.0 for longrope rope_scaling type.
     """
     device = "cuda" if torch.cuda.is_available() else "cpu"
     print(f"🔧 Using device: {device}")
@@ -26,16 +26,19 @@ def load_model_and_tokenizer():
 
     # Get HuggingFace token from environment
     hf_token = os.getenv("HF_TOKEN")
+    if hf_token:
+        print("🔑 HuggingFace token found")
+    else:
+        print("⚠️ No HuggingFace token - assuming public model")
 
     try:
         # ===== STEP 1: Load Tokenizer =====
         print(f"📥 Loading tokenizer: {MODEL_NAME}")
         tokenizer = AutoTokenizer.from_pretrained(
             MODEL_NAME,
-            trust_remote_code=True,
+            trust_remote_code=True,
             token=hf_token,
-            use_fast=True
-            legacy=False
+            use_fast=True
         )
 
         # Configure tokenizer
@@ -46,13 +49,23 @@ def load_model_and_tokenizer():
 
         print("✅ Tokenizer loaded successfully")
 
-        # ===== STEP 2: Load Configuration
+        # ===== STEP 2: Load Configuration =====
         print(f"📥 Loading model configuration: {MODEL_NAME}")
         config = AutoConfig.from_pretrained(
             MODEL_NAME,
-            trust_remote_code=True,
+            trust_remote_code=True,
             token=hf_token
         )
+
+        # ✅ Handle LongRoPE configuration
+        if hasattr(config, 'rope_scaling') and config.rope_scaling is not None:
+            rope_type = config.rope_scaling.get('type', 'default')
+            print(f"📐 RoPE scaling type detected: {rope_type}")
+
+            # LongRoPE is supported in transformers>=4.43.0
+            if rope_type == 'longrope':
+                print("✅ LongRoPE configuration detected and supported")
+
         print(f"✅ Config loaded: {config.model_type}")
 
         # ===== STEP 3: Load Model =====
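An aside on the rope_scaling probe added above: it can be reproduced outside the Space. A minimal sketch, using microsoft/Phi-3-mini-128k-instruct as an assumed public stand-in for RayyanAhmed9477/med-coding (the Space's own model may require HF_TOKEN):

```python
# Sketch: inspect a checkpoint's RoPE scaling config before loading any weights.
# The model id below is an assumed public stand-in, not the Space's model.
from transformers import AutoConfig

config = AutoConfig.from_pretrained(
    "microsoft/Phi-3-mini-128k-instruct",
    trust_remote_code=True,
)
scaling = getattr(config, "rope_scaling", None)
if scaling is not None:
    # Phi-3 128k checkpoints declare {"type": "longrope", ...}; transformers
    # releases before 4.43.0 reject this value while validating the config.
    print(f"RoPE scaling type: {scaling.get('type', 'default')}")
else:
    print("No RoPE scaling configured")
```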
@@ -69,11 +82,10 @@ def load_model_and_tokenizer():
                 torch_dtype=torch.bfloat16,
                 device_map="auto",
                 token=hf_token,
-                low_cpu_mem_usage=True
-                attn_implementation="eager"  # More stable than flash attention
+                low_cpu_mem_usage=True
             )
         else:
-            # CPU Configuration
+            # CPU Configuration
             print("💻 Using CPU with float32 precision")
             model = AutoModelForCausalLM.from_pretrained(
                 MODEL_NAME,
@@ -83,8 +95,7 @@ def load_model_and_tokenizer():
                 device_map={"": "cpu"},
                 token=hf_token,
                 low_cpu_mem_usage=True,
-                offload_folder="offload"
-                attn_implementation="eager"
+                offload_folder="offload"
             )
 
         # Set model to evaluation mode
@@ -103,8 +114,7 @@ def load_model_and_tokenizer():
             model=model,
             tokenizer=tokenizer,
             device=0 if device == "cuda" else -1,
-            torch_dtype=torch.bfloat16 if device == "cuda" else torch.float32
-            framework="pt"
+            torch_dtype=torch.bfloat16 if device == "cuda" else torch.float32
         )
 
         print("✅ Pipeline created successfully!")
@@ -114,6 +124,18 @@ def load_model_and_tokenizer():
 
         return gen_pipeline, tokenizer
 
+    except ValueError as ve:
+        if "rope_scaling" in str(ve):
+            print(f"\n❌ RoPE Scaling Error: {str(ve)}")
+            print("\n💡 SOLUTION:")
+            print("   This model requires transformers>=4.43.0 for LongRoPE support.")
+            print("   Please update requirements.txt with: transformers==4.45.2")
+            raise RuntimeError(
+                "Transformers version too old for this model. "
+                "Requires transformers>=4.43.0 for Phi-3 LongRoPE support."
+            ) from ve
+        raise
+
     except Exception as e:
         print(f"❌ Error during model loading: {str(e)}")
        print("\n🔍 Diagnostic Information:")
@@ -129,5 +151,5 @@ def load_model_and_tokenizer():
             "Please check: "
             "1) Internet connection, "
             "2) HuggingFace token (if model is private), "
-            "3) Transformers version (requires >=4.
+            "3) Transformers version (requires >=4.43.0 for Phi-3 LongRoPE)"
         ) from e
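Downstream of this diff, load_model_and_tokenizer() returns a standard transformers text-generation pipeline. A hypothetical usage sketch (the prompt and generation parameters are illustrative only, not the Space's actual API):

```python
# Hypothetical caller; prompt and parameters are examples for a smoke test.
from app.model_loader import load_model_and_tokenizer

gen_pipeline, tokenizer = load_model_and_tokenizer()
result = gen_pipeline(
    "Patient presents with acute bacterial sinusitis.",
    max_new_tokens=64,   # keep generation short for a quick check
    do_sample=False,     # deterministic output
)
print(result[0]["generated_text"])
```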
requirements.txt
CHANGED
@@ -1,21 +1,23 @@
 # Web Framework
-fastapi==0.
-uvicorn[standard]==0.
+fastapi==0.115.0
+uvicorn[standard]==0.30.6
 python-multipart==0.0.9
 
-# Machine Learning -
-transformers==4.
-torch==2.
-accelerate==0.
-safetensors==0.4.
+# Machine Learning - UPDATED FOR PHI-3 LONGROPE SUPPORT
+transformers==4.45.2
+torch==2.4.1
+accelerate==0.34.2
+safetensors==0.4.5
 sentencepiece==0.2.0
+tokenizers==0.20.1
 
 # Utilities
-pydantic==2.
-pydantic-settings==2.2
+pydantic==2.9.2
+pydantic-settings==2.5.2
 python-dotenv==1.0.1
-protobuf==
+protobuf==5.28.2
 einops==0.8.0
+huggingface-hub==0.25.1
 
 # Monitoring
-psutil==
+psutil==6.0.0
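A quick runtime sanity check that the pinned stack satisfies the LongRoPE floor (a sketch; packaging is not pinned here but is pulled in as a transformers dependency):

```python
# Sketch: assert the installed transformers meets the >=4.43.0 LongRoPE floor
# that this commit pins at 4.45.2.
from importlib.metadata import version
from packaging.version import Version

installed = Version(version("transformers"))
if installed < Version("4.43.0"):
    raise RuntimeError(
        f"transformers {installed} is too old for Phi-3 LongRoPE; need >=4.43.0"
    )
print(f"transformers {installed} supports Phi-3 LongRoPE")
```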