Distopia22 committed on
Commit
61e7d9a
·
1 Parent(s): 7cd0e22

Fix: Add robust model loading with safetensors fallback strategies

Browse files
Files changed (3) hide show
  1. Dockerfile +11 -5
  2. app/model_loader.py +101 -47
  3. requirements.txt +2 -2
Dockerfile CHANGED
@@ -5,9 +5,9 @@ WORKDIR /app
5
  ENV PYTHONUNBUFFERED=1 \
6
  PYTHONDONTWRITEBYTECODE=1 \
7
  PIP_NO_CACHE_DIR=1 \
8
- PIP_DISABLE_PIP_VERSION_CHECK=1 \
9
  TRANSFORMERS_CACHE=/app/.cache/transformers \
10
- HF_HOME=/app/.cache/huggingface
 
11
 
12
  # Install system dependencies
13
  RUN apt-get update && apt-get install -y --no-install-recommends \
@@ -16,6 +16,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
16
  build-essential \
17
  curl \
18
  ca-certificates \
 
19
  && git lfs install \
20
  && rm -rf /var/lib/apt/lists/* \
21
  && apt-get clean
@@ -23,6 +24,9 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
23
  # Upgrade pip
24
  RUN pip install --no-cache-dir --upgrade pip==24.2
25
 
 
 
 
26
  # Copy and install requirements
27
  COPY requirements.txt .
28
  RUN pip install --no-cache-dir -r requirements.txt
@@ -30,17 +34,19 @@ RUN pip install --no-cache-dir -r requirements.txt
30
  # Copy application
31
  COPY app/ ./app/
32
 
33
- # Create directories
34
  RUN mkdir -p /app/offload /app/.cache/transformers /app/.cache/huggingface && \
35
  chmod -R 777 /app/offload /app/.cache
36
 
37
  EXPOSE 7860
38
 
39
- HEALTHCHECK --interval=30s --timeout=20s --start-period=300s --retries=5 \
 
40
  CMD curl -f http://localhost:7860/health || exit 1
41
 
42
  CMD ["uvicorn", "app.api:app", \
43
  "--host", "0.0.0.0", \
44
  "--port", "7860", \
45
  "--timeout-keep-alive", "300", \
46
- "--workers", "1"]
 
 
5
  ENV PYTHONUNBUFFERED=1 \
6
  PYTHONDONTWRITEBYTECODE=1 \
7
  PIP_NO_CACHE_DIR=1 \
 
8
  TRANSFORMERS_CACHE=/app/.cache/transformers \
9
+ HF_HOME=/app/.cache/huggingface \
10
+ HF_HUB_ENABLE_HF_TRANSFER=1
11
 
12
  # Install system dependencies
13
  RUN apt-get update && apt-get install -y --no-install-recommends \
 
16
  build-essential \
17
  curl \
18
  ca-certificates \
19
+ wget \
20
  && git lfs install \
21
  && rm -rf /var/lib/apt/lists/* \
22
  && apt-get clean
 
24
  # Upgrade pip
25
  RUN pip install --no-cache-dir --upgrade pip==24.2
26
 
27
+ # Install hf_transfer for faster downloads (required, since HF_HUB_ENABLE_HF_TRANSFER=1 is set)
28
+ RUN pip install --no-cache-dir hf-transfer==0.1.8
29
+
30
  # Copy and install requirements
31
  COPY requirements.txt .
32
  RUN pip install --no-cache-dir -r requirements.txt
 
34
  # Copy application
35
  COPY app/ ./app/
36
 
37
+ # Create directories with proper permissions
38
  RUN mkdir -p /app/offload /app/.cache/transformers /app/.cache/huggingface && \
39
  chmod -R 777 /app/offload /app/.cache
40
 
41
  EXPOSE 7860
42
 
43
+ # Longer startup period for model download
44
+ HEALTHCHECK --interval=30s --timeout=30s --start-period=600s --retries=5 \
45
  CMD curl -f http://localhost:7860/health || exit 1
46
 
47
  CMD ["uvicorn", "app.api:app", \
48
  "--host", "0.0.0.0", \
49
  "--port", "7860", \
50
  "--timeout-keep-alive", "300", \
51
+ "--workers", "1", \
52
+ "--log-level", "info"]
app/model_loader.py CHANGED
@@ -16,8 +16,8 @@ MODEL_NAME = "RayyanAhmed9477/med-coding"
16
 
17
  def load_model_and_tokenizer():
18
  """
19
- Loads Phi-3 model with LongRoPE support.
20
- Requires transformers>=4.43.0 for longrope rope_scaling type.
21
  """
22
  device = "cuda" if torch.cuda.is_available() else "cpu"
23
  print(f"🔧 Using device: {device}")
@@ -57,46 +57,110 @@ def load_model_and_tokenizer():
57
  token=hf_token
58
  )
59
 
60
- # Handle LongRoPE configuration
61
  if hasattr(config, 'rope_scaling') and config.rope_scaling is not None:
62
  rope_type = config.rope_scaling.get('type', 'default')
63
  print(f"📐 RoPE scaling type detected: {rope_type}")
64
-
65
- # LongRoPE is supported in transformers>=4.43.0
66
  if rope_type == 'longrope':
67
  print("✅ LongRoPE configuration detected and supported")
68
 
69
  print(f"✅ Config loaded: {config.model_type}")
70
 
71
- # ===== STEP 3: Load Model =====
72
  print(f"📥 Loading model: {MODEL_NAME}")
73
  print("⏳ This may take 2-5 minutes on first load...")
74
 
 
 
 
75
  if device == "cuda":
76
- # GPU Configuration
77
- print("🎮 Using GPU with bfloat16 precision")
78
- model = AutoModelForCausalLM.from_pretrained(
79
- MODEL_NAME,
80
- config=config,
81
- trust_remote_code=True,
82
- torch_dtype=torch.bfloat16,
83
- device_map="auto",
84
- token=hf_token,
85
- low_cpu_mem_usage=True
86
- )
 
 
87
  else:
88
- # CPU Configuration
89
- print("💻 Using CPU with float32 precision")
90
- model = AutoModelForCausalLM.from_pretrained(
91
- MODEL_NAME,
92
- config=config,
93
- trust_remote_code=True,
94
- torch_dtype=torch.float32,
95
- device_map={"": "cpu"},
96
- token=hf_token,
97
- low_cpu_mem_usage=True,
98
- offload_folder="offload"
99
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
 
101
  # Set model to evaluation mode
102
  model.eval()
@@ -105,7 +169,7 @@ def load_model_and_tokenizer():
105
  for param in model.parameters():
106
  param.requires_grad = False
107
 
108
- print("✅ Model loaded successfully!")
109
 
110
  # ===== STEP 4: Create Pipeline =====
111
  print("🔧 Creating text generation pipeline...")
@@ -124,20 +188,8 @@ def load_model_and_tokenizer():
124
 
125
  return gen_pipeline, tokenizer
126
 
127
- except ValueError as ve:
128
- if "rope_scaling" in str(ve):
129
- print(f"\n❌ RoPE Scaling Error: {str(ve)}")
130
- print("\n💡 SOLUTION:")
131
- print(" This model requires transformers>=4.43.0 for LongRoPE support.")
132
- print(" Please update requirements.txt with: transformers==4.45.2")
133
- raise RuntimeError(
134
- "Transformers version too old for this model. "
135
- "Requires transformers>=4.43.0 for Phi-3 LongRoPE support."
136
- ) from ve
137
- raise
138
-
139
  except Exception as e:
140
- print(f"❌ Error during model loading: {str(e)}")
141
  print("\n🔍 Diagnostic Information:")
142
  print(f" - Model: {MODEL_NAME}")
143
  print(f" - Device: {device}")
@@ -148,8 +200,10 @@ def load_model_and_tokenizer():
148
 
149
  raise RuntimeError(
150
  f"Failed to load model {MODEL_NAME}. "
151
- "Please check: "
152
- "1) Internet connection, "
153
- "2) HuggingFace token (if model is private), "
154
- "3) Transformers version (requires >=4.43.0 for Phi-3 LongRoPE)"
 
 
155
  ) from e
 
16
 
17
  def load_model_and_tokenizer():
18
  """
19
+ Loads Phi-3 model with multiple fallback strategies.
20
+ Handles safetensors loading issues with robust error recovery.
21
  """
22
  device = "cuda" if torch.cuda.is_available() else "cpu"
23
  print(f"🔧 Using device: {device}")
 
57
  token=hf_token
58
  )
59
 
60
+ # Handle LongRoPE configuration
61
  if hasattr(config, 'rope_scaling') and config.rope_scaling is not None:
62
  rope_type = config.rope_scaling.get('type', 'default')
63
  print(f"📐 RoPE scaling type detected: {rope_type}")
 
 
64
  if rope_type == 'longrope':
65
  print("✅ LongRoPE configuration detected and supported")
66
 
67
  print(f"✅ Config loaded: {config.model_type}")
68
 
69
+ # ===== STEP 3: Load Model with Multiple Strategies =====
70
  print(f"📥 Loading model: {MODEL_NAME}")
71
  print("⏳ This may take 2-5 minutes on first load...")
72
 
73
+ model = None
74
+ loading_strategies = []
75
+
76
  if device == "cuda":
77
+ loading_strategies = [
78
+ # Strategy 1: Standard GPU loading
79
+ {
80
+ "name": "GPU Standard",
81
+ "params": {
82
+ "trust_remote_code": True,
83
+ "torch_dtype": torch.bfloat16,
84
+ "device_map": "auto",
85
+ "token": hf_token,
86
+ "low_cpu_mem_usage": True
87
+ }
88
+ }
89
+ ]
90
  else:
91
+ loading_strategies = [
92
+ # Strategy 1: CPU with safetensors (preferred)
93
+ {
94
+ "name": "CPU with safetensors",
95
+ "params": {
96
+ "trust_remote_code": True,
97
+ "torch_dtype": torch.float32,
98
+ "device_map": {"": "cpu"},
99
+ "token": hf_token,
100
+ "low_cpu_mem_usage": True,
101
+ "use_safetensors": True
102
+ }
103
+ },
104
+ # Strategy 2: CPU without explicit safetensors
105
+ {
106
+ "name": "CPU standard",
107
+ "params": {
108
+ "trust_remote_code": True,
109
+ "torch_dtype": torch.float32,
110
+ "token": hf_token,
111
+ "low_cpu_mem_usage": True
112
+ }
113
+ },
114
+ # Strategy 3: CPU with PyTorch weights fallback
115
+ {
116
+ "name": "CPU PyTorch weights",
117
+ "params": {
118
+ "trust_remote_code": True,
119
+ "torch_dtype": torch.float32,
120
+ "token": hf_token,
121
+ "low_cpu_mem_usage": True,
122
+ "use_safetensors": False
123
+ }
124
+ },
125
+ # Strategy 4: Minimal parameters
126
+ {
127
+ "name": "CPU minimal",
128
+ "params": {
129
+ "trust_remote_code": True,
130
+ "token": hf_token
131
+ }
132
+ }
133
+ ]
134
+
135
+ # Try each loading strategy
136
+ for idx, strategy in enumerate(loading_strategies, 1):
137
+ try:
138
+ print(f"\n🔄 Attempt {idx}/{len(loading_strategies)}: {strategy['name']}")
139
+
140
+ model = AutoModelForCausalLM.from_pretrained(
141
+ MODEL_NAME,
142
+ config=config,
143
+ **strategy['params']
144
+ )
145
+
146
+ # Move to CPU explicitly if needed
147
+ if device == "cpu" and not strategy['params'].get('device_map'):
148
+ model = model.to("cpu")
149
+
150
+ print(f"✅ Model loaded successfully using: {strategy['name']}")
151
+ break
152
+
153
+ except Exception as e:
154
+ print(f"❌ Strategy '{strategy['name']}' failed: {str(e)}")
155
+ if idx == len(loading_strategies):
156
+ # All strategies failed
157
+ raise
158
+ else:
159
+ print(f"⏭️ Trying next strategy...")
160
+ continue
161
+
162
+ if model is None:
163
+ raise RuntimeError("All loading strategies failed")
164
 
165
  # Set model to evaluation mode
166
  model.eval()
 
169
  for param in model.parameters():
170
  param.requires_grad = False
171
 
172
+ print("\n✅ Model fully loaded and ready!")
173
 
174
  # ===== STEP 4: Create Pipeline =====
175
  print("🔧 Creating text generation pipeline...")
 
188
 
189
  return gen_pipeline, tokenizer
190
 
 
 
 
 
 
 
 
 
 
 
 
 
191
  except Exception as e:
192
+ print(f"\n❌ Error during model loading: {str(e)}")
193
  print("\n🔍 Diagnostic Information:")
194
  print(f" - Model: {MODEL_NAME}")
195
  print(f" - Device: {device}")
 
200
 
201
  raise RuntimeError(
202
  f"Failed to load model {MODEL_NAME}. "
203
+ "All loading strategies exhausted. "
204
+ "This could be due to: "
205
+ "1) Model file corruption during download, "
206
+ "2) Insufficient memory, "
207
+ "3) Model incompatibility. "
208
+ "Try upgrading Space to GPU or use a different model."
209
  ) from e
requirements.txt CHANGED
@@ -3,11 +3,11 @@ fastapi==0.115.0
3
  uvicorn[standard]==0.30.6
4
  python-multipart==0.0.9
5
 
6
- # Machine Learning - UPDATED FOR PHI-3 LONGROPE SUPPORT
7
  transformers==4.45.2
8
  torch==2.4.1
9
  accelerate==0.34.2
10
- safetensors==0.4.5
11
  sentencepiece==0.2.0
12
  tokenizers==0.20.1
13
 
 
3
  uvicorn[standard]==0.30.6
4
  python-multipart==0.0.9
5
 
6
+ # Machine Learning - COMPATIBLE VERSIONS
7
  transformers==4.45.2
8
  torch==2.4.1
9
  accelerate==0.34.2
10
+ safetensors==0.4.3
11
  sentencepiece==0.2.0
12
  tokenizers==0.20.1
13