Spaces:

Distopia22
/

medical-coding-api

Sleeping

App Files Community

Distopia22 committed on Nov 19, 2025

Commit

61e7d9a

1 Parent(s): 7cd0e22

Fix: Add robust model loading with safetensors fallback strategies

Browse files

Files changed (3) hide show

Dockerfile +11 -5
app/model_loader.py +101 -47
requirements.txt +2 -2

Dockerfile CHANGED Viewed

@@ -5,9 +5,9 @@ WORKDIR /app
 ENV PYTHONUNBUFFERED=1 \
     PYTHONDONTWRITEBYTECODE=1 \
     PIP_NO_CACHE_DIR=1 \
-    PIP_DISABLE_PIP_VERSION_CHECK=1 \
     TRANSFORMERS_CACHE=/app/.cache/transformers \
-    HF_HOME=/app/.cache/huggingface
 # Install system dependencies
 RUN apt-get update && apt-get install -y --no-install-recommends \
@@ -16,6 +16,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
     build-essential \
     curl \
     ca-certificates \
     && git lfs install \
     && rm -rf /var/lib/apt/lists/* \
     && apt-get clean
@@ -23,6 +24,9 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
 # Upgrade pip
 RUN pip install --no-cache-dir --upgrade pip==24.2
 # Copy and install requirements
 COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
@@ -30,17 +34,19 @@ RUN pip install --no-cache-dir -r requirements.txt
 # Copy application
 COPY app/ ./app/
-# Create directories
 RUN mkdir -p /app/offload /app/.cache/transformers /app/.cache/huggingface && \
     chmod -R 777 /app/offload /app/.cache
 EXPOSE 7860
-HEALTHCHECK --interval=30s --timeout=20s --start-period=300s --retries=5 \
     CMD curl -f http://localhost:7860/health || exit 1
 CMD ["uvicorn", "app.api:app", \
     "--host", "0.0.0.0", \
     "--port", "7860", \
     "--timeout-keep-alive", "300", \
-    "--workers", "1"]

 ENV PYTHONUNBUFFERED=1 \
     PYTHONDONTWRITEBYTECODE=1 \
     PIP_NO_CACHE_DIR=1 \
     TRANSFORMERS_CACHE=/app/.cache/transformers \
+    HF_HOME=/app/.cache/huggingface \
+    HF_HUB_ENABLE_HF_TRANSFER=1
 # Install system dependencies
 RUN apt-get update && apt-get install -y --no-install-recommends \
     build-essential \
     curl \
     ca-certificates \
+    wget \
     && git lfs install \
     && rm -rf /var/lib/apt/lists/* \
     && apt-get clean
 # Upgrade pip
 RUN pip install --no-cache-dir --upgrade pip==24.2
+# Install hf_transfer for faster downloads (optional but helps)
+RUN pip install --no-cache-dir hf-transfer==0.1.8
 # Copy and install requirements
 COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
 # Copy application
 COPY app/ ./app/
+# Create directories with proper permissions
 RUN mkdir -p /app/offload /app/.cache/transformers /app/.cache/huggingface && \
     chmod -R 777 /app/offload /app/.cache
 EXPOSE 7860
+# Longer startup period for model download
+HEALTHCHECK --interval=30s --timeout=30s --start-period=600s --retries=5 \
     CMD curl -f http://localhost:7860/health || exit 1
 CMD ["uvicorn", "app.api:app", \
     "--host", "0.0.0.0", \
     "--port", "7860", \
     "--timeout-keep-alive", "300", \
+    "--workers", "1", \
+    "--log-level", "info"]

app/model_loader.py CHANGED Viewed

@@ -16,8 +16,8 @@ MODEL_NAME = "RayyanAhmed9477/med-coding"
 def load_model_and_tokenizer():
     """
-    Loads Phi-3 model with LongRoPE support.
-    Requires transformers>=4.43.0 for longrope rope_scaling type.
     """
     device = "cuda" if torch.cuda.is_available() else "cpu"
     print(f"🔧 Using device: {device}")
@@ -57,46 +57,110 @@ def load_model_and_tokenizer():
             token=hf_token
         )
-        # ✅ Handle LongRoPE configuration
         if hasattr(config, 'rope_scaling') and config.rope_scaling is not None:
             rope_type = config.rope_scaling.get('type', 'default')
             print(f"📐 RoPE scaling type detected: {rope_type}")
-            # LongRoPE is supported in transformers>=4.43.0
             if rope_type == 'longrope':
                 print("✅ LongRoPE configuration detected and supported")
         print(f"✅ Config loaded: {config.model_type}")
-        # ===== STEP 3: Load Model =====
         print(f"📥 Loading model: {MODEL_NAME}")
         print("⏳ This may take 2-5 minutes on first load...")
         if device == "cuda":
-            # GPU Configuration
-            print("🎮 Using GPU with bfloat16 precision")
-            model = AutoModelForCausalLM.from_pretrained(
-                MODEL_NAME,
-                config=config,
-                trust_remote_code=True,
-                torch_dtype=torch.bfloat16,
-                device_map="auto",
-                token=hf_token,
-                low_cpu_mem_usage=True
-            )
         else:
-            # CPU Configuration
-            print("💻 Using CPU with float32 precision")
-            model = AutoModelForCausalLM.from_pretrained(
-                MODEL_NAME,
-                config=config,
-                trust_remote_code=True,
-                torch_dtype=torch.float32,
-                device_map={"": "cpu"},
-                token=hf_token,
-                low_cpu_mem_usage=True,
-                offload_folder="offload"
-            )
         # Set model to evaluation mode
         model.eval()
@@ -105,7 +169,7 @@ def load_model_and_tokenizer():
         for param in model.parameters():
             param.requires_grad = False
-        print("✅ Model loaded successfully!")
         # ===== STEP 4: Create Pipeline =====
         print("🔧 Creating text generation pipeline...")
@@ -124,20 +188,8 @@ def load_model_and_tokenizer():
         return gen_pipeline, tokenizer
-    except ValueError as ve:
-        if "rope_scaling" in str(ve):
-            print(f"\n❌ RoPE Scaling Error: {str(ve)}")
-            print("\n💡 SOLUTION:")
-            print("   This model requires transformers>=4.43.0 for LongRoPE support.")
-            print("   Please update requirements.txt with: transformers==4.45.2")
-            raise RuntimeError(
-                "Transformers version too old for this model. "
-                "Requires transformers>=4.43.0 for Phi-3 LongRoPE support."
-            ) from ve
-        raise
     except Exception as e:
-        print(f"❌ Error during model loading: {str(e)}")
         print("\n🔍 Diagnostic Information:")
         print(f"   - Model: {MODEL_NAME}")
         print(f"   - Device: {device}")
@@ -148,8 +200,10 @@ def load_model_and_tokenizer():
         raise RuntimeError(
             f"Failed to load model {MODEL_NAME}. "
-            "Please check: "
-            "1) Internet connection, "
-            "2) HuggingFace token (if model is private), "
-            "3) Transformers version (requires >=4.43.0 for Phi-3 LongRoPE)"
         ) from e

 def load_model_and_tokenizer():
     """
+    Loads Phi-3 model with multiple fallback strategies.
+    Handles safetensors loading issues with robust error recovery.
     """
     device = "cuda" if torch.cuda.is_available() else "cpu"
     print(f"🔧 Using device: {device}")
             token=hf_token
         )
+        # Handle LongRoPE configuration
         if hasattr(config, 'rope_scaling') and config.rope_scaling is not None:
             rope_type = config.rope_scaling.get('type', 'default')
             print(f"📐 RoPE scaling type detected: {rope_type}")
             if rope_type == 'longrope':
                 print("✅ LongRoPE configuration detected and supported")
         print(f"✅ Config loaded: {config.model_type}")
+        # ===== STEP 3: Load Model with Multiple Strategies =====
         print(f"📥 Loading model: {MODEL_NAME}")
         print("⏳ This may take 2-5 minutes on first load...")
+        model = None
+        loading_strategies = []
         if device == "cuda":
+            loading_strategies = [
+                # Strategy 1: Standard GPU loading
+                {
+                    "name": "GPU Standard",
+                    "params": {
+                        "trust_remote_code": True,
+                        "torch_dtype": torch.bfloat16,
+                        "device_map": "auto",
+                        "token": hf_token,
+                        "low_cpu_mem_usage": True
+                    }
+                }
+            ]
         else:
+            loading_strategies = [
+                # Strategy 1: CPU with safetensors (preferred)
+                {
+                    "name": "CPU with safetensors",
+                    "params": {
+                        "trust_remote_code": True,
+                        "torch_dtype": torch.float32,
+                        "device_map": {"": "cpu"},
+                        "token": hf_token,
+                        "low_cpu_mem_usage": True,
+                        "use_safetensors": True
+                    }
+                },
+                # Strategy 2: CPU without explicit safetensors
+                {
+                    "name": "CPU standard",
+                    "params": {
+                        "trust_remote_code": True,
+                        "torch_dtype": torch.float32,
+                        "token": hf_token,
+                        "low_cpu_mem_usage": True
+                    }
+                },
+                # Strategy 3: CPU with PyTorch weights fallback
+                {
+                    "name": "CPU PyTorch weights",
+                    "params": {
+                        "trust_remote_code": True,
+                        "torch_dtype": torch.float32,
+                        "token": hf_token,
+                        "low_cpu_mem_usage": True,
+                        "use_safetensors": False
+                    }
+                },
+                # Strategy 4: Minimal parameters
+                {
+                    "name": "CPU minimal",
+                    "params": {
+                        "trust_remote_code": True,
+                        "token": hf_token
+                    }
+                }
+            ]
+        # Try each loading strategy
+        for idx, strategy in enumerate(loading_strategies, 1):
+            try:
+                print(f"\n🔄 Attempt {idx}/{len(loading_strategies)}: {strategy['name']}")
+                model = AutoModelForCausalLM.from_pretrained(
+                    MODEL_NAME,
+                    config=config,
+                    **strategy['params']
+                )
+                # Move to CPU explicitly if needed
+                if device == "cpu" and not strategy['params'].get('device_map'):
+                    model = model.to("cpu")
+                print(f"✅ Model loaded successfully using: {strategy['name']}")
+                break
+            except Exception as e:
+                print(f"❌ Strategy '{strategy['name']}' failed: {str(e)}")
+                if idx == len(loading_strategies):
+                    # All strategies failed
+                    raise
+                else:
+                    print(f"⏭️  Trying next strategy...")
+                    continue
+        if model is None:
+            raise RuntimeError("All loading strategies failed")
         # Set model to evaluation mode
         model.eval()
         for param in model.parameters():
             param.requires_grad = False
+        print("\n✅ Model fully loaded and ready!")
         # ===== STEP 4: Create Pipeline =====
         print("🔧 Creating text generation pipeline...")
         return gen_pipeline, tokenizer
     except Exception as e:
+        print(f"\n❌ Error during model loading: {str(e)}")
         print("\n🔍 Diagnostic Information:")
         print(f"   - Model: {MODEL_NAME}")
         print(f"   - Device: {device}")
         raise RuntimeError(
             f"Failed to load model {MODEL_NAME}. "
+            "All loading strategies exhausted. "
+            "This could be due to: "
+            "1) Model file corruption during download, "
+            "2) Insufficient memory, "
+            "3) Model incompatibility. "
+            "Try upgrading Space to GPU or use a different model."
         ) from e

requirements.txt CHANGED Viewed

@@ -3,11 +3,11 @@ fastapi==0.115.0
 uvicorn[standard]==0.30.6
 python-multipart==0.0.9
-# Machine Learning - UPDATED FOR PHI-3 LONGROPE SUPPORT
 transformers==4.45.2
 torch==2.4.1
 accelerate==0.34.2
-safetensors==0.4.5
 sentencepiece==0.2.0
 tokenizers==0.20.1

 uvicorn[standard]==0.30.6
 python-multipart==0.0.9
+# Machine Learning - COMPATIBLE VERSIONS
 transformers==4.45.2
 torch==2.4.1
 accelerate==0.34.2
+safetensors==0.4.3
 sentencepiece==0.2.0
 tokenizers==0.20.1