Distopia22 committed
Commit 7cd0e22 Β· 1 Parent(s): 24c7b48

Fix: Update transformers to 4.45.2 for Phi-3 LongRoPE support

Files changed (3):
  1. Dockerfile +7 -19
  2. app/model_loader.py +37 -15
  3. requirements.txt +13 -11
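
Before rebuilding, it can help to confirm the environment really resolves a LongRoPE-capable transformers. A minimal pre-deploy sketch (not part of the commit; assumes packaging is importable, as it is in any transformers install):

    # Sketch: gate on the >=4.43.0 floor this commit targets for
    # Phi-3 LongRoPE support.
    from importlib.metadata import version
    from packaging.version import Version

    installed = Version(version("transformers"))
    if installed < Version("4.43.0"):
        raise SystemExit(f"transformers {installed} predates LongRoPE support")
    print(f"OK: transformers {installed} handles rope_scaling type 'longrope'")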
Dockerfile CHANGED
@@ -1,16 +1,13 @@
 FROM python:3.10-slim
 
-# Set working directory
 WORKDIR /app
 
-# Set environment variables
 ENV PYTHONUNBUFFERED=1 \
     PYTHONDONTWRITEBYTECODE=1 \
     PIP_NO_CACHE_DIR=1 \
     PIP_DISABLE_PIP_VERSION_CHECK=1 \
     TRANSFORMERS_CACHE=/app/.cache/transformers \
-    HF_HOME=/app/.cache/huggingface \
-    DEBIAN_FRONTEND=noninteractive
+    HF_HOME=/app/.cache/huggingface
 
 # Install system dependencies
 RUN apt-get update && apt-get install -y --no-install-recommends \
@@ -23,36 +20,27 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
     && rm -rf /var/lib/apt/lists/* \
     && apt-get clean
 
-# Upgrade pip and install build tools
-RUN pip install --no-cache-dir --upgrade \
-    pip==24.0 \
-    setuptools==69.5.1 \
-    wheel==0.43.0
+# Upgrade pip
+RUN pip install --no-cache-dir --upgrade pip==24.2
 
-# Copy requirements first for better Docker caching
+# Copy and install requirements
 COPY requirements.txt .
-
-# Install Python dependencies
 RUN pip install --no-cache-dir -r requirements.txt
 
-# Copy application code
+# Copy application
 COPY app/ ./app/
 
-# Create necessary directories with proper permissions
+# Create directories
 RUN mkdir -p /app/offload /app/.cache/transformers /app/.cache/huggingface && \
     chmod -R 777 /app/offload /app/.cache
 
-# Expose port 7860 (HuggingFace Spaces standard)
 EXPOSE 7860
 
-# Health check - more lenient for model loading
 HEALTHCHECK --interval=30s --timeout=20s --start-period=300s --retries=5 \
     CMD curl -f http://localhost:7860/health || exit 1
 
-# Run the application with increased timeouts
 CMD ["uvicorn", "app.api:app", \
      "--host", "0.0.0.0", \
      "--port", "7860", \
      "--timeout-keep-alive", "300", \
-     "--workers", "1", \
-     "--log-level", "info"]
+     "--workers", "1"]
 
app/model_loader.py CHANGED
@@ -16,8 +16,8 @@ MODEL_NAME = "RayyanAhmed9477/med-coding"
 
 def load_model_and_tokenizer():
     """
-    Loads Phi-3 model with comprehensive error handling and fallbacks.
-    Supports both CPU and GPU with automatic detection.
+    Loads Phi-3 model with LongRoPE support.
+    Requires transformers>=4.43.0 for longrope rope_scaling type.
     """
     device = "cuda" if torch.cuda.is_available() else "cpu"
     print(f"πŸ”§ Using device: {device}")
@@ -26,16 +26,19 @@ def load_model_and_tokenizer():
 
     # Get HuggingFace token from environment
     hf_token = os.getenv("HF_TOKEN")
+    if hf_token:
+        print("πŸ”‘ HuggingFace token found")
+    else:
+        print("⚠️ No HuggingFace token - assuming public model")
 
     try:
         # ===== STEP 1: Load Tokenizer =====
         print(f"πŸ“₯ Loading tokenizer: {MODEL_NAME}")
         tokenizer = AutoTokenizer.from_pretrained(
             MODEL_NAME,
-            trust_remote_code=True,  # Critical for Phi-3
+            trust_remote_code=True,
             token=hf_token,
-            use_fast=True,
-            legacy=False
+            use_fast=True
         )
 
         # Configure tokenizer
@@ -46,13 +49,23 @@
 
         print("βœ… Tokenizer loaded successfully")
 
-        # ===== STEP 2: Load Configuration with trust_remote_code =====
+        # ===== STEP 2: Load Configuration =====
         print(f"πŸ“₯ Loading model configuration: {MODEL_NAME}")
         config = AutoConfig.from_pretrained(
             MODEL_NAME,
-            trust_remote_code=True,  # Critical for Phi-3
+            trust_remote_code=True,
             token=hf_token
         )
+
+        # βœ… Handle LongRoPE configuration
+        if hasattr(config, 'rope_scaling') and config.rope_scaling is not None:
+            rope_type = config.rope_scaling.get('type', 'default')
+            print(f"πŸ“ RoPE scaling type detected: {rope_type}")
+
+            # LongRoPE is supported in transformers>=4.43.0
+            if rope_type == 'longrope':
+                print("βœ… LongRoPE configuration detected and supported")
+
         print(f"βœ… Config loaded: {config.model_type}")
 
         # ===== STEP 3: Load Model =====
@@ -69,11 +82,10 @@
                 torch_dtype=torch.bfloat16,
                 device_map="auto",
                 token=hf_token,
-                low_cpu_mem_usage=True,
-                attn_implementation="eager"  # More stable than flash attention
+                low_cpu_mem_usage=True
             )
         else:
-            # CPU Configuration - optimized for stability
+            # CPU Configuration
             print("πŸ’» Using CPU with float32 precision")
             model = AutoModelForCausalLM.from_pretrained(
                 MODEL_NAME,
@@ -83,8 +95,7 @@
                 device_map={"": "cpu"},
                 token=hf_token,
                 low_cpu_mem_usage=True,
-                offload_folder="offload",
-                attn_implementation="eager"
+                offload_folder="offload"
             )
 
         # Set model to evaluation mode
@@ -103,8 +114,7 @@
             model=model,
             tokenizer=tokenizer,
             device=0 if device == "cuda" else -1,
-            torch_dtype=torch.bfloat16 if device == "cuda" else torch.float32,
-            framework="pt"
+            torch_dtype=torch.bfloat16 if device == "cuda" else torch.float32
         )
 
         print("βœ… Pipeline created successfully!")
@@ -114,6 +124,18 @@
 
         return gen_pipeline, tokenizer
 
+    except ValueError as ve:
+        if "rope_scaling" in str(ve):
+            print(f"\n❌ RoPE Scaling Error: {str(ve)}")
+            print("\nπŸ’‘ SOLUTION:")
+            print("   This model requires transformers>=4.43.0 for LongRoPE support.")
+            print("   Please update requirements.txt with: transformers==4.45.2")
+            raise RuntimeError(
+                "Transformers version too old for this model. "
+                "Requires transformers>=4.43.0 for Phi-3 LongRoPE support."
+            ) from ve
+        raise
+
     except Exception as e:
         print(f"❌ Error during model loading: {str(e)}")
         print("\nπŸ” Diagnostic Information:")
@@ -129,5 +151,5 @@
             "Please check: "
             "1) Internet connection, "
             "2) HuggingFace token (if model is private), "
-            "3) Transformers version (requires >=4.36.0 for Phi-3)"
+            "3) Transformers version (requires >=4.43.0 for Phi-3 LongRoPE)"
         ) from e
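
The new rope_scaling branch only reads the config, so it can be exercised without downloading weights. A standalone sketch of the same check (mirrors the loader above; nothing here is new API):

    # Sketch: fetch only config.json and inspect the RoPE scaling entry,
    # exactly as the loader's STEP 2 does.
    from transformers import AutoConfig

    MODEL_NAME = "RayyanAhmed9477/med-coding"

    config = AutoConfig.from_pretrained(MODEL_NAME, trust_remote_code=True)
    rope = getattr(config, "rope_scaling", None)
    if rope is not None:
        # Phi-3 checkpoints store {"type": "longrope", ...} here.
        print("rope_scaling type:", rope.get("type", "default"))
    else:
        print("no rope_scaling entry - standard RoPE")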
requirements.txt CHANGED
@@ -1,21 +1,23 @@
 # Web Framework
-fastapi==0.109.2
-uvicorn[standard]==0.27.1
+fastapi==0.115.0
+uvicorn[standard]==0.30.6
 python-multipart==0.0.9
 
-# Machine Learning - CRITICAL VERSIONS FOR PHI-3
-transformers==4.41.2
-torch==2.2.2
-accelerate==0.30.1
-safetensors==0.4.3
+# Machine Learning - UPDATED FOR PHI-3 LONGROPE SUPPORT
+transformers==4.45.2
+torch==2.4.1
+accelerate==0.34.2
+safetensors==0.4.5
 sentencepiece==0.2.0
+tokenizers==0.20.1
 
 # Utilities
-pydantic==2.7.1
-pydantic-settings==2.2.1
+pydantic==2.9.2
+pydantic-settings==2.5.2
 python-dotenv==1.0.1
-protobuf==4.25.3
+protobuf==5.28.2
 einops==0.8.0
+huggingface-hub==0.25.1
 
 # Monitoring
-psutil==5.9.8
+psutil==6.0.0
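
After installing the new pins, a quick sanity pass (a sketch; assumes pip is on PATH in the same environment):

    # Sketch: print the resolved versions of the key pins above, then let
    # pip's resolver flag any conflicting requirements.
    import subprocess
    from importlib.metadata import version

    for pkg in ("transformers", "torch", "accelerate", "tokenizers",
                "huggingface-hub", "fastapi"):
        print(f"{pkg}=={version(pkg)}")

    subprocess.run(["pip", "check"], check=True)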