Distopia22 committed
Commit 7cd0e22 Β· 1 Parent(s): 24c7b48

Fix: Update transformers to 4.45.2 for Phi-3 LongRoPE support

Files changed (3):
  1. Dockerfile +7 -19
  2. app/model_loader.py +37 -15
  3. requirements.txt +13 -11
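
Before rebuilding, it can help to confirm the environment really resolves a LongRoPE-capable transformers. A minimal pre-deploy sketch (not part of the commit; assumes packaging is importable, as it is in any transformers install):

    # Sketch: gate on the >=4.43.0 floor this commit targets for
    # Phi-3 LongRoPE support.
    from importlib.metadata import version
    from packaging.version import Version

    installed = Version(version("transformers"))
    if installed < Version("4.43.0"):
        raise SystemExit(f"transformers {installed} predates LongRoPE support")
    print(f"OK: transformers {installed} handles rope_scaling type 'longrope'")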
Dockerfile CHANGED
@@ -1,16 +1,13 @@
 FROM python:3.10-slim
 
-# Set working directory
 WORKDIR /app
 
-# Set environment variables
 ENV PYTHONUNBUFFERED=1 \
     PYTHONDONTWRITEBYTECODE=1 \
     PIP_NO_CACHE_DIR=1 \
     PIP_DISABLE_PIP_VERSION_CHECK=1 \
     TRANSFORMERS_CACHE=/app/.cache/transformers \
-    HF_HOME=/app/.cache/huggingface \
-    DEBIAN_FRONTEND=noninteractive
+    HF_HOME=/app/.cache/huggingface
 
 # Install system dependencies
 RUN apt-get update && apt-get install -y --no-install-recommends \
@@ -23,36 +20,27 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
     && rm -rf /var/lib/apt/lists/* \
     && apt-get clean
 
-# Upgrade pip and install build tools
-RUN pip install --no-cache-dir --upgrade \
-    pip==24.0 \
-    setuptools==69.5.1 \
-    wheel==0.43.0
+# Upgrade pip
+RUN pip install --no-cache-dir --upgrade pip==24.2
 
-# Copy requirements first for better Docker caching
+# Copy and install requirements
 COPY requirements.txt .
-
-# Install Python dependencies
 RUN pip install --no-cache-dir -r requirements.txt
 
-# Copy application code
+# Copy application
 COPY app/ ./app/
 
-# Create necessary directories with proper permissions
+# Create directories
 RUN mkdir -p /app/offload /app/.cache/transformers /app/.cache/huggingface && \
     chmod -R 777 /app/offload /app/.cache
 
-# Expose port 7860 (HuggingFace Spaces standard)
 EXPOSE 7860
 
-# Health check - more lenient for model loading
 HEALTHCHECK --interval=30s --timeout=20s --start-period=300s --retries=5 \
     CMD curl -f http://localhost:7860/health || exit 1
 
-# Run the application with increased timeouts
 CMD ["uvicorn", "app.api:app", \
      "--host", "0.0.0.0", \
      "--port", "7860", \
      "--timeout-keep-alive", "300", \
-     "--workers", "1", \
-     "--log-level", "info"]
+     "--workers", "1"]
 
app/model_loader.py CHANGED
@@ -16,8 +16,8 @@ MODEL_NAME = "RayyanAhmed9477/med-coding"
 
 def load_model_and_tokenizer():
     """
-    Loads Phi-3 model with comprehensive error handling and fallbacks.
-    Supports both CPU and GPU with automatic detection.
+    Loads Phi-3 model with LongRoPE support.
+    Requires transformers>=4.43.0 for longrope rope_scaling type.
     """
     device = "cuda" if torch.cuda.is_available() else "cpu"
     print(f"πŸ”§ Using device: {device}")
@@ -26,16 +26,19 @@ def load_model_and_tokenizer():
 
     # Get HuggingFace token from environment
     hf_token = os.getenv("HF_TOKEN")
+    if hf_token:
+        print("πŸ”‘ HuggingFace token found")
+    else:
+        print("⚠️ No HuggingFace token - assuming public model")
 
     try:
         # ===== STEP 1: Load Tokenizer =====
         print(f"πŸ“₯ Loading tokenizer: {MODEL_NAME}")
         tokenizer = AutoTokenizer.from_pretrained(
             MODEL_NAME,
-            trust_remote_code=True,  # Critical for Phi-3
+            trust_remote_code=True,
             token=hf_token,
-            use_fast=True,
-            legacy=False
+            use_fast=True
         )
 
         # Configure tokenizer
@@ -46,13 +49,23 @@
 
         print("βœ… Tokenizer loaded successfully")
 
-        # ===== STEP 2: Load Configuration with trust_remote_code =====
+        # ===== STEP 2: Load Configuration =====
         print(f"πŸ“₯ Loading model configuration: {MODEL_NAME}")
         config = AutoConfig.from_pretrained(
             MODEL_NAME,
-            trust_remote_code=True,  # Critical for Phi-3
+            trust_remote_code=True,
             token=hf_token
         )
+
+        # βœ… Handle LongRoPE configuration
+        if hasattr(config, 'rope_scaling') and config.rope_scaling is not None:
+            rope_type = config.rope_scaling.get('type', 'default')
+            print(f"πŸ“ RoPE scaling type detected: {rope_type}")
+
+            # LongRoPE is supported in transformers>=4.43.0
+            if rope_type == 'longrope':
+                print("βœ… LongRoPE configuration detected and supported")
+
         print(f"βœ… Config loaded: {config.model_type}")
 
         # ===== STEP 3: Load Model =====
@@ -69,11 +82,10 @@
                 torch_dtype=torch.bfloat16,
                 device_map="auto",
                 token=hf_token,
-                low_cpu_mem_usage=True,
-                attn_implementation="eager"  # More stable than flash attention
+                low_cpu_mem_usage=True
             )
         else:
-            # CPU Configuration - optimized for stability
+            # CPU Configuration
             print("πŸ’» Using CPU with float32 precision")
             model = AutoModelForCausalLM.from_pretrained(
                 MODEL_NAME,
@@ -83,8 +95,7 @@
                 device_map={"": "cpu"},
                 token=hf_token,
                 low_cpu_mem_usage=True,
-                offload_folder="offload",
-                attn_implementation="eager"
+                offload_folder="offload"
             )
 
         # Set model to evaluation mode
@@ -103,8 +114,7 @@
             model=model,
             tokenizer=tokenizer,
             device=0 if device == "cuda" else -1,
-            torch_dtype=torch.bfloat16 if device == "cuda" else torch.float32,
-            framework="pt"
+            torch_dtype=torch.bfloat16 if device == "cuda" else torch.float32
         )
 
         print("βœ… Pipeline created successfully!")
@@ -114,6 +124,18 @@
 
         return gen_pipeline, tokenizer
 
+    except ValueError as ve:
+        if "rope_scaling" in str(ve):
+            print(f"\n❌ RoPE Scaling Error: {str(ve)}")
+            print("\nπŸ’‘ SOLUTION:")
+            print("   This model requires transformers>=4.43.0 for LongRoPE support.")
+            print("   Please update requirements.txt with: transformers==4.45.2")
+            raise RuntimeError(
+                "Transformers version too old for this model. "
+                "Requires transformers>=4.43.0 for Phi-3 LongRoPE support."
+            ) from ve
+        raise
+
     except Exception as e:
         print(f"❌ Error during model loading: {str(e)}")
         print("\nπŸ” Diagnostic Information:")
@@ -129,5 +151,5 @@
             "Please check: "
             "1) Internet connection, "
             "2) HuggingFace token (if model is private), "
-            "3) Transformers version (requires >=4.36.0 for Phi-3)"
+            "3) Transformers version (requires >=4.43.0 for Phi-3 LongRoPE)"
         ) from e
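
The new rope_scaling branch only reads the config, so it can be exercised without downloading weights. A standalone sketch of the same check (mirrors the loader above; nothing here is new API):

    # Sketch: fetch only config.json and inspect the RoPE scaling entry,
    # exactly as the loader's STEP 2 does.
    from transformers import AutoConfig

    MODEL_NAME = "RayyanAhmed9477/med-coding"

    config = AutoConfig.from_pretrained(MODEL_NAME, trust_remote_code=True)
    rope = getattr(config, "rope_scaling", None)
    if rope is not None:
        # Phi-3 checkpoints store {"type": "longrope", ...} here.
        print("rope_scaling type:", rope.get("type", "default"))
    else:
        print("no rope_scaling entry - standard RoPE")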
requirements.txt CHANGED
@@ -1,21 +1,23 @@
 # Web Framework
-fastapi==0.109.2
-uvicorn[standard]==0.27.1
+fastapi==0.115.0
+uvicorn[standard]==0.30.6
 python-multipart==0.0.9
 
-# Machine Learning - CRITICAL VERSIONS FOR PHI-3
-transformers==4.41.2
-torch==2.2.2
-accelerate==0.30.1
-safetensors==0.4.3
+# Machine Learning - UPDATED FOR PHI-3 LONGROPE SUPPORT
+transformers==4.45.2
+torch==2.4.1
+accelerate==0.34.2
+safetensors==0.4.5
 sentencepiece==0.2.0
+tokenizers==0.20.1
 
 # Utilities
-pydantic==2.7.1
-pydantic-settings==2.2.1
+pydantic==2.9.2
+pydantic-settings==2.5.2
 python-dotenv==1.0.1
-protobuf==4.25.3
+protobuf==5.28.2
 einops==0.8.0
+huggingface-hub==0.25.1
 
 # Monitoring
-psutil==5.9.8
+psutil==6.0.0
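
After installing the new pins, a quick sanity pass (a sketch; assumes pip is on PATH in the same environment):

    # Sketch: print the resolved versions of the key pins above, then let
    # pip's resolver flag any conflicting requirements.
    import subprocess
    from importlib.metadata import version

    for pkg in ("transformers", "torch", "accelerate", "tokenizers",
                "huggingface-hub", "fastapi"):
        print(f"{pkg}=={version(pkg)}")

    subprocess.run(["pip", "check"], check=True)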