david167 committed on
Commit
de72460
·
1 Parent(s): 67f9bcb

Fix download errors and warnings: retry logic + clean startup logs

Browse files
Files changed (2) hide show
  1. Dockerfile +1 -1
  2. app.py +47 -22
Dockerfile CHANGED
@@ -37,8 +37,8 @@ COPY README.md .
37
  # Create HF cache directory with proper permissions
38
  RUN mkdir -p /app/.cache && chmod -R 777 /app/.cache
39
  ENV HF_HOME=/app/.cache
40
- ENV TRANSFORMERS_CACHE=/app/.cache
41
  ENV HF_DATASETS_CACHE=/app/.cache
 
42
 
43
  # Expose port
44
  EXPOSE 7860
 
37
  # Create HF cache directory with proper permissions
38
  RUN mkdir -p /app/.cache && chmod -R 777 /app/.cache
39
  ENV HF_HOME=/app/.cache
 
40
  ENV HF_DATASETS_CACHE=/app/.cache
41
+ ENV OMP_NUM_THREADS=1
42
 
43
  # Expose port
44
  EXPOSE 7860
app.py CHANGED
@@ -1,5 +1,7 @@
1
  import os
2
  import logging
 
 
3
  from typing import List, Optional, Dict, Any
4
  from contextlib import asynccontextmanager
5
 
@@ -33,11 +35,51 @@ class QuestionGenerationResponse(BaseModel):
33
  metadata: Dict[str, Any]
34
 
35
  class HealthResponse(BaseModel):
 
 
36
  status: str
37
  model_loaded: bool
38
  device: str
39
  memory_usage: Dict[str, float]
40
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
  async def load_model():
42
  """Load the model and tokenizer"""
43
  global model, tokenizer, device
@@ -56,34 +98,17 @@ async def load_model():
56
  model_name = "DavidAU/Llama-3.1-1-million-ctx-DeepHermes-Deep-Reasoning-8B-GGUF"
57
  model_file = "Llama-3.1-1-million-ctx-DeepHermes-Deep-Reasoning-8B-Q4_K_M.gguf"
58
 
59
- # Use transformers library instead of llama-cpp-python
 
 
 
60
  try:
61
- from transformers import AutoTokenizer, AutoModelForCausalLM
62
-
63
  logger.info("Loading model with transformers...")
64
 
65
  # Use Llama 3.1 8B Instruct (user now has access)
66
  base_model_name = "meta-llama/Llama-3.1-8B-Instruct"
67
 
68
- # Get HF token from environment
69
- hf_token = os.getenv("HF_TOKEN")
70
-
71
- tokenizer = AutoTokenizer.from_pretrained(
72
- base_model_name,
73
- use_fast=True,
74
- trust_remote_code=True,
75
- token=hf_token
76
- )
77
-
78
- model = AutoModelForCausalLM.from_pretrained(
79
- base_model_name,
80
- torch_dtype=torch.float16 if device == "cuda" else torch.float32,
81
- device_map="auto" if device == "cuda" else None,
82
- trust_remote_code=True,
83
- low_cpu_mem_usage=True,
84
- use_safetensors=True, # Force safetensors to avoid CVE-2025-32434 (PyTorch 2.5.0 vulnerable to torch.load RCE)
85
- token=hf_token
86
- )
87
 
88
  if device == "cuda":
89
  model = model.to(device)
 
1
  import os
2
  import logging
3
+ import time
4
+ import asyncio
5
  from typing import List, Optional, Dict, Any
6
  from contextlib import asynccontextmanager
7
 
 
35
  metadata: Dict[str, Any]
36
 
37
  class HealthResponse(BaseModel):
38
+ model_config = {"protected_namespaces": ()}
39
+
40
  status: str
41
  model_loaded: bool
42
  device: str
43
  memory_usage: Dict[str, float]
44
 
45
+ async def load_model_with_retry(model_name: str, hf_token: str, max_retries: int = 3, delay: float = 5.0):
46
+ """Load model with retry logic for network issues"""
47
+ for attempt in range(max_retries):
48
+ try:
49
+ logger.info(f"Loading model attempt {attempt + 1}/{max_retries}: {model_name}")
50
+
51
+ tokenizer = AutoTokenizer.from_pretrained(
52
+ model_name,
53
+ use_fast=True,
54
+ trust_remote_code=True,
55
+ token=hf_token,
56
+ resume_download=True, # Resume interrupted downloads
57
+ force_download=False # Use cache if available
58
+ )
59
+
60
+ model = AutoModelForCausalLM.from_pretrained(
61
+ model_name,
62
+ torch_dtype=torch.float16 if device == "cuda" else torch.float32,
63
+ device_map="auto" if device == "cuda" else None,
64
+ trust_remote_code=True,
65
+ low_cpu_mem_usage=True,
66
+ use_safetensors=True, # Force safetensors to avoid CVE-2025-32434
67
+ token=hf_token,
68
+ resume_download=True, # Resume interrupted downloads
69
+ force_download=False # Use cache if available
70
+ )
71
+
72
+ return tokenizer, model
73
+
74
+ except Exception as e:
75
+ logger.warning(f"Attempt {attempt + 1} failed: {str(e)}")
76
+ if attempt < max_retries - 1:
77
+ logger.info(f"Retrying in {delay} seconds...")
78
+ await asyncio.sleep(delay)
79
+ delay *= 1.5 # Exponential backoff
80
+ else:
81
+ raise e
82
+
83
  async def load_model():
84
  """Load the model and tokenizer"""
85
  global model, tokenizer, device
 
98
  model_name = "DavidAU/Llama-3.1-1-million-ctx-DeepHermes-Deep-Reasoning-8B-GGUF"
99
  model_file = "Llama-3.1-1-million-ctx-DeepHermes-Deep-Reasoning-8B-Q4_K_M.gguf"
100
 
101
+ # Get HF token from environment
102
+ hf_token = os.getenv("HF_TOKEN")
103
+
104
+ # Use transformers library with retry logic
105
  try:
 
 
106
  logger.info("Loading model with transformers...")
107
 
108
  # Use Llama 3.1 8B Instruct (user now has access)
109
  base_model_name = "meta-llama/Llama-3.1-8B-Instruct"
110
 
111
+ tokenizer, model = await load_model_with_retry(base_model_name, hf_token)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
112
 
113
  if device == "cuda":
114
  model = model.to(device)