david167 committed on
Commit
de72460
·
1 Parent(s): 67f9bcb

Fix download errors and warnings: retry logic + clean startup logs

Browse files
Files changed (2) hide show
  1. Dockerfile +1 -1
  2. app.py +47 -22
Dockerfile CHANGED
@@ -37,8 +37,8 @@ COPY README.md .
37
  # Create HF cache directory with proper permissions
38
  RUN mkdir -p /app/.cache && chmod -R 777 /app/.cache
39
  ENV HF_HOME=/app/.cache
40
- ENV TRANSFORMERS_CACHE=/app/.cache
41
  ENV HF_DATASETS_CACHE=/app/.cache
 
42
 
43
  # Expose port
44
  EXPOSE 7860
 
37
  # Create HF cache directory with proper permissions
38
  RUN mkdir -p /app/.cache && chmod -R 777 /app/.cache
39
  ENV HF_HOME=/app/.cache
 
40
  ENV HF_DATASETS_CACHE=/app/.cache
41
+ ENV OMP_NUM_THREADS=1
42
 
43
  # Expose port
44
  EXPOSE 7860
app.py CHANGED
@@ -1,5 +1,7 @@
1
  import os
2
  import logging
 
 
3
  from typing import List, Optional, Dict, Any
4
  from contextlib import asynccontextmanager
5
 
@@ -33,11 +35,51 @@ class QuestionGenerationResponse(BaseModel):
33
  metadata: Dict[str, Any]
34
 
35
  class HealthResponse(BaseModel):
 
 
36
  status: str
37
  model_loaded: bool
38
  device: str
39
  memory_usage: Dict[str, float]
40
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
  async def load_model():
42
  """Load the model and tokenizer"""
43
  global model, tokenizer, device
@@ -56,34 +98,17 @@ async def load_model():
56
  model_name = "DavidAU/Llama-3.1-1-million-ctx-DeepHermes-Deep-Reasoning-8B-GGUF"
57
  model_file = "Llama-3.1-1-million-ctx-DeepHermes-Deep-Reasoning-8B-Q4_K_M.gguf"
58
 
59
- # Use transformers library instead of llama-cpp-python
 
 
 
60
  try:
61
- from transformers import AutoTokenizer, AutoModelForCausalLM
62
-
63
  logger.info("Loading model with transformers...")
64
 
65
  # Use Llama 3.1 8B Instruct (user now has access)
66
  base_model_name = "meta-llama/Llama-3.1-8B-Instruct"
67
 
68
- # Get HF token from environment
69
- hf_token = os.getenv("HF_TOKEN")
70
-
71
- tokenizer = AutoTokenizer.from_pretrained(
72
- base_model_name,
73
- use_fast=True,
74
- trust_remote_code=True,
75
- token=hf_token
76
- )
77
-
78
- model = AutoModelForCausalLM.from_pretrained(
79
- base_model_name,
80
- torch_dtype=torch.float16 if device == "cuda" else torch.float32,
81
- device_map="auto" if device == "cuda" else None,
82
- trust_remote_code=True,
83
- low_cpu_mem_usage=True,
84
- use_safetensors=True, # Force safetensors to avoid CVE-2025-32434 (PyTorch 2.5.0 vulnerable to torch.load RCE)
85
- token=hf_token
86
- )
87
 
88
  if device == "cuda":
89
  model = model.to(device)
 
1
  import os
2
  import logging
3
+ import time
4
+ import asyncio
5
  from typing import List, Optional, Dict, Any
6
  from contextlib import asynccontextmanager
7
 
 
35
  metadata: Dict[str, Any]
36
 
37
  class HealthResponse(BaseModel):
38
+ model_config = {"protected_namespaces": ()}
39
+
40
  status: str
41
  model_loaded: bool
42
  device: str
43
  memory_usage: Dict[str, float]
44
 
45
+ async def load_model_with_retry(model_name: str, hf_token: str, max_retries: int = 3, delay: float = 5.0):
46
+ """Load model with retry logic for network issues"""
47
+ for attempt in range(max_retries):
48
+ try:
49
+ logger.info(f"Loading model attempt {attempt + 1}/{max_retries}: {model_name}")
50
+
51
+ tokenizer = AutoTokenizer.from_pretrained(
52
+ model_name,
53
+ use_fast=True,
54
+ trust_remote_code=True,
55
+ token=hf_token,
56
+ resume_download=True, # Resume interrupted downloads
57
+ force_download=False # Use cache if available
58
+ )
59
+
60
+ model = AutoModelForCausalLM.from_pretrained(
61
+ model_name,
62
+ torch_dtype=torch.float16 if device == "cuda" else torch.float32,
63
+ device_map="auto" if device == "cuda" else None,
64
+ trust_remote_code=True,
65
+ low_cpu_mem_usage=True,
66
+ use_safetensors=True, # Force safetensors to avoid CVE-2025-32434
67
+ token=hf_token,
68
+ resume_download=True, # Resume interrupted downloads
69
+ force_download=False # Use cache if available
70
+ )
71
+
72
+ return tokenizer, model
73
+
74
+ except Exception as e:
75
+ logger.warning(f"Attempt {attempt + 1} failed: {str(e)}")
76
+ if attempt < max_retries - 1:
77
+ logger.info(f"Retrying in {delay} seconds...")
78
+ await asyncio.sleep(delay)
79
+ delay *= 1.5 # Exponential backoff
80
+ else:
81
+ raise e
82
+
83
  async def load_model():
84
  """Load the model and tokenizer"""
85
  global model, tokenizer, device
 
98
  model_name = "DavidAU/Llama-3.1-1-million-ctx-DeepHermes-Deep-Reasoning-8B-GGUF"
99
  model_file = "Llama-3.1-1-million-ctx-DeepHermes-Deep-Reasoning-8B-Q4_K_M.gguf"
100
 
101
+ # Get HF token from environment
102
+ hf_token = os.getenv("HF_TOKEN")
103
+
104
+ # Use transformers library with retry logic
105
  try:
 
 
106
  logger.info("Loading model with transformers...")
107
 
108
  # Use Llama 3.1 8B Instruct (user now has access)
109
  base_model_name = "meta-llama/Llama-3.1-8B-Instruct"
110
 
111
+ tokenizer, model = await load_model_with_retry(base_model_name, hf_token)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
112
 
113
  if device == "cuda":
114
  model = model.to(device)