Spaces:

david167
/

question-generation-api

Sleeping

david167 committed on Aug 6, 2025

Commit

444b4d9

1 Parent(s): de72460

Fix XetHub issue: use Meta-Llama-3.1-8B-Instruct (official HF storage)

Files changed (2) hide show

Dockerfile CHANGED Viewed

@@ -38,7 +38,7 @@ COPY README.md .
 RUN mkdir -p /app/.cache && chmod -R 777 /app/.cache
 ENV HF_HOME=/app/.cache
 ENV HF_DATASETS_CACHE=/app/.cache
-ENV OMP_NUM_THREADS=1
 # Expose port
 EXPOSE 7860

 RUN mkdir -p /app/.cache && chmod -R 777 /app/.cache
 ENV HF_HOME=/app/.cache
 ENV HF_DATASETS_CACHE=/app/.cache
+ENV OMP_NUM_THREADS=4
 # Expose port
 EXPOSE 7860

app.py CHANGED Viewed

@@ -52,9 +52,7 @@ async def load_model_with_retry(model_name: str, hf_token: str, max_retries: int
                 model_name,
                 use_fast=True,
                 trust_remote_code=True,
-                token=hf_token,
-                resume_download=True,  # Resume interrupted downloads
-                force_download=False   # Use cache if available
             )
             model = AutoModelForCausalLM.from_pretrained(
@@ -64,9 +62,7 @@ async def load_model_with_retry(model_name: str, hf_token: str, max_retries: int
                 trust_remote_code=True,
                 low_cpu_mem_usage=True,
                 use_safetensors=True,  # Force safetensors to avoid CVE-2025-32434
-                token=hf_token,
-                resume_download=True,  # Resume interrupted downloads
-                force_download=False   # Use cache if available
             )
             return tokenizer, model
@@ -105,8 +101,8 @@ async def load_model():
         try:
             logger.info("Loading model with transformers...")
-            # Use Llama 3.1 8B Instruct (user now has access)
-            base_model_name = "meta-llama/Llama-3.1-8B-Instruct"
             tokenizer, model = await load_model_with_retry(base_model_name, hf_token)

                 model_name,
                 use_fast=True,
                 trust_remote_code=True,
+                token=hf_token
             )
             model = AutoModelForCausalLM.from_pretrained(
                 trust_remote_code=True,
                 low_cpu_mem_usage=True,
                 use_safetensors=True,  # Force safetensors to avoid CVE-2025-32434
+                token=hf_token
             )
             return tokenizer, model
         try:
             logger.info("Loading model with transformers...")
+            # Use Llama 3.1 8B Instruct from official HF storage (not XetHub)
+            base_model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"
             tokenizer, model = await load_model_with_retry(base_model_name, hf_token)