Spaces:
Sleeping
Sleeping
Fix permissions error: proper cache directory and HF token auth for Llama 3.1
Browse files
- Dockerfile +5 -3
- app.py +8 -3
Dockerfile
CHANGED
|
@@ -34,9 +34,11 @@ RUN pip install -r requirements.txt
|
|
| 34 |
COPY app.py .
|
| 35 |
COPY README.md .
|
| 36 |
|
| 37 |
-
# Create HF cache directory
|
| 38 |
-
RUN mkdir -p /.cache
|
| 39 |
-
ENV HF_HOME=/.cache
|
|
|
|
|
|
|
| 40 |
|
| 41 |
# Expose port
|
| 42 |
EXPOSE 7860
|
|
|
|
| 34 |
COPY app.py .
|
| 35 |
COPY README.md .
|
| 36 |
|
| 37 |
+
# Create HF cache directory with proper permissions
|
| 38 |
+
RUN mkdir -p /app/.cache && chmod -R 777 /app/.cache
|
| 39 |
+
ENV HF_HOME=/app/.cache
|
| 40 |
+
ENV TRANSFORMERS_CACHE=/app/.cache
|
| 41 |
+
ENV HF_DATASETS_CACHE=/app/.cache
|
| 42 |
|
| 43 |
# Expose port
|
| 44 |
EXPOSE 7860
|
app.py
CHANGED
|
@@ -62,13 +62,17 @@ async def load_model():
|
|
| 62 |
|
| 63 |
logger.info("Loading model with transformers...")
|
| 64 |
|
| 65 |
-
# Use
|
| 66 |
base_model_name = "meta-llama/Llama-3.1-8B-Instruct"
|
| 67 |
|
|
|
|
|
|
|
|
|
|
| 68 |
tokenizer = AutoTokenizer.from_pretrained(
|
| 69 |
base_model_name,
|
| 70 |
use_fast=True,
|
| 71 |
-
trust_remote_code=True
|
|
|
|
| 72 |
)
|
| 73 |
|
| 74 |
model = AutoModelForCausalLM.from_pretrained(
|
|
@@ -76,7 +80,8 @@ async def load_model():
|
|
| 76 |
torch_dtype=torch.float16 if device == "cuda" else torch.float32,
|
| 77 |
device_map="auto" if device == "cuda" else None,
|
| 78 |
trust_remote_code=True,
|
| 79 |
-
low_cpu_mem_usage=True
|
|
|
|
| 80 |
)
|
| 81 |
|
| 82 |
if device == "cuda":
|
|
|
|
| 62 |
|
| 63 |
logger.info("Loading model with transformers...")
|
| 64 |
|
| 65 |
+
# Use Llama 3.1 8B Instruct (requires HF token with access)
|
| 66 |
base_model_name = "meta-llama/Llama-3.1-8B-Instruct"
|
| 67 |
|
| 68 |
+
# Get HF token from environment
|
| 69 |
+
hf_token = os.getenv("HF_TOKEN")
|
| 70 |
+
|
| 71 |
tokenizer = AutoTokenizer.from_pretrained(
|
| 72 |
base_model_name,
|
| 73 |
use_fast=True,
|
| 74 |
+
trust_remote_code=True,
|
| 75 |
+
token=hf_token
|
| 76 |
)
|
| 77 |
|
| 78 |
model = AutoModelForCausalLM.from_pretrained(
|
|
|
|
| 80 |
torch_dtype=torch.float16 if device == "cuda" else torch.float32,
|
| 81 |
device_map="auto" if device == "cuda" else None,
|
| 82 |
trust_remote_code=True,
|
| 83 |
+
low_cpu_mem_usage=True,
|
| 84 |
+
token=hf_token
|
| 85 |
)
|
| 86 |
|
| 87 |
if device == "cuda":
|