Update Dockerfile
app.py CHANGED
@@ -118,7 +118,6 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
 from huggingface_hub import login
 import torch
 import os
-import time # For manual retries
 
 # Authentication
 login(token=os.getenv('HF_TOKEN'))
@@ -128,50 +127,37 @@ MODEL_ID = "meta-llama/Llama-2-7b-chat-hf"
 CACHE_DIR = "/cache/models"
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 
-def ...
-            ...
-            tokenizer = AutoTokenizer.from_pretrained(
-                MODEL_ID,
-                cache_dir=CACHE_DIR
-            )
-            return model, tokenizer
-
-        except Exception as e:
-            if attempt == max_retries - 1:
-                raise RuntimeError(f"Failed after {max_retries} attempts: {str(e)}")
-
-            wait_time = min(2 ** (attempt + 1), 10)
-            print(f"Attempt {attempt + 1} failed ({str(e)}), retrying in {wait_time}s...")
-            time.sleep(wait_time)
+def load_model():
+    """Load model directly, attempting cache first"""
+    try:
+        # Try loading from cache
+        print("Attempting to load from cache...")
+        model = AutoModelForCausalLM.from_pretrained(
+            MODEL_ID,
+            cache_dir=CACHE_DIR,
+            local_files_only=True # Force cache usage
+        ).to(DEVICE)
+        tokenizer = AutoTokenizer.from_pretrained(
+            MODEL_ID,
+            cache_dir=CACHE_DIR,
+            local_files_only=True
+        )
+    except OSError:
+        # Fallback to download if cache missing
+        print("Cache not found, downloading...")
+        model = AutoModelForCausalLM.from_pretrained(
+            MODEL_ID,
+            cache_dir=CACHE_DIR
+        ).to(DEVICE)
+        tokenizer = AutoTokenizer.from_pretrained(
+            MODEL_ID,
+            cache_dir=CACHE_DIR
+        )
+
+    return model, tokenizer
 
 # Load model
-model, tokenizer = 
+model, tokenizer = load_model()
 
 def generate_text(prompt, max_length=200):
     inputs = tokenizer(prompt, return_tensors="pt").to(DEVICE)
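
For readers unfamiliar with the pattern this commit introduces: `from_pretrained(..., local_files_only=True)` raises `OSError` when the requested files are not already present under `cache_dir`, which is what lets the `except OSError:` branch fall back to a normal download. Below is a minimal standalone sketch of the same cache-first fallback; the tiny stand-in model ID and cache path are illustrative assumptions so the sketch is cheap to run, not values from this Space.

# Sketch of the cache-first loading pattern used by the new load_model().
# Assumptions: "sshleifer/tiny-gpt2" is a small stand-in model and
# "/tmp/model-cache" a throwaway cache dir; the Space itself uses
# meta-llama/Llama-2-7b-chat-hf and /cache/models.
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_ID = "sshleifer/tiny-gpt2"
CACHE_DIR = "/tmp/model-cache"

def load_cache_first(model_id, cache_dir):
    """Try the local cache first; fall back to downloading on a cache miss."""
    try:
        # local_files_only=True raises OSError if the files are not cached yet
        model = AutoModelForCausalLM.from_pretrained(
            model_id, cache_dir=cache_dir, local_files_only=True
        )
        tokenizer = AutoTokenizer.from_pretrained(
            model_id, cache_dir=cache_dir, local_files_only=True
        )
    except OSError:
        # Cache miss: fetch from the Hub and populate cache_dir
        model = AutoModelForCausalLM.from_pretrained(model_id, cache_dir=cache_dir)
        tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=cache_dir)
    return model, tokenizer

model, tokenizer = load_cache_first(MODEL_ID, CACHE_DIR)
print(type(model).__name__, type(tokenizer).__name__)

Note that the fallback branch still needs network access at download time and, for gated models such as Llama 2, a valid HF_TOKEN (hence the `login()` call kept at the top of app.py).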