Juna190825 committed on
Commit
d8d0f11
·
verified ·
1 Parent(s): fe55dbd

Update Dockerfile

Browse files
Files changed (1) hide show
  1. app.py +29 -43
app.py CHANGED
@@ -118,7 +118,6 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
118
  from huggingface_hub import login
119
  import torch
120
  import os
121
- import time # For manual retries
122
 
123
  # Authentication
124
  login(token=os.getenv('HF_TOKEN'))
@@ -128,50 +127,37 @@ MODEL_ID = "meta-llama/Llama-2-7b-chat-hf"
128
  CACHE_DIR = "/cache/models"
129
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
130
 
131
def load_model_with_retry(max_retries=3):
    """Load the LLaMA model and tokenizer, preferring the local cache.

    First probes the local cache (no network); on a cache miss, downloads
    with up to ``max_retries`` attempts and exponential backoff.

    Args:
        max_retries: Maximum number of download attempts (default 3).

    Returns:
        (model, tokenizer) tuple, with the model moved to ``DEVICE``.

    Raises:
        RuntimeError: If every download attempt fails; the last underlying
            exception is chained as the cause.
    """
    # One-shot cache probe, outside the retry loop: a cache miss is not a
    # transient failure, so it should not consume a download attempt.
    try:
        print("Attempting to load from local cache...")
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_ID,
            cache_dir=CACHE_DIR,
            local_files_only=True,
        ).to(DEVICE)
        tokenizer = AutoTokenizer.from_pretrained(
            MODEL_ID,
            cache_dir=CACHE_DIR,
            local_files_only=True,
        )
        return model, tokenizer
    except OSError:
        # from_pretrained raises OSError when the files are not cached.
        print("Cache not found, will download...")

    # Download with retry; every iteration is a real download attempt.
    for attempt in range(max_retries):
        try:
            print(f"Downloading model (attempt {attempt + 1})...")
            model = AutoModelForCausalLM.from_pretrained(
                MODEL_ID,
                cache_dir=CACHE_DIR,
            ).to(DEVICE)
            tokenizer = AutoTokenizer.from_pretrained(
                MODEL_ID,
                cache_dir=CACHE_DIR,
            )
            return model, tokenizer
        except Exception as e:
            if attempt == max_retries - 1:
                # Chain the cause so the original traceback survives.
                raise RuntimeError(
                    f"Failed after {max_retries} attempts: {str(e)}"
                ) from e
            # Exponential backoff: 2s, 4s, 8s ... capped at 10s.
            wait_time = min(2 ** (attempt + 1), 10)
            print(f"Attempt {attempt + 1} failed ({str(e)}), retrying in {wait_time}s...")
            time.sleep(wait_time)
172
 
173
  # Load model
174
- model, tokenizer = load_model_with_retry()
175
 
176
  def generate_text(prompt, max_length=200):
177
  inputs = tokenizer(prompt, return_tensors="pt").to(DEVICE)
 
118
  from huggingface_hub import login
119
  import torch
120
  import os
 
121
 
122
  # Authentication
123
  login(token=os.getenv('HF_TOKEN'))
 
127
  CACHE_DIR = "/cache/models"
128
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
129
 
130
def load_model():
    """Load model directly, attempting cache first"""
    # Shared kwargs for the cache-only probe of both artifacts.
    cached = dict(cache_dir=CACHE_DIR, local_files_only=True)
    try:
        # Cache-only probe: no network traffic on the happy path.
        print("Attempting to load from cache...")
        model = AutoModelForCausalLM.from_pretrained(MODEL_ID, **cached).to(DEVICE)
        tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, **cached)
    except OSError:
        # from_pretrained signals missing cached files via OSError;
        # fall back to a full download into the same cache directory.
        print("Cache not found, downloading...")
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_ID, cache_dir=CACHE_DIR
        ).to(DEVICE)
        tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, cache_dir=CACHE_DIR)
    return model, tokenizer
 
 
 
 
 
 
 
 
 
 
 
 
 
158
 
159
  # Load model
160
+ model, tokenizer = load_model()
161
 
162
  def generate_text(prompt, max_length=200):
163
  inputs = tokenizer(prompt, return_tensors="pt").to(DEVICE)