Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -302,42 +302,42 @@ class Phi3MiniEducationalLLM(Runnable):
|
|
| 302 |
current_time = datetime.now()
|
| 303 |
|
| 304 |
self.model_name = model_path
|
| 305 |
-
|
| 306 |
try:
|
| 307 |
-
# Load tokenizer
|
| 308 |
self.tokenizer = AutoTokenizer.from_pretrained(
|
| 309 |
model_path,
|
| 310 |
trust_remote_code=True,
|
| 311 |
token=hf_token
|
| 312 |
)
|
| 313 |
|
| 314 |
-
#
|
| 315 |
-
self.
|
| 316 |
-
|
| 317 |
-
torch_dtype=torch.float16,
|
| 318 |
-
device_map="auto", # This will work with ZeroGPU allocation
|
| 319 |
-
trust_remote_code=True,
|
| 320 |
-
low_cpu_mem_usage=True,
|
| 321 |
-
token=hf_token,
|
| 322 |
-
attn_implementation="eager"
|
| 323 |
-
)
|
| 324 |
-
|
| 325 |
-
# Success path - log timing
|
| 326 |
-
end_Loading_Model_time = time.perf_counter()
|
| 327 |
-
Loading_Model_time = end_Loading_Model_time - start_Loading_Model_time
|
| 328 |
-
log_metric(f"Model Load time: {Loading_Model_time:0.4f} seconds. Model: {model_path}. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
|
| 329 |
|
| 330 |
except Exception as e:
|
| 331 |
-
logger.error(f"Failed to
|
| 332 |
raise
|
| 333 |
|
| 334 |
# Ensure pad token exists
|
| 335 |
if self.tokenizer.pad_token is None:
|
| 336 |
self.tokenizer.pad_token = self.tokenizer.eos_token
|
| 337 |
|
| 338 |
-
# Initialize TextIteratorStreamer
|
| 339 |
self.streamer = None
|
| 340 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 341 |
def _format_chat_template(self, prompt: str) -> str:
|
| 342 |
"""Format prompt using Phi-3's chat template"""
|
| 343 |
try:
|
|
|
|
| 302 |
current_time = datetime.now()
|
| 303 |
|
| 304 |
self.model_name = model_path
|
| 305 |
+
|
| 306 |
try:
|
| 307 |
+
# Load tokenizer
|
| 308 |
self.tokenizer = AutoTokenizer.from_pretrained(
|
| 309 |
model_path,
|
| 310 |
trust_remote_code=True,
|
| 311 |
token=hf_token
|
| 312 |
)
|
| 313 |
|
| 314 |
+
# Store model path instead of loading model immediately
|
| 315 |
+
self.model_path = model_path
|
| 316 |
+
self.model = None # Load model lazily in GPU methods
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 317 |
|
| 318 |
except Exception as e:
|
| 319 |
+
logger.error(f"Failed to initialize Phi-3-mini model {model_path}: {e}")
|
| 320 |
raise
|
| 321 |
|
| 322 |
# Ensure pad token exists
|
| 323 |
if self.tokenizer.pad_token is None:
|
| 324 |
self.tokenizer.pad_token = self.tokenizer.eos_token
|
| 325 |
|
|
|
|
| 326 |
self.streamer = None
|
| 327 |
|
| 328 |
+
def _load_model_if_needed(self):
    """Return the causal-LM model, loading it on the first call.

    The constructor only stores ``self.model_path`` and leaves
    ``self.model`` as ``None``; this method performs the deferred load and
    caches the result on the instance so later calls reuse it.
    Intended to be called from the GPU-context methods (per the original
    note on this method) -- TODO confirm against the callers.

    Returns:
        The object stored in ``self.model`` (loaded here if it was ``None``).
    """
    # Fast path: a previous call already populated the cache.
    if self.model is not None:
        return self.model

    # Keyword options forwarded unchanged to ``from_pretrained``.
    load_options = {
        "torch_dtype": torch.float16,
        "trust_remote_code": True,
        "low_cpu_mem_usage": True,
        "token": hf_token,
        "attn_implementation": "eager",
    }
    self.model = AutoModelForCausalLM.from_pretrained(
        self.model_path,
        **load_options,
    )
    return self.model
|
| 340 |
+
|
| 341 |
def _format_chat_template(self, prompt: str) -> str:
|
| 342 |
"""Format prompt using Phi-3's chat template"""
|
| 343 |
try:
|