Spaces:

salmapm
/

chatllama.io

Sleeping

salmapm committed on Aug 5, 2024

Commit

b0092a1

verified ·

1 Parent(s): 5cd3006

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -3,31 +3,26 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
 import torch
 from huggingface_hub import login
-def load_model(token):
-    # Log in with the user's token
-    login(token=token)
-    # Define model loading parameters
-    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    model_kwargs = {}
-    if torch.cuda.is_available():
-        model_kwargs = {
-            'load_in_8bit': True,  # Enable 8-bit quantization if GPU is available
-            'device_map': 'auto',  # Automatically maps model to available devices
-            'low_cpu_mem_usage': True  # Reduce CPU memory usage
-        }
-    try:
         tokenizer = AutoTokenizer.from_pretrained("salmapm/llama2_salma")
         model = AutoModelForCausalLM.from_pretrained(
             "salmapm/llama2_salma",
             **model_kwargs
         )
         model.to(device)
-    except Exception as e:
-        raise RuntimeError(f"Model loading failed: {e}")
     return model, tokenizer, device
 def respond(message, history, system_message, max_tokens, temperature, top_p, token):

 import torch
 from huggingface_hub import login
+model, tokenizer, device = None, None, None
+def load_model(token):
+    """Load the model and tokenizer once and cache them in module globals.
+
+    The first call logs in to the Hugging Face Hub with ``token`` and loads
+    "salmapm/llama2_salma"; later calls return the cached objects and
+    ignore ``token``.
+
+    Args:
+        token: Hugging Face access token passed to ``login``.
+
+    Returns:
+        A ``(model, tokenizer, device)`` tuple.
+
+    Any exception raised by ``login`` or ``from_pretrained`` propagates and
+    leaves the cache empty, so the next call retries the load.
+    """
+    global model, tokenizer, device
+    if model is None:
+        login(token=token)
+        use_cuda = torch.cuda.is_available()
+        target_device = torch.device("cuda" if use_cuda else "cpu")
+        model_kwargs = {}
+        if use_cuda:
+            model_kwargs = {
+                'load_in_8bit': True,
+                'device_map': 'auto',
+                'low_cpu_mem_usage': True
+            }
+        loaded_tokenizer = AutoTokenizer.from_pretrained("salmapm/llama2_salma")
+        loaded_model = AutoModelForCausalLM.from_pretrained(
+            "salmapm/llama2_salma",
+            **model_kwargs
+        )
+        # With device_map='auto' the weights are already placed, and
+        # transformers rejects .to() on 8-bit quantized models, so only
+        # move the model explicitly on the unquantized CPU path.
+        if not use_cuda:
+            loaded_model.to(target_device)
+        # Publish to the module-level cache only after every step succeeded,
+        # so a failed load is never mistaken for a cached model.
+        model, tokenizer, device = loaded_model, loaded_tokenizer, target_device
     return model, tokenizer, device
 def respond(message, history, system_message, max_tokens, temperature, top_p, token):