salmapm committed on
Commit
5cd3006
·
verified ·
1 Parent(s): 6faa75f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +20 -11
app.py CHANGED
@@ -1,23 +1,32 @@
1
  import gradio as gr
2
  from transformers import AutoModelForCausalLM, AutoTokenizer
3
  import torch
4
- from huggingface_hub import login, HfApi
5
 
6
  def load_model(token):
7
  # Log in with the user's token
8
  login(token=token)
9
 
10
- # Load the model and tokenizer
11
- tokenizer = AutoTokenizer.from_pretrained("salmapm/llama2_salma")
12
- model = AutoModelForCausalLM.from_pretrained(
13
- "salmapm/llama2_salma",
14
- load_in_8bit=True, # Enable 8-bit quantization
15
- device_map='auto' # Automatically maps model to available devices
16
- )
17
-
18
- # Ensure the model is on the correct device (GPU if available)
19
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
20
- model.to(device)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
 
22
  return model, tokenizer, device
23
 
 
1
  import gradio as gr
2
  from transformers import AutoModelForCausalLM, AutoTokenizer
3
  import torch
4
+ from huggingface_hub import login
5
 
6
  def load_model(token):
7
  # Log in with the user's token
8
  login(token=token)
9
 
10
+ # Define model loading parameters
 
 
 
 
 
 
 
 
11
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
12
+ model_kwargs = {}
13
+
14
+ if torch.cuda.is_available():
15
+ model_kwargs = {
16
+ 'load_in_8bit': True, # Enable 8-bit quantization if GPU is available
17
+ 'device_map': 'auto', # Automatically maps model to available devices
18
+ 'low_cpu_mem_usage': True # Reduce CPU memory usage
19
+ }
20
+
21
+ try:
22
+ tokenizer = AutoTokenizer.from_pretrained("salmapm/llama2_salma")
23
+ model = AutoModelForCausalLM.from_pretrained(
24
+ "salmapm/llama2_salma",
25
+ **model_kwargs
26
+ )
27
+ model.to(device)
28
+ except Exception as e:
29
+ raise RuntimeError(f"Model loading failed: {e}")
30
 
31
  return model, tokenizer, device
32