Spaces:

Braszczynski
/

ID2223Lab2

Runtime error

Braszczynski committed on Dec 9, 2024

Commit

94d5aca

verified ·

1 Parent(s): 32ab136

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -1,24 +1,26 @@
 import gradio as gr
 import torch
-from unsloth import FastLanguageModel
-from transformers import TextStreamer
 # Configuration Variables
 model_name = "unsloth/Llama-3.2-3B-Instruct-bnb-4bit"  # Replace with your actual model name
 lora_adapter = "Braszczynski/Llama-3.2-3B-Instruct-bnb-4bit-460steps"
 max_seq_length = 512  # Adjust as needed
-dtype = None   # Example dtype, adjust based on your setup
-load_in_4bit = True     # Set to True if you want to use 4-bit quantization
-# Load the tokenizer
-tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
-# Load the base model with adapters
-model = AutoAdapterModel.from_pretrained(model_name, low_cpu_mem_usage=True).to("cuda")
-model.load_adapter(lora_adapter)
 def respond(message, history, system_message, max_tokens, temperature, top_p):
     # Combine system message and chat history
@@ -51,6 +53,9 @@ def respond(message, history, system_message, max_tokens, temperature, top_p):
     response = response[len(chat_history):].strip()  # Remove the input context
     return response
 # Define the Gradio interface
 demo = gr.ChatInterface(
     respond,
@@ -63,4 +68,4 @@ demo = gr.ChatInterface(
 )
 if __name__ == "__main__":
-    demo.launch()

 import gradio as gr
 import torch
+from transformers import AutoTokenizer, AutoAdapterModel, TextStreamer
 # Configuration Variables
 model_name = "unsloth/Llama-3.2-3B-Instruct-bnb-4bit"  # Replace with your actual model name
 lora_adapter = "Braszczynski/Llama-3.2-3B-Instruct-bnb-4bit-460steps"
 max_seq_length = 512  # Adjust as needed
+dtype = None          # Example dtype, adjust based on your setup
+load_in_4bit = True   # Set to True if you want to use 4-bit quantization
+# Dynamically select device
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+print(f"Using device: {device}")
+# Conditional import based on GPU availability
+if device.type == "cuda":
+    from unsloth import FastLanguageModel
+    model = AutoAdapterModel.from_pretrained(model_name, low_cpu_mem_usage=True).to(device)
+    model.load_adapter(lora_adapter)
+else:
+    raise RuntimeError("No CUDA GPU available. Please ensure your Space has GPU enabled.")
 def respond(message, history, system_message, max_tokens, temperature, top_p):
     # Combine system message and chat history
     response = response[len(chat_history):].strip()  # Remove the input context
     return response
+# Load the tokenizer
+tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
 # Define the Gradio interface
 demo = gr.ChatInterface(
     respond,
 )
 if __name__ == "__main__":
+    demo.launch()