Spaces:
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,24 +1,26 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
import torch
|
| 3 |
-
from
|
| 4 |
-
from transformers import TextStreamer
|
| 5 |
|
| 6 |
# Configuration Variables
|
| 7 |
model_name = "unsloth/Llama-3.2-3B-Instruct-bnb-4bit" # Replace with your actual model name
|
| 8 |
lora_adapter = "Braszczynski/Llama-3.2-3B-Instruct-bnb-4bit-460steps"
|
| 9 |
|
| 10 |
max_seq_length = 512 # Adjust as needed
|
| 11 |
-
dtype = None
|
| 12 |
-
load_in_4bit = True
|
| 13 |
-
|
| 14 |
-
# Load the tokenizer
|
| 15 |
-
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
|
| 16 |
-
|
| 17 |
-
# Load the base model with adapters
|
| 18 |
-
model = AutoAdapterModel.from_pretrained(model_name, low_cpu_mem_usage=True).to("cuda")
|
| 19 |
-
model.load_adapter(lora_adapter)
|
| 20 |
|
|
|
|
|
|
|
|
|
|
| 21 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
|
| 23 |
def respond(message, history, system_message, max_tokens, temperature, top_p):
|
| 24 |
# Combine system message and chat history
|
|
@@ -51,6 +53,9 @@ def respond(message, history, system_message, max_tokens, temperature, top_p):
|
|
| 51 |
response = response[len(chat_history):].strip() # Remove the input context
|
| 52 |
return response
|
| 53 |
|
|
|
|
|
|
|
|
|
|
| 54 |
# Define the Gradio interface
|
| 55 |
demo = gr.ChatInterface(
|
| 56 |
respond,
|
|
@@ -63,4 +68,4 @@ demo = gr.ChatInterface(
|
|
| 63 |
)
|
| 64 |
|
| 65 |
if __name__ == "__main__":
|
| 66 |
-
demo.launch()
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
import torch
|
| 3 |
+
from transformers import AutoTokenizer, AutoAdapterModel, TextStreamer
|
|
|
|
| 4 |
|
| 5 |
# Configuration Variables
|
| 6 |
model_name = "unsloth/Llama-3.2-3B-Instruct-bnb-4bit" # Replace with your actual model name
|
| 7 |
lora_adapter = "Braszczynski/Llama-3.2-3B-Instruct-bnb-4bit-460steps"
|
| 8 |
|
| 9 |
max_seq_length = 512 # Adjust as needed
|
| 10 |
+
dtype = None # Example dtype, adjust based on your setup
|
| 11 |
+
load_in_4bit = True # Set to True if you want to use 4-bit quantization
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
|
| 13 |
+
# Dynamically select device
|
| 14 |
+
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 15 |
+
print(f"Using device: {device}")
|
| 16 |
|
| 17 |
+
# Conditional import based on GPU availability
|
| 18 |
+
if device.type == "cuda":
|
| 19 |
+
from unsloth import FastLanguageModel
|
| 20 |
+
model = AutoAdapterModel.from_pretrained(model_name, low_cpu_mem_usage=True).to(device)
|
| 21 |
+
model.load_adapter(lora_adapter)
|
| 22 |
+
else:
|
| 23 |
+
raise RuntimeError("No CUDA GPU available. Please ensure your Space has GPU enabled.")
|
| 24 |
|
| 25 |
def respond(message, history, system_message, max_tokens, temperature, top_p):
|
| 26 |
# Combine system message and chat history
|
|
|
|
| 53 |
response = response[len(chat_history):].strip() # Remove the input context
|
| 54 |
return response
|
| 55 |
|
| 56 |
+
# Load the tokenizer
|
| 57 |
+
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
|
| 58 |
+
|
| 59 |
# Define the Gradio interface
|
| 60 |
demo = gr.ChatInterface(
|
| 61 |
respond,
|
|
|
|
| 68 |
)
|
| 69 |
|
| 70 |
if __name__ == "__main__":
|
| 71 |
+
demo.launch()
|