Spaces:

ahmedbasemdev
/

FineTunedChatbot

Runtime error

App Files Files Community

ahmedbasemdev committed on Nov 23, 2024

Commit

2fcb420

verified ·

1 Parent(s): c9746e0

Update app.py

Browse files

Files changed (1) hide show

app.py +25 -21

app.py CHANGED Viewed

@@ -1,34 +1,38 @@
-import gradio as gr
-# Load your model and tokenizer
 from transformers import AutoModelForCausalLM, AutoTokenizer
 import torch
-# Specify the model name
 model_name = "ahmedbasemdev/llama-3.2-3b-ChatBot"
-# Load the model with 8-bit quantization
-model = AutoModelForCausalLM.from_pretrained(
-    model_name,
-    device_map="auto",  # Automatically map the model to the available device (CPU)
-    load_in_8bit=True,  # Enable 8-bit quantization
-    torch_dtype=torch.float16  # Use mixed precision
 )
 # Load the tokenizer
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 def single_inference(question):
     messages = []
     messages.append({"role": "user", "content": question})
     input_ids = tokenizer.apply_chat_template(
         messages,
         add_generation_prompt=True,
         return_tensors="pt"
-    ).to(model.device)
     terminators = [
         tokenizer.eos_token_id,
         tokenizer.convert_tokens_to_ids("<|eot_id|>")
@@ -45,15 +49,15 @@ def single_inference(question):
     output = tokenizer.decode(response, skip_special_tokens=True)
     return output
-# Create the Gradio interface
 interface = gr.Interface(
-    fn=single_inference,  # Function to wrap
-    inputs=gr.Textbox(lines=2, placeholder="Ask a question..."),  # Input type
-    outputs=gr.Textbox(label="Response"),  # Output type
-    title="Chat with Your Model",  # App title
-    description="Enter a question, and the model will generate a response.",  # App description
 )
-# Launch the app
-if __name__ == "__main__":
-    interface.launch()

 from transformers import AutoModelForCausalLM, AutoTokenizer
 import torch
+import gradio as gr
+# Model and tokenizer paths
 model_name = "ahmedbasemdev/llama-3.2-3b-ChatBot"
+# Load the model
+print("Loading the model...")
+model = AutoModelForCausalLM.from_pretrained(model_name)
+# Apply dynamic quantization to reduce model size and improve CPU performance
+print("Applying quantization...")
+model = torch.quantization.quantize_dynamic(
+    model,  # Model to quantize
+    {torch.nn.Linear},  # Layers to quantize (e.g., Linear layers)
+    dtype=torch.qint8,  # Quantized data type
 )
 # Load the tokenizer
 tokenizer = AutoTokenizer.from_pretrained(model_name)
+# Define the inference function
 def single_inference(question):
     messages = []
     messages.append({"role": "user", "content": question})
+    # Tokenize the input
     input_ids = tokenizer.apply_chat_template(
         messages,
         add_generation_prompt=True,
         return_tensors="pt"
+    ).to("cpu")  # Ensure everything runs on CPU
+    # Generate a response
     terminators = [
         tokenizer.eos_token_id,
         tokenizer.convert_tokens_to_ids("<|eot_id|>")
     output = tokenizer.decode(response, skip_special_tokens=True)
     return output
+# Gradio interface
+print("Setting up Gradio app...")
 interface = gr.Interface(
+    fn=single_inference,
+    inputs="text",
+    outputs="text",
+    title="Chatbot",
+    description="Ask me anything!"
 )
+# Launch the Gradio app
+interface.launch()