Update app.py
app.py CHANGED

@@ -1,14 +1,24 @@
 import gradio as gr
-from …
-import …
+from llama_cpp import Llama
+import os
 
-# …
-…
-…
-…
-…
-…
-…
+# Download and load the GGUF model
+model_url = "https://huggingface.co/bartowski/Llama-3.2-3B-Instruct-GGUF/resolve/main/Llama-3.2-3B-Instruct-Q6_K_L.gguf?download=true"
+model_path = "model.gguf"
+
+# Download model if not already present
+if not os.path.exists(model_path):
+    print("Downloading model...")
+    import urllib.request
+    urllib.request.urlretrieve(model_url, model_path)
+    print("Model downloaded!")
+
+# Load the model
+llm = Llama(
+    model_path=model_path,
+    n_ctx=2048,      # Context window
+    n_threads=4,     # Number of CPU threads
+    n_gpu_layers=0   # Set to -1 to offload all layers to GPU if available
 )
 
 def chat(message, history):
@@ -30,39 +40,23 @@ def chat(message, history):
     # Add current message
     messages.append({"role": "user", "content": message})
 
-    # Apply chat template
-    prompt = tokenizer.apply_chat_template(
-        messages,
-        tokenize=False,
-        add_generation_prompt=True
-    )
-
-    # Tokenize
-    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
-
     # Generate response
-    …
-        …
-        …
+    response = llm.create_chat_completion(
+        messages=messages,
+        max_tokens=512,
         temperature=0.7,
         top_p=0.9,
-        …
-        pad_token_id=tokenizer.eos_token_id
+        stream=False
     )
 
-    # …
-    …
-    …
-    # Extract only the assistant's response (after the last prompt)
-    response = response.split("assistant")[-1].strip()
-
-    return response
+    # Extract the assistant's response
+    return response["choices"][0]["message"]["content"]
 
 # Create Gradio interface
 demo = gr.ChatInterface(
     fn=chat,
-    title="Llama 3.2 …
-    description="Chat with Llama 3.2 …
+    title="Llama 3.2 3B Instruct Chatbot (GGUF)",
+    description="Chat with Llama 3.2 3B Instruct model running from GGUF format. Ask me anything!",
     examples=[
         "What is artificial intelligence?",
        "Write a short poem about coding",
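
The commit swaps a transformers-based pipeline for llama-cpp-python running a Q6_K_L GGUF build of Llama 3.2 3B Instruct. One note on the download step: the os.path.exists guard skips the fetch on restart, but an interrupted urllib.request.urlretrieve leaves a partial model.gguf behind that the guard then accepts. A minimal alternative sketch, not part of the commit and assuming the huggingface_hub package is installed in the Space: hf_hub_download writes to a temporary file and only moves it into the cache once the download completes, so a partial file is never reused.

# Hedged sketch, assuming huggingface_hub is available in the Space.
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# Fetch the same GGUF file as the commit, via the Hub cache instead of urllib.
model_path = hf_hub_download(
    repo_id="bartowski/Llama-3.2-3B-Instruct-GGUF",
    filename="Llama-3.2-3B-Instruct-Q6_K_L.gguf",
)

llm = Llama(
    model_path=model_path,
    n_ctx=2048,      # Context window
    n_threads=4,     # Number of CPU threads
    n_gpu_layers=0,  # Set to -1 to offload all layers to GPU if available
)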
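
With stream=False, nothing reaches the UI until the full completion (up to 512 tokens) is generated. A sketch of a streaming variant, assuming the same llm object as above: create_chat_completion(stream=True) yields OpenAI-style chunks whose choices[0]["delta"] may carry a "content" piece, and gr.ChatInterface accepts a generator function that yields progressively longer strings.

# Hedged sketch of a streaming chat(); history handling is elided here and
# `messages` should be built exactly as in the committed chat() function.
def chat(message, history):
    messages = [{"role": "user", "content": message}]  # simplified for the sketch

    partial = ""
    for chunk in llm.create_chat_completion(
        messages=messages,
        max_tokens=512,
        temperature=0.7,
        top_p=0.9,
        stream=True,  # iterate over chunks instead of returning one dict
    ):
        delta = chunk["choices"][0]["delta"]
        if "content" in delta:
            partial += delta["content"]
            yield partial  # ChatInterface re-renders each yielded string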