Update app.py
app.py CHANGED
@@ -1,26 +1,25 @@
 import gradio as gr
-from
-import
+from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
+import torch
 
-#
-
-
-
-
-
-
-import urllib.request
-urllib.request.urlretrieve(model_url, model_path)
-print("Model downloaded!")
+# Configure 4-bit quantization
+quantization_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_compute_dtype=torch.float16,
+    bnb_4bit_use_double_quant=True,
+    bnb_4bit_quant_type="nf4"
+)
 
-# Load
+# Load model and tokenizer
+model_name = "meta-llama/Llama-3.2-3B-Instruct"
+print("Loading tokenizer...")
+tokenizer = AutoTokenizer.from_pretrained(model_name)
 print("Loading model...")
-
-
-
-
-
-    verbose=False
+model = AutoModelForCausalLM.from_pretrained(
+    model_name,
+    quantization_config=quantization_config,
+    device_map="auto",
+    low_cpu_mem_usage=True
 )
 print("Model loaded!")
 
@@ -43,22 +42,38 @@ def chat(message, history):
     # Add current message
     messages.append({"role": "user", "content": message})
 
-    #
-
-        messages
-
-
-        top_p=0.9,
+    # Apply chat template
+    prompt = tokenizer.apply_chat_template(
+        messages,
+        tokenize=False,
+        add_generation_prompt=True
     )
 
-    #
-
+    # Tokenize
+    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+
+    # Generate response with streaming
+    streamer_output = ""
+    with torch.no_grad():
+        outputs = model.generate(
+            **inputs,
+            max_new_tokens=512,
+            temperature=0.7,
+            top_p=0.9,
+            do_sample=True,
+            pad_token_id=tokenizer.eos_token_id
+        )
+
+    # Decode and extract only the new response
+    response = tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)
+
+    return response.strip()
 
 # Create Gradio interface
 demo = gr.ChatInterface(
     fn=chat,
-    title="Llama 3.2 3B Instruct Chatbot
-    description="Chat with Llama 3.2 3B Instruct model
+    title="Llama 3.2 3B Instruct Chatbot",
+    description="Chat with Llama 3.2 3B Instruct model (4-bit quantized). Ask me anything!",
     examples=[
         "What is artificial intelligence?",
         "Write a short poem about coding",
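One loose end in the new code: the comment says "Generate response with streaming" and a streamer_output variable is created, but model.generate runs to completion, streamer_output is never used, and the full reply is returned in one piece. If incremental output were wanted, gr.ChatInterface accepts a generator function and transformers provides TextIteratorStreamer for this. A minimal sketch under those assumptions (the name chat_stream is illustrative and history handling is elided for brevity; it reuses the model and tokenizer loaded above):

from threading import Thread
from transformers import TextIteratorStreamer

def chat_stream(message, history):
    # Build the prompt the same way the committed chat() does
    messages = [{"role": "user", "content": message}]
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # skip_prompt=True yields only newly generated tokens
    streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True
    )
    generation_kwargs = dict(
        **inputs,
        streamer=streamer,
        max_new_tokens=512,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
    )

    # generate() blocks, so run it on a worker thread and consume
    # the streamer on this one
    Thread(target=model.generate, kwargs=generation_kwargs).start()

    partial = ""
    for chunk in streamer:
        partial += chunk
        yield partial  # ChatInterface re-renders the growing reply

Wiring it up would then be the one-line change demo = gr.ChatInterface(fn=chat_stream, ...).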