Files changed (1)
  1. app.py +22 -76
app.py CHANGED
@@ -1,86 +1,32 @@
 import gradio as gr
-from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
-import torch
+from koboldcpp import KoboldCpp
+from huggingface_hub import hf_hub_download
 
-# Configure 4-bit quantization
-quantization_config = BitsAndBytesConfig(
-    load_in_4bit=True,
-    bnb_4bit_compute_dtype=torch.float16,
-    bnb_4bit_use_double_quant=True,
-    bnb_4bit_quant_type="nf4"
-)
+# Download GGUF model
+REPO_ID = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"
+FILENAME = "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf"
+
+model_path = hf_hub_download(repo_id=REPO_ID, filename=FILENAME)
 
-# Load model and tokenizer
-model_name = "DavidAU/Qwen3-Zero-Coder-Reasoning-V2-0.8B"
-print("Loading tokenizer...")
-tokenizer = AutoTokenizer.from_pretrained(model_name)
-print("Loading model...")
-model = AutoModelForCausalLM.from_pretrained(
-    model_name,
-    quantization_config=quantization_config,
-    device_map="auto",
-    low_cpu_mem_usage=True
+# Load KoboldCpp runner
+llm = KoboldCpp(
+    model_path=model_path,
+    context_length=2048,
+    threads=4
 )
-print("Model loaded!")
 
-def chat(message, history):
-    """
-    Process chat messages and generate responses.
-
-    Args:
-        message: Current user message
-        history: List of [user_msg, bot_msg] pairs
-    """
-    # Build conversation with proper Llama format
-    messages = []
-
-    # Add chat history
-    for user_msg, bot_msg in history:
-        messages.append({"role": "user", "content": user_msg})
-        messages.append({"role": "assistant", "content": bot_msg})
-
-    # Add current message
-    messages.append({"role": "user", "content": message})
-
-    # Apply chat template
-    prompt = tokenizer.apply_chat_template(
-        messages,
-        tokenize=False,
-        add_generation_prompt=True
+def chat_fn(message, history):
+    response = llm.generate(
+        prompt=message,
+        max_length=256,
+        temp=0.7,
+        top_p=0.95,
     )
-
-    # Tokenize
-    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
-
-    # Generate response with streaming
-    streamer_output = ""
-    with torch.no_grad():
-        outputs = model.generate(
-            **inputs,
-            max_new_tokens=512,
-            temperature=0.7,
-            top_p=0.9,
-            do_sample=True,
-            pad_token_id=tokenizer.eos_token_id
-        )
-
-    # Decode and extract only the new response
-    response = tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)
-
-    return response.strip()
+    return response
 
-# Create Gradio interface
 demo = gr.ChatInterface(
-    fn=chat,
-    title="Llama 3.2 3B Instruct Chatbot",
-    description="Chat with Llama 3.2 3B Instruct model (4-bit quantized). Ask me anything!",
-    examples=[
-        "What is artificial intelligence?",
-        "Write a short poem about coding",
-        "Explain quantum computing in simple terms"
-    ],
-    theme=gr.themes.Soft()
+    fn=chat_fn,
+    title="GGUF via KoboldCpp",
 )
 
-if __name__ == "__main__":
-    demo.launch()
+demo.launch()
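
Review note: koboldcpp ships as a standalone llama.cpp-based server (typically launched as a binary or via its own script), and as far as I know it does not expose a `KoboldCpp` class importable as `from koboldcpp import KoboldCpp`, so this Space may fail at import time. A minimal in-process sketch of the same idea using the llama-cpp-python package instead; the `Llama` class, `n_ctx`, and `n_threads` are that library's API, not anything in this PR:

    # Sketch, assuming `pip install llama-cpp-python huggingface_hub`.
    from huggingface_hub import hf_hub_download
    from llama_cpp import Llama

    model_path = hf_hub_download(
        repo_id="TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
        filename="tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
    )

    llm = Llama(
        model_path=model_path,  # local GGUF path returned by hf_hub_download
        n_ctx=2048,             # context window, mirroring context_length=2048 above
        n_threads=4,            # CPU threads, mirroring threads=4 above
    )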
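Also worth flagging: the new `chat_fn` ignores its `history` argument and sends the raw message with no chat template, so multi-turn context is lost. TinyLlama-1.1B-Chat-v1.0 was tuned on the Zephyr-style <|system|>/<|user|>/<|assistant|> format; a sketch of folding the history back in, building on the hypothetical `llm` above:

    # Sketch: rebuild the multi-turn prompt in TinyLlama's Zephyr-style format.
    # `history` is Gradio's list of [user_msg, bot_msg] pairs, as in the removed
    # code (newer Gradio versions may pass message dicts instead).
    def chat_fn(message, history):
        prompt = "<|system|>\nYou are a helpful assistant.</s>\n"
        for user_msg, bot_msg in history:
            prompt += f"<|user|>\n{user_msg}</s>\n<|assistant|>\n{bot_msg}</s>\n"
        prompt += f"<|user|>\n{message}</s>\n<|assistant|>\n"

        out = llm(  # llama-cpp-python returns an OpenAI-style completion dict
            prompt,
            max_tokens=256,   # mirrors max_length=256 above
            temperature=0.7,
            top_p=0.95,
            stop=["</s>"],    # stop at end-of-turn so the reply stays on one turn
        )
        return out["choices"][0]["text"].strip()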