Files changed (1)
  1. app.py +22 -76
app.py CHANGED
@@ -1,86 +1,32 @@
 import gradio as gr
-from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
-import torch
+from koboldcpp import KoboldCpp
+from huggingface_hub import hf_hub_download
 
-# Configure 4-bit quantization
-quantization_config = BitsAndBytesConfig(
-    load_in_4bit=True,
-    bnb_4bit_compute_dtype=torch.float16,
-    bnb_4bit_use_double_quant=True,
-    bnb_4bit_quant_type="nf4"
-)
+# Download GGUF model
+REPO_ID = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"
+FILENAME = "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf"
+
+model_path = hf_hub_download(repo_id=REPO_ID, filename=FILENAME)
 
-# Load model and tokenizer
-model_name = "DavidAU/Qwen3-Zero-Coder-Reasoning-V2-0.8B"
-print("Loading tokenizer...")
-tokenizer = AutoTokenizer.from_pretrained(model_name)
-print("Loading model...")
-model = AutoModelForCausalLM.from_pretrained(
-    model_name,
-    quantization_config=quantization_config,
-    device_map="auto",
-    low_cpu_mem_usage=True
+# Load KoboldCpp runner
+llm = KoboldCpp(
+    model_path=model_path,
+    context_length=2048,
+    threads=4
 )
-print("Model loaded!")
 
-def chat(message, history):
-    """
-    Process chat messages and generate responses.
-
-    Args:
-        message: Current user message
-        history: List of [user_msg, bot_msg] pairs
-    """
-    # Build conversation with proper Llama format
-    messages = []
-
-    # Add chat history
-    for user_msg, bot_msg in history:
-        messages.append({"role": "user", "content": user_msg})
-        messages.append({"role": "assistant", "content": bot_msg})
-
-    # Add current message
-    messages.append({"role": "user", "content": message})
-
-    # Apply chat template
-    prompt = tokenizer.apply_chat_template(
-        messages,
-        tokenize=False,
-        add_generation_prompt=True
+def chat_fn(message, history):
+    response = llm.generate(
+        prompt=message,
+        max_length=256,
+        temp=0.7,
+        top_p=0.95,
     )
-
-    # Tokenize
-    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
-
-    # Generate response with streaming
-    streamer_output = ""
-    with torch.no_grad():
-        outputs = model.generate(
-            **inputs,
-            max_new_tokens=512,
-            temperature=0.7,
-            top_p=0.9,
-            do_sample=True,
-            pad_token_id=tokenizer.eos_token_id
-        )
-
-    # Decode and extract only the new response
-    response = tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)
-
-    return response.strip()
+    return response
 
-# Create Gradio interface
 demo = gr.ChatInterface(
-    fn=chat,
-    title="Llama 3.2 3B Instruct Chatbot",
-    description="Chat with Llama 3.2 3B Instruct model (4-bit quantized). Ask me anything!",
-    examples=[
-        "What is artificial intelligence?",
-        "Write a short poem about coding",
-        "Explain quantum computing in simple terms"
-    ],
-    theme=gr.themes.Soft()
+    fn=chat_fn,
+    title="GGUF via KoboldCpp",
 )
 
-if __name__ == "__main__":
-    demo.launch()
+demo.launch()
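
Review note: koboldcpp ships as a standalone llama.cpp-based server (typically launched as a binary or via its own script), and as far as I know it does not expose a `KoboldCpp` class importable as `from koboldcpp import KoboldCpp`, so this Space may fail at import time. A minimal in-process sketch of the same idea using the llama-cpp-python package instead; the `Llama` class, `n_ctx`, and `n_threads` are that library's API, not anything in this PR:

    # Sketch, assuming `pip install llama-cpp-python huggingface_hub`.
    from huggingface_hub import hf_hub_download
    from llama_cpp import Llama

    model_path = hf_hub_download(
        repo_id="TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
        filename="tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
    )

    llm = Llama(
        model_path=model_path,  # local GGUF path returned by hf_hub_download
        n_ctx=2048,             # context window, mirroring context_length=2048 above
        n_threads=4,            # CPU threads, mirroring threads=4 above
    )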
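Also worth flagging: the new `chat_fn` ignores its `history` argument and sends the raw message with no chat template, so multi-turn context is lost. TinyLlama-1.1B-Chat-v1.0 was tuned on the Zephyr-style <|system|>/<|user|>/<|assistant|> format; a sketch of folding the history back in, building on the hypothetical `llm` above:

    # Sketch: rebuild the multi-turn prompt in TinyLlama's Zephyr-style format.
    # `history` is Gradio's list of [user_msg, bot_msg] pairs, as in the removed
    # code (newer Gradio versions may pass message dicts instead).
    def chat_fn(message, history):
        prompt = "<|system|>\nYou are a helpful assistant.</s>\n"
        for user_msg, bot_msg in history:
            prompt += f"<|user|>\n{user_msg}</s>\n<|assistant|>\n{bot_msg}</s>\n"
        prompt += f"<|user|>\n{message}</s>\n<|assistant|>\n"

        out = llm(  # llama-cpp-python returns an OpenAI-style completion dict
            prompt,
            max_tokens=256,   # mirrors max_length=256 above
            temperature=0.7,
            top_p=0.95,
            stop=["</s>"],    # stop at end-of-turn so the reply stays on one turn
        )
        return out["choices"][0]["text"].strip()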