YashChowdhary committed
Commit 36ef25a · verified · 1 Parent(s): 604136f

Update app.py


Second change: swapped the LLM to Kutches/UncensoredV2 (2B, GGUF, multiple quant levels). Unrestricted; its Q4_K_M quant is ≈1.4–2.0 GB, which fits comfortably and performs well on CPU Basic.
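To sanity-check the size claim before committing to a quant, you can list the repo's GGUF files and their sizes with huggingface_hub (a sketch; only the repo id comes from this commit, the rest is illustrative):

from huggingface_hub import HfApi

# Sketch: enumerate the repo's GGUF quant files with sizes, to confirm a
# Q4_K_M build exists and fits the ~16 GB RAM of a CPU Basic Space.
info = HfApi().model_info("Kutches/UncensoredV2", files_metadata=True)
for f in info.siblings:
    if f.rfilename.endswith(".gguf"):
        print(f"{f.rfilename}: {(f.size or 0) / 1e9:.2f} GB")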

Files changed (1)
  1. app.py +37 -35
app.py CHANGED
@@ -1,69 +1,71 @@
 
  import gradio as gr
  from llama_cpp import Llama
- import os

- # GGUF model on the Hub (4-bit). You can change to another GGUF filename from the same repo if needed.
- REPO_ID = "Melvin56/Nidum-Llama-3.2-3B-Uncensored-IQ4_XS-GGUF"
- # Use a wildcard or exact filename; IQ4_XS is very light for CPU Basic.
- FILENAME_PATTERN = "*Q4_K_M.gguf"  # fallback wildcard
- N_CTX = int(os.getenv("N_CTX", "4096"))  # context length; keep moderate on free CPU

- # Download and memory-map the model from the Hub
- # This API pulls GGUF directly; no GPU is required.
  llm = Llama.from_pretrained(
      repo_id=REPO_ID,
-     filename=FILENAME_PATTERN,
      n_ctx=N_CTX,
-     n_threads=None,  # let llama-cpp choose based on CPU cores
      verbose=False
  )

- SYSTEM_DEFAULT = "You are a helpful assistant. Keep replies concise."

- def format_messages(messages, system_prompt):
-     # llama.cpp expects a single prompt string. We'll compose a simple chat format.
-     # You can switch to Alpaca-style or other templates if a GGUF requires it.
      sys = system_prompt or SYSTEM_DEFAULT
-     conv = f"<<SYS>>\n{sys}\n<</SYS>>\n"
      for m in messages:
          role = m.get("role", "user")
-         content = m.get("content", "")
          if role == "user":
-             conv += f"[INST] {content.strip()} [/INST]\n"
          elif role == "assistant":
-             conv += f"{content.strip()}\n"
-     return conv.strip()

- def generate_reply(messages, temperature, top_p, max_tokens):
-     prompt = format_messages(messages, system_prompt=None)
-     # Streaming responses with llama.cpp
      stream = llm(
          prompt,
          max_tokens=max_tokens,
          temperature=temperature,
          top_p=top_p,
          stop=["</s>", "[/INST]"],
-         stream=True
      )
-     partial = ""
      for chunk in stream:
-         token = chunk.get("choices", [{}])[0].get("text", "")
-         partial += token
-         yield partial

- with gr.Blocks(title="Nidum-Llama-3.2-3B-Uncensored (CPU)") as demo:
-     gr.Markdown("## Nidum-Llama-3.2-3B-Uncensored (GGUF on CPU Basic)")
      chat = gr.Chatbot(type="messages")
      with gr.Row():
          temp = gr.Slider(0.0, 1.5, 0.7, label="temperature")
          topp = gr.Slider(0.0, 1.0, 0.9, label="top_p")
          max_new = gr.Slider(32, 1024, 384, step=16, label="max_new_tokens")
-     inp = gr.Textbox(placeholder="Ask something...", label="Message")

-     def respond(msg, history, temperature, top_p, max_new_tokens):
-         history = (history or []) + [{"role": "user", "content": msg}]
-         return generate_reply(history, temperature, top_p, max_new_tokens)

-     inp.submit(respond, [inp, chat, temp, topp, max_new], chat)

- demo.queue().launch()
 
+ import os, threading
  import gradio as gr
  from llama_cpp import Llama

+ # ====== MODEL CHOICE (uncensored, 2B, GGUF) ======
+ REPO_ID = "Kutches/UncensoredV2"
+ # Use a 4-bit file for free CPU. Q4_K_M is a great balance of speed/quality.
+ # llama-cpp-python supports glob patterns for filename.
+ FILENAME_PATTERN = "*Q4_K_M*.gguf"  # will match the Q4_K_M file in the repo

+ # ====== RUNTIME SETTINGS (tune for stability on 2 vCPU/16 GB) ======
+ N_CTX = int(os.getenv("N_CTX", "2048"))  # reduce to 2048 for memory headroom
+ N_THREADS = None  # let llama.cpp pick; or set to 2
+ # If you see slow tokenization, you can set N_THREADS=2 explicitly.
+
+ # ====== LOAD MODEL (downloads from the Hub automatically) ======
  llm = Llama.from_pretrained(
      repo_id=REPO_ID,
+     filename=FILENAME_PATTERN,  # glob is supported
      n_ctx=N_CTX,
+     n_threads=N_THREADS,
      verbose=False
  )

+ SYSTEM_DEFAULT = "You are a helpful assistant. Answer clearly and concisely."

+ def build_prompt(messages, system_prompt=None):
+     # Simple instruction-style prompt works reliably with many GGUF finetunes
      sys = system_prompt or SYSTEM_DEFAULT
+     prompt = f"<<SYS>>\n{sys}\n<</SYS>>\n"
      for m in messages:
          role = m.get("role", "user")
+         content = (m.get("content") or "").strip()
          if role == "user":
+             prompt += f"[INST] {content} [/INST]\n"
          elif role == "assistant":
+             prompt += content + "\n"
+     return prompt.strip()

+ def stream_reply(messages, temperature=0.7, top_p=0.9, max_tokens=384):
+     prompt = build_prompt(messages)
      stream = llm(
          prompt,
          max_tokens=max_tokens,
          temperature=temperature,
          top_p=top_p,
          stop=["</s>", "[/INST]"],
+         stream=True,
      )
+     buf = ""
      for chunk in stream:
+         text = chunk.get("choices", [{}])[0].get("text", "")
+         buf += text
+         yield buf

+ with gr.Blocks(title="Uncensored 2B (CPU Free Tier)") as demo:
+     gr.Markdown("### Uncensored 2B on Hugging Face Free Tier (CPU)")
      chat = gr.Chatbot(type="messages")
      with gr.Row():
          temp = gr.Slider(0.0, 1.5, 0.7, label="temperature")
          topp = gr.Slider(0.0, 1.0, 0.9, label="top_p")
          max_new = gr.Slider(32, 1024, 384, step=16, label="max_new_tokens")
+     msg = gr.Textbox(placeholder="Ask anything…", label="Message")

+     def respond(m, history, temperature, top_p, max_new_tokens):
+         history = (history or []) + [{"role": "user", "content": m}]
+         return stream_reply(history, temperature, top_p, max_new_tokens)

+     msg.submit(respond, [msg, chat, temp, topp, max_new], chat)

+ demo.queue().launch()
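Note on the new handler: gr.Chatbot(type="messages") expects updates as lists of {"role", "content"} dicts, but respond returns a generator of plain strings and never echoes the user turn into the chat. A hedged variant (a sketch, not part of this commit) that streams in the messages format:

def respond(m, history, temperature, top_p, max_new_tokens):
    # Echo the user turn, then grow an empty assistant turn as tokens stream in.
    history = (history or []) + [{"role": "user", "content": m}]
    history.append({"role": "assistant", "content": ""})
    for partial in stream_reply(history[:-1], temperature, top_p, max_new_tokens):
        history[-1]["content"] = partial  # stream_reply yields the cumulative text
        yield history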