import os
import gradio as gr
from llama_cpp import Llama
# ====== MODEL CHOICE (uncensored, 2B, GGUF) ======
REPO_ID = "Kutches/UncensoredV2"
# Use a 4-bit file for free CPU. Q4_K_M is a great balance of speed/quality.
# llama-cpp-python supports glob patterns for filename.
FILENAME_PATTERN = "*Q4_K_M*.gguf" # will match the Q4_K_M file in the repo
# ====== RUNTIME SETTINGS (tune for stability on 2 vCPU/16 GB) ======
N_CTX = int(os.getenv("N_CTX", "2048"))  # 2048 leaves memory headroom on the free tier
N_THREADS = None # Let llama.cpp pick; or set to 2
# If you see slow tokenization, you can set N_THREADS=2 explicitly.
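# For example, mirroring the N_CTX env-var convention above (this env var is
# our own convention, not something llama.cpp reads itself):
# N_THREADS = int(os.getenv("N_THREADS", "2"))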
# ====== LOAD MODEL (downloads from the Hub automatically) ======
llm = Llama.from_pretrained(
    repo_id=REPO_ID,
    filename=FILENAME_PATTERN,  # glob is supported
    n_ctx=N_CTX,
    n_threads=N_THREADS,
    verbose=False,
)
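# Optional sanity check: print which GGUF file the glob actually resolved to.
# (Assumes model_path is exposed on the Llama object, as in recent
# llama-cpp-python releases.)
# print("Loaded GGUF:", llm.model_path)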
SYSTEM_DEFAULT = "You are a helpful assistant. Answer clearly and concisely."
def build_prompt(messages, system_prompt=None):
    # Simple instruction-style prompt works reliably with many GGUF finetunes
    sys = system_prompt or SYSTEM_DEFAULT
    prompt = f"<<SYS>>\n{sys}\n<</SYS>>\n"
    for m in messages:
        role = m.get("role", "user")
        content = (m.get("content") or "").strip()
        if role == "user":
            prompt += f"[INST] {content} [/INST]\n"
        elif role == "assistant":
            prompt += content + "\n"
    return prompt.strip()
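# Illustrative example: with the default system prompt,
# build_prompt([{"role": "user", "content": "Hi"}]) returns:
#   <<SYS>>
#   You are a helpful assistant. Answer clearly and concisely.
#   <</SYS>>
#   [INST] Hi [/INST]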
def stream_reply(messages, temperature=0.7, top_p=0.9, max_tokens=384):
    prompt = build_prompt(messages)
    stream = llm(
        prompt,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        stop=["</s>", "[/INST]"],
        stream=True,
    )
    buf = ""
    for chunk in stream:
        text = chunk.get("choices", [{}])[0].get("text", "")
        buf += text
        yield buf
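# A minimal non-streaming helper for quick testing outside the UI.
# (complete_reply is our own name, not part of llama-cpp-python or Gradio.)
def complete_reply(messages, **kwargs):
    reply = ""
    for reply in stream_reply(messages, **kwargs):
        pass  # stream_reply yields cumulative text, so the last value is the full reply
    return reply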
with gr.Blocks(title="Uncensored 2B (CPU Free Tier)") as demo:
    gr.Markdown("### Uncensored 2B on Hugging Face Free Tier (CPU)")
    chat = gr.Chatbot(type="messages")
    with gr.Row():
        temp = gr.Slider(0.0, 1.5, 0.7, label="temperature")
        topp = gr.Slider(0.0, 1.0, 0.9, label="top_p")
        max_new = gr.Slider(32, 1024, 384, step=16, label="max_new_tokens")
    msg = gr.Textbox(placeholder="Ask anything…", label="Message")

    def respond(m, history, temperature, top_p, max_new_tokens):
        # The Chatbot is in "messages" mode, so each streamed update must be a
        # full list of {"role", "content"} dicts, not a bare string.
        history = (history or []) + [{"role": "user", "content": m}]
        for partial in stream_reply(history, temperature, top_p, max_new_tokens):
            yield history + [{"role": "assistant", "content": partial}]

    msg.submit(respond, [msg, chat, temp, topp, max_new], chat)

demo.queue().launch()
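# To run locally (assumed minimal dependencies; pin versions as needed):
#   pip install gradio llama-cpp-python huggingface_hub
#   python app.py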