import os
import gradio as gr
from llama_cpp import Llama

# ====== MODEL CHOICE (uncensored, 2B, GGUF) ======
REPO_ID = "Kutches/UncensoredV2"
# Use a 4-bit file for free CPU. Q4_K_M is a good balance of speed and quality.
# llama-cpp-python supports glob patterns for the filename.
FILENAME_PATTERN = "*Q4_K_M*.gguf"  # matches the Q4_K_M file in the repo

# ====== RUNTIME SETTINGS (tune for stability on 2 vCPU / 16 GB) ======
N_CTX = int(os.getenv("N_CTX", "2048"))  # 2048 keeps memory headroom on the free tier
N_THREADS = None  # let llama.cpp pick; set to 2 explicitly if tokenization feels slow

# ====== LOAD MODEL (downloads from the Hub automatically) ======
llm = Llama.from_pretrained(
    repo_id=REPO_ID,
    filename=FILENAME_PATTERN,  # glob is supported
    n_ctx=N_CTX,
    n_threads=N_THREADS,
    verbose=False,
)

SYSTEM_DEFAULT = "You are a helpful assistant. Answer clearly and concisely."


def build_prompt(messages, system_prompt=None):
    # A simple Llama-2-style instruction prompt works reliably with many GGUF finetunes.
    sys = system_prompt or SYSTEM_DEFAULT
    prompt = f"<<SYS>>\n{sys}\n<</SYS>>\n"
    for m in messages:
        role = m.get("role", "user")
        content = (m.get("content") or "").strip()
        if role == "user":
            prompt += f"[INST] {content} [/INST]\n"
        elif role == "assistant":
            prompt += content + "\n"
    return prompt.strip()


def stream_reply(messages, temperature=0.7, top_p=0.9, max_tokens=384):
    prompt = build_prompt(messages)
    stream = llm(
        prompt,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        stop=["</s>", "[/INST]"],
        stream=True,
    )
    buf = ""
    for chunk in stream:
        text = chunk.get("choices", [{}])[0].get("text", "")
        buf += text
        yield buf


with gr.Blocks(title="Uncensored 2B (CPU Free Tier)") as demo:
    gr.Markdown("### Uncensored 2B on Hugging Face Free Tier (CPU)")
    chat = gr.Chatbot(type="messages")
    with gr.Row():
        temp = gr.Slider(0.0, 1.5, 0.7, label="temperature")
        topp = gr.Slider(0.0, 1.0, 0.9, label="top_p")
        max_new = gr.Slider(32, 1024, 384, step=16, label="max_new_tokens")
    msg = gr.Textbox(placeholder="Ask anything…", label="Message")

    def respond(m, history, temperature, top_p, max_new_tokens):
        # Chatbot(type="messages") expects the full message list, so append the
        # user turn and yield it together with the partially streamed assistant reply.
        history = (history or []) + [{"role": "user", "content": m}]
        for partial in stream_reply(history, temperature, top_p, int(max_new_tokens)):
            yield history + [{"role": "assistant", "content": partial}]

    msg.submit(respond, [msg, chat, temp, topp, max_new], chat)

demo.queue().launch()
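
# ---------------------------------------------------------------------------
# Deployment note (a sketch, not part of the script above): on a Hugging Face
# Space this file would typically be saved as app.py with a requirements.txt
# alongside it. Based on the imports used here, that file would list roughly:
#
#   gradio
#   llama-cpp-python
#   huggingface_hub   # needed by Llama.from_pretrained for the Hub download
#
# Exact version pins depend on the Space's base image and are assumptions;
# adjust them to whatever builds cleanly on the free CPU runtime.
# ---------------------------------------------------------------------------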