Spaces:
Build error
Build error
File size: 2,649 Bytes
36ef25a 50b020a 36ef25a 50b020a 36ef25a 50b020a 36ef25a 50b020a 36ef25a 50b020a 36ef25a 50b020a 36ef25a 50b020a 36ef25a 50b020a 36ef25a 50b020a 36ef25a 50b020a 36ef25a 50b020a 36ef25a 50b020a 36ef25a 50b020a 36ef25a 50b020a 36ef25a 50b020a 36ef25a 50b020a 36ef25a 50b020a 36ef25a 50b020a 36ef25a 50b020a 36ef25a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 |
import os, threading
import gradio as gr
from llama_cpp import Llama
# ====== MODEL CHOICE (uncensored, 2B, GGUF) ======
# Repo on the Hugging Face Hub that hosts the GGUF quantizations.
REPO_ID = "Kutches/UncensoredV2"
# Use a 4-bit file for free CPU. Q4_K_M is a great balance of speed/quality.
# llama-cpp-python supports glob patterns for filename.
FILENAME_PATTERN = "*Q4_K_M*.gguf" # will match the Q4_K_M file in the repo
# ====== RUNTIME SETTINGS (tune for stability on 2 vCPU/16 GB) ======
# Context window; overridable via the N_CTX env var. 2048 keeps memory headroom.
N_CTX = int(os.getenv("N_CTX", "2048")) # reduce to 2048 for memory headroom
# None lets llama.cpp pick the thread count automatically.
N_THREADS = None # Let llama.cpp pick; or set to 2
# If you see slow tokenization, you can set N_THREADS=2 explicitly.
# ====== LOAD MODEL (downloads from the Hub automatically) ======
# NOTE: this runs at import time and may download several hundred MB on first start.
llm = Llama.from_pretrained(
    repo_id=REPO_ID,
    filename=FILENAME_PATTERN, # glob is supported
    n_ctx=N_CTX,
    n_threads=N_THREADS,
    verbose=False
)
# System prompt used when the caller does not supply one (see build_prompt).
SYSTEM_DEFAULT = "You are a helpful assistant. Answer clearly and concisely."
def build_prompt(messages, system_prompt=None):
    """Render a chat history into a flat [INST]-style prompt string.

    Args:
        messages: list of {"role": ..., "content": ...} dicts; roles other
            than "user"/"assistant" are silently skipped.
        system_prompt: optional system text; falls back to SYSTEM_DEFAULT.

    Returns:
        The assembled prompt with surrounding whitespace stripped.
    """
    # Simple instruction-style prompt works reliably with many GGUF finetunes.
    parts = [f"<<SYS>>\n{system_prompt or SYSTEM_DEFAULT}\n<</SYS>>\n"]
    for turn in messages:
        text = (turn.get("content") or "").strip()
        role = turn.get("role", "user")
        if role == "user":
            parts.append(f"[INST] {text} [/INST]\n")
        elif role == "assistant":
            parts.append(text + "\n")
    return "".join(parts).strip()
def stream_reply(messages, temperature=0.7, top_p=0.9, max_tokens=384):
    """Stream the model's reply, yielding the cumulative text after each chunk.

    Args:
        messages: chat history in {"role", "content"} dict form.
        temperature / top_p / max_tokens: sampling controls forwarded to llama.cpp.

    Yields:
        The reply text accumulated so far (a growing string, not deltas).
    """
    completion = llm(
        build_prompt(messages),
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        stop=["</s>", "[/INST]"],
        stream=True,
    )
    pieces = []
    for chunk in completion:
        choices = chunk.get("choices", [{}])
        pieces.append(choices[0].get("text", ""))
        yield "".join(pieces)
with gr.Blocks(title="Uncensored 2B (CPU Free Tier)") as demo:
    gr.Markdown("### Uncensored 2B on Hugging Face Free Tier (CPU)")
    chat = gr.Chatbot(type="messages")
    with gr.Row():
        temp = gr.Slider(0.0, 1.5, 0.7, label="temperature")
        topp = gr.Slider(0.0, 1.0, 0.9, label="top_p")
        max_new = gr.Slider(32, 1024, 384, step=16, label="max_new_tokens")
    msg = gr.Textbox(placeholder="Ask anything…", label="Message")

    def respond(m, history, temperature, top_p, max_new_tokens):
        """Append the user turn, then stream the assistant turn into the Chatbot.

        BUG FIX: the original returned stream_reply's raw string generator,
        but a Chatbot with type="messages" expects a full list of
        {"role", "content"} dicts on every update — so the reply could not
        render and the user's own message disappeared from the history.
        Each partial reply is now wrapped back into the message list.
        """
        history = (history or []) + [{"role": "user", "content": m}]
        for partial in stream_reply(history, temperature, top_p, max_new_tokens):
            yield history + [{"role": "assistant", "content": partial}]

    msg.submit(respond, [msg, chat, temp, topp, max_new], chat)

# BUG FIX: removed a stray trailing "|" after launch() that was a syntax error.
demo.queue().launch()