Spaces: Build error
import os
import gradio as gr
from llama_cpp import Llama

# ====== MODEL CHOICE (uncensored, 2B, GGUF) ======
REPO_ID = "Kutches/UncensoredV2"
# Use a 4-bit file for the free CPU tier; Q4_K_M is a good speed/quality balance.
# llama-cpp-python accepts glob patterns for `filename`.
FILENAME_PATTERN = "*Q4_K_M*.gguf"  # matches the Q4_K_M file in the repo

# ====== RUNTIME SETTINGS (tuned for stability on 2 vCPU / 16 GB) ======
N_CTX = int(os.getenv("N_CTX", "2048"))  # 2048 leaves memory headroom
N_THREADS = None  # let llama.cpp pick; set to 2 explicitly if tokenization is slow

# ====== LOAD MODEL (downloads from the Hub automatically) ======
llm = Llama.from_pretrained(
    repo_id=REPO_ID,
    filename=FILENAME_PATTERN,  # glob is supported
    n_ctx=N_CTX,
    n_threads=N_THREADS,
    verbose=False,
)
SYSTEM_DEFAULT = "You are a helpful assistant. Answer clearly and concisely."

def build_prompt(messages, system_prompt=None):
    # A simple instruction-style prompt works reliably with many GGUF finetunes.
    # (A sample of the resulting prompt text is shown below the file.)
    sys = system_prompt or SYSTEM_DEFAULT
    prompt = f"<<SYS>>\n{sys}\n<</SYS>>\n"
    for m in messages:
        role = m.get("role", "user")
        content = (m.get("content") or "").strip()
        if role == "user":
            prompt += f"[INST] {content} [/INST]\n"
        elif role == "assistant":
            prompt += content + "\n"
    return prompt.strip()
def stream_reply(messages, temperature=0.7, top_p=0.9, max_tokens=384):
    prompt = build_prompt(messages)
    stream = llm(
        prompt,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        stop=["</s>", "[/INST]"],
        stream=True,
    )
    buf = ""
    for chunk in stream:
        # Each streamed chunk follows the OpenAI-style completion shape:
        # {"choices": [{"text": "..."}], ...}
        buf += chunk.get("choices", [{}])[0].get("text", "")
        yield buf
with gr.Blocks(title="Uncensored 2B (CPU Free Tier)") as demo:
    gr.Markdown("### Uncensored 2B on Hugging Face Free Tier (CPU)")
    chat = gr.Chatbot(type="messages")
    with gr.Row():
        temp = gr.Slider(0.0, 1.5, 0.7, label="temperature")
        topp = gr.Slider(0.0, 1.0, 0.9, label="top_p")
        max_new = gr.Slider(32, 1024, 384, step=16, label="max_new_tokens")
    msg = gr.Textbox(placeholder="Ask anything…", label="Message")

    def respond(m, history, temperature, top_p, max_new_tokens):
        # Must itself be a generator so Gradio streams the updates, and a
        # Chatbot with type="messages" expects the full message list on
        # every yield, not a bare string.
        history = (history or []) + [{"role": "user", "content": m}]
        history.append({"role": "assistant", "content": ""})
        for partial in stream_reply(history[:-1], temperature, top_p, max_new_tokens):
            history[-1]["content"] = partial
            yield history

    msg.submit(respond, [msg, chat, temp, topp, max_new], chat).then(
        lambda: "", None, msg  # clear the textbox after sending
    )

demo.queue().launch()
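A "Build error" on a Space is usually a dependency problem rather than an app bug; llama-cpp-python in particular compiles during pip install, which can fail or time out on the free builder. A minimal requirements.txt along these lines is a reasonable starting point — the package list is an assumption about what this app needs, not a verified set of pins:

    # requirements.txt (sketch; unpinned on purpose — pin only if a version fails to build)
    llama-cpp-python
    huggingface_hub   # needed by Llama.from_pretrained to fetch the GGUF
    gradio

If llama-cpp-python itself fails to compile, pinning an older release known to build on the Space's image is the usual workaround.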
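The Space also needs README.md front matter declaring the Gradio SDK; a missing or misdeclared sdk/sdk_version is another common source of build errors. A sketch, with the version as a pure placeholder assumption:

    ---
    title: Uncensored 2B (CPU Free Tier)
    sdk: gradio
    sdk_version: "4.44.0"  # placeholder — match an SDK version the Hub supports
    app_file: app.py
    ---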
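As noted in build_prompt above, here is a quick local check of what the prompt template produces, with made-up messages; no model load is needed:

    # Sanity-check build_prompt without downloading the model.
    msgs = [
        {"role": "user", "content": "Hi"},
        {"role": "assistant", "content": "Hello! How can I help?"},
        {"role": "user", "content": "Summarize llama.cpp in one line."},
    ]
    print(build_prompt(msgs))
    # <<SYS>>
    # You are a helpful assistant. Answer clearly and concisely.
    # <</SYS>>
    # [INST] Hi [/INST]
    # Hello! How can I help?
    # [INST] Summarize llama.cpp in one line. [/INST]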