import os
import gradio as gr
from llama_cpp import Llama

# ====== MODEL CHOICE (uncensored, 2B, GGUF) ======
REPO_ID = "Kutches/UncensoredV2"
# Use a 4-bit file for free CPU. Q4_K_M is a great balance of speed/quality.
# llama-cpp-python supports glob patterns for filename.
FILENAME_PATTERN = "*Q4_K_M*.gguf"  # will match the Q4_K_M file in the repo

# ====== RUNTIME SETTINGS (tune for stability on 2 vCPU/16 GB) ======
N_CTX   = int(os.getenv("N_CTX", "2048"))    # 2048 keeps memory headroom on the free tier
N_THREADS = None                              # Let llama.cpp pick; or set to 2
# If you see slow tokenization, you can set N_THREADS=2 explicitly.

# ====== LOAD MODEL (downloads from the Hub automatically) ======
llm = Llama.from_pretrained(
    repo_id=REPO_ID,
    filename=FILENAME_PATTERN,   # glob is supported
    n_ctx=N_CTX,
    n_threads=N_THREADS,
    verbose=False
)
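
# Note: from_pretrained downloads the GGUF via huggingface_hub and caches it locally,
# so a Space restart reuses the cached file instead of re-downloading.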

SYSTEM_DEFAULT = "You are a helpful assistant. Answer clearly and concisely."

def build_prompt(messages, system_prompt=None):
    # Simple instruction-style prompt works reliably with many GGUF finetunes
    sys = system_prompt or SYSTEM_DEFAULT
    prompt = f"<<SYS>>\n{sys}\n<</SYS>>\n"
    for m in messages:
        role = m.get("role", "user")
        content = (m.get("content") or "").strip()
        if role == "user":
            prompt += f"[INST] {content} [/INST]\n"
        elif role == "assistant":
            prompt += content + "\n"
    return prompt.strip()

def stream_reply(messages, temperature=0.7, top_p=0.9, max_tokens=384):
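    # Stream tokens from llama.cpp and yield the accumulated reply so far,
    # so the UI can render partial output while generation is still running.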
    prompt = build_prompt(messages)
    stream = llm(
        prompt,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        stop=["</s>", "[/INST]"],
        stream=True,
    )
    buf = ""
    for chunk in stream:
        text = chunk.get("choices", [{}])[0].get("text", "")
        buf += text
        yield buf

with gr.Blocks(title="Uncensored 2B (CPU Free Tier)") as demo:
    gr.Markdown("### Uncensored 2B on Hugging Face Free Tier (CPU)")
    chat = gr.Chatbot(type="messages")
    with gr.Row():
        temp = gr.Slider(0.0, 1.5, 0.7, label="temperature")
        topp = gr.Slider(0.0, 1.0, 0.9, label="top_p")
        max_new = gr.Slider(32, 1024, 384, step=16, label="max_new_tokens")
    msg = gr.Textbox(placeholder="Ask anything…", label="Message")

    def respond(m, history, temperature, top_p, max_new_tokens):
        history = (history or []) + [{"role": "user", "content": m}]
        # Chatbot(type="messages") expects a list of role/content dicts, so yield
        # the full history with the assistant reply growing as tokens stream in.
        for partial in stream_reply(history, temperature, top_p, int(max_new_tokens)):
            yield history + [{"role": "assistant", "content": partial}]

    msg.submit(respond, [msg, chat, temp, topp, max_new], chat)

demo.queue().launch()
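
# To test outside the Space, run this file with Python after installing the two
# dependencies it imports: pip install gradio llama-cpp-python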