import os
import gradio as gr
from llama_cpp import Llama
# ====== MODEL CHOICE (uncensored, 2B, GGUF) ======
REPO_ID = "Kutches/UncensoredV2"
# Use a 4-bit file for free CPU. Q4_K_M is a great balance of speed/quality.
# llama-cpp-python supports glob patterns for filename.
FILENAME_PATTERN = "*Q4_K_M*.gguf" # will match the Q4_K_M file in the repo
# ====== RUNTIME SETTINGS (tune for stability on 2 vCPU/16 GB) ======
N_CTX = int(os.getenv("N_CTX", "2048"))  # 2048 leaves memory headroom on the free tier
N_THREADS = None # Let llama.cpp pick; or set to 2
# If you see slow tokenization, you can set N_THREADS=2 explicitly.
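# For example, mirroring the N_CTX env-var convention above (this env var is
# our own convention, not something llama.cpp reads itself):
# N_THREADS = int(os.getenv("N_THREADS", "2"))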
# ====== LOAD MODEL (downloads from the Hub automatically) ======
llm = Llama.from_pretrained(
    repo_id=REPO_ID,
    filename=FILENAME_PATTERN,  # glob is supported
    n_ctx=N_CTX,
    n_threads=N_THREADS,
    verbose=False,
)
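# Optional sanity check: print which GGUF file the glob actually resolved to.
# (Assumes model_path is exposed on the Llama object, as in recent
# llama-cpp-python releases.)
# print("Loaded GGUF:", llm.model_path)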
SYSTEM_DEFAULT = "You are a helpful assistant. Answer clearly and concisely."
def build_prompt(messages, system_prompt=None):
    # Simple instruction-style prompt works reliably with many GGUF finetunes
    sys = system_prompt or SYSTEM_DEFAULT
    prompt = f"<<SYS>>\n{sys}\n<</SYS>>\n"
    for m in messages:
        role = m.get("role", "user")
        content = (m.get("content") or "").strip()
        if role == "user":
            prompt += f"[INST] {content} [/INST]\n"
        elif role == "assistant":
            prompt += content + "\n"
    return prompt.strip()
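# Illustrative example: with the default system prompt,
# build_prompt([{"role": "user", "content": "Hi"}]) returns:
#   <<SYS>>
#   You are a helpful assistant. Answer clearly and concisely.
#   <</SYS>>
#   [INST] Hi [/INST]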
def stream_reply(messages, temperature=0.7, top_p=0.9, max_tokens=384):
    prompt = build_prompt(messages)
    stream = llm(
        prompt,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        stop=["</s>", "[/INST]"],
        stream=True,
    )
    buf = ""
    for chunk in stream:
        text = chunk.get("choices", [{}])[0].get("text", "")
        buf += text
        yield buf
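# A minimal non-streaming helper for quick testing outside the UI.
# (complete_reply is our own name, not part of llama-cpp-python or Gradio.)
def complete_reply(messages, **kwargs):
    reply = ""
    for reply in stream_reply(messages, **kwargs):
        pass  # stream_reply yields cumulative text, so the last value is the full reply
    return reply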
with gr.Blocks(title="Uncensored 2B (CPU Free Tier)") as demo:
    gr.Markdown("### Uncensored 2B on Hugging Face Free Tier (CPU)")
    chat = gr.Chatbot(type="messages")
    with gr.Row():
        temp = gr.Slider(0.0, 1.5, 0.7, label="temperature")
        topp = gr.Slider(0.0, 1.0, 0.9, label="top_p")
        max_new = gr.Slider(32, 1024, 384, step=16, label="max_new_tokens")
    msg = gr.Textbox(placeholder="Ask anything…", label="Message")

    def respond(m, history, temperature, top_p, max_new_tokens):
        # The Chatbot is in "messages" mode, so each streamed update must be a
        # full list of {"role", "content"} dicts, not a bare string.
        history = (history or []) + [{"role": "user", "content": m}]
        for partial in stream_reply(history, temperature, top_p, max_new_tokens):
            yield history + [{"role": "assistant", "content": partial}]

    msg.submit(respond, [msg, chat, temp, topp, max_new], chat)

demo.queue().launch()
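# To run locally (assumed minimal dependencies; pin versions as needed):
#   pip install gradio llama-cpp-python huggingface_hub
#   python app.py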