Spaces:

Reza2kn
/

MiniCPM5-1B-WebGPU

Running

App Files Files Community

MiniCPM5-1B-WebGPU / app.py

Reza2kn

Add MiniCPM5-1B browser Space scaffold

80a9976 verified about 24 hours ago

raw

history blame contribute delete

4.28 kB

	import os
	import time

	import gradio as gr
	import torch
	from transformers import AutoModelForCausalLM, AutoTokenizer


	# Hub repo id of the checkpoint to serve; the MODEL_ID environment variable overrides the default.
	MODEL_ID = os.environ.get("MODEL_ID", "openbmb/MiniCPM5-1B-SFT")

	# Human-readable validation note shown in the status banner at the top of the page.
	SYSTEM_NOTE = (
	    "MiniCPM5-1B is a text-only language model. "
	    "Local validation is currently cleanest for English, Chinese, code snippets with explicit constraints, "
	    "and tool-planning prompts. "
	    "Persian and native Arabic are not marked supported yet."
	)

	# Rows for gr.Examples; each row is [prompt, max_new_tokens, temperature, top_p].
	EXAMPLES = [
	    [
	        "Briefly introduce yourself as a local AI assistant in two sentences.",
	        96,
	        0.2,
	        0.95,
	    ],
	    [
	        "请用中文用三点总结：为什么本地小模型对隐私有帮助？",
	        160,
	        0.3,
	        0.95,
	    ],
	    [
	        "Return only Python code. Write count_jsonl_rows(path) that counts lines in a JSONL file without using json.load.",
	        160,
	        0.2,
	        0.95,
	    ],
	    [
	        "Give exactly two numbered steps to inspect a local README and summarize it safely. Do not say you cannot inspect files; write the tool-use plan.",
	        192,
	        0.2,
	        0.95,
	    ],
	]


	# Lazily-initialised module globals: both stay None until load_model() assigns them.
	tokenizer = model = None


	def load_model():
	    """Populate the module-level ``tokenizer`` and ``model`` on first use.

	    Does nothing when ``model`` is already set. Otherwise loads the tokenizer
	    and the causal-LM weights for ``MODEL_ID``: float16 with
	    ``device_map="auto"`` when CUDA is available, float32 with no device map
	    otherwise. ``.eval()`` is called on the loaded model before it is stored.
	    """
	    global tokenizer, model
	    if model is None:
	        tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
	        if torch.cuda.is_available():
	            load_kwargs = {"torch_dtype": torch.float16, "device_map": "auto"}
	        else:
	            load_kwargs = {"torch_dtype": torch.float32, "device_map": None}
	        loaded = AutoModelForCausalLM.from_pretrained(MODEL_ID, **load_kwargs)
	        model = loaded.eval()


	def generate(prompt, max_new_tokens, temperature, top_p):
	    """Generate a reply to *prompt* and summarise the run.

	    Args:
	        prompt: User text. ``None``, empty, or whitespace-only input returns
	            a hint instead of running the model.
	        max_new_tokens: Cap on the number of generated tokens (coerced to int).
	        temperature: Sampling temperature; a value of 0 disables sampling.
	        top_p: Nucleus-sampling threshold; only passed on when sampling.

	    Returns:
	        A ``(text, metrics)`` tuple: the decoded completion, and a one-line
	        summary with new-token count, tokens per second, elapsed seconds and
	        the model id.
	    """
	    if not prompt or not prompt.strip():
	        return "Enter a prompt first.", ""
	    load_model()
	    # perf_counter is monotonic, so the interval cannot be skewed by clock adjustments.
	    start = time.perf_counter()
	    rendered = tokenizer.apply_chat_template(
	        [
	            {
	                "role": "system",
	                "content": "Answer directly and concisely. Do not include hidden reasoning or thinking process text.",
	            },
	            {"role": "user", "content": prompt},
	        ],
	        tokenize=False,
	        add_generation_prompt=True,
	        enable_thinking=False,
	    )
	    # The rendered template already carries its own special tokens; do not let
	    # the tokenizer add a second set when encoding it.
	    inputs = tokenizer(rendered, return_tensors="pt", add_special_tokens=False)
	    inputs = {k: v.to(model.device) for k, v in inputs.items()}
	    prompt_len = inputs["input_ids"].shape[-1]
	    temperature = float(temperature)
	    do_sample = temperature > 0
	    with torch.no_grad():
	        output_ids = model.generate(
	            **inputs,
	            max_new_tokens=int(max_new_tokens),
	            temperature=temperature if do_sample else None,
	            top_p=float(top_p) if do_sample else None,
	            do_sample=do_sample,
	            pad_token_id=tokenizer.eos_token_id if tokenizer.eos_token_id is not None else tokenizer.pad_token_id,
	        )
	    # Decode only the tokens produced after the prompt. Decoding the whole
	    # sequence and then searching for `rendered` fails because
	    # skip_special_tokens removes the template markers `rendered` contains.
	    completion_ids = output_ids[0][prompt_len:]
	    text = tokenizer.decode(completion_ids, skip_special_tokens=True)
	    # If the model still emitted a reasoning section, keep only what follows it.
	    _, closing_tag, after_think = text.partition("</think>")
	    if closing_tag:
	        text = after_think
	    text = text.strip()
	    new_tokens = int(completion_ids.shape[-1])
	    elapsed = max(time.perf_counter() - start, 1e-6)
	    metrics = f"{new_tokens} new tokens | {new_tokens / elapsed:.2f} tok/s | {elapsed:.2f}s | model: {MODEL_ID}"
	    return text, metrics


	# Custom stylesheet passed to gr.Blocks(css=...); it styles the "status-box"
	# banner that the page renders through gr.HTML.
	css = """
	.status-box {
	border: 1px solid #d8dee8;
	border-radius: 8px;
	padding: 12px 14px;
	background: #f8fafc;
	color: #263244;
	}
	.status-box strong {
	color: #101827;
	}
	"""


	# Page definition: title, status banner, prompt/controls row, result boxes,
	# clickable examples, and the button wiring to generate().
	# NOTE(review): nesting was reconstructed from a whitespace-stripped copy of
	# the file — confirm the placement of the button and the output boxes.
	with gr.Blocks(title="MiniCPM5-1B Chat", theme=gr.themes.Soft(), css=css) as demo:
	    gr.Markdown("# MiniCPM5-1B Chat")
	    status_html = (
	        "<div class='status-box'>"
	        f"<strong>Validation status:</strong> {SYSTEM_NOTE}"
	        f"<br><strong>Runtime model:</strong> {MODEL_ID}"
	        "</div>"
	    )
	    gr.HTML(status_html)
	    with gr.Row():
	        # Wider column: the prompt box and the submit button.
	        with gr.Column(scale=3):
	            prompt = gr.Textbox(label="Prompt", lines=8, value=EXAMPLES[0][0])
	            run = gr.Button("Generate", variant="primary")
	        # Narrow column: the generation settings.
	        with gr.Column(scale=1):
	            max_new_tokens = gr.Slider(minimum=16, maximum=512, value=128, step=1, label="Max new tokens")
	            temperature = gr.Slider(minimum=0, maximum=1.5, value=0.2, step=0.05, label="Temperature")
	            top_p = gr.Slider(minimum=0.1, maximum=1.0, value=0.9, step=0.05, label="Top-p")
	    output = gr.Textbox(label="Output", lines=14)
	    metrics = gr.Textbox(label="Run metrics", interactive=False)
	    # Same component order as generate()'s parameters and as each EXAMPLES row.
	    generation_inputs = [prompt, max_new_tokens, temperature, top_p]
	    gr.Examples(examples=EXAMPLES, inputs=generation_inputs)
	    run.click(fn=generate, inputs=generation_inputs, outputs=[output, metrics])


	# Launch the app only when this file is run as a script, not when it is imported.
	if __name__ == "__main__":
	    demo.launch()