Reza2kn's picture
Add MiniCPM5-1B browser Space scaffold
80a9976 verified
import os
import time
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
# Hugging Face model repository to serve; can be overridden through the
# MODEL_ID environment variable.
MODEL_ID = os.getenv("MODEL_ID", "openbmb/MiniCPM5-1B-SFT")

# Banner text rendered at the top of the page.
SYSTEM_NOTE = (
    "MiniCPM5-1B is a text-only language model. "
    "Local validation is currently cleanest for English, Chinese, "
    "code snippets with explicit constraints, and tool-planning prompts. "
    "Persian and native Arabic are not marked supported yet."
)

# Example rows, each laid out as [prompt, max_new_tokens, temperature, top_p].
EXAMPLES = [
    [
        "Briefly introduce yourself as a local AI assistant in two sentences.",
        96,
        0.2,
        0.95,
    ],
    [
        "请用中文用三点总结:为什么本地小模型对隐私有帮助?",
        160,
        0.3,
        0.95,
    ],
    [
        "Return only Python code. Write count_jsonl_rows(path) that counts lines in a JSONL file without using json.load.",
        160,
        0.2,
        0.95,
    ],
    [
        "Give exactly two numbered steps to inspect a local README and summarize it safely. Do not say you cannot inspect files; write the tool-use plan.",
        192,
        0.2,
        0.95,
    ],
]

# Lazily initialised by load_model(); both stay None until the first request.
tokenizer = model = None
def load_model():
    """Load the tokenizer and model into the module globals on first use.

    Does nothing when ``model`` is already set. Otherwise the tokenizer and
    the causal LM are fetched for ``MODEL_ID``; with CUDA available the
    weights are requested in float16 with ``device_map="auto"``, and in
    float32 with no device map when it is not. The model is put in eval
    mode before being stored.
    """
    global tokenizer, model
    if model is None:
        tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
        has_cuda = torch.cuda.is_available()
        loaded = AutoModelForCausalLM.from_pretrained(
            MODEL_ID,
            torch_dtype=torch.float16 if has_cuda else torch.float32,
            device_map="auto" if has_cuda else None,
        )
        model = loaded.eval()
def generate(prompt, max_new_tokens, temperature, top_p):
    """Run one chat-style completion and report basic throughput figures.

    Args:
        prompt: User text from the prompt box. ``None``, empty, or
            whitespace-only input is rejected before the model is loaded.
        max_new_tokens: Upper bound on generated tokens (coerced to ``int``).
        temperature: Sampling temperature; values ``<= 0`` select greedy
            decoding.
        top_p: Nucleus-sampling threshold, only forwarded when sampling.

    Returns:
        A ``(text, metrics)`` tuple: the reply text and a one-line summary
        containing the new-token count, tokens per second, elapsed seconds
        and the model id.
    """
    if not prompt or not prompt.strip():
        return "Enter a prompt first.", ""
    load_model()
    # perf_counter is monotonic, so the interval cannot be skewed by
    # wall-clock adjustments.
    start = time.perf_counter()
    rendered = tokenizer.apply_chat_template(
        [
            {
                "role": "system",
                "content": "Answer directly and concisely. Do not include hidden reasoning or thinking process text.",
            },
            {"role": "user", "content": prompt},
        ],
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False,
    )
    # The rendered template already carries whatever special tokens it needs;
    # letting the tokenizer add its own again could duplicate them.
    inputs = tokenizer(rendered, return_tensors="pt", add_special_tokens=False)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    prompt_len = inputs["input_ids"].shape[-1]
    do_sample = float(temperature) > 0
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=int(max_new_tokens),
            # Sampling knobs are passed as None for greedy decoding.
            temperature=float(temperature) if do_sample else None,
            top_p=float(top_p) if do_sample else None,
            do_sample=do_sample,
            pad_token_id=tokenizer.eos_token_id if tokenizer.eos_token_id is not None else tokenizer.pad_token_id,
        )
    # Decode only the continuation. Matching the prompt as a substring of the
    # decoded text is unreliable because skip_special_tokens removes the
    # template's control tokens, and a "</think>" typed by the user would
    # otherwise be mistaken for the model's own marker.
    generated_ids = output_ids[0][prompt_len:]
    text = tokenizer.decode(generated_ids, skip_special_tokens=True)
    if "</think>" in text:
        # Drop any reasoning preamble the model emitted despite the request.
        text = text.split("</think>", 1)[1]
    text = text.strip()
    new_tokens = int(generated_ids.shape[-1])
    # Clamp to avoid a zero division in the tok/s figure.
    elapsed = max(time.perf_counter() - start, 1e-6)
    metrics = f"{new_tokens} new tokens | {new_tokens / elapsed:.2f} tok/s | {elapsed:.2f}s | model: {MODEL_ID}"
    return text, metrics
# Custom stylesheet passed to gr.Blocks(css=...); it styles the "status-box"
# banner that shows the validation note and the runtime model id.
css = """
.status-box {
border: 1px solid #d8dee8;
border-radius: 8px;
padding: 12px 14px;
background: #f8fafc;
color: #263244;
}
.status-box strong {
color: #101827;
}
"""
# Page layout: title and status banner on top, then a row holding the prompt
# column and the sampling-controls column, followed by the output boxes and
# the clickable examples.
with gr.Blocks(title="MiniCPM5-1B Chat", theme=gr.themes.Soft(), css=css) as demo:
    gr.Markdown("# MiniCPM5-1B Chat")
    gr.HTML(
        f"<div class='status-box'><strong>Validation status:</strong> {SYSTEM_NOTE}<br><strong>Runtime model:</strong> {MODEL_ID}</div>"
    )

    with gr.Row():
        with gr.Column(scale=3):
            prompt = gr.Textbox(label="Prompt", lines=8, value=EXAMPLES[0][0])
            run = gr.Button("Generate", variant="primary")
        with gr.Column(scale=1):
            max_new_tokens = gr.Slider(
                minimum=16, maximum=512, value=128, step=1, label="Max new tokens"
            )
            temperature = gr.Slider(
                minimum=0, maximum=1.5, value=0.2, step=0.05, label="Temperature"
            )
            top_p = gr.Slider(
                minimum=0.1, maximum=1.0, value=0.9, step=0.05, label="Top-p"
            )

    output = gr.Textbox(label="Output", lines=14)
    metrics = gr.Textbox(label="Run metrics", interactive=False)

    # The same four components feed both the example rows and the handler.
    generation_inputs = [prompt, max_new_tokens, temperature, top_p]
    gr.Examples(EXAMPLES, inputs=generation_inputs)
    run.click(generate, inputs=generation_inputs, outputs=[output, metrics])

if __name__ == "__main__":
    demo.launch()