import os import time import gradio as gr import torch from transformers import AutoModelForCausalLM, AutoTokenizer MODEL_ID = os.environ.get("MODEL_ID", "openbmb/MiniCPM5-1B-SFT") SYSTEM_NOTE = ( "MiniCPM5-1B is a text-only language model. Local validation is currently cleanest for English, Chinese, " "code snippets with explicit constraints, and tool-planning prompts. Persian and native Arabic are not marked supported yet." ) EXAMPLES = [ ["Briefly introduce yourself as a local AI assistant in two sentences.", 96, 0.2, 0.95], ["请用中文用三点总结:为什么本地小模型对隐私有帮助?", 160, 0.3, 0.95], ["Return only Python code. Write count_jsonl_rows(path) that counts lines in a JSONL file without using json.load.", 160, 0.2, 0.95], ["Give exactly two numbered steps to inspect a local README and summarize it safely. Do not say you cannot inspect files; write the tool-use plan.", 192, 0.2, 0.95], ] tokenizer = None model = None def load_model(): global tokenizer, model if model is not None: return tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) dtype = torch.float16 if torch.cuda.is_available() else torch.float32 model = AutoModelForCausalLM.from_pretrained( MODEL_ID, torch_dtype=dtype, device_map="auto" if torch.cuda.is_available() else None, ).eval() def generate(prompt, max_new_tokens, temperature, top_p): if not prompt.strip(): return "Enter a prompt first.", "" load_model() start = time.time() rendered = tokenizer.apply_chat_template( [ { "role": "system", "content": "Answer directly and concisely. Do not include hidden reasoning or thinking process text.", }, {"role": "user", "content": prompt}, ], tokenize=False, add_generation_prompt=True, enable_thinking=False, ) inputs = tokenizer(rendered, return_tensors="pt") inputs = {k: v.to(model.device) for k, v in inputs.items()} do_sample = temperature > 0 with torch.no_grad(): output_ids = model.generate( **inputs, max_new_tokens=int(max_new_tokens), temperature=float(temperature) if do_sample else None, top_p=float(top_p) if do_sample else None, do_sample=do_sample, pad_token_id=tokenizer.eos_token_id if tokenizer.eos_token_id is not None else tokenizer.pad_token_id, ) text = tokenizer.decode(output_ids[0], skip_special_tokens=True) if "" in text: text = text.split("", 1)[1].strip() elif rendered in text: text = text.split(rendered, 1)[1].strip() new_tokens = max(0, output_ids.shape[-1] - inputs["input_ids"].shape[-1]) elapsed = max(time.time() - start, 1e-6) metrics = f"{new_tokens} new tokens | {new_tokens / elapsed:.2f} tok/s | {elapsed:.2f}s | model: {MODEL_ID}" return text, metrics css = """ .status-box { border: 1px solid #d8dee8; border-radius: 8px; padding: 12px 14px; background: #f8fafc; color: #263244; } .status-box strong { color: #101827; } """ with gr.Blocks(title="MiniCPM5-1B Chat", theme=gr.themes.Soft(), css=css) as demo: gr.Markdown("# MiniCPM5-1B Chat") gr.HTML(f"