Spaces:

Reza2kn
/

MiniCPM5-1B-WebGPU

Running

App Files Files Community

Reza2kn committed 3 days ago

Commit

328e8d9

verified ·

1 Parent(s): aa937f3

Add MiniCPM5-1B browser Space scaffold

Browse files

Files changed (4) hide show

README.md +7 -6
__pycache__/app.cpython-311.pyc +0 -0
app.py +83 -0
requirements.txt +6 -0

README.md CHANGED Viewed

@@ -1,17 +1,18 @@
 ---
-title: MiniCPM5 1B WebGPU
 emoji: ⚡
 colorFrom: blue
 colorTo: green
-sdk: static
 pinned: false
 license: apache-2.0
 models:
 - Reza2kn/MiniCPM5-1B-ONNX-Web
 ---
-# MiniCPM5-1B WebGPU
-Static browser demo scaffold for `Reza2kn/MiniCPM5-1B-ONNX-Web`.
-The browser runtime loads tokenizer/config/model assets from Hugging Face and runs generation with `onnxruntime-web` when the ONNX artifact is available.

 ---
+title: MiniCPM5 1B Chat
 emoji: ⚡
 colorFrom: blue
 colorTo: green
+sdk: gradio
+sdk_version: 5.49.1
 pinned: false
 license: apache-2.0
 models:
+- Reza2kn/MiniCPM5-1B-MLX-DWQ-4bit
 - Reza2kn/MiniCPM5-1B-ONNX-Web
+- openbmb/MiniCPM5-1B
 ---
+# MiniCPM5-1B Chat
+Gradio demo for MiniCPM5-1B with visible generation settings and sample prompts.

__pycache__/app.cpython-311.pyc ADDED Viewed

Binary file (6.45 kB). View file

app.py ADDED Viewed

	@@ -0,0 +1,83 @@

+import os
+import time
+import gradio as gr
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+# Model repository id to load; the MODEL_ID environment variable overrides the default.
+MODEL_ID = os.environ.get("MODEL_ID", "openbmb/MiniCPM5-1B")
+# Scope notice that is concatenated after the page heading in the UI's Markdown header.
+SYSTEM_NOTE = (
+    "MiniCPM5-1B is a text-only language model. "
+    "This demo validates chat, multilingual text, code, math, and tool-planning prompts; it does not accept image/audio/video inputs."
+)
+# Sample rows for the UI. Each row is [prompt, max_new_tokens, temperature, top_p],
+# in the same order as the input components the rows are bound to.
+# The first row's prompt is also used as the initial value of the prompt box.
+EXAMPLES = [
+    ["Briefly introduce yourself as a local AI assistant in two sentences.", 96, 0.2, 0.9],
+    ["请用中文用三点总结：为什么本地小模型对隐私有帮助？", 128, 0.3, 0.9],
+    ["به فارسی، خیلی کوتاه توضیح بده چطور یک مدل محلی می‌تواند به برنامه‌نویس کمک کند.", 128, 0.3, 0.9],
+    ["Write a small Python function that reads a JSONL file and returns the number of rows.", 160, 0.2, 0.9],
+    ["You need to inspect a local README and then summarize it. Give a safe two-step tool-use plan.", 128, 0.2, 0.9],
+]
+# Module-level cache: both stay None until load_model() fills them in on first use.
+tokenizer = None
+model = None
+def load_model():
+    global tokenizer, model
+    if model is not None:
+        return
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+    dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+    model = AutoModelForCausalLM.from_pretrained(
+        MODEL_ID,
+        torch_dtype=dtype,
+        device_map="auto" if torch.cuda.is_available() else None,
+    ).eval()
+def generate(prompt, max_new_tokens, temperature, top_p):
+    if not prompt.strip():
+        return "Enter a prompt first.", ""
+    load_model()
+    start = time.time()
+    inputs = tokenizer(prompt, return_tensors="pt")
+    inputs = {k: v.to(model.device) for k, v in inputs.items()}
+    do_sample = temperature > 0
+    with torch.no_grad():
+        output_ids = model.generate(
+            **inputs,
+            max_new_tokens=int(max_new_tokens),
+            temperature=float(temperature) if do_sample else None,
+            top_p=float(top_p) if do_sample else None,
+            do_sample=do_sample,
+            pad_token_id=tokenizer.eos_token_id if tokenizer.eos_token_id is not None else tokenizer.pad_token_id,
+        )
+    text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
+    new_tokens = max(0, output_ids.shape[-1] - inputs["input_ids"].shape[-1])
+    elapsed = max(time.time() - start, 1e-6)
+    metrics = f"{new_tokens} new tokens | {new_tokens / elapsed:.2f} tok/s | {elapsed:.2f}s | model: {MODEL_ID}"
+    return text, metrics
+# UI definition. Components are created in layout order inside the nested
+# context managers, so the statement order below is significant.
+with gr.Blocks(title="MiniCPM5-1B Chat", theme=gr.themes.Soft()) as demo:
+    # Page heading followed by the scope notice.
+    gr.Markdown("# MiniCPM5-1B Chat\n" + SYSTEM_NOTE)
+    with gr.Row():
+        # Wider column (scale 3): prompt box and the trigger button.
+        with gr.Column(scale=3):
+            # Pre-filled with the first example's prompt.
+            prompt = gr.Textbox(label="Prompt", lines=8, value=EXAMPLES[0][0])
+            run = gr.Button("Generate", variant="primary")
+        # Narrower column (scale 1): generation settings.
+        with gr.Column(scale=1):
+            max_new_tokens = gr.Slider(16, 512, value=128, step=1, label="Max new tokens")
+            # A temperature of 0 makes generate() use greedy decoding.
+            temperature = gr.Slider(0, 1.5, value=0.2, step=0.05, label="Temperature")
+            top_p = gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="Top-p")
+    output = gr.Textbox(label="Output", lines=14)
+    metrics = gr.Textbox(label="Run metrics", interactive=False)
+    # Example rows map positionally onto the prompt and the three sliders.
+    gr.Examples(EXAMPLES, inputs=[prompt, max_new_tokens, temperature, top_p])
+    # The button feeds the four inputs to generate() and shows its (text, metrics) result.
+    run.click(generate, inputs=[prompt, max_new_tokens, temperature, top_p], outputs=[output, metrics])
+# Start the app only when the file is executed as a script.
+if __name__ == "__main__":
+    demo.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+gradio==5.49.1
+transformers>=5.6
+torch
+accelerate
+safetensors
+huggingface_hub