Spaces:

CrazyQuantz
/

MiniCPM5-1B

Build error

App Files Files Community

CrazyQuantz committed on May 30

Commit

b6618f6

verified ·

1 Parent(s): adcfa1e

Upload 4 files

Browse files

Files changed (4) hide show

README.md +19 -15
app.py +237 -0
packages.txt +1 -0
requirements.txt +5 -0

README.md CHANGED Viewed

@@ -1,15 +1,19 @@
----
-title: MiniCPM5 1B
-emoji: 👀
-colorFrom: pink
-colorTo: pink
-sdk: gradio
-sdk_version: 6.15.2
-python_version: '3.13'
-app_file: app.py
-pinned: false
-license: apache-2.0
-short_description: Chat with MiniCPM5-1B
----
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

+---
+title: MiniCPM5-1B-GGUF API
+emoji: 🦙
+colorFrom: blue
+colorTo: green
+sdk: gradio
+sdk_version: 5.0.0
+app_file: app.py
+# Quoted on purpose: an unquoted 3.10 is a YAML float and is read as 3.1.
+python_version: "3.10"
+startup_duration_timeout: 1h
+# Each entry is a single string: "<repo_id> <file>[,<file>...]".
+preload_from_hub:
+  - openbmb/MiniCPM5-1B-GGUF MiniCPM5-1B-Q8_0.gguf
+---
+# MiniCPM5-1B-GGUF (Q8_0) CPU Space
+Interactive chat + API with full generation parameter control and prompt logging.

app.py ADDED Viewed

	@@ -0,0 +1,237 @@

+import os
+import json
+import logging
+import time
+from pathlib import Path
+from typing import List, Tuple
+import gradio as gr
+from llama_cpp import Llama
+from huggingface_hub import hf_hub_download
+# ───────────────────────────────────────────────
+# CONFIG
+# ───────────────────────────────────────────────
+MODEL_REPO = "openbmb/MiniCPM5-1B-GGUF"
+MODEL_FILE = "MiniCPM5-1B-Q8_0.gguf"
+N_CTX = 8192          # Context window
+N_THREADS = 8         # HF Basic CPU has 8 cores
+CHAT_FORMAT = "chatml"  # MiniCPM5 uses ChatML-style templates
+# Logging setup
+LOG_PATH = Path("/tmp/prompt_logs.jsonl")  # /tmp is writable on HF Spaces
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s | %(levelname)s | %(message)s",
+)
+logger = logging.getLogger("minicpm5-api")
+# ───────────────────────────────────────────────
+# MODEL LOAD
+# ───────────────────────────────────────────────
+@logger.catch  # optional: use `from loguru import logger` if you prefer
+def load_model():
+    logger.info("Downloading/verifying GGUF...")
+    model_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE)
+    logger.info(f"Loading {MODEL_FILE}...")
+    llm = Llama(
+        model_path=model_path,
+        n_ctx=N_CTX,
+        n_threads=N_THREADS,
+        verbose=False,
+        # chat_format is handled manually below for max control
+    )
+    logger.info("Model loaded.")
+    return llm
+llm = load_model()
+# ───────────────────────────────────────────────
+# INFERENCE + LOGGING
+# ───────────────────────────────────────────────
+def log_request(
+    messages: List[dict],
+    params: dict,
+    output: str,
+    latency: float,
+):
+    """Append structured log entry to JSONL."""
+    entry = {
+        "timestamp": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
+        "messages": messages,
+        "params": params,
+        "output": output,
+        "latency_sec": round(latency, 3),
+    }
+    with open(LOG_PATH, "a", encoding="utf-8") as f:
+        f.write(json.dumps(entry, ensure_ascii=False) + "\n")
+def build_messages(
+    system_msg: str,
+    history: List[Tuple[str, str]],
+    user_msg: str,
+    enable_thinking: bool,
+) -> List[dict]:
+    """
+    MiniCPM5 supports two modes via the chat template:
+      - enable_thinking=True  -> reasoning mode
+      - enable_thinking=False -> direct mode
+    We simulate this by injecting a prefix/suffix in the final user message
+    since llama-cpp-python's generic chat_format doesn't expose the custom
+    MiniCPM5 template natively.
+    """
+    messages = []
+    if system_msg.strip():
+        messages.append({"role": "system", "content": system_msg.strip()})
+    for human, assistant in history:
+        messages.append({"role": "user", "content": human})
+        messages.append({"role": "assistant", "content": assistant})
+    # MiniCPM5 thinking trigger (documented in OpenBMB repo)
+    if enable_thinking:
+        user_msg = user_msg.strip() + " /think"
+    else:
+        user_msg = user_msg.strip() + " /no_think"
+    messages.append({"role": "user", "content": user_msg})
+    return messages
+def generate(
+    user_msg: str,
+    history: List[Tuple[str, str]],
+    system_msg: str,
+    enable_thinking: bool,
+    temperature: float,
+    top_p: float,
+    top_k: int,
+    repeat_penalty: float,
+    max_tokens: int,
+    seed: int,
+) -> Tuple[str, List[Tuple[str, str]], str]:
+    """
+    Gradio handler. Returns: (assistant_reply, updated_history, status)
+    """
+    start = time.time()
+    # 1. Build messages
+    messages = build_messages(system_msg, history, user_msg, enable_thinking)
+    # 2. Call llama.cpp
+    try:
+        response = llm.create_chat_completion(
+            messages=messages,
+            temperature=temperature,
+            top_p=top_p,
+            top_k=top_k,
+            repeat_penalty=repeat_penalty,
+            max_tokens=max_tokens,
+            seed=seed,
+            stream=False,
+        )
+        assistant_text = response["choices"][0]["message"]["content"]
+    except Exception as e:
+        logger.exception("Inference failed")
+        return f"Error: {e}", history, "❌ Inference error"
+    latency = time.time() - start
+    # 3. Log
+    params = {
+        "temperature": temperature,
+        "top_p": top_p,
+        "top_k": top_k,
+        "repeat_penalty": repeat_penalty,
+        "max_tokens": max_tokens,
+        "seed": seed,
+        "enable_thinking": enable_thinking,
+    }
+    log_request(messages, params, assistant_text, latency)
+    logger.info(f"Generated {len(assistant_text)} chars in {latency:.2f}s")
+    # 4. Update history
+    history = history + [(user_msg.replace(" /think", "").replace(" /no_think", ""), assistant_text)]
+    status = f"✅ Done in {latency:.2f}s | {len(assistant_text)} chars"
+    return "", history, status
+def clear_chat():
+    return "", [], "Chat cleared."
+# ───────────────────────────────────────────────
+# GRADIO UI
+# ───────────────────────────────────────────────
+with gr.Blocks(title="MiniCPM5-1B-GGUF API", theme=gr.themes.Soft()) as demo:
+    gr.Markdown("""
+    # 🦙 MiniCPM5-1B-GGUF (Q8_0) — CPU Inference
+    **System message**, **thinking mode**, and **full sampling control** with prompt logging.
+    """)
+    with gr.Row():
+        with gr.Column(scale=2):
+            chatbot = gr.Chatbot(label="Chat", height=450, type="messages")
+            with gr.Row():
+                msg_input = gr.Textbox(
+                    placeholder="Type your message...",
+                    show_label=False,
+                    scale=4,
+                )
+                submit_btn = gr.Button("Send", variant="primary", scale=1)
+            with gr.Row():
+                clear_btn = gr.Button("Clear")
+                status_box = gr.Textbox(label="Status", interactive=False)
+        with gr.Column(scale=1):
+            gr.Markdown("### ⚙️ Generation Parameters")
+            system_msg = gr.Textbox(
+                label="System Message",
+                value="You are a helpful assistant.",
+                lines=2,
+            )
+            thinking_chk = gr.Checkbox(
+                label="Enable Thinking (/think)",
+                value=False,
+                info="MiniCPM5 reasoning mode",
+            )
+            temperature = gr.Slider(0.0, 2.0, value=0.7, step=0.05, label="Temperature")
+            top_p = gr.Slider(0.0, 1.0, value=0.95, step=0.01, label="Top-p")
+            top_k = gr.Slider(0, 200, value=40, step=1, label="Top-k")
+            repeat_penalty = gr.Slider(1.0, 2.0, value=1.1, step=0.05, label="Repeat Penalty")
+            max_tokens = gr.Slider(16, 4096, value=512, step=16, label="Max Tokens")
+            seed = gr.Number(value=42, precision=0, label="Seed (-1 for random)")
+            gr.Markdown("### 📊 Logging")
+            gr.Textbox(
+                value=str(LOG_PATH),
+                label="Log File Path",
+                interactive=False,
+            )
+    # Event wiring
+    submit_btn.click(
+        fn=generate,
+        inputs=[
+            msg_input, chatbot, system_msg, thinking_chk,
+            temperature, top_p, top_k, repeat_penalty, max_tokens, seed,
+        ],
+        outputs=[msg_input, chatbot, status_box],
+    )
+    msg_input.submit(
+        fn=generate,
+        inputs=[
+            msg_input, chatbot, system_msg, thinking_chk,
+            temperature, top_p, top_k, repeat_penalty, max_tokens, seed,
+        ],
+        outputs=[msg_input, chatbot, status_box],
+    )
+    clear_btn.click(fn=clear_chat, outputs=[msg_input, chatbot, status_box])
+    # ── Gradio API docs are auto-generated at /api/predict/ ──
+    # You can also view them by clicking "Use via API" in the UI footer
+# Start the web server only when this file is executed as a script;
+# it listens on all interfaces (0.0.0.0) on port 7860.
+if __name__ == "__main__":
+    demo.launch(server_name="0.0.0.0", server_port=7860)

packages.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ libopenblas0-pthread

requirements.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+--only-binary llama-cpp-python
+https://huggingface.co/Luigi/llama-cpp-python-wheels-hf-spaces-free-cpu/resolve/main/llama_cpp_python-0.3.22-cp310-cp310-linux_x86_64.whl
+gradio>=5.0.0
+huggingface-hub>=0.24.0