Spaces:
Build error
Build error
| import os | |
| import json | |
| import logging | |
| import time | |
| from pathlib import Path | |
| from typing import List, Tuple | |
| import gradio as gr | |
| from llama_cpp import Llama | |
| from huggingface_hub import hf_hub_download | |
# ───────────────────────────────────────────────
# CONFIG
# ───────────────────────────────────────────────
# Hugging Face repo id and the GGUF file inside it that load_model() fetches.
MODEL_FILE = "MiniCPM5-1B-Q8_0.gguf"
MODEL_REPO = "openbmb/MiniCPM5-1B-GGUF"

# Values forwarded to the Llama constructor in load_model().
N_THREADS = 8  # passed as n_threads (CPU threads used for inference)
N_CTX = 8192  # passed as n_ctx (context window)

# ChatML-style template name.
# NOTE(review): not referenced by any code visible in this file - confirm
# whether it should be passed to Llama(chat_format=...).
CHAT_FORMAT = "chatml"

# JSONL request log written by log_request(); the original author notes that
# /tmp is writable on HF Spaces.
LOG_PATH = Path("/tmp") / "prompt_logs.jsonl"

# Root logging configuration plus the named logger used throughout the app.
logging.basicConfig(
    format="%(asctime)s | %(levelname)s | %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger("minicpm5-api")
# ───────────────────────────────────────────────
# MODEL LOAD
# ───────────────────────────────────────────────
| # optional: use `from loguru import logger` if you prefer | |
def load_model():
    """Fetch the GGUF weights and return a ready-to-use ``Llama`` instance.

    ``hf_hub_download`` resolves MODEL_FILE inside MODEL_REPO to a local
    path, which is handed to ``Llama`` together with N_CTX and N_THREADS.
    Progress is reported through the module logger.

    Returns:
        The constructed ``Llama`` object.
    """
    logger.info("Downloading/verifying GGUF...")
    gguf_path = hf_hub_download(filename=MODEL_FILE, repo_id=MODEL_REPO)

    logger.info(f"Loading {MODEL_FILE}...")
    # No chat_format is passed here, so the CHAT_FORMAT constant is not
    # applied by this loader.
    loader_kwargs = {
        "model_path": gguf_path,
        "n_ctx": N_CTX,
        "n_threads": N_THREADS,
        "verbose": False,
    }
    model = Llama(**loader_kwargs)

    logger.info("Model loaded.")
    return model


# Loaded once at import time; generate() uses this shared instance.
llm = load_model()
# ───────────────────────────────────────────────
# INFERENCE + LOGGING
# ───────────────────────────────────────────────
def log_request(
    messages: List[dict],
    params: dict,
    output: str,
    latency: float,
):
    """Record one completed request as a single JSON line in LOG_PATH.

    Args:
        messages: Chat messages that were sent to the model.
        params: Sampling parameters used for the request.
        output: Text returned by the model.
        latency: Wall-clock duration in seconds; stored rounded to 3 places.
    """
    # UTC timestamp in ISO-8601 form with a literal trailing "Z".
    stamp = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
    record = dict(
        timestamp=stamp,
        messages=messages,
        params=params,
        output=output,
        latency_sec=round(latency, 3),
    )
    # Append mode keeps earlier entries; ensure_ascii=False stores non-ASCII
    # prompt text as-is instead of \u escapes.
    with open(LOG_PATH, "a", encoding="utf-8") as log_file:
        log_file.write(f"{json.dumps(record, ensure_ascii=False)}\n")
def build_messages(
    system_msg: str,
    history: List,
    user_msg: str,
    enable_thinking: bool,
) -> List[dict]:
    """Assemble the chat-completion message list for one request.

    The thinking mode is selected by appending " /think" or " /no_think" to
    the final user message (the original author reports this trigger is
    documented in the OpenBMB repo - not verifiable from this file).

    Args:
        system_msg: Optional system prompt; skipped when blank or None.
        history: Earlier turns, in either of two shapes:
            * role/content dicts, as delivered by
              ``gr.Chatbot(type="messages")``; extra keys are ignored;
            * legacy ``(user_text, assistant_text)`` pairs.
            None is treated as an empty history.
        user_msg: The new user message; surrounding whitespace is stripped.
        enable_thinking: True appends " /think", False appends " /no_think".

    Returns:
        A list of ``{"role": ..., "content": ...}`` dicts ending with the new
        user message.
    """
    messages = []

    system_text = (system_msg or "").strip()
    if system_text:
        messages.append({"role": "system", "content": system_text})

    for turn in history or []:
        if isinstance(turn, dict):
            role, content = turn.get("role"), turn.get("content")
            # Only plain-text user/assistant turns can be replayed to the
            # model; anything else (e.g. non-string content) is skipped.
            if role in ("user", "assistant") and isinstance(content, str):
                messages.append({"role": role, "content": content})
        else:
            human, assistant = turn
            messages.append({"role": "user", "content": human})
            messages.append({"role": "assistant", "content": assistant})

    suffix = " /think" if enable_thinking else " /no_think"
    messages.append(
        {"role": "user", "content": (user_msg or "").strip() + suffix}
    )
    return messages


def generate(
    user_msg: str,
    history: List,
    system_msg: str,
    enable_thinking: bool,
    temperature: float,
    top_p: float,
    top_k: int,
    repeat_penalty: float,
    max_tokens: int,
    seed: int,
) -> Tuple[str, List, str]:
    """Gradio handler: run one chat turn and return the refreshed UI state.

    Args:
        user_msg: Text from the input box.
        history: Current chatbot value (role/content dicts or legacy pairs).
        system_msg: System prompt text.
        enable_thinking: Whether to request the model's reasoning mode.
        temperature, top_p, top_k, repeat_penalty, max_tokens, seed:
            Sampling parameters forwarded to ``llm.create_chat_completion``.

    Returns:
        ``(input_box_text, updated_history, status)``. On success the input
        box is cleared. On failure the user's text is kept so it can be
        resent, the history is unchanged, and the status carries the error.
    """
    history = list(history or [])

    # A blank message would reach the model as a bare thinking suffix.
    if not (user_msg or "").strip():
        return "", history, "Please enter a message."

    start = time.time()

    # 1. Build messages
    messages = build_messages(system_msg, history, user_msg, enable_thinking)

    # UI widgets may hand over floats (or None for an emptied number box);
    # these three parameters are integer-valued.
    top_k = int(top_k)
    max_tokens = int(max_tokens)
    seed = None if seed is None else int(seed)

    # 2. Call llama.cpp
    try:
        response = llm.create_chat_completion(
            messages=messages,
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            repeat_penalty=repeat_penalty,
            max_tokens=max_tokens,
            seed=seed,
            stream=False,
        )
        # Guard against a missing/None content field.
        assistant_text = response["choices"][0]["message"]["content"] or ""
    except Exception as e:
        # Top-level UI boundary: log the traceback and report in the status
        # box instead of overwriting what the user typed.
        logger.exception("Inference failed")
        return user_msg, history, f"❌ Inference error: {e}"
    latency = time.time() - start

    # 3. Log (best effort: a log-write failure must not discard the reply)
    params = {
        "temperature": temperature,
        "top_p": top_p,
        "top_k": top_k,
        "repeat_penalty": repeat_penalty,
        "max_tokens": max_tokens,
        "seed": seed,
        "enable_thinking": enable_thinking,
    }
    try:
        log_request(messages, params, assistant_text, latency)
    except (OSError, TypeError, ValueError):
        logger.exception("Could not write prompt log")
    logger.info("Generated %d chars in %.2fs", len(assistant_text), latency)

    # 4. Update history in the same shape it arrived in. The thinking suffix
    # exists only in `messages`, so the user's text is stored exactly as typed.
    if any(isinstance(turn, (list, tuple)) for turn in history):
        history = history + [(user_msg, assistant_text)]
    else:
        # Shape expected by gr.Chatbot(type="messages"); also the default
        # for an empty history.
        history = history + [
            {"role": "user", "content": user_msg},
            {"role": "assistant", "content": assistant_text},
        ]

    status = f"✅ Done in {latency:.2f}s | {len(assistant_text)} chars"
    return "", history, status
def clear_chat():
    """Return reset values for the input box, chat history and status box.

    Returns:
        ``("", [], "Chat cleared.")`` - an empty input string, a fresh empty
        history list, and the status text.
    """
    blank_input = ""
    empty_history = []
    return blank_input, empty_history, "Chat cleared."
# ───────────────────────────────────────────────
# GRADIO UI
# ───────────────────────────────────────────────
# Two-column layout: chat on the left, generation controls on the right.
with gr.Blocks(title="MiniCPM5-1B-GGUF API", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🦙 MiniCPM5-1B-GGUF (Q8_0) — CPU Inference
    **System message**, **thinking mode**, and **full sampling control** with prompt logging.
    """)
    with gr.Row():
        with gr.Column(scale=2):
            # type="messages": the chat history is exchanged with handlers
            # as a list of role/content dicts.
            chatbot = gr.Chatbot(label="Chat", height=450, type="messages")
            with gr.Row():
                msg_input = gr.Textbox(
                    placeholder="Type your message...",
                    show_label=False,
                    scale=4,
                )
                submit_btn = gr.Button("Send", variant="primary", scale=1)
            with gr.Row():
                clear_btn = gr.Button("Clear")
                status_box = gr.Textbox(label="Status", interactive=False)
        with gr.Column(scale=1):
            gr.Markdown("### ⚙️ Generation Parameters")
            system_msg = gr.Textbox(
                label="System Message",
                value="You are a helpful assistant.",
                lines=2,
            )
            thinking_chk = gr.Checkbox(
                label="Enable Thinking (/think)",
                value=False,
                info="MiniCPM5 reasoning mode",
            )
            temperature = gr.Slider(0.0, 2.0, value=0.7, step=0.05, label="Temperature")
            top_p = gr.Slider(0.0, 1.0, value=0.95, step=0.01, label="Top-p")
            top_k = gr.Slider(0, 200, value=40, step=1, label="Top-k")
            repeat_penalty = gr.Slider(1.0, 2.0, value=1.1, step=0.05, label="Repeat Penalty")
            max_tokens = gr.Slider(16, 4096, value=512, step=16, label="Max Tokens")
            seed = gr.Number(value=42, precision=0, label="Seed (-1 for random)")
            gr.Markdown("### 📝 Logging")
            gr.Textbox(
                value=str(LOG_PATH),
                label="Log File Path",
                interactive=False,
            )

    # Event wiring. The Send button and pressing Enter in the textbox run the
    # same handler, so the component lists are defined once. The input order
    # must match generate()'s positional parameters.
    chat_inputs = [
        msg_input, chatbot, system_msg, thinking_chk,
        temperature, top_p, top_k, repeat_penalty, max_tokens, seed,
    ]
    chat_outputs = [msg_input, chatbot, status_box]
    for register in (submit_btn.click, msg_input.submit):
        register(fn=generate, inputs=chat_inputs, outputs=chat_outputs)
    clear_btn.click(fn=clear_chat, outputs=chat_outputs)
# ── Gradio API docs are auto-generated at /api/predict/ ──
# You can also view them by clicking "Use via API" in the UI footer
if __name__ == "__main__":
    # Script entry point: serve the UI on all network interfaces, port 7860.
    launch_options = {"server_name": "0.0.0.0", "server_port": 7860}
    demo.launch(**launch_options)