CrazyQuantz committed on
Commit
b6618f6
·
verified ·
1 Parent(s): adcfa1e

Upload 4 files

Browse files
Files changed (4) hide show
  1. README.md +19 -15
  2. app.py +237 -0
  3. packages.txt +1 -0
  4. requirements.txt +5 -0
README.md CHANGED
@@ -1,15 +1,19 @@
1
- ---
2
- title: MiniCPM5 1B
3
- emoji: 👀
4
- colorFrom: pink
5
- colorTo: pink
6
- sdk: gradio
7
- sdk_version: 6.15.2
8
- python_version: '3.13'
9
- app_file: app.py
10
- pinned: false
11
- license: apache-2.0
12
- short_description: Chat with MiniCPM5-1B
13
- ---
14
-
15
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
1
+ ---
2
+ title: MiniCPM5-1B-GGUF API
3
+ emoji: 🦙
4
+ colorFrom: blue
5
+ colorTo: green
6
+ sdk: gradio
7
+ sdk_version: 5.0.0
8
+ app_file: app.py
9
+ python_version: "3.10"
10
+ startup_duration_timeout: 1h
11
+ preload_from_hub:
12
+ - repo_id: openbmb/MiniCPM5-1B-GGUF
13
+ files:
14
+ - MiniCPM5-1B-Q8_0.gguf
15
+ ---
16
+
17
+ # MiniCPM5-1B-GGUF (Q8_0) CPU Space
18
+
19
+ Interactive chat + API with full generation parameter control and prompt logging.
app.py ADDED
@@ -0,0 +1,237 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import logging
4
+ import time
5
+ from pathlib import Path
6
+ from typing import List, Tuple
7
+
8
+ import gradio as gr
9
+ from llama_cpp import Llama
10
+ from huggingface_hub import hf_hub_download
11
+
12
+ # ───────────────────────────────────────────────
13
+ # CONFIG
14
+ # ───────────────────────────────────────────────
15
MODEL_REPO = "openbmb/MiniCPM5-1B-GGUF"
MODEL_FILE = "MiniCPM5-1B-Q8_0.gguf"
N_CTX = 8192  # Context window, in tokens

# Never request more threads than the host reports: oversubscribing cores
# slows llama.cpp CPU inference down. The previous hard-coded value (8) is
# kept as the ceiling so larger machines behave exactly as before.
# NOTE(review): the free "CPU basic" Space tier is believed to expose 2 vCPUs
# (8 vCPUs being the paid "CPU upgrade" tier) -- confirm against the Spaces
# hardware documentation.
N_THREADS = max(1, min(8, os.cpu_count() or 1))

# MiniCPM5 is described as using ChatML-style templates. This constant is
# informational only: the visible code never passes it to Llama().
CHAT_FORMAT = "chatml"

# Logging setup
# Each request is appended to this JSON-lines file.
LOG_PATH = Path("/tmp/prompt_logs.jsonl")  # /tmp is writable on HF Spaces
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)s | %(message)s",
)
logger = logging.getLogger("minicpm5-api")
28
+
29
+ # ───────────────────────────────────────────────
30
+ # MODEL LOAD
31
+ # ───────────────────────────────────────────────
32
def load_model():
    """Download (or reuse the cached copy of) the GGUF file and load it.

    The file named by MODEL_REPO / MODEL_FILE is resolved through
    ``hf_hub_download`` and handed to ``Llama`` together with the N_CTX and
    N_THREADS settings.

    Returns:
        The ready-to-use ``Llama`` instance.

    Raises:
        Exception: whatever the download or the model constructor raised.
            The failure is logged with its traceback and then re-raised so
            the app fails loudly at startup instead of running with no model.
    """
    # NOTE: the previous ``@logger.catch`` decorator is a loguru API. ``logger``
    # in this file is a standard ``logging.Logger``, which has no ``catch``
    # attribute, so the decorator raised AttributeError at import time.
    logger.info("Downloading/verifying GGUF...")
    try:
        model_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE)
        logger.info("Loading %s...", MODEL_FILE)
        model = Llama(
            model_path=model_path,
            n_ctx=N_CTX,
            n_threads=N_THREADS,
            verbose=False,
            # chat_format is deliberately not passed here.
            # NOTE(review): llama-cpp-python is then expected to fall back to
            # the chat template stored in the GGUF metadata -- confirm.
        )
    except Exception:
        logger.exception("Model load failed")
        raise
    logger.info("Model loaded.")
    return model


# Loaded once at import time; every request handler shares this instance.
llm = load_model()
49
+
50
+ # ───────────────────────────────────────────────
51
+ # INFERENCE + LOGGING
52
+ # ───────────────────────────────────────────────
53
def log_request(
    messages: List[dict],
    params: dict,
    output: str,
    latency: float,
):
    """Record one completed request as a single JSON line in LOG_PATH.

    Args:
        messages: The chat messages that were sent to the model.
        params: The sampling parameters used for the call.
        output: The text the model produced.
        latency: Wall-clock duration of the call in seconds; stored rounded
            to three decimals.
    """
    utc_stamp = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
    record = dict(
        timestamp=utc_stamp,
        messages=messages,
        params=params,
        output=output,
        latency_sec=round(latency, 3),
    )
    # ensure_ascii=False keeps non-ASCII prompt text readable in the log.
    serialized = json.dumps(record, ensure_ascii=False)
    with open(LOG_PATH, "a", encoding="utf-8") as log_file:
        log_file.write(serialized + "\n")
69
+
70
def build_messages(
    system_msg: str,
    history: List,
    user_msg: str,
    enable_thinking: bool,
) -> List[dict]:
    """Assemble the chat-completion message list for one request.

    Args:
        system_msg: Optional system prompt; skipped when blank.
        history: Previous turns, in either of the two shapes a Gradio Chatbot
            can supply: ``(user_text, assistant_text)`` pairs, or
            ``{"role": ..., "content": ...}`` dicts (the shape used by
            ``gr.Chatbot(type="messages")``). Extra dict keys are ignored.
        user_msg: The new user message.
        enable_thinking: Selects which mode suffix is appended to the new
            user message (" /think" when true, " /no_think" otherwise).

    Returns:
        A list of ``{"role", "content"}`` dicts: the system message (if any),
        the prior user/assistant turns, then the new user message.
    """
    messages = []
    system_text = (system_msg or "").strip()
    if system_text:
        messages.append({"role": "system", "content": system_text})

    for turn in history or []:
        if isinstance(turn, dict):
            # Messages-format entry. Only plain-text user/assistant turns can
            # be forwarded to the model; anything else is skipped.
            role = turn.get("role")
            content = turn.get("content")
            if role in ("user", "assistant") and isinstance(content, str):
                messages.append({"role": role, "content": content})
            continue

        # Pair-format entry. Either side may be None, so skip missing halves
        # rather than sending a null content.
        human, assistant = turn
        if human is not None:
            messages.append({"role": "user", "content": human})
        if assistant is not None:
            messages.append({"role": "assistant", "content": assistant})

    # Mode switch appended to the final user message.
    # NOTE(review): the original comment says this trigger is documented in
    # the OpenBMB repo -- confirm the exact tokens against the model card.
    suffix = " /think" if enable_thinking else " /no_think"
    messages.append({"role": "user", "content": (user_msg or "").strip() + suffix})
    return messages
100
+
101
def _history_to_pairs(history):
    """Collapse a chat history into (user_text, assistant_text) pairs.

    Accepts both pair-style entries and ``{"role", "content"}`` dicts; dict
    entries with non-text content or without a matching partner are dropped.
    """
    pairs = []
    pending_user = None
    for turn in history or []:
        if not isinstance(turn, dict):
            pairs.append(tuple(turn))
            continue
        content = turn.get("content")
        if not isinstance(content, str):
            continue
        role = turn.get("role")
        if role == "user":
            pending_user = content
        elif role == "assistant" and pending_user is not None:
            pairs.append((pending_user, content))
            pending_user = None
    return pairs


def generate(
    user_msg: str,
    history: List,
    system_msg: str,
    enable_thinking: bool,
    temperature: float,
    top_p: float,
    top_k: int,
    repeat_penalty: float,
    max_tokens: int,
    seed: int,
) -> Tuple[str, List, str]:
    """Gradio handler: run one chat turn and return the updated UI state.

    Args:
        user_msg: Text from the input box.
        history: Current Chatbot value, as role/content dicts or as
            (user, assistant) pairs.
        system_msg: System prompt text.
        enable_thinking: Forwarded to ``build_messages`` to pick the mode.
        temperature, top_p, top_k, repeat_penalty, max_tokens, seed:
            Sampling parameters passed to ``llm.create_chat_completion``.

    Returns:
        ``(input_box_value, updated_history, status_text)``. The history is
        returned in the same shape it arrived in; an empty history is treated
        as role/content dicts to match ``gr.Chatbot(type="messages")``.
        On failure the user's text is left in the input box and the error is
        reported through the status text.
    """
    history = list(history or [])
    clean_msg = (user_msg or "").strip()
    if not clean_msg:
        # Nothing to send; do not spend an inference call on a blank prompt.
        return "", history, "Please enter a message."

    # UI widgets and API clients may deliver whole numbers as floats, while
    # these three parameters are integers.
    top_k = int(top_k)
    max_tokens = int(max_tokens)
    seed = None if seed is None else int(seed)

    start = time.perf_counter()

    # 1. Build messages. Pairs are passed so the call works whichever history
    #    shape the Chatbot supplied.
    messages = build_messages(
        system_msg, _history_to_pairs(history), clean_msg, enable_thinking
    )

    # 2. Call llama.cpp
    try:
        response = llm.create_chat_completion(
            messages=messages,
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            repeat_penalty=repeat_penalty,
            max_tokens=max_tokens,
            seed=seed,
            stream=False,
        )
        assistant_text = response["choices"][0]["message"]["content"] or ""
    except Exception as e:
        logger.exception("Inference failed")
        # Keep the user's text in the box so it can be retried.
        return user_msg, history, f"❌ Inference error: {e}"

    latency = time.perf_counter() - start

    # 3. Log. A failed log write must not discard a successful reply.
    params = {
        "temperature": temperature,
        "top_p": top_p,
        "top_k": top_k,
        "repeat_penalty": repeat_penalty,
        "max_tokens": max_tokens,
        "seed": seed,
        "enable_thinking": enable_thinking,
    }
    try:
        log_request(messages, params, assistant_text, latency)
    except (OSError, TypeError, ValueError):
        logger.warning("Could not write prompt log entry", exc_info=True)
    logger.info("Generated %d chars in %.2fs", len(assistant_text), latency)

    # 4. Update history. The user's own text is stored untouched; the mode
    #    suffix only ever exists in the copy sent to the model.
    uses_pairs = any(not isinstance(turn, dict) for turn in history)
    if uses_pairs:
        history = history + [(clean_msg, assistant_text)]
    else:
        history = history + [
            {"role": "user", "content": clean_msg},
            {"role": "assistant", "content": assistant_text},
        ]
    status = f"✅ Done in {latency:.2f}s | {len(assistant_text)} chars"
    return "", history, status
157
+
158
def clear_chat():
    """Reset the UI: blank input box, empty chat history, and a status note."""
    cleared_input, cleared_history = "", []
    return cleared_input, cleared_history, "Chat cleared."
160
+
161
+ # ───────────────────────────────────────────────
162
+ # GRADIO UI
163
+ # ───────────────────────────────────────────────
164
with gr.Blocks(title="MiniCPM5-1B-GGUF API", theme=gr.themes.Soft()) as demo:
    # Header. Built from explicit lines so the Markdown never picks up
    # source-code indentation.
    gr.Markdown(
        "# 🦙 MiniCPM5-1B-GGUF (Q8_0) — CPU Inference\n"
        "**System message**, **thinking mode**, and **full sampling control** "
        "with prompt logging.\n"
    )

    with gr.Row():
        # Left column: conversation, input box, and status line.
        with gr.Column(scale=2):
            chatbot = gr.Chatbot(label="Chat", height=450, type="messages")

            with gr.Row():
                msg_input = gr.Textbox(
                    placeholder="Type your message...",
                    show_label=False,
                    scale=4,
                )
                submit_btn = gr.Button("Send", variant="primary", scale=1)

            with gr.Row():
                clear_btn = gr.Button("Clear")
                status_box = gr.Textbox(label="Status", interactive=False)

        # Right column: generation settings and log location.
        with gr.Column(scale=1):
            gr.Markdown("### ⚙️ Generation Parameters")

            system_msg = gr.Textbox(
                label="System Message",
                value="You are a helpful assistant.",
                lines=2,
            )
            thinking_chk = gr.Checkbox(
                label="Enable Thinking (/think)",
                value=False,
                info="MiniCPM5 reasoning mode",
            )

            temperature = gr.Slider(0.0, 2.0, value=0.7, step=0.05, label="Temperature")
            top_p = gr.Slider(0.0, 1.0, value=0.95, step=0.01, label="Top-p")
            top_k = gr.Slider(0, 200, value=40, step=1, label="Top-k")
            repeat_penalty = gr.Slider(1.0, 2.0, value=1.1, step=0.05, label="Repeat Penalty")
            max_tokens = gr.Slider(16, 4096, value=512, step=16, label="Max Tokens")
            seed = gr.Number(value=42, precision=0, label="Seed (-1 for random)")

            gr.Markdown("### 📊 Logging")
            gr.Textbox(
                value=str(LOG_PATH),
                label="Log File Path",
                interactive=False,
            )

    # Event wiring. The Send button and the Enter key run the same handler
    # with the same components, so the lists are defined once.
    generate_inputs = [
        msg_input, chatbot, system_msg, thinking_chk,
        temperature, top_p, top_k, repeat_penalty, max_tokens, seed,
    ]
    generate_outputs = [msg_input, chatbot, status_box]

    # Both listeners call into the single shared `llm` instance. Putting them
    # in one concurrency group with a limit of 1 keeps a button click and an
    # Enter press from running inference on the model at the same time.
    for register in (submit_btn.click, msg_input.submit):
        register(
            fn=generate,
            inputs=generate_inputs,
            outputs=generate_outputs,
            concurrency_limit=1,
            concurrency_id="llm",
        )
    clear_btn.click(fn=clear_chat, outputs=[msg_input, chatbot, status_box])

    # Gradio also exposes these handlers as API endpoints; the generated docs
    # are available from the "Use via API" link in the UI footer.

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)
packages.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ libopenblas0-pthread
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ --only-binary llama-cpp-python
2
+ https://huggingface.co/Luigi/llama-cpp-python-wheels-hf-spaces-free-cpu/resolve/main/llama_cpp_python-0.3.22-cp310-cp310-linux_x86_64.whl
3
+
4
+ gradio>=5.0.0
5
+ huggingface-hub>=0.24.0