Cialtion committed on
Commit
0ce0c33
·
verified ·
1 Parent(s): 4d3ae4f

Delete .ipynb_checkpoints

Browse files
.ipynb_checkpoints/01_benchmark-checkpoint.py DELETED
@@ -1,129 +0,0 @@
1
- #!/usr/bin/env python3
2
- """SimpleTool multi-head parallel decode — vLLM, v1/v2, external prompts
3
- python 01_benchmark.py --version v2 # v2 default model
4
- python 01_benchmark.py --version v1 # v1 default model
5
- python 01_benchmark.py --version v2 --n-args 3 # fixed three arg heads
6
- python 01_benchmark.py --version v1 --model /my/model # custom model path
7
- """
8
- import argparse, json, time, os
9
- from pathlib import Path
10
-
11
# Directory that holds scenarios.json, the per-scenario tool files and v1_system.txt.
DIR = Path("./prompts")

# Indices of the positional argument heads (<arg1> .. <arg6>).
_ARG_INDICES = range(1, 7)

# (head name, opening tag appended to the prompt, closing tag) for every decode head.
HEADS = [("function", "<function>", "</function>")]
HEADS += [(f"arg{i}", f"<arg{i}>", f"</arg{i}>") for i in _ARG_INDICES]

# Stop strings passed to the sampler: every closing tag plus the special markers.
STOPS = [
    "</function>",
    *(f"</arg{i}>" for i in _ARG_INDICES),
    "</content>",
    "<|null|>",
    "<|im_end|>",
]

# Default model path for each prompt-format version.
MODELS = {
    "v1": "./models/RT-Qwen3-4B-AWQ",
    "v2": "./models/RT-Qwen3-4B-AWQ-v2",
}
15
-
16
def load_scenarios():
    """Load the benchmark scenarios and attach each scenario's tool definitions.

    Reads ``scenarios.json`` from ``DIR`` and, for every scenario, stores the
    stripped text of the file named by its ``tools_file`` key under ``tools``.

    Returns:
        The parsed scenario list, with a ``tools`` string added to each entry.
    """
    # Read explicitly as UTF-8 so the result does not depend on the platform's
    # default locale encoding.
    scenarios = json.loads((DIR / "scenarios.json").read_text(encoding="utf-8"))
    for scenario in scenarios:
        tools_path = DIR / scenario["tools_file"]
        scenario["tools"] = tools_path.read_text(encoding="utf-8").strip()
    return scenarios
21
-
22
def max_tool_params(tools_str):
    """Return the largest parameter count among the tools in *tools_str*.

    *tools_str* holds one JSON tool definition per line.  For each line the
    number of entries in ``function.parameters.properties`` is counted.  Lines
    that are not valid JSON or do not have that shape are skipped.

    Returns:
        The maximum property count found, or 0 when no line qualifies.
    """
    largest = 0
    for line in tools_str.strip().split("\n"):
        try:
            properties = json.loads(line)["function"]["parameters"]["properties"]
            largest = max(largest, len(properties))
        except (ValueError, KeyError, IndexError, TypeError):
            # Malformed or differently shaped line: ignore it, but do not
            # swallow unrelated errors such as KeyboardInterrupt.
            continue
    return largest
28
-
29
def build_prompt(sc, ver):
    """Build the shared chat prefix for scenario *sc* in prompt format *ver*.

    Args:
        sc: Scenario dict; the keys ``tools``, ``system``, ``history`` and
            ``query`` are read.
        ver: ``"v1"`` selects the v1 layout; any other value selects the v2
            layout.

    Returns:
        The prompt text, ending with the opened assistant turn so that a head
        tag can be appended directly.
    """
    tools = sc["tools"]
    if ver == "v1":
        # v1: generic system text comes from a file; the scenario's own system
        # prompt is placed inside the user turn.
        v1_system = (DIR / "v1_system.txt").read_text(encoding="utf-8")
        system_turn = f"<|im_start|>system\n{v1_system}\n## Available Tools:\n\n{tools}<|im_end|>\n"
        user_turn = (
            f"<|im_start|>user\nenvironment: []\nhistory: {sc['history']}\n\n"
            f"{sc['system']}\n\n{sc['query']}<|im_end|>\n"
        )
    else:
        # v2: the scenario's system prompt and the tools share the system turn.
        system_turn = f"<|im_start|>system\n{sc['system']}\n\n## Available Tools:\n\n{tools}<|im_end|>\n"
        user_turn = f"<|im_start|>user\nhistory: {sc['history']}\n\n{sc['query']}<|im_end|>\n"
    return system_turn + user_turn + "<|im_start|>assistant\n"
39
-
40
def clean(t):
    """Normalise one head's raw decode text.

    Returns ``"<|null|>"`` when the stripped text is empty or contains the
    null marker; otherwise returns whatever precedes the first ``"</"``,
    stripped of surrounding whitespace.
    """
    stripped = t.strip()
    if not stripped or "<|null|>" in stripped:
        return "<|null|>"
    before_close, _, _ = stripped.partition("</")
    return before_close.strip()
43
-
44
def main():
    """Run the multi-head decode benchmark and print a report.

    Parses the command line, loads the scenarios and the vLLM model, then runs
    every scenario once cold, three more times as hot rounds, and once more
    as the scored test pass.  A summary table and a dump of the first scenario
    follow.

    Compared with a naive run this guards three failure modes: an invalid
    ``--n-args`` value is rejected before the model is loaded, an empty
    scenario list exits with a message instead of dividing by zero, and the
    per-token average falls back to 0 when no head produced any token.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--model", default=None)
    ap.add_argument("--version", default="v2", choices=["v1", "v2"])
    ap.add_argument("--n-args", default="auto")
    ap.add_argument("--gpu", type=int, default=0)
    ap.add_argument("--max-model-len", type=int, default=4096)
    a = ap.parse_args()
    a.model = a.model or MODELS[a.version]

    # Validate --n-args up front so a typo fails before the slow model load.
    fixed_args = None
    if a.n_args != "auto":
        try:
            fixed_args = max(1, min(6, int(a.n_args)))
        except ValueError:
            ap.error(f"--n-args must be 'auto' or an integer, got {a.n_args!r}")

    # vLLM is imported only after CUDA_VISIBLE_DEVICES has been set
    # (presumably so that it only sees the selected GPU).
    os.environ["CUDA_VISIBLE_DEVICES"] = str(a.gpu)
    from vllm import LLM, SamplingParams

    SC = load_scenarios()
    if not SC:
        raise SystemExit(f"no scenarios found in {DIR / 'scenarios.json'}")

    bar = "=" * 60
    print(f"\n{bar}\n {a.version} | {a.model}\n{bar}")
    llm = LLM(model=a.model, trust_remote_code=True, dtype="auto", gpu_memory_utilization=0.80,
              max_model_len=a.max_model_len, max_num_seqs=8, enable_prefix_caching=True)
    sp = SamplingParams(temperature=0.0, max_tokens=128, stop=STOPS, include_stop_str_in_output=True)

    # Number of argument heads per scenario: derived from the tools, or fixed.
    na = [min(max_tool_params(s["tools"]), 6) if fixed_args is None else fixed_args for s in SC]
    for s, n in zip(SC, na):
        print(f" {s['name']:<35} heads={1+n}")

    def run(sc, n):
        """Decode the function head plus *n* argument heads in one batch."""
        hd = HEADS[:1 + n]
        base = build_prompt(sc, a.version)
        t0 = time.perf_counter()
        outs = llm.generate([base + op for _, op, _ in hd], sp)
        ms = (time.perf_counter() - t0) * 1000
        raw, toks, full = {}, {}, {}
        for j, (nm, _, _) in enumerate(hd):
            if j < len(outs) and outs[j].outputs:
                o = outs[j].outputs[0]
                full[nm] = o.text
                raw[nm] = clean(o.text)
                toks[nm] = len(o.token_ids)
            else:
                raw[nm], toks[nm], full[nm] = "<|null|>", 0, ""
        return raw, toks, full, ms, hd

    # Cold
    print(f"\n{bar}\n COLD START\n{bar}")
    cold = []
    for s, n in zip(SC, na):
        ms = run(s, n)[3]
        cold.append(ms)
        print(f" {s['name']:<35} {ms:7.1f}ms")

    # Hot x3
    hot_rounds = 3
    print(f"\n{bar}\n HOT WARMUP ({hot_rounds} rounds)\n{bar}")
    hot = [[] for _ in SC]
    for r in range(hot_rounds):
        for i, s in enumerate(SC):
            hot[i].append(run(s, na[i])[3])
        print(f" Round {r+1}: " + " ".join(f"{h[-1]:6.1f}ms" for h in hot))

    # Test
    print(f"\n{bar}\n PARALLEL TEST ({a.version})\n{bar}\n")
    res = []
    for i, s in enumerate(SC):
        raw, toks, full, ms, hd = run(s, na[i])
        mt = max(toks.values()) if toks else 0
        ok = raw.get("function", "") == s["expected"]
        res.append((s, raw, toks, full, ms, mt, hd, ok))
        print(f"─── {s['name']} ───\n{'PASS' if ok else 'FAIL'} {s['desc']}")
        for nm, _, _ in hd:
            v, tc = raw.get(nm, ""), toks.get(nm, 0)
            d = v if len(v) <= 43 else v[:43] + "…"
            if nm == "function":
                st = "OK" if ok else f"WRONG({v})"
            else:
                st = "NULL" if v == "<|null|>" else "FILL"
            print(f" {nm:<10} {d:<45} {tc:<4} {st}")
        print(f" e2e={ms:.1f}ms max_tok={mt}\n")

    # Summary
    N = len(res)
    np_ = sum(r[7] for r in res)
    ae = sum(r[4] for r in res) / N
    amt = sum(r[5] for r in res) / N
    hot_all = [ms for h in hot for ms in h]
    # Same zero guard as the per-scenario ms/tk column below.
    per_tok = ae / amt if amt else 0.0
    print(f"{bar}\n SUMMARY ({a.version})\n{bar}")
    print(f" Accuracy : {np_}/{N}\n Cold start avg : {sum(cold)/N:.1f}ms\n Hot prefill avg: {sum(hot_all)/len(hot_all):.1f}ms")
    print(f" E2E avg (hot) : {ae:.1f}ms\n Max head tokens: {amt:.1f} avg\n E2E / max_tok : {per_tok:.1f}ms/tok (decode bottleneck)\n")
    print(f" {'Scenario':<35} {'Cold':>7} {'Hot':>7} {'E2E':>7} {'MaxTk':>6} {'ms/tk':>6}\n {'─'*70}")
    for i, (s, _, _, _, ms, mt, _, _) in enumerate(res):
        hot_avg = sum(hot[i]) / len(hot[i])
        print(f" {s['name']:<35} {cold[i]:6.1f} {hot_avg:6.1f} {ms:6.1f} {mt:>5} {ms/mt if mt else 0:5.1f}")

    # Example dump
    s, raw, toks, full, ms, mt, hd, ok = res[0]
    base = build_prompt(s, a.version)
    print(f"\n{bar}\n EXAMPLE ({a.version}): {s['name']}\n{bar}")
    print(f"\n┌─ Shared Prefix ({len(base)} chars) ────────────────────")
    for ln in base.split("\n"):
        print(f"│ {ln}")
    print("└──────────────────────────────────────────────────")
    print("\n┌─ Per-Head Trigger Tokens ─────────────────────────")
    for nm, op, _ in hd:
        print(f"│ {nm:<10} → {op}")
    print("└──────────────────────────────────────────────────")
    print("\n┌─ Decode Output (all tokens, incl. stop) ──────────")
    for nm, op, _ in hd:
        print(f"│ {nm:<10} [{toks.get(nm,0):>2} tok] {op}{full.get(nm,'')}")
    print("└──────────────────────────────────────────────────")
    print("\n Reconstructed multi-head response:")
    for nm, op, cl in hd:
        if raw.get(nm, "") == "<|null|>":
            print(f" {op}<|null|>")
        else:
            ft = full.get(nm, "")
            # Append the closing tag only when decoding did not already end on a stop string.
            ends_on_stop = any(ft.rstrip().endswith(x) for x in STOPS)
            print(f" {op}{ft}" if ends_on_stop else f" {op}{ft}{cl}")
    print()


if __name__ == "__main__":
    main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
.ipynb_checkpoints/02_server-checkpoint.py DELETED
@@ -1,336 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- SimpleTool vLLM Server - Multi-Head Parallel Decoding for Real-Time Function Calling
4
- Supports both v1 and v2 prompt formats. HTML clients need zero changes.
5
- """
6
-
7
- import json
8
- import time
9
- import os
10
- from typing import List, Dict, Any, Optional
11
- from contextlib import asynccontextmanager
12
-
13
- from fastapi import FastAPI, HTTPException
14
- from fastapi.middleware.cors import CORSMiddleware
15
- from pydantic import BaseModel
16
- import uvicorn
17
-
18
- from vllm import LLM, SamplingParams
19
-
20
- # ==================== Config ====================
21
MODEL_PATH = "./models/RT-Qwen3-4B-AWQ-v2"  # v2 model path
MODEL_VERSION = "v2"  # "v1" or "v2"
SERVER_HOST = "0.0.0.0"
SERVER_PORT = 8899
MAX_HISTORY = 6  # only the last MAX_HISTORY history entries go into the prompt

# Default to GPU 0 unless the caller already chose a device.
os.environ.setdefault("CUDA_VISIBLE_DEVICES", "0")

# ==================== Multi-Head Tags ====================
# One opening tag per decode head; the matching closing tags double as stop strings.
_HEAD_NAMES = ["content", "function"] + [f"arg{i}" for i in range(1, 7)]
HEAD_TAGS = [f"<{name}>" for name in _HEAD_NAMES]
STOP_TOKENS = ["<|null|>"] + [f"</{name}>" for name in _HEAD_NAMES] + ["<|im_end|>"]

# ── v1: generic head-format instructions in system, domain context in user ──
V1_SYSTEM_TEMPLATE = """<|im_start|>system
You are a multi-head parallel function calling model.
## Output Heads

**Head 0 - <content>**: Natural language response
- Format: <content>response text</content>

**Head 1 - <function>**: Function names to call
- Format: <function>name</function>

**Head 2-7 - <arg1>-<arg6>**: Function arguments by position
- Format: <argN>value</argN>
- If Unnecessary: <argN><|null|></argN>

## Available Tools:

{tools_json}
<|im_end|>
"""

V1_USER_TEMPLATE = (
    "<|im_start|>user\n"
    "environment: {env}\n"
    "history: [{hist}]\n\n"
    "{query}<|im_end|>\n"
    "<|im_start|>assistant\n"
)

# ── v2: domain system prompt + tools in system, leaner user turn ──
V2_SYSTEM_TEMPLATE = """<|im_start|>system
{system_prompt}

## Available Tools:

{tools_json}
<|im_end|>
"""

V2_USER_TEMPLATE = (
    "<|im_start|>user\n"
    "history: [{hist}]\n\n"
    "{query}<|im_end|>\n"
    "<|im_start|>assistant\n"
)

# Default system prompt when HTML client doesn't send one (backward compat)
V2_DEFAULT_SYSTEM = "You are a real-time function calling assistant. Convert user commands into function calls using the available tools."
70
-
71
-
72
- # ==================== Data Models ====================
73
# One chat message as sent by the client.
class Message(BaseModel):
    role: str  # the engine only reads messages whose role is "user"
    content: str  # message text
76
-
77
-
78
# Request body of POST /v1/function_call.
class FCRequest(BaseModel):
    messages: List[Message]  # the last message with role "user" becomes the query
    tools: List[Dict[str, Any]]  # tool definitions, serialised into the system turn

    # ── v1 fields (still accepted, used when version=v1) ──
    environment: Optional[List[str]] = None
    history: Optional[List[str]] = None

    # ── v2 optional: domain system prompt ──
    system: Optional[str] = None

    # ── shared ──
    max_tokens: int = 32
    temperature: float = 0.0
    include_content_head: bool = False  # also decode the <content> head when True
90
-
91
-
92
# Response body of POST /v1/function_call.
class FCResponse(BaseModel):
    success: bool  # False when no function was produced or an error occurred
    function: Optional[str] = None  # name taken from the <function> head
    args: Dict[str, Any] = {}  # parameter name -> parsed argument value
    heads: Dict[str, str] = {}  # cleaned text of every decoded head
    content: Optional[str] = None  # text of the <content> head, if it was requested
    latency_ms: float = 0  # time spent building the prompt and generating
    error: Optional[str] = None  # failure description when success is False
100
-
101
-
102
- # ==================== SimpleTool Engine ====================
103
class SimpleToolEngine:
    """Multi-head parallel decoder built on a vLLM ``LLM`` instance.

    One shared prompt prefix is built per request.  Each active head tag
    (``<function>``, ``<arg1>`` ...) is appended to that prefix as its own
    prompt, and all prompts are generated in a single batch.
    """

    def __init__(self, model_path: str, version: str = "v2"):
        """Store the configuration; the model is loaded by :meth:`initialize`."""
        self.model_path = model_path
        self.version = version
        self.llm: Optional[LLM] = None
        self.sampling_params = None

    def initialize(self):
        """Load the model, create the default sampling parameters and warm up."""
        print(f"[SimpleTool] Loading model ({self.version}): {self.model_path}")
        self.llm = LLM(
            model=self.model_path,
            trust_remote_code=True,
            enable_prefix_caching=True,
            tensor_parallel_size=1,
            gpu_memory_utilization=0.8,
            max_model_len=4096,
            dtype="auto",
        )
        self.sampling_params = SamplingParams(
            temperature=0.0,
            max_tokens=32,
            stop=STOP_TOKENS,
            include_stop_str_in_output=True,
        )
        print(f"[SimpleTool] Model loaded! (version={self.version})")
        self._warmup()

    def _warmup(self):
        """Run one small generation so the first real request is not cold."""
        print("[SimpleTool] Warming up...")
        dummy_tools = '{"type":"function","function":{"name":"test","parameters":{}}}'
        if self.version == "v1":
            prefix = V1_SYSTEM_TEMPLATE.format(tools_json=dummy_tools)
            prefix += V1_USER_TEMPLATE.format(env="[]", hist="", query="test")
        else:
            prefix = V2_SYSTEM_TEMPLATE.format(system_prompt=V2_DEFAULT_SYSTEM, tools_json=dummy_tools)
            prefix += V2_USER_TEMPLATE.format(hist="", query="test")
        # HEAD_TAGS[0] is <content>; warm up the heads that call() decodes by
        # default: <function> and <arg1>.
        prompts = [prefix + tag for tag in HEAD_TAGS[1:3]]
        self.llm.generate(prompts, self.sampling_params)
        print("[SimpleTool] Warmup complete!")

    def _build_tools_json(self, tools: List[Dict]) -> str:
        """Serialise *tools* as one JSON object per line."""
        return "\n".join(json.dumps(t, ensure_ascii=False) for t in tools)

    def _extract_param_info(self, tools: List[Dict]) -> List[str]:
        """Return up to six distinct parameter names across all *tools*, in first-seen order."""
        names = []
        for tool in tools:
            func = tool.get("function", {})
            params = func.get("parameters", {}).get("properties", {})
            for name in params:
                if name not in names:
                    names.append(name)
        return names[:6]

    def _params_for_function(self, tools: List[Dict], func_name: str) -> List[str]:
        """Return the parameter names of the tool called *func_name*, in declared order.

        Argument heads are positional, so ``argN`` must be matched against the
        called function's own parameters.  When no tool has that exact name,
        the merged list from :meth:`_extract_param_info` is used instead.
        """
        for tool in tools:
            func = tool.get("function", {})
            if func.get("name") == func_name:
                return list(func.get("parameters", {}).get("properties", {}))[:6]
        return self._extract_param_info(tools)

    @staticmethod
    def _coerce_value(val: str):
        """Convert a head value to ``int`` or ``float`` when it is a plain ASCII number.

        Accepts one optional leading ``-``.  Anything else is returned as a
        lower-cased, stripped string.
        """
        digits = val[1:] if val.startswith("-") else val
        if digits.isascii():
            if digits.isdigit():
                return int(val)
            if digits.replace(".", "", 1).isdigit():
                return float(val)
        return val.lower().strip()

    def _get_max_args(self, tools: List[Dict]) -> int:
        """Return the largest parameter count of any single tool, capped at six."""
        max_args = 0
        for tool in tools:
            func = tool.get("function", {})
            params = func.get("parameters", {}).get("properties", {})
            max_args = max(max_args, len(params))
        return min(max_args, 6)

    def _build_prompt(self, request: "FCRequest") -> str:
        """Build the shared prefix according to version."""
        tools_json = self._build_tools_json(request.tools)

        # The query is the content of the last message with role "user".
        query = ""
        for msg in request.messages:
            if msg.role == "user":
                query = msg.content

        hist_list = (request.history or [])[-MAX_HISTORY:]
        hist_str = ", ".join(hist_list) if hist_list else ""

        if self.version == "v1":
            # ── v1: head descriptions + tools in system, env+history+query in user ──
            env_str = json.dumps(request.environment or [], ensure_ascii=False)
            system_part = V1_SYSTEM_TEMPLATE.format(tools_json=tools_json)
            user_part = V1_USER_TEMPLATE.format(env=env_str, hist=hist_str, query=query)
        else:
            # ── v2: domain system + tools in system, history+query in user ──
            # If client sends a system prompt, use it; otherwise use default.
            system_prompt = request.system or V2_DEFAULT_SYSTEM
            system_part = V2_SYSTEM_TEMPLATE.format(
                system_prompt=system_prompt,
                tools_json=tools_json,
            )
            # Backward compat: if environment is provided (old HTML clients),
            # prepend it to the query so the model still sees context.
            env_prefix = ""
            if request.environment:
                env_prefix = "environment: " + json.dumps(request.environment, ensure_ascii=False) + "\n"
            user_part = V2_USER_TEMPLATE.format(hist=hist_str, query=env_prefix + query)

        return system_part + user_part

    def call(self, request: "FCRequest") -> "FCResponse":
        """Decode all active heads for *request* and assemble the function call.

        Returns:
            An ``FCResponse``.  ``success`` is False when the function head is
            empty or ``<|null|>``.
        """
        start = time.perf_counter()

        full_prefix = self._build_prompt(request)

        # Dynamic head selection based on max args
        max_args = self._get_max_args(request.tools)
        head_names = ["function"] + [f"arg{i}" for i in range(1, max_args + 1)]
        if request.include_content_head:
            head_names = ["content"] + head_names

        # Honour the per-request sampling fields; their defaults match the
        # engine-wide parameters created in initialize().
        sampling_params = SamplingParams(
            temperature=request.temperature,
            max_tokens=request.max_tokens,
            stop=STOP_TOKENS,
            include_stop_str_in_output=True,
        )

        prompts = [f"{full_prefix}<{name}>" for name in head_names]
        outputs = self.llm.generate(prompts, sampling_params)

        latency_ms = (time.perf_counter() - start) * 1000

        # Parse outputs: strip one trailing stop string from every head.
        heads = {}
        for name, output in zip(head_names, outputs):
            text = output.outputs[0].text.strip()
            for stop in STOP_TOKENS:
                if text.endswith(stop):
                    text = text[:-len(stop)].strip()
                    break
            heads[name] = text

        func_name = heads.get("function", "").strip()
        if not func_name or func_name == "<|null|>":
            return FCResponse(
                success=False,
                heads=heads,
                content=heads.get("content"),
                latency_ms=latency_ms,
                error="No function called",
            )

        args = {}
        for i, name in enumerate(self._params_for_function(request.tools, func_name)):
            val = heads.get(f"arg{i+1}", "").strip()
            if val and val != "<|null|>":
                args[name] = self._coerce_value(val)

        return FCResponse(
            success=True,
            function=func_name,
            args=args,
            heads=heads,
            content=heads.get("content"),
            latency_ms=latency_ms,
        )
265
-
266
-
267
- # ==================== FastAPI ====================
268
# Process-wide engine instance; stays None until the lifespan hook has run.
engine: Optional[SimpleToolEngine] = None


@asynccontextmanager
async def lifespan(app: FastAPI):
    """Create and initialise the global engine on startup; log on shutdown."""
    global engine
    engine = SimpleToolEngine(model_path=MODEL_PATH, version=MODEL_VERSION)
    engine.initialize()
    yield
    print("[Server] Shutdown")
278
-
279
-
280
app = FastAPI(title="SimpleTool Server", version="2.0.0", lifespan=lifespan)

# Any origin may call the API, e.g. demo pages opened straight from disk.
# Credentials stay disabled: a wildcard origin must not be combined with
# allow_credentials=True, and this server reads no cookies or auth headers.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)
289
-
290
-
291
@app.get("/health")
async def health():
    """Report whether the model is loaded, plus the configured path and version."""
    is_loaded = engine is not None and engine.llm is not None
    return {
        "status": "ok",
        "loaded": is_loaded,
        "model": MODEL_PATH,
        "version": MODEL_VERSION,
    }
299
-
300
-
301
@app.post("/v1/function_call", response_model=FCResponse)
async def function_call(request: FCRequest):
    """Run one function-call request through the engine.

    Raises ``HTTPException(503)`` while the model is not loaded.  Any
    exception raised by the engine is printed and reported in the response
    body with ``success=False`` rather than propagated.
    """
    model_ready = engine is not None and engine.llm is not None
    if not model_ready:
        raise HTTPException(503, "Model not loaded")
    try:
        result = engine.call(request)
    except Exception as e:
        import traceback
        traceback.print_exc()
        result = FCResponse(success=False, error=str(e), latency_ms=0)
    return result
311
-
312
-
313
if __name__ == "__main__":
    # Startup banner, printed once before uvicorn takes over the process.
    banner = r"""
╔════════════════════════════════════════════════════════════════════╗
║ ║
║ ███████╗██╗███╗ ███╗██████╗ ██╗ ███████╗ ║
║ ██╔════╝██║████╗ ████║██╔══██╗██║ ██╔════╝ ║
║ ███████╗██║██╔████╔██║██████╔╝██║ █████╗ ║
║ ╚════██║██║██║╚██╔╝██║██╔═══╝ ██║ ██╔══╝ ║
║ ███████║██║██║ ╚═╝ ██║██║ ███████╗███████╗ ║
║ ╚══════╝╚═╝╚═╝ ╚═╝╚═╝ ╚══════╝╚══════╝ ║
║ ║
║ SimpleTool vLLM-Server v2.0 ║
║ Multi-Head Parallel Decoding — v1/v2 Compatible ║
║ ║
║ Run Demos: Open demos/*.html in browser ║
║ Build New: Send simpletool-game-guide.md to AI(Claude Gemini...) ║
║ for Building new your own HTML games easily ║
║ Endpoints: ║
║ GET /health - Health check (+ version info) ║
║ POST /v1/function_call - Function call API (v1 & v2) ║
║ ║
╚════════════════════════════════════════════════════════════════════╝
"""
    print(banner)
    uvicorn.run(app, host=SERVER_HOST, port=SERVER_PORT)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
.ipynb_checkpoints/03_test_server-checkpoint.py DELETED
@@ -1,250 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- test_server.py — Hit running rt_server /v1/function_call with 4 scenarios
4
- Usage: python test_server.py [--url http://localhost:8899]
5
- """
6
-
7
- import argparse, json, time, sys, requests
8
-
9
- # ==================== Test Scenarios ====================
10
- SCENARIOS = [
11
- # ── 1. Game: Tower Defense (from benchmark) ──
12
- {
13
- "name": "Game — Tower Defense",
14
- "desc": "use_skill(Amiya)",
15
- "expected_fn": "use_skill",
16
- "request": {
17
- "messages": [{"role": "user", "content":
18
- "Wave 5, BOSS appeared, 8 enemies remaining\n"
19
- "Operators: Blaze(north,HP50%,skill ready) Amiya(center,HP90%,skill ready)\n"
20
- "Enemy direction: concentrated north\n\n"
21
- "Amiya use skill now"
22
- }],
23
- "tools": [
24
- {"type":"function","function":{"name":"move","description":"Move a deployed operator to a new position on the battlefield. Use this when the player wants to reposition a unit to a different lane or strategic point.","parameters":{"type":"object","properties":{
25
- "unit_id":{"type":"string","description":"The name of the operator to move. Must match one of the currently deployed operators shown in the battlefield state. Supports fuzzy matching for ASR input, e.g. 'blaze', 'Blaze', 'BLAZE' all refer to the same operator."},
26
- "target":{"type":"string","description":"The destination position on the battlefield grid. Must be one of: 'north' (top lane), 'south' (bottom lane), 'east' (right/enemy side), 'west' (left/base side), 'center' (middle area). Choose based on the player's spoken direction."}},"required":["unit_id","target"]}}},
27
- {"type":"function","function":{"name":"use_skill","description":"Activate the special skill of a deployed operator. Each operator has a unique skill that can be triggered when the skill gauge is ready. The skill effect depends on the operator type (e.g. AoE damage, healing, buff).","parameters":{"type":"object","properties":{
28
- "unit_id":{"type":"string","description":"The name of the operator whose skill should be activated. The operator must be currently deployed on the battlefield and have their skill ready (skill gauge full). Supports fuzzy name matching for ASR input."},
29
- "skill_id":{"type":"string","description":"Optional skill identifier when an operator has multiple skills. If the operator only has one skill or the player did not specify which skill, this can be omitted. Format: 's1', 's2', 's3' for skill slot 1/2/3."}},"required":["unit_id"]}}},
30
- {"type":"function","function":{"name":"retreat","description":"Withdraw a single operator from the battlefield back to the reserve bench. The operator's redeployment timer starts after retreat. Use when the player wants to pull back a specific unit to save them or free up a deployment slot.","parameters":{"type":"object","properties":{
31
- "unit_id":{"type":"string","description":"The name of the operator to retreat. Must be currently deployed on the battlefield. After retreat, this operator enters cooldown before they can be redeployed. Supports fuzzy name matching for ASR input."}},"required":["unit_id"]}}},
32
- {"type":"function","function":{"name":"set_stance","description":"Change the combat behavior mode of a deployed operator. This affects how the operator selects targets and whether they prioritize attacking or surviving.","parameters":{"type":"object","properties":{
33
- "unit_id":{"type":"string","description":"The name of the operator whose stance should be changed. Must be currently deployed on the battlefield. Supports fuzzy name matching for ASR input."},
34
- "stance":{"type":"string","description":"The behavior mode to set. Must be one of: 'aggressive' (prioritize attacking nearest enemy, maximize DPS), 'defensive' (prioritize blocking and damage reduction, focus on survival), 'hold' (stay in position and only attack enemies in range, do not chase)."}},"required":["unit_id","stance"]}}},
35
- {"type":"function","function":{"name":"retreat_all","description":"Emergency retreat of all currently deployed operators from the battlefield at once. Use only when the player explicitly requests a full withdrawal, typically in dire situations. All operators enter redeployment cooldown simultaneously.","parameters":{"type":"object","properties":{}}}},
36
- {"type":"function","function":{"name":"pass","description":"Take no action this turn. Use when the player's command has already been fulfilled in history, or when the player explicitly says to wait, skip, or do nothing. Also use when the voice input is ambiguous and no clear command can be extracted.","parameters":{"type":"object","properties":{}}}}
37
- ],
38
- "system": "You are the voice command interpreter for a real-time tower defense game. The player issues orders by voice. You convert ASR-transcribed commands into function calls.\n\nRules:\n- One function call per command\n- Fuzzy match operator names\n- Positions: north, south, east, west, center\n- If all tasks in history are done, call pass",
39
- "history": []
40
- }
41
- },
42
- # ── 2. Robotic Arm — Assembly (from benchmark) ──
43
- {
44
- "name": "Robotic Arm — Assembly",
45
- "desc": "move_to(300,150,50,slow)",
46
- "expected_fn": "move_to",
47
- "request": {
48
- "messages": [{"role": "user", "content":
49
- "Arm at home (0,0,500), gripper open\n"
50
- "Workpiece: red gear at (300,150,50), target tray at (600,0,80)\n\n"
51
- "Move to the red gear position slowly"
52
- }],
53
- "tools": [
54
- {"type":"function","function":{"name":"move_to","description":"Move the robotic arm end-effector (tool center point) to a specified 3D coordinate in the workspace. The arm plans a collision-free path from its current position to the target. Optionally control movement speed for precision tasks.","parameters":{"type":"object","properties":{
55
- "x":{"type":"number","description":"Target X coordinate in millimeters, relative to the robot base frame origin. Positive X points forward (away from the robot base). Valid range depends on arm reach, typically -800 to 800 mm."},
56
- "y":{"type":"number","description":"Target Y coordinate in millimeters, relative to the robot base frame origin. Positive Y points to the left when facing the robot. Valid range depends on arm reach, typically -800 to 800 mm."},
57
- "z":{"type":"number","description":"Target Z coordinate in millimeters, relative to the robot base frame origin (table surface = 0). Positive Z points upward. Must be >= 0 to avoid collision with the work surface. Typical range: 0 to 500 mm."},
58
- "speed":{"type":"string","description":"Movement speed profile for the path. 'slow' (25% max velocity) for precision placement and delicate parts, 'normal' (50% max velocity) for standard pick-and-place, 'fast' (100% max velocity) for rapid repositioning when precision is not critical. Default: 'normal'."}},"required":["x","y","z"]}}},
59
- {"type":"function","function":{"name":"grip","description":"Close the gripper jaws to grasp an object at the current end-effector position. The gripper applies the specified force and holds it. Must be called after positioning the arm above/around the target object.","parameters":{"type":"object","properties":{
60
- "force":{"type":"number","description":"Gripping force in Newtons applied by the gripper jaws. Choose based on object fragility: 10N for light/fragile items (electronics, thin plastic), 50N for medium items (standard gears, metal parts), 100N for heavy/robust items (large castings, steel blocks). Excessive force may damage delicate workpieces."}},"required":["force"]}}},
61
- {"type":"function","function":{"name":"release","description":"Open the gripper jaws to release the currently held object. The gripper fully opens to its maximum width. Should be called after positioning the arm at the target placement location. Ensure the object is at a safe height above the surface before releasing.","parameters":{"type":"object","properties":{}}}},
62
- {"type":"function","function":{"name":"rotate","description":"Rotate the end-effector around a specified axis without changing its position. Used to orient the gripper or tool for proper approach angle before grasping, or to rotate a held workpiece for assembly alignment.","parameters":{"type":"object","properties":{
63
- "axis":{"type":"string","description":"The rotation axis in the end-effector frame. 'roll' rotates around the approach direction (Z-axis of tool frame, like turning a screwdriver), 'pitch' tilts the end-effector up/down (like nodding), 'yaw' swings the end-effector left/right (like shaking head). Choose based on the desired orientation change."},
64
- "angle":{"type":"number","description":"Rotation angle in degrees. Positive values follow the right-hand rule around the specified axis. Typical range: -180 to 180 degrees. Small angles (< 15°) for fine adjustment, larger angles for major reorientation."}},"required":["axis","angle"]}}},
65
- {"type":"function","function":{"name":"home","description":"Return the robotic arm to its predefined home position (0, 0, 500) with the gripper pointing straight down and jaws open. Use as a safe starting/ending position for task sequences, or to clear the workspace. The arm takes a collision-free path at normal speed.","parameters":{"type":"object","properties":{}}}}
66
- ],
67
- "system": "You are the voice controller for an industrial 6-axis robotic arm. You convert spoken commands into function calls.\n\nRules:\n- One function call per command\n- Coordinates in mm, angles in degrees\n- Gripper force: light=10N, medium=50N, heavy=100N\n- Speed: slow/normal/fast",
68
- "history": []
69
- }
70
- },
71
- # ── 3. Digital Human — Streamer (from benchmark) ──
72
- {
73
- "name": "Digital Human — Streamer",
74
- "desc": "speak(welcome,cheerful)",
75
- "expected_fn": "speak",
76
- "request": {
77
- "messages": [{"role": "user", "content":
78
- "Stream just started, viewers flooding in\n"
79
- "Chat: \"Hello streamer!\" \"Good evening!\"\n"
80
- "Director: greet the audience warmly, say welcome and look at camera"
81
- }],
82
- "tools": [
83
- {"type":"function","function":{"name":"set_expression","description":"Set the facial expression of the digital human avatar. Controls the blend shapes for eyes, eyebrows, and mouth to display the target emotion. The expression persists until changed by another set_expression call or overridden by a speak animation.","parameters":{"type":"object","properties":{
84
- "emotion":{"type":"string","description":"The target facial expression to display. Must be one of: 'happy' (smile, raised cheeks), 'sad' (downturned mouth, drooping eyebrows), 'surprised' (wide eyes, raised eyebrows, open mouth), 'angry' (furrowed brows, tight lips), 'neutral' (relaxed default face), 'thinking' (slightly furrowed brow, eyes looking up/away, subtle lip purse)."},
85
- "intensity":{"type":"number","description":"The strength of the facial expression blend, from 0.0 (barely visible, subtle hint) to 1.0 (maximum exaggeration, full expression). Recommended: 0.3-0.5 for natural conversation, 0.6-0.8 for reactive moments, 0.9-1.0 for comedic or dramatic emphasis."}},"required":["emotion","intensity"]}}},
86
- {"type":"function","function":{"name":"speak","description":"Make the digital human speak the given text with lip-sync animation and appropriate facial expressions. The TTS engine converts text to audio while the avatar performs real-time viseme-based lip synchronization. The tone parameter affects both voice prosody and accompanying facial micro-expressions.","parameters":{"type":"object","properties":{
87
- "text":{"type":"string","description":"The speech content for the digital human to say aloud. Should be natural conversational language appropriate for a live stream context. Keep sentences concise (under 50 characters preferred for real-time responsiveness). May include casual expressions, emoji descriptions, or audience interaction phrases."},
88
- "tone":{"type":"string","description":"The vocal tone and emotional coloring of the speech delivery. Must be one of: 'cheerful' (upbeat, warm, higher pitch, for greetings and positive moments), 'calm' (steady, soothing, moderate pace, for explanations and transitions), 'serious' (lower pitch, measured pace, for important announcements), 'excited' (high energy, faster pace, emphasis peaks, for reactions and hype moments)."}},"required":["text","tone"]}}},
89
- {"type":"function","function":{"name":"gesture","description":"Trigger a pre-defined body gesture animation on the digital human avatar. The gesture plays once and blends back to the idle pose. Can be combined with speak or set_expression for more natural multi-channel communication.","parameters":{"type":"object","properties":{
90
- "type":{"type":"string","description":"The gesture animation to play. Must be one of: 'wave' (friendly hand wave, for greetings and farewells), 'nod' (head nod, to show agreement or acknowledgment), 'shake_head' (head shake, to express disagreement or disbelief), 'bow' (respectful bow, for gratitude or formal greeting), 'point' (index finger pointing forward, to direct attention), 'thumbs_up' (approval gesture, for positive feedback), 'clap' (both hands clapping, for celebration or applause)."}},"required":["type"]}}},
91
- {"type":"function","function":{"name":"look_at","description":"Direct the digital human's eye gaze and subtle head orientation toward a specified target. Creates natural eye contact or directional attention. The gaze shift is smoothly interpolated over ~200ms for realistic movement.","parameters":{"type":"object","properties":{
92
- "target":{"type":"string","description":"The gaze target direction. Must be one of: 'camera' (look directly at the audience through the camera lens, creates eye contact with viewers), 'left' (glance to the left side of the screen, e.g. toward a chat panel or co-host), 'right' (glance to the right, e.g. toward a game screen or secondary content), 'up' (look upward, conveys thinking or reacting to something above), 'down' (look downward, conveys reading chat, shyness, or sadness)."}},"required":["target"]}}},
93
- {"type":"function","function":{"name":"idle","description":"Return the digital human to its default idle animation loop. Resets any active expression to neutral, stops ongoing gestures, and returns gaze to a soft forward direction with natural idle micro-movements (subtle breathing, occasional blinks, slight sway). Use during pauses or transitions between active segments.","parameters":{"type":"object","properties":{}}}}
94
- ],
95
- "system": "You are the expression controller for a virtual digital human streamer. You convert director instructions into animation function calls.\n\nRules:\n- One function call per instruction\n- Emotion intensity: 0.0-1.0\n- Speech text should be natural\n- Tone: cheerful/calm/serious/excited",
96
- "history": []
97
- }
98
- },
99
- # ── 4. Neon Arena (what HTML actually sends, legacy env style) ──
100
- {
101
- "name": "Neon Arena — Legacy HTML",
102
- "desc": "fire(left) or move(left)",
103
- "expected_fn": "move",
104
- "request": {
105
- "messages": [{"role": "user", "content":
106
- "Arena 900x600. FIRE or Call move(dir) or fire(dir). dir:up/down/left/right"
107
- }],
108
- "tools": [
109
- {"type":"function","function":{"name":"move","description":"Move the player's spaceship in the specified direction by one step on the 900x600 arena grid. Use to reposition for better firing angle, dodge incoming bullets, or approach/retreat from enemies.","parameters":{"type":"object","properties":{
110
- "direction":{"type":"string","enum":["up","down","left","right"],"description":"The movement direction on the arena. 'up' decreases Y (toward top edge), 'down' increases Y (toward bottom edge), 'left' decreases X (toward left edge), 'right' increases X (toward right edge). Choose based on tactical positioning relative to the player and arena walls."}},"required":["direction"]}}},
111
- {"type":"function","function":{"name":"fire","description":"Fire a bullet from the spaceship in the specified direction. The bullet travels in a straight line until it hits a target or exits the arena boundary. Use when aligned with the player's position on the horizontal or vertical axis for best hit probability.","parameters":{"type":"object","properties":{
112
- "direction":{"type":"string","enum":["up","down","left","right"],"description":"The firing direction of the bullet. 'up' fires toward top edge, 'down' fires toward bottom edge, 'left' fires toward left edge (toward player's side), 'right' fires toward right edge. Choose based on current alignment with the player: fire horizontally when align_h=true, vertically when align_v=true."}},"required":["direction"]}}}
113
- ],
114
- "environment": ["pos=700,300","player=100,310","dist=600","align_h=true","align_v=false","cd=0","wall=no"],
115
- "history": ["fire(left)","move(up)","fire(left)"]
116
- }
117
- },
118
- ]
119
-
120
-
121
def check_health(url: str) -> dict:
    """Fetch ``{url}/health`` and return the decoded JSON body.

    The GET request uses a 5-second timeout. Connection errors and JSON
    decoding errors are not handled here and propagate to the caller.
    """
    response = requests.get(f"{url}/health", timeout=5)
    payload = response.json()
    return payload
124
-
125
-
126
def call_fc(url: str, req: dict) -> dict:
    """POST ``req`` to ``{url}/v1/function_call`` and return the JSON reply.

    The elapsed time of the POST itself (``time.perf_counter`` before and
    after the request, converted to milliseconds) is stored in the returned
    dict under the ``"_wall_ms"`` key. The request uses a 30-second timeout.
    """
    started = time.perf_counter()
    response = requests.post(f"{url}/v1/function_call", json=req, timeout=30)
    elapsed_ms = (time.perf_counter() - started) * 1000

    reply = response.json()
    reply["_wall_ms"] = elapsed_ms
    return reply
133
-
134
-
135
def fmt_heads(heads: dict) -> str:
    """Render per-head outputs as one ``name = value`` line per head.

    Heads are emitted in the fixed order function, arg1..arg6, content; keys
    missing from ``heads`` are skipped and any other keys are ignored. A
    falsy value or the literal ``<|null|>`` marker is displayed as ``NULL``.
    The lines are joined with newlines (empty string when nothing matches).
    """
    order = ("function", "arg1", "arg2", "arg3", "arg4", "arg5", "arg6", "content")
    rendered = []
    for name in order:
        if name not in heads:
            continue
        value = heads[name]
        shown = value if (value and value != "<|null|>") else "NULL"
        rendered.append(f" {name:<10} = {shown}")
    return "\n".join(rendered)
143
-
144
-
145
def _mean(values) -> float:
    """Return the arithmetic mean of ``values``, or 0 when there are none."""
    values = list(values)
    return sum(values) / len(values) if values else 0


def main():
    """Run the server test suite and print a latency/accuracy report.

    Command-line options:
        --url     Base URL of the running server (a trailing slash is stripped).
        --rounds  Number of hot rounds per scenario (0 skips the hot rounds).

    Phases, in order: health check, one cold call per scenario, ``--rounds``
    hot passes over all scenarios, one detailed call per scenario, then a
    summary table. A scenario passes when the returned ``function`` equals its
    ``expected_fn``; arguments are printed but not checked.

    Exits with status 1 if the health endpoint cannot be reached or reports
    that the model is not loaded.
    """
    ap = argparse.ArgumentParser(description="Test SimpleTool server")
    ap.add_argument("--url", default="http://localhost:8899")
    ap.add_argument("--rounds", type=int, default=3, help="hot rounds per scenario")
    args = ap.parse_args()

    url = args.url.rstrip("/")
    rule = "=" * 65

    # ── Health ──
    print(f"\n{rule}")
    print(f" SimpleTool Server Test")
    print(f" Target: {url}")
    print(f"{rule}\n")

    try:
        h = check_health(url)
        print(f" /health → {json.dumps(h)}")
        if not h.get("loaded") and h.get("status") != "ok":
            print(" ⚠ Model not loaded!")
            # SystemExit is not an Exception subclass, so the handler below
            # does not intercept this exit.
            sys.exit(1)
    except Exception as e:
        print(f" ✗ Cannot connect: {e}")
        sys.exit(1)

    version = h.get("version", "unknown")
    print(f" Server version: {version}\n")

    # ── Cold start (first call warms KV cache) ──
    print(rule)
    print(f" COLD START")
    print(rule)
    cold_ms = []
    for sc in SCENARIOS:
        r = call_fc(url, sc["request"])
        # Prefer the server-reported latency; fall back to client wall time.
        ms = r.get("latency_ms", r.get("_wall_ms", 0))
        cold_ms.append(ms)
        ok = "✓" if r.get("function", "") == sc["expected_fn"] else "✗"
        got_fn = r.get("function", "?")
        got_args = r.get("args", {})
        print(f" {ok} {sc['name']:<35} {ms:7.1f}ms → {got_fn}({got_args})")
    print()

    # ── Hot rounds ──
    print(rule)
    print(f" HOT ROUNDS (×{args.rounds})")
    print(rule)
    hot_ms = [[] for _ in SCENARIOS]
    for rd in range(args.rounds):
        parts = []
        for i, sc in enumerate(SCENARIOS):
            r = call_fc(url, sc["request"])
            ms = r.get("latency_ms", r.get("_wall_ms", 0))
            hot_ms[i].append(ms)
            parts.append(f"{ms:6.1f}ms")
        print(f" Round {rd+1}: {' '.join(parts)}")
    print()

    # ── Detailed test ──
    print(rule)
    print(f" DETAILED RESULTS")
    print(f"{rule}\n")

    results = []
    for sc in SCENARIOS:
        r = call_fc(url, sc["request"])
        fn = r.get("function", "")
        ok = fn == sc["expected_fn"]
        results.append((sc, r, ok))

        status = "PASS ✓" if ok else "FAIL ✗"
        ms_server = r.get("latency_ms", 0)
        ms_wall = r.get("_wall_ms", 0)

        print(f"─── {sc['name']} ───")
        print(f" {status} expected={sc['expected_fn']} got={fn}")
        print(f" args: {json.dumps(r.get('args', {}), ensure_ascii=False)}")
        print(f" server={ms_server:.1f}ms wall={ms_wall:.1f}ms overhead={ms_wall-ms_server:.1f}ms")
        if r.get("heads"):
            print(f" heads:")
            print(fmt_heads(r["heads"]))
        if r.get("error"):
            print(f" error: {r['error']}")
        print()

    # ── Summary ──
    n = len(results)
    passed = sum(1 for _, _, ok in results if ok)
    avg_cold = _mean(cold_ms)
    # hot_ms is a non-empty list of per-scenario lists even when --rounds is 0,
    # so average over the flattened samples and let _mean handle "no samples".
    avg_hot = _mean(ms for per_scenario in hot_ms for ms in per_scenario)
    avg_detail = _mean(r.get("latency_ms", 0) for _, r, _ in results)

    print(rule)
    print(f" SUMMARY")
    print(rule)
    print(f" Server version : {version}")
    print(f" Accuracy : {passed}/{n}")
    print(f" Cold start avg : {avg_cold:.1f}ms")
    print(f" Hot avg : {avg_hot:.1f}ms")
    print(f" Detail avg : {avg_detail:.1f}ms")
    print()
    print(f" {'Scenario':<35} {'Cold':>7} {'Hot':>7} {'Detail':>7} {'Status':>6}")
    print(f" {'─'*65}")
    for i, (sc, r, ok) in enumerate(results):
        havg = _mean(hot_ms[i])
        mark = "✓" if ok else "✗"
        print(f" {sc['name']:<35} {cold_ms[i]:6.1f} {havg:6.1f} {r.get('latency_ms',0):6.1f} {mark:>5}")
    print()
247
-
248
-
249
- if __name__ == "__main__":
250
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
.ipynb_checkpoints/README-checkpoint.md DELETED
@@ -1,308 +0,0 @@
1
- ---
2
- library_name: transformers
3
- tags:
4
- - simpletool
5
- - tool-calling
6
- - parallel-decoding
7
- license: apache-2.0
8
- datasets:
9
- - your-dataset-name
10
- language:
11
- - en
12
- - zh
13
- pipeline_tag: text-generation
14
- arxiv: 2603.00030
15
- ---
16
- <p align="center">
17
- <a href="README.md">English</a> | <a href="README_zh.md">中文</a>
18
- </p>
19
- <h1 align="center">SimpleTool</h1>
20
-
21
- <p align="center">
22
- <b>Parallel Decoding for Real-Time LLM Function Calling</b>
23
- </p>
24
-
25
- <p align="center">
26
- <a href="https://arxiv.org/abs/2603.00030"><img src="https://img.shields.io/badge/arXiv-2603.00030-red"></a>
27
- <a href="https://huggingface.co/Cialtion/SimpleTool"><img src="https://img.shields.io/badge/🤗-Models-yellow"></a>
28
- <a href="https://www.modelscope.cn/models/cialtion/SimpleTool"><img src="https://img.shields.io/badge/ModelScope-Models-blue"></a>
29
- <a href="#demo-videos"><img src="https://img.shields.io/badge/Bilibili-Demo-00A1D6?logo=bilibili&logoColor=white"></a>
30
- <a href="#demo-videos"><img src="https://img.shields.io/badge/YouTube-Demo-FF0000?logo=youtube&logoColor=white"></a>
31
- <a href="#license"><img src="https://img.shields.io/badge/License-Apache%202.0-green"></a>
32
- </p>
33
-
34
- <p align="center">
35
- A 4B-parameter LLM achieving <b>16 Hz end-to-end real-time function calling</b> — fast enough to drive game AI, robotic arms, and digital humans.
36
- </p>
37
-
38
- ---
39
-
40
- SimpleTool enables **real-time LLM function calling** through multi-head parallel decoding. By introducing special tokens that compress redundant structured output (4–6×) and enable independent generation of function name and arguments, we achieve **3–6× end-to-end speedup** while maintaining competitive accuracy across three application domains: **games**, **robotic control**, and **digital human animation**.
41
-
42
- <p align="center">
43
- <img src="assets/fig_title_panel_a.png" alt="SimpleTool Overview" width="700">
44
- </p>
45
-
46
- ## How It Works
47
-
48
- Traditional function calling generates tokens sequentially — `function → arg1 → arg2 → ...` — so latency scales linearly with output length. SimpleTool exploits two key observations:
49
-
50
- 1. **Token Redundancy**: Structured outputs contain predictable tokens (brackets, parameter names, quotes) that can be compressed into single special tokens.
51
- 2. **Weak Causal Dependencies**: Function arguments are largely independent of each other and can be generated in parallel.
52
-
53
- <p align="center">
54
- <img src="assets/overview.png" alt="SimpleTool Architecture" width="600">
55
- </p>
56
-
57
- By decoding function name and arguments as parallel streams sharing the same prefix KV cache, latency drops from `sum(all_token_times)` to `max(per_head_time)`. The parallel heads utilize idle compute capacity within the memory-bandwidth-bound decode phase, making parallelization nearly free.
58
-
59
- For more details, see our [arXiv paper](https://arxiv.org/abs/2603.00030).
60
-
61
- ---
62
-
63
- ## Quick Start
64
-
65
- ### 1. Setup Environment
66
-
67
- ```bash
68
- git clone https://github.com/HaxxorCialtion/SimpleTool.git
69
- cd SimpleTool
70
- ```
71
-
72
- **Option A — uv (recommended)**
73
- ```bash
74
- uv venv env_rt -p python3.12
75
- source env_rt/bin/activate
76
- uv pip install -r requirements.txt
77
- ```
78
-
79
- **Option B — conda**
80
- ```bash
81
- conda create -n simpletool python=3.12 -y
82
- conda activate simpletool
83
- pip install -r requirements.txt
84
- ```
85
-
86
- **Option C — pip**
87
- ```bash
88
- python3.12 -m venv env_rt
89
- source env_rt/bin/activate
90
- pip install -r requirements.txt
91
- ```
92
-
93
- ### 2. Download Model
94
-
95
- The recommended default model is **RT-Qwen3-4B-AWQ-v2** (4B parameters, AWQ W4A16 quantized, v2 prompt format). All scripts default to `./models/RT-Qwen3-4B-AWQ-v2`.
96
-
97
- ```bash
98
- # HuggingFace
99
- huggingface-cli download Cialtion/SimpleTool \
100
- --include "RT-Qwen3-4B-AWQ-v2/*" --local-dir ./models
101
-
102
- # Or ModelScope
103
- modelscope download --model cialtion/SimpleTool \
104
- --include "RT-Qwen3-4B-AWQ-v2/*" --local_dir ./models
105
- ```
106
-
107
- <details>
108
- <summary><b>All Available Models</b></summary>
109
-
110
- | Model | Params | Latency | HuggingFace | ModelScope |
111
- |-------|--------|---------|-------------|------------|
112
- | RT-Qwen2.5-0.5B-AWQ | 0.5B | ~30ms | [🤗](https://huggingface.co/Cialtion/SimpleTool/tree/main/RT-Qwen2.5-0.5B-AWQ) | [Link](https://www.modelscope.cn/models/cialtion/SimpleTool/tree/master/RT-Qwen2.5-0.5B-AWQ) |
113
- | RT-Qwen2.5-1.5B-AWQ | 1.5B | ~40ms | [🤗](https://huggingface.co/Cialtion/SimpleTool/tree/main/RT-Qwen2.5-1.5B-AWQ) | [Link](https://www.modelscope.cn/models/cialtion/SimpleTool/tree/master/RT-Qwen2.5-1.5B-AWQ) |
114
- | RT-Qwen2.5-3B-AWQ | 3B | ~50ms | [🤗](https://huggingface.co/Cialtion/SimpleTool/tree/main/RT-Qwen2.5-3B-AWQ) | [Link](https://www.modelscope.cn/models/cialtion/SimpleTool/tree/master/RT-Qwen2.5-3B-AWQ) |
115
- | **RT-Qwen3-4B-AWQ-v2** | **4B** | **~60ms** | [🤗](https://huggingface.co/Cialtion/SimpleTool/tree/main/RT-Qwen3-4B-AWQ-v2) | [Link](https://www.modelscope.cn/models/cialtion/SimpleTool/tree/master/RT-Qwen3-4B-AWQ-v2) |
116
- | RT-Qwen3-4B-AWQ | 4B | ~60ms | [🤗](https://huggingface.co/Cialtion/SimpleTool/tree/main/RT-Qwen3-4B-AWQ) | [Link](https://www.modelscope.cn/models/cialtion/SimpleTool/tree/master/RT-Qwen3-4B-AWQ) |
117
- | RT-Qwen2.5-7B-AWQ | 7B | ~70ms | [🤗](https://huggingface.co/Cialtion/SimpleTool/tree/main/RT-Qwen2.5-7B-AWQ) | [Link](https://www.modelscope.cn/models/cialtion/SimpleTool/tree/master/RT-Qwen2.5-7B-AWQ) |
118
- | RT-Qwen2.5-14B-AWQ | 14B | ~130ms | [🤗](https://huggingface.co/Cialtion/SimpleTool/tree/main/RT-Qwen2.5-14B-AWQ) | [Link](https://www.modelscope.cn/models/cialtion/SimpleTool/tree/master/RT-Qwen2.5-14B-AWQ) |
119
- | RT-Qwen3-30B-A3B-AWQ | 30B(A3B) | ~ | [🤗](https://huggingface.co/Cialtion/SimpleTool/tree/main/RT-Qwen3-30B_awq_w4a16) | [Link](https://www.modelscope.cn/models/cialtion/SimpleTool/tree/master/RT-Qwen3-30B_awq_w4a16) |
120
-
121
- > Latency measured on RTX 4090 with vLLM prefix caching. v2 models use an improved prompt format with domain-specific system prompts; v1 models use a generic multi-head instruction header.
122
-
123
- </details>
124
-
125
- ### 3. Run Benchmark (No Server Needed)
126
-
127
- `01_benchmark.py` runs multi-head parallel decoding directly via vLLM across three application domains — game AI, robotic arm control, and digital human animation — with cold start / hot prefill / decode bottleneck analysis.
128
-
129
- ```bash
130
- # v2 model (default)
131
- python 01_benchmark.py --version v2
132
-
133
- # v1 model
134
- python 01_benchmark.py --version v1 --model ./models/RT-Qwen3-4B-AWQ
135
-
136
- # Auto-detect optimal head count per scenario
137
- python 01_benchmark.py --n-args auto
138
- ```
139
-
140
- Example output:
141
- ```
142
- PARALLEL TEST (v2)
143
-
144
- ─── Game — Tower Defense ───
145
- PASS use_skill(Amiya)
146
- function use_skill 4 OK
147
- arg1 Amiya 4 FILL
148
- arg2 <|null|> 3 NULL
149
- e2e=24.6ms max_tok=4
150
-
151
- ─── Robotic Arm — Assembly ───
152
- PASS move_to(300,150,50,slow)
153
- function move_to 4 OK
154
- arg1 300 5 FILL
155
- arg2 150 5 FILL
156
- arg3 500 5 FILL
157
- arg4 slow 3 FILL
158
- e2e=39.9ms max_tok=5
159
-
160
- ─── Digital Human — Streamer ───
161
- PASS speak(welcome,cheerful)
162
- function speak 4 OK
163
- arg1 Welcome! 4 FILL
164
- arg2 cheerful 5 FILL
165
- e2e=29.1ms max_tok=5
166
-
167
- SUMMARY (v2)
168
- Accuracy : 3/3
169
- Cold start avg : 56.1ms
170
- Hot prefill avg: 29.3ms
171
- E2E avg (hot) : 31.2ms
172
- E2E / max_tok : 6.7ms/tok (decode bottleneck)
173
- ```
174
-
175
- The script also prints the full prompt structure and reconstructed multi-head output for inspection.
176
-
177
- ### 4. Start Server
178
-
179
- `02_server.py` wraps the engine in a FastAPI server with CORS support. HTML game clients connect to it.
180
-
181
- ```bash
182
- python 02_server.py
183
- ```
184
-
185
- Server starts at `http://localhost:8899` with two endpoints:
186
-
187
- | Endpoint | Method | Description |
188
- |----------|--------|-------------|
189
- | `/health` | GET | Health check, model version info |
190
- | `/v1/function_call` | POST | Multi-head parallel function call |
191
-
192
- Edit `MODEL_PATH` and `MODEL_VERSION` at the top of `02_server.py` to switch between v1/v2 models.
193
-
194
- ### 5. Test Server
195
-
196
- With the server running, test it from another terminal:
197
-
198
- ```bash
199
- python 03_test_server.py
200
- ```
201
-
202
- This sends the three domain scenarios (game, robotic arm, digital human) plus a legacy-format Neon Arena request to the server API and reports accuracy, cold/hot latency, and per-head output.
203
-
204
- ```bash
205
- # Custom server URL
206
- python 03_test_server.py --url http://192.168.1.100:8899
207
-
208
- # More hot rounds
209
- python 03_test_server.py --rounds 10
210
- ```
211
-
212
- ### 6. Play Demos
213
-
214
- Open demo HTML files in your browser. They connect to the running SimpleTool server.
215
-
216
- | Demo | Description | File |
217
- |------|-------------|------|
218
- | **Pong** | AI vs Human paddle game | `demos/pong_game.html` |
219
- | **Neon Arena** | Multi-AI battle shooter | `demos/neon_arena.html` |
220
-
221
- For games with extra assets:
222
- ```bash
223
- cd demos/neon_arena
224
- python3 -m http.server 8080 --bind 127.0.0.1
225
- ```
226
- Then open http://127.0.0.1:8080/neon_arena.html and enter your SimpleTool server URL (default: `http://localhost:8899`).
227
-
228
- <p align="center">
229
- <video src="https://github.com/user-attachments/assets/436e3b97-e8ab-4d36-9fa0-8f1962da4a38" autoplay loop muted width="400"></video>
230
- <video src="https://github.com/user-attachments/assets/f9b127da-b65e-4a06-b48f-836e759a6029" autoplay loop muted width="400"></video>
231
- </p>
232
-
233
- ---
234
-
235
- ## Project Structure
236
-
237
- ```
238
- SimpleTool/
239
- ├── 01_benchmark.py # Step 1: Direct parallel decode benchmark
240
- ├── 02_server.py # Step 2: FastAPI vLLM server
241
- ├── 03_test_server.py # Step 3: Server API test client
242
- ├── prompts/ # External prompt & scenario files
243
- │ ├── v1_system.txt # v1 multi-head system prompt
244
- │ ├── scenarios.json # 3 domain test scenarios
245
- │ ├── tools_game.jsonl # Tower defense tool definitions
246
- │ ├── tools_arm.jsonl # Robotic arm tool definitions
247
- │ └── tools_avatar.jsonl # Digital human tool definitions
248
- ├── models/ # Downloaded models go here
249
- │ └── RT-Qwen3-4B-AWQ-v2/ # Default model
250
- ├── demos/ # HTML game clients
251
- │ ├── pong_game.html
252
- │ └── neon_arena/
253
- ├── assets/ # Figures for README
254
- ├── requirements.txt
255
- ├── simpletool-game.skill.md # Guide for building new games with AI
256
- ├── README.md
257
- └── README_zh.md
258
- ```
259
-
260
- ## Build Your Own Game
261
-
262
- Feed **`simpletool-game.skill.md`** along with this **`README.md`** into your AI coding agent (Claude Code, Codex, Antigravity, etc.) — the skill file covers server API spec, tool definition format, query design best practices, frontend templates, and dynamic head optimization tips, while the README helps the agent understand the overall project structure. Together they provide everything needed to vibe-code a SimpleTool-powered game.
263
-
264
- ---
265
-
266
- ## Roadmap
267
-
268
- - [ ] **World Simulation** — Large-scale (1,000+ NPCs) real-time AI world simulation with < 200ms action latency per agent
269
- - [ ] **Speculative & Multi-Token Decoding** — Speculative decoding and multi-token prediction for further latency reduction
270
- - [ ] **Native Windows Support** — Windows game engine plugins and native runtime (no need for Docker or WSL)
271
- - [ ] **Apple Ecosystem** — Mac and iPhone on-device deployment (CoreML / Metal)
272
- - [ ] **v3 Architecture** — Fast thinking (real-time SimpleTool) + slow thinking (async meta-cognition) fusion
273
- - [ ] **Embodied Intelligence** — Virtual 3D digital humans, large-scale game engine integration demos
274
- - [ ] **Open Source Training** — Full training code and dataset release
275
-
276
- ---
277
-
278
- ## Demo Videos
279
-
280
- <p align="center">
281
- <a href="#"><img src="https://img.shields.io/badge/Bilibili-Demo-00A1D6?logo=bilibili&logoColor=white"></a>
282
- <a href="#"><img src="https://img.shields.io/badge/YouTube-Demo-FF0000?logo=youtube&logoColor=white"></a>
283
- </p>
284
-
285
- > Video demos coming soon — showcasing real-time game AI, robotic arm control, and digital human animation.
286
-
287
- ---
288
-
289
- ## Citation
290
-
291
- ```bibtex
292
- @article{shi2026simpletool,
293
- title={SimpleTool: Parallel Decoding for Real-Time LLM Function Calling},
294
- author={Shi, Xiaoxin and Wan, Jiaxin and Dong, Linkang and Jiang, Wei and Liu, Yue and Huang, Zengfeng},
295
- journal={arXiv preprint arXiv:2603.00030},
296
- year={2026}
297
- }
298
- ```
299
-
300
- ## Contact
301
-
302
- - **Email**: cialtion737410@sjtu.edu.cn / cialtion@outlook.com
303
- - **QQ Group**: 861244702
304
- - **Bilibili**: [Cialtion](https://space.bilibili.com/Cialtion)
305
-
306
- ## License
307
-
308
- Apache 2.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
.ipynb_checkpoints/README_zh-checkpoint.md DELETED
@@ -1,308 +0,0 @@
1
- ---
2
- library_name: transformers
3
- tags:
4
- - simpletool
5
- - tool-calling
6
- - parallel-decoding
7
- license: apache-2.0
8
- datasets:
9
- - your-dataset-name
10
- language:
11
- - en
12
- - zh
13
- pipeline_tag: text-generation
14
- arxiv: 2603.00030
15
- ---
16
- <p align="center">
17
- <a href="README.md">English</a> | <a href="README_zh.md">中文</a>
18
- </p>
19
- <h1 align="center">SimpleTool</h1>
20
-
21
- <p align="center">
22
- <b>面向实时 LLM 函数调用的并行解码架构</b>
23
- </p>
24
-
25
- <p align="center">
26
- <a href="https://arxiv.org/abs/2603.00030"><img src="https://img.shields.io/badge/arXiv-2603.00030-red"></a>
27
- <a href="https://huggingface.co/Cialtion/SimpleTool"><img src="https://img.shields.io/badge/🤗-Models-yellow"></a>
28
- <a href="https://www.modelscope.cn/models/cialtion/SimpleTool"><img src="https://img.shields.io/badge/ModelScope-Models-blue"></a>
29
- <a href="#演示视频"><img src="https://img.shields.io/badge/Bilibili-Demo-00A1D6?logo=bilibili&logoColor=white"></a>
30
- <a href="#演示视频"><img src="https://img.shields.io/badge/YouTube-Demo-FF0000?logo=youtube&logoColor=white"></a>
31
- <a href="#许可证"><img src="https://img.shields.io/badge/License-Apache%202.0-green"></a>
32
- </p>
33
-
34
- <p align="center">
35
- 一个 4B 参数的 LLM,实现 <b>16 Hz 端到端实时函数调用</b>——足以驱动游戏 AI、机械臂控制和数字人动画。
36
- </p>
37
-
38
- ---
39
-
40
- SimpleTool 通过多头并行解码实现**实时 LLM 函数调用**。我们引入特殊 token 来压缩结构化输出中的冗余信息(4–6 倍压缩),并让函数名与各参数独立并行生成,从而实现**端到端 3–6 倍加速**,同时在三大应用场景——**游戏**、**机械臂控制**和**数字人动画**——中保持具有竞争力的准确率。
41
-
42
- <p align="center">
43
- <img src="assets/fig_title_panel_a.png" alt="SimpleTool 概览" width="700">
44
- </p>
45
-
46
- ## 工作原理
47
-
48
- 传统函数调用按顺序逐 token 生成——`function → arg1 → arg2 → ...`——延迟随输出长度线性增长。SimpleTool 基于两个关键观察:
49
-
50
- 1. **Token 冗余**:结构化输出中存在大量可预测的 token(括号、参数名、引号等),可以压缩为单个特殊 token。
51
- 2. **弱因果依赖**:函数的各个参数之间基本相互独立,可以并行生成。
52
-
53
- <p align="center">
54
- <img src="assets/overview.png" alt="SimpleTool 架构" width="600">
55
- </p>
56
-
57
- 将函数名和各参数作为共享同一前缀 KV 缓存的并行流进行解码,延迟从 `sum(所有token耗时)` 降为 `max(单头耗时)`。并行解码头利用了解码阶段显存带宽受限时的闲置算力,使得并行化几乎零开销。
58
-
59
- 更多细节请参阅我们的 [arXiv 论文](https://arxiv.org/abs/2603.00030)。
60
-
61
- ---
62
-
63
- ## 快速上手
64
-
65
- ### 1. 配置环境
66
-
67
- ```bash
68
- git clone https://github.com/HaxxorCialtion/SimpleTool.git
69
- cd SimpleTool
70
- ```
71
-
72
- **方案 A — uv(推荐)**
73
- ```bash
74
- uv venv env_rt -p python3.12
75
- source env_rt/bin/activate
76
- uv pip install -r requirements.txt
77
- ```
78
-
79
- **方案 B — conda**
80
- ```bash
81
- conda create -n simpletool python=3.12 -y
82
- conda activate simpletool
83
- pip install -r requirements.txt
84
- ```
85
-
86
- **方案 C — pip**
87
- ```bash
88
- python3.12 -m venv env_rt
89
- source env_rt/bin/activate
90
- pip install -r requirements.txt
91
- ```
92
-
93
- ### 2. 下载模型
94
-
95
- 默认推荐模型为 **RT-Qwen3-4B-AWQ-v2**(4B 参数,AWQ W4A16 量化,v2 提示格式)。所有脚本默认路径为 `./models/RT-Qwen3-4B-AWQ-v2`。
96
-
97
- ```bash
98
- # HuggingFace
99
- huggingface-cli download Cialtion/SimpleTool \
100
- --include "RT-Qwen3-4B-AWQ-v2/*" --local-dir ./models
101
-
102
- # 或者 ModelScope(国内推荐)
103
- modelscope download --model cialtion/SimpleTool \
104
- --include "RT-Qwen3-4B-AWQ-v2/*" --local_dir ./models
105
- ```
106
-
107
- <details>
108
- <summary><b>全部可用模型</b></summary>
109
-
110
- | 模型 | 参数量 | 延迟 | HuggingFace | ModelScope |
111
- |------|--------|------|-------------|------------|
112
- | RT-Qwen2.5-0.5B-AWQ | 0.5B | ~30ms | [🤗](https://huggingface.co/Cialtion/SimpleTool/tree/main/RT-Qwen2.5-0.5B-AWQ) | [链接](https://www.modelscope.cn/models/cialtion/SimpleTool/tree/master/RT-Qwen2.5-0.5B-AWQ) |
113
- | RT-Qwen2.5-1.5B-AWQ | 1.5B | ~40ms | [🤗](https://huggingface.co/Cialtion/SimpleTool/tree/main/RT-Qwen2.5-1.5B-AWQ) | [链接](https://www.modelscope.cn/models/cialtion/SimpleTool/tree/master/RT-Qwen2.5-1.5B-AWQ) |
114
- | RT-Qwen2.5-3B-AWQ | 3B | ~50ms | [🤗](https://huggingface.co/Cialtion/SimpleTool/tree/main/RT-Qwen2.5-3B-AWQ) | [链接](https://www.modelscope.cn/models/cialtion/SimpleTool/tree/master/RT-Qwen2.5-3B-AWQ) |
115
- | **RT-Qwen3-4B-AWQ-v2** | **4B** | **~60ms** | [🤗](https://huggingface.co/Cialtion/SimpleTool/tree/main/RT-Qwen3-4B-AWQ-v2) | [链接](https://www.modelscope.cn/models/cialtion/SimpleTool/tree/master/RT-Qwen3-4B-AWQ-v2) |
116
- | RT-Qwen3-4B-AWQ | 4B | ~60ms | [🤗](https://huggingface.co/Cialtion/SimpleTool/tree/main/RT-Qwen3-4B-AWQ) | [链接](https://www.modelscope.cn/models/cialtion/SimpleTool/tree/master/RT-Qwen3-4B-AWQ) |
117
- | RT-Qwen2.5-7B-AWQ | 7B | ~70ms | [🤗](https://huggingface.co/Cialtion/SimpleTool/tree/main/RT-Qwen2.5-7B-AWQ) | [链接](https://www.modelscope.cn/models/cialtion/SimpleTool/tree/master/RT-Qwen2.5-7B-AWQ) |
118
- | RT-Qwen2.5-14B-AWQ | 14B | ~130ms | [🤗](https://huggingface.co/Cialtion/SimpleTool/tree/main/RT-Qwen2.5-14B-AWQ) | [链接](https://www.modelscope.cn/models/cialtion/SimpleTool/tree/master/RT-Qwen2.5-14B-AWQ) |
119
- | RT-Qwen3-30B-A3B-AWQ | 30B(A3B) | ~ | [🤗](https://huggingface.co/Cialtion/SimpleTool/tree/main/RT-Qwen3-30B_awq_w4a16) | [链接](https://www.modelscope.cn/models/cialtion/SimpleTool/tree/master/RT-Qwen3-30B_awq_w4a16) |
120
-
121
- > 延迟数据在 RTX 4090 上使用 vLLM 前缀缓存测得。v2 模型采用改进的提示格式,包含领域专用系统提示;v1 模型使用通用的多头指令头。
122
-
123
- </details>
124
-
125
- ### 3. 运行基准测试(无需启动服务)
126
-
127
- `01_benchmark.py` 通过 vLLM 直接运行多头并行解码,覆盖三大应用场景——游戏 AI、机械臂控制和数字人动画——并输出冷启动 / 热预填充 / 解码瓶颈分析。
128
-
129
- ```bash
130
- # v2 模型(默认)
131
- python 01_benchmark.py --version v2
132
-
133
- # v1 模型
134
- python 01_benchmark.py --version v1 --model ./models/RT-Qwen3-4B-AWQ
135
-
136
- # 自动检测每个场景的最优头数
137
- python 01_benchmark.py --n-args auto
138
- ```
139
-
140
- 输出示例:
141
- ```
142
- PARALLEL TEST (v2)
143
-
144
- ─── Game — Tower Defense ───
145
- PASS use_skill(Amiya)
146
- function use_skill 4 OK
147
- arg1 Amiya 4 FILL
148
- arg2 <|null|> 3 NULL
149
- e2e=24.6ms max_tok=4
150
-
151
- ─── Robotic Arm — Assembly ───
152
- PASS move_to(300,150,50,slow)
153
- function move_to 4 OK
154
- arg1 300 5 FILL
155
- arg2 150 5 FILL
156
- arg3 500 5 FILL
157
- arg4 slow 3 FILL
158
- e2e=39.9ms max_tok=5
159
-
160
- ─── Digital Human — Streamer ───
161
- PASS speak(welcome,cheerful)
162
- function speak 4 OK
163
- arg1 Welcome! 4 FILL
164
- arg2 cheerful 5 FILL
165
- e2e=29.1ms max_tok=5
166
-
167
- SUMMARY (v2)
168
- Accuracy : 3/3
169
- Cold start avg : 56.1ms
170
- Hot prefill avg: 29.3ms
171
- E2E avg (hot) : 31.2ms
172
- E2E / max_tok : 6.7ms/tok (decode bottleneck)
173
- ```
174
-
175
- 脚本还会打印完整的提示结构和重构后的多头输出,便于检查调试。
176
-
177
- ### 4. 启动服务
178
-
179
- `02_server.py` 将推理引擎封装为 FastAPI 服务,支持 CORS 跨域。HTML 游戏客户端通过它连接模型。
180
-
181
- ```bash
182
- python 02_server.py
183
- ```
184
-
185
- 服务启动于 `http://localhost:8899`,提供以下接口:
186
-
187
- | 接口 | 方法 | 说明 |
188
- |------|------|------|
189
- | `/health` | GET | 健康检查,返回模型版本信息 |
190
- | `/v1/function_call` | POST | 多头并行函数调用 |
191
-
192
- 编辑 `02_server.py` 顶部的 `MODEL_PATH` 和 `MODEL_VERSION` 即可切换 v1/v2 模型。
193
-
194
- ### 5. 测试服务
195
-
196
- 服务运行后,在另一个终端中执行:
197
-
198
- ```bash
199
- python 03_test_server.py
200
- ```
201
-
202
- 该脚本向服务端 API 发送三大场景(游戏、机械臂、数字人)的测试请求,报告准确率、冷启动/热启动延迟及各头输出。
203
-
204
- ```bash
205
- # 自定义服务地址
206
- python 03_test_server.py --url http://192.168.1.100:8899
207
-
208
- # 增加热启动轮数
209
- python 03_test_server.py --rounds 10
210
- ```
211
-
212
- ### 6. 体验 Demo
213
-
214
- 在浏览器中打开 Demo HTML 文件,它们会连接到正在运行的 SimpleTool 服务。
215
-
216
- | Demo | 说明 | 文件 |
217
- |------|------|------|
218
- | **Pong** | AI 对战人类的弹球游戏 | `demos/pong_game.html` |
219
- | **Neon Arena** | 多 AI 对战射击游戏 | `demos/neon_arena/neon_arena.html` |
220
-
221
- 部分游戏需要额外资源文件:
222
- ```bash
223
- cd demos/neon_arena
224
- python3 -m http.server 8080 --bind 127.0.0.1
225
- ```
226
- 然后打开 http://127.0.0.1:8080/neon_arena.html,输入 SimpleTool 服务地址(默认:`http://localhost:8899`)。
227
-
228
- <p align="center">
229
- <video src="https://github.com/user-attachments/assets/436e3b97-e8ab-4d36-9fa0-8f1962da4a38" autoplay loop muted width="400"></video>
230
- <video src="https://github.com/user-attachments/assets/f9b127da-b65e-4a06-b48f-836e759a6029" autoplay loop muted width="400"></video>
231
- </p>
232
-
233
- ---
234
-
235
- ## 项目结构
236
-
237
- ```
238
- SimpleTool/
239
- ├── 01_benchmark.py # 第 1 步:直接并行解码基准测试
240
- ├── 02_server.py # 第 2 步:FastAPI vLLM 推理服务
241
- ├── 03_test_server.py # 第 3 步:服务端 API 测试客户端
242
- ├── prompts/ # 外部提示词与场景文件
243
- │ ├── v1_system.txt # v1 多头系统提示
244
- │ ├── scenarios.json # 3 大场景测试用例
245
- │ ├── tools_game.jsonl # 塔防游戏工具定义
246
- │ ├── tools_arm.jsonl # 机械臂工具定义
247
- │ └── tools_avatar.jsonl # 数字人工具定义
248
- ├── models/ # 模型下载目录
249
- │ └── RT-Qwen3-4B-AWQ-v2/ # 默认模型
250
- ├── demos/ # HTML 游戏客户端
251
- │ ├── pong_game.html
252
- │ └── neon_arena/
253
- ├── assets/ # README 配图
254
- ├── requirements.txt
255
- ├── simpletool-game.skill.md # 用 AI 构建新游戏的指南
256
- ├── README.md
257
- └── README_zh.md
258
- ```
259
-
260
- ## 构建你自己的游戏
261
-
262
- 将 **`simpletool-game.skill.md`** 和本项目的 **`README.md`** 一起喂给你的 AI 编程智能体(Claude Code、Codex、Antigravity 等)即可开始 vibe coding。Skill 文件涵盖服务端 API 规格、工具定义格式、Query 设计最佳实践、前端模板及动态头数优化技巧;README 则帮助 AI 理解整体项目结构。两者配合,即可上手开发基于 SimpleTool 的游戏。
263
-
264
- ---
265
-
266
- ## 路线图
267
-
268
- - [ ] **世界模拟** — 大规模(1,000+ NPC)实时 AI 异步世界模拟,单智能体行动端到端延迟 < 200ms
269
- - [ ] **推测解码与多 Token 预测** — 引入推测解码(Speculative Decoding)和多 Token 预测,进一步压缩推理延迟
270
- - [ ] **Windows 原生支持** — Windows 游戏引擎插件与原生运行(无需 Docker 或 WSL)
271
- - [ ] **Apple 生态** — Mac 和 iPhone 端侧部署(CoreML / Metal)
272
- - [ ] **v3 架构** — 快思考(实时 SimpleTool)+ 慢思考(异步元认知)融合
273
- - [ ] **具身智能** — 虚拟 3D 数字人,大型游戏引擎集成演示
274
- - [ ] **开源训练** — 完整训练代码与数据集开放
275
-
276
- ---
277
-
278
- ## 演示视频
279
-
280
- <p align="center">
281
- <a href="#"><img src="https://img.shields.io/badge/Bilibili-Demo-00A1D6?logo=bilibili&logoColor=white"></a>
282
- <a href="#"><img src="https://img.shields.io/badge/YouTube-Demo-FF0000?logo=youtube&logoColor=white"></a>
283
- </p>
284
-
285
- > 演示视频即将上线——展示实时游戏 AI、机械臂控制和数字人动画效果。
286
-
287
- ---
288
-
289
- ## 引用
290
-
291
- ```bibtex
292
- @article{shi2026simpletool,
293
- title={SimpleTool: Parallel Decoding for Real-Time LLM Function Calling},
294
- author={Shi, Xiaoxin and Wan, Jiaxin and Dong, Linkang and Jiang, Wei and Liu, Yue and Huang, Zengfeng},
295
- journal={arXiv preprint arXiv:2603.00030},
296
- year={2026}
297
- }
298
- ```
299
-
300
- ## 联系方式
301
-
302
- - **邮箱**:cialtion737410@sjtu.edu.cn / cialtion@outlook.com
303
- - **QQ 群**:861244702
304
- - **Bilibili**:[Cialtion](https://space.bilibili.com/Cialtion)
305
-
306
- ## 许可证
307
-
308
- Apache 2.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
.ipynb_checkpoints/rt_server-checkpoint.py DELETED
@@ -1,336 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- SimpleTool vLLM Server - Multi-Head Parallel Decoding for Real-Time Function Calling
4
- Supports both v1 and v2 prompt formats. HTML clients need zero changes.
5
- """
6
-
7
- import json
8
- import time
9
- import os
10
- from typing import List, Dict, Any, Optional
11
- from contextlib import asynccontextmanager
12
-
13
- from fastapi import FastAPI, HTTPException
14
- from fastapi.middleware.cors import CORSMiddleware
15
- from pydantic import BaseModel
16
- import uvicorn
17
-
18
- from vllm import LLM, SamplingParams
19
-
20
- # ==================== Config ====================
21
- MODEL_PATH = "../../RT-Qwen3-4B-v2" # v2 model path
22
- MODEL_VERSION = "v2" # "v1" or "v2"
23
- SERVER_HOST = "0.0.0.0"
24
- SERVER_PORT = 8899
25
- MAX_HISTORY = 6
26
-
27
- os.environ.setdefault("CUDA_VISIBLE_DEVICES", "0")
28
-
29
- # ==================== Multi-Head Tags ====================
30
- HEAD_TAGS = ["<content>", "<function>", "<arg1>", "<arg2>", "<arg3>", "<arg4>", "<arg5>", "<arg6>"]
31
- STOP_TOKENS = ["<|null|>", "</content>", "</function>", "</arg1>", "</arg2>", "</arg3>", "</arg4>", "</arg5>", "</arg6>", "<|im_end|>"]
32
-
33
- # ── v1: generic head-format instructions in system, domain context in user ──
34
- V1_SYSTEM_TEMPLATE = """<|im_start|>system
35
- You are a multi-head parallel function calling model.
36
- ## Output Heads
37
-
38
- **Head 0 - <content>**: Natural language response
39
- - Format: <content>response text</content>
40
-
41
- **Head 1 - <function>**: Function names to call
42
- - Format: <function>name</function>
43
-
44
- **Head 2-7 - <arg1>-<arg6>**: Function arguments by position
45
- - Format: <argN>value</argN>
46
- - If Unnecessary: <argN><|null|></argN>
47
-
48
- ## Available Tools:
49
-
50
- {tools_json}
51
- <|im_end|>
52
- """
53
-
54
- V1_USER_TEMPLATE = "<|im_start|>user\nenvironment: {env}\nhistory: [{hist}]\n\n{query}<|im_end|>\n<|im_start|>assistant\n"
55
-
56
- # ── v2: domain system prompt + tools in system, leaner user turn ──
57
- V2_SYSTEM_TEMPLATE = """<|im_start|>system
58
- {system_prompt}
59
-
60
- ## Available Tools:
61
-
62
- {tools_json}
63
- <|im_end|>
64
- """
65
-
66
- V2_USER_TEMPLATE = "<|im_start|>user\nhistory: [{hist}]\n\n{query}<|im_end|>\n<|im_start|>assistant\n"
67
-
68
- # Default system prompt when HTML client doesn't send one (backward compat)
69
- V2_DEFAULT_SYSTEM = "You are a real-time function calling assistant. Convert user commands into function calls using the available tools."
70
-
71
-
72
- # ==================== Data Models ====================
73
- class Message(BaseModel):
74
- role: str
75
- content: str
76
-
77
-
78
- class FCRequest(BaseModel):
79
- messages: List[Message]
80
- tools: List[Dict[str, Any]]
81
- # ── v1 fields (still accepted, used when version=v1) ──
82
- environment: Optional[List[str]] = None
83
- history: Optional[List[str]] = None
84
- # ── v2 optional: domain system prompt ──
85
- system: Optional[str] = None
86
- # ── shared ──
87
- max_tokens: int = 32
88
- temperature: float = 0.0
89
- include_content_head: bool = False
90
-
91
-
92
- class FCResponse(BaseModel):
93
- success: bool
94
- function: Optional[str] = None
95
- args: Dict[str, Any] = {}
96
- heads: Dict[str, str] = {}
97
- content: Optional[str] = None
98
- latency_ms: float = 0
99
- error: Optional[str] = None
100
-
101
-
102
- # ==================== SimpleTool Engine ====================
103
- class SimpleToolEngine:
104
- def __init__(self, model_path: str, version: str = "v2"):
105
- self.model_path = model_path
106
- self.version = version
107
- self.llm: Optional[LLM] = None
108
- self.sampling_params = None
109
-
110
- def initialize(self):
111
- print(f"[SimpleTool] Loading model ({self.version}): {self.model_path}")
112
- self.llm = LLM(
113
- model=self.model_path,
114
- trust_remote_code=True,
115
- enable_prefix_caching=True,
116
- tensor_parallel_size=1,
117
- gpu_memory_utilization=0.8,
118
- max_model_len=4096,
119
- dtype="auto",
120
- )
121
- self.sampling_params = SamplingParams(
122
- temperature=0.0,
123
- max_tokens=32,
124
- stop=STOP_TOKENS,
125
- include_stop_str_in_output=True
126
- )
127
- print(f"[SimpleTool] Model loaded! (version={self.version})")
128
- self._warmup()
129
-
130
- def _warmup(self):
131
- print("[SimpleTool] Warming up...")
132
- dummy_tools = '{"type":"function","function":{"name":"test","parameters":{}}}'
133
- if self.version == "v1":
134
- prefix = V1_SYSTEM_TEMPLATE.format(tools_json=dummy_tools)
135
- prefix += V1_USER_TEMPLATE.format(env="[]", hist="", query="test")
136
- else:
137
- prefix = V2_SYSTEM_TEMPLATE.format(system_prompt=V2_DEFAULT_SYSTEM, tools_json=dummy_tools)
138
- prefix += V2_USER_TEMPLATE.format(hist="", query="test")
139
- prompts = [prefix + tag for tag in HEAD_TAGS[:2]] # HEAD_TAGS[:2] is <content> + <function>; two heads are enough for warmup
140
- self.llm.generate(prompts, self.sampling_params)
141
- print("[SimpleTool] Warmup complete!")
142
-
143
- def _build_tools_json(self, tools: List[Dict]) -> str:
144
- return "\n".join(json.dumps(t, ensure_ascii=False) for t in tools)
145
-
146
- def _extract_param_info(self, tools: List[Dict]) -> List[str]:
147
- names = []
148
- for tool in tools:
149
- func = tool.get("function", {})
150
- params = func.get("parameters", {}).get("properties", {})
151
- for name in params.keys():
152
- if name not in names:
153
- names.append(name)
154
- return names[:6]
155
-
156
- def _get_max_args(self, tools: List[Dict]) -> int:
157
- max_args = 0
158
- for tool in tools:
159
- func = tool.get("function", {})
160
- params = func.get("parameters", {}).get("properties", {})
161
- max_args = max(max_args, len(params))
162
- return min(max_args, 6)
163
-
164
- def _build_prompt(self, request: FCRequest) -> str:
165
- """Build the shared prefix according to version."""
166
- tools_json = self._build_tools_json(request.tools)
167
-
168
- # Extract query from messages
169
- query = ""
170
- for msg in request.messages:
171
- if msg.role == "user":
172
- query = msg.content
173
-
174
- hist_list = (request.history or [])[-MAX_HISTORY:]
175
- hist_str = ", ".join(hist_list) if hist_list else ""
176
-
177
- if self.version == "v1":
178
- # ── v1: head descriptions + tools in system, env+history+query in user ──
179
- env_str = json.dumps(request.environment or [], ensure_ascii=False)
180
- system_part = V1_SYSTEM_TEMPLATE.format(tools_json=tools_json)
181
- user_part = V1_USER_TEMPLATE.format(env=env_str, hist=hist_str, query=query)
182
- else:
183
- # ── v2: domain system + tools in system, history+query in user ──
184
- # If client sends a system prompt, use it; otherwise use default.
185
- # For legacy HTML clients that send environment[], fold it into query.
186
- system_prompt = request.system or V2_DEFAULT_SYSTEM
187
- system_part = V2_SYSTEM_TEMPLATE.format(
188
- system_prompt=system_prompt,
189
- tools_json=tools_json
190
- )
191
- # Backward compat: if environment is provided (old HTML clients),
192
- # prepend it to the query so the model still sees context.
193
- env_prefix = ""
194
- if request.environment:
195
- env_prefix = "environment: " + json.dumps(request.environment, ensure_ascii=False) + "\n"
196
- user_part = V2_USER_TEMPLATE.format(
197
- hist=hist_str,
198
- query=env_prefix + query
199
- )
200
-
201
- return system_part + user_part
202
-
203
- def call(self, request: FCRequest) -> FCResponse:
204
- start = time.perf_counter()
205
-
206
- full_prefix = self._build_prompt(request)
207
-
208
- # Dynamic head selection based on max args
209
- max_args = self._get_max_args(request.tools)
210
- active_tags = ["<function>"] + [f"<arg{i}>" for i in range(1, max_args + 1)]
211
- if request.include_content_head:
212
- active_tags = ["<content>"] + active_tags
213
-
214
- prompts = [full_prefix + tag for tag in active_tags]
215
- outputs = self.llm.generate(prompts, self.sampling_params)
216
-
217
- latency_ms = (time.perf_counter() - start) * 1000
218
-
219
- # Parse outputs
220
- heads = {}
221
- head_names = []
222
- if request.include_content_head:
223
- head_names.append("content")
224
- head_names.append("function")
225
- head_names.extend([f"arg{i}" for i in range(1, max_args + 1)])
226
-
227
- for i, output in enumerate(outputs):
228
- text = output.outputs[0].text.strip()
229
- for stop in STOP_TOKENS:
230
- if text.endswith(stop):
231
- text = text[:-len(stop)].strip()
232
- break
233
- heads[head_names[i]] = text
234
-
235
- func_name = heads.get("function", "").strip()
236
- if not func_name or func_name == "<|null|>":
237
- return FCResponse(
238
- success=False,
239
- heads=heads,
240
- content=heads.get("content"),
241
- latency_ms=latency_ms,
242
- error="No function called"
243
- )
244
-
245
- param_names = self._extract_param_info(request.tools)
246
- args = {}
247
- for i, name in enumerate(param_names):
248
- val = heads.get(f"arg{i+1}", "").strip()
249
- if val and val != "<|null|>":
250
- if val.isdigit():
251
- args[name] = int(val)
252
- elif val.lstrip('-').replace('.', '', 1).isdigit():
253
- args[name] = float(val)
254
- else:
255
- args[name] = val.lower().strip()
256
-
257
- return FCResponse(
258
- success=True,
259
- function=func_name,
260
- args=args,
261
- heads=heads,
262
- content=heads.get("content"),
263
- latency_ms=latency_ms
264
- )
265
-
266
-
267
- # ==================== FastAPI ====================
268
- engine: Optional[SimpleToolEngine] = None
269
-
270
-
271
- @asynccontextmanager
272
- async def lifespan(app: FastAPI):
273
- global engine
274
- engine = SimpleToolEngine(MODEL_PATH, version=MODEL_VERSION)
275
- engine.initialize()
276
- yield
277
- print("[Server] Shutdown")
278
-
279
-
280
- app = FastAPI(title="SimpleTool Server", version="2.0.0", lifespan=lifespan)
281
-
282
- app.add_middleware(
283
- CORSMiddleware,
284
- allow_origins=["*"],
285
- allow_credentials=True,
286
- allow_methods=["*"],
287
- allow_headers=["*"],
288
- )
289
-
290
-
291
- @app.get("/health")
292
- async def health():
293
- return {
294
- "status": "ok",
295
- "loaded": engine is not None and engine.llm is not None,
296
- "model": MODEL_PATH,
297
- "version": MODEL_VERSION,
298
- }
299
-
300
-
301
- @app.post("/v1/function_call", response_model=FCResponse)
302
- async def function_call(request: FCRequest):
303
- if engine is None or engine.llm is None:
304
- raise HTTPException(503, "Model not loaded")
305
- try:
306
- return engine.call(request)
307
- except Exception as e:
308
- import traceback
309
- traceback.print_exc()
310
- return FCResponse(success=False, error=str(e), latency_ms=0)
311
-
312
-
313
- if __name__ == "__main__":
314
- print(r"""
315
- ╔════════════════════════════════════════════════════════════════════╗
316
- ║ ║
317
- ║ ███████╗██╗███╗ ███╗██████╗ ██╗ ███████╗ ║
318
- ║ ██╔════╝██║████╗ ████║██╔══██╗██║ ██╔════╝ ║
319
- ║ ███████╗██║██╔████╔██║██████╔╝██║ █████╗ ║
320
- ║ ╚════██║██║██║╚██╔╝██║██╔═══╝ ██║ ██╔══╝ ║
321
- ║ ███████║██║██║ ╚═╝ ██║██║ ███████╗███████╗ ║
322
- ║ ╚══════╝╚═╝╚═╝ ╚═╝╚═╝ ╚══════╝╚══════╝ ║
323
- ║ ║
324
- ║ SimpleTool vLLM-Server v2.0 ║
325
- ║ Multi-Head Parallel Decoding — v1/v2 Compatible ║
326
- ║ ║
327
- ║ Run Demos: Open demos/*.html in browser ║
328
- ║ Build New: Send simpletool-game-guide.md to AI(Claude Gemini...) ║
329
- ║ for Building new your own HTML games easily ║
330
- ║ Endpoints: ║
331
- ║ GET /health - Health check (+ version info) ║
332
- ║ POST /v1/function_call - Function call API (v1 & v2) ║
333
- ║ ║
334
- ╚════════════════════════════════════════════════════════════════════╝
335
- """)
336
- uvicorn.run(app, host=SERVER_HOST, port=SERVER_PORT)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
.ipynb_checkpoints/simpletool-game.skill-checkpoint.md DELETED
@@ -1,318 +0,0 @@
1
- # SimpleTool Skill — Real-Time AI Application Development
2
-
3
- > **This is a skill file.** Feed it to any AI coding assistant (Claude, Gemini, GPT, Cursor, etc.) as context, then describe the app you want. The AI will generate a working SimpleTool-powered application.
4
- >
5
- > Example prompt: *"Read the attached SimpleTool skill, then build me a Pong game where AI controls one paddle in real-time."*
6
-
7
- ---
8
-
9
- ## 1. What is SimpleTool?
10
-
11
- SimpleTool is a **multi-head parallel decoding** server for real-time LLM function calling. It runs on vLLM and decodes function name + arguments simultaneously instead of sequentially.
12
-
13
- ```
14
- Traditional: function → arg1 → arg2 → ... (sequential, ~200-500ms)
15
- SimpleTool: [function, arg1, arg2, ...] (parallel, ~25-60ms)
16
- ```
17
-
18
- **Application domains**: game AI, robotic arm control, digital human animation, IoT automation — anything that needs < 100ms LLM decision-making.
19
-
20
- ## 2. Server API
21
-
22
- Server default: `http://localhost:8899`
23
-
24
- ### Endpoints
25
- | Method | Path | Description |
26
- |--------|------|-------------|
27
- | GET | `/health` | Health check, returns `{status, version, model}` |
28
- | POST | `/v1/function_call` | Multi-head parallel function call |
29
-
30
- ### Request Format (v2)
31
- ```javascript
32
- {
33
- messages: [{role: 'user', content: 'your query'}],
34
- tools: [...], // OpenAI-format tool definitions
35
- system: "domain prompt", // Domain-specific system prompt (v2)
36
- environment: [...], // Current state info (string array, optional)
37
- history: [...], // Action history (string array, max 6)
38
- include_content_head: false // Whether to generate <content> head
39
- }
40
- ```
41
-
42
- The `system` field lets you inject a domain-specific system prompt (e.g., "You are a robotic arm controller"). If omitted, the server uses a generic default. The `environment` field is optional context folded into the user message.
43
-
44
- ### Response Format
45
- ```javascript
46
- {
47
- success: true,
48
- function: "move",
49
- args: {direction: "up", speed: "fast"}, // Named args (param names from tool def)
50
- heads: { // Raw per-head output
51
- function: "move",
52
- arg1: "up",
53
- arg2: "fast",
54
- arg3: "<|null|>"
55
- },
56
- content: null, // Only if include_content_head was true
57
- latency_ms: 35.2
58
- }
59
- ```
60
-
61
- ## 3. Dynamic Head Count (Critical for Latency!)
62
-
63
- **The server automatically prunes unused heads.** If your tools have at most 2 parameters, only 3 heads are spawned (`<function>`, `<arg1>`, `<arg2>`), not 8. This saves ~40% latency.
64
-
65
- ```
66
- Active heads = [<function>] + [<arg1>...<argN>]
67
- where N = max parameter count across all tool definitions
68
- ```
69
-
70
- **Design tip**: Keep your tools to 1–3 parameters when possible. Fewer params = fewer heads = lower latency.
71
-
72
- ## 4. Tool Definition
73
-
74
- ### Constraints
75
- - Maximum **6 arguments** per function (arg1–arg6)
76
- - Arguments map to `arg1, arg2, ...` in the order defined in `properties`
77
- - Server auto-converts types: numeric strings → int/float, otherwise lowercase string
78
- - Use `enum` to constrain options — this dramatically improves accuracy
79
-
80
- ### Template
81
- ```javascript
82
- const TOOLS = [{
83
- type: "function",
84
- function: {
85
- name: "action_name",
86
- description: "Clear, concise — what this action does and when to use it",
87
- parameters: {
88
- type: "object",
89
- properties: {
90
- param1: {
91
- type: "string",
92
- enum: ["opt_a", "opt_b", "opt_c"], // Constrain! Improves accuracy
93
- description: "What this param controls"
94
- },
95
- param2: {
96
- type: "number",
97
- description: "Numeric value with unit, e.g. 'Force in Newtons'"
98
- }
99
- },
100
- required: ["param1"]
101
- }
102
- }
103
- }];
104
- ```
105
-
106
- ### Multi-Tool Example (Game)
107
- ```javascript
108
- const TOOLS = [
109
- {type:"function", function:{name:"move", description:"Move unit to position", parameters:{type:"object", properties:{unit:{type:"string"}, target:{type:"string", enum:["north","south","east","west"]}}}}},
110
- {type:"function", function:{name:"attack", description:"Attack enemy", parameters:{type:"object", properties:{unit:{type:"string"}, target:{type:"string"}}}}},
111
- {type:"function", function:{name:"retreat", description:"Pull back unit", parameters:{type:"object", properties:{unit:{type:"string"}}}}},
112
- {type:"function", function:{name:"pass", description:"Do nothing this turn", parameters:{type:"object", properties:{}}}}
113
- ];
114
- // Max params = 2 → only 3 heads spawned
115
- ```
116
-
117
- ## 5. Query Design
118
-
119
- ### Principles
120
- 1. **Be imperative** — tell the model what to decide, not just describe state
121
- 2. **Include decision context** — "Ball is BELOW paddle, intercept it" not "Ball y=250"
122
- 3. **List valid options** — "Choose: up/down/stay"
123
- 4. **Keep it short** — shorter query = faster prefill
124
-
125
- ### Good vs Bad
126
- ```
127
- ✅ "Ball 50px BELOW paddle, approaching fast. Move DOWN to intercept. Choose: up/down/stay"
128
- ❌ "Ball position: 250, Paddle position: 200. What should I do?"
129
-
130
- ✅ "Red gear at (300,150,50). Move arm there slowly for pickup."
131
- ❌ "There is a gear somewhere on the table. The arm needs to go to it."
132
-
133
- ✅ "Stream starting, viewers saying hello. Greet them warmly."
134
- ❌ "Viewers are in the chat. Do something appropriate."
135
- ```
136
-
137
- ### Environment & History
138
- ```javascript
139
- // Environment: current state as key=value strings
140
- const env = [
141
- `ball_y=${ballY}`,
142
- `paddle_y=${paddleY}`,
143
- `gap=${gap}`,
144
- `approaching=true`
145
- ];
146
-
147
- // History: recent actions (max 6, server trims automatically)
148
- const history = [
149
- "move(up)", "move(up)", "stay()"
150
- ];
151
- ```
152
-
153
- ### Domain System Prompts (v2)
154
- For v2 server, set a domain-specific system prompt:
155
- ```javascript
156
- // Game AI
157
- const SYSTEM = "You are the AI controller for a Pong game. Move the paddle to intercept the ball. React quickly.";
158
-
159
- // Robotic arm
160
- const SYSTEM = "You are the voice controller for a 6-axis robotic arm. Convert commands to precise function calls. Coordinates in mm.";
161
-
162
- // Digital human
163
- const SYSTEM = "You are the animation controller for a virtual streamer. Convert director instructions to expression and speech calls.";
164
- ```
165
-
166
- ## 6. Frontend Code Standards
167
-
168
- ### Required: Type-Safe Value Extraction
169
- ```javascript
170
- // Values in args may be int, not string — always coerce
171
- function safeStr(v) {
172
- if (v === null || v === undefined) return '';
173
- return String(v).trim().toLowerCase();
174
- }
175
-
176
- // Extract with args (named) first, heads (positional) as fallback
177
- let direction = safeStr(d.args?.direction) || safeStr(d.heads?.arg1);
178
- ```
179
-
180
- ### Required: Validate Return Values
181
- ```javascript
182
- const VALID = ['up', 'down', 'stay'];
183
- if (!VALID.includes(direction)) {
184
- console.warn(`Invalid: "${direction}", fallback to stay`);
185
- direction = 'stay';
186
- }
187
- ```
188
-
189
- ### Required: Error Handling with Fallback
190
- ```javascript
191
- async function callAI() {
192
- try {
193
- const r = await fetch(SERVER_URL + '/v1/function_call', {
194
- method: 'POST',
195
- headers: {'Content-Type': 'application/json'},
196
- body: JSON.stringify(request)
197
- });
198
- const data = await r.json();
199
- if (!data.success) throw new Error(data.error);
200
- applyAction(data);
201
- } catch (e) {
202
- console.error('[AI] Failed:', e);
203
- applyFallbackAI(); // MUST have fallback — never freeze the app
204
- }
205
- }
206
- ```
207
-
208
- ### Required: Logging
209
- ```javascript
210
- console.log(`[Game] Query: ${query}`);
211
- console.log(`[Game] → ${data.function}(${JSON.stringify(data.args)}) ${data.latency_ms.toFixed(0)}ms`);
212
- ```
213
-
214
- ### Recommended: Debug UI Overlay
215
- Show in a corner of your app: current query, raw response, latency (current + rolling average).
216
-
217
- ## 7. Game Loop Pattern
218
-
219
- **Decouple AI from rendering.** The AI loop runs at 10–16 Hz; the render loop runs at 60 fps.
220
-
221
- ```javascript
222
- const AI_INTERVAL = 100; // 100ms = 10 Hz
223
- let aiPending = false;
224
-
225
- // Render loop (60fps) — never blocks on AI
226
- function gameLoop() {
227
- update();
228
- render();
229
- requestAnimationFrame(gameLoop);
230
- }
231
-
232
- // AI loop (async, non-blocking)
233
async function aiLoop() {
  // A previous request is still in flight — skip this tick instead of queueing.
  if (aiPending) return;
  aiPending = true;
  try {
    await callAI();
  } finally {
    // Always release the guard: if callAI() ever rejects, leaving the flag
    // set would make every later tick return early and stall the AI for good.
    aiPending = false;
  }
}
239
-
240
- setInterval(aiLoop, AI_INTERVAL);
241
- gameLoop();
242
- ```
243
-
244
- ## 8. FCClient Template
245
-
246
- Drop-in client class for any HTML/JS application:
247
-
248
- ```javascript
249
- class FCClient {
250
- constructor(url = 'http://localhost:8899') {
251
- this.url = url.replace(/\/$/, '');
252
- }
253
-
254
- async health() {
255
- try {
256
- const r = await fetch(`${this.url}/health`, {signal: AbortSignal.timeout(3000)});
257
- const d = await r.json();
258
- return {ok: d.loaded === true || d.status === 'ok', version: d.version};
259
- } catch (e) {
260
- return {ok: false};
261
- }
262
- }
263
-
264
- async call({query, tools, system, env, history, includeContent = false}) {
265
- const t0 = performance.now();
266
- try {
267
- const r = await fetch(`${this.url}/v1/function_call`, {
268
- method: 'POST',
269
- headers: {'Content-Type': 'application/json'},
270
- body: JSON.stringify({
271
- messages: [{role: 'user', content: query}],
272
- tools,
273
- system, // v2: domain system prompt
274
- environment: env,
275
- history,
276
- include_content_head: includeContent
277
- })
278
- });
279
- const d = await r.json();
280
- return {...d, wall_ms: performance.now() - t0};
281
- } catch (e) {
282
- return {success: false, error: e.message, wall_ms: performance.now() - t0};
283
- }
284
- }
285
- }
286
- ```
287
-
288
- Usage:
289
- ```javascript
290
- const ai = new FCClient('http://localhost:8899');
291
-
292
- const result = await ai.call({
293
- query: "Ball is BELOW. Move down. Choose: up/down/stay",
294
- tools: TOOLS,
295
- system: "You are a Pong AI. Move paddle to intercept ball.",
296
- env: ["ball_y=300", "paddle_y=200", "gap=100"],
297
- history: ["move(down)", "move(down)"]
298
- });
299
-
300
- if (result.success) {
301
- console.log(`${result.function}(${JSON.stringify(result.args)}) in ${result.latency_ms}ms`);
302
- }
303
- ```
304
-
305
- ## 9. Troubleshooting
306
-
307
- | Symptom | Cause | Fix |
308
- |---------|-------|-----|
309
- | AI stuck / no movement | Query too vague | Add decision hints: "Move DOWN to intercept" |
310
- | `.trim is not a function` | `args` values may be int | Use `String(v)` before `.trim()` |
311
- | High latency (>100ms) | Too many heads / long query | Reduce tool params, shorten query/env |
312
- | Wrong function called | Ambiguous tool descriptions | Add `enum`, improve `description` fields |
313
- | `<\|null\|>` in all args | Model confused | Check tool param order matches expectations |
314
-
315
- ---
316
-
317
- **Skill Version**: 2.0 — Supports v1/v2 server, multi-domain (game, robotics, avatar)
318
- **Last Updated**: 2026-03
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
.ipynb_checkpoints/test_server-checkpoint.py DELETED
@@ -1,250 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- test_server.py — Hit running rt_server /v1/function_call with 4 scenarios
4
- Usage: python test_server.py [--url http://localhost:8899]
5
- """
6
-
7
- import argparse, json, time, sys, requests
8
-
9
- # ==================== Test Scenarios ====================
10
- SCENARIOS = [
11
- # ── 1. Game: Tower Defense (from benchmark) ──
12
- {
13
- "name": "Game — Tower Defense",
14
- "desc": "use_skill(Amiya)",
15
- "expected_fn": "use_skill",
16
- "request": {
17
- "messages": [{"role": "user", "content":
18
- "Wave 5, BOSS appeared, 8 enemies remaining\n"
19
- "Operators: Blaze(north,HP50%,skill ready) Amiya(center,HP90%,skill ready)\n"
20
- "Enemy direction: concentrated north\n\n"
21
- "Amiya use skill now"
22
- }],
23
- "tools": [
24
- {"type":"function","function":{"name":"move","description":"Move a deployed operator to a new position on the battlefield. Use this when the player wants to reposition a unit to a different lane or strategic point.","parameters":{"type":"object","properties":{
25
- "unit_id":{"type":"string","description":"The name of the operator to move. Must match one of the currently deployed operators shown in the battlefield state. Supports fuzzy matching for ASR input, e.g. 'blaze', 'Blaze', 'BLAZE' all refer to the same operator."},
26
- "target":{"type":"string","description":"The destination position on the battlefield grid. Must be one of: 'north' (top lane), 'south' (bottom lane), 'east' (right/enemy side), 'west' (left/base side), 'center' (middle area). Choose based on the player's spoken direction."}},"required":["unit_id","target"]}}},
27
- {"type":"function","function":{"name":"use_skill","description":"Activate the special skill of a deployed operator. Each operator has a unique skill that can be triggered when the skill gauge is ready. The skill effect depends on the operator type (e.g. AoE damage, healing, buff).","parameters":{"type":"object","properties":{
28
- "unit_id":{"type":"string","description":"The name of the operator whose skill should be activated. The operator must be currently deployed on the battlefield and have their skill ready (skill gauge full). Supports fuzzy name matching for ASR input."},
29
- "skill_id":{"type":"string","description":"Optional skill identifier when an operator has multiple skills. If the operator only has one skill or the player did not specify which skill, this can be omitted. Format: 's1', 's2', 's3' for skill slot 1/2/3."}},"required":["unit_id"]}}},
30
- {"type":"function","function":{"name":"retreat","description":"Withdraw a single operator from the battlefield back to the reserve bench. The operator's redeployment timer starts after retreat. Use when the player wants to pull back a specific unit to save them or free up a deployment slot.","parameters":{"type":"object","properties":{
31
- "unit_id":{"type":"string","description":"The name of the operator to retreat. Must be currently deployed on the battlefield. After retreat, this operator enters cooldown before they can be redeployed. Supports fuzzy name matching for ASR input."}},"required":["unit_id"]}}},
32
- {"type":"function","function":{"name":"set_stance","description":"Change the combat behavior mode of a deployed operator. This affects how the operator selects targets and whether they prioritize attacking or surviving.","parameters":{"type":"object","properties":{
33
- "unit_id":{"type":"string","description":"The name of the operator whose stance should be changed. Must be currently deployed on the battlefield. Supports fuzzy name matching for ASR input."},
34
- "stance":{"type":"string","description":"The behavior mode to set. Must be one of: 'aggressive' (prioritize attacking nearest enemy, maximize DPS), 'defensive' (prioritize blocking and damage reduction, focus on survival), 'hold' (stay in position and only attack enemies in range, do not chase)."}},"required":["unit_id","stance"]}}},
35
- {"type":"function","function":{"name":"retreat_all","description":"Emergency retreat of all currently deployed operators from the battlefield at once. Use only when the player explicitly requests a full withdrawal, typically in dire situations. All operators enter redeployment cooldown simultaneously.","parameters":{"type":"object","properties":{}}}},
36
- {"type":"function","function":{"name":"pass","description":"Take no action this turn. Use when the player's command has already been fulfilled in history, or when the player explicitly says to wait, skip, or do nothing. Also use when the voice input is ambiguous and no clear command can be extracted.","parameters":{"type":"object","properties":{}}}}
37
- ],
38
- "system": "You are the voice command interpreter for a real-time tower defense game. The player issues orders by voice. You convert ASR-transcribed commands into function calls.\n\nRules:\n- One function call per command\n- Fuzzy match operator names\n- Positions: north, south, east, west, center\n- If all tasks in history are done, call pass",
39
- "history": []
40
- }
41
- },
42
- # ── 2. Robotic Arm — Assembly (from benchmark) ──
43
- {
44
- "name": "Robotic Arm — Assembly",
45
- "desc": "move_to(300,150,50,slow)",
46
- "expected_fn": "move_to",
47
- "request": {
48
- "messages": [{"role": "user", "content":
49
- "Arm at home (0,0,500), gripper open\n"
50
- "Workpiece: red gear at (300,150,50), target tray at (600,0,80)\n\n"
51
- "Move to the red gear position slowly"
52
- }],
53
- "tools": [
54
- {"type":"function","function":{"name":"move_to","description":"Move the robotic arm end-effector (tool center point) to a specified 3D coordinate in the workspace. The arm plans a collision-free path from its current position to the target. Optionally control movement speed for precision tasks.","parameters":{"type":"object","properties":{
55
- "x":{"type":"number","description":"Target X coordinate in millimeters, relative to the robot base frame origin. Positive X points forward (away from the robot base). Valid range depends on arm reach, typically -800 to 800 mm."},
56
- "y":{"type":"number","description":"Target Y coordinate in millimeters, relative to the robot base frame origin. Positive Y points to the left when facing the robot. Valid range depends on arm reach, typically -800 to 800 mm."},
57
- "z":{"type":"number","description":"Target Z coordinate in millimeters, relative to the robot base frame origin (table surface = 0). Positive Z points upward. Must be >= 0 to avoid collision with the work surface. Typical range: 0 to 500 mm."},
58
- "speed":{"type":"string","description":"Movement speed profile for the path. 'slow' (25% max velocity) for precision placement and delicate parts, 'normal' (50% max velocity) for standard pick-and-place, 'fast' (100% max velocity) for rapid repositioning when precision is not critical. Default: 'normal'."}},"required":["x","y","z"]}}},
59
- {"type":"function","function":{"name":"grip","description":"Close the gripper jaws to grasp an object at the current end-effector position. The gripper applies the specified force and holds it. Must be called after positioning the arm above/around the target object.","parameters":{"type":"object","properties":{
60
- "force":{"type":"number","description":"Gripping force in Newtons applied by the gripper jaws. Choose based on object fragility: 10N for light/fragile items (electronics, thin plastic), 50N for medium items (standard gears, metal parts), 100N for heavy/robust items (large castings, steel blocks). Excessive force may damage delicate workpieces."}},"required":["force"]}}},
61
- {"type":"function","function":{"name":"release","description":"Open the gripper jaws to release the currently held object. The gripper fully opens to its maximum width. Should be called after positioning the arm at the target placement location. Ensure the object is at a safe height above the surface before releasing.","parameters":{"type":"object","properties":{}}}},
62
- {"type":"function","function":{"name":"rotate","description":"Rotate the end-effector around a specified axis without changing its position. Used to orient the gripper or tool for proper approach angle before grasping, or to rotate a held workpiece for assembly alignment.","parameters":{"type":"object","properties":{
63
- "axis":{"type":"string","description":"The rotation axis in the end-effector frame. 'roll' rotates around the approach direction (Z-axis of tool frame, like turning a screwdriver), 'pitch' tilts the end-effector up/down (like nodding), 'yaw' swings the end-effector left/right (like shaking head). Choose based on the desired orientation change."},
64
- "angle":{"type":"number","description":"Rotation angle in degrees. Positive values follow the right-hand rule around the specified axis. Typical range: -180 to 180 degrees. Small angles (< 15°) for fine adjustment, larger angles for major reorientation."}},"required":["axis","angle"]}}},
65
- {"type":"function","function":{"name":"home","description":"Return the robotic arm to its predefined home position (0, 0, 500) with the gripper pointing straight down and jaws open. Use as a safe starting/ending position for task sequences, or to clear the workspace. The arm takes a collision-free path at normal speed.","parameters":{"type":"object","properties":{}}}}
66
- ],
67
- "system": "You are the voice controller for an industrial 6-axis robotic arm. You convert spoken commands into function calls.\n\nRules:\n- One function call per command\n- Coordinates in mm, angles in degrees\n- Gripper force: light=10N, medium=50N, heavy=100N\n- Speed: slow/normal/fast",
68
- "history": []
69
- }
70
- },
71
- # ── 3. Digital Human — Streamer (from benchmark) ──
72
- {
73
- "name": "Digital Human — Streamer",
74
- "desc": "speak(welcome,cheerful)",
75
- "expected_fn": "speak",
76
- "request": {
77
- "messages": [{"role": "user", "content":
78
- "Stream just started, viewers flooding in\n"
79
- "Chat: \"Hello streamer!\" \"Good evening!\"\n"
80
- "Director: greet the audience warmly, say welcome and look at camera"
81
- }],
82
- "tools": [
83
- {"type":"function","function":{"name":"set_expression","description":"Set the facial expression of the digital human avatar. Controls the blend shapes for eyes, eyebrows, and mouth to display the target emotion. The expression persists until changed by another set_expression call or overridden by a speak animation.","parameters":{"type":"object","properties":{
84
- "emotion":{"type":"string","description":"The target facial expression to display. Must be one of: 'happy' (smile, raised cheeks), 'sad' (downturned mouth, drooping eyebrows), 'surprised' (wide eyes, raised eyebrows, open mouth), 'angry' (furrowed brows, tight lips), 'neutral' (relaxed default face), 'thinking' (slightly furrowed brow, eyes looking up/away, subtle lip purse)."},
85
- "intensity":{"type":"number","description":"The strength of the facial expression blend, from 0.0 (barely visible, subtle hint) to 1.0 (maximum exaggeration, full expression). Recommended: 0.3-0.5 for natural conversation, 0.6-0.8 for reactive moments, 0.9-1.0 for comedic or dramatic emphasis."}},"required":["emotion","intensity"]}}},
86
- {"type":"function","function":{"name":"speak","description":"Make the digital human speak the given text with lip-sync animation and appropriate facial expressions. The TTS engine converts text to audio while the avatar performs real-time viseme-based lip synchronization. The tone parameter affects both voice prosody and accompanying facial micro-expressions.","parameters":{"type":"object","properties":{
87
- "text":{"type":"string","description":"The speech content for the digital human to say aloud. Should be natural conversational language appropriate for a live stream context. Keep sentences concise (under 50 characters preferred for real-time responsiveness). May include casual expressions, emoji descriptions, or audience interaction phrases."},
88
- "tone":{"type":"string","description":"The vocal tone and emotional coloring of the speech delivery. Must be one of: 'cheerful' (upbeat, warm, higher pitch, for greetings and positive moments), 'calm' (steady, soothing, moderate pace, for explanations and transitions), 'serious' (lower pitch, measured pace, for important announcements), 'excited' (high energy, faster pace, emphasis peaks, for reactions and hype moments)."}},"required":["text","tone"]}}},
89
- {"type":"function","function":{"name":"gesture","description":"Trigger a pre-defined body gesture animation on the digital human avatar. The gesture plays once and blends back to the idle pose. Can be combined with speak or set_expression for more natural multi-channel communication.","parameters":{"type":"object","properties":{
90
- "type":{"type":"string","description":"The gesture animation to play. Must be one of: 'wave' (friendly hand wave, for greetings and farewells), 'nod' (head nod, to show agreement or acknowledgment), 'shake_head' (head shake, to express disagreement or disbelief), 'bow' (respectful bow, for gratitude or formal greeting), 'point' (index finger pointing forward, to direct attention), 'thumbs_up' (approval gesture, for positive feedback), 'clap' (both hands clapping, for celebration or applause)."}},"required":["type"]}}},
91
- {"type":"function","function":{"name":"look_at","description":"Direct the digital human's eye gaze and subtle head orientation toward a specified target. Creates natural eye contact or directional attention. The gaze shift is smoothly interpolated over ~200ms for realistic movement.","parameters":{"type":"object","properties":{
92
- "target":{"type":"string","description":"The gaze target direction. Must be one of: 'camera' (look directly at the audience through the camera lens, creates eye contact with viewers), 'left' (glance to the left side of the screen, e.g. toward a chat panel or co-host), 'right' (glance to the right, e.g. toward a game screen or secondary content), 'up' (look upward, conveys thinking or reacting to something above), 'down' (look downward, conveys reading chat, shyness, or sadness)."}},"required":["target"]}}},
93
- {"type":"function","function":{"name":"idle","description":"Return the digital human to its default idle animation loop. Resets any active expression to neutral, stops ongoing gestures, and returns gaze to a soft forward direction with natural idle micro-movements (subtle breathing, occasional blinks, slight sway). Use during pauses or transitions between active segments.","parameters":{"type":"object","properties":{}}}}
94
- ],
95
- "system": "You are the expression controller for a virtual digital human streamer. You convert director instructions into animation function calls.\n\nRules:\n- One function call per instruction\n- Emotion intensity: 0.0-1.0\n- Speech text should be natural\n- Tone: cheerful/calm/serious/excited",
96
- "history": []
97
- }
98
- },
99
- # ── 4. Neon Arena (what HTML actually sends, legacy env style) ──
100
- {
101
- "name": "Neon Arena — Legacy HTML",
102
- "desc": "fire(left) or move(left)",
103
- "expected_fn": "fire",
104
- "request": {
105
- "messages": [{"role": "user", "content":
106
- "Arena 900x600. FIRE left! Aligned horizontally. Call move(dir) or fire(dir). dir:up/down/left/right"
107
- }],
108
- "tools": [
109
- {"type":"function","function":{"name":"move","description":"Move the player's spaceship in the specified direction by one step on the 900x600 arena grid. Use to reposition for better firing angle, dodge incoming bullets, or approach/retreat from enemies.","parameters":{"type":"object","properties":{
110
- "direction":{"type":"string","enum":["up","down","left","right"],"description":"The movement direction on the arena. 'up' decreases Y (toward top edge), 'down' increases Y (toward bottom edge), 'left' decreases X (toward left edge), 'right' increases X (toward right edge). Choose based on tactical positioning relative to the player and arena walls."}},"required":["direction"]}}},
111
- {"type":"function","function":{"name":"fire","description":"Fire a bullet from the spaceship in the specified direction. The bullet travels in a straight line until it hits a target or exits the arena boundary. Use when aligned with the player's position on the horizontal or vertical axis for best hit probability.","parameters":{"type":"object","properties":{
112
- "direction":{"type":"string","enum":["up","down","left","right"],"description":"The firing direction of the bullet. 'up' fires toward top edge, 'down' fires toward bottom edge, 'left' fires toward left edge (toward player's side), 'right' fires toward right edge. Choose based on current alignment with the player: fire horizontally when align_h=true, vertically when align_v=true."}},"required":["direction"]}}}
113
- ],
114
- "environment": ["pos=700,300","player=100,310","dist=600","align_h=true","align_v=false","cd=0","wall=no"],
115
- "history": ["fire(left)","move(up)","fire(left)"]
116
- }
117
- },
118
- ]
119
-
120
-
121
def check_health(url: str) -> dict:
    """Fetch ``{url}/health`` and return the decoded JSON body.

    Args:
        url: Base URL of the server; ``/health`` is appended to it as-is.

    Returns:
        The parsed JSON payload of the response.
    """
    response = requests.get(f"{url}/health", timeout=5)
    payload = response.json()
    return payload
124
-
125
-
126
def call_fc(url: str, req: dict) -> dict:
    """POST *req* to ``{url}/v1/function_call`` and return the JSON reply.

    The wall-clock duration of the POST, in milliseconds, is added to the
    reply under the ``_wall_ms`` key before it is returned.

    Args:
        url: Base URL of the server.
        req: Request body, sent as JSON.

    Returns:
        The decoded JSON reply, augmented with ``_wall_ms``.
    """
    started = time.perf_counter()
    response = requests.post(f"{url}/v1/function_call", json=req, timeout=30)
    elapsed_ms = (time.perf_counter() - started) * 1000
    reply = response.json()
    reply["_wall_ms"] = elapsed_ms
    return reply
133
-
134
-
135
def fmt_heads(heads: dict) -> str:
    """Render the per-head outputs as one ``name = value`` line per head.

    Only the known head names are shown, always in the fixed order
    function, arg1..arg6, content; any other key in *heads* is ignored.
    A falsy value or the literal ``<|null|>`` marker is displayed as ``NULL``.

    Args:
        heads: Mapping of head name to its raw output. ``None`` or an empty
            mapping yields an empty string instead of raising.

    Returns:
        The formatted lines joined with newlines.
    """
    if not heads:
        return ""

    head_order = ("function", "arg1", "arg2", "arg3", "arg4", "arg5", "arg6", "content")

    def _show(value):
        # Empty output and the explicit null marker are both shown as NULL.
        return "NULL" if (not value or value == "<|null|>") else value

    return "\n".join(
        f" {name:<10} = {_show(heads[name])}"
        for name in head_order
        if name in heads
    )
143
-
144
-
145
def _mean(values) -> float:
    """Return the arithmetic mean of *values*, or 0 when there are none."""
    values = list(values)
    return sum(values) / len(values) if values else 0


def main():
    """Run the server smoke test and print a latency/accuracy report.

    Steps: parse ``--url`` / ``--rounds``, check ``/health`` (exit status 1
    when the server is unreachable or reports no loaded model), then send
    every scenario once as a cold pass, ``--rounds`` times as hot passes and
    once more for the detailed per-scenario report, and finally print a
    summary table.
    """
    ap = argparse.ArgumentParser(description="Test SimpleTool server")
    ap.add_argument("--url", default="http://localhost:8899")
    ap.add_argument("--rounds", type=int, default=3, help="hot rounds per scenario")
    args = ap.parse_args()

    url = args.url.rstrip("/")
    rule = "=" * 65

    # ── Health ──
    print(f"\n{rule}")
    print(" SimpleTool Server Test")
    print(f" Target: {url}")
    print(f"{rule}\n")

    try:
        h = check_health(url)
        print(f" /health → {json.dumps(h)}")
        if not h.get("loaded") and h.get("status") != "ok":
            print(" ⚠ Model not loaded!")
            # SystemExit is not an Exception subclass, so the handler below
            # does not swallow this exit.
            sys.exit(1)
    except Exception as e:
        print(f" ✗ Cannot connect: {e}")
        sys.exit(1)

    version = h.get("version", "unknown")
    print(f" Server version: {version}\n")

    # ── Cold start (first call warms KV cache) ──
    print(rule)
    print(" COLD START")
    print(rule)
    cold_ms = []
    for sc in SCENARIOS:
        r = call_fc(url, sc["request"])
        # Prefer the server-reported latency; fall back to client wall time.
        ms = r.get("latency_ms", r.get("_wall_ms", 0))
        cold_ms.append(ms)
        ok = "✓" if r.get("function", "") == sc["expected_fn"] else "✗"
        print(f" {ok} {sc['name']:<35} {ms:7.1f}ms → {r.get('function','?')}({r.get('args',{})})")
    print()

    # ── Hot rounds ──
    print(rule)
    print(f" HOT ROUNDS (×{args.rounds})")
    print(rule)
    hot_ms = [[] for _ in SCENARIOS]
    for rd in range(args.rounds):
        parts = []
        for i, sc in enumerate(SCENARIOS):
            r = call_fc(url, sc["request"])
            ms = r.get("latency_ms", r.get("_wall_ms", 0))
            hot_ms[i].append(ms)
            parts.append(f"{ms:6.1f}ms")
        print(f" Round {rd+1}: {' '.join(parts)}")
    print()

    # ── Detailed test ──
    print(rule)
    print(" DETAILED RESULTS")
    print(f"{rule}\n")

    results = []
    for sc in SCENARIOS:
        r = call_fc(url, sc["request"])
        fn = r.get("function", "")
        ok = fn == sc["expected_fn"]
        results.append((sc, r, ok))

        status = "PASS ✓" if ok else "FAIL ✗"
        ms_server = r.get("latency_ms", 0)
        ms_wall = r.get("_wall_ms", 0)

        print(f"─── {sc['name']} ───")
        print(f" {status} expected={sc['expected_fn']} got={fn}")
        print(f" args: {json.dumps(r.get('args', {}), ensure_ascii=False)}")
        print(f" server={ms_server:.1f}ms wall={ms_wall:.1f}ms overhead={ms_wall-ms_server:.1f}ms")
        if r.get("heads"):
            print(" heads:")
            print(fmt_heads(r["heads"]))
        if r.get("error"):
            print(f" error: {r['error']}")
        print()

    # ── Summary ──
    n = len(results)
    passed = sum(1 for _, _, ok in results if ok)
    # _mean() yields 0 for an empty sample set, so `--rounds 0` (hot_ms is a
    # list of empty lists) or an empty scenario list no longer divides by zero.
    avg_cold = _mean(cold_ms)
    avg_hot = _mean(ms for per_scenario in hot_ms for ms in per_scenario)
    avg_detail = _mean(r.get("latency_ms", 0) for _, r, _ in results)

    print(rule)
    print(" SUMMARY")
    print(rule)
    print(f" Server version : {version}")
    print(f" Accuracy : {passed}/{n}")
    print(f" Cold start avg : {avg_cold:.1f}ms")
    print(f" Hot avg : {avg_hot:.1f}ms")
    print(f" Detail avg : {avg_detail:.1f}ms")
    print()
    print(f" {'Scenario':<35} {'Cold':>7} {'Hot':>7} {'Detail':>7} {'Status':>6}")
    print(f" {'─'*65}")
    for i, (sc, r, ok) in enumerate(results):
        havg = _mean(hot_ms[i])
        print(f" {sc['name']:<35} {cold_ms[i]:6.1f} {havg:6.1f} {r.get('latency_ms',0):6.1f} {'✓' if ok else '✗':>5}")
    print()
247
-
248
-
249
- if __name__ == "__main__":
250
- main()