Delete .ipynb_checkpoints

Browse files

Files changed (8) hide show

.ipynb_checkpoints/01_benchmark-checkpoint.py +0 -129
.ipynb_checkpoints/02_server-checkpoint.py +0 -336
.ipynb_checkpoints/03_test_server-checkpoint.py +0 -250
.ipynb_checkpoints/README-checkpoint.md +0 -308
.ipynb_checkpoints/README_zh-checkpoint.md +0 -308
.ipynb_checkpoints/rt_server-checkpoint.py +0 -336
.ipynb_checkpoints/simpletool-game.skill-checkpoint.md +0 -318
.ipynb_checkpoints/test_server-checkpoint.py +0 -250

.ipynb_checkpoints/01_benchmark-checkpoint.py DELETED Viewed

@@ -1,129 +0,0 @@
-#!/usr/bin/env python3
-"""SimpleTool multi-head parallel decode — vLLM, v1/v2, external prompts
-python 01_benchmark.py --version v2                    # v2 default model
-python 01_benchmark.py --version v1                    # v1 default model
-python 01_benchmark.py --version v2 --n-args 3         # fixed three arg heads
-python 01_benchmark.py --version v1 --model /my/model  # customed model path
-"""
-import argparse, json, time, os
-from pathlib import Path
-DIR = Path("./prompts")
-HEADS = [("function","<function>","</function>")] + [(f"arg{i}",f"<arg{i}>",f"</arg{i}>") for i in range(1,7)]
-STOPS = ["</function>"] + [f"</arg{i}>" for i in range(1,7)] + ["</content>","<|null|>","<|im_end|>"]
-MODELS = {"v1":"./models/RT-Qwen3-4B-AWQ", "v2":"./models/RT-Qwen3-4B-AWQ-v2"}
-def load_scenarios():
-    scs = json.loads((DIR/"scenarios.json").read_text())
-    for sc in scs:
-        sc["tools"] = (DIR/sc["tools_file"]).read_text().strip()
-    return scs
-def max_tool_params(tools_str):
-    m = 0
-    for l in tools_str.strip().split("\n"):
-        try: m = max(m, len(json.loads(l)["function"]["parameters"]["properties"]))
-        except: pass
-    return m
-def build_prompt(sc, ver):
-    t = sc["tools"]
-    if ver == "v1":
-        v1sys = (DIR/"v1_system.txt").read_text()
-        return (f"<|im_start|>system\n{v1sys}\n## Available Tools:\n\n{t}<|im_end|>\n"
-                f"<|im_start|>user\nenvironment: []\nhistory: {sc['history']}\n\n{sc['system']}\n\n{sc['query']}<|im_end|>\n"
-                f"<|im_start|>assistant\n")
-    return (f"<|im_start|>system\n{sc['system']}\n\n## Available Tools:\n\n{t}<|im_end|>\n"
-            f"<|im_start|>user\nhistory: {sc['history']}\n\n{sc['query']}<|im_end|>\n"
-            f"<|im_start|>assistant\n")
-def clean(t):
-    t = t.strip()
-    return "<|null|>" if "<|null|>" in t or t == "" else t.split("</")[0].strip()
-def main():
-    ap = argparse.ArgumentParser()
-    ap.add_argument("--model", default=None)
-    ap.add_argument("--version", default="v2", choices=["v1","v2"])
-    ap.add_argument("--n-args", default="auto")
-    ap.add_argument("--gpu", type=int, default=0)
-    ap.add_argument("--max-model-len", type=int, default=4096)
-    a = ap.parse_args()
-    a.model = a.model or MODELS[a.version]
-    os.environ["CUDA_VISIBLE_DEVICES"] = str(a.gpu)
-    from vllm import LLM, SamplingParams
-    SC = load_scenarios()
-    print(f"\n{'='*60}\n  {a.version} | {a.model}\n{'='*60}")
-    llm = LLM(model=a.model, trust_remote_code=True, dtype="auto", gpu_memory_utilization=0.80,
-              max_model_len=a.max_model_len, max_num_seqs=8, enable_prefix_caching=True)
-    sp = SamplingParams(temperature=0.0, max_tokens=128, stop=STOPS, include_stop_str_in_output=True)
-    na = [min(max_tool_params(s["tools"]),6) if a.n_args=="auto" else max(1,min(6,int(a.n_args))) for s in SC]
-    for s,n in zip(SC,na): print(f"  {s['name']:<35} heads={1+n}")
-    def run(sc, n):
-        hd = HEADS[:1+n]; base = build_prompt(sc, a.version)
-        t0 = time.perf_counter()
-        outs = llm.generate([base+op for _,op,_ in hd], sp)
-        ms = (time.perf_counter()-t0)*1000
-        raw, toks, full = {}, {}, {}
-        for j,(nm,_,_) in enumerate(hd):
-            if j<len(outs) and outs[j].outputs:
-                o = outs[j].outputs[0]; full[nm]=o.text; raw[nm]=clean(o.text); toks[nm]=len(o.token_ids)
-            else: raw[nm],toks[nm],full[nm] = "<|null|>",0,""
-        return raw, toks, full, ms, hd
-    # Cold
-    print(f"\n{'='*60}\n  COLD START\n{'='*60}")
-    cold = []
-    for i,s in enumerate(SC): _,_,_,ms,_=run(s,na[i]); cold.append(ms); print(f"  {s['name']:<35} {ms:7.1f}ms")
-    # Hot x3
-    print(f"\n{'='*60}\n  HOT WARMUP (3 rounds)\n{'='*60}")
-    hot = [[] for _ in SC]
-    for r in range(3):
-        for i,s in enumerate(SC): _,_,_,ms,_=run(s,na[i]); hot[i].append(ms)
-        print(f"  Round {r+1}: "+"  ".join(f"{hot[j][-1]:6.1f}ms" for j in range(len(SC))))
-    # Test
-    print(f"\n{'='*60}\n  PARALLEL TEST ({a.version})\n{'='*60}\n")
-    res = []
-    for i,s in enumerate(SC):
-        raw,toks,full,ms,hd = run(s,na[i]); mt=max(toks.values()) if toks else 0
-        ok = raw.get("function","") == s["expected"]; res.append((s,raw,toks,full,ms,mt,hd,ok))
-        print(f"─── {s['name']} ───\n{'PASS' if ok else 'FAIL'}  {s['desc']}")
-        for nm,_,_ in hd:
-            v,tc = raw.get(nm,""),toks.get(nm,0); d=v if len(v)<=43 else v[:43]+"…"
-            st = ("OK" if ok else f"WRONG({v})") if nm=="function" else ("NULL" if v=="<|null|>" else "FILL")
-            print(f"  {nm:<10} {d:<45} {tc:<4} {st}")
-        print(f"  e2e={ms:.1f}ms  max_tok={mt}\n")
-    # Summary
-    N=len(res); np_=sum(r[7] for r in res); ae=sum(r[4] for r in res)/N; amt=sum(r[5] for r in res)/N
-    print(f"{'='*60}\n  SUMMARY ({a.version})\n{'='*60}")
-    print(f"  Accuracy       : {np_}/{N}\n  Cold start avg : {sum(cold)/N:.1f}ms\n  Hot prefill avg: {sum(sum(h) for h in hot)/sum(len(h) for h in hot):.1f}ms")
-    print(f"  E2E avg (hot)  : {ae:.1f}ms\n  Max head tokens: {amt:.1f} avg\n  E2E / max_tok  : {ae/amt:.1f}ms/tok (decode bottleneck)\n")
-    print(f"  {'Scenario':<35} {'Cold':>7} {'Hot':>7} {'E2E':>7} {'MaxTk':>6} {'ms/tk':>6}\n  {'─'*70}")
-    for i,(s,_,_,_,ms,mt,_,_) in enumerate(res):
-        print(f"  {s['name']:<35} {cold[i]:6.1f}  {sum(hot[i])/3:6.1f}  {ms:6.1f}  {mt:>5}  {ms/mt if mt else 0:5.1f}")
-    # Example dump
-    s,raw,toks,full,ms,mt,hd,ok = res[0]; base=build_prompt(s,a.version)
-    print(f"\n{'='*60}\n  EXAMPLE ({a.version}): {s['name']}\n{'='*60}")
-    print(f"\n┌─ Shared Prefix ({len(base)} chars) ────────────────────")
-    for ln in base.split("\n"): print(f"│ {ln}")
-    print(f"└──────────────────────────────────────────────────")
-    print(f"\n┌─ Per-Head Trigger Tokens ─────────────────────────")
-    for nm,op,_ in hd: print(f"│  {nm:<10} → {op}")
-    print(f"└──────────────────────────────────────────────────")
-    print(f"\n┌─ Decode Output (all tokens, incl. stop) ──────────")
-    for nm,op,_ in hd: print(f"│  {nm:<10} [{toks.get(nm,0):>2} tok]  {op}{full.get(nm,'')}")
-    print(f"└──────────────────────────────────────────────────")
-    print(f"\n  Reconstructed multi-head response:")
-    for nm,op,cl in hd:
-        if raw.get(nm,"")=="<|null|>": print(f"    {op}<|null|>")
-        else:
-            ft=full.get(nm,""); print(f"    {op}{ft}" if any(ft.rstrip().endswith(x) for x in STOPS) else f"    {op}{ft}{cl}")
-    print()
-if __name__ == "__main__": main()

.ipynb_checkpoints/02_server-checkpoint.py DELETED Viewed

@@ -1,336 +0,0 @@
-#!/usr/bin/env python3
-"""
-SimpleTool vLLM Server - Multi-Head Parallel Decoding for Real-Time Function Calling
-Supports both v1 and v2 prompt formats. HTML clients need zero changes.
-"""
-import json
-import time
-import os
-from typing import List, Dict, Any, Optional
-from contextlib import asynccontextmanager
-from fastapi import FastAPI, HTTPException
-from fastapi.middleware.cors import CORSMiddleware
-from pydantic import BaseModel
-import uvicorn
-from vllm import LLM, SamplingParams
-# ==================== Config ====================
-MODEL_PATH = "./models/RT-Qwen3-4B-AWQ-v2"       # v2 model path
-MODEL_VERSION = "v2"                           # "v1" or "v2"
-SERVER_HOST = "0.0.0.0"
-SERVER_PORT = 8899
-MAX_HISTORY = 6
-os.environ.setdefault("CUDA_VISIBLE_DEVICES", "0")
-# ==================== Multi-Head Tags ====================
-HEAD_TAGS = ["<content>", "<function>", "<arg1>", "<arg2>", "<arg3>", "<arg4>", "<arg5>", "<arg6>"]
-STOP_TOKENS = ["<|null|>", "</content>", "</function>", "</arg1>", "</arg2>", "</arg3>", "</arg4>", "</arg5>", "</arg6>", "<|im_end|>"]
-# ── v1: generic head-format instructions in system, domain context in user ──
-V1_SYSTEM_TEMPLATE = """<|im_start|>system
-You are a multi-head parallel function calling model.
-## Output Heads
-**Head 0 - <content>**: Natural language response
-- Format: <content>response text</content>
-**Head 1 - <function>**: Function names to call
-- Format: <function>name</function>
-**Head 2-7 - <arg1>-<arg6>**: Function arguments by position
-- Format: <argN>value</argN>
-- If Unnecessary: <argN><|null|></argN>
-## Available Tools:
-{tools_json}
-<|im_end|>
-"""
-V1_USER_TEMPLATE = "<|im_start|>user\nenvironment: {env}\nhistory: [{hist}]\n\n{query}<|im_end|>\n<|im_start|>assistant\n"
-# ── v2: domain system prompt + tools in system, leaner user turn ──
-V2_SYSTEM_TEMPLATE = """<|im_start|>system
-{system_prompt}
-## Available Tools:
-{tools_json}
-<|im_end|>
-"""
-V2_USER_TEMPLATE = "<|im_start|>user\nhistory: [{hist}]\n\n{query}<|im_end|>\n<|im_start|>assistant\n"
-# Default system prompt when HTML client doesn't send one (backward compat)
-V2_DEFAULT_SYSTEM = "You are a real-time function calling assistant. Convert user commands into function calls using the available tools."
-# ==================== Data Models ====================
-class Message(BaseModel):
-    role: str
-    content: str
-class FCRequest(BaseModel):
-    messages: List[Message]
-    tools: List[Dict[str, Any]]
-    # ── v1 fields (still accepted, used when version=v1) ──
-    environment: Optional[List[str]] = None
-    history: Optional[List[str]] = None
-    # ── v2 optional: domain system prompt ──
-    system: Optional[str] = None
-    # ── shared ──
-    max_tokens: int = 32
-    temperature: float = 0.0
-    include_content_head: bool = False
-class FCResponse(BaseModel):
-    success: bool
-    function: Optional[str] = None
-    args: Dict[str, Any] = {}
-    heads: Dict[str, str] = {}
-    content: Optional[str] = None
-    latency_ms: float = 0
-    error: Optional[str] = None
-# ==================== SimpleTool Engine ====================
-class SimpleToolEngine:
-    def __init__(self, model_path: str, version: str = "v2"):
-        self.model_path = model_path
-        self.version = version
-        self.llm: Optional[LLM] = None
-        self.sampling_params = None
-    def initialize(self):
-        print(f"[SimpleTool] Loading model ({self.version}): {self.model_path}")
-        self.llm = LLM(
-            model=self.model_path,
-            trust_remote_code=True,
-            enable_prefix_caching=True,
-            tensor_parallel_size=1,
-            gpu_memory_utilization=0.8,
-            max_model_len=4096,
-            dtype="auto",
-        )
-        self.sampling_params = SamplingParams(
-            temperature=0.0,
-            max_tokens=32,
-            stop=STOP_TOKENS,
-            include_stop_str_in_output=True
-        )
-        print(f"[SimpleTool] Model loaded! (version={self.version})")
-        self._warmup()
-    def _warmup(self):
-        print("[SimpleTool] Warming up...")
-        dummy_tools = '{"type":"function","function":{"name":"test","parameters":{}}}'
-        if self.version == "v1":
-            prefix = V1_SYSTEM_TEMPLATE.format(tools_json=dummy_tools)
-            prefix += V1_USER_TEMPLATE.format(env="[]", hist="", query="test")
-        else:
-            prefix = V2_SYSTEM_TEMPLATE.format(system_prompt=V2_DEFAULT_SYSTEM, tools_json=dummy_tools)
-            prefix += V2_USER_TEMPLATE.format(hist="", query="test")
-        prompts = [prefix + tag for tag in HEAD_TAGS[:2]]  # function + arg1 enough
-        self.llm.generate(prompts, self.sampling_params)
-        print("[SimpleTool] Warmup complete!")
-    def _build_tools_json(self, tools: List[Dict]) -> str:
-        return "\n".join(json.dumps(t, ensure_ascii=False) for t in tools)
-    def _extract_param_info(self, tools: List[Dict]) -> List[str]:
-        names = []
-        for tool in tools:
-            func = tool.get("function", {})
-            params = func.get("parameters", {}).get("properties", {})
-            for name in params.keys():
-                if name not in names:
-                    names.append(name)
-        return names[:6]
-    def _get_max_args(self, tools: List[Dict]) -> int:
-        max_args = 0
-        for tool in tools:
-            func = tool.get("function", {})
-            params = func.get("parameters", {}).get("properties", {})
-            max_args = max(max_args, len(params))
-        return min(max_args, 6)
-    def _build_prompt(self, request: FCRequest) -> str:
-        """Build the shared prefix according to version."""
-        tools_json = self._build_tools_json(request.tools)
-        # Extract query from messages
-        query = ""
-        for msg in request.messages:
-            if msg.role == "user":
-                query = msg.content
-        hist_list = (request.history or [])[-MAX_HISTORY:]
-        hist_str = ", ".join(hist_list) if hist_list else ""
-        if self.version == "v1":
-            # ── v1: head descriptions + tools in system, env+history+query in user ──
-            env_str = json.dumps(request.environment or [], ensure_ascii=False)
-            system_part = V1_SYSTEM_TEMPLATE.format(tools_json=tools_json)
-            user_part = V1_USER_TEMPLATE.format(env=env_str, hist=hist_str, query=query)
-        else:
-            # ── v2: domain system + tools in system, history+query in user ──
-            # If client sends a system prompt, use it; otherwise use default.
-            # For legacy HTML clients that send environment[], fold it into query.
-            system_prompt = request.system or V2_DEFAULT_SYSTEM
-            system_part = V2_SYSTEM_TEMPLATE.format(
-                system_prompt=system_prompt,
-                tools_json=tools_json
-            )
-            # Backward compat: if environment is provided (old HTML clients),
-            # prepend it to the query so the model still sees context.
-            env_prefix = ""
-            if request.environment:
-                env_prefix = "environment: " + json.dumps(request.environment, ensure_ascii=False) + "\n"
-            user_part = V2_USER_TEMPLATE.format(
-                hist=hist_str,
-                query=env_prefix + query
-            )
-        return system_part + user_part
-    def call(self, request: FCRequest) -> FCResponse:
-        start = time.perf_counter()
-        full_prefix = self._build_prompt(request)
-        # Dynamic head selection based on max args
-        max_args = self._get_max_args(request.tools)
-        active_tags = ["<function>"] + [f"<arg{i}>" for i in range(1, max_args + 1)]
-        if request.include_content_head:
-            active_tags = ["<content>"] + active_tags
-        prompts = [full_prefix + tag for tag in active_tags]
-        outputs = self.llm.generate(prompts, self.sampling_params)
-        latency_ms = (time.perf_counter() - start) * 1000
-        # Parse outputs
-        heads = {}
-        head_names = []
-        if request.include_content_head:
-            head_names.append("content")
-        head_names.append("function")
-        head_names.extend([f"arg{i}" for i in range(1, max_args + 1)])
-        for i, output in enumerate(outputs):
-            text = output.outputs[0].text.strip()
-            for stop in STOP_TOKENS:
-                if text.endswith(stop):
-                    text = text[:-len(stop)].strip()
-                    break
-            heads[head_names[i]] = text
-        func_name = heads.get("function", "").strip()
-        if not func_name or func_name == "<|null|>":
-            return FCResponse(
-                success=False,
-                heads=heads,
-                content=heads.get("content"),
-                latency_ms=latency_ms,
-                error="No function called"
-            )
-        param_names = self._extract_param_info(request.tools)
-        args = {}
-        for i, name in enumerate(param_names):
-            val = heads.get(f"arg{i+1}", "").strip()
-            if val and val != "<|null|>":
-                if val.isdigit():
-                    args[name] = int(val)
-                elif val.lstrip('-').replace('.', '', 1).isdigit():
-                    args[name] = float(val)
-                else:
-                    args[name] = val.lower().strip()
-        return FCResponse(
-            success=True,
-            function=func_name,
-            args=args,
-            heads=heads,
-            content=heads.get("content"),
-            latency_ms=latency_ms
-        )
-# ==================== FastAPI ====================
-engine: Optional[SimpleToolEngine] = None
-@asynccontextmanager
-async def lifespan(app: FastAPI):
-    global engine
-    engine = SimpleToolEngine(MODEL_PATH, version=MODEL_VERSION)
-    engine.initialize()
-    yield
-    print("[Server] Shutdown")
-app = FastAPI(title="SimpleTool Server", version="2.0.0", lifespan=lifespan)
-app.add_middleware(
-    CORSMiddleware,
-    allow_origins=["*"],
-    allow_credentials=True,
-    allow_methods=["*"],
-    allow_headers=["*"],
-)
-@app.get("/health")
-async def health():
-    return {
-        "status": "ok",
-        "loaded": engine is not None and engine.llm is not None,
-        "model": MODEL_PATH,
-        "version": MODEL_VERSION,
-    }
-@app.post("/v1/function_call", response_model=FCResponse)
-async def function_call(request: FCRequest):
-    if engine is None or engine.llm is None:
-        raise HTTPException(503, "Model not loaded")
-    try:
-        return engine.call(request)
-    except Exception as e:
-        import traceback
-        traceback.print_exc()
-        return FCResponse(success=False, error=str(e), latency_ms=0)
-if __name__ == "__main__":
-    print(r"""
-╔════════════════════════════════════════════════════════════════════╗
-║                                                                    ║
-║   ███████╗██╗███╗   ███╗██████╗ ██╗     ███████╗                   ║
-║   ██╔════╝██║████╗ ████║██╔══██╗██║     ██╔════╝                   ║
-║   ███████╗██║██╔████╔██║██████╔╝██║     █████╗                     ║
-║   ╚════██║██║██║╚██╔╝██║██╔═══╝ ██║     ██╔══╝                     ║
-║   ███████║██║██║ ╚═╝ ██║██║     ███████╗███████╗                   ║
-║   ╚══════╝╚═╝╚═╝     ╚═╝╚═╝     ╚══════╝╚══════╝                   ║
-║                                                                    ║
-║          SimpleTool vLLM-Server v2.0                               ║
-║          Multi-Head Parallel Decoding — v1/v2 Compatible           ║
-║                                                                    ║
-║   Run Demos: Open demos/*.html in browser                          ║
-║   Build New: Send simpletool-game-guide.md to AI(Claude Gemini...) ║
-║              for Building new your own HTML games easily           ║
-║   Endpoints:                                                       ║
-║     GET  /health           - Health check (+ version info)         ║
-║     POST /v1/function_call - Function call API (v1 & v2)          ║
-║                                                                    ║
-╚════════════════════════════════════════════════════════════════════╝
-    """)
-    uvicorn.run(app, host=SERVER_HOST, port=SERVER_PORT)

.ipynb_checkpoints/03_test_server-checkpoint.py DELETED Viewed

@@ -1,250 +0,0 @@
-#!/usr/bin/env python3
-"""
-03_test_server.py — Hit a running SimpleTool server (02_server.py) at /v1/function_call with 4 scenarios
-Usage:  python 03_test_server.py [--url http://localhost:8899] [--rounds N]
-"""
-import argparse, json, time, sys, requests
-# ==================== Test Scenarios ====================
-SCENARIOS = [
-    # ── 1. Game: Tower Defense (from benchmark) ──
-    {
-        "name": "Game — Tower Defense",
-        "desc": "use_skill(Amiya)",
-        "expected_fn": "use_skill",
-        "request": {
-            "messages": [{"role": "user", "content":
-                "Wave 5, BOSS appeared, 8 enemies remaining\n"
-                "Operators: Blaze(north,HP50%,skill ready) Amiya(center,HP90%,skill ready)\n"
-                "Enemy direction: concentrated north\n\n"
-                "Amiya use skill now"
-            }],
-            "tools": [
-                {"type":"function","function":{"name":"move","description":"Move a deployed operator to a new position on the battlefield. Use this when the player wants to reposition a unit to a different lane or strategic point.","parameters":{"type":"object","properties":{
-                    "unit_id":{"type":"string","description":"The name of the operator to move. Must match one of the currently deployed operators shown in the battlefield state. Supports fuzzy matching for ASR input, e.g. 'blaze', 'Blaze', 'BLAZE' all refer to the same operator."},
-                    "target":{"type":"string","description":"The destination position on the battlefield grid. Must be one of: 'north' (top lane), 'south' (bottom lane), 'east' (right/enemy side), 'west' (left/base side), 'center' (middle area). Choose based on the player's spoken direction."}},"required":["unit_id","target"]}}},
-                {"type":"function","function":{"name":"use_skill","description":"Activate the special skill of a deployed operator. Each operator has a unique skill that can be triggered when the skill gauge is ready. The skill effect depends on the operator type (e.g. AoE damage, healing, buff).","parameters":{"type":"object","properties":{
-                    "unit_id":{"type":"string","description":"The name of the operator whose skill should be activated. The operator must be currently deployed on the battlefield and have their skill ready (skill gauge full). Supports fuzzy name matching for ASR input."},
-                    "skill_id":{"type":"string","description":"Optional skill identifier when an operator has multiple skills. If the operator only has one skill or the player did not specify which skill, this can be omitted. Format: 's1', 's2', 's3' for skill slot 1/2/3."}},"required":["unit_id"]}}},
-                {"type":"function","function":{"name":"retreat","description":"Withdraw a single operator from the battlefield back to the reserve bench. The operator's redeployment timer starts after retreat. Use when the player wants to pull back a specific unit to save them or free up a deployment slot.","parameters":{"type":"object","properties":{
-                    "unit_id":{"type":"string","description":"The name of the operator to retreat. Must be currently deployed on the battlefield. After retreat, this operator enters cooldown before they can be redeployed. Supports fuzzy name matching for ASR input."}},"required":["unit_id"]}}},
-                {"type":"function","function":{"name":"set_stance","description":"Change the combat behavior mode of a deployed operator. This affects how the operator selects targets and whether they prioritize attacking or surviving.","parameters":{"type":"object","properties":{
-                    "unit_id":{"type":"string","description":"The name of the operator whose stance should be changed. Must be currently deployed on the battlefield. Supports fuzzy name matching for ASR input."},
-                    "stance":{"type":"string","description":"The behavior mode to set. Must be one of: 'aggressive' (prioritize attacking nearest enemy, maximize DPS), 'defensive' (prioritize blocking and damage reduction, focus on survival), 'hold' (stay in position and only attack enemies in range, do not chase)."}},"required":["unit_id","stance"]}}},
-                {"type":"function","function":{"name":"retreat_all","description":"Emergency retreat of all currently deployed operators from the battlefield at once. Use only when the player explicitly requests a full withdrawal, typically in dire situations. All operators enter redeployment cooldown simultaneously.","parameters":{"type":"object","properties":{}}}},
-                {"type":"function","function":{"name":"pass","description":"Take no action this turn. Use when the player's command has already been fulfilled in history, or when the player explicitly says to wait, skip, or do nothing. Also use when the voice input is ambiguous and no clear command can be extracted.","parameters":{"type":"object","properties":{}}}}
-            ],
-            "system": "You are the voice command interpreter for a real-time tower defense game. The player issues orders by voice. You convert ASR-transcribed commands into function calls.\n\nRules:\n- One function call per command\n- Fuzzy match operator names\n- Positions: north, south, east, west, center\n- If all tasks in history are done, call pass",
-            "history": []
-        }
-    },
-    # ── 2. Robotic Arm — Assembly (from benchmark) ──
-    {
-        "name": "Robotic Arm — Assembly",
-        "desc": "move_to(300,150,50,slow)",
-        "expected_fn": "move_to",
-        "request": {
-            "messages": [{"role": "user", "content":
-                "Arm at home (0,0,500), gripper open\n"
-                "Workpiece: red gear at (300,150,50), target tray at (600,0,80)\n\n"
-                "Move to the red gear position slowly"
-            }],
-            "tools": [
-                {"type":"function","function":{"name":"move_to","description":"Move the robotic arm end-effector (tool center point) to a specified 3D coordinate in the workspace. The arm plans a collision-free path from its current position to the target. Optionally control movement speed for precision tasks.","parameters":{"type":"object","properties":{
-                    "x":{"type":"number","description":"Target X coordinate in millimeters, relative to the robot base frame origin. Positive X points forward (away from the robot base). Valid range depends on arm reach, typically -800 to 800 mm."},
-                    "y":{"type":"number","description":"Target Y coordinate in millimeters, relative to the robot base frame origin. Positive Y points to the left when facing the robot. Valid range depends on arm reach, typically -800 to 800 mm."},
-                    "z":{"type":"number","description":"Target Z coordinate in millimeters, relative to the robot base frame origin (table surface = 0). Positive Z points upward. Must be >= 0 to avoid collision with the work surface. Typical range: 0 to 500 mm."},
-                    "speed":{"type":"string","description":"Movement speed profile for the path. 'slow' (25% max velocity) for precision placement and delicate parts, 'normal' (50% max velocity) for standard pick-and-place, 'fast' (100% max velocity) for rapid repositioning when precision is not critical. Default: 'normal'."}},"required":["x","y","z"]}}},
-                {"type":"function","function":{"name":"grip","description":"Close the gripper jaws to grasp an object at the current end-effector position. The gripper applies the specified force and holds it. Must be called after positioning the arm above/around the target object.","parameters":{"type":"object","properties":{
-                    "force":{"type":"number","description":"Gripping force in Newtons applied by the gripper jaws. Choose based on object fragility: 10N for light/fragile items (electronics, thin plastic), 50N for medium items (standard gears, metal parts), 100N for heavy/robust items (large castings, steel blocks). Excessive force may damage delicate workpieces."}},"required":["force"]}}},
-                {"type":"function","function":{"name":"release","description":"Open the gripper jaws to release the currently held object. The gripper fully opens to its maximum width. Should be called after positioning the arm at the target placement location. Ensure the object is at a safe height above the surface before releasing.","parameters":{"type":"object","properties":{}}}},
-                {"type":"function","function":{"name":"rotate","description":"Rotate the end-effector around a specified axis without changing its position. Used to orient the gripper or tool for proper approach angle before grasping, or to rotate a held workpiece for assembly alignment.","parameters":{"type":"object","properties":{
-                    "axis":{"type":"string","description":"The rotation axis in the end-effector frame. 'roll' rotates around the approach direction (Z-axis of tool frame, like turning a screwdriver), 'pitch' tilts the end-effector up/down (like nodding), 'yaw' swings the end-effector left/right (like shaking head). Choose based on the desired orientation change."},
-                    "angle":{"type":"number","description":"Rotation angle in degrees. Positive values follow the right-hand rule around the specified axis. Typical range: -180 to 180 degrees. Small angles (< 15°) for fine adjustment, larger angles for major reorientation."}},"required":["axis","angle"]}}},
-                {"type":"function","function":{"name":"home","description":"Return the robotic arm to its predefined home position (0, 0, 500) with the gripper pointing straight down and jaws open. Use as a safe starting/ending position for task sequences, or to clear the workspace. The arm takes a collision-free path at normal speed.","parameters":{"type":"object","properties":{}}}}
-            ],
-            "system": "You are the voice controller for an industrial 6-axis robotic arm. You convert spoken commands into function calls.\n\nRules:\n- One function call per command\n- Coordinates in mm, angles in degrees\n- Gripper force: light=10N, medium=50N, heavy=100N\n- Speed: slow/normal/fast",
-            "history": []
-        }
-    },
-    # ── 3. Digital Human — Streamer (from benchmark) ──
-    {
-        "name": "Digital Human — Streamer",
-        "desc": "speak(welcome,cheerful)",
-        "expected_fn": "speak",
-        "request": {
-            "messages": [{"role": "user", "content":
-                "Stream just started, viewers flooding in\n"
-                "Chat: \"Hello streamer!\" \"Good evening!\"\n"
-                "Director: greet the audience warmly, say welcome and look at camera"
-            }],
-            "tools": [
-                {"type":"function","function":{"name":"set_expression","description":"Set the facial expression of the digital human avatar. Controls the blend shapes for eyes, eyebrows, and mouth to display the target emotion. The expression persists until changed by another set_expression call or overridden by a speak animation.","parameters":{"type":"object","properties":{
-                    "emotion":{"type":"string","description":"The target facial expression to display. Must be one of: 'happy' (smile, raised cheeks), 'sad' (downturned mouth, drooping eyebrows), 'surprised' (wide eyes, raised eyebrows, open mouth), 'angry' (furrowed brows, tight lips), 'neutral' (relaxed default face), 'thinking' (slightly furrowed brow, eyes looking up/away, subtle lip purse)."},
-                    "intensity":{"type":"number","description":"The strength of the facial expression blend, from 0.0 (barely visible, subtle hint) to 1.0 (maximum exaggeration, full expression). Recommended: 0.3-0.5 for natural conversation, 0.6-0.8 for reactive moments, 0.9-1.0 for comedic or dramatic emphasis."}},"required":["emotion","intensity"]}}},
-                {"type":"function","function":{"name":"speak","description":"Make the digital human speak the given text with lip-sync animation and appropriate facial expressions. The TTS engine converts text to audio while the avatar performs real-time viseme-based lip synchronization. The tone parameter affects both voice prosody and accompanying facial micro-expressions.","parameters":{"type":"object","properties":{
-                    "text":{"type":"string","description":"The speech content for the digital human to say aloud. Should be natural conversational language appropriate for a live stream context. Keep sentences concise (under 50 characters preferred for real-time responsiveness). May include casual expressions, emoji descriptions, or audience interaction phrases."},
-                    "tone":{"type":"string","description":"The vocal tone and emotional coloring of the speech delivery. Must be one of: 'cheerful' (upbeat, warm, higher pitch, for greetings and positive moments), 'calm' (steady, soothing, moderate pace, for explanations and transitions), 'serious' (lower pitch, measured pace, for important announcements), 'excited' (high energy, faster pace, emphasis peaks, for reactions and hype moments)."}},"required":["text","tone"]}}},
-                {"type":"function","function":{"name":"gesture","description":"Trigger a pre-defined body gesture animation on the digital human avatar. The gesture plays once and blends back to the idle pose. Can be combined with speak or set_expression for more natural multi-channel communication.","parameters":{"type":"object","properties":{
-                    "type":{"type":"string","description":"The gesture animation to play. Must be one of: 'wave' (friendly hand wave, for greetings and farewells), 'nod' (head nod, to show agreement or acknowledgment), 'shake_head' (head shake, to express disagreement or disbelief), 'bow' (respectful bow, for gratitude or formal greeting), 'point' (index finger pointing forward, to direct attention), 'thumbs_up' (approval gesture, for positive feedback), 'clap' (both hands clapping, for celebration or applause)."}},"required":["type"]}}},
-                {"type":"function","function":{"name":"look_at","description":"Direct the digital human's eye gaze and subtle head orientation toward a specified target. Creates natural eye contact or directional attention. The gaze shift is smoothly interpolated over ~200ms for realistic movement.","parameters":{"type":"object","properties":{
-                    "target":{"type":"string","description":"The gaze target direction. Must be one of: 'camera' (look directly at the audience through the camera lens, creates eye contact with viewers), 'left' (glance to the left side of the screen, e.g. toward a chat panel or co-host), 'right' (glance to the right, e.g. toward a game screen or secondary content), 'up' (look upward, conveys thinking or reacting to something above), 'down' (look downward, conveys reading chat, shyness, or sadness)."}},"required":["target"]}}},
-                {"type":"function","function":{"name":"idle","description":"Return the digital human to its default idle animation loop. Resets any active expression to neutral, stops ongoing gestures, and returns gaze to a soft forward direction with natural idle micro-movements (subtle breathing, occasional blinks, slight sway). Use during pauses or transitions between active segments.","parameters":{"type":"object","properties":{}}}}
-            ],
-            "system": "You are the expression controller for a virtual digital human streamer. You convert director instructions into animation function calls.\n\nRules:\n- One function call per instruction\n- Emotion intensity: 0.0-1.0\n- Speech text should be natural\n- Tone: cheerful/calm/serious/excited",
-            "history": []
-        }
-    },
-    # ── 4. Neon Arena (what HTML actually sends, legacy env style) ──
-    {
-        "name": "Neon Arena — Legacy HTML",
-        "desc": "fire(left) or move(left)",
-        "expected_fn": "move",
-        "request": {
-            "messages": [{"role": "user", "content":
-                "Arena 900x600. FIRE or Call move(dir) or fire(dir). dir:up/down/left/right"
-            }],
-            "tools": [
-                {"type":"function","function":{"name":"move","description":"Move the player's spaceship in the specified direction by one step on the 900x600 arena grid. Use to reposition for better firing angle, dodge incoming bullets, or approach/retreat from enemies.","parameters":{"type":"object","properties":{
-                    "direction":{"type":"string","enum":["up","down","left","right"],"description":"The movement direction on the arena. 'up' decreases Y (toward top edge), 'down' increases Y (toward bottom edge), 'left' decreases X (toward left edge), 'right' increases X (toward right edge). Choose based on tactical positioning relative to the player and arena walls."}},"required":["direction"]}}},
-                {"type":"function","function":{"name":"fire","description":"Fire a bullet from the spaceship in the specified direction. The bullet travels in a straight line until it hits a target or exits the arena boundary. Use when aligned with the player's position on the horizontal or vertical axis for best hit probability.","parameters":{"type":"object","properties":{
-                    "direction":{"type":"string","enum":["up","down","left","right"],"description":"The firing direction of the bullet. 'up' fires toward top edge, 'down' fires toward bottom edge, 'left' fires toward left edge (toward player's side), 'right' fires toward right edge. Choose based on current alignment with the player: fire horizontally when align_h=true, vertically when align_v=true."}},"required":["direction"]}}}
-            ],
-            "environment": ["pos=700,300","player=100,310","dist=600","align_h=true","align_v=false","cd=0","wall=no"],
-            "history": ["fire(left)","move(up)","fire(left)"]
-        }
-    },
-]
-def check_health(url: str) -> dict:
-    r = requests.get(f"{url}/health", timeout=5)
-    return r.json()
-def call_fc(url: str, req: dict) -> dict:
-    t0 = time.perf_counter()
-    r = requests.post(f"{url}/v1/function_call", json=req, timeout=30)
-    wall_ms = (time.perf_counter() - t0) * 1000
-    d = r.json()
-    d["_wall_ms"] = wall_ms
-    return d
-def fmt_heads(heads: dict) -> str:
-    lines = []
-    for k in ["function","arg1","arg2","arg3","arg4","arg5","arg6","content"]:
-        if k in heads:
-            v = heads[k]
-            tag = "NULL" if (not v or v == "<|null|>") else v
-            lines.append(f"    {k:<10} = {tag}")
-    return "\n".join(lines)
-def main():
-    ap = argparse.ArgumentParser(description="Test SimpleTool server")
-    ap.add_argument("--url", default="http://localhost:8899")
-    ap.add_argument("--rounds", type=int, default=3, help="hot rounds per scenario")
-    args = ap.parse_args()
-    url = args.url.rstrip("/")
-    # ── Health ──
-    print(f"\n{'='*65}")
-    print(f"  SimpleTool Server Test")
-    print(f"  Target: {url}")
-    print(f"{'='*65}\n")
-    try:
-        h = check_health(url)
-        print(f"  /health → {json.dumps(h)}")
-        if not h.get("loaded") and h.get("status") != "ok":
-            print("  ⚠ Model not loaded!"); sys.exit(1)
-    except Exception as e:
-        print(f"  ✗ Cannot connect: {e}"); sys.exit(1)
-    version = h.get("version", "unknown")
-    print(f"  Server version: {version}\n")
-    # ── Cold start (first call warms KV cache) ──
-    print(f"{'='*65}")
-    print(f"  COLD START")
-    print(f"{'='*65}")
-    cold_ms = []
-    for sc in SCENARIOS:
-        r = call_fc(url, sc["request"])
-        ms = r.get("latency_ms", r.get("_wall_ms", 0))
-        cold_ms.append(ms)
-        ok = "✓" if r.get("function", "") == sc["expected_fn"] else "✗"
-        print(f"  {ok} {sc['name']:<35} {ms:7.1f}ms  → {r.get('function','?')}({r.get('args',{})})")
-    print()
-    # ── Hot rounds ──
-    print(f"{'='*65}")
-    print(f"  HOT ROUNDS (×{args.rounds})")
-    print(f"{'='*65}")
-    hot_ms = [[] for _ in SCENARIOS]
-    for rd in range(args.rounds):
-        parts = []
-        for i, sc in enumerate(SCENARIOS):
-            r = call_fc(url, sc["request"])
-            ms = r.get("latency_ms", r.get("_wall_ms", 0))
-            hot_ms[i].append(ms)
-            parts.append(f"{ms:6.1f}ms")
-        print(f"  Round {rd+1}: {'  '.join(parts)}")
-    print()
-    # ── Detailed test ──
-    print(f"{'='*65}")
-    print(f"  DETAILED RESULTS")
-    print(f"{'='*65}\n")
-    results = []
-    for i, sc in enumerate(SCENARIOS):
-        r = call_fc(url, sc["request"])
-        fn = r.get("function", "")
-        ok = fn == sc["expected_fn"]
-        results.append((sc, r, ok))
-        status = "PASS ✓" if ok else "FAIL ✗"
-        ms_server = r.get("latency_ms", 0)
-        ms_wall = r.get("_wall_ms", 0)
-        print(f"─── {sc['name']} ───")
-        print(f"  {status}  expected={sc['expected_fn']}  got={fn}")
-        print(f"  args: {json.dumps(r.get('args', {}), ensure_ascii=False)}")
-        print(f"  server={ms_server:.1f}ms  wall={ms_wall:.1f}ms  overhead={ms_wall-ms_server:.1f}ms")
-        if r.get("heads"):
-            print(f"  heads:")
-            print(fmt_heads(r["heads"]))
-        if r.get("error"):
-            print(f"  error: {r['error']}")
-        print()
-    # ── Summary ──
-    n = len(results)
-    passed = sum(1 for _, _, ok in results if ok)
-    avg_cold = sum(cold_ms) / n
-    avg_hot = sum(sum(h) for h in hot_ms) / sum(len(h) for h in hot_ms) if hot_ms else 0
-    avg_detail = sum(r.get("latency_ms", 0) for _, r, _ in results) / n
-    print(f"{'='*65}")
-    print(f"  SUMMARY")
-    print(f"{'='*65}")
-    print(f"  Server version  : {version}")
-    print(f"  Accuracy        : {passed}/{n}")
-    print(f"  Cold start avg  : {avg_cold:.1f}ms")
-    print(f"  Hot avg         : {avg_hot:.1f}ms")
-    print(f"  Detail avg      : {avg_detail:.1f}ms")
-    print()
-    print(f"  {'Scenario':<35} {'Cold':>7} {'Hot':>7} {'Detail':>7} {'Status':>6}")
-    print(f"  {'─'*65}")
-    for i, (sc, r, ok) in enumerate(results):
-        havg = sum(hot_ms[i]) / len(hot_ms[i]) if hot_ms[i] else 0
-        print(f"  {sc['name']:<35} {cold_ms[i]:6.1f}  {havg:6.1f}  {r.get('latency_ms',0):6.1f}  {'✓' if ok else '✗':>5}")
-    print()
-# Run the server test only when executed as a script, not when imported.
-if __name__ == "__main__":
-    main()

.ipynb_checkpoints/README-checkpoint.md DELETED Viewed

@@ -1,308 +0,0 @@
----
-library_name: transformers
-tags:
-- simpletool
-- tool-calling
-- parallel-decoding
-license: apache-2.0
-datasets:
-- your-dataset-name
-language:
-- en
-- zh
-pipeline_tag: text-generation
-arxiv: 2603.00030
----
-<p align="center">
-  <a href="README.md">English</a> | <a href="README_zh.md">中文</a>
-</p>
-<h1 align="center">SimpleTool</h1>
-<p align="center">
-  <b>Parallel Decoding for Real-Time LLM Function Calling</b>
-</p>
-<p align="center">
-  <a href="https://arxiv.org/abs/2603.00030"><img src="https://img.shields.io/badge/arXiv-2603.00030-red"></a>
-  <a href="https://huggingface.co/Cialtion/SimpleTool"><img src="https://img.shields.io/badge/🤗-Models-yellow"></a>
-  <a href="https://www.modelscope.cn/models/cialtion/SimpleTool"><img src="https://img.shields.io/badge/ModelScope-Models-blue"></a>
-  <a href="#demo-videos"><img src="https://img.shields.io/badge/Bilibili-Demo-00A1D6?logo=bilibili&logoColor=white"></a>
-  <a href="#demo-videos"><img src="https://img.shields.io/badge/YouTube-Demo-FF0000?logo=youtube&logoColor=white"></a>
-  <a href="#license"><img src="https://img.shields.io/badge/License-Apache%202.0-green"></a>
-</p>
-<p align="center">
-  A 4B-parameter LLM achieving <b>16 Hz end-to-end real-time function calling</b> — fast enough to drive game AI, robotic arms, and digital humans.
-</p>
----
-SimpleTool enables **real-time LLM function calling** through multi-head parallel decoding. By introducing special tokens that compress redundant structured output (4–6×) and enable independent generation of function name and arguments, we achieve **3–6× end-to-end speedup** while maintaining competitive accuracy across three application domains: **games**, **robotic control**, and **digital human animation**.
-<p align="center">
-  <img src="assets/fig_title_panel_a.png" alt="SimpleTool Overview" width="700">
-</p>
-## How It Works
-Traditional function calling generates tokens sequentially — `function → arg1 → arg2 → ...` — so latency scales linearly with output length. SimpleTool exploits two key observations:
-1. **Token Redundancy**: Structured outputs contain predictable tokens (brackets, parameter names, quotes) that can be compressed into single special tokens.
-2. **Weak Causal Dependencies**: Function arguments are largely independent of each other and can be generated in parallel.
-<p align="center">
-  <img src="assets/overview.png" alt="SimpleTool Architecture" width="600">
-</p>
-By decoding function name and arguments as parallel streams sharing the same prefix KV cache, latency drops from `sum(all_token_times)` to `max(per_head_time)`. The parallel heads utilize idle compute capacity within the memory-bandwidth-bound decode phase, making parallelization nearly free.
-For more details, see our [arXiv paper](https://arxiv.org/abs/2603.00030).
----
-## Quick Start
-### 1. Setup Environment
-```bash
-git clone https://github.com/HaxxorCialtion/SimpleTool.git
-cd SimpleTool
-```
-**Option A — uv (recommended)**
-```bash
-uv venv env_rt -p python3.12
-source env_rt/bin/activate
-uv pip install -r requirements.txt
-```
-**Option B — conda**
-```bash
-conda create -n simpletool python=3.12 -y
-conda activate simpletool
-pip install -r requirements.txt
-```
-**Option C — pip**
-```bash
-python3.12 -m venv env_rt
-source env_rt/bin/activate
-pip install -r requirements.txt
-```
-### 2. Download Model
-The recommended default model is **RT-Qwen3-4B-AWQ-v2** (4B parameters, AWQ W4A16 quantized, v2 prompt format). All scripts default to `./models/RT-Qwen3-4B-AWQ-v2`.
-```bash
-# HuggingFace
-huggingface-cli download Cialtion/SimpleTool \
-  --include "RT-Qwen3-4B-AWQ-v2/*" --local-dir ./models
-# Or ModelScope
-modelscope download --model cialtion/SimpleTool \
-  --include "RT-Qwen3-4B-AWQ-v2/*" --local_dir ./models
-```
-<details>
-<summary><b>All Available Models</b></summary>
-| Model | Params | Latency | HuggingFace | ModelScope |
-|-------|--------|---------|-------------|------------|
-| RT-Qwen2.5-0.5B-AWQ | 0.5B | ~30ms | [🤗](https://huggingface.co/Cialtion/SimpleTool/tree/main/RT-Qwen2.5-0.5B-AWQ) | [Link](https://www.modelscope.cn/models/cialtion/SimpleTool/tree/master/RT-Qwen2.5-0.5B-AWQ) |
-| RT-Qwen2.5-1.5B-AWQ | 1.5B | ~40ms | [🤗](https://huggingface.co/Cialtion/SimpleTool/tree/main/RT-Qwen2.5-1.5B-AWQ) | [Link](https://www.modelscope.cn/models/cialtion/SimpleTool/tree/master/RT-Qwen2.5-1.5B-AWQ) |
-| RT-Qwen2.5-3B-AWQ | 3B | ~50ms | [🤗](https://huggingface.co/Cialtion/SimpleTool/tree/main/RT-Qwen2.5-3B-AWQ) | [Link](https://www.modelscope.cn/models/cialtion/SimpleTool/tree/master/RT-Qwen2.5-3B-AWQ) |
-| **RT-Qwen3-4B-AWQ-v2** | **4B** | **~60ms** | [🤗](https://huggingface.co/Cialtion/SimpleTool/tree/main/RT-Qwen3-4B-AWQ-v2) | [Link](https://www.modelscope.cn/models/cialtion/SimpleTool/tree/master/RT-Qwen3-4B-AWQ-v2) |
-| RT-Qwen3-4B-AWQ | 4B | ~60ms | [🤗](https://huggingface.co/Cialtion/SimpleTool/tree/main/RT-Qwen3-4B-AWQ) | [Link](https://www.modelscope.cn/models/cialtion/SimpleTool/tree/master/RT-Qwen3-4B-AWQ) |
-| RT-Qwen2.5-7B-AWQ | 7B | ~70ms | [🤗](https://huggingface.co/Cialtion/SimpleTool/tree/main/RT-Qwen2.5-7B-AWQ) | [Link](https://www.modelscope.cn/models/cialtion/SimpleTool/tree/master/RT-Qwen2.5-7B-AWQ) |
-| RT-Qwen2.5-14B-AWQ | 14B | ~130ms | [🤗](https://huggingface.co/Cialtion/SimpleTool/tree/main/RT-Qwen2.5-14B-AWQ) | [Link](https://www.modelscope.cn/models/cialtion/SimpleTool/tree/master/RT-Qwen2.5-14B-AWQ) |
-| RT-Qwen3-30B-A3B-AWQ | 30B(A3B) | ~ | [🤗](https://huggingface.co/Cialtion/SimpleTool/tree/main/RT-Qwen3-30B_awq_w4a16) | [Link](https://www.modelscope.cn/models/cialtion/SimpleTool/tree/master/RT-Qwen3-30B_awq_w4a16) |
-> Latency measured on RTX 4090 with vLLM prefix caching. v2 models use an improved prompt format with domain-specific system prompts; v1 models use a generic multi-head instruction header.
-</details>
-### 3. Run Benchmark (No Server Needed)
-`01_benchmark.py` runs multi-head parallel decoding directly via vLLM across three application domains — game AI, robotic arm control, and digital human animation — with cold start / hot prefill / decode bottleneck analysis.
-```bash
-# v2 model (default)
-python 01_benchmark.py --version v2
-# v1 model
-python 01_benchmark.py --version v1 --model ./models/RT-Qwen3-4B-AWQ
-# Auto-detect optimal head count per scenario
-python 01_benchmark.py --n-args auto
-```
-Example output:
-```
-  PARALLEL TEST (v2)
-─── Game — Tower Defense ───
-PASS  use_skill(Amiya)
-  function   use_skill                                     4    OK
-  arg1       Amiya                                         4    FILL
-  arg2       <|null|>                                      3    NULL
-  e2e=24.6ms  max_tok=4
-─── Robotic Arm — Assembly ───
-PASS  move_to(300,150,50,slow)
-  function   move_to                                       4    OK
-  arg1       300                                           5    FILL
-  arg2       150                                           5    FILL
-  arg3       500                                           5    FILL
-  arg4       slow                                          3    FILL
-  e2e=39.9ms  max_tok=5
-─── Digital Human — Streamer ───
-PASS  speak(welcome,cheerful)
-  function   speak                                         4    OK
-  arg1       Welcome!                                      4    FILL
-  arg2       cheerful                                      5    FILL
-  e2e=29.1ms  max_tok=5
-  SUMMARY (v2)
-  Accuracy       : 3/3
-  Cold start avg : 56.1ms
-  Hot prefill avg: 29.3ms
-  E2E avg (hot)  : 31.2ms
-  E2E / max_tok  : 6.7ms/tok (decode bottleneck)
-```
-The script also prints the full prompt structure and reconstructed multi-head output for inspection.
-### 4. Start Server
-`02_server.py` wraps the engine in a FastAPI server with CORS support. HTML game clients connect to it.
-```bash
-python 02_server.py
-```
-Server starts at `http://localhost:8899` with two endpoints:
-| Endpoint | Method | Description |
-|----------|--------|-------------|
-| `/health` | GET | Health check, model version info |
-| `/v1/function_call` | POST | Multi-head parallel function call |
-Edit `MODEL_PATH` and `MODEL_VERSION` at the top of `02_server.py` to switch between v1/v2 models.
-### 5. Test Server
-With the server running, test it from another terminal:
-```bash
-python 03_test_server.py
-```
-This sends the three domain scenarios (game, robotic arm, digital human) plus a legacy-style Neon Arena request to the server API and reports accuracy, cold/hot latency, and per-head output.
-```bash
-# Custom server URL
-python 03_test_server.py --url http://192.168.1.100:8899
-# More hot rounds
-python 03_test_server.py --rounds 10
-```
-### 6. Play Demos
-Open demo HTML files in your browser. They connect to the running SimpleTool server.
-| Demo | Description | File |
-|------|-------------|------|
-| **Pong** | AI vs Human paddle game | `demos/pong_game.html` |
-| **Neon Arena** | Multi-AI battle shooter | `demos/neon_arena/neon_arena.html` |
-For games with extra assets:
-```bash
-cd demos/neon_arena
-python3 -m http.server 8080 --bind 127.0.0.1
-```
-Then open http://127.0.0.1:8080/neon_arena.html and enter your SimpleTool server URL (default: `http://localhost:8899`).
-<p align="center">
-  <video src="https://github.com/user-attachments/assets/436e3b97-e8ab-4d36-9fa0-8f1962da4a38" autoplay loop muted width="400"></video>
-  <video src="https://github.com/user-attachments/assets/f9b127da-b65e-4a06-b48f-836e759a6029" autoplay loop muted width="400"></video>
-</p>
----
-## Project Structure
-```
-SimpleTool/
-├── 01_benchmark.py          # Step 1: Direct parallel decode benchmark
-├── 02_server.py             # Step 2: FastAPI vLLM server
-├── 03_test_server.py        # Step 3: Server API test client
-├── prompts/                 # External prompt & scenario files
-│   ├── v1_system.txt        #   v1 multi-head system prompt
-│   ├── scenarios.json       #   3 domain test scenarios
-│   ├── tools_game.jsonl     #   Tower defense tool definitions
-│   ├── tools_arm.jsonl      #   Robotic arm tool definitions
-│   └── tools_avatar.jsonl   #   Digital human tool definitions
-├── models/                  # Downloaded models go here
-│   └── RT-Qwen3-4B-AWQ-v2/ #   Default model
-├── demos/                   # HTML game clients
-│   ├── pong_game.html
-│   └── neon_arena/
-├── assets/                  # Figures for README
-├── requirements.txt
-├── simpletool-game.skill.md # Guide for building new games with AI
-├── README.md
-└── README_zh.md
-```
-## Build Your Own Game
-Feed **`simpletool-game.skill.md`** along with this **`README.md`** into your AI coding agent (Claude Code, Codex, Antigravity, etc.) — the skill file covers server API spec, tool definition format, query design best practices, frontend templates, and dynamic head optimization tips, while the README helps the agent understand the overall project structure. Together they provide everything needed to vibe-code a SimpleTool-powered game.
----
-## Roadmap
-- [ ] **World Simulation** — Large-scale (1,000+ NPCs) real-time AI world simulation with < 200ms action latency per agent
-- [ ] **Speculative & Multi-Token Decoding** — Speculative decoding and multi-token prediction for further latency reduction
-- [ ] **Native Windows Support** — Windows game engine plugins and native runtime (no need for Docker or WSL)
-- [ ] **Apple Ecosystem** — Mac and iPhone on-device deployment (CoreML / Metal)
-- [ ] **v3 Architecture** — Fast thinking (real-time SimpleTool) + slow thinking (async meta-cognition) fusion
-- [ ] **Embodied Intelligence** — Virtual 3D digital humans, large-scale game engine integration demos
-- [ ] **Open Source Training** — Full training code and dataset release
----
-## Demo Videos
-<p align="center">
-  <a href="#"><img src="https://img.shields.io/badge/Bilibili-Demo-00A1D6?logo=bilibili&logoColor=white"></a>
-  <a href="#"><img src="https://img.shields.io/badge/YouTube-Demo-FF0000?logo=youtube&logoColor=white"></a>
-</p>
-> Video demos coming soon — showcasing real-time game AI, robotic arm control, and digital human animation.
----
-## Citation
-```bibtex
-@article{shi2026simpletool,
-  title={SimpleTool: Parallel Decoding for Real-Time LLM Function Calling},
-  author={Shi, Xiaoxin and Wan, Jiaxin and Dong, Linkang and Jiang, Wei and Liu, Yue and Huang, Zengfeng},
-  journal={arXiv preprint arXiv:2603.00030},
-  year={2026}
-}
-```
-## Contact
-- **Email**: cialtion737410@sjtu.edu.cn / cialtion@outlook.com
-- **QQ Group**: 861244702
-- **Bilibili**: [Cialtion](https://space.bilibili.com/Cialtion)
-## License
-Apache 2.0

.ipynb_checkpoints/README_zh-checkpoint.md DELETED Viewed

@@ -1,308 +0,0 @@
----
-library_name: transformers
-tags:
-- simpletool
-- tool-calling
-- parallel-decoding
-license: apache-2.0
-datasets:
-- your-dataset-name
-language:
-- en
-- zh
-pipeline_tag: text-generation
-arxiv: 2603.00030
----
-<p align="center">
-  <a href="README.md">English</a> | <a href="README_zh.md">中文</a>
-</p>
-<h1 align="center">SimpleTool</h1>
-<p align="center">
-  <b>面向实时 LLM 函数调用的并行解码架构</b>
-</p>
-<p align="center">
-  <a href="https://arxiv.org/abs/2603.00030"><img src="https://img.shields.io/badge/arXiv-2603.00030-red"></a>
-  <a href="https://huggingface.co/Cialtion/SimpleTool"><img src="https://img.shields.io/badge/🤗-Models-yellow"></a>
-  <a href="https://www.modelscope.cn/models/cialtion/SimpleTool"><img src="https://img.shields.io/badge/ModelScope-Models-blue"></a>
-  <a href="#演示视频"><img src="https://img.shields.io/badge/Bilibili-Demo-00A1D6?logo=bilibili&logoColor=white"></a>
-  <a href="#演示视频"><img src="https://img.shields.io/badge/YouTube-Demo-FF0000?logo=youtube&logoColor=white"></a>
-  <a href="#许可证"><img src="https://img.shields.io/badge/License-Apache%202.0-green"></a>
-</p>
-<p align="center">
-  一个 4B 参数的 LLM，实现 <b>16 Hz 端到端实时函数调用</b>——足以驱动游戏 AI、机械臂控制和数字人动画。
-</p>
----
-SimpleTool 通过多头并行解码实现**实时 LLM 函数调用**。我们引入特殊 token 来压缩结构化输出中的冗余信息（4–6 倍压缩），并让函数名与各参数独立并行生成，从而实现**端到端 3–6 倍加速**，同时在三大应用场景——**游戏**、**机械臂控制**和**数字人动画**——中保持具有竞争力的准确率。
-<p align="center">
-  <img src="assets/fig_title_panel_a.png" alt="SimpleTool 概览" width="700">
-</p>
-## 工作原理
-传统函数调用按顺序逐 token 生成——`function → arg1 → arg2 → ...`——延迟随输出长度线性增长。SimpleTool 基于两个关键观察：
-1. **Token 冗余**：结构化输出中存在大量可预测的 token（括号、参数名、引号等），可以压缩为单个特殊 token。
-2. **弱因果依赖**：函数的各个参数之间基本相互独立，可以并行生成。
-<p align="center">
-  <img src="assets/overview.png" alt="SimpleTool 架构" width="600">
-</p>
-将函数名和各参数作为共享同一前缀 KV 缓存的并行流进行解码，延迟从 `sum(所有token耗时)` 降为 `max(单头耗时)`。并行解码头利用了解码阶段显存带宽受限时的闲置算力，使得并行化几乎零开销。
-更多细节请参阅我们的 [arXiv 论文](https://arxiv.org/abs/2603.00030)。
----
-## 快速上手
-### 1. 配置环境
-```bash
-git clone https://github.com/HaxxorCialtion/SimpleTool.git
-cd SimpleTool
-```
-**方案 A — uv（推荐）**
-```bash
-uv venv env_rt -p python3.12
-source env_rt/bin/activate
-uv pip install -r requirements.txt
-```
-**方案 B — conda**
-```bash
-conda create -n simpletool python=3.12 -y
-conda activate simpletool
-pip install -r requirements.txt
-```
-**方案 C — pip**
-```bash
-python3.12 -m venv env_rt
-source env_rt/bin/activate
-pip install -r requirements.txt
-```
-### 2. 下载模型
-默认推荐模型为 **RT-Qwen3-4B-AWQ-v2**（4B 参数，AWQ W4A16 量化，v2 提示格式）。所有脚本默认路径为 `./models/RT-Qwen3-4B-AWQ-v2`。
-```bash
-# HuggingFace
-huggingface-cli download Cialtion/SimpleTool \
-  --include "RT-Qwen3-4B-AWQ-v2/*" --local-dir ./models
-# 或者 ModelScope（国内推荐）
-modelscope download --model cialtion/SimpleTool \
-  --include "RT-Qwen3-4B-AWQ-v2/*" --local_dir ./models
-```
-<details>
-<summary><b>全部可用模型</b></summary>
-| 模型 | 参数量 | 延迟 | HuggingFace | ModelScope |
-|------|--------|------|-------------|------------|
-| RT-Qwen2.5-0.5B-AWQ | 0.5B | ~30ms | [🤗](https://huggingface.co/Cialtion/SimpleTool/tree/main/RT-Qwen2.5-0.5B-AWQ) | [链接](https://www.modelscope.cn/models/cialtion/SimpleTool/tree/master/RT-Qwen2.5-0.5B-AWQ) |
-| RT-Qwen2.5-1.5B-AWQ | 1.5B | ~40ms | [🤗](https://huggingface.co/Cialtion/SimpleTool/tree/main/RT-Qwen2.5-1.5B-AWQ) | [链接](https://www.modelscope.cn/models/cialtion/SimpleTool/tree/master/RT-Qwen2.5-1.5B-AWQ) |
-| RT-Qwen2.5-3B-AWQ | 3B | ~50ms | [🤗](https://huggingface.co/Cialtion/SimpleTool/tree/main/RT-Qwen2.5-3B-AWQ) | [链接](https://www.modelscope.cn/models/cialtion/SimpleTool/tree/master/RT-Qwen2.5-3B-AWQ) |
-| **RT-Qwen3-4B-AWQ-v2** | **4B** | **~60ms** | [🤗](https://huggingface.co/Cialtion/SimpleTool/tree/main/RT-Qwen3-4B-AWQ-v2) | [链接](https://www.modelscope.cn/models/cialtion/SimpleTool/tree/master/RT-Qwen3-4B-AWQ-v2) |
-| RT-Qwen3-4B-AWQ | 4B | ~60ms | [🤗](https://huggingface.co/Cialtion/SimpleTool/tree/main/RT-Qwen3-4B-AWQ) | [链接](https://www.modelscope.cn/models/cialtion/SimpleTool/tree/master/RT-Qwen3-4B-AWQ) |
-| RT-Qwen2.5-7B-AWQ | 7B | ~70ms | [🤗](https://huggingface.co/Cialtion/SimpleTool/tree/main/RT-Qwen2.5-7B-AWQ) | [链接](https://www.modelscope.cn/models/cialtion/SimpleTool/tree/master/RT-Qwen2.5-7B-AWQ) |
-| RT-Qwen2.5-14B-AWQ | 14B | ~130ms | [🤗](https://huggingface.co/Cialtion/SimpleTool/tree/main/RT-Qwen2.5-14B-AWQ) | [链接](https://www.modelscope.cn/models/cialtion/SimpleTool/tree/master/RT-Qwen2.5-14B-AWQ) |
-| RT-Qwen3-30B-A3B-AWQ | 30B(A3B) | ~ | [🤗](https://huggingface.co/Cialtion/SimpleTool/tree/main/RT-Qwen3-30B_awq_w4a16) | [链接](https://www.modelscope.cn/models/cialtion/SimpleTool/tree/master/RT-Qwen3-30B_awq_w4a16) |
-> 延迟数据在 RTX 4090 上使用 vLLM 前缀缓存测得。v2 模型采用改进的提示格式，包含领域专用系统提示；v1 模型使用通用的多头指令头。
-</details>
-### 3. 运行基准测试（无需启动服务）
-`01_benchmark.py` 通过 vLLM 直接运行多头并行解码，覆盖三大应用场景——游戏 AI、机械臂控制和数字人动画——并输出冷启动 / 热预填充 / 解码瓶颈分析。
-```bash
-# v2 模型（默认）
-python 01_benchmark.py --version v2
-# v1 模型
-python 01_benchmark.py --version v1 --model ./models/RT-Qwen3-4B-AWQ
-# 自动检测每个场景的最优头数
-python 01_benchmark.py --n-args auto
-```
-输出示例：
-```
-  PARALLEL TEST (v2)
-─── Game — Tower Defense ───
-PASS  use_skill(Amiya)
-  function   use_skill                                     4    OK
-  arg1       Amiya                                         4    FILL
-  arg2       <|null|>                                      3    NULL
-  e2e=24.6ms  max_tok=4
-─── Robotic Arm — Assembly ───
-PASS  move_to(300,150,50,slow)
-  function   move_to                                       4    OK
-  arg1       300                                           5    FILL
-  arg2       150                                           5    FILL
-  arg3       50                                            5    FILL
-  arg4       slow                                          3    FILL
-  e2e=39.9ms  max_tok=5
-─── Digital Human — Streamer ───
-PASS  speak(welcome,cheerful)
-  function   speak                                         4    OK
-  arg1       Welcome!                                      4    FILL
-  arg2       cheerful                                      5    FILL
-  e2e=29.1ms  max_tok=5
-  SUMMARY (v2)
-  Accuracy       : 3/3
-  Cold start avg : 56.1ms
-  Hot prefill avg: 29.3ms
-  E2E avg (hot)  : 31.2ms
-  E2E / max_tok  : 6.7ms/tok (decode bottleneck)
-```
-脚本还会打印完整的提示结构和重构后的多头输出，便于检查调试。
-### 4. 启动服务
-`02_server.py` 将推理引擎封装为 FastAPI 服务，支持 CORS 跨域。HTML 游戏客户端通过它连接模型。
-```bash
-python 02_server.py
-```
-服务启动于 `http://localhost:8899`，提供以下接口：
-| 接口 | 方法 | 说明 |
-|------|------|------|
-| `/health` | GET | 健康检查，返回模型版本信息 |
-| `/v1/function_call` | POST | 多头并行函数调用 |
-编辑 `02_server.py` 顶部的 `MODEL_PATH` 和 `MODEL_VERSION` 即可切换 v1/v2 模型。
-### 5. 测试服务
-服务运行后，在另一个终端中执行：
-```bash
-python 03_test_server.py
-```
-该脚本向服务端 API 发送三大场景（游戏、机械臂、数字人）的测试请求，报告准确率、冷启动/热启动延迟及各头输出。
-```bash
-# 自定义服务地址
-python 03_test_server.py --url http://192.168.1.100:8899
-# 增加热启动轮数
-python 03_test_server.py --rounds 10
-```
-### 6. 体验 Demo
-在浏览器中打开 Demo HTML 文件，它们会连接到正在运行的 SimpleTool 服务。
-| Demo | 说明 | 文件 |
-|------|------|------|
-| **Pong** | AI 对战人类的弹球游戏 | `demos/pong_game.html` |
-| **Neon Arena** | 多 AI 对战射击游戏 | `demos/neon_arena/neon_arena.html` |
-部分游戏需要额外资源文件：
-```bash
-cd demos/neon_arena
-python3 -m http.server 8080 --bind 127.0.0.1
-```
-然后打开 http://127.0.0.1:8080/neon_arena.html，输入 SimpleTool 服务地址（默认：`http://localhost:8899`）。
-<p align="center">
-  <video src="https://github.com/user-attachments/assets/436e3b97-e8ab-4d36-9fa0-8f1962da4a38" autoplay loop muted width="400"></video>
-  <video src="https://github.com/user-attachments/assets/f9b127da-b65e-4a06-b48f-836e759a6029" autoplay loop muted width="400"></video>
-</p>
----
-## 项目结构
-```
-SimpleTool/
-├── 01_benchmark.py          # 第 1 步：直接并行解码基准测试
-├── 02_server.py             # 第 2 步：FastAPI vLLM 推理服务
-├── 03_test_server.py        # 第 3 步：服务端 API 测试客户端
-├── prompts/                 # 外部提示词与场景文件
-│   ├── v1_system.txt        #   v1 多头系统提示
-│   ├── scenarios.json       #   3 大场景测试用例
-│   ├── tools_game.jsonl     #   塔防游戏工具定义
-│   ├── tools_arm.jsonl      #   机械臂工具定义
-│   └── tools_avatar.jsonl   #   数字人工具定义
-├── models/                  # 模型下载目录
-│   └── RT-Qwen3-4B-AWQ-v2/ #   默认模型
-├── demos/                   # HTML 游戏客户端
-│   ├── pong_game.html
-│   └── neon_arena/
-├── assets/                  # README 配图
-├── requirements.txt
-├── simpletool-game.skill.md # 用 AI 构建新游戏的指南
-├── README.md
-└── README_zh.md
-```
-## 构建你自己的游戏
-将 **`simpletool-game.skill.md`** 和本项目的 **`README.md`** 一起喂给你的 AI 编程智能体（Claude Code、Codex、Antigravity 等）即可开始 vibe coding。Skill 文件涵盖服务端 API 规格、工具定义格式、Query 设计最佳实践、前端模板及动态头数优化技巧；README 则帮助 AI 理解整体项目结构。两者配合，即可上手开发基于 SimpleTool 的游戏。
----
-## 路线图
-- [ ] **世界模拟** — 大规模（1,000+ NPC）实时 AI 异步世界模拟，单智能体行动端到端延迟 < 200ms
-- [ ] **推测解码与多 Token 预测** — 引入推测解码（Speculative Decoding）和多 Token 预测，进一步压缩推理延迟
-- [ ] **Windows 原生支持** — Windows 游戏引擎插件与原生运行（无需 Docker 或 WSL）
-- [ ] **Apple 生态** — Mac 和 iPhone 端侧部署（CoreML / Metal）
-- [ ] **v3 架构** — 快思考（实时 SimpleTool）+ 慢思考（异步元认知）融合
-- [ ] **具身智能** — 虚拟 3D 数字人，大型游戏引擎集成演示
-- [ ] **开源训练** — 完整训练代码与数据集开放
----
-## 演示视频
-<p align="center">
-  <a href="#"><img src="https://img.shields.io/badge/Bilibili-Demo-00A1D6?logo=bilibili&logoColor=white"></a>
-  <a href="#"><img src="https://img.shields.io/badge/YouTube-Demo-FF0000?logo=youtube&logoColor=white"></a>
-</p>
-> 演示视频即将上线——展示实时游戏 AI、机械臂控制和数字人动画效果。
----
-## 引用
-```bibtex
-@article{shi2026simpletool,
-  title={SimpleTool: Parallel Decoding for Real-Time LLM Function Calling},
-  author={Shi, Xiaoxin and Wan, Jiaxin and Dong, Linkang and Jiang, Wei and Liu, Yue and Huang, Zengfeng},
-  journal={arXiv preprint arXiv:2603.00030},
-  year={2026}
-}
-```
-## 联系方式
-- **邮箱**：cialtion737410@sjtu.edu.cn / cialtion@outlook.com
-- **QQ 群**：861244702
-- **Bilibili**：[Cialtion](https://space.bilibili.com/Cialtion)
-## 许可证
-Apache 2.0

.ipynb_checkpoints/rt_server-checkpoint.py DELETED Viewed

@@ -1,336 +0,0 @@
-#!/usr/bin/env python3
-"""
-SimpleTool vLLM Server - Multi-Head Parallel Decoding for Real-Time Function Calling
-Supports both v1 and v2 prompt formats. HTML clients need zero changes.
-"""
-import json
-import time
-import os
-from typing import List, Dict, Any, Optional
-from contextlib import asynccontextmanager
-from fastapi import FastAPI, HTTPException
-from fastapi.middleware.cors import CORSMiddleware
-from pydantic import BaseModel
-import uvicorn
-from vllm import LLM, SamplingParams
-# ==================== Config ====================
-MODEL_PATH = "../../RT-Qwen3-4B-v2"       # v2 model path
-MODEL_VERSION = "v2"                           # "v1" or "v2"
-SERVER_HOST = "0.0.0.0"
-SERVER_PORT = 8899
-MAX_HISTORY = 6
-os.environ.setdefault("CUDA_VISIBLE_DEVICES", "0")
-# ==================== Multi-Head Tags ====================
-HEAD_TAGS = ["<content>", "<function>", "<arg1>", "<arg2>", "<arg3>", "<arg4>", "<arg5>", "<arg6>"]
-STOP_TOKENS = ["<|null|>", "</content>", "</function>", "</arg1>", "</arg2>", "</arg3>", "</arg4>", "</arg5>", "</arg6>", "<|im_end|>"]
-# ── v1: generic head-format instructions in system, domain context in user ──
-V1_SYSTEM_TEMPLATE = """<|im_start|>system
-You are a multi-head parallel function calling model.
-## Output Heads
-**Head 0 - <content>**: Natural language response
-- Format: <content>response text</content>
-**Head 1 - <function>**: Function names to call
-- Format: <function>name</function>
-**Head 2-7 - <arg1>-<arg6>**: Function arguments by position
-- Format: <argN>value</argN>
-- If Unnecessary: <argN><|null|></argN>
-## Available Tools:
-{tools_json}
-<|im_end|>
-"""
-V1_USER_TEMPLATE = "<|im_start|>user\nenvironment: {env}\nhistory: [{hist}]\n\n{query}<|im_end|>\n<|im_start|>assistant\n"
-# ── v2: domain system prompt + tools in system, leaner user turn ──
-V2_SYSTEM_TEMPLATE = """<|im_start|>system
-{system_prompt}
-## Available Tools:
-{tools_json}
-<|im_end|>
-"""
-V2_USER_TEMPLATE = "<|im_start|>user\nhistory: [{hist}]\n\n{query}<|im_end|>\n<|im_start|>assistant\n"
-# Default system prompt when HTML client doesn't send one (backward compat)
-V2_DEFAULT_SYSTEM = "You are a real-time function calling assistant. Convert user commands into function calls using the available tools."
-# ==================== Data Models ====================
-class Message(BaseModel):
-    role: str
-    content: str
-class FCRequest(BaseModel):
-    messages: List[Message]
-    tools: List[Dict[str, Any]]
-    # ── v1 fields (still accepted, used when version=v1) ──
-    environment: Optional[List[str]] = None
-    history: Optional[List[str]] = None
-    # ── v2 optional: domain system prompt ──
-    system: Optional[str] = None
-    # ── shared ──
-    max_tokens: int = 32
-    temperature: float = 0.0
-    include_content_head: bool = False
-class FCResponse(BaseModel):
-    success: bool
-    function: Optional[str] = None
-    args: Dict[str, Any] = {}
-    heads: Dict[str, str] = {}
-    content: Optional[str] = None
-    latency_ms: float = 0
-    error: Optional[str] = None
-# ==================== SimpleTool Engine ====================
-class SimpleToolEngine:
-    def __init__(self, model_path: str, version: str = "v2"):
-        self.model_path = model_path
-        self.version = version
-        self.llm: Optional[LLM] = None
-        self.sampling_params = None
-    def initialize(self):
-        print(f"[SimpleTool] Loading model ({self.version}): {self.model_path}")
-        self.llm = LLM(
-            model=self.model_path,
-            trust_remote_code=True,
-            enable_prefix_caching=True,
-            tensor_parallel_size=1,
-            gpu_memory_utilization=0.8,
-            max_model_len=4096,
-            dtype="auto",
-        )
-        self.sampling_params = SamplingParams(
-            temperature=0.0,
-            max_tokens=32,
-            stop=STOP_TOKENS,
-            include_stop_str_in_output=True
-        )
-        print(f"[SimpleTool] Model loaded! (version={self.version})")
-        self._warmup()
-    def _warmup(self):
-        print("[SimpleTool] Warming up...")
-        dummy_tools = '{"type":"function","function":{"name":"test","parameters":{}}}'
-        if self.version == "v1":
-            prefix = V1_SYSTEM_TEMPLATE.format(tools_json=dummy_tools)
-            prefix += V1_USER_TEMPLATE.format(env="[]", hist="", query="test")
-        else:
-            prefix = V2_SYSTEM_TEMPLATE.format(system_prompt=V2_DEFAULT_SYSTEM, tools_json=dummy_tools)
-            prefix += V2_USER_TEMPLATE.format(hist="", query="test")
-        prompts = [prefix + tag for tag in HEAD_TAGS[:2]]  # function + arg1 enough
-        self.llm.generate(prompts, self.sampling_params)
-        print("[SimpleTool] Warmup complete!")
-    def _build_tools_json(self, tools: List[Dict]) -> str:
-        return "\n".join(json.dumps(t, ensure_ascii=False) for t in tools)
-    def _extract_param_info(self, tools: List[Dict]) -> List[str]:
-        names = []
-        for tool in tools:
-            func = tool.get("function", {})
-            params = func.get("parameters", {}).get("properties", {})
-            for name in params.keys():
-                if name not in names:
-                    names.append(name)
-        return names[:6]
-    def _get_max_args(self, tools: List[Dict]) -> int:
-        max_args = 0
-        for tool in tools:
-            func = tool.get("function", {})
-            params = func.get("parameters", {}).get("properties", {})
-            max_args = max(max_args, len(params))
-        return min(max_args, 6)
-    def _build_prompt(self, request: FCRequest) -> str:
-        """Build the shared prefix according to version."""
-        tools_json = self._build_tools_json(request.tools)
-        # Extract query from messages
-        query = ""
-        for msg in request.messages:
-            if msg.role == "user":
-                query = msg.content
-        hist_list = (request.history or [])[-MAX_HISTORY:]
-        hist_str = ", ".join(hist_list) if hist_list else ""
-        if self.version == "v1":
-            # ── v1: head descriptions + tools in system, env+history+query in user ──
-            env_str = json.dumps(request.environment or [], ensure_ascii=False)
-            system_part = V1_SYSTEM_TEMPLATE.format(tools_json=tools_json)
-            user_part = V1_USER_TEMPLATE.format(env=env_str, hist=hist_str, query=query)
-        else:
-            # ── v2: domain system + tools in system, history+query in user ──
-            # If client sends a system prompt, use it; otherwise use default.
-            # For legacy HTML clients that send environment[], fold it into query.
-            system_prompt = request.system or V2_DEFAULT_SYSTEM
-            system_part = V2_SYSTEM_TEMPLATE.format(
-                system_prompt=system_prompt,
-                tools_json=tools_json
-            )
-            # Backward compat: if environment is provided (old HTML clients),
-            # prepend it to the query so the model still sees context.
-            env_prefix = ""
-            if request.environment:
-                env_prefix = "environment: " + json.dumps(request.environment, ensure_ascii=False) + "\n"
-            user_part = V2_USER_TEMPLATE.format(
-                hist=hist_str,
-                query=env_prefix + query
-            )
-        return system_part + user_part
-    def call(self, request: FCRequest) -> FCResponse:
-        start = time.perf_counter()
-        full_prefix = self._build_prompt(request)
-        # Dynamic head selection based on max args
-        max_args = self._get_max_args(request.tools)
-        active_tags = ["<function>"] + [f"<arg{i}>" for i in range(1, max_args + 1)]
-        if request.include_content_head:
-            active_tags = ["<content>"] + active_tags
-        prompts = [full_prefix + tag for tag in active_tags]
-        outputs = self.llm.generate(prompts, self.sampling_params)
-        latency_ms = (time.perf_counter() - start) * 1000
-        # Parse outputs
-        heads = {}
-        head_names = []
-        if request.include_content_head:
-            head_names.append("content")
-        head_names.append("function")
-        head_names.extend([f"arg{i}" for i in range(1, max_args + 1)])
-        for i, output in enumerate(outputs):
-            text = output.outputs[0].text.strip()
-            for stop in STOP_TOKENS:
-                if text.endswith(stop):
-                    text = text[:-len(stop)].strip()
-                    break
-            heads[head_names[i]] = text
-        func_name = heads.get("function", "").strip()
-        if not func_name or func_name == "<|null|>":
-            return FCResponse(
-                success=False,
-                heads=heads,
-                content=heads.get("content"),
-                latency_ms=latency_ms,
-                error="No function called"
-            )
-        param_names = self._extract_param_info(request.tools)
-        args = {}
-        for i, name in enumerate(param_names):
-            val = heads.get(f"arg{i+1}", "").strip()
-            if val and val != "<|null|>":
-                if val.isdigit():
-                    args[name] = int(val)
-                elif val.lstrip('-').replace('.', '', 1).isdigit():
-                    args[name] = float(val)
-                else:
-                    args[name] = val.lower().strip()
-        return FCResponse(
-            success=True,
-            function=func_name,
-            args=args,
-            heads=heads,
-            content=heads.get("content"),
-            latency_ms=latency_ms
-        )
-# ==================== FastAPI ====================
-engine: Optional[SimpleToolEngine] = None
-@asynccontextmanager
-async def lifespan(app: FastAPI):
-    global engine
-    engine = SimpleToolEngine(MODEL_PATH, version=MODEL_VERSION)
-    engine.initialize()
-    yield
-    print("[Server] Shutdown")
-app = FastAPI(title="SimpleTool Server", version="2.0.0", lifespan=lifespan)
-app.add_middleware(
-    CORSMiddleware,
-    allow_origins=["*"],
-    allow_credentials=True,
-    allow_methods=["*"],
-    allow_headers=["*"],
-)
-@app.get("/health")
-async def health():
-    return {
-        "status": "ok",
-        "loaded": engine is not None and engine.llm is not None,
-        "model": MODEL_PATH,
-        "version": MODEL_VERSION,
-    }
-@app.post("/v1/function_call", response_model=FCResponse)
-async def function_call(request: FCRequest):
-    if engine is None or engine.llm is None:
-        raise HTTPException(503, "Model not loaded")
-    try:
-        return engine.call(request)
-    except Exception as e:
-        import traceback
-        traceback.print_exc()
-        return FCResponse(success=False, error=str(e), latency_ms=0)
-if __name__ == "__main__":
-    print(r"""
-╔════════════════════════════════════════════════════════════════════╗
-║                                                                    ║
-║   ███████╗██╗███╗   ███╗██████╗ ██╗     ███████╗                   ║
-║   ██╔════╝██║████╗ ████║██╔══██╗██║     ██╔════╝                   ║
-║   ███████╗██║██╔████╔██║██████╔╝██║     █████╗                     ║
-║   ╚════██║██║██║╚██╔╝██║██╔═══╝ ██║     ██╔══╝                     ║
-║   ███████║██║██║ ╚═╝ ██║██║     ███████╗███████╗                   ║
-║   ╚══════╝╚═╝╚═╝     ╚═╝╚═╝     ╚══════╝╚══════╝                   ║
-║                                                                    ║
-║          SimpleTool vLLM-Server v2.0                               ║
-║          Multi-Head Parallel Decoding — v1/v2 Compatible           ║
-║                                                                    ║
-║   Run Demos: Open demos/*.html in browser                          ║
-║   Build New: Send simpletool-game-guide.md to AI(Claude Gemini...) ║
-║              for Building new your own HTML games easily           ║
-║   Endpoints:                                                       ║
-║     GET  /health           - Health check (+ version info)         ║
-║     POST /v1/function_call - Function call API (v1 & v2)          ║
-║                                                                    ║
-╚════════════════════════════════════════════════════════════════════╝
-    """)
-    uvicorn.run(app, host=SERVER_HOST, port=SERVER_PORT)

.ipynb_checkpoints/simpletool-game.skill-checkpoint.md DELETED Viewed

@@ -1,318 +0,0 @@
-# SimpleTool Skill — Real-Time AI Application Development
-> **This is a skill file.** Feed it to any AI coding assistant (Claude, Gemini, GPT, Cursor, etc.) as context, then describe the app you want. The AI will generate a working SimpleTool-powered application.
->
-> Example prompt: *"Read the attached SimpleTool skill, then build me a Pong game where AI controls one paddle in real-time."*
----
-## 1. What is SimpleTool?
-SimpleTool is a **multi-head parallel decoding** server for real-time LLM function calling. It runs on vLLM and decodes function name + arguments simultaneously instead of sequentially.
-```
-Traditional:  function → arg1 → arg2 → ...  (sequential, ~200-500ms)
-SimpleTool:   [function, arg1, arg2, ...]    (parallel,   ~25-60ms)
-```
-**Application domains**: game AI, robotic arm control, digital human animation, IoT automation — anything that needs < 100ms LLM decision-making.
-## 2. Server API
-Server default: `http://localhost:8899`
-### Endpoints
-| Method | Path | Description |
-|--------|------|-------------|
-| GET | `/health` | Health check, returns `{status, loaded, model, version}` |
-| POST | `/v1/function_call` | Multi-head parallel function call |
-### Request Format (v2)
-```javascript
-{
-  messages: [{role: 'user', content: 'your query'}],
-  tools: [...],                // OpenAI-format tool definitions
-  system: "domain prompt",     // Domain-specific system prompt (v2)
-  environment: [...],          // Current state info (string array, optional)
-  history: [...],              // Action history (string array, max 6)
-  include_content_head: false  // Whether to generate <content> head
-}
-```
-The `system` field lets you inject a domain-specific system prompt (e.g., "You are a robotic arm controller"). If omitted, the server uses a generic default. The `environment` field is optional context folded into the user message.
-### Response Format
-```javascript
-{
-  success: true,
-  function: "move",
-  args: {direction: "up", speed: "fast"},   // Named args (param names from tool def)
-  heads: {                                   // Raw per-head output
-    function: "move",
-    arg1: "up",
-    arg2: "fast",
-    arg3: "<|null|>"
-  },
-  content: null,       // Only if include_content_head was true
-  latency_ms: 35.2
-}
-```
-## 3. Dynamic Head Count (Critical for Latency!)
-**The server automatically prunes unused heads.** If your tools have at most 2 parameters, only 3 heads are spawned (`<function>`, `<arg1>`, `<arg2>`), not 8. This saves ~40% latency.
-```
-Active heads = [<function>] + [<arg1>...<argN>]
-where N = max parameter count across all tool definitions
-```
-**Design tip**: Keep your tools to 1–3 parameters when possible. Fewer params = fewer heads = lower latency.
-## 4. Tool Definition
-### Constraints
-- Maximum **6 arguments** per function (arg1–arg6)
-- Arguments map to `arg1, arg2, ...` in the order defined in `properties`
-- Server auto-converts types: numeric strings → int/float, otherwise lowercase string
-- Use `enum` to constrain options — this dramatically improves accuracy
-### Template
-```javascript
-const TOOLS = [{
-  type: "function",
-  function: {
-    name: "action_name",
-    description: "Clear, concise — what this action does and when to use it",
-    parameters: {
-      type: "object",
-      properties: {
-        param1: {
-          type: "string",
-          enum: ["opt_a", "opt_b", "opt_c"],  // Constrain! Improves accuracy
-          description: "What this param controls"
-        },
-        param2: {
-          type: "number",
-          description: "Numeric value with unit, e.g. 'Force in Newtons'"
-        }
-      },
-      required: ["param1"]
-    }
-  }
-}];
-```
-### Multi-Tool Example (Game)
-```javascript
-const TOOLS = [
-  {type:"function", function:{name:"move",    description:"Move unit to position", parameters:{type:"object", properties:{unit:{type:"string"}, target:{type:"string", enum:["north","south","east","west"]}}}}},
-  {type:"function", function:{name:"attack",  description:"Attack enemy",          parameters:{type:"object", properties:{unit:{type:"string"}, target:{type:"string"}}}}},
-  {type:"function", function:{name:"retreat",  description:"Pull back unit",        parameters:{type:"object", properties:{unit:{type:"string"}}}}},
-  {type:"function", function:{name:"pass",     description:"Do nothing this turn",  parameters:{type:"object", properties:{}}}}
-];
-// Max params = 2 → only 3 heads spawned
-```
-## 5. Query Design
-### Principles
-1. **Be imperative** — tell the model what to decide, not just describe state
-2. **Include decision context** — "Ball is BELOW paddle, intercept it" not "Ball y=250"
-3. **List valid options** — "Choose: up/down/stay"
-4. **Keep it short** — shorter query = faster prefill
-### Good vs Bad
-```
-✅ "Ball 50px BELOW paddle, approaching fast. Move DOWN to intercept. Choose: up/down/stay"
-❌ "Ball position: 250, Paddle position: 200. What should I do?"
-✅ "Red gear at (300,150,50). Move arm there slowly for pickup."
-❌ "There is a gear somewhere on the table. The arm needs to go to it."
-✅ "Stream starting, viewers saying hello. Greet them warmly."
-❌ "Viewers are in the chat. Do something appropriate."
-```
-### Environment & History
-```javascript
-// Environment: current state as key=value strings
-const env = [
-  `ball_y=${ballY}`,
-  `paddle_y=${paddleY}`,
-  `gap=${gap}`,
-  `approaching=true`
-];
-// History: recent actions (max 6, server trims automatically)
-const history = [
-  "move(up)", "move(up)", "stay()"
-];
-```
-### Domain System Prompts (v2)
-For v2 server, set a domain-specific system prompt:
-```javascript
-// Game AI
-const SYSTEM = "You are the AI controller for a Pong game. Move the paddle to intercept the ball. React quickly.";
-// Robotic arm
-const SYSTEM = "You are the voice controller for a 6-axis robotic arm. Convert commands to precise function calls. Coordinates in mm.";
-// Digital human
-const SYSTEM = "You are the animation controller for a virtual streamer. Convert director instructions to expression and speech calls.";
-```
-## 6. Frontend Code Standards
-### Required: Type-Safe Value Extraction
-```javascript
-// Values in args may be int, not string — always coerce
-function safeStr(v) {
-  if (v === null || v === undefined) return '';
-  return String(v).trim().toLowerCase();
-}
-// Extract with args (named) first, heads (positional) as fallback
-let direction = safeStr(d.args?.direction) || safeStr(d.heads?.arg1);
-```
-### Required: Validate Return Values
-```javascript
-const VALID = ['up', 'down', 'stay'];
-if (!VALID.includes(direction)) {
-  console.warn(`Invalid: "${direction}", fallback to stay`);
-  direction = 'stay';
-}
-```
-### Required: Error Handling with Fallback
-```javascript
-async function callAI() {
-  try {
-    const r = await fetch(SERVER_URL + '/v1/function_call', {
-      method: 'POST',
-      headers: {'Content-Type': 'application/json'},
-      body: JSON.stringify(request)
-    });
-    const data = await r.json();
-    if (!data.success) throw new Error(data.error);
-    applyAction(data);
-  } catch (e) {
-    console.error('[AI] Failed:', e);
-    applyFallbackAI();  // MUST have fallback — never freeze the app
-  }
-}
-```
-### Required: Logging
-```javascript
-console.log(`[Game] Query: ${query}`);
-console.log(`[Game] → ${data.function}(${JSON.stringify(data.args)}) ${data.latency_ms.toFixed(0)}ms`);
-```
-### Recommended: Debug UI Overlay
-Show in a corner of your app: current query, raw response, latency (current + rolling average).
-## 7. Game Loop Pattern
-**Decouple AI from rendering.** The AI loop runs at 10–16 Hz; the render loop runs at 60 fps.
-```javascript
-const AI_INTERVAL = 100;  // 100ms = 10 Hz
-let aiPending = false;
-// Render loop (60fps) — never blocks on AI
-function gameLoop() {
-  update();
-  render();
-  requestAnimationFrame(gameLoop);
-}
-// AI loop (async, non-blocking)
-async function aiLoop() {
-  if (aiPending) return;
-  aiPending = true;
-  await callAI();
-  aiPending = false;
-}
-setInterval(aiLoop, AI_INTERVAL);
-gameLoop();
-```
-## 8. FCClient Template
-Drop-in client class for any HTML/JS application:
-```javascript
-class FCClient {
-  constructor(url = 'http://localhost:8899') {
-    this.url = url.replace(/\/$/, '');
-  }
-  async health() {
-    try {
-      const r = await fetch(`${this.url}/health`, {signal: AbortSignal.timeout(3000)});
-      const d = await r.json();
-      return {ok: d.loaded === true || d.status === 'ok', version: d.version};
-    } catch (e) {
-      return {ok: false};
-    }
-  }
-  async call({query, tools, system, env, history, includeContent = false}) {
-    const t0 = performance.now();
-    try {
-      const r = await fetch(`${this.url}/v1/function_call`, {
-        method: 'POST',
-        headers: {'Content-Type': 'application/json'},
-        body: JSON.stringify({
-          messages: [{role: 'user', content: query}],
-          tools,
-          system,                              // v2: domain system prompt
-          environment: env,
-          history,
-          include_content_head: includeContent
-        })
-      });
-      const d = await r.json();
-      return {...d, wall_ms: performance.now() - t0};
-    } catch (e) {
-      return {success: false, error: e.message, wall_ms: performance.now() - t0};
-    }
-  }
-}
-```
-Usage:
-```javascript
-const ai = new FCClient('http://localhost:8899');
-const result = await ai.call({
-  query: "Ball is BELOW. Move down. Choose: up/down/stay",
-  tools: TOOLS,
-  system: "You are a Pong AI. Move paddle to intercept ball.",
-  env: ["ball_y=300", "paddle_y=200", "gap=100"],
-  history: ["move(down)", "move(down)"]
-});
-if (result.success) {
-  console.log(`${result.function}(${JSON.stringify(result.args)}) in ${result.latency_ms}ms`);
-}
-```
-## 9. Troubleshooting
-| Symptom | Cause | Fix |
-|---------|-------|-----|
-| AI stuck / no movement | Query too vague | Add decision hints: "Move DOWN to intercept" |
-| `.trim is not a function` | `args` values may be int | Use `String(v)` before `.trim()` |
-| High latency (>100ms) | Too many heads / long query | Reduce tool params, shorten query/env |
-| Wrong function called | Ambiguous tool descriptions | Add `enum`, improve `description` fields |
-| `<\|null\|>` in all args | Model confused | Check tool param order matches expectations |
----
-**Skill Version**: 2.0 — Supports v1/v2 server, multi-domain (game, robotics, avatar)
-**Last Updated**: 2026-03

.ipynb_checkpoints/test_server-checkpoint.py DELETED Viewed

@@ -1,250 +0,0 @@
-#!/usr/bin/env python3
-"""
-test_server.py — Hit running rt_server /v1/function_call with 4 scenarios
-Usage:  python test_server.py [--url http://localhost:8899]
-"""
-import argparse, json, time, sys, requests
-# ==================== Test Scenarios ====================
-# Each entry describes one end-to-end test case:
-#   name        — label printed in the report output
-#   desc        — human-readable sketch of the intended call (not referenced by main())
-#   expected_fn — compared for equality with the "function" field of the server reply
-#   request     — JSON body passed unchanged to call_fc(), i.e. POSTed to /v1/function_call
-SCENARIOS = [
-    # ── 1. Game: Tower Defense (from benchmark) ──
-    {
-        "name": "Game — Tower Defense",
-        "desc": "use_skill(Amiya)",
-        "expected_fn": "use_skill",
-        "request": {
-            "messages": [{"role": "user", "content":
-                "Wave 5, BOSS appeared, 8 enemies remaining\n"
-                "Operators: Blaze(north,HP50%,skill ready) Amiya(center,HP90%,skill ready)\n"
-                "Enemy direction: concentrated north\n\n"
-                "Amiya use skill now"
-            }],
-            "tools": [
-                {"type":"function","function":{"name":"move","description":"Move a deployed operator to a new position on the battlefield. Use this when the player wants to reposition a unit to a different lane or strategic point.","parameters":{"type":"object","properties":{
-                    "unit_id":{"type":"string","description":"The name of the operator to move. Must match one of the currently deployed operators shown in the battlefield state. Supports fuzzy matching for ASR input, e.g. 'blaze', 'Blaze', 'BLAZE' all refer to the same operator."},
-                    "target":{"type":"string","description":"The destination position on the battlefield grid. Must be one of: 'north' (top lane), 'south' (bottom lane), 'east' (right/enemy side), 'west' (left/base side), 'center' (middle area). Choose based on the player's spoken direction."}},"required":["unit_id","target"]}}},
-                {"type":"function","function":{"name":"use_skill","description":"Activate the special skill of a deployed operator. Each operator has a unique skill that can be triggered when the skill gauge is ready. The skill effect depends on the operator type (e.g. AoE damage, healing, buff).","parameters":{"type":"object","properties":{
-                    "unit_id":{"type":"string","description":"The name of the operator whose skill should be activated. The operator must be currently deployed on the battlefield and have their skill ready (skill gauge full). Supports fuzzy name matching for ASR input."},
-                    "skill_id":{"type":"string","description":"Optional skill identifier when an operator has multiple skills. If the operator only has one skill or the player did not specify which skill, this can be omitted. Format: 's1', 's2', 's3' for skill slot 1/2/3."}},"required":["unit_id"]}}},
-                {"type":"function","function":{"name":"retreat","description":"Withdraw a single operator from the battlefield back to the reserve bench. The operator's redeployment timer starts after retreat. Use when the player wants to pull back a specific unit to save them or free up a deployment slot.","parameters":{"type":"object","properties":{
-                    "unit_id":{"type":"string","description":"The name of the operator to retreat. Must be currently deployed on the battlefield. After retreat, this operator enters cooldown before they can be redeployed. Supports fuzzy name matching for ASR input."}},"required":["unit_id"]}}},
-                {"type":"function","function":{"name":"set_stance","description":"Change the combat behavior mode of a deployed operator. This affects how the operator selects targets and whether they prioritize attacking or surviving.","parameters":{"type":"object","properties":{
-                    "unit_id":{"type":"string","description":"The name of the operator whose stance should be changed. Must be currently deployed on the battlefield. Supports fuzzy name matching for ASR input."},
-                    "stance":{"type":"string","description":"The behavior mode to set. Must be one of: 'aggressive' (prioritize attacking nearest enemy, maximize DPS), 'defensive' (prioritize blocking and damage reduction, focus on survival), 'hold' (stay in position and only attack enemies in range, do not chase)."}},"required":["unit_id","stance"]}}},
-                {"type":"function","function":{"name":"retreat_all","description":"Emergency retreat of all currently deployed operators from the battlefield at once. Use only when the player explicitly requests a full withdrawal, typically in dire situations. All operators enter redeployment cooldown simultaneously.","parameters":{"type":"object","properties":{}}}},
-                {"type":"function","function":{"name":"pass","description":"Take no action this turn. Use when the player's command has already been fulfilled in history, or when the player explicitly says to wait, skip, or do nothing. Also use when the voice input is ambiguous and no clear command can be extracted.","parameters":{"type":"object","properties":{}}}}
-            ],
-            "system": "You are the voice command interpreter for a real-time tower defense game. The player issues orders by voice. You convert ASR-transcribed commands into function calls.\n\nRules:\n- One function call per command\n- Fuzzy match operator names\n- Positions: north, south, east, west, center\n- If all tasks in history are done, call pass",
-            "history": []
-        }
-    },
-    # ── 2. Robotic Arm — Assembly (from benchmark) ──
-    {
-        "name": "Robotic Arm — Assembly",
-        "desc": "move_to(300,150,50,slow)",
-        "expected_fn": "move_to",
-        "request": {
-            "messages": [{"role": "user", "content":
-                "Arm at home (0,0,500), gripper open\n"
-                "Workpiece: red gear at (300,150,50), target tray at (600,0,80)\n\n"
-                "Move to the red gear position slowly"
-            }],
-            "tools": [
-                {"type":"function","function":{"name":"move_to","description":"Move the robotic arm end-effector (tool center point) to a specified 3D coordinate in the workspace. The arm plans a collision-free path from its current position to the target. Optionally control movement speed for precision tasks.","parameters":{"type":"object","properties":{
-                    "x":{"type":"number","description":"Target X coordinate in millimeters, relative to the robot base frame origin. Positive X points forward (away from the robot base). Valid range depends on arm reach, typically -800 to 800 mm."},
-                    "y":{"type":"number","description":"Target Y coordinate in millimeters, relative to the robot base frame origin. Positive Y points to the left when facing the robot. Valid range depends on arm reach, typically -800 to 800 mm."},
-                    "z":{"type":"number","description":"Target Z coordinate in millimeters, relative to the robot base frame origin (table surface = 0). Positive Z points upward. Must be >= 0 to avoid collision with the work surface. Typical range: 0 to 500 mm."},
-                    "speed":{"type":"string","description":"Movement speed profile for the path. 'slow' (25% max velocity) for precision placement and delicate parts, 'normal' (50% max velocity) for standard pick-and-place, 'fast' (100% max velocity) for rapid repositioning when precision is not critical. Default: 'normal'."}},"required":["x","y","z"]}}},
-                {"type":"function","function":{"name":"grip","description":"Close the gripper jaws to grasp an object at the current end-effector position. The gripper applies the specified force and holds it. Must be called after positioning the arm above/around the target object.","parameters":{"type":"object","properties":{
-                    "force":{"type":"number","description":"Gripping force in Newtons applied by the gripper jaws. Choose based on object fragility: 10N for light/fragile items (electronics, thin plastic), 50N for medium items (standard gears, metal parts), 100N for heavy/robust items (large castings, steel blocks). Excessive force may damage delicate workpieces."}},"required":["force"]}}},
-                {"type":"function","function":{"name":"release","description":"Open the gripper jaws to release the currently held object. The gripper fully opens to its maximum width. Should be called after positioning the arm at the target placement location. Ensure the object is at a safe height above the surface before releasing.","parameters":{"type":"object","properties":{}}}},
-                {"type":"function","function":{"name":"rotate","description":"Rotate the end-effector around a specified axis without changing its position. Used to orient the gripper or tool for proper approach angle before grasping, or to rotate a held workpiece for assembly alignment.","parameters":{"type":"object","properties":{
-                    "axis":{"type":"string","description":"The rotation axis in the end-effector frame. 'roll' rotates around the approach direction (Z-axis of tool frame, like turning a screwdriver), 'pitch' tilts the end-effector up/down (like nodding), 'yaw' swings the end-effector left/right (like shaking head). Choose based on the desired orientation change."},
-                    "angle":{"type":"number","description":"Rotation angle in degrees. Positive values follow the right-hand rule around the specified axis. Typical range: -180 to 180 degrees. Small angles (< 15°) for fine adjustment, larger angles for major reorientation."}},"required":["axis","angle"]}}},
-                {"type":"function","function":{"name":"home","description":"Return the robotic arm to its predefined home position (0, 0, 500) with the gripper pointing straight down and jaws open. Use as a safe starting/ending position for task sequences, or to clear the workspace. The arm takes a collision-free path at normal speed.","parameters":{"type":"object","properties":{}}}}
-            ],
-            "system": "You are the voice controller for an industrial 6-axis robotic arm. You convert spoken commands into function calls.\n\nRules:\n- One function call per command\n- Coordinates in mm, angles in degrees\n- Gripper force: light=10N, medium=50N, heavy=100N\n- Speed: slow/normal/fast",
-            "history": []
-        }
-    },
-    # ── 3. Digital Human — Streamer (from benchmark) ──
-    {
-        "name": "Digital Human — Streamer",
-        "desc": "speak(welcome,cheerful)",
-        "expected_fn": "speak",
-        "request": {
-            "messages": [{"role": "user", "content":
-                "Stream just started, viewers flooding in\n"
-                "Chat: \"Hello streamer!\" \"Good evening!\"\n"
-                "Director: greet the audience warmly, say welcome and look at camera"
-            }],
-            "tools": [
-                {"type":"function","function":{"name":"set_expression","description":"Set the facial expression of the digital human avatar. Controls the blend shapes for eyes, eyebrows, and mouth to display the target emotion. The expression persists until changed by another set_expression call or overridden by a speak animation.","parameters":{"type":"object","properties":{
-                    "emotion":{"type":"string","description":"The target facial expression to display. Must be one of: 'happy' (smile, raised cheeks), 'sad' (downturned mouth, drooping eyebrows), 'surprised' (wide eyes, raised eyebrows, open mouth), 'angry' (furrowed brows, tight lips), 'neutral' (relaxed default face), 'thinking' (slightly furrowed brow, eyes looking up/away, subtle lip purse)."},
-                    "intensity":{"type":"number","description":"The strength of the facial expression blend, from 0.0 (barely visible, subtle hint) to 1.0 (maximum exaggeration, full expression). Recommended: 0.3-0.5 for natural conversation, 0.6-0.8 for reactive moments, 0.9-1.0 for comedic or dramatic emphasis."}},"required":["emotion","intensity"]}}},
-                {"type":"function","function":{"name":"speak","description":"Make the digital human speak the given text with lip-sync animation and appropriate facial expressions. The TTS engine converts text to audio while the avatar performs real-time viseme-based lip synchronization. The tone parameter affects both voice prosody and accompanying facial micro-expressions.","parameters":{"type":"object","properties":{
-                    "text":{"type":"string","description":"The speech content for the digital human to say aloud. Should be natural conversational language appropriate for a live stream context. Keep sentences concise (under 50 characters preferred for real-time responsiveness). May include casual expressions, emoji descriptions, or audience interaction phrases."},
-                    "tone":{"type":"string","description":"The vocal tone and emotional coloring of the speech delivery. Must be one of: 'cheerful' (upbeat, warm, higher pitch, for greetings and positive moments), 'calm' (steady, soothing, moderate pace, for explanations and transitions), 'serious' (lower pitch, measured pace, for important announcements), 'excited' (high energy, faster pace, emphasis peaks, for reactions and hype moments)."}},"required":["text","tone"]}}},
-                {"type":"function","function":{"name":"gesture","description":"Trigger a pre-defined body gesture animation on the digital human avatar. The gesture plays once and blends back to the idle pose. Can be combined with speak or set_expression for more natural multi-channel communication.","parameters":{"type":"object","properties":{
-                    "type":{"type":"string","description":"The gesture animation to play. Must be one of: 'wave' (friendly hand wave, for greetings and farewells), 'nod' (head nod, to show agreement or acknowledgment), 'shake_head' (head shake, to express disagreement or disbelief), 'bow' (respectful bow, for gratitude or formal greeting), 'point' (index finger pointing forward, to direct attention), 'thumbs_up' (approval gesture, for positive feedback), 'clap' (both hands clapping, for celebration or applause)."}},"required":["type"]}}},
-                {"type":"function","function":{"name":"look_at","description":"Direct the digital human's eye gaze and subtle head orientation toward a specified target. Creates natural eye contact or directional attention. The gaze shift is smoothly interpolated over ~200ms for realistic movement.","parameters":{"type":"object","properties":{
-                    "target":{"type":"string","description":"The gaze target direction. Must be one of: 'camera' (look directly at the audience through the camera lens, creates eye contact with viewers), 'left' (glance to the left side of the screen, e.g. toward a chat panel or co-host), 'right' (glance to the right, e.g. toward a game screen or secondary content), 'up' (look upward, conveys thinking or reacting to something above), 'down' (look downward, conveys reading chat, shyness, or sadness)."}},"required":["target"]}}},
-                {"type":"function","function":{"name":"idle","description":"Return the digital human to its default idle animation loop. Resets any active expression to neutral, stops ongoing gestures, and returns gaze to a soft forward direction with natural idle micro-movements (subtle breathing, occasional blinks, slight sway). Use during pauses or transitions between active segments.","parameters":{"type":"object","properties":{}}}}
-            ],
-            "system": "You are the expression controller for a virtual digital human streamer. You convert director instructions into animation function calls.\n\nRules:\n- One function call per instruction\n- Emotion intensity: 0.0-1.0\n- Speech text should be natural\n- Tone: cheerful/calm/serious/excited",
-            "history": []
-        }
-    },
-    # ── 4. Neon Arena (what HTML actually sends, legacy env style) ──
-    {
-        "name": "Neon Arena — Legacy HTML",
-        "desc": "fire(left) or move(left)",
-        # NOTE(review): desc also mentions move(left), but main() compares the
-        # reply with expected_fn by equality, so only "fire" counts as a pass.
-        "expected_fn": "fire",
-        "request": {
-            "messages": [{"role": "user", "content":
-                "Arena 900x600. FIRE left! Aligned horizontally. Call move(dir) or fire(dir). dir:up/down/left/right"
-            }],
-            "tools": [
-                {"type":"function","function":{"name":"move","description":"Move the player's spaceship in the specified direction by one step on the 900x600 arena grid. Use to reposition for better firing angle, dodge incoming bullets, or approach/retreat from enemies.","parameters":{"type":"object","properties":{
-                    "direction":{"type":"string","enum":["up","down","left","right"],"description":"The movement direction on the arena. 'up' decreases Y (toward top edge), 'down' increases Y (toward bottom edge), 'left' decreases X (toward left edge), 'right' increases X (toward right edge). Choose based on tactical positioning relative to the player and arena walls."}},"required":["direction"]}}},
-                {"type":"function","function":{"name":"fire","description":"Fire a bullet from the spaceship in the specified direction. The bullet travels in a straight line until it hits a target or exits the arena boundary. Use when aligned with the player's position on the horizontal or vertical axis for best hit probability.","parameters":{"type":"object","properties":{
-                    "direction":{"type":"string","enum":["up","down","left","right"],"description":"The firing direction of the bullet. 'up' fires toward top edge, 'down' fires toward bottom edge, 'left' fires toward left edge (toward player's side), 'right' fires toward right edge. Choose based on current alignment with the player: fire horizontally when align_h=true, vertically when align_v=true."}},"required":["direction"]}}}
-            ],
-            # Only this scenario sends "environment" and a non-empty "history"; it has no "system" key.
-            "environment": ["pos=700,300","player=100,310","dist=600","align_h=true","align_v=false","cd=0","wall=no"],
-            "history": ["fire(left)","move(up)","fire(left)"]
-        }
-    },
-]
-def check_health(url: str) -> dict:
-    """GET ``{url}/health`` with a 5 second timeout and return the decoded JSON body."""
-    response = requests.get(f"{url}/health", timeout=5)
-    payload = response.json()
-    return payload
-def call_fc(url: str, req: dict) -> dict:
-    """POST ``req`` as JSON to ``{url}/v1/function_call`` and return the reply.
-
-    The returned dict always carries ``_wall_ms``: the client-side duration of
-    the POST in milliseconds (30 second timeout).
-
-    Failures are reported instead of raised, so one bad call does not abort
-    the whole test run: a transport error, a non-JSON body, or a JSON body
-    that is not an object yields ``{"success": False, "error": <message>}``
-    (plus ``_wall_ms``).
-    """
-    t0 = time.perf_counter()
-    try:
-        r = requests.post(f"{url}/v1/function_call", json=req, timeout=30)
-    except requests.RequestException as e:
-        wall_ms = (time.perf_counter() - t0) * 1000
-        return {"success": False, "error": str(e), "_wall_ms": wall_ms}
-    wall_ms = (time.perf_counter() - t0) * 1000
-    try:
-        d = r.json()
-    except ValueError as e:
-        # Every JSON decode error raised by Response.json() is a ValueError subclass.
-        d = {"success": False, "error": f"HTTP {r.status_code}: non-JSON response ({e})"}
-    if not isinstance(d, dict):
-        # Callers use .get() on the result, so a list/str payload must be wrapped.
-        d = {"success": False, "error": f"unexpected JSON payload: {d!r}"}
-    d["_wall_ms"] = wall_ms
-    return d
-def fmt_heads(heads: dict) -> str:
-    """Render the decode heads present in ``heads`` as indented ``key = value`` lines.
-
-    Keys are emitted in the fixed order function, arg1..arg6, content; keys
-    missing from ``heads`` are skipped and any other keys are ignored. A falsy
-    value or the literal ``<|null|>`` token is shown as ``NULL``.
-    """
-    head_order = ("function", "arg1", "arg2", "arg3", "arg4", "arg5", "arg6", "content")
-
-    def render(value):
-        if value and value != "<|null|>":
-            return value
-        return "NULL"
-
-    return "\n".join(
-        f"    {key:<10} = {render(heads[key])}"
-        for key in head_order
-        if key in heads
-    )
-def main():
-    """Exercise a running SimpleTool server with every entry in SCENARIOS.
-
-    Command-line flags:
-        --url     base URL of the server (default http://localhost:8899)
-        --rounds  number of hot passes over all scenarios (default 3)
-
-    Prints the health reply, cold-start timings, per-round hot timings, a
-    detailed result per scenario and a summary table. Calls sys.exit(1) when
-    /health cannot be fetched, or when its reply has neither a truthy
-    "loaded" field nor status "ok".
-    """
-    ap = argparse.ArgumentParser(description="Test SimpleTool server")
-    ap.add_argument("--url", default="http://localhost:8899")
-    ap.add_argument("--rounds", type=int, default=3, help="hot rounds per scenario")
-    args = ap.parse_args()
-    url = args.url.rstrip("/")
-    # ── Health ──
-    print(f"\n{'='*65}")
-    print(f"  SimpleTool Server Test")
-    print(f"  Target: {url}")
-    print(f"{'='*65}\n")
-    try:
-        h = check_health(url)
-        print(f"  /health → {json.dumps(h)}")
-        if not h.get("loaded") and h.get("status") != "ok":
-            print("  ⚠ Model not loaded!"); sys.exit(1)
-    except Exception as e:
-        print(f"  ✗ Cannot connect: {e}"); sys.exit(1)
-    version = h.get("version", "unknown")
-    print(f"  Server version: {version}\n")
-    # ── Cold start (first call warms KV cache) ──
-    print(f"{'='*65}")
-    print(f"  COLD START")
-    print(f"{'='*65}")
-    cold_ms = []
-    for sc in SCENARIOS:
-        r = call_fc(url, sc["request"])
-        # Prefer the server-reported latency; fall back to the client wall time.
-        ms = r.get("latency_ms", r.get("_wall_ms", 0))
-        cold_ms.append(ms)
-        ok = "✓" if r.get("function", "") == sc["expected_fn"] else "✗"
-        print(f"  {ok} {sc['name']:<35} {ms:7.1f}ms  → {r.get('function','?')}({r.get('args',{})})")
-    print()
-    # ── Hot rounds ──
-    print(f"{'='*65}")
-    print(f"  HOT ROUNDS (×{args.rounds})")
-    print(f"{'='*65}")
-    hot_ms = [[] for _ in SCENARIOS]
-    for rd in range(args.rounds):
-        parts = []
-        for i, sc in enumerate(SCENARIOS):
-            r = call_fc(url, sc["request"])
-            ms = r.get("latency_ms", r.get("_wall_ms", 0))
-            hot_ms[i].append(ms)
-            parts.append(f"{ms:6.1f}ms")
-        print(f"  Round {rd+1}: {'  '.join(parts)}")
-    print()
-    # ── Detailed test ──
-    print(f"{'='*65}")
-    print(f"  DETAILED RESULTS")
-    print(f"{'='*65}\n")
-    results = []
-    for sc in SCENARIOS:
-        r = call_fc(url, sc["request"])
-        fn = r.get("function", "")
-        ok = fn == sc["expected_fn"]
-        results.append((sc, r, ok))
-        status = "PASS ✓" if ok else "FAIL ✗"
-        ms_server = r.get("latency_ms", 0)
-        ms_wall = r.get("_wall_ms", 0)
-        print(f"─── {sc['name']} ───")
-        print(f"  {status}  expected={sc['expected_fn']}  got={fn}")
-        print(f"  args: {json.dumps(r.get('args', {}), ensure_ascii=False)}")
-        print(f"  server={ms_server:.1f}ms  wall={ms_wall:.1f}ms  overhead={ms_wall-ms_server:.1f}ms")
-        if r.get("heads"):
-            print(f"  heads:")
-            print(fmt_heads(r["heads"]))
-        if r.get("error"):
-            print(f"  error: {r['error']}")
-        print()
-    # ── Summary ──
-    n = len(results)
-    passed = sum(1 for _, _, ok in results if ok)
-    avg_cold = sum(cold_ms) / n if n else 0
-    # Flatten the per-scenario samples before averaging: hot_ms is a non-empty
-    # list of lists, so with --rounds 0 it is truthy yet holds no samples and
-    # dividing by the sample count would raise ZeroDivisionError.
-    hot_samples = [ms for per_scenario in hot_ms for ms in per_scenario]
-    avg_hot = sum(hot_samples) / len(hot_samples) if hot_samples else 0
-    avg_detail = sum(r.get("latency_ms", 0) for _, r, _ in results) / n if n else 0
-    print(f"{'='*65}")
-    print(f"  SUMMARY")
-    print(f"{'='*65}")
-    print(f"  Server version  : {version}")
-    print(f"  Accuracy        : {passed}/{n}")
-    print(f"  Cold start avg  : {avg_cold:.1f}ms")
-    print(f"  Hot avg         : {avg_hot:.1f}ms")
-    print(f"  Detail avg      : {avg_detail:.1f}ms")
-    print()
-    print(f"  {'Scenario':<35} {'Cold':>7} {'Hot':>7} {'Detail':>7} {'Status':>6}")
-    print(f"  {'─'*65}")
-    for i, (sc, r, ok) in enumerate(results):
-        havg = sum(hot_ms[i]) / len(hot_ms[i]) if hot_ms[i] else 0
-        print(f"  {sc['name']:<35} {cold_ms[i]:6.1f}  {havg:6.1f}  {r.get('latency_ms',0):6.1f}  {'✓' if ok else '✗':>5}")
-    print()
-if __name__ == "__main__":
-    main()