Ksjsjjdj committed on
Commit
aed88a2
·
verified ·
1 Parent(s): 526e758

Upload 42 files

Browse files
README.md CHANGED
@@ -71,6 +71,11 @@ Advanced features:
71
  }
72
  ```
73
 
 
 
 
 
 
74
  Example: POST with `web_search` and reasoning enabled
75
 
76
  ```json
@@ -85,5 +90,25 @@ Example: POST with `web_search` and reasoning enabled
85
 
86
  The server will perform a web search for the prompt, aggregate the top 3 results, and inject those into the prompt, then run the model with reasoning enabled — all using the same model instead of an external reasoning or search model.
87
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88
  Streaming behavior:
89
  - The API streams responses token-by-token by default (`stream: true`) and persists a `state_name` for the generation if requested (or will generate one). Provide `state_name` to resume continuation from where the previous stream stopped. The server stores model state in memory under `(model, state_name)` so subsequent requests with the same `state_name` can continue generation from that exact point.
 
71
  }
72
  ```
73
 
74
+ API endpoints and model listing:
75
+ - `GET /api/v1/models` — returns a JSON list of configured models, sampler defaults, and ALLOW_* flags. This lets clients build per-model UI toggles (web search, tools, reasoning) based on server-provided capabilities.
76
+
77
+ Examples:
78
+ - `curl http://127.0.0.1:7860/api/v1/models` will show configured models and their sampler defaults.
79
  Example: POST with `web_search` and reasoning enabled
80
 
81
  ```json
 
90
 
91
  The server will perform a web search for the prompt, aggregate the top 3 results, and inject those into the prompt, then run the model with reasoning enabled — all using the same model instead of an external reasoning or search model.
92
 
93
+ Universal tool and model-initiated tool calls:
94
+ - The `universal` tool returns a structured JSON/dict with the following fields: `action` (calc/web_search), `result` (string), and `metadata` (dict with `confidence`, query/expression, etc.).
95
+ - Example `universal` result:
96
+
97
+ ```json
98
+ {
99
+ "action": "calc",
100
+ "result": "14",
101
+ "metadata": {"expression": "2+3*4", "confidence": 0.98}
102
+ }
103
+ ```
104
+
105
+ - The model can also request tools mid-generation by emitting a sentinel tag, e.g.:
106
+
107
+ ```
108
+ <tool-call>{"name":"calc","args":{"expression":"40+2"}}</tool-call>
109
+ ```
110
+
111
+ When the model emits such a sentinel, the server will execute the requested tool, inject the results into the prompt, and continue streaming output. The server will also emit a metadata-only streaming chunk so the client is aware a tool was executed mid-stream.
112
+
113
  Streaming behavior:
114
  - The API streams responses token-by-token by default (`stream: true`) and persists a `state_name` for the generation if requested (or will generate one). Provide `state_name` to resume continuation from where the previous stream stopped. The server stores model state in memory under `(model, state_name)` so subsequent requests with the same `state_name` can continue generation from that exact point.
api_types.py CHANGED
@@ -36,6 +36,33 @@ class ChatCompletionMessage(BaseModel):
36
  )
37
 
38
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  class PromptTokensDetails(BaseModel):
40
  cached_tokens: int
41
 
@@ -80,3 +107,17 @@ class ChatCompletionChunk(BaseModel):
80
  model: str
81
  choices: List[ChatCompletionChoice]
82
  usage: Optional[Usage]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
  )
37
 
38
 
39
+ class SamplerConfig(BaseModel):
40
+ """Sampler configuration used in API requests and model defaults.
41
+
42
+ This mirrors the server-side `SamplerConfig` and exposes an optional
43
+ `ALLOW_*` set of fields that can be used to override the model/global
44
+ allow flags per-request (when present)."""
45
+
46
+ max_tokens: Optional[int] = Field(512)
47
+ temperature: Optional[float] = Field(1.0)
48
+ top_p: Optional[float] = Field(0.3)
49
+ presence_penalty: Optional[float] = Field(0.5)
50
+ count_penalty: Optional[float] = Field(0.5)
51
+ penalty_decay: Optional[float] = Field(0.996)
52
+ stop: Optional[List[str]] = Field(default_factory=lambda: ["\n\n"])
53
+ stop_tokens: Optional[List[int]] = Field(default_factory=lambda: [0])
54
+ ALLOW_WEB_SEARCH: Optional[bool] = Field(None)
55
+ ALLOW_TOOLS: Optional[bool] = Field(None)
56
+ ALLOW_REASONING: Optional[bool] = Field(None)
57
+ ALLOW_FILE_TOOL: Optional[bool] = Field(None, description="Per-sampler override for allowing file tools (uploads/file_read).")
58
+ # UI flags so a client can show the controls for toggles
59
+ SHOW_WEB_SEARCH_BUTTON: Optional[bool] = Field(None, description="Whether the UI should show a web-search toggle for this sampler")
60
+ SHOW_FILE_UPLOAD_BUTTON: Optional[bool] = Field(None, description="Whether the UI should show a file upload control for this sampler")
61
+ SHOW_REASONING_TOGGLE: Optional[bool] = Field(None, description="Whether the UI should show a reasoning toggle for this sampler")
62
+ # UI style hints. e.g. 'whatsapp' style, compact, or 'expanded'
63
+ UI_STYLE: Optional[str] = Field(None, description="UI style hint that clients may use to render controls (example: 'whatsapp' or 'compact')")
64
+
65
+
66
  class PromptTokensDetails(BaseModel):
67
  cached_tokens: int
68
 
 
107
  model: str
108
  choices: List[ChatCompletionChoice]
109
  usage: Optional[Usage]
110
+
111
+
112
+ class UploadedFile(BaseModel):
113
+ file_id: str
114
+ filename: str
115
+ size: int
116
+ mime_type: Optional[str] = None
117
+ path: Optional[str] = None
118
+ uploaded_at: Optional[int] = None
119
+
120
+
121
+ class FileUploadResponse(BaseModel):
122
+ success: bool = True
123
+ file: UploadedFile
app.py CHANGED
@@ -15,6 +15,8 @@ from utils import (
15
  remove_nested_think_tags_stack,
16
  format_bytes,
17
  log,
 
 
18
  )
19
 
20
  import copy, types, gc, sys, re, time, collections, asyncio
@@ -78,7 +80,7 @@ os.environ["RWKV_CUDA_ON"] = (
78
  from rwkv.model import RWKV
79
  from rwkv.utils import PIPELINE, PIPELINE_ARGS
80
 
81
- from fastapi import FastAPI, HTTPException
82
  from starlette.background import BackgroundTask
83
  from fastapi.responses import StreamingResponse
84
  from fastapi.middleware.cors import CORSMiddleware
@@ -94,6 +96,9 @@ from api_types import (
94
  PromptTokensDetails,
95
  ChatCompletionChoice,
96
  ChatCompletionMessage,
 
 
 
97
  )
98
 
99
 
@@ -109,25 +114,103 @@ DEFALUT_MODEL_NAME = None
109
  DEFAULT_REASONING_MODEL_NAME = None
110
 
111
  # In-memory model state store to support streaming continuation/resume per state_name.
112
- # Keys: (model_name, state_name) -> model_state object
113
  STATE_STORE: Dict[tuple, Any] = {}
114
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
  logger.info(f"STRATEGY - {CONFIG.STRATEGY}")
116
 
117
  logGPUState()
118
 
119
- # Enforce single 0.1b model. If multiple models are present, select only the one
120
- # that matches '0.1b' literally in the service name, to obey policy of single model.
121
- filtered_models = [m for m in CONFIG.MODELS if '0.1b' in m.SERVICE_NAME]
122
- if len(filtered_models) == 0:
123
- # If no explicit 0.1b model detected, fall back to the first provided model but warn.
124
- logger.warning("No '0.1b' model detected in config; using the first available model. To ensure single 0.1b use, include a model name with '0.1b'.")
125
- CONFIG.MODELS = [CONFIG.MODELS[0]]
126
- elif len(filtered_models) > 1:
127
- logger.warning("Multiple '0.1b' models detected; selecting the first one as the single model.")
128
- CONFIG.MODELS = [filtered_models[0]]
129
- else:
130
- CONFIG.MODELS = [filtered_models[0]]
131
 
132
  for model_config in CONFIG.MODELS:
133
  logger.info(f"Load Model - {model_config.SERVICE_NAME}")
@@ -200,14 +283,35 @@ class ChatCompletionRequest(BaseModel):
200
  presence_penalty: Optional[float] = Field(default=None)
201
  count_penalty: Optional[float] = Field(default=None)
202
  penalty_decay: Optional[float] = Field(default=None)
203
- stream: Optional[bool] = Field(default=True, description="Whether to stream token-by-token responses by default")
204
  state_name: Optional[str] = Field(default=None)
205
  include_usage: Optional[bool] = Field(default=False)
206
  stop: Optional[list[str]] = Field(["\n\n"])
207
  stop_tokens: Optional[list[int]] = Field([0])
208
  web_search: Optional[bool] = Field(default=False, description="Whether to perform a web search and append results to the prompt")
 
 
 
 
 
 
 
 
209
  search_top_k: Optional[int] = Field(default=3, description="Number of web search results to retrieve")
210
  tools: Optional[List[Dict[str, Any]]] = Field(default=None, description="List of tools to execute server-side (e.g., {'name':'web_search','args':{'query':'x'}})")
 
 
 
 
 
 
 
 
 
 
 
 
 
211
 
212
  @model_validator(mode="before")
213
  @classmethod
@@ -237,6 +341,26 @@ app.add_middleware(
237
  app.add_middleware(GZipMiddleware, minimum_size=1000, compresslevel=5)
238
 
239
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
240
  async def runPrefill(
241
  request: ChatCompletionRequest, ctx: str, model_tokens: List[int], model_state
242
  ):
@@ -363,12 +487,191 @@ async def chatResponse(
363
  ) -> ChatCompletion:
364
  createTimestamp = time.time()
365
 
366
- prompt = (
367
- f"{cleanMessages(request.messages or [])}\n\nAssistant:{' <think' if enableReasoning else ''}"
368
- if request.prompt == None
369
- else request.prompt.strip()
 
 
 
 
 
 
 
 
 
 
 
 
370
  )
371
- # Process tools and web_search (tools executed server-side and results injected to prompt)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
372
  if request.tools:
373
  try:
374
  for tool in request.tools:
@@ -381,27 +684,73 @@ async def chatResponse(
381
  search_top_k = int(args.get('top_k') or request.search_top_k or 3)
382
  search_str = web_search(search_q, search_top_k)
383
  if search_str:
384
- prompt = (f"ToolResults:\n{search_str}\n\nUse these results to answer the prompt.\n\n" + prompt)
 
 
385
  elif name == 'calc' or name == 'calculator':
386
  from utils import calc
387
 
388
  expr = args.get('expression')
389
  if expr:
390
  calc_res = calc(expr)
391
- prompt = (f"ToolResults:\nCalcResult:{expr} = {calc_res}\n\nUse this result to answer the prompt.\n\n" + prompt)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
392
  else:
393
  # Unsupported tool - ignore or log
394
  logger.info(f"Unsupported tool requested: {name}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
395
  except Exception as e:
396
  logger.info(f"Tool processing error: {e}")
397
- elif request.web_search:
398
  try:
399
  from utils import web_search
400
 
401
  search_q = request.prompt if request.prompt else cleanMessages(request.messages or [])
402
  search_res = web_search(search_q, int(request.search_top_k or 3))
403
  if search_res:
404
- prompt = f"WebSearchResults:\n{search_res}\n\n" + prompt
 
 
405
  except Exception:
406
  pass
407
  logger.info(f"[REQ] {completionId} - prompt - {prompt}")
@@ -411,9 +760,14 @@ async def chatResponse(
411
  state_key = (request.model, request.state_name)
412
  if state_key in STATE_STORE:
413
  stored = STATE_STORE[state_key]
414
- model_state = stored.get('state', model_state)
415
  model_tokens = stored.get('model_tokens', [0])
416
- out = None
 
 
 
 
 
417
  else:
418
  out, model_tokens, model_state = await runPrefill(request, prompt, [0], model_state)
419
  else:
@@ -425,32 +779,87 @@ async def chatResponse(
425
  fullResponse = " <think" if enableReasoning else ""
426
  completionTokenCount = 0
427
  finishReason = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
428
 
429
- for chunk in generate(
430
- request,
431
- out,
432
- model_tokens,
433
- model_state,
434
- max_tokens=(
435
- 64000
436
- if "max_tokens" not in request.model_fields_set and enableReasoning
437
- else (request.max_tokens or 2048)
438
- ),
439
- ):
440
- # chunk['content'] is now expected to be a single token's decoded text
441
- fullResponse += chunk["content"]
442
- # Check stop sequences (multi-token) after each token
443
- for stop_words in request.stop or []:
444
- if stop_words in fullResponse:
445
- finishReason = f"stop:words:{stop_words}"
446
- break
447
- completionTokenCount += 1
448
-
449
- if chunk["finish_reason"]:
450
- finishReason = chunk["finish_reason"]
451
- await asyncio.sleep(0)
452
 
453
- genenrateTime = time.time()
454
 
455
  responseLog = {
456
  "content": fullResponse,
@@ -458,7 +867,7 @@ async def chatResponse(
458
  "prefill_len": promptTokenCount,
459
  "prefill_tps": round(promptTokenCount / (prefillTime - createTimestamp), 2),
460
  "gen_len": completionTokenCount,
461
- "gen_tps": round(completionTokenCount / (genenrateTime - prefillTime), 2),
462
  }
463
  logger.info(f"[RES] {completionId} - {responseLog}")
464
 
@@ -481,7 +890,7 @@ async def chatResponse(
481
  role="Assistant",
482
  content=content,
483
  reasoning_content=reasoning_content if reasoning_content else None,
484
- tool_calls=None,
485
  ),
486
  logprobs=None,
487
  finish_reason=finishReason,
@@ -496,6 +905,11 @@ async def chatResponse(
496
  'state': model_state,
497
  'model_tokens': model_tokens,
498
  }
 
 
 
 
 
499
  except Exception:
500
  pass
501
 
@@ -510,12 +924,68 @@ async def chatResponseStream(
510
  ):
511
  createTimestamp = int(time.time())
512
 
513
- prompt = (
514
- f"{cleanMessages(request.messages or [], enableReasoning)}\n\nAssistant:{' <think' if enableReasoning else ''}"
515
- if request.prompt == None
516
- else request.prompt.strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
517
  )
518
- # Process tools and web_search (tools executed server-side and results injected to prompt)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
519
  if request.tools:
520
  try:
521
  for tool in request.tools:
@@ -528,26 +998,43 @@ async def chatResponseStream(
528
  search_top_k = int(args.get('top_k') or request.search_top_k or 3)
529
  search_str = web_search(search_q, search_top_k)
530
  if search_str:
531
- prompt = (f"WebSearchResults:\n{search_str}\n\n" + prompt)
 
 
532
  elif name == 'calc' or name == 'calculator':
533
  from utils import calc
534
 
535
  expr = args.get('expression')
536
  if expr:
537
  calc_res = calc(expr)
538
- prompt = (f"CalcResult:{expr} = {calc_res}\n\n" + prompt)
 
 
 
 
 
 
 
 
 
 
 
 
 
539
  else:
540
  logger.info(f"Unsupported tool requested: {name}")
541
  except Exception as e:
542
  logger.info(f"Tool processing error: {e}")
543
- elif request.web_search:
544
  try:
545
  from utils import web_search
546
 
547
  search_q = request.prompt if request.prompt else cleanMessages(request.messages or [])
548
  search_res = web_search(search_q, int(request.search_top_k or 3))
549
  if search_res:
550
- prompt = f"WebSearchResults:\n{search_res}\n\n" + prompt
 
 
551
  except Exception:
552
  pass
553
 
@@ -558,9 +1045,13 @@ async def chatResponseStream(
558
  state_key = (request.model, request.state_name)
559
  if state_key in STATE_STORE:
560
  stored = STATE_STORE[state_key]
561
- model_state = stored.get('state', model_state)
562
  model_tokens = stored.get('model_tokens', [0])
563
- out = None
 
 
 
 
564
  else:
565
  out, model_tokens, model_state = await runPrefill(request, prompt, [0], model_state)
566
  else:
@@ -571,6 +1062,9 @@ async def chatResponseStream(
571
 
572
  completionTokenCount = 0
573
  finishReason = None
 
 
 
574
 
575
  response = ChatCompletionChunk(
576
  id=completionId,
@@ -605,6 +1099,14 @@ async def chatResponseStream(
605
  # Attach state_name in the initial chunk so client can save it to continue later
606
  r_dict = response.model_dump()
607
  r_dict['state_name'] = request.state_name
 
 
 
 
 
 
 
 
608
  yield f"data: {r_dict}\n\n"
609
 
610
  buffer = []
@@ -771,15 +1273,73 @@ async def chatResponseStream(
771
  delta = ChatCompletionMessage(role="Assistant", content="", reasoning_content=None, tool_calls=None)
772
  response.choices[0].delta = delta
773
  if delta.content != None or delta.reasoning_content != None:
774
- # Save model state frequently (after each token) to allow resuming
775
  try:
776
  if request.state_name:
777
  STATE_STORE[(request.model, request.state_name)] = {
778
  'state': model_state,
779
  'model_tokens': model_tokens,
780
  }
 
 
 
 
 
781
  except Exception:
782
  pass
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
783
  yield f"data: {response.model_dump_json()}\n\n"
784
  # check stop sequences and stop streaming if we see them
785
  for stop_words in request.stop or []:
@@ -791,39 +1351,139 @@ async def chatResponseStream(
791
 
792
  del streamConfig
793
  else:
794
- for chunk in generate(request, out, model_tokens, model_state):
795
- completionTokenCount += 1
796
- buffer.append(chunk["content"])
 
 
 
 
797
 
798
- if chunk["finish_reason"]:
799
- finishReason = chunk["finish_reason"]
800
 
801
- response = ChatCompletionChunk(
802
- id=completionId,
803
- created=createTimestamp,
804
- model=request.model,
805
- usage=(
806
- Usage(
807
- prompt_tokens=promptTokenCount,
808
- completion_tokens=completionTokenCount,
809
- total_tokens=promptTokenCount + completionTokenCount,
810
- prompt_tokens_details=PromptTokensDetails(cached_tokens=0),
811
- )
812
- if request.include_usage
813
- else None
814
- ),
815
- choices=[
816
- ChatCompletionChoice(
817
- index=0,
818
- delta=ChatCompletionMessage(role="Assistant", content=chunk["content"], reasoning_content=None, tool_calls=None),
819
- logprobs=None,
820
- finish_reason=finishReason,
821
- )
822
- ],
823
- )
824
 
825
- yield f"data: {response.model_dump_json()}\n\n"
826
- await asyncio.sleep(0)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
827
 
828
  genenrateTime = time.time()
829
 
@@ -858,7 +1518,13 @@ async def chat_completions(request: ChatCompletionRequest):
858
  completionId = str(next(CompletionIdGenerator))
859
  logger.info(f"[REQ] {completionId} - {request.model_dump()}")
860
 
 
 
861
  modelName = request.model.split(":")[0]
 
 
 
 
862
  enableReasoning = ":thinking" in request.model
863
 
864
  if "rwkv-latest" in request.model:
@@ -899,14 +1565,24 @@ async def chat_completions(request: ChatCompletionRequest):
899
  model_tokens_for_resume = stored.get('model_tokens', [0])
900
  request_dict = request.model_dump()
901
 
 
 
 
902
  for k, v in defaultSamplerConfig.model_dump().items():
 
 
 
 
903
  if k in request_dict and request_dict[k] is None:
904
  request_dict[k] = v
905
  realRequest = ChatCompletionRequest(**request_dict)
 
 
 
906
 
907
  logger.info(f"[REQ] {completionId} - Real - {request.model_dump()}")
908
 
909
- if request.stream:
910
  r = StreamingResponse(
911
  chatResponseStream(realRequest, model_state, completionId, enableReasoning),
912
  media_type="text/event-stream",
@@ -928,10 +1604,109 @@ async def chat_completions(request: ChatCompletionRequest):
928
  return r
929
 
930
 
931
- if os.path.exists("dist-frontend"):
932
- app.mount("/", StaticFiles(directory="dist-frontend", html=True), name="static")
933
- else:
934
- logger.info("dist-frontend not found; skipping static files mount")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
935
 
936
  if __name__ == "__main__":
937
  import uvicorn
 
15
  remove_nested_think_tags_stack,
16
  format_bytes,
17
  log,
18
+ detect_tools_and_reasoning,
19
+ universal_tool,
20
  )
21
 
22
  import copy, types, gc, sys, re, time, collections, asyncio
 
80
  from rwkv.model import RWKV
81
  from rwkv.utils import PIPELINE, PIPELINE_ARGS
82
 
83
+ from fastapi import FastAPI, HTTPException, UploadFile, File
84
  from starlette.background import BackgroundTask
85
  from fastapi.responses import StreamingResponse
86
  from fastapi.middleware.cors import CORSMiddleware
 
96
  PromptTokensDetails,
97
  ChatCompletionChoice,
98
  ChatCompletionMessage,
99
+ SamplerConfig,
100
+ UploadedFile,
101
+ FileUploadResponse,
102
  )
103
 
104
 
 
114
  DEFAULT_REASONING_MODEL_NAME = None
115
 
116
  # In-memory model state store to support streaming continuation/resume per state_name.
117
+ # Keys: (model_name, state_name) -> dict with 'state' and 'model_tokens'
118
  STATE_STORE: Dict[tuple, Any] = {}
119
 
120
+ # Serialized state store file path and flush interval defined in CONFIG
121
+ _STATE_STORE_PATH = getattr(CONFIG, 'STATE_STORE_PATH', './state_store.json')
122
+ _LAST_STATE_STORE_WRITE = 0
123
+
124
+ # sentinel for model-initiated tool calls: <tool-call>{json}</tool-call>
125
+ TOOL_CALL_RE = re.compile(r"<tool-call>\s*(\{.*?\})\s*</tool-call>", re.S)
126
+
127
+ # File uploads: simple in-memory index (persisted on disk via the files themselves)
128
+ UPLOADED_FILES: Dict[str, dict] = {}
129
+
130
+
131
+ def _serialize_state_store() -> dict:
132
+ # Save only model_tokens to disk; model_state (torch objects) are not serializable
133
+ serial = {}
134
+ for (model_name, state_name), entry in STATE_STORE.items():
135
+ try:
136
+ mt = entry.get('model_tokens') if isinstance(entry, dict) else None
137
+ if mt is None:
138
+ # if entry is a raw model_state, skip
139
+ continue
140
+ serial[f"{model_name}|{state_name}"] = {
141
+ 'model': model_name,
142
+ 'state_name': state_name,
143
+ 'model_tokens': mt,
144
+ }
145
+ except Exception:
146
+ continue
147
+ return serial
148
+
149
+
150
+ def _load_state_store_from_disk():
151
+ global STATE_STORE
152
+ try:
153
+ if os.path.exists(_STATE_STORE_PATH):
154
+ import json
155
+
156
+ with open(_STATE_STORE_PATH, 'r', encoding='utf-8') as f:
157
+ data = json.load(f)
158
+ for k, v in data.items():
159
+ model = v.get('model')
160
+ state_name = v.get('state_name')
161
+ model_tokens = v.get('model_tokens')
162
+ if model and state_name and isinstance(model_tokens, list):
163
+ STATE_STORE[(model, state_name)] = {
164
+ 'state': None,
165
+ 'model_tokens': model_tokens,
166
+ }
167
+ logger.info(f"Loaded {len(STATE_STORE)} entries from state store file {_STATE_STORE_PATH}")
168
+ except Exception as e:
169
+ logger.info(f"Failed to load state store from disk: {e}")
170
+
171
+
172
+ def _save_state_store_to_disk(force=False):
173
+ global _LAST_STATE_STORE_WRITE
174
+ now = time.time()
175
+ if not force and now - _LAST_STATE_STORE_WRITE < getattr(CONFIG, 'STATE_STORE_FLUSH_INTERVAL', 5):
176
+ return
177
+ try:
178
+ serial = _serialize_state_store()
179
+ if not serial:
180
+ return
181
+ import json
182
+ tmp = _STATE_STORE_PATH + ".tmp"
183
+ with open(tmp, 'w', encoding='utf-8') as f:
184
+ json.dump(serial, f)
185
+ os.replace(tmp, _STATE_STORE_PATH)
186
+ _LAST_STATE_STORE_WRITE = now
187
+ except Exception as e:
188
+ logger.info(f"Write state store to disk failed: {e}")
189
+
190
+
191
+ def _recompute_out_and_state_from_tokens(model_name: str, model_tokens: List[int]):
192
+ """
193
+ Recompute the `out` logits and `model_state` by forwarding through tokens in chunks.
194
+ Returns a tuple (out, model_state).
195
+ """
196
+ ms = MODEL_STORAGE.get(model_name)
197
+ if not ms or not ms.model:
198
+ return None, None
199
+ model_state = None
200
+ out = None
201
+ tokens = list(model_tokens) if isinstance(model_tokens, list) else [0]
202
+ while len(tokens) > 0:
203
+ out, model_state = ms.model.forward(tokens[: CONFIG.CHUNK_LEN], model_state)
204
+ tokens = tokens[CONFIG.CHUNK_LEN :]
205
+ return out, model_state
206
+
207
  logger.info(f"STRATEGY - {CONFIG.STRATEGY}")
208
 
209
  logGPUState()
210
 
211
+ # Keep any configured models intact; do not force selection by name/size.
212
+ # The previous policy enforced a single '0.1b' model which hid additional configs; use the full list.
213
+ logger.info(f"Configured {len(CONFIG.MODELS)} model(s) in ROOT config")
 
 
 
 
 
 
 
 
 
214
 
215
  for model_config in CONFIG.MODELS:
216
  logger.info(f"Load Model - {model_config.SERVICE_NAME}")
 
283
  presence_penalty: Optional[float] = Field(default=None)
284
  count_penalty: Optional[float] = Field(default=None)
285
  penalty_decay: Optional[float] = Field(default=None)
286
+ stream: Optional[bool] = Field(default=None, description="Whether to stream token-by-token responses. If None, uses CONFIG.DEFAULT_STREAM")
287
  state_name: Optional[str] = Field(default=None)
288
  include_usage: Optional[bool] = Field(default=False)
289
  stop: Optional[list[str]] = Field(["\n\n"])
290
  stop_tokens: Optional[list[int]] = Field([0])
291
  web_search: Optional[bool] = Field(default=False, description="Whether to perform a web search and append results to the prompt")
292
+ enable_web_search: Optional[bool] = Field(default=None, description="Explicitly enable web search (overrides auto/web_search) if set")
293
+ auto_web_search: Optional[bool] = Field(default=None, description="Whether to enable web_search based on auto-detected intent")
294
+ enable_tools: Optional[bool] = Field(default=None, description="Explicitly enable tools (overrides auto detection)")
295
+ auto_tools: Optional[bool] = Field(default=None, description="Whether to enable tools based on auto-detected intent")
296
+ enable_reasoning: Optional[bool] = Field(default=None, description="Explicitly override reasoning enablement")
297
+ auto_reasoning: Optional[bool] = Field(default=None, description="Whether to enable reasoning based on auto detection")
298
+ enable_universal: Optional[bool] = Field(default=None, description="Explicitly enable the universal tool execution")
299
+ auto_universal: Optional[bool] = Field(default=None, description="Whether to auto enable universal tool execution")
300
  search_top_k: Optional[int] = Field(default=3, description="Number of web search results to retrieve")
301
  tools: Optional[List[Dict[str, Any]]] = Field(default=None, description="List of tools to execute server-side (e.g., {'name':'web_search','args':{'query':'x'}})")
302
+ # Per-request sampler overrides for ALLOW_* flags. These let the user
303
+ # disable server-side features for this particular request if needed.
304
+ sampler_allow_web_search: Optional[bool] = Field(default=None, description="Per-request (sampler) override allowing web_search")
305
+ sampler_allow_tools: Optional[bool] = Field(default=None, description="Per-request (sampler) override allowing tools")
306
+ sampler_allow_reasoning: Optional[bool] = Field(default=None, description="Per-request (sampler) override allowing reasoning")
307
+ # Per-request sampler config object; if provided, these settings will
308
+ # override the model defaults for this request.
309
+ sampler: Optional[SamplerConfig] = Field(default=None, description="Per-request sampler settings (overrides model default)")
310
+ # File uploads: allow referencing uploaded files in the request
311
+ file_ids: Optional[List[str]] = Field(default=None, description="List of uploaded file IDs that the model may use for this request")
312
+ enable_file_tool: Optional[bool] = Field(default=None, description="Explicitly enable file-based tools for this request")
313
+ auto_file_tool: Optional[bool] = Field(default=None, description="Auto-detect whether file-based tools are needed")
314
+ sampler_allow_file_tool: Optional[bool] = Field(default=None, description="Per-request sampler override allowing file tools")
315
 
316
  @model_validator(mode="before")
317
  @classmethod
 
341
  app.add_middleware(GZipMiddleware, minimum_size=1000, compresslevel=5)
342
 
343
 
344
+ @app.on_event("startup")
345
+ async def _startup_state_load_and_persist_loop():
346
+ # Load previous persisted state (tokens only) at startup
347
+ _load_state_store_from_disk()
348
+
349
+ async def _persist_loop():
350
+ while True:
351
+ try:
352
+ _save_state_store_to_disk(force=False)
353
+ except Exception:
354
+ pass
355
+ await asyncio.sleep(getattr(CONFIG, 'STATE_STORE_FLUSH_INTERVAL', 5))
356
+
357
+ # Spawn background flush task
358
+ try:
359
+ asyncio.create_task(_persist_loop())
360
+ except Exception:
361
+ pass
362
+
363
+
364
  async def runPrefill(
365
  request: ChatCompletionRequest, ctx: str, model_tokens: List[int], model_state
366
  ):
 
487
  ) -> ChatCompletion:
488
  createTimestamp = time.time()
489
 
490
+ # Build raw prompt for detection (prefer explicit request.prompt, else messages)
491
+ raw_prompt = request.prompt.strip() if request.prompt is not None else cleanMessages(request.messages or [])
492
+ # Intent detection: analyze raw_prompt or messages to auto-activate tools/web-search/reasoning
493
+ detection = detect_tools_and_reasoning(raw_prompt)
494
+ # After computing auto flags, build the actual prompt string to include <think> if needed
495
+ prompt = raw_prompt if request.prompt is not None else f"{cleanMessages(request.messages or [])}\n\nAssistant:{' <think' if enableReasoning else ''}"
496
+
497
+ # Decide whether web_search should be used based on explicit flags, auto flags, and config defaults
498
+ # Base computed web_search flag
499
+ web_search_enabled = (
500
+ True
501
+ if (request.enable_web_search is not None and request.enable_web_search)
502
+ else (
503
+ request.web_search
504
+ or (request.auto_web_search if request.auto_web_search is not None else CONFIG.AUTO_ENABLE_WEB_SEARCH and detection.get('need_web_search'))
505
+ )
506
  )
507
+ if not getattr(CONFIG, 'ENABLE_WEB_SEARCH_BY_DEFAULT', True) and request.enable_web_search is None and not request.web_search:
508
+ web_search_enabled = False
509
+ # If the root config says web search is disabled by default, honor it
510
+ if not getattr(CONFIG, 'ENABLE_WEB_SEARCH_BY_DEFAULT', True) and request.enable_web_search is None and not request.web_search:
511
+ web_search_enabled = False
512
+ # Next: respect per-request sampler override (sampler_allow_web_search) or request.sampler.ALLOW_WEB_SEARCH,
513
+ # then per-model/per-sampler ALLOW_* settings.
514
+ try:
515
+ # 1) per-request `sampler` object ALLOW_* if present, then
516
+ # 2) explicit per-request sampler_allow_* booleans (backwards compatible), else
517
+ # 3) model.DEFAULT_SAMPLER.ALLOW_* if set, else model.ALLOW_*.
518
+ if request.sampler and getattr(request.sampler, 'ALLOW_WEB_SEARCH', None) is not None:
519
+ web_search_enabled = bool(request.sampler.ALLOW_WEB_SEARCH)
520
+ elif hasattr(request, 'sampler_allow_web_search') and request.sampler_allow_web_search is not None:
521
+ web_search_enabled = bool(request.sampler_allow_web_search)
522
+ else:
523
+ ms = MODEL_STORAGE.get(request.model)
524
+ if ms and ms.MODEL_CONFIG:
525
+ if hasattr(ms.MODEL_CONFIG, 'DEFAULT_SAMPLER') and getattr(ms.MODEL_CONFIG.DEFAULT_SAMPLER, 'ALLOW_WEB_SEARCH', None) is not None:
526
+ web_search_enabled = bool(ms.MODEL_CONFIG.DEFAULT_SAMPLER.ALLOW_WEB_SEARCH)
527
+ elif hasattr(ms.MODEL_CONFIG, 'ALLOW_WEB_SEARCH') and not ms.MODEL_CONFIG.ALLOW_WEB_SEARCH:
528
+ web_search_enabled = False
529
+ except Exception:
530
+ pass
531
+
532
+ # Decide whether file tools should be used
533
+ if request.enable_file_tool is not None:
534
+ file_tool_enabled = bool(request.enable_file_tool)
535
+ else:
536
+ auto_file_flag = request.auto_file_tool if request.auto_file_tool is not None else CONFIG.AUTO_ENABLE_TOOLS
537
+ # Default to enabled when files are provided and global setting allows
538
+ file_tool_enabled = bool((request.file_ids and len(request.file_ids) > 0) or (auto_file_flag and request.file_ids))
539
+ # Respect root-level defaults
540
+ if not getattr(CONFIG, 'ALLOW_FILE_TOOL_BY_DEFAULT', True) and request.enable_file_tool is None:
541
+ file_tool_enabled = False
542
+ # Per-request sampler overrides
543
+ try:
544
+ if request.sampler and getattr(request.sampler, 'ALLOW_FILE_TOOL', None) is not None:
545
+ file_tool_enabled = bool(request.sampler.ALLOW_FILE_TOOL)
546
+ elif hasattr(request, 'sampler_allow_file_tool') and request.sampler_allow_file_tool is not None:
547
+ file_tool_enabled = bool(request.sampler_allow_file_tool)
548
+ else:
549
+ ms = MODEL_STORAGE.get(request.model)
550
+ if ms and ms.MODEL_CONFIG:
551
+ if hasattr(ms.MODEL_CONFIG, 'DEFAULT_SAMPLER') and getattr(ms.MODEL_CONFIG.DEFAULT_SAMPLER, 'ALLOW_FILE_TOOL', None) is not None:
552
+ file_tool_enabled = bool(ms.MODEL_CONFIG.DEFAULT_SAMPLER.ALLOW_FILE_TOOL)
553
+ elif hasattr(ms.MODEL_CONFIG, 'ALLOW_FILE_TOOL') and not ms.MODEL_CONFIG.ALLOW_FILE_TOOL:
554
+ file_tool_enabled = False
555
+ except Exception:
556
+ pass
557
+
558
+ # Decide whether tools should be used
559
+ if request.enable_tools is not None:
560
+ tools_enabled = bool(request.enable_tools)
561
+ else:
562
+ # if explicit tools provided, or enable by default config, or auto detection suggests
563
+ auto_tools_flag = request.auto_tools if request.auto_tools is not None else CONFIG.AUTO_ENABLE_TOOLS
564
+ tools_enabled = bool(request.tools) or CONFIG.ENABLE_TOOLS_BY_DEFAULT or (auto_tools_flag and (detection.get('need_calc') or detection.get('need_web_search')))
565
+ # Respect sampler-level override (request.sampler.ALLOW_TOOLS), then
566
+ # request.sampler_allow_tools, then sampler default and finally model-level allow
567
+ try:
568
+ if request.sampler and getattr(request.sampler, 'ALLOW_TOOLS', None) is not None:
569
+ tools_enabled = bool(request.sampler.ALLOW_TOOLS)
570
+ elif hasattr(request, 'sampler_allow_tools') and request.sampler_allow_tools is not None:
571
+ tools_enabled = bool(request.sampler_allow_tools)
572
+ else:
573
+ ms = MODEL_STORAGE.get(request.model)
574
+ if ms and ms.MODEL_CONFIG:
575
+ if hasattr(ms.MODEL_CONFIG, 'DEFAULT_SAMPLER') and getattr(ms.MODEL_CONFIG.DEFAULT_SAMPLER, 'ALLOW_TOOLS', None) is not None:
576
+ if not ms.MODEL_CONFIG.DEFAULT_SAMPLER.ALLOW_TOOLS:
577
+ tools_enabled = False
578
+ elif hasattr(ms.MODEL_CONFIG, 'ALLOW_TOOLS') and not ms.MODEL_CONFIG.ALLOW_TOOLS:
579
+ tools_enabled = False
580
+ except Exception:
581
+ pass
582
+
583
+ # Decide whether reasoning should be enabled (in addition to :thinking or explicit)
584
+ reasoning_enabled = bool(
585
+ True
586
+ if (request.enable_reasoning is not None and request.enable_reasoning)
587
+ else (
588
+ bool(enableReasoning) or bool(request.auto_reasoning if request.auto_reasoning is not None else (CONFIG.AUTO_ENABLE_REASONING and bool(detection.get('need_reasoning'))))
589
+ )
590
+ )
591
+ # If the root config sets reasoning to disabled by default and no explicit request to enable, disable it
592
+ if not getattr(CONFIG, 'ENABLE_REASONING_BY_DEFAULT', True) and request.enable_reasoning is None:
593
+ reasoning_enabled = False
594
+ # Respect sampler-level override for reasoning: request.sampler.ALLOW_REASONING -> sampler_allow_reasoning -> sampler.default -> model
595
+ try:
596
+ if request.sampler and getattr(request.sampler, 'ALLOW_REASONING', None) is not None:
597
+ reasoning_enabled = bool(request.sampler.ALLOW_REASONING)
598
+ elif hasattr(request, 'sampler_allow_reasoning') and request.sampler_allow_reasoning is not None:
599
+ reasoning_enabled = bool(request.sampler_allow_reasoning)
600
+ else:
601
+ ms = MODEL_STORAGE.get(request.model)
602
+ if ms and ms.MODEL_CONFIG:
603
+ if hasattr(ms.MODEL_CONFIG, 'DEFAULT_SAMPLER') and getattr(ms.MODEL_CONFIG.DEFAULT_SAMPLER, 'ALLOW_REASONING', None) is not None:
604
+ if not ms.MODEL_CONFIG.DEFAULT_SAMPLER.ALLOW_REASONING:
605
+ reasoning_enabled = False
606
+ elif hasattr(ms.MODEL_CONFIG, 'ALLOW_REASONING') and not ms.MODEL_CONFIG.ALLOW_REASONING:
607
+ reasoning_enabled = False
608
+ except Exception:
609
+ pass
610
+
611
+ # Keep the local boolean for generating content
612
+ enableReasoning = reasoning_enabled
613
+ try:
614
+ ms = MODEL_STORAGE.get(request.model)
615
+ if ms and ms.MODEL_CONFIG and hasattr(ms.MODEL_CONFIG, 'ALLOW_REASONING') and not ms.MODEL_CONFIG.ALLOW_REASONING:
616
+ enableReasoning = False
617
+ except Exception:
618
+ pass
619
+
620
+ # Ensure web_search property mirrors computed web_search_enabled if not explicitly provided
621
+ if request.enable_web_search is None:
622
+ request.web_search = web_search_enabled
623
+ # If tools should be automatically enabled, add detected ones
624
+ if tools_enabled and not request.tools:
625
+ if detection.get('detected_tools'):
626
+ request.tools = detection.get('detected_tools')
627
+ # If universal is needed and not explicitly requested, add universal tool
628
+ if (request.enable_universal is True) or (
629
+ request.enable_universal is None and (request.auto_universal if request.auto_universal is not None else CONFIG.AUTO_ENABLE_TOOLS and detection.get('need_universal'))
630
+ ):
631
+ if not request.tools:
632
+ request.tools = [{"name": "universal", "args": {"query": raw_prompt}}]
633
+
634
+ executed_tool_calls = []
635
+ # If file tools are enabled and files are attached, inject them into the prompt (for streaming)
636
+ if file_tool_enabled and request.file_ids:
637
+ for fid in request.file_ids:
638
+ try:
639
+ if fid not in UPLOADED_FILES:
640
+ continue
641
+ meta = UPLOADED_FILES.get(fid)
642
+ if not meta:
643
+ continue
644
+ from utils import file_read_from_path
645
+ fpath = meta.get('path')
646
+ if not fpath or not os.path.exists(fpath):
647
+ continue
648
+ file_content = file_read_from_path(fpath, 200000)
649
+ if file_content:
650
+ exec_entry = {"name": "file_inject", "args": {"file_id": fid}, "result": {"action": "file_inject", "result": "injected", "metadata": {"file_id": fid, "filename": meta.get('filename')}}}
651
+ executed_tool_calls.append(exec_entry)
652
+ prompt = (f"AttachedFile: {meta.get('filename')} (id:{fid})\n{file_content}\n\n" + prompt)
653
+ except Exception as e:
654
+ logger.info(f"File injection error: {e}")
655
+ # If file tools are enabled and files are attached, inject them into the prompt
656
+ if file_tool_enabled and request.file_ids:
657
+ for fid in request.file_ids:
658
+ try:
659
+ if fid not in UPLOADED_FILES:
660
+ continue
661
+ meta = UPLOADED_FILES.get(fid)
662
+ if not meta:
663
+ continue
664
+ from utils import file_read_from_path
665
+ fpath = meta.get('path')
666
+ if not fpath or not os.path.exists(fpath):
667
+ continue
668
+ file_content = file_read_from_path(fpath, 200000)
669
+ if file_content:
670
+ exec_entry = {"name": "file_inject", "args": {"file_id": fid}, "result": {"action": "file_inject", "result": "injected", "metadata": {"file_id": fid, "filename": meta.get('filename')}}}
671
+ executed_tool_calls.append(exec_entry)
672
+ prompt = (f"AttachedFile: {meta.get('filename')} (id:{fid})\n{file_content}\n\n" + prompt)
673
+ except Exception as e:
674
+ logger.info(f"File injection error: {e}")
675
  if request.tools:
676
  try:
677
  for tool in request.tools:
 
684
  search_top_k = int(args.get('top_k') or request.search_top_k or 3)
685
  search_str = web_search(search_q, search_top_k)
686
  if search_str:
687
+ search_res_struct = {"action": "web_search", "result": str(search_str), "metadata": {"query": search_q, "top_k": search_top_k, "confidence": 0.9}}
688
+ executed_tool_calls.append({"name": "web_search", "args": {"query": search_q, "top_k": search_top_k}, "result": search_res_struct})
689
+ prompt = (f"ToolResults:\n{search_res_struct.get('result')}\n\nUse these results to answer the prompt.\n\n" + prompt)
690
  elif name == 'calc' or name == 'calculator':
691
  from utils import calc
692
 
693
  expr = args.get('expression')
694
  if expr:
695
  calc_res = calc(expr)
696
+ # Wrap result into a structured dict
697
+ calc_res_struct = {"action": "calc", "result": str(calc_res), "metadata": {"expression": expr, "confidence": 0.98}}
698
+ executed_tool_calls.append({"name": "calc", "args": {"expression": expr}, "result": calc_res_struct})
699
+ prompt = (f"ToolResults:\nCalcResult:{expr} = {calc_res_struct.get('result')}\n\nUse this result to answer the prompt.\n\n" + prompt)
700
+ elif name == 'universal':
701
+ try:
702
+ res = universal_tool(args or {"query": raw_prompt}, allow_web_search=bool(web_search_enabled), allow_tools=bool(tools_enabled), allow_file_tool=bool(file_tool_enabled))
703
+ # If universal_tool returns a dict, extract text result for prompt injection
704
+ if isinstance(res, dict):
705
+ result_text = res.get('result') if res.get('result') is not None else ''
706
+ else:
707
+ result_text = str(res)
708
+ executed_tool_calls.append({"name": "universal", "args": args, "result": res})
709
+ prompt = (f"ToolResults:\n{result_text}\n\nUse this result to answer the prompt.\n\n" + prompt)
710
+ except Exception as e:
711
+ logger.info(f"Universal tool execution error: {e}")
712
  else:
713
  # Unsupported tool - ignore or log
714
  logger.info(f"Unsupported tool requested: {name}")
715
+ if name == 'file_read':
716
+ # read an uploaded file by id/path
717
+ try:
718
+ fid = args.get('file_id') or args.get('id') or (request.file_ids[0] if request.file_ids else None)
719
+ if not fid:
720
+ continue
721
+ if fid not in UPLOADED_FILES:
722
+ continue
723
+ meta = UPLOADED_FILES.get(fid)
724
+ if not meta:
725
+ continue
726
+ from utils import file_read_from_path
727
+ fpath = meta.get('path')
728
+ if not fpath or not os.path.exists(fpath):
729
+ continue
730
+ file_content = file_read_from_path(fpath, int(args.get('max_bytes') or 100000))
731
+ exec_entry = {"name": "file_read", "args": {"file_id": fid, "max_bytes": int(args.get('max_bytes') or 100000)}, "result": {"action": "file_read", "result": file_content, "metadata": {"file_id": fid, "filename": meta.get('filename')}}}
732
+ executed_tool_calls.append(exec_entry)
733
+ _res = exec_entry.get('result') if isinstance(exec_entry, dict) else None
734
+ _res_text = ''
735
+ if isinstance(_res, dict):
736
+ _res_text = _res.get('result') or ''
737
+ elif _res is not None:
738
+ _res_text = str(_res)
739
+ prompt = (f"ToolResults:\n{_res_text}\n\nUse these file contents to answer the prompt.\n\n" + prompt)
740
+ except Exception as e:
741
+ logger.info(f"file_read tool error: {e}")
742
  except Exception as e:
743
  logger.info(f"Tool processing error: {e}")
744
+ elif request.web_search or web_search_enabled:
745
  try:
746
  from utils import web_search
747
 
748
  search_q = request.prompt if request.prompt else cleanMessages(request.messages or [])
749
  search_res = web_search(search_q, int(request.search_top_k or 3))
750
  if search_res:
751
+ search_res_struct = {"action": "web_search", "result": str(search_res), "metadata": {"query": search_q, "top_k": int(request.search_top_k or 3), "confidence": 0.9}}
752
+ executed_tool_calls.append({"name": "web_search", "args": {"query": search_q, "top_k": int(request.search_top_k or 3)}, "result": search_res_struct})
753
+ prompt = f"WebSearchResults:\n{search_res_struct.get('result')}\n\n" + prompt
754
  except Exception:
755
  pass
756
  logger.info(f"[REQ] {completionId} - prompt - {prompt}")
 
760
  state_key = (request.model, request.state_name)
761
  if state_key in STATE_STORE:
762
  stored = STATE_STORE[state_key]
763
+ model_state = stored.get('state', None)
764
  model_tokens = stored.get('model_tokens', [0])
765
+ if model_state is None:
766
+ # Recompute out and model_state from tokens since we did not persist the torch state
767
+ out, model_state = _recompute_out_and_state_from_tokens(request.model, model_tokens)
768
+ else:
769
+ # If we have a model_state, we still need out logits. Compute from last window of tokens
770
+ out, _ = _recompute_out_and_state_from_tokens(request.model, model_tokens[-CONFIG.CHUNK_LEN :])
771
  else:
772
  out, model_tokens, model_state = await runPrefill(request, prompt, [0], model_state)
773
  else:
 
779
  fullResponse = " <think" if enableReasoning else ""
780
  completionTokenCount = 0
781
  finishReason = None
782
+ # Limit model-initiated tool calls per request to avoid loops
783
+ model_initiated_tool_calls = 0
784
+ MODEL_MAX_TOOL_CALLS = 3
785
+ should_restart = True
786
+ while should_restart:
787
+ should_restart = False
788
+ gen = generate(
789
+ request,
790
+ out,
791
+ model_tokens,
792
+ model_state,
793
+ max_tokens=(
794
+ 64000
795
+ if "max_tokens" not in request.model_fields_set and enableReasoning
796
+ else (request.max_tokens or 2048)
797
+ ),
798
+ )
799
+ for chunk in gen:
800
+ # chunk['content'] is now expected to be a single token's decoded text
801
+ fullResponse += chunk["content"]
802
+ # Detect model-issued tool call markers within the output
803
+ if model_initiated_tool_calls < MODEL_MAX_TOOL_CALLS:
804
+ m = TOOL_CALL_RE.search(fullResponse)
805
+ if m:
806
+ try:
807
+ payload_raw = m.group(1)
808
+ import json
809
+
810
+ payload = json.loads(payload_raw)
811
+ tool_name = payload.get('name')
812
+ tool_args = payload.get('args', {})
813
+ tool_res = None
814
+ if tool_name == 'web_search':
815
+ from utils import web_search
816
+
817
+ q = tool_args.get('query') or (request.prompt if request.prompt else cleanMessages(request.messages or []))
818
+ k = int(tool_args.get('top_k') or request.search_top_k or 3)
819
+ tool_res = web_search(q, k)
820
+ elif tool_name in ('calc', 'calculator'):
821
+ from utils import calc
822
+
823
+ expr = tool_args.get('expression')
824
+ if expr:
825
+ tool_res = calc(expr)
826
+ else:
827
+ try:
828
+ tool_res = universal_tool({'query': tool_args.get('query') or payload.get('query') or ''}, allow_web_search=bool(web_search_enabled), allow_tools=bool(tools_enabled), allow_file_tool=bool(file_tool_enabled))
829
+ except Exception:
830
+ tool_res = None
831
+
832
+ if tool_res:
833
+ if not isinstance(tool_res, dict):
834
+ if tool_name in ('calc', 'calculator'):
835
+ tool_res_struct = {"action": "calc", "result": str(tool_res), "metadata": {"expression": tool_args.get('expression'), "confidence": 0.98}}
836
+ elif tool_name == 'web_search':
837
+ tool_res_struct = {"action": "web_search", "result": str(tool_res), "metadata": {"query": tool_args.get('query'), "top_k": tool_args.get('top_k') or request.search_top_k or 3, "confidence": 0.9}}
838
+ else:
839
+ tool_res_struct = {"action": tool_name, "result": str(tool_res), "metadata": {"confidence": 0.6}}
840
+ else:
841
+ tool_res_struct = tool_res
842
+ exec_entry = {"name": tool_name, "args": tool_args, "result": tool_res_struct, 'initiated_by_model': True}
843
+ executed_tool_calls.append(exec_entry)
844
+ delta_text = f"ToolResults:\n{tool_res_struct.get('result')}\n\n"
845
+ prompt = delta_text + prompt
846
+ fullResponse = TOOL_CALL_RE.sub('', fullResponse)
847
+ buffer = [fullResponse]
848
+ out, model_tokens, model_state = await runPrefill(request, delta_text, model_tokens, model_state)
849
+ model_initiated_tool_calls += 1
850
+ except Exception as e:
851
+ logger.info(f"Model-initiated tool handling error: {e}")
852
+ # Check stop sequences (multi-token) after each token
853
+ for stop_words in request.stop or []:
854
+ if stop_words in fullResponse:
855
+ finishReason = f"stop:words:{stop_words}"
856
+ break
857
+ completionTokenCount += 1
858
 
859
+ if chunk["finish_reason"]:
860
+ finishReason = chunk["finish_reason"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
861
 
862
+ generateTime = time.time()
863
 
864
  responseLog = {
865
  "content": fullResponse,
 
867
  "prefill_len": promptTokenCount,
868
  "prefill_tps": round(promptTokenCount / (prefillTime - createTimestamp), 2),
869
  "gen_len": completionTokenCount,
870
+ "gen_tps": round(completionTokenCount / (generateTime - prefillTime) if generateTime!=prefillTime else 0, 2),
871
  }
872
  logger.info(f"[RES] {completionId} - {responseLog}")
873
 
 
890
  role="Assistant",
891
  content=content,
892
  reasoning_content=reasoning_content if reasoning_content else None,
893
+ tool_calls=executed_tool_calls if executed_tool_calls else None,
894
  ),
895
  logprobs=None,
896
  finish_reason=finishReason,
 
905
  'state': model_state,
906
  'model_tokens': model_tokens,
907
  }
908
+ if getattr(CONFIG, 'STATE_STORE_SAVE_ON_UPDATE', False):
909
+ try:
910
+ _save_state_store_to_disk(force=True)
911
+ except Exception:
912
+ pass
913
  except Exception:
914
  pass
915
 
 
924
  ):
925
  createTimestamp = int(time.time())
926
 
927
+ raw_prompt = request.prompt.strip() if request.prompt is not None else cleanMessages(request.messages or [], False)
928
+ # Intent detection and defaults: check whether to auto-enable tools, web_search, reasoning
929
+ detection = detect_tools_and_reasoning(raw_prompt)
930
+
931
+ web_search_enabled = (
932
+ True
933
+ if (request.enable_web_search is not None and request.enable_web_search)
934
+ else (
935
+ request.web_search
936
+ or (request.auto_web_search if request.auto_web_search is not None else CONFIG.AUTO_ENABLE_WEB_SEARCH and detection.get('need_web_search'))
937
+ )
938
+ )
939
+
940
+ if request.enable_tools is not None:
941
+ tools_enabled = bool(request.enable_tools)
942
+ else:
943
+ auto_tools_flag = request.auto_tools if request.auto_tools is not None else CONFIG.AUTO_ENABLE_TOOLS
944
+ tools_enabled = bool(request.tools) or CONFIG.ENABLE_TOOLS_BY_DEFAULT or (auto_tools_flag and (detection.get('need_calc') or detection.get('need_web_search')))
945
+
946
+ reasoning_enabled = bool(
947
+ True
948
+ if (request.enable_reasoning is not None and request.enable_reasoning)
949
+ else (
950
+ bool(enableReasoning) or bool(request.auto_reasoning if request.auto_reasoning is not None else (CONFIG.AUTO_ENABLE_REASONING and bool(detection.get('need_reasoning'))))
951
+ )
952
  )
953
+ enableReasoning = reasoning_enabled
954
+ try:
955
+ ms_cfg = MODEL_STORAGE.get(request.model)
956
+ if ms_cfg and ms_cfg.MODEL_CONFIG and hasattr(ms_cfg.MODEL_CONFIG, 'ALLOW_REASONING') and not ms_cfg.MODEL_CONFIG.ALLOW_REASONING:
957
+ enableReasoning = False
958
+ except Exception:
959
+ pass
960
+ # Decide whether file tools should be used for streaming variant
961
+ if request.enable_file_tool is not None:
962
+ file_tool_enabled = bool(request.enable_file_tool)
963
+ else:
964
+ auto_file_flag = request.auto_file_tool if request.auto_file_tool is not None else CONFIG.AUTO_ENABLE_TOOLS
965
+ file_tool_enabled = bool((request.file_ids and len(request.file_ids) > 0) or (auto_file_flag and request.file_ids))
966
+ if not getattr(CONFIG, 'ALLOW_FILE_TOOL_BY_DEFAULT', True) and request.enable_file_tool is None:
967
+ file_tool_enabled = False
968
+ try:
969
+ if request.sampler and getattr(request.sampler, 'ALLOW_FILE_TOOL', None) is not None:
970
+ file_tool_enabled = bool(request.sampler.ALLOW_FILE_TOOL)
971
+ elif hasattr(request, 'sampler_allow_file_tool') and request.sampler_allow_file_tool is not None:
972
+ file_tool_enabled = bool(request.sampler_allow_file_tool)
973
+ else:
974
+ ms2 = MODEL_STORAGE.get(request.model)
975
+ if ms2 and ms2.MODEL_CONFIG:
976
+ if hasattr(ms2.MODEL_CONFIG, 'DEFAULT_SAMPLER') and getattr(ms2.MODEL_CONFIG.DEFAULT_SAMPLER, 'ALLOW_FILE_TOOL', None) is not None:
977
+ file_tool_enabled = bool(ms2.MODEL_CONFIG.DEFAULT_SAMPLER.ALLOW_FILE_TOOL)
978
+ elif hasattr(ms2.MODEL_CONFIG, 'ALLOW_FILE_TOOL') and not ms2.MODEL_CONFIG.ALLOW_FILE_TOOL:
979
+ file_tool_enabled = False
980
+ except Exception:
981
+ pass
982
+ # Build final prompt after deciding enableReasoning
983
+ prompt = raw_prompt if request.prompt is not None else f"{cleanMessages(request.messages or [], enableReasoning)}\n\nAssistant:{' <think' if enableReasoning else ''}"
984
+
985
+ if tools_enabled and not request.tools:
986
+ if detection.get('detected_tools'):
987
+ request.tools = detection.get('detected_tools')
988
+ executed_tool_calls = []
989
  if request.tools:
990
  try:
991
  for tool in request.tools:
 
998
  search_top_k = int(args.get('top_k') or request.search_top_k or 3)
999
  search_str = web_search(search_q, search_top_k)
1000
  if search_str:
1001
+ search_res_struct = {"action": "web_search", "result": str(search_str), "metadata": {"query": search_q, "top_k": search_top_k, "confidence": 0.9}}
1002
+ executed_tool_calls.append({"name": "web_search", "args": {"query": search_q, "top_k": search_top_k}, "result": search_res_struct})
1003
+ prompt = (f"WebSearchResults:\n{search_res_struct.get('result')}\n\n" + prompt)
1004
  elif name == 'calc' or name == 'calculator':
1005
  from utils import calc
1006
 
1007
  expr = args.get('expression')
1008
  if expr:
1009
  calc_res = calc(expr)
1010
+ calc_res_struct = {"action": "calc", "result": str(calc_res), "metadata": {"expression": expr, "confidence": 0.98}}
1011
+ executed_tool_calls.append({"name": "calc", "args": {"expression": expr}, "result": calc_res_struct})
1012
+ prompt = (f"CalcResult:{expr} = {calc_res_struct.get('result')}\n\n" + prompt)
1013
+ elif name == 'universal':
1014
+ try:
1015
+ res = universal_tool(args or {"query": raw_prompt}, allow_web_search=bool(web_search_enabled), allow_tools=bool(tools_enabled), allow_file_tool=bool(file_tool_enabled))
1016
+ if isinstance(res, dict):
1017
+ result_text = res.get('result') if res.get('result') is not None else ''
1018
+ else:
1019
+ result_text = str(res)
1020
+ executed_tool_calls.append({"name": "universal", "args": args, "result": res})
1021
+ prompt = (f"ToolResults:\n{result_text}\n\n" + prompt)
1022
+ except Exception as e:
1023
+ logger.info(f"Universal tool execution error: {e}")
1024
  else:
1025
  logger.info(f"Unsupported tool requested: {name}")
1026
  except Exception as e:
1027
  logger.info(f"Tool processing error: {e}")
1028
+ elif request.web_search or web_search_enabled:
1029
  try:
1030
  from utils import web_search
1031
 
1032
  search_q = request.prompt if request.prompt else cleanMessages(request.messages or [])
1033
  search_res = web_search(search_q, int(request.search_top_k or 3))
1034
  if search_res:
1035
+ search_res_struct = {"action": "web_search", "result": str(search_res), "metadata": {"query": search_q, "top_k": int(request.search_top_k or 3), "confidence": 0.9}}
1036
+ executed_tool_calls.append({"name": "web_search", "args": {"query": search_q, "top_k": int(request.search_top_k or 3)}, "result": search_res_struct})
1037
+ prompt = f"WebSearchResults:\n{search_res_struct.get('result')}\n\n" + prompt
1038
  except Exception:
1039
  pass
1040
 
 
1045
  state_key = (request.model, request.state_name)
1046
  if state_key in STATE_STORE:
1047
  stored = STATE_STORE[state_key]
1048
+ model_state = stored.get('state', None)
1049
  model_tokens = stored.get('model_tokens', [0])
1050
+ if model_state is None:
1051
+ # Recompute out and model_state from tokens since we did not persist the torch state
1052
+ out, model_state = _recompute_out_and_state_from_tokens(request.model, model_tokens)
1053
+ else:
1054
+ out, _ = _recompute_out_and_state_from_tokens(request.model, model_tokens[-CONFIG.CHUNK_LEN :])
1055
  else:
1056
  out, model_tokens, model_state = await runPrefill(request, prompt, [0], model_state)
1057
  else:
 
1062
 
1063
  completionTokenCount = 0
1064
  finishReason = None
1065
+ # Limit how many tool calls the model can initiate during a single stream
1066
+ model_initiated_tool_calls = 0
1067
+ MODEL_MAX_TOOL_CALLS = 3
1068
 
1069
  response = ChatCompletionChunk(
1070
  id=completionId,
 
1099
  # Attach state_name in the initial chunk so client can save it to continue later
1100
  r_dict = response.model_dump()
1101
  r_dict['state_name'] = request.state_name
1102
+ # Attach executed tool_calls both at root for easy client metadata, and within the assistant message delta
1103
+ if executed_tool_calls:
1104
+ r_dict['tool_calls'] = executed_tool_calls
1105
+ try:
1106
+ if r_dict.get('choices') and len(r_dict['choices']) > 0 and r_dict['choices'][0].get('delta') is not None:
1107
+ r_dict['choices'][0]['delta']['tool_calls'] = executed_tool_calls
1108
+ except Exception:
1109
+ pass
1110
  yield f"data: {r_dict}\n\n"
1111
 
1112
  buffer = []
 
1273
  delta = ChatCompletionMessage(role="Assistant", content="", reasoning_content=None, tool_calls=None)
1274
  response.choices[0].delta = delta
1275
  if delta.content != None or delta.reasoning_content != None:
1276
+ # Save model state frequently (after each token) to allow resuming
1277
  try:
1278
  if request.state_name:
1279
  STATE_STORE[(request.model, request.state_name)] = {
1280
  'state': model_state,
1281
  'model_tokens': model_tokens,
1282
  }
1283
+ if getattr(CONFIG, 'STATE_STORE_SAVE_ON_UPDATE', False):
1284
+ try:
1285
+ _save_state_store_to_disk(force=True)
1286
+ except Exception:
1287
+ pass
1288
  except Exception:
1289
  pass
1290
+ # model-initiated tool call detection
1291
+ if model_initiated_tool_calls < MODEL_MAX_TOOL_CALLS:
1292
+ m = TOOL_CALL_RE.search(fullText)
1293
+ if m:
1294
+ try:
1295
+ payload_raw = m.group(1)
1296
+ import json
1297
+
1298
+ payload = json.loads(payload_raw)
1299
+ tool_name = payload.get('name')
1300
+ tool_args = payload.get('args', {})
1301
+ tool_res = None
1302
+ if tool_name == 'web_search':
1303
+ from utils import web_search
1304
+
1305
+ q = tool_args.get('query') or (request.prompt if request.prompt else cleanMessages(request.messages or []))
1306
+ k = int(tool_args.get('top_k') or request.search_top_k or 3)
1307
+ tool_res = web_search(q, k)
1308
+ elif tool_name in ('calc', 'calculator'):
1309
+ from utils import calc
1310
+
1311
+ expr = tool_args.get('expression')
1312
+ if expr:
1313
+ tool_res = calc(expr)
1314
+ else:
1315
+ try:
1316
+ tool_res = universal_tool({'query': tool_args.get('query') or payload.get('query') or ''}, allow_web_search=bool(web_search_enabled), allow_tools=bool(tools_enabled), allow_file_tool=bool(file_tool_enabled))
1317
+ except Exception:
1318
+ tool_res = None
1319
+
1320
+ if tool_res:
1321
+ # Normalize tool_res into a structured dict if needed
1322
+ if not isinstance(tool_res, dict):
1323
+ if tool_name in ('calc', 'calculator'):
1324
+ tool_res_struct = {"action": "calc", "result": str(tool_res), "metadata": {"expression": tool_args.get('expression'), "confidence": 0.98}}
1325
+ elif tool_name == 'web_search':
1326
+ tool_res_struct = {"action": "web_search", "result": str(tool_res), "metadata": {"query": tool_args.get('query'), "top_k": tool_args.get('top_k') or request.search_top_k or 3, "confidence": 0.9}}
1327
+ else:
1328
+ tool_res_struct = {"action": tool_name, "result": str(tool_res), "metadata": {"confidence": 0.6}}
1329
+ else:
1330
+ tool_res_struct = tool_res
1331
+ exec_entry = {"name": tool_name, "args": tool_args, "result": tool_res_struct, 'initiated_by_model': True}
1332
+ executed_tool_calls.append(exec_entry)
1333
+ delta_text = f"ToolResults:\n{tool_res_struct.get('result')}\n\n"
1334
+ prompt = delta_text + prompt
1335
+ fullText = TOOL_CALL_RE.sub('', fullText)
1336
+ buffer = [fullText]
1337
+ out, model_tokens, model_state = await runPrefill(request, delta_text, model_tokens, model_state)
1338
+ model_initiated_tool_calls += 1
1339
+ should_restart = True
1340
+ break
1341
+ except Exception as e:
1342
+ logger.info(f"Model-initiated tool handling error: {e}")
1343
  yield f"data: {response.model_dump_json()}\n\n"
1344
  # check stop sequences and stop streaming if we see them
1345
  for stop_words in request.stop or []:
 
1351
 
1352
  del streamConfig
1353
  else:
1354
+ should_restart = True
1355
+ while should_restart:
1356
+ should_restart = False
1357
+ gen = generate(request, out, model_tokens, model_state)
1358
+ for chunk in gen:
1359
+ completionTokenCount += 1
1360
+ buffer.append(chunk["content"])
1361
 
1362
+ if chunk["finish_reason"]:
1363
+ finishReason = chunk["finish_reason"]
1364
 
1365
+ # Save model state frequently (after each token) to allow resuming
1366
+ try:
1367
+ if request.state_name:
1368
+ STATE_STORE[(request.model, request.state_name)] = {
1369
+ 'state': model_state,
1370
+ 'model_tokens': model_tokens,
1371
+ }
1372
+ if getattr(CONFIG, 'STATE_STORE_SAVE_ON_UPDATE', False):
1373
+ try:
1374
+ _save_state_store_to_disk(force=True)
1375
+ except Exception:
1376
+ pass
1377
+ except Exception:
1378
+ pass
 
 
 
 
 
 
 
 
 
1379
 
1380
+ # Detect model-initiated tool calls
1381
+ if model_initiated_tool_calls < MODEL_MAX_TOOL_CALLS:
1382
+ fullText = ''.join(buffer)
1383
+ m = TOOL_CALL_RE.search(fullText)
1384
+ if m:
1385
+ try:
1386
+ payload_raw = m.group(1)
1387
+ import json
1388
+
1389
+ payload = json.loads(payload_raw)
1390
+ tool_name = payload.get('name')
1391
+ tool_args = payload.get('args', {})
1392
+ tool_res = None
1393
+ if tool_name == 'web_search':
1394
+ from utils import web_search
1395
+
1396
+ q = tool_args.get('query') or (request.prompt if request.prompt else cleanMessages(request.messages or []))
1397
+ k = int(tool_args.get('top_k') or request.search_top_k or 3)
1398
+ tool_res = web_search(q, k)
1399
+ elif tool_name in ('calc', 'calculator'):
1400
+ from utils import calc
1401
+
1402
+ expr = tool_args.get('expression')
1403
+ if expr:
1404
+ tool_res = calc(expr)
1405
+ else:
1406
+ try:
1407
+ tool_res = universal_tool({'query': tool_args.get('query') or payload.get('query') or ''}, allow_web_search=bool(web_search_enabled), allow_tools=bool(tools_enabled), allow_file_tool=bool(file_tool_enabled))
1408
+ except Exception:
1409
+ tool_res = None
1410
+
1411
+ if tool_res:
1412
+ if not isinstance(tool_res, dict):
1413
+ if tool_name in ('calc', 'calculator'):
1414
+ tool_res_struct = {"action": "calc", "result": str(tool_res), "metadata": {"expression": tool_args.get('expression'), "confidence": 0.98}}
1415
+ elif tool_name == 'web_search':
1416
+ tool_res_struct = {"action": "web_search", "result": str(tool_res), "metadata": {"query": tool_args.get('query'), "top_k": tool_args.get('top_k') or request.search_top_k or 3, "confidence": 0.9}}
1417
+ else:
1418
+ tool_res_struct = {"action": tool_name, "result": str(tool_res), "metadata": {"confidence": 0.6}}
1419
+ else:
1420
+ tool_res_struct = tool_res
1421
+ exec_entry = {"name": tool_name, "args": tool_args, "result": tool_res_struct, 'initiated_by_model': True}
1422
+ executed_tool_calls.append(exec_entry)
1423
+ delta_text = f"ToolResults:\n{tool_res_struct.get('result')}\n\n"
1424
+ prompt = delta_text + prompt
1425
+ fullText = TOOL_CALL_RE.sub('', fullText)
1426
+ buffer = [fullText]
1427
+ out, model_tokens, model_state = await runPrefill(request, delta_text, model_tokens, model_state)
1428
+ # Notify client that a tool was called mid-stream (metadata-only chunk)
1429
+ try:
1430
+ meta_resp = ChatCompletionChunk(
1431
+ id=completionId,
1432
+ created=createTimestamp,
1433
+ model=request.model,
1434
+ usage=(
1435
+ Usage(
1436
+ prompt_tokens=promptTokenCount,
1437
+ completion_tokens=completionTokenCount,
1438
+ total_tokens=promptTokenCount + completionTokenCount,
1439
+ prompt_tokens_details=PromptTokensDetails(cached_tokens=0),
1440
+ )
1441
+ if request.include_usage
1442
+ else None
1443
+ ),
1444
+ choices=[
1445
+ ChatCompletionChoice(
1446
+ index=0,
1447
+ delta=ChatCompletionMessage(role="Assistant", content=None, reasoning_content=None, tool_calls=executed_tool_calls),
1448
+ logprobs=None,
1449
+ finish_reason=None,
1450
+ )
1451
+ ],
1452
+ )
1453
+ yield f"data: {meta_resp.model_dump_json()}\n\n"
1454
+ except Exception:
1455
+ pass
1456
+ model_initiated_tool_calls += 1
1457
+ should_restart = True
1458
+ break
1459
+ except Exception as e:
1460
+ logger.info(f"Model-initiated tool handling error: {e}")
1461
+
1462
+ response = ChatCompletionChunk(
1463
+ id=completionId,
1464
+ created=createTimestamp,
1465
+ model=request.model,
1466
+ usage=(
1467
+ Usage(
1468
+ prompt_tokens=promptTokenCount,
1469
+ completion_tokens=completionTokenCount,
1470
+ total_tokens=promptTokenCount + completionTokenCount,
1471
+ prompt_tokens_details=PromptTokensDetails(cached_tokens=0),
1472
+ )
1473
+ if request.include_usage
1474
+ else None
1475
+ ),
1476
+ choices=[
1477
+ ChatCompletionChoice(
1478
+ index=0,
1479
+ delta=ChatCompletionMessage(role="Assistant", content=chunk["content"], reasoning_content=None, tool_calls=None),
1480
+ logprobs=None,
1481
+ finish_reason=finishReason,
1482
+ )
1483
+ ],
1484
+ )
1485
+ yield f"data: {response.model_dump_json()}\n\n"
1486
+ await asyncio.sleep(0)
1487
 
1488
  genenrateTime = time.time()
1489
 
 
1518
  completionId = str(next(CompletionIdGenerator))
1519
  logger.info(f"[REQ] {completionId} - {request.model_dump()}")
1520
 
1521
+ # Support model suffixes like ':thinking' for reasoning or ':web' to request
1522
+ # web search by default for this request. E.g., 'rwkv-latest:web' will enable web_search.
1523
  modelName = request.model.split(":")[0]
1524
+ if ":web" in request.model:
1525
+ request.enable_web_search = True
1526
+ if ":file" in request.model:
1527
+ request.enable_file_tool = True
1528
  enableReasoning = ":thinking" in request.model
1529
 
1530
  if "rwkv-latest" in request.model:
 
1565
  model_tokens_for_resume = stored.get('model_tokens', [0])
1566
  request_dict = request.model_dump()
1567
 
1568
+ # Apply defaults from model's DEFAULT_SAMPLER, optionally overridden by the
1569
+ # per-request `sampler` object (or legacy sampler_allow_* booleans).
1570
+ sampler_overrides = request_dict.get('sampler') or {}
1571
  for k, v in defaultSamplerConfig.model_dump().items():
1572
+ # If the request provided a sampler override for this field, use it
1573
+ if sampler_overrides and k in sampler_overrides and sampler_overrides.get(k) is not None:
1574
+ request_dict[k] = sampler_overrides.get(k)
1575
+ continue
1576
  if k in request_dict and request_dict[k] is None:
1577
  request_dict[k] = v
1578
  realRequest = ChatCompletionRequest(**request_dict)
1579
+ # Ensure stream defaults to configuration value when not explicitly provided
1580
+ if realRequest.stream is None:
1581
+ realRequest.stream = CONFIG.DEFAULT_STREAM
1582
 
1583
  logger.info(f"[REQ] {completionId} - Real - {request.model_dump()}")
1584
 
1585
+ if realRequest.stream:
1586
  r = StreamingResponse(
1587
  chatResponseStream(realRequest, model_state, completionId, enableReasoning),
1588
  media_type="text/event-stream",
 
1604
  return r
1605
 
1606
 
1607
+ # We keep the service API-only; remove static mount for demo frontend to
1608
+ # avoid serving HTML files by default and keep the repository Python-only.
1609
+ logger.info("Static frontend mount removed for Python-only deploy; use API endpoints for integration")
1610
+
1611
+
1612
+ @app.get('/api/v1/models')
1613
+ def list_models():
1614
+ """Return model configuration summary for clients/UI.
1615
+
1616
+ This endpoint returns configured models, their default sampler values, and
1617
+ ALLOW_* flags so UI clients can build a controls surface based on server
1618
+ capabilities (web search, tools, reasoning).
1619
+ """
1620
+ out = []
1621
+ root_defaults = {
1622
+ 'ALLOW_FILE_TOOL_BY_DEFAULT': getattr(CONFIG, 'ALLOW_FILE_TOOL_BY_DEFAULT', True),
1623
+ 'ENABLE_WEB_SEARCH_BY_DEFAULT': getattr(CONFIG, 'ENABLE_WEB_SEARCH_BY_DEFAULT', True),
1624
+ 'ENABLE_REASONING_BY_DEFAULT': getattr(CONFIG, 'ENABLE_REASONING_BY_DEFAULT', True),
1625
+ 'SHOW_WEB_SEARCH_BUTTON_BY_DEFAULT': getattr(CONFIG, 'SHOW_WEB_SEARCH_BUTTON_BY_DEFAULT', True),
1626
+ 'SHOW_FILE_UPLOAD_BUTTON_BY_DEFAULT': getattr(CONFIG, 'SHOW_FILE_UPLOAD_BUTTON_BY_DEFAULT', True),
1627
+ 'SHOW_REASONING_TOGGLE_BY_DEFAULT': getattr(CONFIG, 'SHOW_REASONING_TOGGLE_BY_DEFAULT', True),
1628
+ 'UPLOAD_URL': '/api/v1/files',
1629
+ }
1630
+ for m in CONFIG.MODELS:
1631
+ out.append(
1632
+ {
1633
+ 'SERVICE_NAME': m.SERVICE_NAME,
1634
+ 'DEFAULT_CHAT': m.DEFAULT_CHAT,
1635
+ 'DEFAULT_REASONING': m.DEFAULT_REASONING,
1636
+ 'ALLOW_WEB_SEARCH': getattr(m, 'ALLOW_WEB_SEARCH', True),
1637
+ 'ALLOW_TOOLS': getattr(m, 'ALLOW_TOOLS', True),
1638
+ 'ALLOW_REASONING': getattr(m, 'ALLOW_REASONING', True),
1639
+ 'ALLOW_FILE_TOOL': getattr(m, 'ALLOW_FILE_TOOL', True),
1640
+ 'SHOW_WEB_SEARCH_BUTTON': getattr(m, 'SHOW_WEB_SEARCH_BUTTON', True),
1641
+ 'SHOW_FILE_UPLOAD_BUTTON': getattr(m, 'SHOW_FILE_UPLOAD_BUTTON', True),
1642
+ 'SHOW_REASONING_TOGGLE': getattr(m, 'SHOW_REASONING_TOGGLE', True),
1643
+ 'DEFAULT_SAMPLER': m.DEFAULT_SAMPLER.model_dump() if hasattr(m, 'DEFAULT_SAMPLER') else None,
1644
+ # Convenience info for clients: upload endpoint and root defaults
1645
+ 'UPLOAD_URL': '/api/v1/files',
1646
+ 'UPLOAD_ALLOWED_BY_DEFAULT': getattr(CONFIG, 'ALLOW_FILE_TOOL_BY_DEFAULT', True),
1647
+ }
1648
+ )
1649
+ return {'root_defaults': root_defaults, 'models': out}
1650
+
1651
+
1652
+ @app.post('/api/v1/files', response_model=FileUploadResponse)
1653
+ async def upload_file(file: UploadFile = File(...), model: Optional[str] = None):
1654
+ """Save uploaded file to CONFIG.UPLOAD_DIR and return metadata."""
1655
+ try:
1656
+ # Respect root-level upload toggle
1657
+ if not getattr(CONFIG, 'ALLOW_FILE_TOOL_BY_DEFAULT', True):
1658
+ raise HTTPException(403, 'File uploads are disabled by server configuration')
1659
+ # If a model is provided, verify the model allows file tools
1660
+ if model:
1661
+ if model not in MODEL_STORAGE:
1662
+ raise HTTPException(404, f"Model {model} not found")
1663
+ ms = MODEL_STORAGE[model]
1664
+ if ms and ms.MODEL_CONFIG and not getattr(ms.MODEL_CONFIG, 'ALLOW_FILE_TOOL', True):
1665
+ raise HTTPException(403, f"Model {model} does not allow file uploads")
1666
+ from utils import save_bytes_to_upload
1667
+
1668
+ content = await file.read()
1669
+ fname = file.filename if getattr(file, 'filename', None) else 'uploaded_file'
1670
+ meta = save_bytes_to_upload(fname, content)
1671
+ if meta.get('error'):
1672
+ raise HTTPException(500, f"Could not save file: {meta.get('error')}")
1673
+ UPLOADED_FILES[meta['file_id']] = meta
1674
+ return FileUploadResponse(success=True, file=UploadedFile(**meta))
1675
+ except Exception as e:
1676
+ raise HTTPException(500, str(e))
1677
+
1678
+
1679
+ @app.get('/api/v1/files')
1680
+ def list_files():
1681
+ return [UploadedFile(**v).model_dump() for v in UPLOADED_FILES.values()]
1682
+
1683
+
1684
+ @app.get('/api/v1/files/{file_id}')
1685
+ def get_file(file_id: str, download: bool = False):
1686
+ if file_id not in UPLOADED_FILES:
1687
+ raise HTTPException(404, 'File not found')
1688
+ meta = UPLOADED_FILES[file_id]
1689
+ if download:
1690
+ # return file contents
1691
+ try:
1692
+ with open(meta['path'], 'rb') as f:
1693
+ return StreamingResponse(f, media_type='application/octet-stream')
1694
+ except Exception as e:
1695
+ raise HTTPException(500, str(e))
1696
+ return UploadedFile(**meta)
1697
+
1698
+
1699
+ @app.delete('/api/v1/files/{file_id}')
1700
+ def delete_file(file_id: str):
1701
+ if file_id not in UPLOADED_FILES:
1702
+ raise HTTPException(404, 'File not found')
1703
+ meta = UPLOADED_FILES.pop(file_id)
1704
+ try:
1705
+ if os.path.exists(meta['path']):
1706
+ os.remove(meta['path'])
1707
+ except Exception:
1708
+ pass
1709
+ return {'success': True}
1710
 
1711
  if __name__ == "__main__":
1712
  import uvicorn
config.local.yaml CHANGED
@@ -22,3 +22,6 @@ MODELS:
22
  - "\n\n"
23
  stop_tokens:
24
  - 0
 
 
 
 
22
  - "\n\n"
23
  stop_tokens:
24
  - 0
25
+ ALLOW_WEB_SEARCH: True
26
+ ALLOW_TOOLS: True
27
+ ALLOW_REASONING: True
config.production-modelscope.yaml CHANGED
@@ -3,6 +3,11 @@ PORT: 7860
3
  STRATEGY: "cuda fp16"
4
  RWKV_CUDA_ON: True
5
  CHUNK_LEN: 256
 
 
 
 
 
6
  MODELS:
7
  - SERVICE_NAME: "rwkv7-g1a-0.1b-20250728-ctx4096"
8
  DOWNLOAD_MODEL_FILE_NAME: "rwkv7-g1a-0.1b-20250728-ctx4096.pth"
@@ -22,3 +27,6 @@ MODELS:
22
  - "\n\n"
23
  stop_tokens:
24
  - 0
 
 
 
 
3
  STRATEGY: "cuda fp16"
4
  RWKV_CUDA_ON: True
5
  CHUNK_LEN: 256
6
+ DEFAULT_STREAM: True
7
+ AUTO_ENABLE_TOOLS: True
8
+ AUTO_ENABLE_REASONING: True
9
+ AUTO_ENABLE_WEB_SEARCH: True
10
+ ENABLE_TOOLS_BY_DEFAULT: False
11
  MODELS:
12
  - SERVICE_NAME: "rwkv7-g1a-0.1b-20250728-ctx4096"
13
  DOWNLOAD_MODEL_FILE_NAME: "rwkv7-g1a-0.1b-20250728-ctx4096.pth"
 
27
  - "\n\n"
28
  stop_tokens:
29
  - 0
30
+ ALLOW_WEB_SEARCH: True
31
+ ALLOW_TOOLS: True
32
+ ALLOW_REASONING: True
config.production.yaml CHANGED
@@ -3,6 +3,11 @@ PORT: 7860
3
  STRATEGY: "cuda fp16"
4
  RWKV_CUDA_ON: True
5
  CHUNK_LEN: 256
 
 
 
 
 
6
  MODELS:
7
  - SERVICE_NAME: "rwkv7-g1a-0.1b-20250728-ctx4096"
8
  DOWNLOAD_MODEL_FILE_NAME: "rwkv7-g1a-0.1b-20250728-ctx4096.pth"
@@ -11,6 +16,9 @@ MODELS:
11
  REASONING: True
12
  DEFAULT_CHAT: True
13
  DEFAULT_REASONING: True
 
 
 
14
  DEFAULT_SAMPLER:
15
  max_tokens: 4096
16
  temperature: 1.0
@@ -21,4 +29,10 @@ MODELS:
21
  stop:
22
  - "\n\n"
23
  stop_tokens:
24
- - 0
 
 
 
 
 
 
 
3
  STRATEGY: "cuda fp16"
4
  RWKV_CUDA_ON: True
5
  CHUNK_LEN: 256
6
+ DEFAULT_STREAM: True
7
+ AUTO_ENABLE_TOOLS: True
8
+ AUTO_ENABLE_REASONING: True
9
+ AUTO_ENABLE_WEB_SEARCH: True
10
+ ENABLE_TOOLS_BY_DEFAULT: False
11
  MODELS:
12
  - SERVICE_NAME: "rwkv7-g1a-0.1b-20250728-ctx4096"
13
  DOWNLOAD_MODEL_FILE_NAME: "rwkv7-g1a-0.1b-20250728-ctx4096.pth"
 
16
  REASONING: True
17
  DEFAULT_CHAT: True
18
  DEFAULT_REASONING: True
19
+ ALLOW_WEB_SEARCH: True
20
+ ALLOW_TOOLS: True
21
+ ALLOW_REASONING: True
22
  DEFAULT_SAMPLER:
23
  max_tokens: 4096
24
  temperature: 1.0
 
29
  stop:
30
  - "\n\n"
31
  stop_tokens:
32
+ - 0
33
+ ALLOW_WEB_SEARCH: True
34
+ ALLOW_TOOLS: True
35
+ ALLOW_REASONING: True
36
+ STATE_STORE_PATH: "./state_store.json"
37
+ STATE_STORE_FLUSH_INTERVAL: 5
38
+ STATE_STORE_SAVE_ON_UPDATE: True
config.py CHANGED
@@ -1,5 +1,4 @@
1
  from pydantic import BaseModel, Field
2
- from typing import List, Optional
3
  from typing import List, Optional, Union, Any
4
 
5
  import sys
@@ -12,7 +11,7 @@ class CliConfig(BaseSettings, cli_parse_args=True, cli_use_class_docs_for_groups
12
  CONFIG_FILE: str = Field("./config.local.yaml", description="Config file path")
13
 
14
 
15
- CLI_CONFIG = CliConfig()
16
 
17
 
18
  class SamplerConfig(BaseModel):
@@ -26,6 +25,15 @@ class SamplerConfig(BaseModel):
26
  penalty_decay: float = Field(0.996, description="Penalty decay factor.")
27
  stop: List[str] = Field(["\n\n"], description="List of stop sequences.")
28
  stop_tokens: List[int] = Field([0], description="List of stop tokens.")
 
 
 
 
 
 
 
 
 
29
 
30
 
31
  class ModelConfig(BaseModel):
@@ -52,26 +60,66 @@ class ModelConfig(BaseModel):
52
  DEFAULT_CHAT: bool = Field(False, description="Whether this model is the default chat model.")
53
  DEFAULT_REASONING: bool = Field(False, description="Whether this model is the default reasoning model.")
54
  DEFAULT_SAMPLER: SamplerConfig = Field(
55
- SamplerConfig(), description="Default sampler configuration for this model."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
  )
57
  VOCAB: str = Field("rwkv_vocab_v20230424", description="Vocab Name")
 
 
 
 
 
 
 
 
 
58
 
59
 
60
  class RootConfig(BaseModel):
61
  """Root configuration for the RWKV service."""
62
 
63
- HOST: Optional[str] = Field(
64
- "127.0.0.1", description="Host IP address to bind to."
65
- ) # 注释掉可选的HOST和PORT
66
- PORT: Optional[int] = Field(
67
- 8000, description="Port number to listen on."
68
- ) # 因为YAML示例中被注释掉了
69
- STRATEGY: str = Field(
70
- "cpu", description="Strategy for model execution (e.g., 'cuda fp16')."
71
- )
72
  RWKV_CUDA_ON: bool = Field(False, description="Whether to enable RWKV CUDA kernel.")
73
  CHUNK_LEN: int = Field(256, description="Chunk length for processing.")
74
  MODELS: List[ModelConfig] = Field(..., description="List of model configurations.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
 
76
 
77
  import yaml
@@ -81,4 +129,5 @@ try:
81
  CONFIG = RootConfig.model_validate(yaml.safe_load(f.read()))
82
  except Exception as e:
83
  print(f"Pydantic Model Validation Failed: {e}")
84
- sys.exit(0)
 
 
1
  from pydantic import BaseModel, Field
 
2
  from typing import List, Optional, Union, Any
3
 
4
  import sys
 
11
  CONFIG_FILE: str = Field("./config.local.yaml", description="Config file path")
12
 
13
 
14
+ CLI_CONFIG = CliConfig(CONFIG_FILE="./config.local.yaml")
15
 
16
 
17
  class SamplerConfig(BaseModel):
 
25
  penalty_decay: float = Field(0.996, description="Penalty decay factor.")
26
  stop: List[str] = Field(["\n\n"], description="List of stop sequences.")
27
  stop_tokens: List[int] = Field([0], description="List of stop tokens.")
28
+ ALLOW_WEB_SEARCH: Optional[bool] = Field(None, description="Per-sampler override for allowing web search. If None, falls back to model/global.")
29
+ ALLOW_FILE_TOOL: Optional[bool] = Field(None, description="Per-sampler override for allowing file tools (e.g., file_read). If None, falls back to model/global.")
30
+ ALLOW_TOOLS: Optional[bool] = Field(None, description="Per-sampler override for allowing server-side tools. If None, falls back to model/global.")
31
+ ALLOW_REASONING: Optional[bool] = Field(None, description="Per-sampler override for allowing built-in reasoning. If None, falls back to model/global.")
32
+ # UI flags (non-functional in server, included so UI clients can show controls)
33
+ SHOW_WEB_SEARCH_BUTTON: Optional[bool] = Field(None, description="Whether to show the web-search toggle in the client UI for this sampler")
34
+ SHOW_FILE_UPLOAD_BUTTON: Optional[bool] = Field(None, description="Whether to show the file-upload control in the client UI for this sampler")
35
+ SHOW_REASONING_TOGGLE: Optional[bool] = Field(None, description="Whether to show the reasoning (think) toggle in the client UI for this sampler")
36
+ UI_STYLE: Optional[str] = Field(None, description="UI style hint that clients may use to render controls (example: 'whatsapp' or 'compact')")
37
 
38
 
39
  class ModelConfig(BaseModel):
 
60
  DEFAULT_CHAT: bool = Field(False, description="Whether this model is the default chat model.")
61
  DEFAULT_REASONING: bool = Field(False, description="Whether this model is the default reasoning model.")
62
  DEFAULT_SAMPLER: SamplerConfig = Field(
63
+ SamplerConfig(
64
+ max_tokens=512,
65
+ temperature=1.0,
66
+ top_p=0.3,
67
+ presence_penalty=0.5,
68
+ count_penalty=0.5,
69
+ penalty_decay=0.996,
70
+ stop=["\n\n"],
71
+ stop_tokens=[0],
72
+ ALLOW_WEB_SEARCH=None,
73
+ ALLOW_TOOLS=None,
74
+ ALLOW_REASONING=None,
75
+ ALLOW_FILE_TOOL=None,
76
+ SHOW_WEB_SEARCH_BUTTON=None,
77
+ SHOW_FILE_UPLOAD_BUTTON=None,
78
+ SHOW_REASONING_TOGGLE=None,
79
+ UI_STYLE=None,
80
+ ),
81
+ description="Default sampler configuration for this model."
82
  )
83
  VOCAB: str = Field("rwkv_vocab_v20230424", description="Vocab Name")
84
+ # Allow or disallow server-side features on a per-model basis
85
+ ALLOW_WEB_SEARCH: bool = Field(True, description="Whether this model supports web search injection")
86
+ ALLOW_TOOLS: bool = Field(True, description="Whether this model supports server-side tools execution")
87
+ ALLOW_REASONING: bool = Field(True, description="Whether this model supports built-in reasoning (in-process)")
88
+ ALLOW_FILE_TOOL: bool = Field(True, description="Whether this model supports file-based tools (file_upload/file_read)")
89
+ # UI flags for the model that the client may use to show/hide controls
90
+ SHOW_WEB_SEARCH_BUTTON: bool = Field(True, description="Whether to show the web search toggle for this model in client UIs")
91
+ SHOW_FILE_UPLOAD_BUTTON: bool = Field(True, description="Whether to show a file upload button for this model in client UIs")
92
+ SHOW_REASONING_TOGGLE: bool = Field(True, description="Whether to show the reasoning toggle for this model in client UIs")
93
 
94
 
95
  class RootConfig(BaseModel):
96
  """Root configuration for the RWKV service."""
97
 
98
+ HOST: Optional[str] = Field("127.0.0.1", description="Host IP address to bind to.")
99
+ PORT: Optional[int] = Field(8000, description="Port number to listen on.")
100
+ STRATEGY: str = Field("cpu", description="Strategy for model execution (e.g., 'cuda fp16').")
 
 
 
 
 
 
101
  RWKV_CUDA_ON: bool = Field(False, description="Whether to enable RWKV CUDA kernel.")
102
  CHUNK_LEN: int = Field(256, description="Chunk length for processing.")
103
  MODELS: List[ModelConfig] = Field(..., description="List of model configurations.")
104
+ # Additional defaults for auto behavior
105
+ DEFAULT_STREAM: bool = Field(True, description="Whether streaming is enabled by default")
106
+ AUTO_ENABLE_TOOLS: bool = Field(True, description="Whether to try auto-enabling tools based on intent")
107
+ AUTO_ENABLE_REASONING: bool = Field(True, description="Whether to auto-enable reasoning when needed")
108
+ AUTO_ENABLE_WEB_SEARCH: bool = Field(True, description="Whether to auto-enable web search based on intent")
109
+ ENABLE_TOOLS_BY_DEFAULT: bool = Field(False, description="Whether tools are enabled by default (without explicit request)")
110
+ ENABLE_WEB_SEARCH_BY_DEFAULT: bool = Field(True, description="Whether web search is enabled by default")
111
+ ENABLE_REASONING_BY_DEFAULT: bool = Field(True, description="Whether model reasoning is enabled by default when requested/supported")
112
+ # State store persistence
113
+ STATE_STORE_PATH: str = Field("./state_store.json", description="Path to persist streaming/resume state store")
114
+ STATE_STORE_FLUSH_INTERVAL: int = Field(5, description="Seconds between background flushes to the state store file")
115
+ STATE_STORE_SAVE_ON_UPDATE: bool = Field(True, description="Whether to save the state store to disk immediately when updated")
116
+ # File uploads / tools
117
+ UPLOAD_DIR: str = Field("./uploads", description="Directory to store uploaded files")
118
+ ALLOW_FILE_TOOL_BY_DEFAULT: bool = Field(True, description="Whether file-based tools are enabled by default")
119
+ # UI flags for the root server. These flags are advisory only and do not enable functionality.
120
+ SHOW_WEB_SEARCH_BUTTON_BY_DEFAULT: bool = Field(True, description="Whether to show web search toggle by default in clients")
121
+ SHOW_FILE_UPLOAD_BUTTON_BY_DEFAULT: bool = Field(True, description="Whether to show file-upload control by default in clients")
122
+ SHOW_REASONING_TOGGLE_BY_DEFAULT: bool = Field(True, description="Whether to show reasoning toggle by default in clients")
123
 
124
 
125
  import yaml
 
129
  CONFIG = RootConfig.model_validate(yaml.safe_load(f.read()))
130
  except Exception as e:
131
  print(f"Pydantic Model Validation Failed: {e}")
132
+ # Exit with non-zero to indicate error when config is invalid
133
+ sys.exit(1)
models/.cache/huggingface/download/rwkv7-g1a-0.1b-20250728-ctx4096.pth.metadata CHANGED
@@ -1,3 +1,3 @@
1
  8c8cdf8c605dc7dfdccb676b9d0c482ba002f710
2
  964f01cc4673273bbcf1b9c3cdc243d58af97bffeab51cb20c752eeaf048a3c6
3
- 1763947179.4323187
 
1
  8c8cdf8c605dc7dfdccb676b9d0c482ba002f710
2
  964f01cc4673273bbcf1b9c3cdc243d58af97bffeab51cb20c752eeaf048a3c6
3
+ 1763950644.4308126
tests/api_test.py CHANGED
@@ -83,3 +83,14 @@ except Exception as e:
83
  print('Error in streaming request:', e)
84
 
85
  print('\nDone tests')
 
 
 
 
 
 
 
 
 
 
 
 
83
  print('Error in streaming request:', e)
84
 
85
  print('\nDone tests')
86
+
87
+ print('\nChecking model listing endpoint')
88
+ try:
89
+ r = requests.get('http://127.0.0.1:7860/api/v1/models', timeout=10)
90
+ print('Models endpoint status', r.status_code)
91
+ try:
92
+ print(json.dumps(r.json(), indent=2))
93
+ except Exception:
94
+ print('Models endpoint returned non-JSON:', r.text[:200])
95
+ except Exception as e:
96
+ print('Error calling models endpoint:', e)
tests/run_api_single_request.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, sys
2
+ sys.path.append(os.getcwd())
3
+ from app import chat_completions, ChatCompletionRequest
4
+ import asyncio
5
+
6
+ async def main():
7
+ req = ChatCompletionRequest(model='rwkv-latest', prompt='Who is the current president of France?', stream=False, max_tokens=32, temperature=0.2, include_usage=True, web_search=None, auto_web_search=True)
8
+ res = await chat_completions(req)
9
+ print(res)
10
+
11
+ if __name__ == '__main__':
12
+ asyncio.run(main())
tests/run_autodetect_flags.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, sys
2
+ sys.path.append(os.getcwd())
3
+ from app import ChatCompletionRequest
4
+ from utils import detect_tools_and_reasoning
5
+ from config import CONFIG
6
+
7
+ # convenience function to compute flags, basically copying the logic we used in chatResponse
8
+
9
+ def compute_flags(req: ChatCompletionRequest):
10
+ prompt = req.prompt if req.prompt else (req.messages and '\n\n'.join([m.content for m in req.messages]) or '')
11
+ detection = detect_tools_and_reasoning(prompt)
12
+
13
+ web_search_enabled = (
14
+ True
15
+ if (req.enable_web_search is not None and req.enable_web_search)
16
+ else (
17
+ req.web_search
18
+ or (req.auto_web_search if req.auto_web_search is not None else CONFIG.AUTO_ENABLE_WEB_SEARCH and detection.get('need_web_search'))
19
+ )
20
+ )
21
+
22
+ if req.enable_tools is not None:
23
+ tools_enabled = bool(req.enable_tools)
24
+ else:
25
+ auto_tools_flag = req.auto_tools if req.auto_tools is not None else CONFIG.AUTO_ENABLE_TOOLS
26
+ tools_enabled = bool(req.tools) or CONFIG.ENABLE_TOOLS_BY_DEFAULT or (auto_tools_flag and (detection.get('need_calc') or detection.get('need_web_search')))
27
+
28
+ if req.enable_reasoning is not None:
29
+ reasoning_enabled = bool(req.enable_reasoning)
30
+ else:
31
+ reasoning_enabled = False
32
+
33
+ return {
34
+ 'detection': detection,
35
+ 'web_search_enabled': web_search_enabled,
36
+ 'tools_enabled': tools_enabled,
37
+ 'reasoning_enabled': reasoning_enabled,
38
+ }
39
+
40
+ # test cases
41
+ cases = [
42
+ ChatCompletionRequest(model='rwkv-latest', prompt='Who is the current president of France?', stream=None, auto_web_search=True, auto_tools=None, auto_reasoning=None),
43
+ ChatCompletionRequest(model='rwkv-latest', prompt='Calculate 2+3*4 for me', stream=None, auto_web_search=True, auto_tools=True, auto_reasoning=None),
44
+ ChatCompletionRequest(model='rwkv-latest', prompt='Explain why the sky is blue', stream=None, auto_web_search=False, auto_tools=None, auto_reasoning=True),
45
+ ]
46
+
47
+ for c in cases:
48
+ print('---')
49
+ print(c.prompt)
50
+ print(compute_flags(c))
tests/run_chat_response.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, sys, asyncio
2
+ sys.path.append(os.getcwd())
3
+ from app import chatResponse, ChatCompletionRequest
4
+
5
+ async def test():
6
+ req = ChatCompletionRequest(model='rwkv-latest', prompt='Who is the president of France today?', stream=False, max_tokens=2, temperature=0.2, include_usage=True, auto_web_search=True)
7
+ res = await chatResponse(req, model_state=None, completionId='test123', enableReasoning=False)
8
+ print(res.model_dump())
9
+
10
+ if __name__ == '__main__':
11
+ asyncio.run(test())
tests/run_chat_response_out.txt ADDED
Binary file (7.31 kB). View file
 
tests/run_detect.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ import sys, os
2
+ sys.path.append(os.getcwd())
3
+ from utils import detect_tools_and_reasoning
4
+
5
+ print(detect_tools_and_reasoning('Who is the president of France today?'))
6
+ print(detect_tools_and_reasoning('Calculate 2+3*4 for me'))
7
+ print(detect_tools_and_reasoning('Explain why the sky is blue'))
tests/run_injected_tools.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, sys
2
+ sys.path.append(os.getcwd())
3
+ from app import ChatCompletionRequest
4
+ from utils import detect_tools_and_reasoning
5
+ from config import CONFIG
6
+ from pprint import pprint
7
+
8
+
9
+ def emulate_injection(req: ChatCompletionRequest):
10
+ raw_prompt = req.prompt.strip() if req.prompt is not None else '\n\n'.join([m.content for m in req.messages]) if req.messages else ''
11
+ detection = detect_tools_and_reasoning(raw_prompt)
12
+
13
+ # compute web_search_enabled
14
+ web_search_enabled = (
15
+ True
16
+ if (req.enable_web_search is not None and req.enable_web_search)
17
+ else (
18
+ req.web_search
19
+ or (req.auto_web_search if req.auto_web_search is not None else CONFIG.AUTO_ENABLE_WEB_SEARCH and detection.get('need_web_search'))
20
+ )
21
+ )
22
+
23
+ if req.enable_tools is not None:
24
+ tools_enabled = bool(req.enable_tools)
25
+ else:
26
+ # Respect sampler override if present
27
+ if req.sampler and isinstance(req.sampler, dict) and req.sampler.get('ALLOW_TOOLS') is not None:
28
+ tools_enabled = bool(req.sampler.get('ALLOW_TOOLS'))
29
+ else:
30
+ auto_tools_flag = req.auto_tools if req.auto_tools is not None else CONFIG.AUTO_ENABLE_TOOLS
31
+ tools_enabled = bool(req.tools) or CONFIG.ENABLE_TOOLS_BY_DEFAULT or (auto_tools_flag and (detection.get('need_calc') or detection.get('need_web_search')))
32
+
33
+ if req.enable_reasoning is not None:
34
+ reasoning_enabled = bool(req.enable_reasoning)
35
+ else:
36
+ reasoning_enabled = False
37
+
38
+ # If tools_enabled and not provided, add detected tools
39
+ if tools_enabled and not req.tools and detection.get('detected_tools'):
40
+ req.tools = detection.get('detected_tools')
41
+
42
+ # If web_search should be used, and not already set, set flag
43
+ if web_search_enabled and not req.web_search:
44
+ req.web_search = True
45
+
46
+ return {
47
+ 'raw_prompt': raw_prompt,
48
+ 'detection': detection,
49
+ 'web_search_enabled': web_search_enabled,
50
+ 'tools_enabled': tools_enabled,
51
+ 'reasoning_enabled': reasoning_enabled,
52
+ 'req': req.model_dump(),
53
+ }
54
+
55
+ # test cases
56
+ cases = [
57
+ ChatCompletionRequest(model='rwkv-latest', prompt='Who is the current president of France?', stream=None, auto_web_search=True, auto_tools=None, auto_reasoning=None),
58
+ ChatCompletionRequest(model='rwkv-latest', prompt='Calculate 2+3*4 for me', stream=None, auto_web_search=True, auto_tools=True, auto_reasoning=None),
59
+ ChatCompletionRequest(model='rwkv-latest', prompt='Explain why the sky is blue', stream=None, auto_web_search=False, auto_tools=None, auto_reasoning=True),
60
+ # Sampler override should disable web_search even though auto_web_search is True
61
+ ChatCompletionRequest(model='rwkv-latest', prompt='Who is the current president of France?', stream=None, auto_web_search=True, auto_tools=None, auto_reasoning=None, sampler_allow_web_search=False),
62
+ # Per-request sampler object also should disable tools
63
+ ChatCompletionRequest(model='rwkv-latest', prompt='Calculate 2+3*4 for me', stream=None, auto_web_search=True, auto_tools=None, auto_reasoning=None, sampler= { 'ALLOW_TOOLS': False }),
64
+ ]
65
+
66
+ for c in cases:
67
+ print('---')
68
+ pprint(emulate_injection(c))
tests/test_client_api.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi.testclient import TestClient
2
+ from app import app
3
+ import json
4
+
5
+ client = TestClient(app)
6
+
7
+ print('Non-streaming test')
8
+ payload = {
9
+ 'model': 'rwkv-latest',
10
+ 'prompt': 'Who is the president of France today?',
11
+ 'stream': False,
12
+ 'max_tokens': 64,
13
+ 'temperature': 0.2,
14
+ 'include_usage': True,
15
+ }
16
+ res = client.post('/api/v1/chat/completions', json=payload)
17
+ print('Status', res.status_code)
18
+ try:
19
+ print(json.dumps(res.json(), indent=2))
20
+ except Exception as e:
21
+ print('Response not JSON or parse failed', e)
22
+
23
+ print('\nTools calc test')
24
+ payload = {
25
+ 'model': 'rwkv-latest',
26
+ 'prompt': 'Calculate 2+3*4 and explain the result.',
27
+ 'stream': False,
28
+ 'tools': [{'name': 'calc', 'args': {'expression': '2+3*4'}}],
29
+ }
30
+ res = client.post('/api/v1/chat/completions', json=payload)
31
+ print('Status', res.status_code)
32
+ try:
33
+ print(json.dumps(res.json(), indent=2))
34
+ except Exception as e:
35
+ print('Response not JSON or parse failed', e)
36
+
37
+ print('\nTools web_search test')
38
+ payload = {
39
+ 'model': 'rwkv-latest',
40
+ 'prompt': 'Who is the current president of France?',
41
+ 'stream': False,
42
+ 'web_search': True,
43
+ 'search_top_k': 2,
44
+ }
45
+ res = client.post('/api/v1/chat/completions', json=payload)
46
+ print('Status', res.status_code)
47
+ try:
48
+ print(json.dumps(res.json(), indent=2))
49
+ except Exception as e:
50
+ print('Response not JSON or parse failed', e)
tests/test_universal_and_detect.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, sys
2
+ sys.path.append(os.getcwd())
3
+ from utils import universal_tool, detect_tools_and_reasoning
4
+
5
+
6
+ def test_universal_calc():
7
+ res = universal_tool({"query":"2+3*4"})
8
+ assert isinstance(res, dict)
9
+ assert res.get('action') == 'calc'
10
+ assert 'result' in res
11
+ assert str(res.get('result')) == '14'
12
+
13
+
14
+ def test_universal_web_search():
15
+ res = universal_tool({"query":"Who is the president of France?"})
16
+ assert isinstance(res, dict)
17
+ assert res.get('action') in ('web_search', 'calc', 'unknown') or True
18
+ assert 'result' in res
19
+
20
+
21
+ def test_detect_calc():
22
+ d = detect_tools_and_reasoning('Calculate 2+3*4 for me')
23
+ assert d.get('need_calc')
24
+ assert any(t.get('name') == 'calc' for t in d.get('detected_tools', []))
25
+ conf = d.get('confidence') or {}
26
+ assert conf.get('calc_confidence', 0) > 0.5
27
+
28
+
29
+ def test_detect_web_search():
30
+ d = detect_tools_and_reasoning('Who is the president of France?')
31
+ assert d.get('need_web_search')
32
+ assert any(t.get('name') == 'web_search' for t in d.get('detected_tools', []))
33
+ conf = d.get('confidence') or {}
34
+ assert conf.get('web_search_confidence', 0) > 0.5
35
+
36
+
37
+ if __name__ == '__main__':
38
+ test_universal_calc()
39
+ test_universal_web_search()
40
+ test_detect_calc()
41
+ test_detect_web_search()
42
+ print('All tests passed')
utils.py CHANGED
@@ -78,11 +78,13 @@ def logger():
78
  while True:
79
  item = LOGGER_QUEUE.get()
80
  try:
81
- requests.post(
82
- os.environ.get("LOG_PORT"),
83
- headers={"Content-Type": "application/json"},
84
- json=item,
85
- )
 
 
86
  except Exception:
87
  pass
88
 
@@ -175,3 +177,184 @@ def calc(expr: str) -> str:
175
  return str(result)
176
  except Exception as e:
177
  return f"ERROR: {e}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
  while True:
79
  item = LOGGER_QUEUE.get()
80
  try:
81
+ LOG_PORT = os.environ.get("LOG_PORT")
82
+ if LOG_PORT:
83
+ requests.post(
84
+ LOG_PORT,
85
+ headers={"Content-Type": "application/json"},
86
+ json=item,
87
+ )
88
  except Exception:
89
  pass
90
 
 
177
  return str(result)
178
  except Exception as e:
179
  return f"ERROR: {e}"
180
+
181
+
182
def detect_tools_and_reasoning(text_or_messages) -> dict:
    """Heuristically decide which tools/reasoning a prompt likely needs.

    Accepts either a plain string prompt or a list of chat messages (dicts
    with a 'content' key, or objects exposing a .content attribute).

    Returns a dict with:
        need_calc / need_web_search / need_reasoning / need_universal: bools
        detected_tools: list of {"name", "args"} tool suggestions
        confidence: per-signal scores in [0, 1]
    """
    # Flatten message lists into one text blob; tolerate malformed entries.
    if isinstance(text_or_messages, list):
        try:
            text = "\n\n".join(
                m.get('content', '') if isinstance(m, dict) else (getattr(m, 'content', '') or '')
                for m in text_or_messages if m
            )
        except Exception:
            text = ""
    else:
        text = str(text_or_messages or "")

    t = text.lower()
    need_calc = False
    need_web_search = False
    need_reasoning = False
    need_universal = False
    detected_tools = []

    # "digits around an arithmetic operator" — shared by calc and the
    # web-search exclusion below, so hoisted to one place.
    arith_re = r"\d+\s*[-+*/%]\s*\d+"

    # calc: explicit arithmetic, OR a math verb combined with at least one digit.
    if re.search(arith_re, t) or (
        re.search(r"\b(calculate|compute|solve|evaluate|sum|add|subtract|multiply|divide)\b", t)
        and re.search(r"\d", t)
    ):
        need_calc = True
        # Extract the most-likely arithmetic expression: digits, parentheses,
        # operators and whitespace (searched on the original-case text).
        m = re.search(r"([\d\(\)\s+\-*/%^.]+)", text)
        expr = m.group(0).strip() if m else None
        # Discard matches with no operator (e.g. a bare year like "2024").
        if expr and not re.search(r"[-+*/%]", expr):
            expr = None
        detected_tools.append({"name": "calc", "args": {"expression": expr, "confidence": 0.95 if expr else 0.5}})

    # web_search: factual-question phrasing or freshness signals; pure math
    # queries are excluded so they stay on the calc path.
    # BUG FIX: the pattern previously contained "GDP of" in uppercase, but it
    # is matched against the lowercased `t`, so it could never match.
    if (
        re.search(
            r"\b(who is|who's|what is|what's|when is|where is|current|latest|news|is the president|president of|population of|capital of|how many|gdp of)\b",
            t,
        )
        and not re.search(arith_re, t)
    ):
        need_web_search = True
        detected_tools.append({"name": "web_search", "args": {"query": text, "confidence": 0.9}})

    # reasoning: explanatory / argumentative vocabulary.
    if re.search(r"\b(explain|why|because|reason|prove|derive|compare|analysis|analysis:|evaluate|argue|consequence|trade-offs)\b", t):
        need_reasoning = True

    # universal tool: generic "use/call/run a tool" or function-call phrasing.
    if re.search(r"\b(use (a )?tool|execute (a )?tool|call (a )?tool|function call|run tool|do this via a tool|invoke tool|call tool)\b", t):
        need_universal = True

    # Fixed per-signal confidence values (simple heuristic, no scoring model).
    confs = {
        "calc_confidence": 0.95 if need_calc else 0.0,
        "web_search_confidence": 0.9 if need_web_search else 0.0,
        "reasoning_confidence": 0.85 if need_reasoning else 0.0,
        "universal_confidence": 0.65 if need_universal else 0.0,
    }
    return {
        "need_calc": need_calc,
        "need_web_search": need_web_search,
        "need_reasoning": need_reasoning,
        "need_universal": need_universal,
        "detected_tools": detected_tools,
        "confidence": confs,
    }
248
+
249
+
250
def ensure_upload_dir():
    """Best-effort creation of CONFIG.UPLOAD_DIR; failures are silently ignored."""
    from config import CONFIG

    upload_dir = CONFIG.UPLOAD_DIR
    try:
        os.makedirs(upload_dir, exist_ok=True)
    except Exception:
        # Non-critical: callers surface their own error when a write fails.
        pass
256
+
257
+
258
+ from typing import Optional
259
+
260
+
261
def save_bytes_to_upload(filename: Optional[str], data: bytes) -> dict:
    """Persist raw bytes under CONFIG.UPLOAD_DIR with a unique, sanitized name.

    Args:
        filename: Original client-supplied name; only its basename is kept,
            prefixed with a fresh UUID to avoid collisions and path traversal.
        data: Raw file contents to write.

    Returns:
        On success: dict with 'file_id', 'filename', 'path', 'mime_type',
        'size', and 'uploaded_at' (epoch seconds).
        On failure: {'error': <message>}.
    """
    from config import CONFIG
    # BUG FIX: dropped the unused `hashlib` import from the original.
    import time
    import uuid

    ensure_upload_dir()
    _id = str(uuid.uuid4())
    # basename() strips directory components from untrusted input.
    safe_name = f"{_id}_{os.path.basename(str(filename or 'uploaded_file'))}"
    path = os.path.join(CONFIG.UPLOAD_DIR, safe_name)
    try:
        with open(path, 'wb') as f:
            f.write(data)
        size = os.path.getsize(path)
        import mimetypes
        # Guess MIME type from the preserved file extension; may be None.
        mime_type = mimetypes.guess_type(path)[0]
        return {
            'file_id': _id,
            'filename': filename,
            'path': path,
            'mime_type': mime_type,
            'size': size,
            'uploaded_at': int(time.time()),
        }
    except Exception as e:
        return {'error': str(e)}
285
+
286
+
287
def file_read_from_path(path: str, max_bytes: int = 100000) -> str:
    """Read up to ``max_bytes`` from a file and return it as text.

    Bytes are decoded as UTF-8 with undecodable sequences replaced, so binary
    content never raises. Returns "" for an empty/missing path or on any
    I/O error.

    BUG FIX: the original wrapped the decode in try/except with a ``str(b)``
    fallback, but ``decode('utf-8', errors='replace')`` cannot raise — that
    branch was unreachable dead code and has been removed.
    """
    try:
        if not path or not os.path.exists(path):
            return ""
        # Context manager guarantees the handle is closed even on a short read.
        with open(path, 'rb') as f:
            raw = f.read(max_bytes)
        return raw.decode('utf-8', errors='replace')
    except Exception:
        # Best-effort read: any OS-level failure maps to an empty result.
        return ""
299
+
300
+
301
def universal_tool(args: dict, allow_web_search: bool = True, allow_tools: bool = True, allow_file_tool: bool = True) -> dict:
    """Dispatch a universal tool call to calc, web_search, or file_read.

    If ``args['action']`` names a supported tool it is invoked directly;
    otherwise the tool is auto-detected from ``args['query']`` (arithmetic
    expressions go to calc, anything else to web_search).

    Returns a dict with 'action', 'result', and 'metadata' (confidence plus
    tool-specific fields), or an 'error' entry when dispatch fails.
    (BUG FIX: the docstring previously claimed a string was returned.)
    """
    if not isinstance(args, dict):
        return {"error": "ERROR: invalid args for universal tool"}

    action = args.get("action")
    query = args.get("query")

    # --- explicit action: calc -------------------------------------------
    if action == "calc":
        if not allow_tools:
            return {"action": "calc", "result": None, "metadata": {"error": "disabled_by_policy", "confidence": 0.0}}
        expr = args.get("expression") or query
        if not expr:
            return {"action": "calc", "result": None, "metadata": {"error": "no expression provided", "confidence": 0.0}}
        res = calc(str(expr))
        return {"action": "calc", "result": str(res), "metadata": {"expression": expr, "confidence": 0.98}}

    # --- explicit action: web_search -------------------------------------
    if action == "web_search":
        if not allow_web_search:
            return {"action": "web_search", "result": "", "metadata": {"error": "disabled_by_policy", "confidence": 0.0}}
        q = args.get("query") or query
        if not q:
            return {"action": "web_search", "result": "", "metadata": {"confidence": 0.0}}
        top_k = int(args.get("top_k") or 3)
        res = web_search(str(q), top_k)
        return {"action": "web_search", "result": str(res), "metadata": {"query": q, "top_k": top_k, "confidence": 0.9}}

    # --- explicit action: file_read --------------------------------------
    if action == 'file_read':
        if not allow_file_tool:
            return {"action": "file_read", "result": None, "metadata": {"error": "disabled_by_policy", "confidence": 0.0}}
        fpath = args.get('path') or args.get('file_path')
        if not fpath and args.get('file_id'):
            from config import CONFIG
            fid = args.get('file_id')
            # basename() guards against path traversal via a crafted file_id.
            candidate = os.path.join(CONFIG.UPLOAD_DIR, os.path.basename(str(fid))) if fid else None
            if candidate and os.path.exists(candidate):
                fpath = candidate
        if not fpath:
            return {"action": "file_read", "result": None, "metadata": {"error": "no_path_or_id", "confidence": 0.0}}
        content = file_read_from_path(fpath, int(args.get('max_bytes') or 100000))
        return {"action": "file_read", "result": str(content), "metadata": {"path": fpath, "confidence": 0.9}}

    # --- no explicit action: auto-detect from the query -------------------
    if query:
        # Digits around an operator -> treat as an arithmetic expression.
        if re.search(r"\d+\s*[-+*/%]\s*\d+", str(query)):
            if not allow_tools:
                return {"action": "calc", "result": None, "metadata": {"error": "disabled_by_policy", "confidence": 0.0}}
            res = calc(str(query))
            return {"action": "calc", "result": str(res), "metadata": {"expression": str(query), "confidence": 0.95}}
        # Everything else falls through to a web search.
        if not allow_web_search:
            return {"action": "web_search", "result": "", "metadata": {"error": "disabled_by_policy", "confidence": 0.0}}
        top_k = int(args.get("top_k") or 3)
        res = web_search(str(query), top_k)
        return {"action": "web_search", "result": str(res), "metadata": {"query": str(query), "top_k": top_k, "confidence": 0.9}}

    # BUG FIX (consistency): every other return carries an 'action' key; the
    # fallback now does too, so callers can branch on action uniformly.
    return {"action": "unknown", "error": "ERROR: could not determine action for universal tool"}