Spaces:

Jdbbd
/

Hhh

Sleeping

App Files Files Community

Ksjsjjdj commited on Nov 24, 2025

Commit

b219d99

verified ·

1 Parent(s): eaf7f79

Upload 34 files

Browse files

Files changed (34) hide show

.gitattributes +47 -35
.gitignore +21 -0
.python-version +1 -0
Dockerfile +62 -0
README.md +89 -12
api_types.py +82 -0
app.py +941 -0
app_stderr.log +33 -0
app_stdout.log +22 -0
config.local.yaml +24 -0
config.production-modelscope.yaml +24 -0
config.production.yaml +24 -0
config.py +84 -0
cuda/gemm_fp16_cublas.cpp +75 -0
cuda/operators.cu +246 -0
cuda/rwkv5.cu +88 -0
cuda/rwkv5_op.cpp +34 -0
cuda/rwkv6.cu +87 -0
cuda/rwkv6_op.cpp +34 -0
cuda/rwkv7.cu +77 -0
cuda/rwkv7_op.cpp +26 -0
cuda/wrapper.cpp +141 -0
download_models.py +62 -0
models/.cache/huggingface/.gitignore +1 -0
models/.cache/huggingface/download/rwkv7-g1a-0.1b-20250728-ctx4096.pth.metadata +3 -0
models/rwkv7-g1a-0.1b-20250728-ctx4096.pth +3 -0
pyproject.toml +45 -0
run_windows.ps1 +14 -0
setup_windows.ps1 +82 -0
tests/api_test.py +85 -0
tests/run_local_exec.py +14 -0
utils.py +177 -0
uv.lock +0 -0
verify_setup.py +54 -0

.gitattributes CHANGED Viewed

@@ -1,35 +1,47 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bin.* filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zstandard filter=lfs diff=lfs merge=lfs -text
+*.tfevents* filter=lfs diff=lfs merge=lfs -text
+*.db* filter=lfs diff=lfs merge=lfs -text
+*.ark* filter=lfs diff=lfs merge=lfs -text
+**/*ckpt*data* filter=lfs diff=lfs merge=lfs -text
+**/*ckpt*.meta filter=lfs diff=lfs merge=lfs -text
+**/*ckpt*.index filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.gguf* filter=lfs diff=lfs merge=lfs -text
+*.ggml filter=lfs diff=lfs merge=lfs -text
+*.llamafile* filter=lfs diff=lfs merge=lfs -text
+*.pt2 filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,21 @@

+# Python-generated files
+__pycache__/
+*.py[oc]
+build/
+dist/
+wheels/
+*.egg-info
+# Virtual environments
+.venv
+.cache
+*pth
+*.pt
+*.st
+*local*
+dist-frontend/
+.vscode/

.python-version ADDED Viewed

	@@ -0,0 +1 @@


1	+ 3.10

Dockerfile ADDED Viewed

	@@ -0,0 +1,62 @@

+FROM node:20-alpine AS FrontendBuilder
+RUN apk update && apk upgrade && \
+    apk add --no-cache bash git openssh curl rust cargo
+RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
+RUN npm install -g pnpm
+ADD https://api.github.com/repos/SolomonLeon/web-rwkv-realweb/git/refs/heads/ version_1.json
+WORKDIR /app
+RUN git clone https://github.com/SolomonLeon/web-rwkv-realweb.git /app
+WORKDIR /app/web-rwkv-wasm
+RUN ["cargo", "install", "wasm-pack", "--locked"]
+WORKDIR /app
+ENV PATH=/root/.cargo/bin:$PATH
+RUN pnpm install
+RUN if [ "$MODELSCOPE_ENVIRONMENT" = "studio" ]; then \
+    pnpm run build --mode target-rwkv-modelscope-space; \
+  else \
+    pnpm run build --mode target-rwkv-hf-space; \
+  fi
+FROM nvidia/cuda:12.4.0-devel-ubuntu22.04 AS Backend
+RUN <<EOF
+apt update
+apt install --no-install-recommends -y \
+    build-essential \
+    git \
+    cuda-nvcc-12-4 \
+    cuda-cudart-dev-12-4 \
+    python3-dev \
+    python3-pip \
+    libpython3.10-dev
+apt clean && rm -rf /var/lib/apt/lists/*
+EOF
+COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
+COPY . .
+RUN useradd -m -u 1000 user
+USER user
+ENV HOME=/home/user \
+    PATH=/usr/local/cuda/bin:/home/user/.local/bin:$PATH \
+    LD_LIBRARY_PATH="/usr/local/cuda/lib64:${LD_LIBRARY_PATH}" \
+    CXX=/usr/bin/g++ \
+    TORCH_CUDA_ARCH_LIST="7.5"
+WORKDIR $HOME/app
+COPY --chown=user . $HOME/app
+COPY --chown=user --from=FrontendBuilder /app/dist $HOME/app/dist-frontend
+RUN uv sync --frozen --extra cu124
+CMD ["sh", "-c", "if [ \"$MODELSCOPE_ENVIRONMENT\" = \"studio\" ]; then CONFIG_FILE=\"./config.production-modelscope.yaml\"; else CONFIG_FILE=\"./config.production.yaml\"; fi; uv run --offline --frozen app.py --config_file \"$CONFIG_FILE\""]

README.md CHANGED Viewed

@@ -1,12 +1,89 @@
----
-title: Xd
-emoji: 🌖
-colorFrom: indigo
-colorTo: indigo
-sdk: docker
-pinned: false
-license: apache-2.0
-short_description: xd
----
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

+---
+title: RWKV HF Space
+emoji: 🐦‍⬛
+colorFrom: purple
+colorTo: pink
+sdk: docker
+pinned: false
+---
+# Simple RWKV OpenAI-Compatible API
+---
+title: RWKV HF Space
+emoji: 🐦‍⬛
+colorFrom: purple
+colorTo: pink
+sdk: docker
+pinned: false
+---
+# Simple RWKV OpenAI-Compatible API
+## Quick Windows Setup (no Docker)
+This repository was originally packaged with a Dockerfile. It now provides a `setup_windows.ps1` script that mirrors Dockerfile actions and sets up the service locally on Windows (installs Python dependencies, builds the frontend, and downloads the 0.1B model).
+Prerequisites:
+- Python 3.10+ installed and in PATH
+- Node.js + npm (optional, required for building the frontend)
+- (Optional) NVIDIA GPU and CUDA (for GPU runtime)
+To set up locally on Windows (CPU-only):
+```powershell
+.\setup_windows.ps1 -gpu:$false -buildFrontend:$true -CONFIG_FILE config.production.yaml
+```
+If you have a compatible NVIDIA GPU and prefer to install GPU-enabled dependencies, run with the `-gpu` switch.
+After setup, run the API:
+```powershell
+#$env:CONFIG_FILE='config.production.yaml'
+python app.py
+```
+The default production config in `config.production.yaml` now contains a single model — the 0.1B model `rwkv7-g1a-0.1b-20250728-ctx4096` — set as default chat and reasoning model.
+To download models defined in any config:
+```powershell
+python download_models.py --config config.production.yaml
+```
+This will store the downloaded .pth files under the `DOWNLOAD_MODEL_DIR` specified in the YAML (defaults to `./models`).
+Advanced features:
+ - `reasoning` is performed in-process by the same model (no external reasoning model is used). Request a model name that carries the reasoning suffix, e.g. `rwkv-latest:thinking`, and the requested model itself runs the reasoning.
+ - `web_search` functionality is available at the request level — set `web_search: true` and optionally `search_top_k` to inject search results from DuckDuckGo into the prompt. This is executed by the server and provided to the same model as context.
+ - `tools` are executed server-side and results injected into the prompt for the same model. Supported tools: `web_search` and `calc` (calculator). Example of `tools` usage:
+```json
+{
+	"model": "rwkv-latest",
+	"prompt": "Calculate 2+3*4 and tell me the result",
+	"tools": [{"name": "calc", "args": {"expression": "2+3*4"}}]
+}
+```
+Example: POST with `web_search` and reasoning enabled
+```json
+{
+	"model": "rwkv-latest:thinking",
+	"prompt": "Who is the current president of France?",
+	"max_tokens": 32,
+	"web_search": true,
+	"search_top_k": 3
+}
+```
+The server will perform a web search for the prompt, aggregate the top 3 results, and inject those into the prompt, then run the model with reasoning enabled — all using the same model instead of an external reasoning or search model.
+Streaming behavior:
+- The API streams responses token-by-token by default (`stream: true`). It persists the generation state under the requested `state_name` (or under a generated one if none is given). Pass the same `state_name` again to continue from where the previous stream stopped: the server keeps model state in memory under `(model, state_name)`, so subsequent requests with that `state_name` continue generation from that exact point.

api_types.py ADDED Viewed

	@@ -0,0 +1,82 @@

+from typing import List, Optional, Union, Dict, Any, Literal
+from pydantic import BaseModel, Field
+class ChatMessage(BaseModel):
+    """One chat message: the author's role and the message text."""
+    # Both fields are declared with Field() and no default.
+    role: str = Field()
+    content: str = Field()
+class Logprob(BaseModel):
+    """Log-probability record for a single token."""
+    token: str
+    logprob: float
+    # Optional list of free-form dicts; defaults to None.
+    top_logprobs: Optional[List[Dict[str, Any]]] = None
+class LogprobsContent(BaseModel):
+    """Container for per-token log-probabilities; both lists default to None."""
+    content: Optional[List[Logprob]] = None
+    refusal: Optional[List[Logprob]] = None
+class FunctionCall(BaseModel):
+    """A function call described by its name and a string of arguments."""
+    name: str
+    # Arguments are carried as a single string (presumably JSON-encoded - TODO confirm).
+    arguments: str
+class ChatCompletionMessage(BaseModel):
+    """Message payload of a completion choice; every field is optional and defaults to None."""
+    role: Optional[str] = Field(
+        None, description="The role of the author of this message"
+    )
+    content: Optional[str] = Field(None, description="The contents of the message")
+    # Reasoning text is kept separate from the visible answer in `content`.
+    reasoning_content: Optional[str] = Field(
+        None, description="The reasoning contents of the message"
+    )
+    tool_calls: Optional[List[Dict[str, Any]]] = Field(
+        None, description="Tool calls generated by the model"
+    )
+class PromptTokensDetails(BaseModel):
+    """Breakdown of prompt token usage; currently only the cached-token count."""
+    cached_tokens: int
+class CompletionTokensDetails(BaseModel):
+    """Breakdown of completion token usage (not referenced by `Usage` at present)."""
+    reasoning_tokens: int
+    accepted_prediction_tokens: int
+    rejected_prediction_tokens: int
+class Usage(BaseModel):
+    """Token accounting reported with a completion."""
+    prompt_tokens: int
+    completion_tokens: int
+    total_tokens: int
+    # Nullable but declared without a default value.
+    prompt_tokens_details: Optional[PromptTokensDetails]
+    # completion_tokens_details: CompletionTokensDetails
+class ChatCompletionChoice(BaseModel):
+    """One choice in a completion or completion chunk."""
+    index: int
+    # `message` and `delta` share the same message model; both default to None.
+    message: Optional[ChatCompletionMessage] = None
+    delta: Optional[ChatCompletionMessage] = None
+    logprobs: Optional[LogprobsContent] = None
+    # Declared with `...`: the field must be supplied, but it may be None.
+    finish_reason: Optional[str] = Field(
+        ..., description="Reason for stopping: stop, length, content_filter, tool_calls"
+    )
+class ChatCompletion(BaseModel):
+    """Full (non-chunked) chat completion response body."""
+    id: str = Field(..., description="Unique identifier for the chat completion")
+    # Fixed discriminator value for this response type.
+    object: Literal["chat.completion"] = "chat.completion"
+    created: int = Field(..., description="Unix timestamp of creation")
+    model: str
+    choices: List[ChatCompletionChoice]
+    usage: Usage
+class ChatCompletionChunk(BaseModel):
+    """Chat completion chunk response body."""
+    id: str = Field(..., description="Unique identifier for the chat completion")
+    # Fixed discriminator value for this response type.
+    object: Literal["chat.completion.chunk"] = "chat.completion.chunk"
+    created: int = Field(..., description="Unix timestamp of creation")
+    model: str
+    choices: List[ChatCompletionChoice]
+    # Nullable but declared without a default value.
+    usage: Optional[Usage]

app.py ADDED Viewed

	@@ -0,0 +1,941 @@

+import os
+if os.environ.get("MODELSCOPE_ENVIRONMENT") == "studio":
+    from modelscope import patch_hub
+    patch_hub()
+os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:256"
+from config import CONFIG, ModelConfig
+from utils import (
+    cleanMessages,
+    parse_think_response,
+    remove_nested_think_tags_stack,
+    format_bytes,
+    log,
+)
+import copy, types, gc, sys, re, time, collections, asyncio
+from huggingface_hub import hf_hub_download
+from loguru import logger
+from rich import print
+from snowflake import SnowflakeGenerator
+CompletionIdGenerator = SnowflakeGenerator(42, timestamp=1741101491595)
+from typing import List, Optional, Union, Any, Dict
+import uuid
+from pydantic import BaseModel, Field, model_validator
+from pydantic_settings import BaseSettings
+import numpy as np
+import torch
+if "cuda" in CONFIG.STRATEGY.lower() and not torch.cuda.is_available():
+    logger.info(f"CUDA not found, fall back to cpu")
+    CONFIG.STRATEGY = "cpu fp16"
+# Normalize STRATEGY to include precision if missing (e.g., 'cpu' -> 'cpu fp16')
+_s = CONFIG.STRATEGY.lower()
+if ("cpu" in _s or "cuda" in _s) and not ("fp16" in _s or "fp32" in _s):
+    logger.info(f"STRATEGY missing precision, appending 'fp16' to `{CONFIG.STRATEGY}`")
+    CONFIG.STRATEGY = CONFIG.STRATEGY + " fp16"
+# pynvml is optional. When importing it fails for any reason, the three NVML
+# entry points are bound to None so later code can test for availability.
+try:
+    from pynvml import nvmlInit, nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo
+except Exception:
+    nvmlInit = None
+    nvmlDeviceGetHandleByIndex = None
+    nvmlDeviceGetMemoryInfo = None
+# On a CUDA strategy with NVML importable, initialise NVML and keep a handle
+# to device index 0 in the module-level `gpu_h` (left undefined otherwise).
+if "cuda" in CONFIG.STRATEGY.lower() and nvmlInit is not None and nvmlDeviceGetHandleByIndex is not None:
+    nvmlInit()
+    gpu_h = nvmlDeviceGetHandleByIndex(0)
+def logGPUState():
+    if "cuda" in CONFIG.STRATEGY and nvmlDeviceGetMemoryInfo is not None:
+        gpu_info = nvmlDeviceGetMemoryInfo(gpu_h)
+        logger.info(
+            f"[STATUS] Torch - {format_bytes(torch.cuda.memory_allocated())} - NVML - vram {format_bytes(gpu_info.total)} used {format_bytes(gpu_info.used)} free {format_bytes(gpu_info.free)}"
+        )
+# Global torch backend switches: cuDNN benchmark mode and TF32 for cuDNN and
+# CUDA matmul.
+torch.backends.cudnn.benchmark = True
+torch.backends.cudnn.allow_tf32 = True
+torch.backends.cuda.matmul.allow_tf32 = True
+# RWKV_* environment variables are set immediately before the `rwkv` imports
+# below (presumably the package reads them at import time - TODO confirm).
+os.environ["RWKV_V7_ON"] = "1"  # enable this for rwkv-7 models
+os.environ["RWKV_JIT_ON"] = "1"
+# The CUDA flag is "1" only when the config asks for it AND the
+# (possibly downgraded) strategy still targets CUDA.
+os.environ["RWKV_CUDA_ON"] = (
+    "1" if CONFIG.RWKV_CUDA_ON and "cuda" in CONFIG.STRATEGY.lower() else "0"
+)
+from rwkv.model import RWKV
+from rwkv.utils import PIPELINE, PIPELINE_ARGS
+from fastapi import FastAPI, HTTPException
+from starlette.background import BackgroundTask
+from fastapi.responses import StreamingResponse
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.staticfiles import StaticFiles
+from fastapi.middleware.gzip import GZipMiddleware
+from api_types import (
+    ChatMessage,
+    ChatCompletion,
+    ChatCompletionChunk,
+    Usage,
+    PromptTokensDetails,
+    ChatCompletionChoice,
+    ChatCompletionMessage,
+)
+class ModelStorage:
+    """Holder for one loaded model: its config, the RWKV model and its pipeline."""
+    # Class-level defaults are None; the loading loop assigns the real values
+    # on each instance.
+    MODEL_CONFIG: Optional[ModelConfig] = None
+    model: Optional[RWKV] = None
+    pipeline: Optional[PIPELINE] = None
+# Loaded models keyed by their SERVICE_NAME.
+MODEL_STORAGE: Dict[str, ModelStorage] = {}
+# Service names of the default chat / reasoning models, filled in while loading.
+# NOTE: `DEFALUT_MODEL_NAME` is misspelled but the name is referenced below as-is.
+DEFALUT_MODEL_NAME = None
+DEFAULT_REASONING_MODEL_NAME = None
+# In-memory model state store to support streaming continuation/resume per state_name.
+# Keys: (model_name, state_name) -> dict with 'state' (model state) and
+# 'model_tokens' (token ids fed so far)
+STATE_STORE: Dict[tuple, Any] = {}
+logger.info(f"STRATEGY - {CONFIG.STRATEGY}")
+logGPUState()
+# Enforce single 0.1b model. If multiple models are present, select only the one
+# that matches '0.1b' literally in the service name, to obey policy of single model.
+filtered_models = [m for m in CONFIG.MODELS if '0.1b' in m.SERVICE_NAME]
+if len(filtered_models) == 0:
+    # If no explicit 0.1b model detected, fall back to the first provided model but warn.
+    logger.warning("No '0.1b' model detected in config; using the first available model. To ensure single 0.1b use, include a model name with '0.1b'.")
+    CONFIG.MODELS = [CONFIG.MODELS[0]]
+elif len(filtered_models) > 1:
+    logger.warning("Multiple '0.1b' models detected; selecting the first one as the single model.")
+    CONFIG.MODELS = [filtered_models[0]]
+else:
+    CONFIG.MODELS = [filtered_models[0]]
+for model_config in CONFIG.MODELS:
+    logger.info(f"Load Model - {model_config.SERVICE_NAME}")
+    if model_config.MODEL_FILE_PATH == None:
+        model_config.MODEL_FILE_PATH = hf_hub_download(
+            repo_id=str(model_config.DOWNLOAD_MODEL_REPO_ID),
+            filename=str(model_config.DOWNLOAD_MODEL_FILE_NAME),
+            local_dir=str(model_config.DOWNLOAD_MODEL_DIR),
+        )
+    logger.info(f"Load Model - Path - {model_config.MODEL_FILE_PATH}")
+    if model_config.DEFAULT_CHAT:
+        if DEFALUT_MODEL_NAME != None:
+            logger.info(
+                f"Load Model - Replace `DEFALUT_MODEL_NAME` from `{DEFALUT_MODEL_NAME}` to `{model_config.SERVICE_NAME}`"
+            )
+        DEFALUT_MODEL_NAME = model_config.SERVICE_NAME
+    if model_config.DEFAULT_REASONING:
+        if DEFAULT_REASONING_MODEL_NAME != None:
+            logger.info(
+                f"Load Model - Replace `DEFAULT_REASONING_MODEL_NAME` from `{DEFAULT_REASONING_MODEL_NAME}` to `{model_config.SERVICE_NAME}`"
+            )
+        DEFAULT_REASONING_MODEL_NAME = model_config.SERVICE_NAME
+    logger.info(f"Load Model - Loading `{model_config.SERVICE_NAME}`")
+    print(model_config.DEFAULT_SAMPLER)
+    MODEL_STORAGE[model_config.SERVICE_NAME] = ModelStorage()
+    MODEL_STORAGE[model_config.SERVICE_NAME].MODEL_CONFIG = model_config
+    MODEL_STORAGE[model_config.SERVICE_NAME].model = RWKV(
+        model=model_config.MODEL_FILE_PATH.replace(".pth", ""),
+        strategy=CONFIG.STRATEGY,
+    )
+    MODEL_STORAGE[model_config.SERVICE_NAME].pipeline = PIPELINE(
+        MODEL_STORAGE[model_config.SERVICE_NAME].model, model_config.VOCAB
+    )
+    if "cuda" in CONFIG.STRATEGY:
+        torch.cuda.empty_cache()
+        gc.collect()
+    logGPUState()
+logger.info(f"Load Model - DEFALUT_MODEL_NAME is `{DEFALUT_MODEL_NAME}`")
+logger.info(
+    f"Load Model - DEFAULT_REASONING_MODEL_NAME is `{DEFAULT_REASONING_MODEL_NAME}`"
+)
+if len(MODEL_STORAGE) == 1:
+    single_name = list(MODEL_STORAGE.keys())[0]
+    if DEFALUT_MODEL_NAME != single_name:
+        DEFALUT_MODEL_NAME = single_name
+        logger.info(f"Load Model - Only one model present; DEFALUT_MODEL_NAME set to `{DEFALUT_MODEL_NAME}`")
+    if DEFAULT_REASONING_MODEL_NAME != single_name:
+        DEFAULT_REASONING_MODEL_NAME = single_name
+        logger.info(f"Load Model - Only one model present; DEFAULT_REASONING_MODEL_NAME set to `{DEFAULT_REASONING_MODEL_NAME}`")
+class ChatCompletionRequest(BaseModel):
+    model: str = Field(
+        default="rwkv-latest",
+        description="Add `:thinking` suffix to the model name to enable reasoning. Example: `rwkv-latest:thinking`",
+    )
+    messages: Optional[List[ChatMessage]] = Field(default=None)
+    prompt: Optional[str] = Field(default=None)
+    max_tokens: Optional[int] = Field(default=None)
+    temperature: Optional[float] = Field(default=None)
+    top_p: Optional[float] = Field(default=None)
+    presence_penalty: Optional[float] = Field(default=None)
+    count_penalty: Optional[float] = Field(default=None)
+    penalty_decay: Optional[float] = Field(default=None)
+    stream: Optional[bool] = Field(default=True, description="Whether to stream token-by-token responses by default")
+    state_name: Optional[str] = Field(default=None)
+    include_usage: Optional[bool] = Field(default=False)
+    stop: Optional[list[str]] = Field(["\n\n"])
+    stop_tokens: Optional[list[int]] = Field([0])
+    web_search: Optional[bool] = Field(default=False, description="Whether to perform a web search and append results to the prompt")
+    search_top_k: Optional[int] = Field(default=3, description="Number of web search results to retrieve")
+    tools: Optional[List[Dict[str, Any]]] = Field(default=None, description="List of tools to execute server-side (e.g., {'name':'web_search','args':{'query':'x'}})")
+    @model_validator(mode="before")
+    @classmethod
+    def validate_mutual_exclusivity(cls, data: Any) -> Any:
+        if not isinstance(data, dict):
+            return data
+        messages_provided = "messages" in data and data["messages"] != None
+        prompt_provided = "prompt" in data and data["prompt"] != None
+        if messages_provided and prompt_provided:
+            raise ValueError("messages and prompt cannot coexist. Choose one.")
+        if not messages_provided and not prompt_provided:
+            raise ValueError("Either messages or prompt must be provided.")
+        return data
+# FastAPI application plus its middleware stack.
+app = FastAPI(title="RWKV OpenAI-Compatible API")
+# CORS is fully open: any origin, method and header, with credentials allowed.
+# NOTE(review): wildcard origins together with allow_credentials=True is very
+# permissive - confirm this is intended for the deployment.
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+# Gzip responses, configured with a 1000 minimum size and compression level 5.
+app.add_middleware(GZipMiddleware, minimum_size=1000, compresslevel=5)
+async def runPrefill(
+    request: ChatCompletionRequest, ctx: str, model_tokens: List[int], model_state
+):
+    ctx = ctx.replace("\r\n", "\n")
+    out = None
+    ms = MODEL_STORAGE.get(request.model)
+    if not ms or not ms.pipeline or not ms.model:
+        raise HTTPException(500, f"Model {request.model} not loaded or pipeline missing")
+    tokens = ms.pipeline.encode(ctx)
+    tokens = [int(x) for x in tokens]
+    model_tokens += tokens
+    while len(tokens) > 0:
+        out, model_state = ms.model.forward(
+            tokens[: CONFIG.CHUNK_LEN], model_state
+        )
+        tokens = tokens[CONFIG.CHUNK_LEN :]
+        await asyncio.sleep(0)
+    return out, model_tokens, model_state
+def generate(
+    request: ChatCompletionRequest,
+    out,
+    model_tokens: List[int],
+    model_state,
+    max_tokens=2048,
+):
+    ms = MODEL_STORAGE.get(request.model)
+    if not ms or not ms.pipeline or not ms.model:
+        raise HTTPException(500, f"Model {request.model} not loaded or pipeline missing")
+    temperature = request.temperature if request.temperature is not None else 0.2
+    top_p = request.top_p if request.top_p is not None else 0.9
+    alpha_frequency = request.count_penalty if request.count_penalty is not None else 0.0
+    alpha_presence = request.presence_penalty if request.presence_penalty is not None else 0.0
+    penalty_decay = request.penalty_decay if request.penalty_decay is not None else 0.5
+    args = PIPELINE_ARGS(
+        temperature=max(0.2, temperature),
+        top_p=top_p,
+        alpha_frequency=alpha_frequency,
+        alpha_presence=alpha_presence,
+        token_ban=[],  # ban the generation of some tokens
+        token_stop=[0],
+    )  # stop generation whenever you see any token here
+    occurrence = {}
+    out_tokens: List[int] = []
+    out_last = 0
+    # Stream token-by-token; each chunk contains a single decoded token string.
+    for i in range(max_tokens):
+        for n in occurrence:
+            out[n] -= args.alpha_presence + occurrence[n] * args.alpha_frequency
+        # out[0] -= 1e10  # disable END_OF_TEXT
+        token = ms.pipeline.sample_logits(
+            out, temperature=args.temperature, top_p=args.top_p
+        )
+        if token == 0 and request.stop_tokens and token in request.stop_tokens:
+            yield {
+                "content": "",
+                "tokens": out_tokens[out_last:],
+                "finish_reason": "stop:token:0",
+                "state": model_state,
+            }
+            del out
+            gc.collect()
+            return
+        out, model_state = ms.model.forward([token], model_state)
+        model_tokens.append(token)
+        out_tokens.append(token)
+        if request.stop_tokens and token in request.stop_tokens:
+            yield {
+                "content": "",
+                "tokens": out_tokens[out_last:],
+                "finish_reason": f"stop:token:{token}",
+                "state": model_state,
+            }
+            del out
+            gc.collect()
+            return
+        for xxx in list(occurrence.keys()):
+            occurrence[xxx] *= penalty_decay
+        occurrence[token] = 1 + (occurrence[token] if token in occurrence else 0)
+        # Decode token to text and yield it as a single-token chunk
+        decoded = ms.pipeline.decode([token])
+        # filter out replacement characters
+        if "\ufffd" in decoded:
+            continue
+        yield {
+            "content": decoded,
+            "tokens": [token],
+            "finish_reason": None,
+            "state": model_state,
+        }
+        out_last = i + 1
+    else:
+        yield {
+            "content": "",
+            "tokens": [],
+            "finish_reason": "length",
+        }
+async def chatResponse(
+    request: ChatCompletionRequest,
+    model_state: Any,
+    completionId: str,
+    enableReasoning: bool,
+) -> ChatCompletion:
+    createTimestamp = time.time()
+    prompt = (
+        f"{cleanMessages(request.messages or [])}\n\nAssistant:{' <think' if enableReasoning else ''}"
+        if request.prompt == None
+        else request.prompt.strip()
+    )
+    # Process tools and web_search (tools executed server-side and results injected to prompt)
+    if request.tools:
+        try:
+            for tool in request.tools:
+                name = tool.get('name')
+                args = tool.get('args', {})
+                if name == 'web_search':
+                    from utils import web_search
+                    search_q = args.get('query') or (request.prompt if request.prompt else cleanMessages(request.messages or []))
+                    search_top_k = int(args.get('top_k') or request.search_top_k or 3)
+                    search_str = web_search(search_q, search_top_k)
+                    if search_str:
+                        prompt = (f"ToolResults:\n{search_str}\n\nUse these results to answer the prompt.\n\n" + prompt)
+                elif name == 'calc' or name == 'calculator':
+                    from utils import calc
+                    expr = args.get('expression')
+                    if expr:
+                        calc_res = calc(expr)
+                        prompt = (f"ToolResults:\nCalcResult:{expr} = {calc_res}\n\nUse this result to answer the prompt.\n\n" + prompt)
+                else:
+                    # Unsupported tool - ignore or log
+                    logger.info(f"Unsupported tool requested: {name}")
+        except Exception as e:
+            logger.info(f"Tool processing error: {e}")
+    elif request.web_search:
+        try:
+            from utils import web_search
+            search_q = request.prompt if request.prompt else cleanMessages(request.messages or [])
+            search_res = web_search(search_q, int(request.search_top_k or 3))
+            if search_res:
+                prompt = f"WebSearchResults:\n{search_res}\n\n" + prompt
+        except Exception:
+            pass
+    logger.info(f"[REQ] {completionId} - prompt - {prompt}")
+    # Resume or prefill tokens/state
+    if request.state_name:
+        state_key = (request.model, request.state_name)
+        if state_key in STATE_STORE:
+            stored = STATE_STORE[state_key]
+            model_state = stored.get('state', model_state)
+            model_tokens = stored.get('model_tokens', [0])
+            out = None
+        else:
+            out, model_tokens, model_state = await runPrefill(request, prompt, [0], model_state)
+    else:
+        out, model_tokens, model_state = await runPrefill(request, prompt, [0], model_state)
+    prefillTime = time.time()
+    promptTokenCount = len(model_tokens)
+    fullResponse = " <think" if enableReasoning else ""
+    completionTokenCount = 0
+    finishReason = None
+    for chunk in generate(
+        request,
+        out,
+        model_tokens,
+        model_state,
+        max_tokens=(
+            64000
+            if "max_tokens" not in request.model_fields_set and enableReasoning
+            else (request.max_tokens or 2048)
+        ),
+    ):
+        # chunk['content'] is now expected to be a single token's decoded text
+        fullResponse += chunk["content"]
+        # Check stop sequences (multi-token) after each token
+        for stop_words in request.stop or []:
+            if stop_words in fullResponse:
+                finishReason = f"stop:words:{stop_words}"
+                break
+        completionTokenCount += 1
+        if chunk["finish_reason"]:
+            finishReason = chunk["finish_reason"]
+        await asyncio.sleep(0)
+    genenrateTime = time.time()
+    responseLog = {
+        "content": fullResponse,
+        "finish": finishReason,
+        "prefill_len": promptTokenCount,
+        "prefill_tps": round(promptTokenCount / (prefillTime - createTimestamp), 2),
+        "gen_len": completionTokenCount,
+        "gen_tps": round(completionTokenCount / (genenrateTime - prefillTime), 2),
+    }
+    logger.info(f"[RES] {completionId} - {responseLog}")
+    reasoning_content, content = parse_think_response(fullResponse)
+    response = ChatCompletion(
+        id=completionId,
+        created=int(createTimestamp),
+        model=request.model,
+            usage=Usage(
+            prompt_tokens=promptTokenCount,
+            completion_tokens=completionTokenCount,
+            total_tokens=promptTokenCount + completionTokenCount,
+            prompt_tokens_details=PromptTokensDetails(cached_tokens=0),
+        ),
+        choices=[
+            ChatCompletionChoice(
+                index=0,
+                message=ChatCompletionMessage(
+                    role="Assistant",
+                    content=content,
+                    reasoning_content=reasoning_content if reasoning_content else None,
+                    tool_calls=None,
+                ),
+                logprobs=None,
+                finish_reason=finishReason,
+            )
+        ],
+    )
+    # Save state if requested for future resumption
+    try:
+        if request.state_name:
+            STATE_STORE[(request.model, request.state_name)] = {
+                'state': model_state,
+                'model_tokens': model_tokens,
+            }
+    except Exception:
+        pass
+    return response
+async def chatResponseStream(
+    request: ChatCompletionRequest,
+    model_state: Any,
+    completionId: str,
+    enableReasoning: bool,
+):
+    createTimestamp = int(time.time())
+    prompt = (
+        f"{cleanMessages(request.messages or [], enableReasoning)}\n\nAssistant:{' <think' if enableReasoning else ''}"
+        if request.prompt == None
+        else request.prompt.strip()
+    )
+    # Process tools and web_search (tools executed server-side and results injected to prompt)
+    if request.tools:
+        try:
+            for tool in request.tools:
+                name = tool.get('name')
+                args = tool.get('args', {})
+                if name == 'web_search':
+                    from utils import web_search
+                    search_q = args.get('query') or (request.prompt if request.prompt else cleanMessages(request.messages or []))
+                    search_top_k = int(args.get('top_k') or request.search_top_k or 3)
+                    search_str = web_search(search_q, search_top_k)
+                    if search_str:
+                        prompt = (f"WebSearchResults:\n{search_str}\n\n" + prompt)
+                elif name == 'calc' or name == 'calculator':
+                    from utils import calc
+                    expr = args.get('expression')
+                    if expr:
+                        calc_res = calc(expr)
+                        prompt = (f"CalcResult:{expr} = {calc_res}\n\n" + prompt)
+                else:
+                    logger.info(f"Unsupported tool requested: {name}")
+        except Exception as e:
+            logger.info(f"Tool processing error: {e}")
+    elif request.web_search:
+        try:
+            from utils import web_search
+            search_q = request.prompt if request.prompt else cleanMessages(request.messages or [])
+            search_res = web_search(search_q, int(request.search_top_k or 3))
+            if search_res:
+                prompt = f"WebSearchResults:\n{search_res}\n\n" + prompt
+        except Exception:
+            pass
+    logger.info(f"[REQ] {completionId} - context\n```{prompt}```")
+    # Resume or prefill tokens/state
+    if request.state_name:
+        state_key = (request.model, request.state_name)
+        if state_key in STATE_STORE:
+            stored = STATE_STORE[state_key]
+            model_state = stored.get('state', model_state)
+            model_tokens = stored.get('model_tokens', [0])
+            out = None
+        else:
+            out, model_tokens, model_state = await runPrefill(request, prompt, [0], model_state)
+    else:
+        out, model_tokens, model_state = await runPrefill(request, prompt, [0], model_state)
+    prefillTime = time.time()
+    promptTokenCount = len(model_tokens)
+    completionTokenCount = 0
+    finishReason = None
+    response = ChatCompletionChunk(
+        id=completionId,
+        created=createTimestamp,
+        model=request.model,
+        usage=(
+                Usage(
+                    prompt_tokens=promptTokenCount,
+                    completion_tokens=completionTokenCount,
+                    total_tokens=promptTokenCount + completionTokenCount,
+                    prompt_tokens_details=PromptTokensDetails(cached_tokens=0),
+                )
+            if request.include_usage
+            else None
+        ),
+        choices=[
+            ChatCompletionChoice(
+                index=0,
+                delta=ChatCompletionMessage(
+                    role="Assistant",
+                    content="",
+                    reasoning_content="" if enableReasoning else None,
+                    tool_calls=None,
+                ),
+                logprobs=None,
+                finish_reason=finishReason,
+            )
+        ],
+    )
+    if response.choices and response.choices[0].delta is None:
+        response.choices[0].delta = ChatCompletionMessage(role="Assistant", content="", reasoning_content=None, tool_calls=None)
+    # Attach state_name in the initial chunk so client can save it to continue later
+    r_dict = response.model_dump()
+    r_dict['state_name'] = request.state_name
+    yield f"data: {r_dict}\n\n"
+    buffer = []
+    if enableReasoning:
+        buffer.append("<think")
+        streamConfig = {
+            "isChecking": False,  # check whether is <think> tag
+            "fullTextCursor": 0,
+            "in_think": False,
+            "cacheStr": "",
+        }
+        for chunk in generate(
+            request,
+            out,
+            model_tokens,
+            model_state,
+            max_tokens=(
+                64000
+                if "max_tokens" not in request.model_fields_set and enableReasoning
+                else (request.max_tokens or 2048)
+            ),
+        ):
+            completionTokenCount += 1
+            # Each token stream is delivered as a decoded character/bytes (maybe 1 or more chars)
+            chunkContent: str = chunk["content"]
+            buffer.append(chunkContent)
+            fullText = "".join(buffer)
+            if chunk["finish_reason"]:
+                finishReason = chunk["finish_reason"]
+            response = ChatCompletionChunk(
+                id=completionId,
+                created=createTimestamp,
+                model=request.model,
+                usage=(
+                    Usage(
+                        prompt_tokens=promptTokenCount,
+                        completion_tokens=completionTokenCount,
+                        total_tokens=promptTokenCount + completionTokenCount,
+                        prompt_tokens_details=PromptTokensDetails(cached_tokens=0),
+                    )
+                    if request.include_usage
+                    else None
+                ),
+                choices=[
+                    ChatCompletionChoice(
+                        index=0,
+                        delta=ChatCompletionMessage(
+                            role="Assistant",
+                            content=None,
+                            reasoning_content=None,
+                            tool_calls=None,
+                        ),
+                        logprobs=None,
+                        finish_reason=finishReason,
+                    )
+                ],
+            )
+            if response.choices and response.choices[0].delta is None:
+                response.choices[0].delta = ChatCompletionMessage(role="Assistant", content="", reasoning_content=None, tool_calls=None)
+            markStart = fullText.find("<", streamConfig["fullTextCursor"])
+            if not streamConfig["isChecking"] and markStart != -1:
+                streamConfig["isChecking"] = True
+                if streamConfig["in_think"]:
+                    delta = response.choices[0].delta
+                    if delta is None:
+                        delta = ChatCompletionMessage(role="Assistant", content="", reasoning_content=None, tool_calls=None)
+                        response.choices[0].delta = delta
+                    delta.reasoning_content = fullText[streamConfig["fullTextCursor"] : markStart]
+                else:
+                    delta = response.choices[0].delta
+                    if delta is None:
+                        delta = ChatCompletionMessage(role="Assistant", content="", reasoning_content=None, tool_calls=None)
+                        response.choices[0].delta = delta
+                    delta.content = fullText[streamConfig["fullTextCursor"] : markStart]
+                streamConfig["cacheStr"] = ""
+                streamConfig["fullTextCursor"] = markStart
+            if streamConfig["isChecking"]:
+                streamConfig["cacheStr"] = fullText[streamConfig["fullTextCursor"] :]
+            else:
+                if streamConfig["in_think"]:
+                    delta = response.choices[0].delta
+                    if delta is None:
+                        delta = ChatCompletionMessage(role="Assistant", content="", reasoning_content=None, tool_calls=None)
+                        response.choices[0].delta = delta
+                    delta.reasoning_content = chunkContent
+                else:
+                    delta = response.choices[0].delta
+                    if delta is None:
+                        delta = ChatCompletionMessage(role="Assistant", content="", reasoning_content=None, tool_calls=None)
+                        response.choices[0].delta = delta
+                    delta.content = chunkContent
+                streamConfig["fullTextCursor"] = len(fullText)
+            markEnd = fullText.find(">", streamConfig["fullTextCursor"])
+            if (streamConfig["isChecking"] and markEnd != -1) or finishReason != None:
+                streamConfig["isChecking"] = False
+                if (
+                    not streamConfig["in_think"]
+                    and streamConfig["cacheStr"].find("<think>") != -1
+                ):
+                    streamConfig["in_think"] = True
+                    delta = response.choices[0].delta
+                    if delta is None:
+                        delta = ChatCompletionMessage(role="Assistant", content="", reasoning_content=None, tool_calls=None)
+                        response.choices[0].delta = delta
+                    delta.reasoning_content = (
+                        delta.reasoning_content
+                        if delta.reasoning_content != None
+                        else "" + streamConfig["cacheStr"].replace("<think>", "")
+                    )
+                elif (
+                    streamConfig["in_think"]
+                    and streamConfig["cacheStr"].find("</think>") != -1
+                ):
+                    streamConfig["in_think"] = False
+                    delta = response.choices[0].delta
+                    if delta is None:
+                        delta = ChatCompletionMessage(role="Assistant", content="", reasoning_content=None, tool_calls=None)
+                        response.choices[0].delta = delta
+                    delta.content = (
+                        delta.content
+                        if delta.content != None
+                        else "" + streamConfig["cacheStr"].replace("</think>", "")
+                    )
+                else:
+                    if streamConfig["in_think"]:
+                        delta = response.choices[0].delta
+                        if delta is None:
+                            delta = ChatCompletionMessage(role="Assistant", content="", reasoning_content=None, tool_calls=None)
+                            response.choices[0].delta = delta
+                        delta.reasoning_content = (
+                            delta.reasoning_content
+                            if delta.reasoning_content != None
+                            else "" + streamConfig["cacheStr"]
+                        )
+                    else:
+                        delta = response.choices[0].delta
+                        if delta is None:
+                            delta = ChatCompletionMessage(role="Assistant", content="", reasoning_content=None, tool_calls=None)
+                            response.choices[0].delta = delta
+                        delta.content = (
+                            delta.content
+                            if delta.content != None
+                            else "" + streamConfig["cacheStr"]
+                        )
+                streamConfig["fullTextCursor"] = len(fullText)
+            delta = response.choices[0].delta
+            if delta is None:
+                delta = ChatCompletionMessage(role="Assistant", content="", reasoning_content=None, tool_calls=None)
+                response.choices[0].delta = delta
+            if delta.content != None or delta.reasoning_content != None:
+                # Save model state frequently (after each token) to allow resuming
+                try:
+                    if request.state_name:
+                        STATE_STORE[(request.model, request.state_name)] = {
+                            'state': model_state,
+                            'model_tokens': model_tokens,
+                        }
+                except Exception:
+                    pass
+                yield f"data: {response.model_dump_json()}\n\n"
+                # check stop sequences and stop streaming if we see them
+                for stop_words in request.stop or []:
+                    if stop_words in ''.join(buffer):
+                        finishReason = f"stop:words:{stop_words}"
+                        return
+            await asyncio.sleep(0)
+        del streamConfig
+    else:
+        for chunk in generate(request, out, model_tokens, model_state):
+            completionTokenCount += 1
+            buffer.append(chunk["content"])
+            if chunk["finish_reason"]:
+                finishReason = chunk["finish_reason"]
+            response = ChatCompletionChunk(
+                id=completionId,
+                created=createTimestamp,
+                model=request.model,
+                usage=(
+                    Usage(
+                        prompt_tokens=promptTokenCount,
+                        completion_tokens=completionTokenCount,
+                        total_tokens=promptTokenCount + completionTokenCount,
+                        prompt_tokens_details=PromptTokensDetails(cached_tokens=0),
+                    )
+                    if request.include_usage
+                    else None
+                ),
+                choices=[
+                    ChatCompletionChoice(
+                        index=0,
+                        delta=ChatCompletionMessage(role="Assistant", content=chunk["content"], reasoning_content=None, tool_calls=None),
+                        logprobs=None,
+                        finish_reason=finishReason,
+                    )
+                ],
+            )
+            yield f"data: {response.model_dump_json()}\n\n"
+            await asyncio.sleep(0)
+    genenrateTime = time.time()
+    responseLog = {
+        "content": "".join(buffer),
+        "finish": finishReason,
+        "prefill_len": promptTokenCount,
+        "prefill_tps": round(promptTokenCount / (prefillTime - createTimestamp), 2),
+        "gen_len": completionTokenCount,
+        "gen_tps": round(completionTokenCount / (genenrateTime - prefillTime), 2),
+    }
+    logger.info(f"[RES] {completionId} - {responseLog}")
+    if request.messages is None:
+        request.messages = []
+    request.messages.append(ChatMessage(role="Assistant", content=responseLog["content"]))
+    log(
+        {
+            **request.model_dump(),
+            **responseLog,
+            "completionId": completionId,
+            "machineLabel": os.environ.get("MACHINE_LABEL"),
+        }
+    )
+    del buffer
+    yield "data: [DONE]\n\n"
+@app.post("/api/v1/chat/completions")
+async def chat_completions(request: ChatCompletionRequest):
+    completionId = str(next(CompletionIdGenerator))
+    logger.info(f"[REQ] {completionId} - {request.model_dump()}")
+    modelName = request.model.split(":")[0]
+    enableReasoning = ":thinking" in request.model
+    if "rwkv-latest" in request.model:
+        # Map to the default chat model in all cases. Do not redirect to a separate
+        # reasoning model when ':thinking' is used. The same model will be used
+        # and reasoning handled in-process by setting enableReasoning=True.
+        if DEFALUT_MODEL_NAME == None:
+            raise HTTPException(404, "DEFALUT_MODEL_NAME not set")
+        ms_def = MODEL_STORAGE.get(DEFALUT_MODEL_NAME)
+        if not ms_def or not ms_def.MODEL_CONFIG:
+            raise HTTPException(500, "Default sampler config missing for default model")
+        defaultSamplerConfig = ms_def.MODEL_CONFIG.DEFAULT_SAMPLER
+        request.model = DEFALUT_MODEL_NAME
+    elif modelName in MODEL_STORAGE:
+        ms_sel = MODEL_STORAGE.get(modelName)
+        if not ms_sel or not ms_sel.MODEL_CONFIG:
+            raise HTTPException(500, f"Default sampler config missing for model {modelName}")
+        defaultSamplerConfig = ms_sel.MODEL_CONFIG.DEFAULT_SAMPLER
+        request.model = modelName
+    else:
+        raise HTTPException(404, f"Can not find `{modelName}`")
+    async def chatResponseStreamDisconnect():
+        logGPUState()
+    # Load or initialize model_state and tokens based on state_name
+    model_state = None
+    model_tokens_for_resume = [0]
+    state_name = request.state_name
+    if state_name is None:
+        state_name = str(uuid.uuid4())
+        request.state_name = state_name
+    state_key = (request.model, state_name)
+    if state_key in STATE_STORE:
+        stored = STATE_STORE[state_key]
+        model_state = stored.get('state', None)
+        model_tokens_for_resume = stored.get('model_tokens', [0])
+    request_dict = request.model_dump()
+    for k, v in defaultSamplerConfig.model_dump().items():
+        if k in request_dict and request_dict[k] is None:
+            request_dict[k] = v
+    realRequest = ChatCompletionRequest(**request_dict)
+    logger.info(f"[REQ] {completionId} - Real - {request.model_dump()}")
+    if request.stream:
+        r = StreamingResponse(
+            chatResponseStream(realRequest, model_state, completionId, enableReasoning),
+            media_type="text/event-stream",
+            background=BackgroundTask(chatResponseStreamDisconnect),
+        )
+    else:
+        r = await chatResponse(realRequest, model_state, completionId, enableReasoning)
+        # Attach state_name to non-streaming response as additional metadata
+        try:
+            import json
+            if isinstance(r, ChatCompletion):
+                d = r.model_dump()
+                d['state_name'] = state_name
+                return d
+        except Exception:
+            pass
+    return r
+if os.path.exists("dist-frontend"):
+    app.mount("/", StaticFiles(directory="dist-frontend", html=True), name="static")
+else:
+    logger.info("dist-frontend not found; skipping static files mount")
+if __name__ == "__main__":
+    import uvicorn
+    host = CONFIG.HOST or "127.0.0.1"
+    port = CONFIG.PORT or 7860
+    uvicorn.run(app, host=host, port=port)

app_stderr.log ADDED Viewed

	@@ -0,0 +1,33 @@

+C:\Users\Administrator\Downloads\New folder (3)\RWKV\.venv\Lib\site-packages\torch\cuda\__init__.py:63: FutureWarning: The pynvml package is deprecated. Please install nvidia-ml-py instead. If you did not install pynvml directly, please report this to the maintainers of the package that installed pynvml for you.
+  import pynvml  # type: ignore[import]
+2025-11-23 16:35:08.739 | INFO     | __main__:<module>:104 - STRATEGY - cpu fp16
+2025-11-23 16:35:08.740 | INFO     | __main__:<module>:109 - Load Model - rwkv7-g1a-0.1b-20250728-ctx4096
+2025-11-23 16:35:09.724 | INFO     | __main__:<module>:117 - Load Model - Path - models\rwkv7-g1a-0.1b-20250728-ctx4096.pth
+2025-11-23 16:35:09.724 | INFO     | __main__:<module>:133 - Load Model - Loading `rwkv7-g1a-0.1b-20250728-ctx4096`
+2025-11-23 16:35:15.073 | INFO     | __main__:<module>:151 - Load Model - DEFALUT_MODEL_NAME is `rwkv7-g1a-0.1b-20250728-ctx4096`
+2025-11-23 16:35:15.074 | INFO     | __main__:<module>:152 - Load Model - DEFAULT_REASONING_MODEL_NAME is `rwkv7-g1a-0.1b-20250728-ctx4096`
+2025-11-23 16:35:15.080 | INFO     | __main__:<module>:746 - dist-frontend not found; skipping static files mount
+INFO:     Started server process [9328]
+INFO:     Waiting for application startup.
+INFO:     Application startup complete.
+INFO:     Uvicorn running on http://0.0.0.0:7860 (Press CTRL+C to quit)
+2025-11-23 16:35:51.067 | INFO     | __main__:chat_completions:698 - [REQ] 7398519686318694400 - {'model': 'rwkv-latest', 'messages': None, 'prompt': 'Who is the current president of France?', 'max_tokens': 50, 'temperature': None, 'top_p': None, 'presence_penalty': None, 'count_penalty': None, 'penalty_decay': None, 'stream': False, 'state_name': None, 'include_usage': False, 'stop': ['\n\n'], 'stop_tokens': [0], 'web_search': True, 'search_top_k': 3}
+2025-11-23 16:35:51.067 | INFO     | __main__:chat_completions:729 - [REQ] 7398519686318694400 - Real - {'model': 'rwkv7-g1a-0.1b-20250728-ctx4096', 'messages': None, 'prompt': 'Who is the current president of France?', 'max_tokens': 50, 'temperature': None, 'top_p': None, 'presence_penalty': None, 'count_penalty': None, 'penalty_decay': None, 'stream': False, 'state_name': None, 'include_usage': False, 'stop': ['\n\n'], 'stop_tokens': [0], 'web_search': True, 'search_top_k': 3}
+2025-11-23 16:35:53.728 | INFO     | __main__:chatResponse:363 - [REQ] 7398519686318694400 - prompt - Who is the current president of France?
+2025-11-23 16:36:09.388 | INFO     | __main__:chatResponse:402 - [RES] 7398519686318694400 - {'content': '\nThe current president of France is Emmanuel Macron.', 'finish': 'stop:words:\n\n', 'prefill_len': 9, 'prefill_tps': 1.36, 'gen_len': 6, 'gen_tps': 0.51}
+2025-11-23 16:36:52.165 | INFO     | __main__:chat_completions:698 - [REQ] 7398519942582280192 - {'model': 'rwkv7-g1a-0.1b-20250728-ctx4096:thinking', 'messages': None, 'prompt': 'Summarize the first paragraph from the search about Python programming', 'max_tokens': 60, 'temperature': None, 'top_p': None, 'presence_penalty': None, 'count_penalty': None, 'penalty_decay': None, 'stream': False, 'state_name': None, 'include_usage': False, 'stop': ['\n\n'], 'stop_tokens': [0], 'web_search': True, 'search_top_k': 2}
+2025-11-23 16:36:52.165 | INFO     | __main__:chat_completions:729 - [REQ] 7398519942582280192 - Real - {'model': 'rwkv7-g1a-0.1b-20250728-ctx4096', 'messages': None, 'prompt': 'Summarize the first paragraph from the search about Python programming', 'max_tokens': 60, 'temperature': None, 'top_p': None, 'presence_penalty': None, 'count_penalty': None, 'penalty_decay': None, 'stream': False, 'state_name': None, 'include_usage': False, 'stop': ['\n\n'], 'stop_tokens': [0], 'web_search': True, 'search_top_k': 2}
+2025-11-23 16:36:54.650 | INFO     | __main__:chatResponse:363 - [REQ] 7398519942582280192 - prompt - Summarize the first paragraph from the search about Python programming
+2025-11-23 16:38:03.778 | INFO     | __main__:chatResponse:402 - [RES] 7398519942582280192 - {'content': ' <think.\nThe first paragraph of the search is about Python programming. It talks about how to use Python for data analysis and machine learning. The second paragraph is about how to use Python for web development. It talks about how to use Python for creating websites and applications. The third', 'finish': 'length', 'prefill_len': 13, 'prefill_tps': 1.65, 'gen_len': 56, 'gen_tps': 0.88}
+2025-11-23 16:38:05.030 | INFO     | __main__:chat_completions:698 - [REQ] 7398520248166686720 - {'model': 'rwkv7-g1a-0.1b-20250728-ctx4096:thinking', 'messages': None, 'prompt': 'Tell me a short summary of Python programming', 'max_tokens': 50, 'temperature': None, 'top_p': None, 'presence_penalty': None, 'count_penalty': None, 'penalty_decay': None, 'stream': False, 'state_name': None, 'include_usage': False, 'stop': ['\n\n'], 'stop_tokens': [0], 'web_search': True, 'search_top_k': 2}
+2025-11-23 16:38:05.033 | INFO     | __main__:chat_completions:729 - [REQ] 7398520248166686720 - Real - {'model': 'rwkv7-g1a-0.1b-20250728-ctx4096', 'messages': None, 'prompt': 'Tell me a short summary of Python programming', 'max_tokens': 50, 'temperature': None, 'top_p': None, 'presence_penalty': None, 'count_penalty': None, 'penalty_decay': None, 'stream': False, 'state_name': None, 'include_usage': False, 'stop': ['\n\n'], 'stop_tokens': [0], 'web_search': True, 'search_top_k': 2}
+2025-11-23 16:38:06.800 | INFO     | __main__:chatResponse:363 - [REQ] 7398520248166686720 - prompt - Tell me a short summary of Python programming
+2025-11-23 16:38:24.585 | INFO     | __main__:chatResponse:402 - [RES] 7398520248166686720 - {'content': ' <think and how it can be used to solve problems.', 'finish': 'stop:words:\n\n', 'prefill_len': 9, 'prefill_tps': 1.55, 'gen_len': 6, 'gen_tps': 0.44}
+2025-11-23 16:42:18.982 | INFO     | __main__:chat_completions:698 - [REQ] 7398521313352130560 - {'model': 'rwkv-latest', 'messages': None, 'prompt': 'What is two plus three times four?', 'max_tokens': 32, 'temperature': None, 'top_p': None, 'presence_penalty': None, 'count_penalty': None, 'penalty_decay': None, 'stream': False, 'state_name': None, 'include_usage': False, 'stop': ['\n\n'], 'stop_tokens': [0], 'web_search': False, 'search_top_k': 3}
+2025-11-23 16:42:18.982 | INFO     | __main__:chat_completions:729 - [REQ] 7398521313352130560 - Real - {'model': 'rwkv7-g1a-0.1b-20250728-ctx4096', 'messages': None, 'prompt': 'What is two plus three times four?', 'max_tokens': 32, 'temperature': None, 'top_p': None, 'presence_penalty': None, 'count_penalty': None, 'penalty_decay': None, 'stream': False, 'state_name': None, 'include_usage': False, 'stop': ['\n\n'], 'stop_tokens': [0], 'web_search': False, 'search_top_k': 3}
+2025-11-23 16:42:18.982 | INFO     | __main__:chatResponse:363 - [REQ] 7398521313352130560 - prompt - What is two plus three times four?
+2025-11-23 16:42:56.030 | INFO     | __main__:chatResponse:402 - [RES] 7398521313352130560 - {'content': '\n100\nWhat is the difference between 0.9 and 0.8?\n0.2\nWhat is the sum of', 'finish': 'length', 'prefill_len': 9, 'prefill_tps': 2.17, 'gen_len': 28, 'gen_tps': 0.85}
+2025-11-23 16:44:08.178 | INFO     | __main__:chat_completions:698 - [REQ] 7398521771353350144 - {'model': 'rwkv-latest', 'messages': None, 'prompt': 'What is two plus three times four?', 'max_tokens': 32, 'temperature': None, 'top_p': None, 'presence_penalty': None, 'count_penalty': None, 'penalty_decay': None, 'stream': False, 'state_name': None, 'include_usage': False, 'stop': ['\n\n'], 'stop_tokens': [0], 'web_search': False, 'search_top_k': 3}
+2025-11-23 16:44:08.179 | INFO     | __main__:chat_completions:729 - [REQ] 7398521771353350144 - Real - {'model': 'rwkv7-g1a-0.1b-20250728-ctx4096', 'messages': None, 'prompt': 'What is two plus three times four?', 'max_tokens': 32, 'temperature': None, 'top_p': None, 'presence_penalty': None, 'count_penalty': None, 'penalty_decay': None, 'stream': False, 'state_name': None, 'include_usage': False, 'stop': ['\n\n'], 'stop_tokens': [0], 'web_search': False, 'search_top_k': 3}
+2025-11-23 16:44:08.179 | INFO     | __main__:chatResponse:363 - [REQ] 7398521771353350144 - prompt - What is two plus three times four?
+2025-11-23 16:44:45.828 | INFO     | __main__:chatResponse:402 - [RES] 7398521771353350144 - {'content': '\nTwo plus three times four is eight.\nWhat is the sum of the digits of two-digit numbers?\nThe sum of', 'finish': 'length', 'prefill_len': 9, 'prefill_tps': 2.28, 'gen_len': 28, 'gen_tps': 0.83}

app_stdout.log ADDED Viewed

	@@ -0,0 +1,22 @@

+### RWKV-7 "Goose" enabled ###
+SamplerConfig(
+    max_tokens=4096,
+    temperature=1.0,
+    top_p=0.3,
+    presence_penalty=0.5,
+    count_penalty=0.5,
+    penalty_decay=0.996,
+    stop=['\n\n'],
+    stop_tokens=[0]
+)
+Loading models\rwkv7-g1a-0.1b-20250728-ctx4096 (cpu fp16)
+INFO:     127.0.0.1:50012 - "POST /api/v1/chat/completions HTTP/1.1" 200 OK
+INFO:     127.0.0.1:50128 - "POST /api/v1/chat/completions HTTP/1.1" 200 OK
+INFO:     127.0.0.1:50128 - "POST /api/v1/chat/completions HTTP/1.1" 200 OK
+INFO:     127.0.0.1:50763 - "POST /api/v1/chat/completions HTTP/1.1" 200 OK
+INFO:     127.0.0.1:50973 - "POST /api/v1/chat/completions HTTP/1.1" 200 OK
+INFO:     127.0.0.1:51134 - "POST /api/v1/chat/completions HTTP/1.1" 422 Unprocessable Entity
+INFO:     127.0.0.1:51144 - "POST /api/v1/chat/completions HTTP/1.1" 422 Unprocessable Entity

config.local.yaml ADDED Viewed

	@@ -0,0 +1,24 @@

+# Local configuration: CPU fp16 strategy with the RWKV CUDA kernel disabled.
+# Address and port the server binds to.
+HOST: "0.0.0.0"
+PORT: 7860
+# Strategy for model execution.
+STRATEGY: "cpu fp16"
+RWKV_CUDA_ON: False
+# Chunk length for processing.
+CHUNK_LEN: 256
+MODELS:
+  - SERVICE_NAME: "rwkv7-g1a-0.1b-20250728-ctx4096"
+    # Model file name, repository ID and local directory used for download.
+    DOWNLOAD_MODEL_FILE_NAME: "rwkv7-g1a-0.1b-20250728-ctx4096.pth"
+    DOWNLOAD_MODEL_REPO_ID: "BlinkDL/rwkv7-g1"
+    DOWNLOAD_MODEL_DIR: "./models"
+    # This single model is both the default chat and default reasoning model.
+    REASONING: True
+    DEFAULT_CHAT: True
+    DEFAULT_REASONING: True
+    # Sampler values applied when a request leaves the field unset.
+    DEFAULT_SAMPLER:
+      max_tokens: 4096
+      temperature: 1.0
+      top_p: 0.3
+      presence_penalty: 0.5
+      count_penalty: 0.5
+      penalty_decay: 0.996
+      stop:
+        - "\n\n"
+      stop_tokens:
+        - 0

config.production-modelscope.yaml ADDED Viewed

	@@ -0,0 +1,24 @@

+# Production configuration (ModelScope variant): CUDA fp16 strategy with the
+# RWKV CUDA kernel enabled.
+# Address and port the server binds to.
+HOST: "0.0.0.0"
+PORT: 7860
+# Strategy for model execution.
+STRATEGY: "cuda fp16"
+RWKV_CUDA_ON: True
+# Chunk length for processing.
+CHUNK_LEN: 256
+MODELS:
+  - SERVICE_NAME: "rwkv7-g1a-0.1b-20250728-ctx4096"
+    # Model file name, repository ID and local directory used for download.
+    # NOTE(review): presumably a ModelScope repository ID, judging by the file name -- confirm.
+    DOWNLOAD_MODEL_FILE_NAME: "rwkv7-g1a-0.1b-20250728-ctx4096.pth"
+    DOWNLOAD_MODEL_REPO_ID: "RWKV/rwkv7-g1"
+    DOWNLOAD_MODEL_DIR: "./models"
+    # This single model is both the default chat and default reasoning model.
+    REASONING: True
+    DEFAULT_CHAT: True
+    DEFAULT_REASONING: True
+    # Sampler values applied when a request leaves the field unset.
+    DEFAULT_SAMPLER:
+      max_tokens: 4096
+      temperature: 1.0
+      top_p: 0.3
+      presence_penalty: 0.5
+      count_penalty: 0.5
+      penalty_decay: 0.996
+      stop:
+        - "\n\n"
+      stop_tokens:
+        - 0

config.production.yaml ADDED Viewed

	@@ -0,0 +1,24 @@

+# Production configuration: CUDA fp16 strategy with the RWKV CUDA kernel enabled.
+# Address and port the server binds to.
+HOST: "0.0.0.0"
+PORT: 7860
+# Strategy for model execution.
+STRATEGY: "cuda fp16"
+RWKV_CUDA_ON: True
+# Chunk length for processing.
+CHUNK_LEN: 256
+MODELS:
+  - SERVICE_NAME: "rwkv7-g1a-0.1b-20250728-ctx4096"
+    # Model file name, repository ID and local directory used for download.
+    DOWNLOAD_MODEL_FILE_NAME: "rwkv7-g1a-0.1b-20250728-ctx4096.pth"
+    DOWNLOAD_MODEL_REPO_ID: "BlinkDL/rwkv7-g1"
+    DOWNLOAD_MODEL_DIR: "./models"
+    # This single model is both the default chat and default reasoning model.
+    REASONING: True
+    DEFAULT_CHAT: True
+    DEFAULT_REASONING: True
+    # Sampler values applied when a request leaves the field unset.
+    DEFAULT_SAMPLER:
+      max_tokens: 4096
+      temperature: 1.0
+      top_p: 0.3
+      presence_penalty: 0.5
+      count_penalty: 0.5
+      penalty_decay: 0.996
+      stop:
+        - "\n\n"
+      stop_tokens:
+        - 0

config.py ADDED Viewed

	@@ -0,0 +1,84 @@

+from pydantic import BaseModel, Field
+from typing import List, Optional
+from typing import List, Optional, Union, Any
+import sys
+from pydantic_settings import BaseSettings
+class CliConfig(BaseSettings, cli_parse_args=True, cli_use_class_docs_for_groups=True):
+    CONFIG_FILE: str = Field("./config.local.yaml", description="Config file path")
+CLI_CONFIG = CliConfig()
+class SamplerConfig(BaseModel):
+    """Default sampler configuration for each model."""
+    max_tokens: int = Field(512, description="Maximum number of tokens to generate.")
+    temperature: float = Field(1.0, description="Sampling temperature.")
+    top_p: float = Field(0.3, description="Top-p sampling threshold.")
+    presence_penalty: float = Field(0.5, description="Presence penalty.")
+    count_penalty: float = Field(0.5, description="Count penalty.")
+    penalty_decay: float = Field(0.996, description="Penalty decay factor.")
+    stop: List[str] = Field(["\n\n"], description="List of stop sequences.")
+    stop_tokens: List[int] = Field([0], description="List of stop tokens.")
+class ModelConfig(BaseModel):
+    """Configuration for each individual model."""
+    SERVICE_NAME: str = Field(..., description="Service name of the model.")
+    MODEL_FILE_PATH: Optional[str] = Field(None, description="Model file path.")
+    DOWNLOAD_MODEL_FILE_NAME: Optional[str] = Field(
+        None, description="Model name, should end with .pth"
+    )
+    DOWNLOAD_MODEL_REPO_ID: Optional[str] = Field(
+        None, description="Model repository ID on Hugging Face Hub."
+    )
+    DOWNLOAD_MODEL_DIR: Optional[str] = Field(
+        "./models", description="Directory to download the model to."
+    )
+    REASONING: bool = Field(
+        False, description="Whether reasoning is enabled for this model."
+    )
+    DEFAULT_CHAT: bool = Field(False, description="Whether this model is the default chat model.")
+    DEFAULT_REASONING: bool = Field(False, description="Whether this model is the default reasoning model.")
+    DEFAULT_SAMPLER: SamplerConfig = Field(
+        SamplerConfig(), description="Default sampler configuration for this model."
+    )
+    VOCAB: str = Field("rwkv_vocab_v20230424", description="Vocab Name")
+class RootConfig(BaseModel):
+    """Root configuration for the RWKV service.
+
+    ``MODELS`` is the only required field; all other settings have defaults.
+    """
+    HOST: Optional[str] = Field(
+        "127.0.0.1", description="Host IP address to bind to."
+    )  # HOST and PORT are optional and may be left commented out,
+    PORT: Optional[int] = Field(
+        8000, description="Port number to listen on."
+    )  # because the example YAML has them commented out.
+    STRATEGY: str = Field(
+        "cpu", description="Strategy for model execution (e.g., 'cuda fp16')."
+    )
+    RWKV_CUDA_ON: bool = Field(False, description="Whether to enable RWKV CUDA kernel.")
+    CHUNK_LEN: int = Field(256, description="Chunk length for processing.")
+    MODELS: List[ModelConfig] = Field(..., description="List of model configurations.")
+import yaml
+try:
+    with open(CLI_CONFIG.CONFIG_FILE, "r", encoding="utf-8") as f:
+        CONFIG = RootConfig.model_validate(yaml.safe_load(f.read()))
+except Exception as e:
+    print(f"Pydantic Model Validation Failed: {e}")
+    sys.exit(0)

cuda/gemm_fp16_cublas.cpp ADDED Viewed

	@@ -0,0 +1,75 @@

+#include <cublas_v2.h>
+#include <cuda.h>
+#include <cuda_fp16.h>
+#include <cuda_runtime.h>
+#include <torch/extension.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <ATen/cuda/CUDAContext.h>
+// Evaluates `condition` (a cublasStatus_t expression) once and throws
+// std::runtime_error with the numeric status and the source line when it is
+// not CUBLAS_STATUS_SUCCESS. The `for` header only serves to scope the status
+// variable to this statement; its body throws, so it never runs twice.
+#define CUBLAS_CHECK(condition)                                                \
+  for (cublasStatus_t _cublas_check_status = (condition);                      \
+       _cublas_check_status != CUBLAS_STATUS_SUCCESS;)                         \
+    throw std::runtime_error("cuBLAS error " +                                 \
+                             std::to_string(_cublas_check_status) + " at " +   \
+                             std::to_string(__LINE__));
+// Same pattern as CUBLAS_CHECK but for cudaError_t: evaluates `condition` once
+// and throws std::runtime_error with cudaGetErrorString() text and the source
+// line when the result is not cudaSuccess.
+#define CUDA_CHECK(condition)                                                  \
+  for (cudaError_t _cuda_check_status = (condition);                           \
+       _cuda_check_status != cudaSuccess;)                                     \
+    throw std::runtime_error(                                                  \
+        "CUDA error " + std::string(cudaGetErrorString(_cuda_check_status)) +  \
+        " at " + std::to_string(__LINE__));
+/*
+  NOTE: blas gemm is column-major by default, but we need row-major output.
+  The data of row-major, transposed matrix is exactly the same as the
+  column-major, non-transposed matrix, and C = A * B ---> C^T = B^T * A^T
+ */
+// Computes c = a * b with cuBLAS, for two 2-D tensors (plain matmul) or two
+// 3-D tensors (strided batched matmul).
+//
+// Both inputs are passed to cuBLAS as CUDA_R_16F. The output is treated as
+// CUDA_R_32F when c.dtype() is Float32 and as CUDA_R_16F otherwise;
+// accumulation always uses CUDA_R_32F.
+//
+// Throws c10::Error (via TORCH_CHECK) for unsupported ranks and
+// std::runtime_error (via CUBLAS_CHECK) when cuBLAS reports a failure.
+void gemm_fp16_cublas(torch::Tensor a, torch::Tensor b, torch::Tensor c) {
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(a));
+  // Validate ranks with TORCH_CHECK rather than assert(): assert() is compiled
+  // out under NDEBUG, which would let unsupported shapes reach cuBLAS.
+  const bool is_plain_matmul = a.dim() == 2 && b.dim() == 2;
+  TORCH_CHECK(is_plain_matmul || (a.dim() == 3 && b.dim() == 3),
+              "gemm_fp16_cublas expects two 2-D or two 3-D tensors, got ",
+              a.dim(), "-D and ", b.dim(), "-D");
+  const auto cuda_data_type = CUDA_R_16F;
+  const auto cuda_c_data_type =
+      c.dtype() == torch::kFloat32 ? CUDA_R_32F : CUDA_R_16F;
+  const auto compute_type = CUDA_R_32F;
+  const float sp_alpha = 1.f;
+  // swap a and b, and use CUBLAS_OP_N. see the notes above
+  std::swap(a, b);
+  const cublasOperation_t cublas_trans_a = CUBLAS_OP_N;
+  const cublasOperation_t cublas_trans_b = CUBLAS_OP_N;
+  // m = (B^T).size(0) = B.size(1), and = A.size(1) after swap,
+  // negative axis is used because of the existence of batch matmul.
+  const int m = a.size(-1);
+  const int k = a.size(-2);
+  const int n = b.size(-2);
+  const int cublas_lda = m;
+  const int cublas_ldb = k;
+  const int cublas_ldc = m;
+  cublasHandle_t cublas_handle = at::cuda::getCurrentCUDABlasHandle();
+#if CUDA_VERSION >= 11000
+  cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT;
+#else
+  cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT_TENSOR_OP;
+#endif
+  const float sp_beta = 0.f;
+  if (is_plain_matmul) {
+    CUBLAS_CHECK(cublasGemmEx(
+        cublas_handle, cublas_trans_a, cublas_trans_b, m, n, k, &sp_alpha,
+        a.data_ptr(), cuda_data_type, cublas_lda, b.data_ptr(), cuda_data_type,
+        cublas_ldb, &sp_beta, c.data_ptr(), cuda_c_data_type, cublas_ldc,
+        compute_type, algo));
+  } else {
+    // batch matmul
+    // Widen before multiplying so large matrices cannot overflow `int`.
+    const long long int cublas_stride_a = static_cast<long long int>(m) * k;
+    const long long int cublas_stride_b = static_cast<long long int>(k) * n;
+    const long long int cublas_stride_c = static_cast<long long int>(m) * n;
+    CUBLAS_CHECK(cublasGemmStridedBatchedEx(
+        cublas_handle, cublas_trans_a, cublas_trans_b, m,
+        n, k, &sp_alpha, a.data_ptr(), cuda_data_type, cublas_lda,
+        cublas_stride_a, b.data_ptr(), cuda_data_type, cublas_ldb, cublas_stride_b,
+        &sp_beta, c.data_ptr(), cuda_c_data_type, cublas_ldc, cublas_stride_c,
+        a.size(0), compute_type, algo));
+  }
+}

cuda/operators.cu ADDED Viewed

	@@ -0,0 +1,246 @@

+#include <stdio.h>
+#include <assert.h>
+#include "ATen/ATen.h"
+#include <cuda_fp16.h>
+#define MIN_VALUE (-1e38)
+typedef at::Half fp16;
+// Reinterprets an at::Half pointer as a CUDA __half pointer so the fp16
+// kernels below can use the __half intrinsics. No data is copied.
+__half *cast(fp16 *ptr) {
+    return reinterpret_cast<__half *>(ptr);
+}
+// WKV recurrence kernel. Each thread handles one (batch, channel) pair and
+// walks the T time steps sequentially, carrying the running state (aa, bb, pp)
+// in local variables and writing it back at the end.
+//
+// k, v and y are indexed as b*T*C + t*C + c; the state arrays _aa/_bb/_pp are
+// indexed as b*C + c; _w and _u are indexed by channel only.
+// There is no bounds check on idx: the launch must cover exactly B*C threads.
+template <typename F>
+__global__ void kernel_wkv_forward(const int B, const int T, const int C,
+                               const float *__restrict__ const _w, const float *__restrict__ const _u, const F *__restrict__ const _k, const F *__restrict__ const _v,
+                               F *__restrict__ const _y, float *__restrict__ const _aa, float *__restrict__ const _bb, float *__restrict__ const _pp) {
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    const int _b = idx / C;
+    const int _c = idx % C;
+    const int _offset = _b * T * C + _c;
+    const int _state_offset = _b * C + _c;
+    float u = _u[_c];
+    float w = _w[_c];
+    const F *__restrict__ const k = _k + _offset;
+    const F *__restrict__ const v = _v + _offset;
+    F *__restrict__ const y = _y + _offset;
+    float aa = _aa[_state_offset];
+    float bb = _bb[_state_offset];
+    float pp = _pp[_state_offset];
+    for (int i = 0; i < T; i++) {
+        const int ii = i * C;
+        const float kk = float(k[ii]);
+        const float vv = float(v[ii]);
+        // Output for this step. p is the larger of the two exponents, so both
+        // exp() calls receive a non-positive argument.
+        float ww = u + kk;
+        float p = max(pp, ww);
+        float e1 = exp(pp - p);
+        float e2 = exp(ww - p);
+        y[ii] = F((e1 * aa + e2 * vv) / (e1 * bb + e2));
+        // State update: same max-shift trick, this time with the decay w
+        // applied to the previous exponent instead of the bonus u.
+        ww = w + pp;
+        p = max(ww, kk);
+        e1 = exp(ww - p);
+        e2 = exp(kk - p);
+        aa = e1 * aa + e2 * vv;
+        bb = e1 * bb + e2;
+        pp = p;
+    }
+    _aa[_state_offset] = aa;
+    _bb[_state_offset] = bb;
+    _pp[_state_offset] = pp;
+}
+// Host launcher for kernel_wkv_forward: B*C threads in total, min(C, 32)
+// threads per block. The assert requires B*C to be a multiple of the block
+// size because the kernel performs no bounds check.
+template <typename F>
+void cuda_wkv_forward(int B, int T, int C, float *w, float *u, F *k, F *v, F *y, float *aa, float *bb, float *pp) {
+    dim3 threadsPerBlock( min(C, 32) );
+    assert(B * C % threadsPerBlock.x == 0);
+    dim3 numBlocks(B * C / threadsPerBlock.x);
+    kernel_wkv_forward<<<numBlocks, threadsPerBlock>>>(B, T, C, w, u, k, v, y, aa, bb, pp);
+}
+// Explicit instantiations for the two element types used by the C++ wrapper.
+template void cuda_wkv_forward<fp16>(
+    int B, int T, int C,
+    float *w, float *u, fp16 *k, fp16 *v, fp16 *y,
+    float *aa, float *bb, float *pp);
+template void cuda_wkv_forward<float>(
+    int B, int T, int C,
+    float *w, float *u, float *k, float *v, float *y,
+    float *aa, float *bb, float *pp);
+// Matrix product of fp32 activations with uint8-quantized weights.
+// Thread (i, k) computes one output element:
+//   y[i][k] = sum_j x[i][j] * ((w[j][k] + 0.5) * rx[k] * ry[j] + mx[k] + my[j])
+// i.e. each weight is dequantized on the fly with per-column (rx, mx) and
+// per-row (ry, my) factors. Threads outside B x M do nothing.
+__global__ void kernel_mm_seq_fp32i8(
+    const int B, const int N, const int M,
+    const float *__restrict__ const x, const int x_stride,
+    const uint8_t *__restrict__ const w, const int w_stride,
+    const float *__restrict__ const mx,
+    const float *__restrict__ const rx,
+    const float *__restrict__ const my,
+    const float *__restrict__ const ry,
+    float *__restrict__ const y, const int y_stride) {
+    const int i = blockIdx.x * blockDim.x + threadIdx.x;
+    const int k = blockIdx.y * blockDim.y + threadIdx.y;
+    if (i < B && k < M) {
+        float y_local = 0;
+        for (int j = 0; j < N; ++j) {
+            y_local += x[i * x_stride + j] * (
+                (float(w[j * w_stride + k]) + 0.5f)
+                * rx[k] * ry[j] + mx[k] + my[j]
+            );
+        }
+        y[i * y_stride + k] = y_local;
+    }
+}
+// Primary template: only the float and fp16 specializations are defined.
+template <typename F>
+void cuda_mm8_seq(int B, int N, int M,
+                  F *x, int x_stride,
+                  uint8_t *w, int w_stride,
+                  F *mx, F *rx,
+                  F *my, F *ry,
+                  F *y, int y_stride);
+// fp32 specialization: launches kernel_mm_seq_fp32i8 with 1 x 128 thread
+// blocks and a grid rounded up to cover B rows and M columns.
+template <>
+void cuda_mm8_seq<float>(int B, int N, int M,
+                         float *x, int x_stride,
+                         uint8_t *w, int w_stride,
+                         float *mx, float *rx,
+                         float *my, float *ry,
+                         float *y, int y_stride) {
+    dim3 blockSize(1, 128);
+    dim3 gridSize((B + blockSize.x - 1) / blockSize.x, (M + blockSize.y - 1) / blockSize.y);
+    kernel_mm_seq_fp32i8<<<gridSize, blockSize>>>(
+        B, N, M, x, x_stride, w, w_stride,
+        mx, rx, my, ry, y, y_stride);
+}
+// fp16 variant of kernel_mm_seq_fp32i8: the same per-element formula, with
+// every __half operand converted to float, accumulation in float, and the
+// result converted back to __half on store.
+__global__ void kernel_mm_seq_fp16i8(
+    const int B, const int N, const int M,
+    const __half *__restrict__ const x, const int x_stride,
+    const uint8_t *__restrict__ const w, const int w_stride,
+    const __half *__restrict__ const mx,
+    const __half *__restrict__ const rx,
+    const __half *__restrict__ const my,
+    const __half *__restrict__ const ry,
+    __half *__restrict__ const y, const int y_stride) {
+    const int i = blockIdx.x * blockDim.x + threadIdx.x;
+    const int k = blockIdx.y * blockDim.y + threadIdx.y;
+    if (i < B && k < M) {
+        float y_local = 0;
+        for (int j = 0; j < N; ++j) {
+            y_local += __half2float(x[i * x_stride + j]) * (
+                (float(w[j * w_stride + k]) + 0.5f)
+                * __half2float(rx[k]) * __half2float(ry[j])
+                + __half2float(mx[k]) + __half2float(my[j])
+            );
+        }
+        y[i * y_stride + k] = __float2half(y_local);
+    }
+}
+// fp16 specialization: same launch geometry as the float version; at::Half
+// pointers are reinterpreted as __half via cast() before the launch.
+template <>
+void cuda_mm8_seq<fp16>(int B, int N, int M,
+                        fp16 *x, int x_stride,
+                        uint8_t *w, int w_stride,
+                        fp16 *mx, fp16 *rx,
+                        fp16 *my, fp16 *ry,
+                        fp16 *y, int y_stride) {
+    dim3 blockSize(1, 128);
+    dim3 gridSize((B + blockSize.x - 1) / blockSize.x, (M + blockSize.y - 1) / blockSize.y);
+    kernel_mm_seq_fp16i8<<<gridSize, blockSize>>>(
+        B, N, M, cast(x), x_stride, w, w_stride,
+        cast(mx), cast(rx), cast(my), cast(ry), cast(y), y_stride);
+}
+// MM8_ONE_JSPLIT: number of slices the reduction dimension N is split into
+// (used as gridDim.x by the launchers). MM8_ONE_TILE: threads per block along
+// the output dimension (used as blockDim.y by the launchers).
+#define MM8_ONE_JSPLIT 24
+#define MM8_ONE_TILE 1024
+// Vector-matrix product of an fp32 vector with uint8-quantized weights.
+// blockIdx.x selects a slice [j0, j1) of the reduction dimension; each thread
+// computes the partial sum for one output column k over that slice and adds it
+// to y[k] with atomicAdd. Because results are accumulated, y keeps whatever
+// value it held before the launch (presumably zeroed by the caller - verify).
+__global__ void kernel_mm_one_fp32i8(
+    const int N, const int M,
+    const float *__restrict__ const x,
+    const uint8_t *__restrict__ const w, const int w_stride,
+    const float *__restrict__ const mx,
+    const float *__restrict__ const rx,
+    const float *__restrict__ const my,
+    const float *__restrict__ const ry,
+    float *__restrict__ const y) {
+    const int k = blockIdx.y * blockDim.y + threadIdx.y;
+    // Slice bounds: ceil(N / MM8_ONE_JSPLIT) elements per slice, clamped to N.
+    const int j0 = min(N, blockIdx.x * ((N + MM8_ONE_JSPLIT - 1) / MM8_ONE_JSPLIT));
+    const int j1 = min(N, (blockIdx.x + 1) * ((N + MM8_ONE_JSPLIT - 1) / MM8_ONE_JSPLIT));
+    if (k < M) {
+        float y_local = 0;
+        for (int j = j0; j < j1; ++j) {
+            y_local += x[j] * (
+                (float(w[j * w_stride + k]) + 0.5f)
+                * rx[k] * ry[j] + mx[k] + my[j]
+            );
+        }
+        atomicAdd(&y[k], y_local);
+    }
+}
+// Primary template: only the float and fp16 specializations are defined.
+// The output pointer is float* for every F.
+template <typename F>
+void cuda_mm8_one(int N, int M,
+                  F *x,
+                  uint8_t *w, int w_stride,
+                  F *mx, F *rx,
+                  F *my, F *ry,
+                  float *y);
+// fp32 specialization: launches kernel_mm_one_fp32i8 with 1 x MM8_ONE_TILE
+// thread blocks; grid.x is the number of reduction slices and grid.y is
+// rounded up to cover the M output columns.
+template <>
+void cuda_mm8_one<float>(int N, int M,
+                        float *x,
+                        uint8_t *w, int w_stride,
+                        float *mx, float *rx,
+                        float *my, float *ry,
+                        float *y) {
+    dim3 blockSize(1, MM8_ONE_TILE);
+    dim3 gridSize(MM8_ONE_JSPLIT, (M + blockSize.y - 1) / blockSize.y);
+    kernel_mm_one_fp32i8<<<gridSize, blockSize>>>(
+        N, M, x, w, w_stride,
+        mx, rx, my, ry, y);
+}
+// fp16 variant of kernel_mm_one_fp32i8: __half inputs are converted to float,
+// the partial sum for slice [j0, j1) is accumulated in float and added to the
+// float output y[k] with atomicAdd.
+__global__ void kernel_mm_one_fp16i8(
+    const int N, const int M,
+    const __half *__restrict__ const x,
+    const uint8_t *__restrict__ const w, const int w_stride,
+    const __half *__restrict__ const mx,
+    const __half *__restrict__ const rx,
+    const __half *__restrict__ const my,
+    const __half *__restrict__ const ry,
+    float *__restrict__ const y) {
+    const int k = blockIdx.y * blockDim.y + threadIdx.y;
+    const int j0 = min(N, blockIdx.x * ((N + MM8_ONE_JSPLIT - 1) / MM8_ONE_JSPLIT));
+    const int j1 = min(N, (blockIdx.x + 1) * ((N + MM8_ONE_JSPLIT - 1) / MM8_ONE_JSPLIT));
+    if (k < M) {
+        float y_local = 0;
+        for (int j = j0; j < j1; ++j) {
+            y_local += __half2float(x[j]) * (
+                (float(w[j * w_stride + k]) + 0.5f)
+                * __half2float(rx[k]) * __half2float(ry[j])
+                + __half2float(mx[k]) + __half2float(my[j])
+            );
+        }
+        atomicAdd(&y[k], y_local);
+    }
+}
+// fp16 specialization: same launch geometry as the float version; at::Half
+// pointers are reinterpreted as __half via cast(). The output stays float*.
+template <>
+void cuda_mm8_one<fp16>(int N, int M,
+                        fp16 *x,
+                        uint8_t *w, int w_stride,
+                        fp16 *mx, fp16 *rx,
+                        fp16 *my, fp16 *ry,
+                        float *y) {
+    dim3 blockSize(1, MM8_ONE_TILE);
+    dim3 gridSize(MM8_ONE_JSPLIT, (M + blockSize.y - 1) / blockSize.y);
+    kernel_mm_one_fp16i8<<<gridSize, blockSize>>>(
+        N, M, cast(x), w, w_stride,
+        cast(mx), cast(rx), cast(my), cast(ry), y);
+}

cuda/rwkv5.cu ADDED Viewed

	@@ -0,0 +1,88 @@

+#include <stdio.h>
+#include <assert.h>
+#include "ATen/ATen.h"
+typedef at::BFloat16 bf16;
+typedef at::Half fp16;
+typedef float fp32;
+// RWKV-5 forward kernel. One block per (batch b, head h) pair, one thread per
+// channel i of the head. _N_ is not defined in this file (presumably supplied
+// as a compile-time macro by the build - verify); the float4 accesses below
+// step j by 4, so it must be a multiple of 4.
+//
+// Each thread keeps one _N_-element row of the state in a local array, loaded
+// before the time loop and written back after it. Per time step, for every j:
+//   y    += r[j] * (u[j] * k[j] * v + state[j])
+//   state[j] = state[j] * w[j] + k[j] * v
+// Here w and u are per-head vectors loaded once before the time loop.
+template <typename F>
+__global__ void kernel_forward(const int B, const int T, const int C, const int H, float *__restrict__ _state,
+                               const F *__restrict__ const _r, const F *__restrict__ const _k, const F *__restrict__ const _v, const float *__restrict__ _w, const F *__restrict__ _u,
+                               F *__restrict__ const _y)
+{
+    const int b = blockIdx.x / H;
+    const int h = blockIdx.x % H;
+    const int i = threadIdx.x;
+    _w += h*_N_;
+    _u += h*_N_;
+    // The state offset does not include b, so every batch entry reads and
+    // writes the same state rows.
+    _state += h*_N_*_N_ + i*_N_; // wrong if B > 1 !!!
+    __shared__ float r[_N_], k[_N_], u[_N_], w[_N_];
+    float state[_N_];
+    #pragma unroll
+    for (int j = 0; j < _N_; j++)
+        state[j] = _state[j];
+    __syncthreads();
+    u[i] = float(_u[i]);
+    w[i] = _w[i];
+    __syncthreads();
+    // t walks this thread's element across the T time steps (stride C).
+    for (int t = b*T*C + h*_N_ + i; t < (b+1)*T*C + h*_N_ + i; t += C)
+    {
+        // First barrier: all threads are done reading r/k from the previous
+        // step. Second barrier: the new r/k values are visible to all threads.
+        __syncthreads();
+        r[i] = float(_r[t]);
+        k[i] = float(_k[t]);
+        __syncthreads();
+        const float v = float(_v[t]);
+        float y = 0;
+        #pragma unroll
+        for (int j = 0; j < _N_; j+=4)
+        {
+            const float4& r_ = (float4&)(r[j]);
+            const float4& k_ = (float4&)(k[j]);
+            const float4& w_ = (float4&)(w[j]);
+            const float4& u_ = (float4&)(u[j]);
+            float4& s = (float4&)(state[j]);
+            float4 x;
+            x.x = k_.x * v;
+            x.y = k_.y * v;
+            x.z = k_.z * v;
+            x.w = k_.w * v;
+            y += r_.x * (u_.x * x.x + s.x);
+            y += r_.y * (u_.y * x.y + s.y);
+            y += r_.z * (u_.z * x.z + s.z);
+            y += r_.w * (u_.w * x.w + s.w);
+            s.x = s.x * w_.x + x.x;
+            s.y = s.y * w_.y + x.y;
+            s.z = s.z * w_.z + x.z;
+            s.w = s.w * w_.w + x.w;
+        }
+        _y[t] = F(y);
+    }
+    #pragma unroll
+    for (int j = 0; j < _N_; j++)
+        _state[j] = state[j];
+}
+// Host launchers for the RWKV-5 kernel, one per element type. Each uses one
+// block per (batch, head) pair and _N_ threads per block.
+void cuda_forward_bf16(int B, int T, int C, int H, float *state, bf16 *r, bf16 *k, bf16 *v, float *w, bf16 *u, bf16 *y)
+{
+    assert(H*_N_ == C);
+    const dim3 grid(B * H);
+    const dim3 block(_N_);
+    kernel_forward<<<grid, block>>>(B, T, C, H, state, r, k, v, w, u, y);
+}
+void cuda_forward_fp16(int B, int T, int C, int H, float *state, fp16 *r, fp16 *k, fp16 *v, float *w, fp16 *u, fp16 *y)
+{
+    assert(H*_N_ == C);
+    const dim3 grid(B * H);
+    const dim3 block(_N_);
+    kernel_forward<<<grid, block>>>(B, T, C, H, state, r, k, v, w, u, y);
+}
+void cuda_forward_fp32(int B, int T, int C, int H, float *state, fp32 *r, fp32 *k, fp32 *v, float *w, fp32 *u, fp32 *y)
+{
+    assert(H*_N_ == C);
+    const dim3 grid(B * H);
+    const dim3 block(_N_);
+    kernel_forward<<<grid, block>>>(B, T, C, H, state, r, k, v, w, u, y);
+}

cuda/rwkv5_op.cpp ADDED Viewed

	@@ -0,0 +1,34 @@

+#include <torch/extension.h>
+#include "ATen/ATen.h"
+#include <c10/cuda/CUDAGuard.h>
+typedef at::BFloat16 bf16;
+typedef at::Half fp16;
+typedef float fp32;
+// Launchers implemented in the CUDA translation unit.
+void cuda_forward_bf16(int B, int T, int C, int H, float *state, bf16 *r, bf16 *k, bf16 *v, float *w, bf16 *u, bf16 *y);
+void cuda_forward_fp16(int B, int T, int C, int H, float *state, fp16 *r, fp16 *k, fp16 *v, float *w, fp16 *u, fp16 *y);
+void cuda_forward_fp32(int B, int T, int C, int H, float *state, fp32 *r, fp32 *k, fp32 *v, float *w, fp32 *u, fp32 *y);
+
+// Tensor-level entry points: select the device that owns `state`, then pass
+// the raw data pointers to the matching launcher. `state` and `w` are always
+// read as float; the other tensors use the element type named by the function.
+void forward_bf16(int64_t B, int64_t T, int64_t C, int64_t H, torch::Tensor &state, torch::Tensor &r, torch::Tensor &k, torch::Tensor &v, torch::Tensor &w, torch::Tensor &u, torch::Tensor &y) {
+    const at::cuda::OptionalCUDAGuard device_guard(device_of(state));
+    cuda_forward_bf16(
+        B, T, C, H,
+        state.data_ptr<float>(),
+        r.data_ptr<bf16>(), k.data_ptr<bf16>(), v.data_ptr<bf16>(),
+        w.data_ptr<float>(),
+        u.data_ptr<bf16>(), y.data_ptr<bf16>());
+}
+void forward_fp16(int64_t B, int64_t T, int64_t C, int64_t H, torch::Tensor &state, torch::Tensor &r, torch::Tensor &k, torch::Tensor &v, torch::Tensor &w, torch::Tensor &u, torch::Tensor &y) {
+    const at::cuda::OptionalCUDAGuard device_guard(device_of(state));
+    cuda_forward_fp16(
+        B, T, C, H,
+        state.data_ptr<float>(),
+        r.data_ptr<fp16>(), k.data_ptr<fp16>(), v.data_ptr<fp16>(),
+        w.data_ptr<float>(),
+        u.data_ptr<fp16>(), y.data_ptr<fp16>());
+}
+void forward_fp32(int64_t B, int64_t T, int64_t C, int64_t H, torch::Tensor &state, torch::Tensor &r, torch::Tensor &k, torch::Tensor &v, torch::Tensor &w, torch::Tensor &u, torch::Tensor &y) {
+    const at::cuda::OptionalCUDAGuard device_guard(device_of(state));
+    cuda_forward_fp32(
+        B, T, C, H,
+        state.data_ptr<float>(),
+        r.data_ptr<fp32>(), k.data_ptr<fp32>(), v.data_ptr<fp32>(),
+        w.data_ptr<float>(),
+        u.data_ptr<fp32>(), y.data_ptr<fp32>());
+}
+// Expose the three entry points as functions of the Python extension module.
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+    m.def("forward_bf16", &forward_bf16, "rwkv5 forward_bf16");
+    m.def("forward_fp16", &forward_fp16, "rwkv5 forward_fp16");
+    m.def("forward_fp32", &forward_fp32, "rwkv5 forward_fp32");
+}
+// Also register the same functions as operators in the `rwkv5` library.
+TORCH_LIBRARY(rwkv5, m) {
+    m.def("forward_bf16", forward_bf16);
+    m.def("forward_fp16", forward_fp16);
+    m.def("forward_fp32", forward_fp32);
+}

cuda/rwkv6.cu ADDED Viewed

	@@ -0,0 +1,87 @@

+#include <stdio.h>
+#include <assert.h>
+#include "ATen/ATen.h"
+typedef at::BFloat16 bf16;
+typedef at::Half fp16;
+typedef float fp32;
+// RWKV-6 forward kernel. One block per (batch b, head h) pair, one thread per
+// channel i of the head. _N_ is not defined in this file (presumably supplied
+// as a compile-time macro by the build - verify); the float4 accesses below
+// step j by 4, so it must be a multiple of 4.
+//
+// Per time step, for every j:
+//   y    += r[j] * (u[j] * k[j] * v + state[j])
+//   state[j] = state[j] * w[j] + k[j] * v
+// u is a per-head vector loaded once; w is re-read from _w[t] at every time
+// step together with r and k.
+template <typename F>
+__global__ void kernel_forward(const int B, const int T, const int C, const int H, float *__restrict__ _state,
+                               const F *__restrict__ const _r, const F *__restrict__ const _k, const F *__restrict__ const _v, const float *__restrict__ _w, const F *__restrict__ _u,
+                               F *__restrict__ const _y)
+{
+    const int b = blockIdx.x / H;
+    const int h = blockIdx.x % H;
+    const int i = threadIdx.x;
+    _u += h*_N_;
+    // The state offset does not include b, so every batch entry reads and
+    // writes the same state rows.
+    _state += h*_N_*_N_ + i*_N_; // wrong if B > 1 !!!
+    __shared__ float r[_N_], k[_N_], u[_N_], w[_N_];
+    float state[_N_];
+    #pragma unroll
+    for (int j = 0; j < _N_; j++)
+        state[j] = _state[j];
+    __syncthreads();
+    u[i] = float(_u[i]);
+    __syncthreads();
+    // t walks this thread's element across the T time steps (stride C).
+    for (int t = b*T*C + h*_N_ + i; t < (b+1)*T*C + h*_N_ + i; t += C)
+    {
+        // First barrier: all threads are done reading w/r/k from the previous
+        // step. Second barrier: the new values are visible to all threads.
+        __syncthreads();
+        w[i] = _w[t];
+        r[i] = float(_r[t]);
+        k[i] = float(_k[t]);
+        __syncthreads();
+        const float v = float(_v[t]);
+        float y = 0;
+        #pragma unroll
+        for (int j = 0; j < _N_; j+=4)
+        {
+            const float4& r_ = (float4&)(r[j]);
+            const float4& k_ = (float4&)(k[j]);
+            const float4& w_ = (float4&)(w[j]);
+            const float4& u_ = (float4&)(u[j]);
+            float4& s = (float4&)(state[j]);
+            float4 x;
+            x.x = k_.x * v;
+            x.y = k_.y * v;
+            x.z = k_.z * v;
+            x.w = k_.w * v;
+            y += r_.x * (u_.x * x.x + s.x);
+            y += r_.y * (u_.y * x.y + s.y);
+            y += r_.z * (u_.z * x.z + s.z);
+            y += r_.w * (u_.w * x.w + s.w);
+            s.x = s.x * w_.x + x.x;
+            s.y = s.y * w_.y + x.y;
+            s.z = s.z * w_.z + x.z;
+            s.w = s.w * w_.w + x.w;
+        }
+        _y[t] = F(y);
+    }
+    #pragma unroll
+    for (int j = 0; j < _N_; j++)
+        _state[j] = state[j];
+}
+// Host launchers for the RWKV-6 kernel, one per element type. Each uses one
+// block per (batch, head) pair and _N_ threads per block.
+void cuda_forward_bf16(int B, int T, int C, int H, float *state, bf16 *r, bf16 *k, bf16 *v, float *w, bf16 *u, bf16 *y)
+{
+    assert(H*_N_ == C);
+    const dim3 grid(B * H);
+    const dim3 block(_N_);
+    kernel_forward<<<grid, block>>>(B, T, C, H, state, r, k, v, w, u, y);
+}
+void cuda_forward_fp16(int B, int T, int C, int H, float *state, fp16 *r, fp16 *k, fp16 *v, float *w, fp16 *u, fp16 *y)
+{
+    assert(H*_N_ == C);
+    const dim3 grid(B * H);
+    const dim3 block(_N_);
+    kernel_forward<<<grid, block>>>(B, T, C, H, state, r, k, v, w, u, y);
+}
+void cuda_forward_fp32(int B, int T, int C, int H, float *state, fp32 *r, fp32 *k, fp32 *v, float *w, fp32 *u, fp32 *y)
+{
+    assert(H*_N_ == C);
+    const dim3 grid(B * H);
+    const dim3 block(_N_);
+    kernel_forward<<<grid, block>>>(B, T, C, H, state, r, k, v, w, u, y);
+}

cuda/rwkv6_op.cpp ADDED Viewed

	@@ -0,0 +1,34 @@

+#include <torch/extension.h>
+#include "ATen/ATen.h"
+#include <c10/cuda/CUDAGuard.h>
+typedef at::BFloat16 bf16;
+typedef at::Half fp16;
+typedef float fp32;
+// Launchers implemented in the CUDA translation unit.
+void cuda_forward_bf16(int B, int T, int C, int H, float *state, bf16 *r, bf16 *k, bf16 *v, float *w, bf16 *u, bf16 *y);
+void cuda_forward_fp16(int B, int T, int C, int H, float *state, fp16 *r, fp16 *k, fp16 *v, float *w, fp16 *u, fp16 *y);
+void cuda_forward_fp32(int B, int T, int C, int H, float *state, fp32 *r, fp32 *k, fp32 *v, float *w, fp32 *u, fp32 *y);
+
+// Tensor-level entry points: select the device that owns `state`, then pass
+// the raw data pointers to the matching launcher. `state` and `w` are always
+// read as float; the other tensors use the element type named by the function.
+void forward_bf16(int64_t B, int64_t T, int64_t C, int64_t H, torch::Tensor &state, torch::Tensor &r, torch::Tensor &k, torch::Tensor &v, torch::Tensor &w, torch::Tensor &u, torch::Tensor &y) {
+    const at::cuda::OptionalCUDAGuard device_guard(device_of(state));
+    cuda_forward_bf16(
+        B, T, C, H,
+        state.data_ptr<float>(),
+        r.data_ptr<bf16>(), k.data_ptr<bf16>(), v.data_ptr<bf16>(),
+        w.data_ptr<float>(),
+        u.data_ptr<bf16>(), y.data_ptr<bf16>());
+}
+void forward_fp16(int64_t B, int64_t T, int64_t C, int64_t H, torch::Tensor &state, torch::Tensor &r, torch::Tensor &k, torch::Tensor &v, torch::Tensor &w, torch::Tensor &u, torch::Tensor &y) {
+    const at::cuda::OptionalCUDAGuard device_guard(device_of(state));
+    cuda_forward_fp16(
+        B, T, C, H,
+        state.data_ptr<float>(),
+        r.data_ptr<fp16>(), k.data_ptr<fp16>(), v.data_ptr<fp16>(),
+        w.data_ptr<float>(),
+        u.data_ptr<fp16>(), y.data_ptr<fp16>());
+}
+void forward_fp32(int64_t B, int64_t T, int64_t C, int64_t H, torch::Tensor &state, torch::Tensor &r, torch::Tensor &k, torch::Tensor &v, torch::Tensor &w, torch::Tensor &u, torch::Tensor &y) {
+    const at::cuda::OptionalCUDAGuard device_guard(device_of(state));
+    cuda_forward_fp32(
+        B, T, C, H,
+        state.data_ptr<float>(),
+        r.data_ptr<fp32>(), k.data_ptr<fp32>(), v.data_ptr<fp32>(),
+        w.data_ptr<float>(),
+        u.data_ptr<fp32>(), y.data_ptr<fp32>());
+}
+// Expose the three entry points as functions of the Python extension module.
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+    m.def("forward_bf16", &forward_bf16, "rwkv6 forward_bf16");
+    m.def("forward_fp16", &forward_fp16, "rwkv6 forward_fp16");
+    m.def("forward_fp32", &forward_fp32, "rwkv6 forward_fp32");
+}
+// Also register the same functions as operators in the `rwkv6` library.
+TORCH_LIBRARY(rwkv6, m) {
+    m.def("forward_bf16", forward_bf16);
+    m.def("forward_fp16", forward_fp16);
+    m.def("forward_fp32", forward_fp32);
+}

cuda/rwkv7.cu ADDED Viewed

	@@ -0,0 +1,77 @@

+#include <stdio.h>
+#include <assert.h>
+#include "ATen/ATen.h"
+typedef at::Half fp16;
+typedef at::BFloat16 bf16;
+typedef float fp32;
+// RWKV-7 forward kernel. One block per (batch e, head h) pair, one thread per
+// channel i of the head. _N_ is not defined in this file (presumably supplied
+// as a compile-time macro by the build - verify).
+//
+// Each thread keeps one _N_-element row of the state in a local array. Per
+// time step, with sa = sum_j a[j] * state[j] computed from the old state:
+//   state[j] = state[j] * w[j] + k[j] * v + sa * b[j]
+//   y       += state[j] * r[j]            (uses the updated state)
+// where w[j] = exp(-exp(_w[t])) is derived from the raw input.
+template <typename F>
+__global__ void kernel_forward(const int B, const int T, const int C, const int H,
+                               float *__restrict__ _state, const F *__restrict__ const _r, const F *__restrict__ const _w, const F *__restrict__ const _k, const F *__restrict__ const _v, const F *__restrict__ const _a, const F *__restrict__ const _b,
+                               F *__restrict__ const _y)
+{
+    const int e = blockIdx.x / H;
+    const int h = blockIdx.x % H;
+    const int i = threadIdx.x;
+    // The state offset does not include e, so every batch entry would read and
+    // write the same state rows; the host launchers assert B == 1.
+    _state += h*_N_*_N_ + i*_N_; // wrong if B > 1 !!!
+    float state[_N_];
+    #pragma unroll
+    for (int j = 0; j < _N_; j++)
+        state[j] = _state[j];
+    __shared__ float r[_N_], k[_N_], w[_N_], a[_N_], b[_N_];
+    for (int _t = 0; _t < T; _t++)
+    {
+        const int t = e*T*C + h*_N_ + i + _t * C;
+        // First barrier: all threads are done reading the shared vectors from
+        // the previous step. Second barrier: the new values are visible.
+        __syncthreads();
+        r[i] = float(_r[t]);
+        w[i] = __expf(-__expf(float(_w[t])));
+        k[i] = float(_k[t]);
+        a[i] = float(_a[t]);
+        b[i] = float(_b[t]);
+        __syncthreads();
+        float sa = 0;
+        #pragma unroll
+        for (int j = 0; j < _N_; j++)
+        {
+            sa += a[j] * state[j];
+        }
+        float vv = float(_v[t]);
+        float y = 0;
+        #pragma unroll
+        for (int j = 0; j < _N_; j++)
+        {
+            float& s = state[j];
+            s = s * w[j] + k[j] * vv + sa * b[j];
+            y += s * r[j];
+        }
+        _y[t] = F(y);
+    }
+    #pragma unroll
+    for (int j = 0; j < _N_; j++)
+        _state[j] = state[j];
+}
+// Host launchers for the RWKV-7 kernel, one per element type. Each uses one
+// block per (batch, head) pair and _N_ threads per block.
+void cuda_forward_bf16(int B, int T, int C, int H, float *state, bf16 *r, bf16* w, bf16 *k, bf16 *v, bf16 *a, bf16 *b, bf16 *y)
+{
+    assert(H*_N_ == C);
+    assert(B == 1); // only for B=1
+    const dim3 grid(B * H);
+    const dim3 block(_N_);
+    kernel_forward<<<grid, block>>>(B, T, C, H, state, r, w, k, v, a, b, y);
+}
+void cuda_forward_fp16(int B, int T, int C, int H, float *state, fp16 *r, fp16* w, fp16 *k, fp16 *v, fp16 *a, fp16 *b, fp16 *y)
+{
+    assert(H*_N_ == C);
+    assert(B == 1); // only for B=1
+    const dim3 grid(B * H);
+    const dim3 block(_N_);
+    kernel_forward<<<grid, block>>>(B, T, C, H, state, r, w, k, v, a, b, y);
+}
+void cuda_forward_fp32(int B, int T, int C, int H, float *state, fp32 *r, fp32* w, fp32 *k, fp32 *v, fp32 *a, fp32 *b, fp32 *y)
+{
+    assert(H*_N_ == C);
+    assert(B == 1); // only for B=1
+    const dim3 grid(B * H);
+    const dim3 block(_N_);
+    kernel_forward<<<grid, block>>>(B, T, C, H, state, r, w, k, v, a, b, y);
+}

cuda/rwkv7_op.cpp ADDED Viewed

	@@ -0,0 +1,26 @@

+#include <torch/extension.h>
+#include "ATen/ATen.h"
+#include <c10/cuda/CUDAGuard.h>
+typedef at::Half fp16;
+typedef at::BFloat16 bf16;
+typedef float fp32;
+// Launchers implemented in the CUDA translation unit.
+void cuda_forward_bf16(int B, int T, int C, int H, float *state, bf16 *r, bf16 *w, bf16 *k, bf16 *v, bf16 *a, bf16 *b, bf16 *y);
+void cuda_forward_fp16(int B, int T, int C, int H, float *state, fp16 *r, fp16 *w, fp16 *k, fp16 *v, fp16 *a, fp16 *b, fp16 *y);
+void cuda_forward_fp32(int B, int T, int C, int H, float *state, fp32 *r, fp32 *w, fp32 *k, fp32 *v, fp32 *a, fp32 *b, fp32 *y);
+
+// Tensor-level entry points: pass the raw data pointers to the matching
+// launcher. `state` is always read as float; the other tensors use the
+// element type named by the function.
+//
+// Each wrapper first makes the device that owns `state` current. Without the
+// guard the kernel is launched on whatever device happens to be current,
+// which is wrong whenever the tensors live on a different GPU.
+void forward_bf16(int64_t B, int64_t T, int64_t C, int64_t H, torch::Tensor &state, torch::Tensor &r, torch::Tensor &w, torch::Tensor &k, torch::Tensor &v, torch::Tensor &a, torch::Tensor &b, torch::Tensor &y) {
+    const at::cuda::OptionalCUDAGuard device_guard(device_of(state));
+    cuda_forward_bf16(B, T, C, H, state.data_ptr<float>(), r.data_ptr<bf16>(), w.data_ptr<bf16>(), k.data_ptr<bf16>(), v.data_ptr<bf16>(), a.data_ptr<bf16>(), b.data_ptr<bf16>(), y.data_ptr<bf16>());
+}
+void forward_fp16(int64_t B, int64_t T, int64_t C, int64_t H, torch::Tensor &state, torch::Tensor &r, torch::Tensor &w, torch::Tensor &k, torch::Tensor &v, torch::Tensor &a, torch::Tensor &b, torch::Tensor &y) {
+    const at::cuda::OptionalCUDAGuard device_guard(device_of(state));
+    cuda_forward_fp16(B, T, C, H, state.data_ptr<float>(), r.data_ptr<fp16>(), w.data_ptr<fp16>(), k.data_ptr<fp16>(), v.data_ptr<fp16>(), a.data_ptr<fp16>(), b.data_ptr<fp16>(), y.data_ptr<fp16>());
+}
+void forward_fp32(int64_t B, int64_t T, int64_t C, int64_t H, torch::Tensor &state, torch::Tensor &r, torch::Tensor &w, torch::Tensor &k, torch::Tensor &v, torch::Tensor &a, torch::Tensor &b, torch::Tensor &y) {
+    const at::cuda::OptionalCUDAGuard device_guard(device_of(state));
+    cuda_forward_fp32(B, T, C, H, state.data_ptr<float>(), r.data_ptr<fp32>(), w.data_ptr<fp32>(), k.data_ptr<fp32>(), v.data_ptr<fp32>(), a.data_ptr<fp32>(), b.data_ptr<fp32>(), y.data_ptr<fp32>());
+}
+// Register the three entry points as operators in the `wkv7s` library.
+// Unlike the rwkv5/rwkv6 op files, no PYBIND11_MODULE is defined here.
+TORCH_LIBRARY(wkv7s, m) {
+    m.def("forward_bf16", forward_bf16);
+    m.def("forward_fp16", forward_fp16);
+    m.def("forward_fp32", forward_fp32);
+}

cuda/wrapper.cpp ADDED Viewed

	@@ -0,0 +1,141 @@

+#include <torch/extension.h>
+#include "ATen/ATen.h"
+#include <iostream>
+#include <c10/cuda/CUDAGuard.h>
+typedef at::Half fp16;
+// Declarations of the templated launchers called below; their definitions are
+// not in this translation unit. The wrappers instantiate them for at::Half
+// and float.
+template <typename F>
+void cuda_wkv_forward(int B, int T, int C,
+                      float *w, float *u, F *k, F *v, F *y,
+                      float *aa, float *bb, float *pp);
+template <typename F>
+void cuda_mm8_seq(int B, int N, int M,
+                  F *x, int x_stride,
+                  uint8_t *w, int w_stride,
+                  F *mx, F *rx,
+                  F *my, F *ry,
+                  F *y, int y_stride);
+// Note: the output of cuda_mm8_one is float* regardless of F.
+template <typename F>
+void cuda_mm8_one(int N, int M,
+                  F *x,
+                  uint8_t *w, int w_stride,
+                  F *mx, F *rx,
+                  F *my, F *ry,
+                  float *y);
+// Tensor-level entry point for the WKV kernel. Dispatches on the dtype of `k`
+// (Half or Float); w, u, aa, bb and pp are always read as float.
+// Throws c10::Error for any other dtype.
+void wkv_forward(int64_t B, int64_t T, int64_t C,
+                 torch::Tensor &w, torch::Tensor &u,
+                 torch::Tensor &k, torch::Tensor &v, torch::Tensor &y,
+                 torch::Tensor &aa, torch::Tensor &bb, torch::Tensor &pp) {
+    const at::cuda::OptionalCUDAGuard device_guard(device_of(w));
+    switch (k.scalar_type()) {
+    case c10::ScalarType::Half:
+        cuda_wkv_forward(B, T, C,
+                         w.data_ptr<float>(), u.data_ptr<float>(),
+                         k.data_ptr<fp16>(), v.data_ptr<fp16>(), y.data_ptr<fp16>(),
+                         aa.data_ptr<float>(), bb.data_ptr<float>(), pp.data_ptr<float>());
+        break;
+    case c10::ScalarType::Float:
+        cuda_wkv_forward(B, T, C,
+                         w.data_ptr<float>(), u.data_ptr<float>(),
+                         k.data_ptr<float>(), v.data_ptr<float>(), y.data_ptr<float>(),
+                         aa.data_ptr<float>(), bb.data_ptr<float>(), pp.data_ptr<float>());
+        break;
+    default:
+        // TORCH_CHECK instead of assert(): assert() is compiled out under
+        // NDEBUG, which would turn an unsupported dtype into a silent no-op.
+        TORCH_CHECK(false, "Only FP16 and FP32 are currently supported");
+    }
+}
+// Tensor-level entry point for the int8-weight matrix product. Dispatches on
+// the dtype of `x` (Half or Float); `w` is read as uint8.
+//
+// The kernels index the innermost dimension with unit stride, so x, w and y
+// must have stride 1 along dim 1 and mx/rx/my/ry must have stride 1.
+// Throws c10::Error when a stride precondition fails or the dtype is
+// unsupported. TORCH_CHECK is used instead of assert() because assert() is
+// compiled out under NDEBUG.
+void mm8_seq(int64_t B, int64_t N, int64_t M,
+             torch::Tensor &x, torch::Tensor &w,
+             torch::Tensor &mx, torch::Tensor &rx,
+             torch::Tensor &my, torch::Tensor &ry,
+             torch::Tensor &y) {
+    TORCH_CHECK(x.stride(1) == 1, "mm8_seq: x must have stride 1 along dim 1");
+    TORCH_CHECK(w.stride(1) == 1, "mm8_seq: w must have stride 1 along dim 1");
+    TORCH_CHECK(mx.stride(0) == 1 && rx.stride(0) == 1,
+                "mm8_seq: mx and rx must have stride 1");
+    TORCH_CHECK(my.stride(0) == 1 && ry.stride(0) == 1,
+                "mm8_seq: my and ry must have stride 1");
+    TORCH_CHECK(y.stride(1) == 1, "mm8_seq: y must have stride 1 along dim 1");
+    const at::cuda::OptionalCUDAGuard device_guard(device_of(w));
+    switch (x.scalar_type()) {
+    case c10::ScalarType::Half:
+        cuda_mm8_seq(
+            B, N, M,
+            x.data_ptr<fp16>(), x.stride(0),
+            w.data_ptr<uint8_t>(), w.stride(0),
+            mx.data_ptr<fp16>(), rx.data_ptr<fp16>(),
+            my.data_ptr<fp16>(), ry.data_ptr<fp16>(),
+            y.data_ptr<fp16>(), y.stride(0));
+        break;
+    case c10::ScalarType::Float:
+        cuda_mm8_seq(
+            B, N, M,
+            x.data_ptr<float>(), x.stride(0),
+            w.data_ptr<uint8_t>(), w.stride(0),
+            mx.data_ptr<float>(), rx.data_ptr<float>(),
+            my.data_ptr<float>(), ry.data_ptr<float>(),
+            y.data_ptr<float>(), y.stride(0));
+        break;
+    default:
+        TORCH_CHECK(false, "Only FP16 and FP32 are currently supported");
+    }
+}
+// Tensor-level entry point for the int8-weight vector-matrix product.
+// Dispatches on the dtype of `x` (Half or Float); `w` is read as uint8 and
+// `y` is always read as float.
+//
+// x, y, mx, rx, my and ry must have stride 1; w must have stride 1 along
+// dim 1. Throws c10::Error when a stride precondition fails or the dtype is
+// unsupported. TORCH_CHECK is used instead of assert() because assert() is
+// compiled out under NDEBUG.
+void mm8_one(int64_t N, int64_t M,
+             torch::Tensor &x, torch::Tensor &w,
+             torch::Tensor &mx, torch::Tensor &rx,
+             torch::Tensor &my, torch::Tensor &ry,
+             torch::Tensor &y) {
+    TORCH_CHECK(x.stride(0) == 1, "mm8_one: x must have stride 1");
+    TORCH_CHECK(w.stride(1) == 1, "mm8_one: w must have stride 1 along dim 1");
+    TORCH_CHECK(mx.stride(0) == 1 && rx.stride(0) == 1,
+                "mm8_one: mx and rx must have stride 1");
+    TORCH_CHECK(my.stride(0) == 1 && ry.stride(0) == 1,
+                "mm8_one: my and ry must have stride 1");
+    TORCH_CHECK(y.stride(0) == 1, "mm8_one: y must have stride 1");
+    const at::cuda::OptionalCUDAGuard device_guard(device_of(w));
+    switch (x.scalar_type()) {
+    case c10::ScalarType::Half:
+        cuda_mm8_one(
+            N, M,
+            x.data_ptr<fp16>(),
+            w.data_ptr<uint8_t>(), w.stride(0),
+            mx.data_ptr<fp16>(), rx.data_ptr<fp16>(),
+            my.data_ptr<fp16>(), ry.data_ptr<fp16>(),
+            y.data_ptr<float>());
+        break;
+    case c10::ScalarType::Float:
+        cuda_mm8_one(
+            N, M,
+            x.data_ptr<float>(),
+            w.data_ptr<uint8_t>(), w.stride(0),
+            mx.data_ptr<float>(), rx.data_ptr<float>(),
+            my.data_ptr<float>(), ry.data_ptr<float>(),
+            y.data_ptr<float>());
+        break;
+    default:
+        TORCH_CHECK(false, "Only FP16 and FP32 are currently supported");
+    }
+}
+using torch::Tensor;
+#ifndef DISABLE_CUBLAS_GEMM
+void gemm_fp16_cublas(Tensor a, Tensor b, Tensor c);
+#endif
+// Python bindings: exposes the host wrappers as functions of the extension module whose
+// name is supplied by TORCH_EXTENSION_NAME. The second string of each def is the Python
+// docstring of the bound function.
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+    m.def("wkv_forward", &wkv_forward, "wkv forward");
+    m.def("mm8_seq", &mm8_seq, "mm8 seq");
+    m.def("mm8_one", &mm8_one, "mm8 one");
+#ifndef DISABLE_CUBLAS_GEMM
+    // Only bound when the cuBLAS GEMM translation unit is part of the build.
+    // NOTE(review): the docstring says "gemv" while the function is gemm_fp16_cublas -- confirm which is intended.
+    m.def("gemm_fp16_cublas", &gemm_fp16_cublas, "gemv fp16 cublas");
+#endif
+}
+// Registers the same host wrappers as operators in the `rwkv` operator library, in
+// addition to the pybind11 bindings.
+TORCH_LIBRARY(rwkv, m) {
+    m.def("wkv_forward", wkv_forward);
+    m.def("mm8_seq", mm8_seq);
+    m.def("mm8_one", mm8_one);
+#ifndef DISABLE_CUBLAS_GEMM
+    // Only registered when the cuBLAS GEMM translation unit is part of the build.
+    m.def("gemm_fp16_cublas", gemm_fp16_cublas);
+#endif
+}

download_models.py ADDED Viewed

	@@ -0,0 +1,62 @@

+#!/usr/bin/env python3
+"""
+Download model weights listed in a config YAML (replicates Dockerfile download behavior without Docker).
+Usage:
+    python download_models.py --config config.production.yaml
+This script uses huggingface_hub.hf_hub_download to download specified .pth files to the
+model's DOWNLOAD_MODEL_DIR (or ./models by default).
+"""
+import argparse
+import os
+import yaml
+import time
+from huggingface_hub import hf_hub_download
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--config", default="config.production.yaml")
+    parser.add_argument("--token", default=None, help="Hugging Face token (optional)")
+    args = parser.parse_args()
+    with open(args.config, "r", encoding="utf-8") as f:
+        cfg = yaml.safe_load(f.read())
+    models = cfg.get("MODELS", [])
+    if len(models) == 0:
+        print("No models found in config. Nothing to download.")
+        return
+    for m in models:
+        repo_id = m.get("DOWNLOAD_MODEL_REPO_ID")
+        filename = m.get("DOWNLOAD_MODEL_FILE_NAME")
+        local_dir = m.get("DOWNLOAD_MODEL_DIR", "./models")
+        if repo_id is None or filename is None:
+            print(f"Skipping model with incomplete download info: {m}")
+            continue
+        os.makedirs(local_dir, exist_ok=True)
+        print(f"Downloading {filename} from repo {repo_id} into {local_dir} ...")
+        os.makedirs(local_dir, exist_ok=True)
+        # Add retry logic
+        max_attempts = 5
+        for attempt in range(1, max_attempts + 1):
+            try:
+                path = hf_hub_download(repo_id=repo_id, filename=filename, local_dir=local_dir, token=args.token)
+                print(f"Downloaded file to {path}")
+                break
+            except Exception as e:
+                print(f"Attempt {attempt} failed to download {filename} from {repo_id}: {e}")
+                if attempt < max_attempts:
+                    print(f"Retrying in {attempt*5} seconds...")
+                    time.sleep(attempt * 5)
+                else:
+                    print(f"Failed after {max_attempts} attempts. Skipping {filename}.")
+if __name__ == "__main__":
+    main()

models/.cache/huggingface/.gitignore ADDED Viewed

	@@ -0,0 +1 @@


1	+ *

models/.cache/huggingface/download/rwkv7-g1a-0.1b-20250728-ctx4096.pth.metadata ADDED Viewed

	@@ -0,0 +1,3 @@

+8c8cdf8c605dc7dfdccb676b9d0c482ba002f710
+964f01cc4673273bbcf1b9c3cdc243d58af97bffeab51cb20c752eeaf048a3c6
+1763947179.4323187

models/rwkv7-g1a-0.1b-20250728-ctx4096.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:964f01cc4673273bbcf1b9c3cdc243d58af97bffeab51cb20c752eeaf048a3c6
+size 382223868

pyproject.toml ADDED Viewed

	@@ -0,0 +1,45 @@

+[project]
+name = "rwkv-hf-space"
+version = "0.1.0"
+description = "Add your description here"
+readme = "README.md"
+requires-python = ">=3.10"
+dependencies = [
+    "fastapi[standard]>=0.115.11",
+    "huggingface-hub>=0.29.1",
+    "loguru>=0.7.3",
+    "ninja>=1.11.1.3",
+    "numpy>=2.2.3",
+    "pydantic>=2.10.6",
+    "pydantic-settings>=2.8.1",
+    "pynvml>=12.0.0",
+    "rich>=13.9.4",
+    "rwkv>=0.8.30",
+    "setuptools>=75.8.2",
+    "snowflake-id>=1.0.2",
+    "modelscope>=1.23.0",
+    "transformers",
+]
+[project.optional-dependencies]
+cpu = ["torch>=2.6.0"]
+cu124 = ["torch>=2.6.0"]
+[tool.uv]
+conflicts = [[{ extra = "cpu" }, { extra = "cu124" }, { extra = "cu113" }]]
+[tool.uv.sources]
+torch = [
+    { index = "pytorch-cpu", extra = "cpu" },
+    { index = "pytorch-cu124", extra = "cu124" },
+]
+[[tool.uv.index]]
+name = "pytorch-cpu"
+url = "https://download.pytorch.org/whl/cpu"
+explicit = true
+[[tool.uv.index]]
+name = "pytorch-cu124"
+url = "https://download.pytorch.org/whl/cu124"
+explicit = true

run_windows.ps1 ADDED Viewed

	@@ -0,0 +1,14 @@

+# Launches the app from the local virtual environment.
+#   -CONFIG_FILE  config YAML exported to the app through the CONFIG_FILE environment variable
+Param(
+    [string]$CONFIG_FILE = 'config.production.yaml'
+)
+# Refuse to start when the virtualenv created by setup_windows.ps1 is missing.
+if (-not (Test-Path .\.venv\Scripts\Activate.ps1)) {
+    Write-Host "Virtualenv not found. Run setup_windows.ps1 first." -ForegroundColor Red
+    exit 1
+}
+.\.venv\Scripts\Activate.ps1
+# Hand the chosen config file to the Python process via its environment.
+$env:CONFIG_FILE=$CONFIG_FILE
+Write-Host "Starting the RWKV FastAPI app using $CONFIG_FILE..." -ForegroundColor Green
+python app.py

setup_windows.ps1 ADDED Viewed

	@@ -0,0 +1,82 @@

+# Local Windows setup: creates ./.venv, installs dependencies, downloads the models listed
+# in the config, copies the config to config.local.yaml and optionally builds the frontend.
+#   -gpu            install the cu124 extra instead of the cpu extra
+#   -CONFIG_FILE    config YAML used for the model download and copied to config.local.yaml
+#   -buildFrontend  clone and build the web frontend into ./dist-frontend
+Param(
+    [switch]$gpu,
+    [string]$CONFIG_FILE = 'config.production.yaml',
+    [switch]$buildFrontend
+)
+Write-Host "Starting RWKV local setup for Windows..." -ForegroundColor Green
+if (-not (Get-Command python -ErrorAction SilentlyContinue)) {
+    Write-Host "Python not found. Please install Python 3.10+ and add it to PATH." -ForegroundColor Red
+    exit 1
+}
+Write-Host "Creating virtual environment (./.venv) ..."
+python -m venv .venv
+.\.venv\Scripts\Activate.ps1
+pip install --upgrade pip setuptools wheel
+if ($gpu) {
+    Write-Host "GPU support requested. Installing GPU dependencies (cu124) ..." -ForegroundColor Yellow
+    pip install -e .[cu124]
+} else {
+    Write-Host "Installing CPU-only dependencies ..." -ForegroundColor Yellow
+    pip install -e .[cpu]
+}
+Write-Host "Installing extra tooling (huggingface_hub for downloads) ..."
+pip install huggingface-hub
+pip install beautifulsoup4
+Write-Host "Downloading models from config: $CONFIG_FILE" -ForegroundColor Green
+# Only pass --token when HF_TOKEN is set: an empty variable would leave --token without a
+# value and make argparse abort the download script.
+$downloadArgs = @('--config', $CONFIG_FILE)
+if ($env:HF_TOKEN) {
+    $downloadArgs += @('--token', $env:HF_TOKEN)
+}
+python .\download_models.py @downloadArgs
+# Ensure the config file is available as config.local.yaml that the app's default reads
+if (Test-Path $CONFIG_FILE) {
+    Write-Host "Copying $CONFIG_FILE to config.local.yaml so the application uses it by default..." -ForegroundColor Green
+    Copy-Item -Force $CONFIG_FILE config.local.yaml
+    # If GPU not requested, set STRATEGY to CPU in config.local.yaml
+    if (-not $gpu) {
+        Write-Host "GPU not requested. Setting STRATEGY to 'cpu fp16' in config.local.yaml..." -ForegroundColor Yellow
+        try {
+            $yaml = (Get-Content config.local.yaml -Raw)
+            # Replace the STRATEGY line with CPU + fp16 precision
+            $yaml = $yaml -replace '(?m)^STRATEGY:.*$', 'STRATEGY: "cpu fp16"'
+            $yaml | Out-File -Encoding utf8 config.local.yaml -Force
+        } catch {
+            Write-Host "Warning: failed to modify config.local.yaml; please set STRATEGY manually" -ForegroundColor Yellow
+        }
+    }
+}
+if ($buildFrontend) {
+    if (-not (Get-Command pnpm -ErrorAction SilentlyContinue)) {
+        Write-Host "pnpm not found. Installing pnpm globally via npm..." -ForegroundColor Yellow
+        npm install -g pnpm
+    }
+    if (-not (Test-Path .\web-frontend)) {
+        Write-Host "Cloning web frontend repo..."
+        git clone https://github.com/SolomonLeon/web-rwkv-realweb.git web-frontend
+    }
+    Push-Location web-frontend
+    pnpm install
+    # The build mode depends on whether we run inside a ModelScope studio.
+    if ($env:MODELSCOPE_ENVIRONMENT -eq "studio") {
+        pnpm run build --mode target-rwkv-modelscope-space
+    } else {
+        pnpm run build --mode target-rwkv-hf-space
+    }
+    Pop-Location
+    # Copy dist to the project's dist-frontend
+    if (Test-Path .\web-frontend\dist) {
+        Remove-Item -Recurse -Force .\dist-frontend -ErrorAction SilentlyContinue
+        Copy-Item -Recurse .\web-frontend\dist .\dist-frontend
+        Write-Host "Frontend built and copied to ./dist-frontend" -ForegroundColor Green
+    }
+}
+# PowerShell escapes: `n is a newline and `$ keeps "$env:CONFIG_FILE" literal so the hint is
+# printed as a command instead of being expanded to the variable's current value.
+Write-Host "Setup complete. Run the app with: `n`$env:CONFIG_FILE='$CONFIG_FILE'`npython app.py" -ForegroundColor Cyan

tests/api_test.py ADDED Viewed

	@@ -0,0 +1,85 @@

+import requests, json, time
+BASE = 'http://127.0.0.1:7860/api/v1/chat/completions'
+headers = {'Content-Type': 'application/json'}
+print('Non-streaming example')
+payload = {
+    'model': 'rwkv-latest',
+    'prompt': 'Who is the president of France today?',
+    'stream': False,
+    'max_tokens': 64,
+    'temperature': 0.2,
+    'include_usage': True,
+}
+try:
+    r = requests.post(BASE, json=payload, timeout=120)
+    print('Status', r.status_code)
+    try:
+        print(json.dumps(r.json(), indent=2))
+    except Exception:
+        print('Non-JSON response:', r.text[:1000])
+except Exception as e:
+    print('Error in non-stream request:', e)
+print('\nTools: calc example')
+payload = {
+    'model': 'rwkv-latest',
+    'prompt': 'Calculate 2+3*4 and explain the result.',
+    'stream': False,
+    'tools': [{'name': 'calc', 'args': {'expression': '2+3*4'}}],
+    'include_usage': True,
+}
+try:
+    r = requests.post(BASE, json=payload, timeout=120)
+    print('Status', r.status_code)
+    try:
+        print(json.dumps(r.json(), indent=2))
+    except Exception:
+        print('Non-JSON response:', r.text[:1000])
+except Exception as e:
+    print('Error in calc tool request:', e)
+print('\nTools: web_search example')
+payload = {
+    'model': 'rwkv-latest',
+    'prompt': 'Who is the current president of France?',
+    'stream': False,
+    'web_search': True,
+    'search_top_k': 2,
+    'include_usage': True,
+}
+try:
+    r = requests.post(BASE, json=payload, timeout=120)
+    print('Status', r.status_code)
+    try:
+        print(json.dumps(r.json(), indent=2))
+    except Exception:
+        print('Non-JSON response:', r.text[:1000])
+except Exception as e:
+    print('Error in web_search request:', e)
+print('\nStreaming example (short)')
+payload = {
+    'model': 'rwkv-latest:thinking',
+    'messages': [{'role': 'user', 'content': 'Explain Newton\'s first law in one sentence.'}],
+    'stream': True,
+    'max_tokens': 64,
+    'temperature': 0.2,
+}
+try:
+    r = requests.post(BASE, json=payload, headers=headers, stream=True, timeout=120)
+    print('Status', r.status_code)
+    if r.status_code == 200:
+        for line in r.iter_lines(decode_unicode=True):
+            if not line:
+                continue
+            print('SSE:', line)
+            if line.strip().endswith('[DONE]'):
+                break
+except Exception as e:
+    print('Error in streaming request:', e)
+print('\nDone tests')

tests/run_local_exec.py ADDED Viewed

	@@ -0,0 +1,14 @@

+import asyncio
+import os
+os.environ['MODELSCOPE_ENVIRONMENT'] = ''
+from app import chat_completions, ChatCompletionRequest
+async def run_once():
+    req = ChatCompletionRequest(model='rwkv-latest', prompt='Who is the president of France today?', stream=False, max_tokens=32, temperature=0.2, include_usage=True)
+    res = await chat_completions(req)
+    print(res)
+if __name__ == '__main__':
+    asyncio.run(run_once())

utils.py ADDED Viewed

	@@ -0,0 +1,177 @@

+import re, os, threading, queue, requests
+from typing import List, Optional, Union
+from pydantic import BaseModel, Field
+from pydantic_settings import BaseSettings
+from api_types import ChatMessage
+def parse_think_response(full_response: str):
+    think_start = full_response.find("<think")
+    if think_start == -1:
+        return None, full_response.strip()
+    think_end = full_response.find("</think>")
+    if think_end == -1:  # 未闭合的情况
+        reasoning = full_response[think_start:].strip()
+        content = ""
+    else:
+        reasoning = full_response[think_start : think_end + 9].strip()  # +9包含完整标签
+        content = full_response[think_end + 9 :].strip()
+    # 清理标签保留内容
+    reasoning_content = reasoning.replace("<think", "").replace("</think>", "").strip()
+    return reasoning_content, content
+def cleanMessages(messages: List[ChatMessage], removeThinkingContent: bool = False):
+    promptStrList = []
+    for message in messages:
+        content = message.content.strip()
+        content = re.sub(r"\n+", "\n", content)
+        promptStrList.append(
+            f"{message.role.strip().lower().capitalize()}: {content if message.role.strip().lower().capitalize()!='Assistant' or not removeThinkingContent else remove_nested_think_tags_stack(content)}"
+        )
+    return "\n\n".join(promptStrList)
+def remove_nested_think_tags_stack(text):
+    stack = []
+    result = ""
+    i = 0
+    while i < len(text):
+        if text[i : i + 7] == "<think>":
+            stack.append("<think>")
+            i += 7
+        elif text[i : i + 8] == "</think>":
+            if stack and stack[-1] == "<think>":
+                stack.pop()
+                i += 8
+            else:
+                result += text[i : i + 8]
+                i += 8
+        elif not stack:
+            result += text[i]
+            i += 1
+        else:
+            i += 1
+    return result
+def format_bytes(size):
+    power = 2**10
+    n = 0
+    power_labels = {0: "", 1: "K", 2: "M", 3: "G", 4: "T"}
+    while size > power:
+        size /= power
+        n += 1
+    return f"{size:.4f}{power_labels[n]+'B'}"
+LOGGER_QUEUE = queue.Queue(5)
+def logger():
+    print("enable")
+    while True:
+        item = LOGGER_QUEUE.get()
+        try:
+            requests.post(
+                os.environ.get("LOG_PORT"),
+                headers={"Content-Type": "application/json"},
+                json=item,
+            )
+        except Exception:
+            pass
+if os.environ.get("LOG_PORT"):
+    threading.Thread(target=logger).start()
+def log(item):
+    LOGGER_QUEUE.put_nowait(item)
+def web_search(query: str, top_k: int = 3) -> str:
+    """Perform a simple web search via DuckDuckGo HTML and return top_k results as a combined string.
+    This is a lightweight fallback search that does not call external model services —
+    it queries a public search endpoint, parses titles/snippets/urls and returns them as
+    formatted text to be included into the model's prompt context.
+    """
+    if not query or query.strip() == "":
+        return ""
+    try:
+        from bs4 import BeautifulSoup
+    except Exception:
+        return ""
+    try:
+        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
+        q = query.strip()
+        resp = requests.get("https://duckduckgo.com/html/", params={"q": q}, headers=headers, timeout=10)
+        soup = BeautifulSoup(resp.text, "html.parser")
+        # DuckDuckGo's html structure: results are in `div.result` containers.
+        results = []
+        for r in soup.find_all("div", class_="result", limit=top_k):
+            a = r.find("a", class_="result__a") or r.find("a", href=True)
+            title = a.get_text(strip=True) if a else ""
+            href = a.get("href") if a else ""
+            snippet = ""
+            s = r.find("a", class_="result__snippet") or r.find("div", class_="result__snippet")
+            if s:
+                snippet = s.get_text(strip=True)
+            results.append(f"{title} - {snippet} - {href}")
+        return "\n".join(results)
+    except Exception:
+        return ""
+def calc(expr: str) -> str:
+    """Safely evaluate a simple arithmetic expression and return the result as string.
+    This uses ast parsing to disallow attributes and only permit arithmetic operators.
+    """
+    try:
+        import ast, operator as op
+        # supported operators
+        allowed_ops = {
+            ast.Add: op.add,
+            ast.Sub: op.sub,
+            ast.Mult: op.mul,
+            ast.Div: op.truediv,
+            ast.Pow: op.pow,
+            ast.BitXor: op.xor,
+            ast.USub: op.neg,
+            ast.Mod: op.mod,
+            ast.FloorDiv: op.floordiv,
+        }
+        def _eval(node):
+            if isinstance(node, ast.Num):  # <number>
+                return node.n
+            elif isinstance(node, ast.BinOp):
+                left = _eval(node.left)
+                right = _eval(node.right)
+                op_type = type(node.op)
+                if op_type in allowed_ops:
+                    return allowed_ops[op_type](left, right)
+                else:
+                    raise ValueError("Unsupported operator")
+            elif isinstance(node, ast.UnaryOp):
+                operand = _eval(node.operand)
+                op_type = type(node.op)
+                if op_type in allowed_ops:
+                    return allowed_ops[op_type](operand)
+                raise ValueError("Unsupported unary op")
+            else:
+                raise ValueError("Unsupported expression type")
+        node = ast.parse(expr, mode='eval')
+        result = _eval(node.body)
+        return str(result)
+    except Exception as e:
+        return f"ERROR: {e}"

uv.lock ADDED Viewed

The diff for this file is too large to render. See raw diff

verify_setup.py ADDED Viewed

	@@ -0,0 +1,54 @@

+"""
+Simple local verification script to ensure the environment is prepared and the model is downloaded.
+"""
+import os
+import sys
+def check_venv_python():
+    if sys.prefix == sys.base_prefix:
+        print("Not in a virtual environment; consider activating .venv")
+    else:
+        print(f"Virtualenv detected: {sys.prefix}")
+def check_models_dir():
+    models_dir = "models"
+    if not os.path.exists(models_dir):
+        print("Models directory not found. Run download_models.py first.")
+        return False
+    files = [f for f in os.listdir(models_dir) if f.endswith('.pth')]
+    if not files:
+        print("No .pth files found in ./models. Run download_models.py to fetch model weights.")
+        return False
+    print(f"Found model files: {files}")
+    return True
+def check_dependencies():
+    try:
+        import importlib
+        packages = [
+            'fastapi', 'uvicorn', 'rwkv', 'huggingface_hub', 'pydantic', 'loguru'
+        ]
+        missing = []
+        for p in packages:
+            if importlib.util.find_spec(p) is None:
+                missing.append(p)
+        if missing:
+            print(f"Missing packages: {missing}; install them in your venv")
+            return False
+        print("All key dependencies found.")
+        return True
+    except Exception as e:
+        print(f"Dependency check failed: {e}")
+        return False
+def main():
+    check_venv_python()
+    deps_ok = check_dependencies()
+    models_ok = check_models_dir()
+    if deps_ok and models_ok:
+        print("Environment appears configured. You can run: python app.py")
+    else:
+        print("Fix missing items and re-run verification.")
+if __name__ == '__main__':
+    main()