[{"id":"token_saver","name":"Token Saver","meta":{"description":"Inlet filter that middle-truncates already consumed tool results before they are sent to the LLM, saving tokens and cost.","type":"filter","manifest":{"title":"Token Saver Filter","author":"Filip Pytloun","version":"4.1.0","license":"MIT","description":"Inlet filter that middle-truncates consumed tool results before they are sent to the LLM, saving tokens and cost."}},"content":"\"\"\"\ntitle: Token Saver Filter\nauthor: Filip Pytloun\nversion: 4.1.0\nlicense: MIT\ndescription: Inlet filter that middle-truncates consumed tool results before they are sent to the LLM, saving tokens and cost.\n\nA tool result is \"consumed\" when the model has already processed it\nand written a response.  Unconsumed results (the model hasn't\nresponded yet) are never touched — the model needs the full output.\nThis means large tool outputs cost tokens only once, on the turn\nthey are first returned.\n\nHandles both Open WebUI tool result formats:\n- Native tool calling: role=\"tool\" messages\n- Legacy/details mode: <details result=\"...\"> in assistant messages\n\nNever truncates assistant text, user, system, or developer messages.\n\"\"\"\n\nimport logging\nimport re\nfrom typing import Callable, Optional\n\nfrom pydantic import BaseModel, Field\n\n_log = logging.getLogger(__name__)\n\n# Regex to match <details ... result=\"...\"> attributes.\n_DETAILS_RESULT_RE = re.compile(\n    r'(<details\\b[^>]*?\\bresult=\")' r\"((?:&quot;|[^\\\"])*?)\" r'(\")',\n    re.DOTALL,\n)\n\n# Regex to read the tool name from a <details ... name=\"...\"> opening tag.\n_DETAILS_NAME_RE = re.compile(r'\\sname=\"([^\"]*)\"')\n\n\nclass Filter:\n    class Valves(BaseModel):\n        priority: int = Field(\n            default=5,\n            description=(\n                \"Filter priority (lower = runs first). Set higher than other filters like mnemory.\"\n            ),\n        )\n        enabled: bool = Field(\n            default=True,\n            description=\"Enable or disable the token saver filter.\",\n        )\n        max_tool_result_chars: int = Field(\n            default=2000,\n            description=(\n                \"Max chars for consumed tool results (both native \"\n                \"role='tool' messages and <details result=...> \"\n                \"attributes). The model already processed and \"\n                \"summarized these — the truncated version is just \"\n                \"a reminder of what was returned.\"\n            ),\n        )\n        head_ratio: float = Field(\n            default=0.5,\n            description=\"Head/tail split ratio. 0.5 = equal head and tail.\",\n        )\n        show_status: bool = Field(\n            default=True,\n            description=\"Show truncation summary as a chat status message.\",\n        )\n        exclude_tools: str = Field(\n            default=\"mnemory_initialize_memory,mnemory_get_core_memories\",\n            description=(\n                \"Comma-separated list of tool names whose results \"\n                \"should never be truncated. These tools provide \"\n                \"context that must remain intact across the entire \"\n                \"conversation.\"\n            ),\n        )\n        debug: bool = Field(\n            default=False,\n            description=\"Show detailed per-message debug info in chat status.\",\n        )\n\n    class UserValves(BaseModel):\n        enabled: bool = Field(\n            default=True,\n            description=\"Enable token saver for this user.\",\n        )\n\n    def __init__(self):\n        self.valves = self.Valves()\n\n    # ------------------------------------------------------------------\n    # Helpers\n    # ------------------------------------------------------------------\n\n    @staticmethod\n    def _middle_truncate(\n        text: str, max_chars: int, head_ratio: float = 0.5\n    ) -> tuple[str, bool]:\n        \"\"\"Truncate text preserving head and tail, removing the middle.\n\n        Args:\n            text: String to shorten.\n            max_chars: Upper bound on the returned length; negative\n                values are treated as 0.\n            head_ratio: Fraction of the kept characters taken from the\n                start of the text; clamped to the range 0.0-1.0.\n\n        Returns:\n            (result, was_truncated).  The result never exceeds\n            max_chars characters.\n        \"\"\"\n        max_chars = max(max_chars, 0)\n        if len(text) <= max_chars:\n            return text, False\n\n        original_len = len(text)\n        marker = f\"\\n... [trimmed: {original_len:,} -> {max_chars:,} chars] ...\\n\"\n        available = max_chars - len(marker)\n        if available < 100:\n            # Too little room for head + marker + tail; keep only the head.\n            return text[:max_chars], True\n\n        ratio = min(max(head_ratio, 0.0), 1.0)\n        head_size = int(available * ratio)\n        tail_size = available - head_size\n        # text[-0:] is the whole string, so an empty tail needs its own case.\n        tail = text[-tail_size:] if tail_size > 0 else \"\"\n        return text[:head_size] + marker + tail, True\n\n    def _truncate_details_results(\n        self,\n        content: str,\n        max_result: int,\n        head_ratio: float,\n        excluded_tools: Optional[set] = None,\n    ) -> tuple[str, int]:\n        \"\"\"Find <details result=\"...\"> attributes and truncate large ones.\n\n        Args:\n            content: Assistant message text that may embed tool results.\n            max_result: Max chars allowed per result attribute value.\n            head_ratio: Head/tail split passed to _middle_truncate.\n            excluded_tools: Tool names whose results are left intact.  The\n                name is read from a name=\"...\" attribute that precedes\n                result in the same tag; tags without one are never excluded.\n\n        Returns:\n            (new_content, chars_saved).\n        \"\"\"\n        saved = 0\n        excluded = excluded_tools or set()\n\n        def _replace(m: re.Match) -> str:\n            nonlocal saved\n            prefix, value, suffix = m.group(1), m.group(2), m.group(3)\n            if len(value) <= max_result:\n                return m.group(0)\n            if excluded:\n                name_match = _DETAILS_NAME_RE.search(prefix)\n                if name_match and name_match.group(1) in excluded:\n                    return m.group(0)\n            truncated, was = self._middle_truncate(value, max_result, head_ratio)\n            if was:\n                saved += len(value) - len(truncated)\n                return prefix + truncated + suffix\n            return m.group(0)\n\n        new_content = _DETAILS_RESULT_RE.sub(_replace, content)\n        return new_content, saved\n\n    @staticmethod\n    def _tool_names_by_call_id(messages: list[dict]) -> dict:\n        \"\"\"Map tool call ids to function names found in assistant tool_calls.\"\"\"\n        names = {}\n        for msg in messages:\n            if msg.get(\"role\") != \"assistant\":\n                continue\n            for call in msg.get(\"tool_calls\") or []:\n                if not isinstance(call, dict):\n                    continue\n                call_id = call.get(\"id\")\n                function = call.get(\"function\")\n                name = function.get(\"name\") if isinstance(function, dict) else None\n                if isinstance(call_id, str) and name:\n                    names[call_id] = name\n        return names\n\n    @staticmethod\n    def _find_consumed_boundary(messages: list[dict]) -> int:\n        \"\"\"Find the index before which all tool results are consumed.\n\n        Walk backwards through messages.  A tool result is \"consumed\"\n        if there is an assistant message with non-empty content after\n        it — meaning the model has already seen and responded to it.\n\n        Returns the index of the latest unconsumed boundary: all tool\n        results at indices < boundary are consumed and eligible for\n        truncation.  Tool results at indices >= boundary have not been\n        processed by the model yet and must be left untouched.\n        \"\"\"\n        # Find the last assistant message with actual content (not just\n        # an empty message with tool_calls).\n        last_response_idx = -1\n        for i in range(len(messages) - 1, -1, -1):\n            msg = messages[i]\n            if msg.get(\"role\") != \"assistant\":\n                continue\n            content = msg.get(\"content\", \"\")\n            if isinstance(content, str) and content.strip():\n                last_response_idx = i\n                break\n\n        # Everything before the last substantive assistant response\n        # has been consumed.  If no assistant response exists, nothing\n        # is consumed (boundary = 0, nothing gets truncated).\n        return last_response_idx if last_response_idx > 0 else 0\n\n    async def _status(\n        self, emitter: Callable | None, msg: str, done: bool = True\n    ) -> None:\n        \"\"\"Emit a chat status event if an emitter was supplied.\"\"\"\n        if not emitter:\n            return\n        await emitter({\"type\": \"status\", \"data\": {\"description\": msg, \"done\": done}})\n\n    # ------------------------------------------------------------------\n    # Inlet\n    # ------------------------------------------------------------------\n\n    async def inlet(\n        self,\n        body: dict,\n        __user__: Optional[dict] = None,\n        __event_emitter__: Optional[Callable] = None,\n    ) -> dict:\n        \"\"\"Trim consumed tool results in body[\"messages\"] before the LLM call.\n\n        Returns the same body dict; trimmed messages are replaced in its\n        message list and everything else is left untouched.\n        \"\"\"\n        if not self.valves.enabled:\n            return body\n\n        user_valves = __user__.get(\"valves\") if __user__ else None\n        if user_valves and hasattr(user_valves, \"enabled\") and not user_valves.enabled:\n            return body\n\n        messages = body.get(\"messages\", [])\n        if not messages:\n            return body\n\n        max_chars = self.valves.max_tool_result_chars\n        head_ratio = self.valves.head_ratio\n        excluded_tools = {\n            t.strip() for t in self.valves.exclude_tools.split(\",\") if t.strip()\n        }\n        boundary = self._find_consumed_boundary(messages)\n\n        if boundary <= 0:\n            if self.valves.debug:\n                await self._status(\n                    __event_emitter__,\n                    \"[TokenSaver] No consumed tool results found\",\n                )\n            return body\n\n        total_chars_saved = 0\n        truncated_count = 0\n        # Only needed to resolve tool names for the exclusion list.\n        call_names = self._tool_names_by_call_id(messages) if excluded_tools else {}\n\n        for i in range(boundary):\n            msg = messages[i]\n            role = msg.get(\"role\", \"\")\n            content = msg.get(\"content\", \"\")\n            if not isinstance(content, str):\n                continue\n\n            original_len = len(content)\n            msg_saved = 0\n\n            if role == \"tool\":\n                # Tool messages may carry only tool_call_id, so fall back\n                # to the function name recorded on the assistant's call.\n                tool_name = msg.get(\"name\") or \"\"\n                if not tool_name:\n                    call_id = msg.get(\"tool_call_id\")\n                    if isinstance(call_id, str):\n                        tool_name = call_names.get(call_id, \"\")\n                if tool_name in excluded_tools:\n                    if self.valves.debug:\n                        _log.debug(\"token_saver: skipping excluded tool %s\", tool_name)\n                    continue\n                if len(content) > max_chars:\n                    content, was = self._middle_truncate(content, max_chars, head_ratio)\n                    if was:\n                        msg_saved = original_len - len(content)\n\n            elif role == \"assistant\":\n                if \"<details\" in content and 'result=\"' in content:\n                    content, details_saved = self._truncate_details_results(\n                        content, max_chars, head_ratio, excluded_tools\n                    )\n                    msg_saved += details_saved\n\n            else:\n                continue\n\n            if msg_saved > 0:\n                total_chars_saved += msg_saved\n                truncated_count += 1\n                messages[i] = {**msg, \"content\": content}\n\n                if self.valves.debug:\n                    await self._status(\n                        __event_emitter__,\n                        f\"[TokenSaver] msg[{i}] role={role}: \"\n                        f\"{original_len:,} -> {len(content):,} chars \"\n                        f\"(saved {msg_saved:,})\",\n                    )\n\n        if truncated_count > 0:\n            estimated_tokens_saved = total_chars_saved // 4\n            summary = (\n                f\"[TokenSaver] Trimmed {truncated_count} message(s), \"\n                f\"saved ~{estimated_tokens_saved:,} tokens\"\n            )\n            _log.info(\n                \"token_saver: trimmed %d messages, saved ~%d chars (~%d tokens)\",\n                truncated_count,\n                total_chars_saved,\n                estimated_tokens_saved,\n            )\n            if self.valves.show_status:\n                await self._status(__event_emitter__, summary)\n        elif self.valves.debug:\n            await self._status(\n                __event_emitter__,\n                f\"[TokenSaver] {boundary} message(s) before boundary, \"\n                f\"none exceeded {max_chars:,} chars\",\n            )\n\n        return body\n"}]