| /** | |
| * v2.0.72 (#115 #120 root-cause workaround) — NLU intent extractor. | |
| * | |
| * Cascade upstream's `SendUserCascadeMessage` proto has no OpenAI | |
| * `tools[]` field. The proxy injects tool definitions into the system | |
| * prompt (additional_instructions_section), but GPT / GLM / Kimi | |
| * weren't trained on prompt-level tool-calling protocols — they see the | |
| * `<tool_call>{"name":...}</tool_call>` instructions, decide to call | |
| * the tool, but emit it as natural-language NARRATION instead of the | |
| * exact markup we asked for. v2.0.71 fabricate detection just flagged | |
| * these as failures; v2.0.72 actually RECOVERS the call. | |
| * | |
| * Real probe captures (from scripts/probes/v2071-glm-kimi-tool-probe): | |
| * | |
| * GLM-4.7 → "I should call the shell_exec function with the command | |
| * 'echo HELLO_FROM_PROBE'." | |
| * GLM-5.1 → "I'll run the shell command as requested." (no args!) | |
| * GPT-5.5 → "PROBE_V0270_1777751588" (pure fabricated output) | |
| * | |
| * The first one carries enough signal to reconstruct the call; the | |
| * second has the intent but no args; the third is hopeless. Layered | |
| * extraction: | |
| * | |
| * Layer 1 (highest confidence) — explicit invocation syntax: | |
| * "Let me run shell_command(command='echo HELLO')" | |
| * "function_call: shell_exec(\"echo HELLO\")" | |
| * | |
| * Layer 2 — backtick-quoted name + value: | |
| * "I'll call `shell_exec` with command `echo HELLO`" | |
| * "use the `Read` function with file_path `/etc/hosts`" | |
| * | |
| * Layer 3 — natural narrative (model "thinking out loud"): | |
| * "I should call the shell_exec function with the command 'echo HI'" | |
| * "Let me invoke the Read tool to read /etc/hosts" | |
| * | |
| * Each layer requires the extracted name to match a caller-declared | |
| * tool. Layer 3 also requires the user prompt to plausibly want a | |
| * tool call (shell-style verbs in the most recent user message). | |
| * | |
| * Conservative by design: false-positive tool_calls drive agent loops | |
| * to execute things the model didn't actually decide on. When in | |
| * doubt, return []. | |
| */ | |
| import { log } from '../config.js'; | |
| /** | |
| * @typedef {Object} ExtractedToolCall | |
| * @property {string} name OpenAI tool name (matches caller's tools[]) | |
| * @property {string} argumentsJson JSON-stringified args | |
| * @property {'explicit-syntax'|'backtick-quoted'|'narrative'} layer | |
| * @property {number} confidence 0..1 | |
| */ | |
/**
 * Index the caller-declared tools.
 *
 * Produces the set of declared function-tool names plus a
 * name → "primary parameter" map. The primary parameter is the key used
 * when a narrative supplies a single bare value ("with command 'echo X'"
 * → arguments.command = 'echo X').
 *
 * Selection order for the primary parameter:
 *   1. the first entry of `required` whose schema type is `string`;
 *   2. otherwise the first entry of `required`;
 *   3. otherwise the first declared property typed `string`;
 *   4. otherwise the first declared property.
 *
 * @param {Array<object>} tools OpenAI-style `tools[]`; non-arrays yield empty indexes.
 * @returns {{ names: Set<string>, primaryParam: Map<string, string> }}
 */
function indexTools(tools) {
  const names = new Set();
  const primaryParam = new Map();
  const toolList = Array.isArray(tools) ? tools : [];

  for (const tool of toolList) {
    // Only `type: "function"` entries with a non-empty string name count.
    if (tool?.type !== 'function') continue;
    const toolName = tool.function?.name;
    if (typeof toolName !== 'string' || toolName === '') continue;
    names.add(toolName);

    // A primary param can only be inferred from an object schema.
    const schema = tool.function?.parameters;
    if (schema?.type !== 'object' || !schema.properties) continue;

    const props = schema.properties;
    const isStringParam = (key) => props[key]?.type === 'string';
    const requiredList = Array.isArray(schema.required) ? schema.required : [];

    // String-typed required params (`command`, `file_path`, `query`) are
    // the ones models mention in "with command X" style narrative.
    const stringIdx = requiredList.findIndex(isStringParam);
    let chosen = requiredList[stringIdx >= 0 ? stringIdx : 0];

    // Nothing required: fall back to the declared properties.
    if (!chosen) {
      const declared = Object.keys(props);
      chosen = declared.find(isStringParam) || declared[0];
    }
    if (chosen) primaryParam.set(toolName, chosen);
  }

  return { names, primaryParam };
}
/**
 * Escape a (caller-controlled) tool name so it can be spliced into a
 * RegExp source and match literally.
 *
 * @param {*} s Value to escape; coerced with `String()`.
 * @returns {string} Input with every regex metacharacter backslash-escaped.
 */
function escapeRe(s) {
  const raw = String(s);
  return raw.replace(/[.*+?^${}()|[\]\\]/g, (meta) => `\\${meta}`);
}
// v2.0.78 (#120 follow-up + audit H-2): values extracted from narrative
// can easily be a generic noun phrase ("a shell command", "the file",
// "your input") or a literal placeholder keyword ("command",
// "argument"). Both produce garbage tool_calls — the agent loop will
// then try to execute `command` as a literal command, fail, and recurse.
// Reject these uniformly across all three layers.
const PLACEHOLDER_KEYWORDS = new Set([
  'command', 'argument', 'arguments', 'param', 'parameter',
  'parameters', 'input', 'value', 'file_path', 'filepath', 'path',
  'query', 'string', 'text', 'name', 'arg', 'output',
  // v2.0.81 (#125 — GLM-5.1 Chinese narrate): models echo Chinese
  // param-name keywords as the value too. A narrated call whose value is
  // just '命令' ("command") would otherwise become a real tool_call.
  '命令', '参数', '文件', '路径', '输入', '值', '字符串', '文本', '名称', '查询', '输出',
]);
// English determiners that introduce a descriptive phrase rather than a value.
const ARTICLE_PREFIX_RE = /^(?:a|an|the|this|that|these|those|your|my|our|some|any|each|every)\s+/i;
// Chinese vague / determiner prefixes ("some", "a", "this", "that",
// "any", "every", "all") — same idea as ARTICLE_PREFIX_RE but for CJK.
const CN_VAGUE_PREFIX_RE = /^(?:某个?|一个|这个|那个|某种|什么|任何|每个|所有的?)/;
// Trailing sentence punctuation, ASCII and CJK. The full-width forms are
// written as escapes so they survive editors / pipelines that normalise
// full-width punctuation to ASCII:
//   \u3002 ideographic full stop   \u3001 ideographic comma
//   \uFF0C \uFF1B \uFF1A \uFF01 \uFF1F  full-width , ; : ! ?
const TRAILING_PUNCT_RE = /[.,;:!?\u3002\u3001\uFF0C\uFF1B\uFF1A\uFF01\uFF1F]+$/;

/**
 * Decide whether an extracted argument value is a placeholder rather
 * than a real value.
 *
 * @param {*} value Candidate value captured from the narrative.
 * @returns {boolean} `true` when the value is not a string, is blank, is
 *   a bare placeholder keyword (after stripping trailing punctuation,
 *   case-insensitively), or starts with an English article / Chinese
 *   vague prefix; `false` otherwise.
 */
function looksLikePlaceholderValue(value) {
  if (typeof value !== 'string' || !value.trim()) return true;
  // Strip trailing punctuation before comparing, so "command." and a
  // keyword followed by a full-width comma are still recognised.
  const stripped = value.trim().replace(TRAILING_PUNCT_RE, '');
  if (PLACEHOLDER_KEYWORDS.has(stripped.toLowerCase())) return true;
  // Article-led phrase ("a shell command", "the file") — the model is
  // narrating about the call rather than supplying the call value.
  if (ARTICLE_PREFIX_RE.test(stripped)) return true;
  // Chinese vague prefix — e.g. "某个命令", "一个文件", "这个参数".
  if (CN_VAGUE_PREFIX_RE.test(stripped)) return true;
  return false;
}
/**
 * Return the balanced `{…}` substring of `text` that starts at `start`
 * (which must index a `{`), or `null` when no matching close brace is
 * found within `maxLen` characters. Braces inside double-quoted JSON
 * strings are ignored.
 */
function sliceBalancedJsonObject(text, start, maxLen) {
  const limit = Math.min(text.length, start + maxLen);
  let depth = 0;
  let inString = false;
  let escaped = false;
  for (let i = start; i < limit; i += 1) {
    const ch = text[i];
    if (inString) {
      if (escaped) escaped = false;
      else if (ch === '\\') escaped = true;
      else if (ch === '"') inString = false;
    } else if (ch === '"') {
      inString = true;
    } else if (ch === '{') {
      depth += 1;
    } else if (ch === '}') {
      depth -= 1;
      if (depth === 0) return text.slice(start, i + 1);
    }
  }
  return null;
}

/**
 * Layer 1: explicit invocation syntax.
 *
 *   shell_command(command="echo X")
 *   shell_exec("echo X")
 *   function_call: name=shell_exec args={"command":"echo X"}
 *
 * Only names present in `names` are accepted. For the `function_call:`
 * form the args object must be well-formed JSON (nested objects are
 * supported); a truncated or unparseable args blob is skipped instead of
 * being emitted with empty arguments, so a broken fragment never turns
 * into a high-confidence call.
 *
 * NOTE(review): positional values (`fn("x")`) are keyed `_value` because
 * this layer has no param-name map — confirm the consumer maps that key
 * to the tool's real parameter.
 *
 * @param {string} text Model narrative.
 * @param {Set<string>} names Declared tool names.
 * @returns {Array<{name: string, argumentsJson: string, layer: 'explicit-syntax', confidence: number}>}
 */
function extractLayer1(text, names) {
  const out = [];
  let m;

  // function_name(arg=value) or function_name("value")
  const reExplicit = /\b([A-Za-z_][A-Za-z0-9_]*)\s*\(\s*(?:([A-Za-z_][A-Za-z0-9_]*)\s*=\s*)?["'`]([^"'`)]{1,2000})["'`]\s*\)/g;
  while ((m = reExplicit.exec(text)) !== null) {
    const [, fn, paramName, value] = m;
    if (!names.has(fn)) continue;
    if (looksLikePlaceholderValue(value)) continue;
    const args = paramName ? { [paramName]: value } : { _value: value };
    out.push({
      name: fn,
      argumentsJson: JSON.stringify(args),
      layer: 'explicit-syntax',
      // A named argument is stronger evidence than a positional one.
      confidence: paramName ? 0.95 : 0.85,
    });
  }

  // function_call: name=X args={...}
  // The regex only locates the header up to the opening brace; the args
  // object itself is found by brace matching so nested JSON survives.
  const reFcHead = /function[_\s]?call\s*[:=][^{]*?\bname\s*[:=]\s*["'`]?([A-Za-z_][A-Za-z0-9_]*)["'`]?[^{]*?\{/g;
  const maxBlobChars = 2002; // same bound as before: "{" + up to 2000 chars + "}"
  while ((m = reFcHead.exec(text)) !== null) {
    const fn = m[1];
    if (!names.has(fn)) continue;
    const braceAt = reFcHead.lastIndex - 1;
    const argsBlob = sliceBalancedJsonObject(text, braceAt, maxBlobChars);
    if (argsBlob === null) continue;
    let args;
    try {
      args = JSON.parse(argsBlob);
    } catch {
      // Not valid JSON: we cannot know the arguments, so emit nothing.
      continue;
    }
    // Resume scanning after the consumed args object.
    reFcHead.lastIndex = braceAt + argsBlob.length;
    out.push({
      name: fn,
      argumentsJson: JSON.stringify(args),
      layer: 'explicit-syntax',
      confidence: 0.9,
    });
  }
  return out;
}
/**
 * Layer 2: backtick-quoted name + later backtick-quoted value.
 *
 *   "I'll call `shell_exec` with command `echo HELLO`"
 *   "use the `Read` function with file_path `/etc/hosts`"
 *
 * For every backtick-quoted occurrence of a declared tool name, the next
 * backtick-quoted token within 200 characters is taken as the value of
 * the tool's primary parameter (`input` when none is known).
 *
 * A value is rejected when it is a placeholder or when it is itself a
 * declared tool name: a sentence that merely lists tools ("use `Read` or
 * `Bash`") must not become `Read(file_path="Bash")`.
 *
 * @param {string} text Model narrative.
 * @param {Set<string>} names Declared tool names.
 * @param {Map<string, string>} primaryParam Tool name → primary parameter name.
 * @returns {Array<{name: string, argumentsJson: string, layer: 'backtick-quoted', confidence: number}>}
 */
function extractLayer2(text, names, primaryParam) {
  const out = [];
  // Optional "with [the] PARAM [:=]" lead-in, then the backtick-quoted
  // value. Not global, so it is stateless and safe to reuse.
  const argRe = /(?:with\s+)?(?:the\s+)?(?:argument|param|parameter|input|command|file[_-]?path|path|query)?\s*[:=]?\s*`([^`]{1,1000})`/i;
  for (const fn of names) {
    const fnRe = new RegExp(`\\\`${escapeRe(fn)}\\\``, 'g');
    let m;
    while ((m = fnRe.exec(text)) !== null) {
      // Look for the next backtick-quoted token within 200 chars.
      const tailStart = m.index + m[0].length;
      const tail = text.slice(tailStart, tailStart + 200);
      const a = tail.match(argRe);
      if (!a) continue;
      const value = a[1];
      // The "value" is another tool being mentioned, not an argument.
      if (names.has(value.trim())) continue;
      if (looksLikePlaceholderValue(value)) continue;
      const param = primaryParam.get(fn) || 'input';
      out.push({
        name: fn,
        argumentsJson: JSON.stringify({ [param]: value }),
        layer: 'backtick-quoted',
        confidence: 0.8,
      });
    }
  }
  return out;
}
// Value patterns tried, in order of specificity, against the text that
// follows a narrated tool mention. None is global, so they are stateless
// and can be shared across calls instead of being rebuilt per match.
const LAYER3_ARG_PATTERNS = [
  // with the command 'echo X' / with command "echo X" / with command `echo X`
  /\bwith\s+(?:the\s+)?(?:command|argument|param(?:eter)?|input|file[_-]?path|path|query)\s+["'`]([^"'`\n]{1,500})["'`]/i,
  // bare keyword + value (no "with"): command 'echo X' / argument "X"
  /(?:^|\s)(?:command|argument|param(?:eter)?|input|file[_-]?path|path|query)\s+["'`]([^"'`\n]{1,500})["'`]/i,
  // Chinese lead-ins (use / pass in / input / parameter / command / path /
  // file / query, optionally followed by "为") + quoted value
  /(?:用|使用|传入|输入|参数(?:为)?|命令(?:为)?|路径(?:为)?|文件(?:为)?|查询(?:为)?)\s*["'`「『]([^"'`\n「」『』]{1,500})["'`」』]/,
  // with 'echo X' (no param keyword)
  /\bwith\s+["'`]([^"'`\n]{1,500})["'`]/i,
  // to read /etc/hosts (positional after action verb)
  /\bto\s+(?:read|run|execute|view|search|find|cat|ls)\s+([\S][^\n]{0,200})/i,
  // : 'echo X' / = 'echo X'
  /[:=]\s*["'`]([^"'`\n]{1,500})["'`]/,
  // last resort: very first quoted string in the tail, allowing leading
  // whitespace and ASCII / CJK commas and full stops (\uFF0C is the
  // full-width comma, written as an escape so it cannot be normalised away)
  /^[\s,\uFF0C。.]*["'`「『]([^"'`\n「」『』]{1,500})["'`」』]/,
];

/**
 * Layer 3: natural narrative.
 *
 *   "I should call the shell_exec function with the command 'echo HI'"
 *   "Let me invoke the Read tool to read /etc/hosts"
 *   "I'll run shell_command with command echo HELLO"
 *
 * For each declared tool, finds "<verb> [the] [function|tool] <name>
 * [function|tool]" (English or Chinese verbs, case-insensitive), then
 * hunts for a value in the following 300 characters. The value is
 * assigned to the tool's primary parameter (`input` when none is known).
 *
 * The tool name must not be immediately followed by another identifier
 * character: without that guard a tool named `Read` would match the
 * prefix of `ReadFile` and steal its arguments.
 *
 * @param {string} text Model narrative.
 * @param {Set<string>} names Declared tool names.
 * @param {Map<string, string>} primaryParam Tool name → primary parameter name.
 * @returns {Array<{name: string, argumentsJson: string, layer: 'narrative', confidence: number}>}
 */
function extractLayer3(text, names, primaryParam) {
  const out = [];
  // v2.0.81 (#125): GLM-5.1 narrates in Chinese ("让我用 Bash 来列出...",
  // "我会调用 X 工具"), which an English-only verb list never matched.
  // Chinese verbs sit alongside the English ones so either language (or
  // a mix) matches. The tool name itself must still appear literally —
  // models emit it in its original alphabet.
  const verbs = '(?:call|invoke|run|use|execute|exec|trigger|fire'
    + '|调用|使用|运行|执行|触发|启动|让我用|让我使用|我会用|我将用|通过|借助|采用)';
  const articles = '(?:the\\s+)?';
  // Suffix matches ONLY tool/function meta-words, not arg labels such as
  // "command" / "命令": those must stay in the tail to feed the arg patterns.
  const suffix = '(?:\\s+(?:function|tool|method|函数|工具|方法))?';
  // Trailing boundary for the tool name. A lookahead is used instead of
  // \b because \b does not behave usefully next to CJK text.
  const nameEnd = '(?![A-Za-z0-9_-])';
  for (const fn of names) {
    // No leading \b for the same CJK reason; the verb list anchors the match.
    const namePat = new RegExp(
      `${verbs}\\s*${articles}(?:function|tool|method|函数|工具|方法)?\\s*\\\`?${escapeRe(fn)}${nameEnd}\\\`?${suffix}`,
      'gi',
    );
    let m;
    while ((m = namePat.exec(text)) !== null) {
      // Hunt for a value within the next 300 chars.
      const tailStart = m.index + m[0].length;
      const tail = text.slice(tailStart, tailStart + 300);
      let value = null;
      for (const pat of LAYER3_ARG_PATTERNS) {
        const a = tail.match(pat);
        if (a && a[1]) { value = a[1].trim(); break; }
      }
      if (!value) continue;
      // v2.0.76 + v2.0.78 (audit H-2): reject placeholder keywords
      // (`command` / `argument` / ...) AND article-led prose phrases
      // (`a shell command` / `the file` / `your input`) — e.g. the
      // narrative "to run a shell command" must not yield a call.
      if (looksLikePlaceholderValue(value)) continue;
      const param = primaryParam.get(fn) || 'input';
      out.push({
        name: fn,
        argumentsJson: JSON.stringify({ [param]: value }),
        layer: 'narrative',
        confidence: 0.65,
      });
    }
  }
  return out;
}
/**
 * Heuristic: did the user's latest message ask for something a tool
 * could do? Layer 3 (narrative) extraction is gated on this so casual
 * chat does not produce false-positive tool_calls.
 *
 * @param {string} lastUserText Most recent user message.
 * @returns {boolean} `true` when the text contains an action verb or a
 *   tool-ish noun (English, matched on word boundaries, or Chinese).
 */
function userPromptLooksActionable(lastUserText) {
  if (!lastUserText) return false;
  const signals = [
    // English action verbs.
    /\b(?:run|exec|execute|cat|ls|echo|grep|find|read|search|list|invoke|call|fetch|get|fix|edit|write|patch)\b/i,
    // English tool / filesystem nouns.
    /\b(?:shell|bash|terminal|command|tool|function|file|path)\b/i,
    // v2.0.81 (#125): Chinese verbs, so a Chinese user turn still
    // routes through Layer 3.
    /(?:运行|执行|读取|查看|列出|查找|搜索|获取|修改|编辑|写入|修复|分析|调用|使用|拉取|下载|找到|看一下|看看|检查)/,
    // Chinese nouns (file, directory, path, command, tool, ...).
    /(?:文件|目录|路径|命令|工具|函数|参数|项目|代码|配置)/,
  ];
  return signals.some((re) => re.test(lastUserText));
}
/**
 * Detect whether the model's narrative looks like it INTENDED to call
 * a tool but never produced a usable extraction. Per the original
 * design notes this gates the retry-with-correction loop, so an extra
 * round-trip is only spent when there is clear, unrecovered tool intent.
 *
 * Returns one of:
 *   - the matched declared tool name (when the model named it inline)
 *   - the FIRST declared tool name (when the user prompt is actionable
 *     and the narrative contains both an intent verb and an action verb
 *     but names no tool — e.g. "Let me list the files")
 *   - null when there's no usable signal
 *
 * v2.0.82 (#125 — proper translator layer beyond NLU).
 *
 * @param {string} text Model narrative.
 * @param {Array<object>} tools OpenAI-style `tools[]`.
 * @param {{lastUserText?: string}} [opts] Most recent user message.
 * @returns {string|null}
 */
export function detectToolIntentInNarrative(text, tools, opts = {}) {
  if (typeof text !== 'string' || !text.trim()) return null;
  if (!Array.isArray(tools) || !tools.length) return null;
  const lastUserText = opts.lastUserText || '';
  if (!userPromptLooksActionable(lastUserText)) return null;
  const { names } = indexTools(tools);
  if (!names.size) return null;
  // Verb forms (English + Chinese) that signal "I'm about to call X".
  const verbPattern = /\b(?:call|invoke|run|use|execute|exec|trigger|fire|going to|will|let me|i'?ll|i'?m going|need to|should)\b|(?:调用|使用|运行|执行|触发|启动|让我|我会|我将|准备|打算|想要|需要|应该)/i;
  if (!verbPattern.test(text)) return null;
  // Action keywords (file ops, search, read, etc.) — these stand in
  // for "the model is talking about USING tools generically".
  // `analy[sz]\w*` covers analyze / analyse / analyzing / analysis; a bare
  // `analyz` stem between \b anchors could never match a real word.
  const actionVerbPattern = /\b(?:list|show|read|cat|grep|find|search|view|fetch|get|create|write|edit|run|execute|check|inspect|examine|analy[sz]\w*|browse|explore)\b|(?:列出|展示|读取|查看|查找|搜索|获取|拉取|下载|创建|写入|编辑|运行|执行|检查|检视|分析|浏览|探索|看一下|看看)/i;
  // Pass 1: specific tool name in narrative (most precise), either as a
  // whole word or wrapped in backticks. Case-sensitive.
  for (const fn of names) {
    const fnRe = new RegExp(`\\b${escapeRe(fn)}\\b|\\\`${escapeRe(fn)}\\\``);
    if (fnRe.test(text)) return fn;
  }
  // Pass 2: action keyword present (model said "let me list..." but
  // didn't name the tool). Return the first declared tool — the caller's
  // correction prompt is expected to name it explicitly for the retry.
  if (actionVerbPattern.test(text)) return [...names][0];
  return null;
}
/**
 * Top-level extractor: run the three layers over the model narrative and
 * return the recovered tool_calls, deduped and sorted by descending
 * confidence. Returns an empty array when nothing is recoverable.
 *
 * Setting the `WINDSURFAPI_NLU_RECOVERY=0` env var disables recovery
 * entirely (it is on by default).
 *
 * @param {string} text Model narrative.
 * @param {Array<object>} tools OpenAI-style `tools[]`.
 * @param {{lastUserText?: string, minConfidence?: number, markers?: string[]}} [opts]
 *   `lastUserText` gates Layer 3; `minConfidence` (default 0.65) drops
 *   weaker candidates; `markers` lists structural markers seen upstream.
 * @returns {Array<{name: string, argumentsJson: string, layer: string, confidence: number}>}
 */
export function extractIntentFromNarrative(text, tools, opts = {}) {
  if (process.env.WINDSURFAPI_NLU_RECOVERY === '0') return [];
  const hasNarrative = typeof text === 'string' && text.trim().length > 0;
  if (!hasNarrative) return [];
  const hasTools = Array.isArray(tools) && tools.length > 0;
  if (!hasTools) return [];

  const promptText = opts.lastUserText || '';
  const threshold = typeof opts.minConfidence === 'number' ? opts.minConfidence : 0.65;

  // v2.0.78 (audit H-4) / v2.0.79: structural markers MAY indicate a
  // malformed protocol attempt, where Layer 3 narrative tends to be
  // descriptive prose rather than args. The gate is deliberately narrow:
  // Layer 3 is skipped only for `xml_tag` without `natural_lang`, since
  // that is where parser-failure → Layer 3 most often produced false
  // positives. fenced_json / bare_json / openai_native still allow
  // Layer 3 because models emitting those shapes also narrate the call.
  const seenMarkers = Array.isArray(opts.markers) ? opts.markers : [];
  const skipLayer3 = seenMarkers.includes('xml_tag') && !seenMarkers.includes('natural_lang');

  const { names, primaryParam } = indexTools(tools);
  if (names.size === 0) return [];

  const candidates = [
    ...extractLayer1(text, names),
    ...extractLayer2(text, names, primaryParam),
  ];
  if (!skipLayer3 && userPromptLooksActionable(promptText)) {
    candidates.push(...extractLayer3(text, names, primaryParam));
  }
  if (candidates.length === 0) return [];

  // Dedupe on (name, argumentsJson), keeping the highest-confidence pick
  // (first seen wins ties).
  const bestByKey = new Map();
  candidates.forEach((candidate) => {
    if (candidate.confidence < threshold) return;
    const key = `${candidate.name}::${candidate.argumentsJson}`;
    const current = bestByKey.get(key);
    if (current && !(candidate.confidence > current.confidence)) return;
    bestByKey.set(key, candidate);
  });

  const recovered = [...bestByKey.values()];
  recovered.sort((left, right) => right.confidence - left.confidence);

  if (recovered.length > 0) {
    const summary = recovered.map(t => `${t.name}@${t.layer}/${t.confidence.toFixed(2)}`).join(', ');
    log.info(`NLU recovery: extracted ${recovered.length} tool_call(s) from narrative — ${summary}${skipLayer3 ? ' (layer3-skipped: structural markers seen)' : ''}`);
  }
  return recovered;
}