/**
 * v2.0.72 (#115 #120 root-cause workaround) — NLU intent extractor.
 *
 * Cascade upstream's `SendUserCascadeMessage` proto has no OpenAI
 * `tools[]` field. The proxy injects tool definitions into the system
 * prompt (additional_instructions_section), but GPT / GLM / Kimi
 * weren't trained on prompt-level tool-calling protocols — they see the
 * `{"name":...}` instructions, decide to call
 * the tool, but emit it as natural-language NARRATION instead of the
 * exact markup we asked for. v2.0.71 fabricate detection just flagged
 * these as failures; v2.0.72 actually RECOVERS the call.
 *
 * Real probe captures (from scripts/probes/v2071-glm-kimi-tool-probe):
 *
 *   GLM-4.7 → "I should call the shell_exec function with the command
 *              'echo HELLO_FROM_PROBE'."
 *   GLM-5.1 → "I'll run the shell command as requested." (no args!)
 *   GPT-5.5 → "PROBE_V0270_1777751588" (pure fabricated output)
 *
 * The first one carries enough signal to reconstruct the call; the
 * second has the intent but no args; the third is hopeless. Layered
 * extraction:
 *
 *   Layer 1 (highest confidence) — explicit invocation syntax:
 *     "Let me run shell_command(command='echo HELLO')"
 *     "function_call: shell_exec(\"echo HELLO\")"
 *
 *   Layer 2 — backtick-quoted name + value:
 *     "I'll call `shell_exec` with command `echo HELLO`"
 *     "use the `Read` function with file_path `/etc/hosts`"
 *
 *   Layer 3 — natural narrative (model "thinking out loud"):
 *     "I should call the shell_exec function with the command 'echo HI'"
 *     "Let me invoke the Read tool to read /etc/hosts"
 *
 * Each layer requires the extracted name to match a caller-declared
 * tool. Layer 3 also requires the user prompt to plausibly want a
 * tool call (shell-style verbs in the most recent user message).
 *
 * Conservative by design: false-positive tool_calls drive agent loops
 * to execute things the model didn't actually decide on. When in
 * doubt, return [].
 */

import { log } from '../config.js';

/**
 * @typedef {Object} ExtractedToolCall
 * @property {string} name           OpenAI tool name (matches caller's tools[])
 * @property {string} argumentsJson  JSON-stringified args
 * @property {'explicit-syntax'|'backtick-quoted'|'narrative'} layer
 * @property {number} confidence     0..1
 */

/**
 * Choose the parameter a single-value shorthand should be assigned to.
 *
 * Preference order: the first required param whose schema type is
 * `string` (`command`, `file_path`, `query` — the one models naturally
 * mention with "with command X"), else the first required param, else
 * the first string-typed declared property, else the first declared
 * property.
 *
 * @param {*} schema  the tool's `function.parameters` value
 * @returns {*} the chosen param name, or undefined when the schema is not
 *   an object schema with `properties` or nothing usable is declared
 */
function pickPrimaryParam(schema) {
  if (schema?.type !== 'object' || !schema.properties) return undefined;

  const props = schema.properties;
  const isStringTyped = (key) => props[key]?.type === 'string';

  const required = Array.isArray(schema.required) ? schema.required : [];
  const stringIdx = required.findIndex(isStringTyped);
  let chosen = stringIdx === -1 ? required[0] : required[stringIdx];

  // Nothing usable among the required params: fall through to the
  // declared properties.
  if (!chosen) {
    const declared = Object.keys(props);
    chosen = declared.find(isStringTyped) || declared[0];
  }
  return chosen || undefined;
}

/**
 * Build a Set of declared tool names + a name → primaryParamName map
 * for inference of single-arg shorthands ("with command 'echo X'" →
 * arguments.command = 'echo X').
 *
 * Entries whose `type` is not `'function'`, or whose `function.name` is
 * not a non-empty string, are ignored. A non-array `tools` yields empty
 * collections.
 *
 * @param {*} tools  caller-declared OpenAI-style tools array
 * @returns {{ names: Set<string>, primaryParam: Map<string, string> }}
 */
function indexTools(tools) {
  const names = new Set();
  const primaryParam = new Map(); // tool name → primary param name
  const declaredTools = Array.isArray(tools) ? tools : [];

  for (const tool of declaredTools) {
    if (tool?.type !== 'function') continue;
    const fnName = tool.function?.name;
    if (typeof fnName !== 'string' || !fnName) continue;

    names.add(fnName);
    const primary = pickPrimaryParam(tool.function?.parameters);
    if (primary) primaryParam.set(fnName, primary);
  }
  return { names, primaryParam };
}

/**
 * Regex utility — escape a caller-controlled tool name so it can be
 * inserted into a RegExp source as a literal.
 *
 * @param {*} s  value to escape (coerced with String())
 * @returns {string}
 */
function escapeRe(s) {
  return String(s).replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
}

// v2.0.78 (#120 follow-up + audit H-2): values extracted from narrative
// can easily be a generic noun phrase ("a shell command", "the file",
// "your input") or a literal placeholder keyword ("command",
// "argument").
// Both kinds of value — generic noun phrases and literal placeholder
// keywords — produce garbage tool_calls: the agent loop will then try to
// execute `command` as a literal command, fail, and recurse.
// Reject these uniformly across all three layers.
const PLACEHOLDER_KEYWORDS = new Set([
  'command', 'argument', 'arguments', 'param', 'parameter', 'parameters',
  'input', 'value', 'file_path', 'filepath', 'path', 'query',
  'string', 'text', 'name', 'arg', 'output',
  // v2.0.81 (#125 — GLM-5.1 Chinese narrate): models echo Chinese
  // param-name keywords as the value too. "调用 shell_exec 命令 '命令'"
  // would otherwise produce a real tool_call with command='命令'.
  '命令', '参数', '文件', '路径', '输入', '值', '字符串', '文本', '名称', '查询', '输出',
]);

const ARTICLE_PREFIX_RE = /^(?:a|an|the|this|that|these|those|your|my|our|some|any|each|every)\s+/i;

// Chinese article-led / vague phrase prefixes — "某个命令" / "一个命令"
// / "某种参数" — same idea as ARTICLE_PREFIX_RE but for CJK.
const CN_VAGUE_PREFIX_RE = /^(?:某个?|一个|这个|那个|某种|什么|任何|每个|所有的?)/;

// Trailing sentence punctuation, ASCII and CJK. The CJK forms are written
// as escapes so they cannot be silently folded to their ASCII look-alikes:
// \u3002 ideographic full stop, \uFF0C \uFF1B \uFF1A \uFF01 \uFF1F
// fullwidth comma / semicolon / colon / exclamation mark / question mark.
const TRAILING_PUNCT_RE = /[.,;:!?\u3002\uFF0C\uFF1B\uFF1A\uFF01\uFF1F]+$/;

/**
 * Decide whether an extracted argument value is a placeholder rather
 * than a real value.
 *
 * @param {*} value  candidate argument value
 * @returns {boolean} true for non-strings, blank strings, a bare
 *   placeholder keyword (case-insensitive, trailing punctuation ignored),
 *   or a phrase led by an English article / Chinese vague prefix
 */
function looksLikePlaceholderValue(value) {
  if (typeof value !== 'string' || !value.trim()) return true;
  // Strip trailing punctuation before comparison.
  const stripped = value.trim().replace(TRAILING_PUNCT_RE, '');
  if (PLACEHOLDER_KEYWORDS.has(stripped.toLowerCase())) return true;
  // Article-led phrase ("a shell command", "the file") — model
  // narrating about the call rather than supplying the call value.
  if (ARTICLE_PREFIX_RE.test(stripped)) return true;
  // Chinese vague prefix — "某个命令", "一个文件", "这个参数"
  if (CN_VAGUE_PREFIX_RE.test(stripped)) return true;
  return false;
}

/**
 * Parse the JSON object that starts at `text[start]` (which must be `{`).
 *
 * A lazy regex capture ends at the FIRST `}`, which truncates nested
 * objects and string values containing `}`. This helper tries each
 * successive closing brace inside the window until the slice parses.
 *
 * @param {string} text
 * @param {number} start       index of the opening `{`
 * @param {number} [maxLen]    window length scanned for the closing `}`
 * @param {number} [maxTries]  cap on parse attempts (bounds the work on
 *   brace-heavy input)
 * @returns {Object|null} the parsed object, or null when nothing parses
 */
function parseJsonObjectAt(text, start, maxLen = 2002, maxTries = 32) {
  const limit = Math.min(text.length, start + maxLen);
  let close = text.indexOf('}', start);
  for (let tries = 0; close !== -1 && close < limit && tries < maxTries; tries += 1) {
    try {
      const parsed = JSON.parse(text.slice(start, close + 1));
      if (parsed !== null && typeof parsed === 'object' && !Array.isArray(parsed)) {
        return parsed;
      }
    } catch {
      // Slice is not complete JSON yet — widen to the next closing brace.
    }
    close = text.indexOf('}', close + 1);
  }
  return null;
}

/**
 * Layer 1: explicit invocation syntax.
 *
 *   shell_command(command="echo X")
 *   shell_exec("echo X")
 *   function_call: name=shell_exec args={"command":"echo X"}
 *
 * @param {string} text   model narrative
 * @param {Set<string>} names  declared tool names; other names are ignored
 * @param {Map<string, string>} [primaryParam]  tool name → primary param.
 *   When supplied, a positional value (`shell_exec("echo X")`) is keyed
 *   by the tool's primary param; without an entry it keeps the `_value`
 *   key.
 * @returns {ExtractedToolCall[]}
 */
function extractLayer1(text, names, primaryParam = new Map()) {
  const out = [];

  // function_name(arg=value) or function_name("value")
  const reExplicit = /\b([A-Za-z_][A-Za-z0-9_]*)\s*\(\s*(?:([A-Za-z_][A-Za-z0-9_]*)\s*=\s*)?["'`]([^"'`)]{1,2000})["'`]\s*\)/g;
  for (const m of text.matchAll(reExplicit)) {
    const [, fn, paramName, value] = m;
    if (!names.has(fn)) continue;
    if (looksLikePlaceholderValue(value)) continue;
    const key = paramName || primaryParam.get(fn) || '_value';
    out.push({
      name: fn,
      argumentsJson: JSON.stringify({ [key]: value }),
      layer: 'explicit-syntax',
      confidence: paramName ? 0.95 : 0.85,
    });
  }

  // function_call: name=X args={...}
  const reFc = /function[_\s]?call\s*[:=][^{]*?\bname\s*[:=]\s*["'`]?([A-Za-z_][A-Za-z0-9_]*)["'`]?[^{]*?(\{[\s\S]{1,2000}?\})/g;
  for (const m of text.matchAll(reFc)) {
    const [whole, fn, argsBlob] = m;
    if (!names.has(fn)) continue;
    // The blob is the last thing in the match, so its `{` sits here.
    const blobStart = m.index + whole.length - argsBlob.length;
    const args = parseJsonObjectAt(text, blobStart);
    // Conservative by design: a blob that never parses is not a
    // recoverable call — emitting it with empty args would hand the
    // agent loop a call the model did not actually specify.
    if (args === null) continue;
    out.push({
      name: fn,
      argumentsJson: JSON.stringify(args),
      layer: 'explicit-syntax',
      confidence: 0.9,
    });
  }
  return out;
}
// Layer 2 value pattern: optional "with [the] PARAM[:=]" lead-in followed
// by a backtick-quoted value. Every lead-in part is optional and the
// pattern is unanchored, so it finds the first backtick-quoted token in
// the tail.
const LAYER2_ARG_RE = /(?:with\s+)?(?:the\s+)?(?:argument|param|parameter|input|command|file[_-]?path|path|query)?\s*[:=]?\s*`([^`]{1,1000})`/i;

/**
 * Layer 2: backtick-quoted name + later backtick-quoted value.
 *
 *   "I'll call `shell_exec` with command `echo HELLO`"
 *   "use the `Read` function with file_path `/etc/hosts`"
 *
 * @param {string} text   model narrative
 * @param {Set<string>} names  declared tool names
 * @param {Map<string, string>} primaryParam  tool name → param the value
 *   is assigned to (falls back to `input`)
 * @returns {ExtractedToolCall[]}
 */
function extractLayer2(text, names, primaryParam) {
  const out = [];
  for (const fn of names) {
    const quotedName = new RegExp(`\`${escapeRe(fn)}\``, 'g');
    for (const m of text.matchAll(quotedName)) {
      // Look for next backtick-quoted token within 200 chars
      const after = m.index + m[0].length;
      const hit = text.slice(after, after + 200).match(LAYER2_ARG_RE);
      if (!hit) continue;
      const value = hit[1];
      if (looksLikePlaceholderValue(value)) continue;
      // "`shell_exec` or `Read`" enumerates tools; the second token is
      // another tool's name, not an argument for the first.
      if (names.has(value)) continue;
      const param = primaryParam.get(fn) || 'input';
      out.push({
        name: fn,
        argumentsJson: JSON.stringify({ [param]: value }),
        layer: 'backtick-quoted',
        confidence: 0.8,
      });
    }
  }
  return out;
}

// v2.0.81 (#125 DuZunTianXia): GLM-5.1 narrate in Chinese — log
// showed "让我用 Bash 来列出..." / "用户想查看..." / "我会调用 X
// 工具" — none of which the English-only verb regex picked up.
// Chinese verbs sit alongside English so the name pattern matches
// either language (or mixed). The primary tool-name match still
// requires the literal tool name (e.g. `Bash`, `shell_exec`) since
// those are emitted in the original alphabet by every model.
//
// \b doesn't match between Chinese and Latin, so there is no leading
// word boundary on the alternation as a whole. The English verbs carry a
// Latin-letter lookbehind instead, so "because Bash" does not match `use`.
const LAYER3_VERBS = '(?:(?<![A-Za-z])(?:call|invoke|run|use|execute|exec|trigger|fire)'
  + '|调用|使用|运行|执行|触发|启动|让我用|让我使用|我会用|我将用|通过|借助|采用)';

// Tool/function meta-words ONLY (not arg labels like "command" / "命令")
// so the latter stay in the tail and feed the argument patterns.
// Pre-v2.0.81 the suffix included "command" / "命令", which greedily
// consumed the very keyword that the keyword-led patterns need.
const LAYER3_META_WORD = '(?:function|tool|method|函数|工具|方法)';

// Layer 3 argument patterns, ordered by specificity; the first that
// matches the tail wins.
const LAYER3_ARG_PATTERNS = [
  // with the command 'echo X' / with command "echo X" / with command `echo X`
  /\bwith\s+(?:the\s+)?(?:command|argument|param(?:eter)?|input|file[_-]?path|path|query)\s+["'`]([^"'`\n]{1,500})["'`]/i,
  // bare keyword + value (no "with"): command 'echo X' / argument "X"
  /(?:^|\s)(?:command|argument|param(?:eter)?|input|file[_-]?path|path|query)\s+["'`]([^"'`\n]{1,500})["'`]/i,
  // Chinese: 用命令 'X' / 传入 'X' / 参数 'X' / 命令 'X' / 路径 'X'
  /(?:用|使用|传入|输入|参数(?:为)?|命令(?:为)?|路径(?:为)?|文件(?:为)?|查询(?:为)?)\s*["'`「『]([^"'`\n「」『』]{1,500})["'`」』]/,
  // with 'echo X' (no param keyword)
  /\bwith\s+["'`]([^"'`\n]{1,500})["'`]/i,
  // to read /etc/hosts (positional after action verb)
  /\bto\s+(?:read|run|execute|view|search|find|cat|ls)\s+([\S][^\n]{0,200})/i,
  // : 'echo X' / = 'echo X'
  /[:=]\s*["'`]([^"'`\n]{1,500})["'`]/,
  // last resort: very first quoted string in the tail, allowing leading
  // whitespace and ASCII / CJK (\uFF0C fullwidth comma, \u3002
  // ideographic full stop) punctuation
  /^[\s,\uFF0C\u3002.]*["'`「『]([^"'`\n「」『』]{1,500})["'`」』]/,
];

/**
 * Layer 3: natural narrative.
 *
 *   "I should call the shell_exec function with the command 'echo HI'"
 *   "Let me invoke the Read tool to read /etc/hosts"
 *   "I'll run shell_command with command echo HELLO"
 *
 * Name pattern per declared tool (case-insensitive):
 *   <verb> [the] [function|tool] <name> [function|tool]
 * The name may be backtick-quoted and must not be followed by another
 * identifier character, so `Read` does not match inside `ReadFile`.
 *
 * @param {string} text   model narrative
 * @param {Set<string>} names  declared tool names
 * @param {Map<string, string>} primaryParam  tool name → param the value
 *   is assigned to (falls back to `input`)
 * @returns {ExtractedToolCall[]}
 */
function extractLayer3(text, names, primaryParam) {
  const out = [];
  for (const fn of names) {
    const namePat = new RegExp(
      `${LAYER3_VERBS}\\s*(?:the\\s+)?${LAYER3_META_WORD}?\\s*\`?${escapeRe(fn)}(?![A-Za-z0-9_])\`?(?:\\s+${LAYER3_META_WORD})?`,
      'gi',
    );
    for (const m of text.matchAll(namePat)) {
      // Hunt for value within next 300 chars
      const after = m.index + m[0].length;
      const tail = text.slice(after, after + 300);

      let value = null;
      for (const pat of LAYER3_ARG_PATTERNS) {
        const hit = tail.match(pat);
        if (hit && hit[1]) {
          value = hit[1].trim();
          break;
        }
      }
      if (!value) continue;

      // v2.0.76 + v2.0.78 (audit H-2): reject placeholder keywords
      // (`command` / `argument` / ...) AND article-led prose phrases
      // (`a shell command` / `the file` / `your input`). GLM-4.7
      // narrative reproducer "to run a shell command" was capturing
      // "a shell command." as the value pre-v2.0.78 even with the
      // single-word filter in place.
      if (looksLikePlaceholderValue(value)) continue;

      const param = primaryParam.get(fn) || 'input';
      out.push({
        name: fn,
        argumentsJson: JSON.stringify({ [param]: value }),
        layer: 'narrative',
        confidence: 0.65,
      });
    }
  }
  return out;
}

// v2.0.81 (#125): widened to Chinese verbs/nouns so GLM-5.1 / Kimi
// running with a Chinese system prompt + Chinese user turn still
// routes through Layer 3.
const ACTIONABLE_PROMPT_PATTERNS = [
  /\b(?:run|exec|execute|cat|ls|echo|grep|find|read|search|list|invoke|call|fetch|get|fix|edit|write|patch)\b/i,
  /\b(?:shell|bash|terminal|command|tool|function|file|path)\b/i,
  /(?:运行|执行|读取|查看|列出|查找|搜索|获取|修改|编辑|写入|修复|分析|调用|使用|拉取|下载|找到|看一下|看看|检查)/,
  /(?:文件|目录|路径|命令|工具|函数|参数|项目|代码|配置)/,
];

/**
 * Detect whether the user prompt asked for an action a function could
 * perform. Layer 3 (narrative) only fires when this is true to avoid
 * false-positive tool_call extraction from casual chat.
 *
 * @param {string} lastUserText  most recent user message
 * @returns {boolean} true when the text contains an English or Chinese
 *   action verb or tool-ish noun; false for empty / missing text
 */
function userPromptLooksActionable(lastUserText) {
  if (!lastUserText) return false;
  return ACTIONABLE_PROMPT_PATTERNS.some((pattern) => pattern.test(lastUserText));
}
// Verb forms (English + Chinese) that signal "I'm about to call X".
const TOOL_INTENT_VERB_RE = /\b(?:call|invoke|run|use|execute|exec|trigger|fire|going to|will|let me|i'?ll|i'?m going|need to|should)\b|(?:调用|使用|运行|执行|触发|启动|让我|我会|我将|准备|打算|想要|需要|应该)/i;

// Action keywords (file ops, search, read, etc.) — these stand in
// for "the model is talking about USING tools generically".
// `analy[sz]\w*` covers analyze / analyse / analyzing; a bare `analyz`
// stem between word boundaries could never match a real word.
const TOOL_ACTION_VERB_RE = /\b(?:list|show|read|cat|grep|find|search|view|fetch|get|create|write|edit|run|execute|check|inspect|examine|analy[sz]\w*|browse|explore)\b|(?:列出|展示|读取|查看|查找|搜索|获取|拉取|下载|创建|写入|编辑|运行|执行|检查|检视|分析|浏览|探索|看一下|看看)/i;

/**
 * Detect whether the model's narrative looks like it INTENDED to call
 * a tool but never produced a usable extraction. Used to gate the
 * retry-with-correction loop in chat.js — we only burn an extra
 * cascade round-trip when there's clear tool intent we couldn't
 * recover.
 *
 * v2.0.82 (#125 — proper translator layer beyond NLU).
 *
 * @param {string} text   model narrative
 * @param {Array} tools   caller-declared OpenAI-style tools
 * @param {{ lastUserText?: string }} [opts]  `lastUserText` is the most
 *   recent user message; it must look actionable or the result is null
 * @returns {string|null} one of:
 *   - the matched declared tool name (when the model named it inline,
 *     as a whole word or backtick-quoted)
 *   - the FIRST declared tool name (when the narrative shows clear
 *     action intent + user actionable prompt + an action verb,
 *     even if the model didn't name a specific tool — GLM-5.1 will
 *     say "Let me list the files" without saying "Bash")
 *   - null when there's no usable signal
 */
export function detectToolIntentInNarrative(text, tools, opts = {}) {
  if (typeof text !== 'string' || !text.trim()) return null;
  if (!Array.isArray(tools) || !tools.length) return null;

  const lastUserText = opts.lastUserText || '';
  if (!userPromptLooksActionable(lastUserText)) return null;

  const { names } = indexTools(tools);
  if (!names.size) return null;
  if (!TOOL_INTENT_VERB_RE.test(text)) return null;

  // Pass 1: specific tool name in narrative (most precise).
  for (const fn of names) {
    const escaped = escapeRe(fn);
    const named = new RegExp(`\\b${escaped}\\b|\`${escaped}\``);
    if (named.test(text)) return fn;
  }

  // Pass 2: action keyword present (model said "let me list..." but
  // didn't name the tool). Return the first declared tool — caller's
  // correction prompt will name it explicitly so the retry knows
  // which tool to emit.
  if (TOOL_ACTION_VERB_RE.test(text)) return names.values().next().value;
  return null;
}
/**
 * Top-level extractor. Returns a deduped, confidence-sorted list of
 * extracted tool_calls. Empty array when nothing is recoverable.
 *
 * Set the `WINDSURFAPI_NLU_RECOVERY=0` env to turn off entirely
 * (default ON).
 *
 * @param {string} text   model narrative
 * @param {Array} tools   caller-declared OpenAI-style tools
 * @param {Object} [opts]
 * @param {string} [opts.lastUserText]  most recent user message; Layer 3
 *   only runs when it looks actionable
 * @param {number} [opts.minConfidence=0.65]  candidates below this are dropped
 * @param {string[]} [opts.markers]  structural markers seen in the output;
 *   `xml_tag` without `natural_lang` disables Layer 3
 * @returns {ExtractedToolCall[]} highest confidence first
 */
export function extractIntentFromNarrative(text, tools, opts = {}) {
  if (process.env.WINDSURFAPI_NLU_RECOVERY === '0') return [];
  if (typeof text !== 'string' || !text.trim()) return [];
  if (!Array.isArray(tools) || !tools.length) return [];

  const lastUserText = opts.lastUserText || '';
  const minConfidence = typeof opts.minConfidence === 'number' ? opts.minConfidence : 0.65;

  // v2.0.78 (audit H-4): structural markers MAY indicate a malformed
  // protocol attempt — Layer 3 narrative around it tends to be
  // descriptive prose, not args. v2.0.79 narrowed the gate after
  // GLM-4.7 e2e probe regressed: GLM emits `markers=bare_json`
  // (because thinking text contains JSON-shaped fragments) AND a
  // legitimate narrate; Layer 3 is exactly what catches the narrate.
  // Now we only skip Layer 3 for `xml_tag` (Claude's tool_use shape)
  // — that's where parser-failure → Layer 3 most often produces
  // false positives. fenced_json / bare_json / openai_native still
  // allow Layer 3 because models emitting those shapes (GLM, Kimi,
  // some GPT) also reliably narrate the call in surrounding prose.
  const markers = Array.isArray(opts.markers) ? opts.markers : [];
  const skipLayer3 = markers.includes('xml_tag') && !markers.includes('natural_lang');

  const { names, primaryParam } = indexTools(tools);
  if (!names.size) return [];

  const runLayer3 = !skipLayer3 && userPromptLooksActionable(lastUserText);
  const candidates = [
    // primaryParam lets a positional Layer 1 value be keyed by the tool's
    // real parameter name, so it dedupes against the same call recovered
    // by Layer 2/3 instead of surfacing twice.
    ...extractLayer1(text, names, primaryParam),
    ...extractLayer2(text, names, primaryParam),
    ...(runLayer3 ? extractLayer3(text, names, primaryParam) : []),
  ];
  if (!candidates.length) return [];

  // Dedupe by (name, argumentsJson). Keep the highest-confidence pick.
  const best = new Map();
  for (const call of candidates) {
    if (call.confidence < minConfidence) continue;
    const key = `${call.name}::${call.argumentsJson}`;
    const prior = best.get(key);
    if (!prior || call.confidence > prior.confidence) best.set(key, call);
  }

  const recovered = [...best.values()].sort((a, b) => b.confidence - a.confidence);
  if (recovered.length) {
    log.info(`NLU recovery: extracted ${recovered.length} tool_call(s) from narrative — ${recovered.map(t => `${t.name}@${t.layer}/${t.confidence.toFixed(2)}`).join(', ')}${skipLayer3 ? ' (layer3-skipped: structural markers seen)' : ''}`);
  }
  return recovered;
}