/**
* v2.0.72 (#115 #120 root-cause workaround) — NLU intent extractor.
*
* Cascade upstream's `SendUserCascadeMessage` proto has no OpenAI
* `tools[]` field. The proxy injects tool definitions into the system
* prompt (additional_instructions_section), but GPT / GLM / Kimi
* weren't trained on prompt-level tool-calling protocols — they see the
* `<tool_call>{"name":...}</tool_call>` instructions, decide to call
* the tool, but emit it as natural-language NARRATION instead of the
* exact markup we asked for. v2.0.71 fabricate detection just flagged
* these as failures; v2.0.72 actually RECOVERS the call.
*
* Real probe captures (from scripts/probes/v2071-glm-kimi-tool-probe):
*
* GLM-4.7 → "I should call the shell_exec function with the command
* 'echo HELLO_FROM_PROBE'."
* GLM-5.1 → "I'll run the shell command as requested." (no args!)
* GPT-5.5 → "PROBE_V0270_1777751588" (pure fabricated output)
*
* The first one carries enough signal to reconstruct the call; the
* second has the intent but no args; the third is hopeless. Layered
* extraction:
*
* Layer 1 (highest confidence) — explicit invocation syntax:
* "Let me run shell_command(command='echo HELLO')"
* "function_call: shell_exec(\"echo HELLO\")"
*
* Layer 2 — backtick-quoted name + value:
* "I'll call `shell_exec` with command `echo HELLO`"
* "use the `Read` function with file_path `/etc/hosts`"
*
* Layer 3 — natural narrative (model "thinking out loud"):
* "I should call the shell_exec function with the command 'echo HI'"
* "Let me invoke the Read tool to read /etc/hosts"
*
* Each layer requires the extracted name to match a caller-declared
* tool. Layer 3 also requires the user prompt to plausibly want a
* tool call (shell-style verbs in the most recent user message).
*
* Conservative by design: false-positive tool_calls drive agent loops
* to execute things the model didn't actually decide on. When in
* doubt, return [].
*/
import { log } from '../config.js';
/**
* @typedef {Object} ExtractedToolCall
* @property {string} name OpenAI tool name (matches caller's tools[])
* @property {string} argumentsJson JSON-stringified args
* @property {'explicit-syntax'|'backtick-quoted'|'narrative'} layer
* @property {number} confidence 0..1
*/
/**
 * Index the caller-declared tools.
 *
 * Produces the Set of declared function-tool names plus a map from
 * tool name to its "primary" parameter name. The primary parameter is
 * what single-argument shorthands are assigned to ("with command
 * 'echo X'" → arguments.command = 'echo X').
 *
 * Selection order for the primary parameter:
 *   1. the first entry of `required` whose schema type is 'string',
 *   2. otherwise the first entry of `required`,
 *   3. otherwise the first declared string-typed property,
 *   4. otherwise the first declared property.
 *
 * @param {Array<object>} tools OpenAI-style tools[]; non-arrays yield empty indexes
 * @returns {{ names: Set<string>, primaryParam: Map<string, string> }}
 */
function indexTools(tools) {
  const names = new Set();
  const primaryParam = new Map(); // tool name → primary parameter name
  if (!Array.isArray(tools)) return { names, primaryParam };

  for (const tool of tools) {
    if (tool?.type !== 'function') continue;
    const fnName = tool.function?.name;
    if (typeof fnName !== 'string' || !fnName) continue;
    names.add(fnName);

    const schema = tool.function?.parameters;
    if (schema?.type !== 'object' || !schema.properties) continue;

    const props = schema.properties;
    const isStringProp = (key) => props[key]?.type === 'string';
    const requiredKeys = Array.isArray(schema.required) ? schema.required : [];

    // Prefer the first required string-typed param (`command`,
    // `file_path`, `query`) — the one models mention in narrative.
    const firstStringIdx = requiredKeys.findIndex(isStringProp);
    let chosen = requiredKeys[firstStringIdx === -1 ? 0 : firstStringIdx];

    // Nothing usable in `required`: fall back to the declared properties.
    if (!chosen) {
      const declared = Object.keys(props);
      chosen = declared.find(isStringProp) || declared[0];
    }
    if (chosen) primaryParam.set(fnName, chosen);
  }
  return { names, primaryParam };
}
/**
 * Escape a string (coerced with String()) so it can be embedded in a
 * RegExp source as a literal. Used for caller-controlled tool names.
 *
 * @param {*} s value to escape
 * @returns {string} the input with every regex metacharacter backslash-escaped
 */
function escapeRe(s) {
  const metaChars = /[.*+?^${}()|[\]\\]/g;
  return String(s).replace(metaChars, (ch) => `\\${ch}`);
}
// v2.0.78 (#120 follow-up + audit H-2): values extracted from narrative
// can easily be a generic noun phrase ("a shell command", "the file",
// "your input") or a literal placeholder keyword ("command",
// "argument"). Both produce garbage tool_calls — the agent loop will
// then try to execute `command` as a literal command, fail, and recurse.
// Reject these uniformly across all three layers.
const PLACEHOLDER_KEYWORDS = new Set([
  'command', 'argument', 'arguments', 'param', 'parameter',
  'parameters', 'input', 'value', 'file_path', 'filepath', 'path',
  'query', 'string', 'text', 'name', 'arg', 'output',
  // v2.0.81 (#125 — GLM-5.1 Chinese narration): models echo Chinese
  // param-name keywords as the value too; without these entries such
  // an echo would become a real tool_call argument.
  '命令', '参数', '文件', '路径', '输入', '值', '字符串', '文本', '名称', '查询', '输出',
]);
// English determiner-led phrase ("a ...", "the ...", "your ...").
const ARTICLE_PREFIX_RE = /^(?:a|an|the|this|that|these|those|your|my|our|some|any|each|every)\s+/i;
// Chinese vague-phrase prefixes ("some ...", "a ...", "this ...") —
// same idea as ARTICLE_PREFIX_RE but for CJK text.
const CN_VAGUE_PREFIX_RE = /^(?:某个?|一个|这个|那个|某种|什么|任何|每个|所有的?)/;

/**
 * Decide whether an extracted argument value is a placeholder rather
 * than a real value.
 *
 * Returns true when the value is not a string, is blank, equals a
 * known placeholder keyword (case-insensitive, ignoring trailing
 * punctuation), or starts with an English determiner / Chinese vague
 * prefix.
 *
 * @param {*} value candidate argument value
 * @returns {boolean} true when the value should be rejected
 */
function looksLikePlaceholderValue(value) {
  if (typeof value !== 'string' || !value.trim()) return true;
  const v = value.trim();
  // Strip trailing sentence punctuation before comparison: the ASCII
  // marks plus the CJK ideographic full stop / comma (U+3002, U+3001)
  // and the full-width comma, semicolon, colon, exclamation and
  // question marks (U+FF0C, U+FF1B, U+FF1A, U+FF01, U+FF1F). Written
  // as escapes so the class survives width normalisation of the file.
  const stripped = v.replace(/[.,;:!?\u3002\u3001\uFF0C\uFF1B\uFF1A\uFF01\uFF1F]+$/, '');
  if (PLACEHOLDER_KEYWORDS.has(stripped.toLowerCase())) return true;
  // Article-led phrase ("a shell command", "the file") — model
  // narrating about the call rather than supplying the call value.
  if (ARTICLE_PREFIX_RE.test(stripped)) return true;
  // Chinese vague prefix — the CJK counterpart of the check above.
  if (CN_VAGUE_PREFIX_RE.test(stripped)) return true;
  return false;
}
/**
 * Layer 1: explicit invocation syntax.
 *
 *   shell_command(command="echo X")
 *   shell_exec("echo X")
 *   function_call: name=shell_exec args={"command":"echo X"}
 *
 * Call-syntax matches carry confidence 0.95 when the parameter is
 * named and 0.85 when it is positional; a positional value is stored
 * under the `_value` key because this layer has no parameter index.
 * `function_call:` matches carry confidence 0.9 and are emitted only
 * when the args blob parses as a JSON object.
 *
 * @param {string} text model narrative
 * @param {Set<string>} names declared tool names
 * @returns {ExtractedToolCall[]} candidates in order of appearance
 */
function extractLayer1(text, names) {
  const out = [];
  // Opening brace + up to 2000 chars + closing brace, matching the
  // bound used by the args capture group below.
  const MAX_ARGS_BLOB = 2002;

  // Parse a candidate blob; yields the object, or null when the blob is
  // missing, is not valid JSON, or is not a plain JSON object.
  const parseArgsObject = (blob) => {
    if (!blob) return null;
    try {
      const parsed = JSON.parse(blob);
      const isObject = parsed !== null && typeof parsed === 'object' && !Array.isArray(parsed);
      return isObject ? parsed : null;
    } catch {
      return null;
    }
  };

  // Return the brace-balanced `{...}` substring starting at `start`,
  // ignoring braces inside double-quoted JSON strings; null when the
  // braces do not balance within MAX_ARGS_BLOB characters.
  const sliceBalancedObject = (start) => {
    const limit = Math.min(text.length, start + MAX_ARGS_BLOB);
    let depth = 0;
    let inString = false;
    let escaped = false;
    for (let i = start; i < limit; i += 1) {
      const ch = text[i];
      if (inString) {
        if (escaped) escaped = false;
        else if (ch === '\\') escaped = true;
        else if (ch === '"') inString = false;
        continue;
      }
      if (ch === '"') {
        inString = true;
      } else if (ch === '{') {
        depth += 1;
      } else if (ch === '}') {
        depth -= 1;
        if (depth === 0) return text.slice(start, i + 1);
      }
    }
    return null;
  };

  // function_name(arg=value) or function_name("value")
  const reExplicit = /\b([A-Za-z_][A-Za-z0-9_]*)\s*\(\s*(?:([A-Za-z_][A-Za-z0-9_]*)\s*=\s*)?["'`]([^"'`)]{1,2000})["'`]\s*\)/g;
  let m;
  while ((m = reExplicit.exec(text)) !== null) {
    const [, fn, paramName, value] = m;
    if (!names.has(fn)) continue;
    if (looksLikePlaceholderValue(value)) continue;
    const args = paramName ? { [paramName]: value } : { _value: value };
    out.push({
      name: fn,
      argumentsJson: JSON.stringify(args),
      layer: 'explicit-syntax',
      confidence: paramName ? 0.95 : 0.85,
    });
  }

  // function_call: name=X args={...}
  const reFc = /function[_\s]?call\s*[:=][^{]*?\bname\s*[:=]\s*["'`]?([A-Za-z_][A-Za-z0-9_]*)["'`]?[^{]*?(\{[\s\S]{1,2000}?\})/g;
  while ((m = reFc.exec(text)) !== null) {
    const [, fn, argsBlob] = m;
    if (!names.has(fn)) continue;
    let args = parseArgsObject(argsBlob);
    if (!args) {
      // The lazy capture stops at the FIRST `}`, which truncates nested
      // objects and values containing `}`. Retry on the balanced span.
      const blobStart = m.index + m[0].length - argsBlob.length;
      const balanced = sliceBalancedObject(blobStart);
      args = parseArgsObject(balanced);
      // Resume scanning after the full object, not inside it.
      if (args) reFc.lastIndex = blobStart + balanced.length;
    }
    // Unparseable args: emit nothing rather than a high-confidence call
    // with empty arguments the model never asked for.
    if (!args) continue;
    out.push({
      name: fn,
      argumentsJson: JSON.stringify(args),
      layer: 'explicit-syntax',
      confidence: 0.9,
    });
  }
  return out;
}
/**
 * Layer 2: backtick-quoted name + later backtick-quoted value.
 *
 *   "I'll call `shell_exec` with command `echo HELLO`"
 *   "use the `Read` function with file_path `/etc/hosts`"
 *
 * For every backtick-quoted occurrence of a declared tool name, the
 * first backtick-quoted token within the following 200 characters is
 * taken as the value of the tool's primary parameter (or `input` when
 * the tool has none indexed). Candidates carry confidence 0.8.
 *
 * @param {string} text model narrative
 * @param {Set<string>} names declared tool names
 * @param {Map<string, string>} primaryParam tool name → primary parameter name
 * @returns {ExtractedToolCall[]}
 */
function extractLayer2(text, names, primaryParam) {
  const out = [];
  // Optional "with [the] PARAM[:=]" lead-in, then the backticked value.
  // Loop-invariant and non-global, so it is built once and reused.
  const argRe = /(?:with\s+)?(?:the\s+)?(?:argument|param|parameter|input|command|file[_-]?path|path|query)?\s*[:=]?\s*`([^`]{1,1000})`/i;
  for (const fn of names) {
    const fnRe = new RegExp(`\\\`${escapeRe(fn)}\\\``, 'g');
    let m;
    while ((m = fnRe.exec(text)) !== null) {
      // Look for the next backtick-quoted token within 200 chars.
      const tail = text.slice(m.index + m[0].length, m.index + m[0].length + 200);
      const a = tail.match(argRe);
      if (!a) continue;
      const value = a[1];
      // "use `Read` or `Write`": the next backticked token is another
      // declared tool, i.e. the model is listing tools, not giving an
      // argument. Treating it as a value fabricates a call.
      if (names.has(value)) continue;
      if (looksLikePlaceholderValue(value)) continue;
      const param = primaryParam.get(fn) || 'input';
      out.push({
        name: fn,
        argumentsJson: JSON.stringify({ [param]: value }),
        layer: 'backtick-quoted',
        confidence: 0.8,
      });
    }
  }
  return out;
}
/**
 * Layer 3: natural narrative.
 *
 *   "I should call the shell_exec function with the command 'echo HI'"
 *   "Let me invoke the Read tool to read /etc/hosts"
 *   "I'll run shell_command with command echo HELLO"
 *
 * For every "<verb> [the] [function|tool] <declared name>" occurrence
 * the following 300 characters are searched with a list of argument
 * patterns, most specific first; the first hit becomes the value of
 * the tool's primary parameter (or `input` when none is indexed).
 * Candidates carry confidence 0.65.
 *
 * @param {string} text model narrative
 * @param {Set<string>} names declared tool names
 * @param {Map<string, string>} primaryParam tool name → primary parameter name
 * @returns {ExtractedToolCall[]}
 */
function extractLayer3(text, names, primaryParam) {
  const out = [];
  // v2.0.81 (#125): models also narrate in Chinese, so Chinese verbs sit
  // alongside the English ones and the pattern matches either language
  // (or a mix). The literal tool name is still required.
  const verbs = '(?:call|invoke|run|use|execute|exec|trigger|fire'
    + '|调用|使用|运行|执行|触发|启动|让我用|让我使用|我会用|我将用|通过|借助|采用)';
  const articles = '(?:the\\s+)?';
  // Suffix matches ONLY tool/function meta-words (not argument labels
  // such as "command") so the labels stay in the tail and feed the
  // argument patterns below.
  const suffix = '(?:\\s+(?:function|tool|method|函数|工具|方法))?';
  // The declared name must not continue into a longer identifier:
  // without this, tool `shell` matches inside "shell_exec" (and `Read`
  // inside "ReadFile") and the call is attributed to the wrong tool.
  // A lookahead is used instead of \b so a CJK character may follow
  // the name directly.
  const nameBoundary = '(?![A-Za-z0-9_-])';
  // Argument patterns, ordered by specificity. None is global, so one
  // shared instance is safe to reuse across every match.
  const argPatterns = [
    // with the command 'echo X' / with command "echo X" / with command `echo X`
    /\bwith\s+(?:the\s+)?(?:command|argument|param(?:eter)?|input|file[_-]?path|path|query)\s+["'`]([^"'`\n]{1,500})["'`]/i,
    // bare keyword + value (no "with"): command 'echo X' / argument "X"
    /(?:^|\s)(?:command|argument|param(?:eter)?|input|file[_-]?path|path|query)\s+["'`]([^"'`\n]{1,500})["'`]/i,
    // Chinese label (use / pass in / parameter / command / path / file /
    // query, optionally followed by "is") + quoted value
    /(?:用|使用|传入|输入|参数(?:为)?|命令(?:为)?|路径(?:为)?|文件(?:为)?|查询(?:为)?)\s*["'`「『]([^"'`\n「」『』]{1,500})["'`」』]/,
    // with 'echo X' (no param keyword)
    /\bwith\s+["'`]([^"'`\n]{1,500})["'`]/i,
    // to read /etc/hosts (positional after action verb)
    /\bto\s+(?:read|run|execute|view|search|find|cat|ls)\s+([\S][^\n]{0,200})/i,
    // : 'echo X' / = 'echo X'
    /[:=]\s*["'`]([^"'`\n]{1,500})["'`]/,
    // last resort: very first quoted string in the tail, allowing
    // leading whitespace, ASCII or full-width comma (U+FF0C), ASCII or
    // ideographic full stop (U+3002)
    /^[\s,\uFF0C\u3002.]*["'`「『]([^"'`\n「」『』]{1,500})["'`」』]/,
  ];
  for (const fn of names) {
    // Pattern: "<verb> [the] [function|tool] <fn> [function|tool]".
    // \b doesn't match between Chinese and Latin, so there is no
    // leading word boundary; the verb list itself anchors the match.
    const namePat = new RegExp(
      `${verbs}\\s*${articles}(?:function|tool|method|函数|工具|方法)?\\s*\\\`?${escapeRe(fn)}${nameBoundary}\\\`?${suffix}`,
      'gi',
    );
    let m;
    while ((m = namePat.exec(text)) !== null) {
      // Hunt for the value within the next 300 chars.
      const tail = text.slice(m.index + m[0].length, m.index + m[0].length + 300);
      let value = null;
      for (const pat of argPatterns) {
        const a = tail.match(pat);
        if (a && a[1]) { value = a[1].trim(); break; }
      }
      if (!value) continue;
      // v2.0.76 + v2.0.78 (audit H-2): reject placeholder keywords
      // (`command` / `argument` / ...) AND article-led prose phrases
      // (`a shell command` / `the file` / `your input`).
      if (looksLikePlaceholderValue(value)) continue;
      const param = primaryParam.get(fn) || 'input';
      out.push({
        name: fn,
        argumentsJson: JSON.stringify({ [param]: value }),
        layer: 'narrative',
        confidence: 0.65,
      });
    }
  }
  return out;
}
/**
 * Heuristic: did the latest user message ask for something a tool
 * could do? Layer 3 (narrative) extraction is gated on this so casual
 * chat does not produce tool_calls.
 *
 * True when the text contains an English action verb, an English
 * tooling noun, a Chinese action verb, or a Chinese tooling noun
 * (v2.0.81 #125 added the Chinese lists).
 *
 * @param {string} lastUserText most recent user message
 * @returns {boolean}
 */
function userPromptLooksActionable(lastUserText) {
  if (!lastUserText) return false;
  const signals = [
    // English action verbs
    /\b(?:run|exec|execute|cat|ls|echo|grep|find|read|search|list|invoke|call|fetch|get|fix|edit|write|patch)\b/i,
    // English tooling nouns
    /\b(?:shell|bash|terminal|command|tool|function|file|path)\b/i,
    // Chinese action verbs
    /(?:运行|执行|读取|查看|列出|查找|搜索|获取|修改|编辑|写入|修复|分析|调用|使用|拉取|下载|找到|看一下|看看|检查)/,
    // Chinese tooling nouns
    /(?:文件|目录|路径|命令|工具|函数|参数|项目|代码|配置)/,
  ];
  return signals.some((re) => re.test(lastUserText));
}
/**
 * Detect whether the model's narrative looks like it INTENDED to call
 * a tool but never produced a usable extraction. Per the original
 * notes this gates the retry-with-correction loop in chat.js, so an
 * extra cascade round-trip is only spent when there is clear tool
 * intent that could not be recovered.
 *
 * Returns one of:
 *   - the matched declared tool name (when the model named it inline)
 *   - the FIRST declared tool name (when the narrative shows an intent
 *     verb plus an action keyword but names no tool, e.g. "Let me list
 *     the files" without saying "Bash")
 *   - null when the text or tools are empty, the user prompt is not
 *     actionable, or there is no usable signal
 *
 * v2.0.82 (#125 — proper translator layer beyond NLU).
 *
 * @param {string} text model narrative
 * @param {Array<object>} tools OpenAI-style tools[]
 * @param {{ lastUserText?: string }} [opts]
 * @returns {string|null}
 */
export function detectToolIntentInNarrative(text, tools, opts = {}) {
  if (typeof text !== 'string' || !text.trim()) return null;
  if (!Array.isArray(tools) || !tools.length) return null;
  const lastUserText = opts.lastUserText || '';
  if (!userPromptLooksActionable(lastUserText)) return null;
  const { names } = indexTools(tools);
  if (!names.size) return null;
  // Verb forms (English + Chinese) that signal "I'm about to call X".
  const verbPattern = /\b(?:call|invoke|run|use|execute|exec|trigger|fire|going to|will|let me|i'?ll|i'?m going|need to|should)\b|(?:调用|使用|运行|执行|触发|启动|让我|我会|我将|准备|打算|想要|需要|应该)/i;
  if (!verbPattern.test(text)) return null;
  // Action keywords (file ops, search, read, etc.) — these stand in
  // for "the model is talking about USING tools generically".
  // The analyse/analyze family is spelled out with its inflections: a
  // bare stem inside \b...\b can never match, because the stem is
  // always followed by another word character.
  const actionVerbPattern = /\b(?:list|show|read|cat|grep|find|search|view|fetch|get|create|write|edit|run|execute|check|inspect|examine|analy[sz](?:e[sd]?|ing)|browse|explore)\b|(?:列出|展示|读取|查看|查找|搜索|获取|拉取|下载|创建|写入|编辑|运行|执行|检查|检视|分析|浏览|探索|看一下|看看)/i;
  // Pass 1: specific tool name in narrative (most precise). Case
  // sensitive, as a whole word or backtick-quoted.
  for (const fn of names) {
    const fnRe = new RegExp(`\\b${escapeRe(fn)}\\b|\\\`${escapeRe(fn)}\\\``);
    if (fnRe.test(text)) return fn;
  }
  // Pass 2: action keyword present (model said "let me list..." but
  // didn't name the tool). Return the first declared tool — per the
  // original notes the caller's correction prompt names it explicitly
  // so the retry knows which tool to emit.
  if (actionVerbPattern.test(text)) return [...names][0];
  return null;
}
/**
 * Top-level extractor: run the layered extraction over a model
 * narrative and return the recovered tool_calls, deduplicated and
 * sorted by descending confidence. Returns [] when nothing is
 * recoverable.
 *
 * Setting the `WINDSURFAPI_NLU_RECOVERY=0` env var disables recovery
 * entirely (default ON).
 *
 * @param {string} text model narrative
 * @param {Array<object>} tools OpenAI-style tools[]
 * @param {{ lastUserText?: string, minConfidence?: number, markers?: string[] }} [opts]
 *   `minConfidence` defaults to 0.65. `markers` lists structural
 *   markers seen in the output.
 * @returns {ExtractedToolCall[]}
 */
export function extractIntentFromNarrative(text, tools, opts = {}) {
  const recoveryDisabled = process.env.WINDSURFAPI_NLU_RECOVERY === '0';
  const hasNarrative = typeof text === 'string' && text.trim() !== '';
  const hasTools = Array.isArray(tools) && tools.length > 0;
  if (recoveryDisabled || !hasNarrative || !hasTools) return [];

  const userText = opts.lastUserText || '';
  const threshold = typeof opts.minConfidence === 'number' ? opts.minConfidence : 0.65;

  // v2.0.78 (audit H-4) / v2.0.79: structural markers MAY indicate a
  // malformed protocol attempt whose surrounding prose is descriptive
  // rather than argument-bearing. Layer 3 is skipped only for
  // `xml_tag` without `natural_lang`; the original notes record that
  // skipping it for other marker shapes regressed recovery for models
  // that also narrate the call.
  const seenMarkers = Array.isArray(opts.markers) ? opts.markers : [];
  const layer3Suppressed = seenMarkers.includes('xml_tag') && !seenMarkers.includes('natural_lang');

  const { names, primaryParam } = indexTools(tools);
  if (names.size === 0) return [];

  const candidates = extractLayer1(text, names).concat(extractLayer2(text, names, primaryParam));
  if (!layer3Suppressed && userPromptLooksActionable(userText)) {
    candidates.push(...extractLayer3(text, names, primaryParam));
  }
  if (candidates.length === 0) return [];

  // Dedupe by (name, argumentsJson), keeping the highest-confidence
  // pick; candidates below the threshold are dropped first.
  const bestByKey = new Map();
  candidates.forEach((candidate) => {
    if (candidate.confidence < threshold) return;
    const dedupeKey = `${candidate.name}::${candidate.argumentsJson}`;
    const incumbent = bestByKey.get(dedupeKey);
    if (incumbent && !(candidate.confidence > incumbent.confidence)) return;
    bestByKey.set(dedupeKey, candidate);
  });

  const recovered = Array.from(bestByKey.values()).sort((x, y) => y.confidence - x.confidence);
  if (recovered.length > 0) {
    const summary = recovered
      .map((t) => `${t.name}@${t.layer}/${t.confidence.toFixed(2)}`)
      .join(', ');
    const note = layer3Suppressed ? ' (layer3-skipped: structural markers seen)' : '';
    log.info(`NLU recovery: extracted ${recovered.length} tool_call(s) from narrative — ${summary}${note}`);
  }
  return recovered;
}
|