W
File size: 18,488 Bytes
2b64d42
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
/**
 * v2.0.72 (#115 #120 root-cause workaround) — NLU intent extractor.
 *
 * Cascade upstream's `SendUserCascadeMessage` proto has no OpenAI
 * `tools[]` field. The proxy injects tool definitions into the system
 * prompt (additional_instructions_section), but GPT / GLM / Kimi
 * weren't trained on prompt-level tool-calling protocols — they see the
 * `<tool_call>{"name":...}</tool_call>` instructions, decide to call
 * the tool, but emit it as natural-language NARRATION instead of the
 * exact markup we asked for. v2.0.71 fabricate detection just flagged
 * these as failures; v2.0.72 actually RECOVERS the call.
 *
 * Real probe captures (from scripts/probes/v2071-glm-kimi-tool-probe):
 *
 *   GLM-4.7  → "I should call the shell_exec function with the command
 *               'echo HELLO_FROM_PROBE'."
 *   GLM-5.1  → "I'll run the shell command as requested."  (no args!)
 *   GPT-5.5  → "PROBE_V0270_1777751588"  (pure fabricated output)
 *
 * The first one carries enough signal to reconstruct the call; the
 * second has the intent but no args; the third is hopeless. Layered
 * extraction:
 *
 *   Layer 1 (highest confidence) — explicit invocation syntax:
 *     "Let me run shell_command(command='echo HELLO')"
 *     "function_call: shell_exec(\"echo HELLO\")"
 *
 *   Layer 2 — backtick-quoted name + value:
 *     "I'll call `shell_exec` with command `echo HELLO`"
 *     "use the `Read` function with file_path `/etc/hosts`"
 *
 *   Layer 3 — natural narrative (model "thinking out loud"):
 *     "I should call the shell_exec function with the command 'echo HI'"
 *     "Let me invoke the Read tool to read /etc/hosts"
 *
 * Each layer requires the extracted name to match a caller-declared
 * tool. Layer 3 also requires the user prompt to plausibly want a
 * tool call (shell-style verbs in the most recent user message).
 *
 * Conservative by design: false-positive tool_calls drive agent loops
 * to execute things the model didn't actually decide on. When in
 * doubt, return [].
 */

import { log } from '../config.js';

/**
 * @typedef {Object} ExtractedToolCall
 * @property {string} name        OpenAI tool name (matches caller's tools[])
 * @property {string} argumentsJson  JSON-stringified args
 * @property {'explicit-syntax'|'backtick-quoted'|'narrative'} layer
 * @property {number} confidence  0..1
 */

/**
 * Index the caller-declared tools for the extraction layers.
 *
 * Only entries with `type === 'function'` and a non-empty string
 * `function.name` are indexed. For every such tool whose `parameters`
 * is an object schema carrying `properties`, one "primary" parameter is
 * picked so single-value shorthands ("with command 'echo X'") can be
 * mapped onto `arguments.<primary>`. Selection order:
 *   1. the first `required` entry whose property is string-typed
 *      (`command`, `file_path`, `query` — what models naturally narrate);
 *   2. otherwise the first `required` entry;
 *   3. otherwise the first string-typed declared property;
 *   4. otherwise the first declared property.
 *
 * @param {Array<object>} tools  OpenAI-style `tools[]`; anything that is
 *   not an array yields two empty indexes.
 * @returns {{ names: Set<string>, primaryParam: Map<string, string> }}
 *   `names` holds every indexed tool name; `primaryParam` maps a tool
 *   name to its primary parameter (absent when none could be chosen).
 */
function indexTools(tools) {
  const names = new Set();
  const primaryParam = new Map();
  const declared = Array.isArray(tools) ? tools : [];

  declared.forEach((tool) => {
    if (tool?.type !== 'function') return;
    const toolName = tool.function?.name;
    if (typeof toolName !== 'string' || !toolName) return;
    names.add(toolName);

    const schema = tool.function?.parameters;
    if (schema?.type !== 'object' || !schema.properties) return;

    const props = schema.properties;
    const isStringTyped = (key) => props[key]?.type === 'string';

    // Required params first: a string-typed one wins, else the first one.
    const requiredKeys = Array.isArray(schema.required) ? schema.required : [];
    const stringIdx = requiredKeys.findIndex(isStringTyped);
    let chosen = requiredKeys[stringIdx === -1 ? 0 : stringIdx];

    // Nothing usable in `required` — fall back to the declared properties.
    if (!chosen) {
      const declaredKeys = Object.keys(props);
      chosen = declaredKeys.find(isStringTyped) || declaredKeys[0];
    }
    if (chosen) primaryParam.set(toolName, chosen);
  });

  return { names, primaryParam };
}

// Regex utility — backslash-escapes every regex metacharacter so a
// caller-supplied tool name can be spliced into a `new RegExp(...)`
// source and still match literally. Non-string input is stringified.
function escapeRe(s) {
  const raw = String(s);
  return raw.replace(/[.*+?^${}()|[\]\\]/g, (meta) => `\\${meta}`);
}

// v2.0.78 (#120 follow-up + audit H-2): values extracted from narrative
// can easily be a generic noun phrase ("a shell command", "the file",
// "your input") or a literal placeholder keyword ("command",
// "argument"). Both produce garbage tool_calls — the agent loop will
// then try to execute `command` as a literal command, fail, and recurse.
// Reject these uniformly across all three layers.
const PLACEHOLDER_KEYWORDS = new Set([
  'command', 'argument', 'arguments', 'param', 'parameter',
  'parameters', 'input', 'value', 'file_path', 'filepath', 'path',
  'query', 'string', 'text', 'name', 'arg', 'output',
  // v2.0.81 (#125 — GLM-5.1 Chinese narrate): models echo Chinese
  // param-name keywords as the value too. "调用 shell_exec 命令 '命令'"
  // would otherwise produce a real tool_call with command='命令'.
  '命令', '参数', '文件', '路径', '输入', '值', '字符串', '文本', '名称', '查询', '输出',
]);
// English determiner / possessive at the start of the value, followed by
// whitespace: "a shell command", "the file", "your input".
const ARTICLE_PREFIX_RE = /^(?:a|an|the|this|that|these|those|your|my|our|some|any|each|every)\s+/i;
// Chinese article-led / vague phrase prefixes — "某个命令" / "一个命令"
// / "某种参数" — same idea as ARTICLE_PREFIX_RE but for CJK.
const CN_VAGUE_PREFIX_RE = /^(?:某个?|一个|这个|那个|某种|什么|任何|每个|所有的?)/;
// Trailing sentence punctuation to drop before the keyword comparison.
// Written with \u escapes so the full-width forms cannot be silently
// folded into their ASCII look-alikes by an editor or normalizer:
//   U+3002 ideographic full stop, U+3001 ideographic comma,
//   U+FF0C / U+FF1B / U+FF1A / U+FF01 / U+FF1F = full-width , ; : ! ?
const TRAILING_PUNCT_RE = /[.,;:!?\u3002\u3001\uFF0C\uFF1B\uFF1A\uFF01\uFF1F]+$/;

/**
 * Decide whether an extracted argument value is a placeholder rather
 * than a real value supplied by the model.
 *
 * A value is treated as a placeholder when it is:
 *   - not a string, or empty / whitespace-only;
 *   - after trimming and dropping trailing punctuation (ASCII and
 *     full-width), a member of PLACEHOLDER_KEYWORDS (case-insensitive);
 *   - led by an English article/determiner ("a shell command");
 *   - led by a vague Chinese prefix ("某个命令", "一个文件").
 *
 * @param {*} value  Candidate argument value.
 * @returns {boolean} `true` when the value must be rejected.
 */
function looksLikePlaceholderValue(value) {
  if (typeof value !== 'string') return true;
  const trimmed = value.trim();
  if (!trimmed) return true;
  const stripped = trimmed.replace(TRAILING_PUNCT_RE, '');
  if (PLACEHOLDER_KEYWORDS.has(stripped.toLowerCase())) return true;
  // Article-led phrase — the model is narrating about the call rather
  // than supplying the call value.
  if (ARTICLE_PREFIX_RE.test(stripped)) return true;
  // Chinese vague prefix — "某个命令", "一个文件", "这个参数".
  return CN_VAGUE_PREFIX_RE.test(stripped);
}

/**
 * Layer 1: explicit invocation syntax (highest confidence).
 *
 *   shell_command(command="echo X")      → { command: "echo X" }  (0.95)
 *   shell_exec("echo X")                 → { _value: "echo X" }   (0.85)
 *   function_call: name=shell_exec args={"command":"echo X"}      (0.90)
 *
 * Only names present in `names` are accepted. Call-syntax values that
 * look like placeholders are rejected via `looksLikePlaceholderValue`.
 *
 * For the `function_call:` form the args blob must parse as a JSON
 * object. The scan starts at the first `{` after the name and tries each
 * following `}` (at most 2000 characters of content) until the slice
 * parses, so nested objects and `}` inside string values are recovered.
 * If nothing parses, the candidate is dropped instead of being emitted
 * with empty arguments — a 0.9-confidence call with `{}` args would send
 * the agent loop off executing a tool with no input.
 *
 * NOTE(review): the positional form stores its value under `_value`,
 * while the other layers use the tool's primary parameter — confirm the
 * consumer maps `_value` onto a real parameter.
 *
 * @param {string} text        Model narrative.
 * @param {Set<string>} names  Declared tool names.
 * @returns {ExtractedToolCall[]} Candidates in order of appearance
 *   (call-syntax matches first, then `function_call:` matches).
 */
function extractLayer1(text, names) {
  const MAX_ARGS_SPAN = 2000;
  const out = [];

  // function_name(arg=value) or function_name("value")
  const reExplicit = /\b([A-Za-z_][A-Za-z0-9_]*)\s*\(\s*(?:([A-Za-z_][A-Za-z0-9_]*)\s*=\s*)?["'`]([^"'`)]{1,2000})["'`]\s*\)/g;
  for (const [, fn, paramName, value] of text.matchAll(reExplicit)) {
    if (!names.has(fn)) continue;
    if (looksLikePlaceholderValue(value)) continue;
    const args = paramName ? { [paramName]: value } : { _value: value };
    out.push({
      name: fn,
      argumentsJson: JSON.stringify(args),
      layer: 'explicit-syntax',
      confidence: paramName ? 0.95 : 0.85,
    });
  }

  // Find the shortest slice starting at `start` (which must point at a
  // `{`) that parses as a JSON object. Returns the parsed object plus the
  // index just past its closing brace, or null when nothing parses.
  const parseArgsObjectAt = (start) => {
    const lastAllowed = start + MAX_ARGS_SPAN + 1;
    let end = text.indexOf('}', start + 1);
    while (end !== -1 && end <= lastAllowed) {
      try {
        const parsed = JSON.parse(text.slice(start, end + 1));
        if (parsed !== null && typeof parsed === 'object' && !Array.isArray(parsed)) {
          return { args: parsed, end: end + 1 };
        }
      } catch {
        // Slice is not a complete JSON object yet — extend to the next `}`.
      }
      end = text.indexOf('}', end + 1);
    }
    return null;
  };

  // function_call: name=X args={...} — the regex stops at the opening
  // brace; the object itself is recovered by parseArgsObjectAt.
  const reFcHead = /function[_\s]?call\s*[:=][^{]*?\bname\s*[:=]\s*["'`]?([A-Za-z_][A-Za-z0-9_]*)["'`]?[^{]*?\{/g;
  let m;
  while ((m = reFcHead.exec(text)) !== null) {
    const fn = m[1];
    if (!names.has(fn)) continue;
    const recovered = parseArgsObjectAt(reFcHead.lastIndex - 1);
    if (!recovered) continue;
    // Resume scanning after the args object so its content is not re-read.
    reFcHead.lastIndex = recovered.end;
    out.push({
      name: fn,
      argumentsJson: JSON.stringify(recovered.args),
      layer: 'explicit-syntax',
      confidence: 0.9,
    });
  }
  return out;
}

/**
 * Layer 2: backtick-quoted tool name followed by a backtick-quoted value.
 *
 *   "I'll call `shell_exec` with command `echo HELLO`"
 *   "use the `Read` function with file_path `/etc/hosts`"
 *
 * For each occurrence of a declared name in backticks, the first
 * backtick-quoted token within the next 200 characters is taken as the
 * value and stored under the tool's primary parameter (`input` when the
 * tool has none). A candidate is rejected when the value
 *   - is itself a declared tool name — "`Read` or `Write`" is a list of
 *     tools, not `Read` being called with the argument "Write"; or
 *   - looks like a placeholder (`looksLikePlaceholderValue`).
 *
 * @param {string} text                       Model narrative.
 * @param {Set<string>} names                 Declared tool names.
 * @param {Map<string, string>} primaryParam  Tool name → primary parameter.
 * @returns {ExtractedToolCall[]} Candidates with confidence 0.8.
 */
function extractLayer2(text, names, primaryParam) {
  const out = [];
  // Optional "with [the] PARAM[:=]" lead-in, then the backtick-quoted
  // value. Non-global, so it carries no state between uses and can be
  // built once instead of once per match.
  const argRe = /(?:with\s+)?(?:the\s+)?(?:argument|param|parameter|input|command|file[_-]?path|path|query)?\s*[:=]?\s*`([^`]{1,1000})`/i;
  for (const fn of names) {
    const fnRe = new RegExp(`\\\`${escapeRe(fn)}\\\``, 'g');
    for (const m of text.matchAll(fnRe)) {
      // Look for the next backtick-quoted token within 200 chars.
      const start = m.index + m[0].length;
      const found = text.slice(start, start + 200).match(argRe);
      if (!found) continue;
      const value = found[1];
      // The "value" is just another tool being mentioned.
      if (names.has(value)) continue;
      if (looksLikePlaceholderValue(value)) continue;
      const param = primaryParam.get(fn) || 'input';
      out.push({
        name: fn,
        argumentsJson: JSON.stringify({ [param]: value }),
        layer: 'backtick-quoted',
        confidence: 0.8,
      });
    }
  }
  return out;
}

/**
 * Layer 3: natural narrative (lowest confidence, 0.65).
 *
 *   "I should call the shell_exec function with the command 'echo HI'"
 *   "Let me invoke the Read tool to read /etc/hosts"
 *   "我会调用 Bash 工具,命令 'ls -la'"
 *
 * For each declared tool the text is scanned for
 *   <verb> [the] [function|tool|method] <name> [function|tool|method]
 * (English or Chinese verbs — v2.0.81, #125: GLM-5.1 narrates in
 * Chinese). The 300 characters after the match are then searched for a
 * value using `argPatterns`, most specific first. The value is stored
 * under the tool's primary parameter (`input` when there is none) unless
 * `looksLikePlaceholderValue` rejects it (v2.0.76 / v2.0.78 audit H-2:
 * "to run a shell command" used to yield "a shell command." as a value).
 *
 * Boundary rules on the name pattern:
 *   - A Latin verb must not be preceded by a Latin letter, otherwise
 *     "because Read ..." matches through the "use" inside "because".
 *     Chinese verbs take no boundary: `\b` does not fire between CJK and
 *     Latin text.
 *   - The tool name must not be followed by an identifier character, so
 *     a declared `shell` no longer matches inside `shell_exec`.
 *
 * @param {string} text                       Model narrative.
 * @param {Set<string>} names                 Declared tool names.
 * @param {Map<string, string>} primaryParam  Tool name → primary parameter.
 * @returns {ExtractedToolCall[]} Narrative candidates.
 */
function extractLayer3(text, names, primaryParam) {
  const out = [];
  const latinVerbs = '(?<![A-Za-z])(?:call|invoke|run|use|execute|exec|trigger|fire)';
  const cjkVerbs = '调用|使用|运行|执行|触发|启动|让我用|让我使用|我会用|我将用|通过|借助|采用';
  const verbs = `(?:${latinVerbs}|${cjkVerbs})`;
  const articles = '(?:the\\s+)?';
  // Only tool/function meta-words (not arg labels like "command" /
  // "命令"), so the latter stay in the tail and feed argPatterns.
  const metaWord = '(?:function|tool|method|函数|工具|方法)';
  const suffix = `(?:\\s+${metaWord})?`;
  // Reject a name that continues as a longer identifier.
  const nameEnd = '(?![A-Za-z0-9_-])';

  // Ordered by specificity. All non-global, hence stateless across uses.
  const argPatterns = [
    // with the command 'echo X' / with command "echo X" / with command `echo X`
    /\bwith\s+(?:the\s+)?(?:command|argument|param(?:eter)?|input|file[_-]?path|path|query)\s+["'`]([^"'`\n]{1,500})["'`]/i,
    // bare keyword + value (no "with"): command 'echo X' / argument "X"
    /(?:^|\s)(?:command|argument|param(?:eter)?|input|file[_-]?path|path|query)\s+["'`]([^"'`\n]{1,500})["'`]/i,
    // Chinese lead-ins: 用命令 'X' / 传入 'X' / 参数 'X' / 命令 'X' / 路径 'X'
    /(?:用|使用|传入|输入|参数(?:为)?|命令(?:为)?|路径(?:为)?|文件(?:为)?|查询(?:为)?)\s*["'`「『]([^"'`\n「」『』]{1,500})["'`」』]/,
    // with 'echo X' (no param keyword)
    /\bwith\s+["'`]([^"'`\n]{1,500})["'`]/i,
    // to read /etc/hosts (positional after action verb)
    /\bto\s+(?:read|run|execute|view|search|find|cat|ls)\s+([\S][^\n]{0,200})/i,
    // : 'echo X' / = 'echo X'
    /[:=]\s*["'`]([^"'`\n]{1,500})["'`]/,
    // Last resort: a quoted string right at the start of the tail, after
    // optional whitespace / comma / full stop (ASCII, U+FF0C, U+3002).
    /^[\s,\uFF0C\u3002.]*["'`「『]([^"'`\n「」『』]{1,500})["'`」』]/,
  ];

  for (const fn of names) {
    const namePat = new RegExp(
      `${verbs}\\s*${articles}${metaWord}?\\s*\\\`?${escapeRe(fn)}${nameEnd}\\\`?${suffix}`,
      'gi',
    );
    for (const m of text.matchAll(namePat)) {
      // Hunt for the value within the next 300 chars.
      const start = m.index + m[0].length;
      const tail = text.slice(start, start + 300);
      let value = null;
      for (const pat of argPatterns) {
        const hit = tail.match(pat);
        if (hit && hit[1]) {
          value = hit[1].trim();
          break;
        }
      }
      if (!value) continue;
      if (looksLikePlaceholderValue(value)) continue;
      const param = primaryParam.get(fn) || 'input';
      out.push({
        name: fn,
        argumentsJson: JSON.stringify({ [param]: value }),
        layer: 'narrative',
        confidence: 0.65,
      });
    }
  }
  return out;
}

/**
 * Heuristic: does the most recent user message ask for something a
 * tool could do? Layer 3 (narrative) extraction only runs when this is
 * true, which keeps casual chat from producing tool_calls.
 *
 * The message qualifies when it contains any of:
 *   - an English action verb (run, read, search, fix, ...);
 *   - an English tooling noun (shell, command, file, path, ...);
 *   - a Chinese action verb (运行, 查看, 修改, ...);
 *   - a Chinese tooling noun (文件, 命令, 配置, ...).
 * The Chinese sets were added in v2.0.81 (#125) so GLM-5.1 / Kimi
 * sessions with Chinese prompts still route through Layer 3.
 *
 * @param {string} lastUserText  Text of the latest user turn.
 * @returns {boolean} `false` for empty input or when no signal is found.
 */
function userPromptLooksActionable(lastUserText) {
  if (!lastUserText) return false;
  // Checked in order; `.some` stops at the first hit.
  const actionableSignals = [
    /\b(?:run|exec|execute|cat|ls|echo|grep|find|read|search|list|invoke|call|fetch|get|fix|edit|write|patch)\b/i,
    /\b(?:shell|bash|terminal|command|tool|function|file|path)\b/i,
    /(?:运行|执行|读取|查看|列出|查找|搜索|获取|修改|编辑|写入|修复|分析|调用|使用|拉取|下载|找到|看一下|看看|检查)/,
    /(?:文件|目录|路径|命令|工具|函数|参数|项目|代码|配置)/,
  ];
  return actionableSignals.some((signal) => signal.test(lastUserText));
}

/**
 * Detect whether the model's narrative looks like it INTENDED to call
 * a tool even though no usable extraction was produced. Used to gate
 * the retry-with-correction loop in chat.js, so an extra cascade
 * round-trip is only spent when there is clear, unrecovered tool intent.
 *
 * Gates, in order — any failure returns null:
 *   1. `text` is a non-blank string and `tools` is a non-empty array;
 *   2. the last user message looks actionable;
 *   3. at least one declared tool could be indexed;
 *   4. the narrative contains an intent verb ("let me", "I'll", "调用").
 *
 * Then:
 *   - Pass 1: a declared tool name appears in the narrative (as a whole
 *     word or in backticks) → that name is returned;
 *   - Pass 2: a generic action keyword appears ("list", "read",
 *     "analyze", "查看") → the FIRST declared tool name is returned.
 *     GLM-5.1 says "Let me list the files" without saying "Bash"; the
 *     caller's correction prompt names the tool explicitly.
 *
 * v2.0.82 (#125 — proper translator layer beyond NLU).
 *
 * @param {string} text           Model narrative.
 * @param {Array<object>} tools   OpenAI-style `tools[]`.
 * @param {{ lastUserText?: string }} [opts]
 * @returns {string|null} A declared tool name, or null when there is no
 *   usable signal.
 */
export function detectToolIntentInNarrative(text, tools, opts = {}) {
  if (typeof text !== 'string' || !text.trim()) return null;
  if (!Array.isArray(tools) || !tools.length) return null;
  const lastUserText = opts.lastUserText || '';
  if (!userPromptLooksActionable(lastUserText)) return null;
  const { names } = indexTools(tools);
  if (!names.size) return null;

  // Verb forms (English + Chinese) that signal "I'm about to call X".
  const verbPattern = /\b(?:call|invoke|run|use|execute|exec|trigger|fire|going to|will|let me|i'?ll|i'?m going|need to|should)\b|(?:调用|使用|运行|执行|触发|启动|让我|我会|我将|准备|打算|想要|需要|应该)/i;
  if (!verbPattern.test(text)) return null;

  // Pass 1: specific tool name in narrative (most precise).
  for (const fn of names) {
    const fnRe = new RegExp(`\\b${escapeRe(fn)}\\b|\\\`${escapeRe(fn)}\\\``);
    if (fnRe.test(text)) return fn;
  }

  // Pass 2: generic action keywords standing in for "the model is
  // talking about USING tools". Every English alternative is a whole
  // word because of the surrounding `\b`, so the analysis verb is
  // spelled out as `analy[sz]e` — a bare stem could never match there.
  const actionVerbPattern = /\b(?:list|show|read|cat|grep|find|search|view|fetch|get|create|write|edit|run|execute|check|inspect|examine|analy[sz]e|browse|explore)\b|(?:列出|展示|读取|查看|查找|搜索|获取|拉取|下载|创建|写入|编辑|运行|执行|检查|检视|分析|浏览|探索|看一下|看看)/i;
  if (actionVerbPattern.test(text)) return [...names][0];
  return null;
}

/**
 * Top-level extractor: run the layered extraction over a model
 * narrative and return the recovered tool_calls, deduplicated and
 * sorted by descending confidence. Returns an empty array when nothing
 * is recoverable.
 *
 * Set the `WINDSURFAPI_NLU_RECOVERY=0` env to turn off entirely
 * (default ON).
 *
 * @param {string} text          Model narrative.
 * @param {Array<object>} tools  OpenAI-style `tools[]` declared by the caller.
 * @param {object} [opts]
 * @param {string} [opts.lastUserText]   Latest user message; gates Layer 3.
 * @param {number} [opts.minConfidence]  Candidates below this are dropped
 *   (default 0.65).
 * @param {string[]} [opts.markers]      Structural markers seen in the
 *   response; see the Layer 3 gate below.
 * @returns {ExtractedToolCall[]}
 */
export function extractIntentFromNarrative(text, tools, opts = {}) {
  if (process.env.WINDSURFAPI_NLU_RECOVERY === '0') return [];
  const hasNarrative = typeof text === 'string' && Boolean(text.trim());
  if (!hasNarrative) return [];
  const hasTools = Array.isArray(tools) && tools.length;
  if (!hasTools) return [];

  const lastUserText = opts.lastUserText || '';
  const minConfidence = typeof opts.minConfidence === 'number' ? opts.minConfidence : 0.65;

  // Layer 3 gate. v2.0.78 (audit H-4) skipped Layer 3 whenever structural
  // markers were present, since prose around a malformed protocol attempt
  // tends to be descriptive rather than args. v2.0.79 narrowed that after
  // a GLM-4.7 e2e regression: GLM reports `bare_json` (its thinking text
  // contains JSON-shaped fragments) AND a legitimate narrate that only
  // Layer 3 catches. Now only `xml_tag` (Claude's tool_use shape) without
  // `natural_lang` skips Layer 3; fenced_json / bare_json / openai_native
  // still allow it.
  const markers = Array.isArray(opts.markers) ? opts.markers : [];
  const skipLayer3 = markers.includes('xml_tag') && !markers.includes('natural_lang');

  const { names, primaryParam } = indexTools(tools);
  if (!names.size) return [];

  const candidates = extractLayer1(text, names).concat(extractLayer2(text, names, primaryParam));
  if (!skipLayer3 && userPromptLooksActionable(lastUserText)) {
    candidates.push(...extractLayer3(text, names, primaryParam));
  }
  if (candidates.length === 0) return [];

  // Dedupe by (name, argumentsJson), keeping the highest-confidence pick.
  // A key keeps the Map position of its first occurrence.
  const bestByKey = new Map();
  candidates.forEach((candidate) => {
    if (candidate.confidence < minConfidence) return;
    const dedupeKey = `${candidate.name}::${candidate.argumentsJson}`;
    const current = bestByKey.get(dedupeKey);
    if (current && !(candidate.confidence > current.confidence)) return;
    bestByKey.set(dedupeKey, candidate);
  });

  const recovered = Array.from(bestByKey.values());
  recovered.sort((left, right) => right.confidence - left.confidence);

  if (recovered.length) {
    const summary = recovered
      .map((call) => `${call.name}@${call.layer}/${call.confidence.toFixed(2)}`)
      .join(', ');
    const skipNote = skipLayer3 ? ' (layer3-skipped: structural markers seen)' : '';
    log.info(`NLU recovery: extracted ${recovered.length} tool_call(s) from narrative — ${summary}${skipNote}`);
  }
  return recovered;
}