Spaces:

Ac66
/

W

Sleeping

App Files Files Community

W / src /handlers /intent-extractor.js

Ac66's picture

Upload folder using huggingface_hub

2b64d42 verified 10 days ago

history blame contribute delete

18.5 kB

	/**
	* v2.0.72 (#115 #120 root-cause workaround) — NLU intent extractor.
	*
	* Cascade upstream's `SendUserCascadeMessage` proto has no OpenAI
	* `tools[]` field. The proxy injects tool definitions into the system
	* prompt (additional_instructions_section), but GPT / GLM / Kimi
	* weren't trained on prompt-level tool-calling protocols — they see the
	* `<tool_call>{"name":...}</tool_call>` instructions, decide to call
	* the tool, but emit it as natural-language NARRATION instead of the
	* exact markup we asked for. v2.0.71 fabricate detection just flagged
	* these as failures; v2.0.72 actually RECOVERS the call.
	*
	* Real probe captures (from scripts/probes/v2071-glm-kimi-tool-probe):
	*
	* GLM-4.7 → "I should call the shell_exec function with the command
	* 'echo HELLO_FROM_PROBE'."
	* GLM-5.1 → "I'll run the shell command as requested." (no args!)
	* GPT-5.5 → "PROBE_V0270_1777751588" (pure fabricated output)
	*
	* The first one carries enough signal to reconstruct the call; the
	* second has the intent but no args; the third is hopeless. Layered
	* extraction:
	*
	* Layer 1 (highest confidence) — explicit invocation syntax:
	* "Let me run shell_command(command='echo HELLO')"
	* "function_call: shell_exec(\"echo HELLO\")"
	*
	* Layer 2 — backtick-quoted name + value:
	* "I'll call `shell_exec` with command `echo HELLO`"
	* "use the `Read` function with file_path `/etc/hosts`"
	*
	* Layer 3 — natural narrative (model "thinking out loud"):
	* "I should call the shell_exec function with the command 'echo HI'"
	* "Let me invoke the Read tool to read /etc/hosts"
	*
	* Each layer requires the extracted name to match a caller-declared
	* tool. Layer 3 also requires the user prompt to plausibly want a
	* tool call (shell-style verbs in the most recent user message).
	*
	* Conservative by design: false-positive tool_calls drive agent loops
	* to execute things the model didn't actually decide on. When in
	* doubt, return [].
	*/

	import { log } from '../config.js';

	/**
	* @typedef {Object} ExtractedToolCall
	* @property {string} name OpenAI tool name (matches caller's tools[])
	* @property {string} argumentsJson JSON-stringified args
	* @property {'explicit-syntax'|'backtick-quoted'|'narrative'} layer
	* @property {number} confidence 0..1
	*/

	/**
	 * Index the caller-declared tools.
	 *
	 * Builds a Set of declared tool names plus a name → primary-parameter map.
	 * The primary parameter is used to expand single-argument shorthands
	 * ("with command 'echo X'" → arguments.command = 'echo X').
	 *
	 * Only entries with `type === 'function'` and a non-empty string
	 * `function.name` are indexed. A primary parameter is recorded only when
	 * `function.parameters` is an object schema with `properties`.
	 *
	 * @param {Array<object>|undefined} tools OpenAI-style `tools[]` array.
	 * @returns {{ names: Set<string>, primaryParam: Map<string, string> }}
	 *   Both collections are empty when `tools` is not an array.
	 */
	function indexTools(tools) {
	  const names = new Set();
	  const primaryParam = new Map(); // tool name → primary param name
	  if (!Array.isArray(tools)) return { names, primaryParam };
	  for (const t of tools) {
	    if (t?.type !== 'function') continue;
	    const name = t.function?.name;
	    if (!name || typeof name !== 'string') continue;
	    names.add(name);
	    const params = t.function?.parameters;
	    if (params?.type !== 'object' || !params.properties) continue;
	    const { properties } = params;
	    const required = Array.isArray(params.required) ? params.required : [];
	    // Prefer the first required string-typed param (`command`,
	    // `file_path`, `query`) — that's the one models naturally mention
	    // with "with command X" / "with file Y" narrative. Otherwise fall
	    // back to the first required param of any type.
	    let primary = required.find((r) => properties[r]?.type === 'string') ?? required[0];
	    // No usable required param: fall through to the declared properties,
	    // again preferring a string-typed one.
	    if (!primary) {
	      const keys = Object.keys(properties);
	      primary = keys.find((k) => properties[k]?.type === 'string') || keys[0];
	    }
	    if (primary) primaryParam.set(name, primary);
	  }
	  return { names, primaryParam };
	}

	// Regex helper: backslash-escape every regex metacharacter in a
	// (caller-controlled) tool name so it can be spliced into a RegExp
	// source string and match literally. Non-string input is stringified.
	function escapeRe(s) {
	  const metaChars = /[.*+?^${}()\|[\]\\]/g;
	  const raw = String(s);
	  return raw.replace(metaChars, (ch) => `\\${ch}`);
	}

	// v2.0.78 (#120 follow-up + audit H-2): values extracted from narrative
	// can easily be a generic noun phrase ("a shell command", "the file",
	// "your input") or a literal placeholder keyword ("command",
	// "argument"). Both produce garbage tool_calls — the agent loop will
	// then try to execute `command` as a literal command, fail, and recurse.
	// Reject these uniformly across all three layers.
	const PLACEHOLDER_KEYWORDS = new Set([
	  'command', 'argument', 'arguments', 'param', 'parameter',
	  'parameters', 'input', 'value', 'file_path', 'filepath', 'path',
	  'query', 'string', 'text', 'name', 'arg', 'output',
	  // v2.0.81 (#125 — GLM-5.1 Chinese narrate): models echo Chinese
	  // param-name keywords as the value too. "调用 shell_exec 命令 '命令'"
	  // would otherwise produce a real tool_call with command='命令'.
	  '命令', '参数', '文件', '路径', '输入', '值', '字符串', '文本', '名称', '查询', '输出',
	]);
	// English determiner followed by whitespace at the start of the value.
	const ARTICLE_PREFIX_RE = /^(?:a|an|the|this|that|these|those|your|my|our|some|any|each|every)\s+/i;
	// Chinese article-led / vague phrase prefixes — "某个命令" / "一个命令"
	// / "某种参数" — same idea as ARTICLE_PREFIX_RE but for CJK.
	const CN_VAGUE_PREFIX_RE = /^(?:某个?|一个|这个|那个|某种|什么|任何|每个|所有的?)/;

	/**
	 * Decide whether an extracted argument value is a placeholder rather
	 * than a real value.
	 *
	 * @param {*} value Candidate argument value.
	 * @returns {boolean} true when `value` is not a string, is blank, equals
	 *   a placeholder keyword (case-insensitive, ignoring trailing
	 *   punctuation), or starts with an English determiner / Chinese vague
	 *   prefix.
	 */
	function looksLikePlaceholderValue(value) {
	  if (typeof value !== 'string' || !value.trim()) return true;
	  // Strip trailing punctuation (`.`, `,`, `;`, `:`, `。`, `，` …) before
	  // comparison so "command." is treated like "command".
	  const stripped = value.trim().replace(/[.,;:!?。，；：！？]+$/, '');
	  if (PLACEHOLDER_KEYWORDS.has(stripped.toLowerCase())) return true;
	  // Article-led phrase ("a shell command", "the file") — model
	  // narrating about the call rather than supplying the call value.
	  if (ARTICLE_PREFIX_RE.test(stripped)) return true;
	  // Chinese vague prefix — "某个命令", "一个文件", "这个参数".
	  return CN_VAGUE_PREFIX_RE.test(stripped);
	}

	/**
	 * Layer 1: explicit invocation syntax.
	 *
	 *   shell_command(command="echo X")
	 *   shell_exec("echo X")
	 *   function_call: name=shell_exec args={"command":"echo X"}
	 *
	 * @param {string} text Model narrative to scan.
	 * @param {Set<string>} names Declared tool names; other names are ignored.
	 * @returns {Array<{name: string, argumentsJson: string, layer: 'explicit-syntax', confidence: number}>}
	 *   Call-syntax matches first (0.95 with a named param, 0.85 with a bare
	 *   value stored under `_value`), then `function_call:` matches (0.9).
	 */
	function extractLayer1(text, names) {
	  const out = [];
	  let m;

	  // function_name(arg=value) or function_name("value"); whitespace is
	  // tolerated around the parentheses and the `=`.
	  const reExplicit = /\b([A-Za-z_][A-Za-z0-9_]*)\s*\(\s*(?:([A-Za-z_][A-Za-z0-9_]*)\s*=\s*)?["'`]([^"'`)]{1,2000})["'`]\s*\)/g;
	  while ((m = reExplicit.exec(text)) !== null) {
	    const [, fn, paramName, value] = m;
	    if (!names.has(fn)) continue;
	    if (looksLikePlaceholderValue(value)) continue;
	    // Without a param name the value is carried under `_value`.
	    const args = paramName ? { [paramName]: value } : { _value: value };
	    out.push({
	      name: fn,
	      argumentsJson: JSON.stringify(args),
	      layer: 'explicit-syntax',
	      confidence: paramName ? 0.95 : 0.85,
	    });
	  }

	  // function_call: name=X args={...}
	  // The args blob is matched lazily, so it ends at the first `}`.
	  const reFc = /function[_\s]?call\s*[:=][^{]*?\bname\s*[:=]\s*["'`]?([A-Za-z_][A-Za-z0-9_]*)["'`]?[^{]*?(\{[\s\S]{1,2000}?\})/g;
	  while ((m = reFc.exec(text)) !== null) {
	    const [, fn, argsBlob] = m;
	    if (!names.has(fn)) continue;
	    let args;
	    try {
	      args = JSON.parse(argsBlob);
	    } catch {
	      // NOTE(review): an unparseable blob still yields a call with empty
	      // arguments at 0.9 confidence — confirm this best-effort fallback
	      // is intended rather than skipping the match.
	      args = {};
	    }
	    out.push({
	      name: fn,
	      argumentsJson: JSON.stringify(args),
	      layer: 'explicit-syntax',
	      confidence: 0.9,
	    });
	  }
	  return out;
	}

	/**
	 * Layer 2: backtick-quoted name + later backtick-quoted value.
	 *
	 *   "I'll call `shell_exec` with command `echo HELLO`"
	 *   "use the `Read` function with file_path `/etc/hosts`"
	 *
	 * @param {string} text Model narrative to scan.
	 * @param {Set<string>} names Declared tool names to look for.
	 * @param {Map<string, string>} primaryParam Tool name → argument key for
	 *   the extracted value; falls back to `input` when a tool has no entry.
	 * @returns {Array<{name: string, argumentsJson: string, layer: 'backtick-quoted', confidence: number}>}
	 *   One entry (confidence 0.8) per backticked tool-name occurrence that
	 *   is followed by a usable backticked value.
	 */
	function extractLayer2(text, names, primaryParam) {
	  const out = [];
	  // Optional "with [the] PARAM[:=]" lead-in, then the backticked value.
	  // Not global, so it is stateless and safe to share across iterations.
	  const argRe = /(?:with\s+)?(?:the\s+)?(?:argument|param|parameter|input|command|file[_-]?path|path|query)?\s*[:=]?\s*`([^`]{1,1000})`/i;
	  for (const fn of names) {
	    const fnRe = new RegExp(`\\\`${escapeRe(fn)}\\\``, 'g');
	    let m;
	    while ((m = fnRe.exec(text)) !== null) {
	      // Only look at the 200 chars following the backticked name.
	      const start = m.index + m[0].length;
	      const tail = text.slice(start, start + 200);
	      const a = tail.match(argRe);
	      if (!a) continue;
	      const value = a[1];
	      if (looksLikePlaceholderValue(value)) continue;
	      const param = primaryParam.get(fn) || 'input';
	      out.push({
	        name: fn,
	        argumentsJson: JSON.stringify({ [param]: value }),
	        layer: 'backtick-quoted',
	        confidence: 0.8,
	      });
	    }
	  }
	  return out;
	}

	/**
	 * Layer 3: natural narrative.
	 *
	 *   "I should call the shell_exec function with the command 'echo HI'"
	 *   "Let me invoke the Read tool to read /etc/hosts"
	 *   "I'll run shell_command with command echo HELLO"
	 *
	 * @param {string} text Model narrative to scan.
	 * @param {Set<string>} names Declared tool names to look for.
	 * @param {Map<string, string>} primaryParam Tool name → argument key for
	 *   the extracted value; falls back to `input` when a tool has no entry.
	 * @returns {Array<{name: string, argumentsJson: string, layer: 'narrative', confidence: number}>}
	 *   One entry (confidence 0.65) per "verb + tool name" occurrence whose
	 *   following 300 characters yield a non-placeholder value.
	 */
	function extractLayer3(text, names, primaryParam) {
	  const out = [];
	  // v2.0.81 (#125 DuZunTianXia): GLM-5.1 narrate in Chinese — log
	  // showed "让我用 Bash 来列出..." / "用户想查看..." / "我会调用 X
	  // 工具" — none of which the English-only verb regex picked up.
	  // Add Chinese verbs alongside English so the name pattern matches
	  // either language (or mixed). The primary tool-name match still
	  // requires the literal tool name (e.g. `Bash`, `shell_exec`) since
	  // those are emitted in the original alphabet by every model.
	  const verbs = '(?:call|invoke|run|use|execute|exec|trigger|fire'
	    + '|调用|使用|运行|执行|触发|启动|让我用|让我使用|我会用|我将用|通过|借助|采用)';
	  const articles = '(?:the\\s+)?';
	  // Suffix matches ONLY tool/function meta-words (not arg labels like
	  // "command" / "命令") so the latter stay in the tail and feed the
	  // argPatterns. Pre-v2.0.81 it included "command" / "命令" which
	  // greedily consumed the very keyword that argPattern 2/4 needs.
	  const suffix = '(?:\\s+(?:function|tool|method|函数|工具|方法))?';
	  // Value patterns, ordered by specificity; the first hit wins. None is
	  // global, so the list is stateless and built once per call.
	  const argPatterns = [
	    // with the command 'echo X' / with command "echo X" / with command `echo X`
	    /\bwith\s+(?:the\s+)?(?:command|argument|param(?:eter)?|input|file[_-]?path|path|query)\s+["'`]([^"'`\n]{1,500})["'`]/i,
	    // bare keyword + value (no "with"): command 'echo X' / argument "X"
	    /(?:^|\s)(?:command|argument|param(?:eter)?|input|file[_-]?path|path|query)\s+["'`]([^"'`\n]{1,500})["'`]/i,
	    // Chinese: 用命令 'X' / 传入 'X' / 参数 'X' / 命令 'X' / 路径 'X'
	    /(?:用|使用|传入|输入|参数(?:为)?|命令(?:为)?|路径(?:为)?|文件(?:为)?|查询(?:为)?)\s*["'`「『]([^"'`\n「」『』]{1,500})["'`」』]/,
	    // with 'echo X' (no param keyword)
	    /\bwith\s+["'`]([^"'`\n]{1,500})["'`]/i,
	    // to read /etc/hosts (positional after action verb)
	    /\bto\s+(?:read|run|execute|view|search|find|cat|ls)\s+([\S][^\n]{0,200})/i,
	    // : 'echo X' / = 'echo X'
	    /[:=]\s*["'`]([^"'`\n]{1,500})["'`]/,
	    // last resort: very first quoted string in the tail
	    /^[\s,，。.]*["'`「『]([^"'`\n「」『』]{1,500})["'`」』]/,
	  ];
	  for (const fn of names) {
	    // Pattern: "VERB [the] [function|tool] NAME [function|tool]".
	    // \b doesn't match between Chinese and Latin, so we drop the
	    // leading word boundary and rely on the verb list itself.
	    // Whitespace between the parts is optional (\s*) so that
	    // "调用shell_exec" matches as well as "call the shell_exec".
	    const namePat = new RegExp(
	      `${verbs}\\s*${articles}(?:function|tool|method|函数|工具|方法)?\\s*\\\`?${escapeRe(fn)}\\\`?${suffix}`,
	      'gi',
	    );
	    let m;
	    while ((m = namePat.exec(text)) !== null) {
	      // Hunt for the value within the next 300 chars.
	      const start = m.index + m[0].length;
	      const tail = text.slice(start, start + 300);
	      let value = null;
	      for (const pat of argPatterns) {
	        const a = tail.match(pat);
	        if (a && a[1]) { value = a[1].trim(); break; }
	      }
	      if (!value) continue;
	      // v2.0.76 + v2.0.78 (audit H-2): reject placeholder keywords
	      // (`command` / `argument` / ...) AND article-led prose phrases
	      // (`a shell command` / `the file` / `your input`). GLM-4.7
	      // narrative reproducer "to run a shell command" was capturing
	      // "a shell command." as the value pre-v2.0.78 even with the
	      // single-word filter in place.
	      if (looksLikePlaceholderValue(value)) continue;
	      const param = primaryParam.get(fn) || 'input';
	      out.push({
	        name: fn,
	        argumentsJson: JSON.stringify({ [param]: value }),
	        layer: 'narrative',
	        confidence: 0.65,
	      });
	    }
	  }
	  return out;
	}

	/**
	 * Detect whether the user prompt asked for an action a function could
	 * perform. Layer 3 (narrative) only fires when this is true to avoid
	 * false-positive tool_call extraction from casual chat.
	 *
	 * @param {string} lastUserText Text of the most recent user message.
	 * @returns {boolean} true when the text contains an English action verb
	 *   or tooling noun as a whole word, or one of the listed Chinese
	 *   verbs/nouns anywhere; false for empty/missing text.
	 */
	function userPromptLooksActionable(lastUserText) {
	  if (!lastUserText) return false;
	  // v2.0.81 (#125): widen to Chinese verbs/nouns so GLM-5.1 / Kimi
	  // running with a Chinese system prompt + Chinese user turn still
	  // routes through Layer 3.
	  const signals = [
	    // English action verbs (whole words).
	    /\b(?:run|exec|execute|cat|ls|echo|grep|find|read|search|list|invoke|call|fetch|get|fix|edit|write|patch)\b/i,
	    // English tooling nouns (whole words).
	    /\b(?:shell|bash|terminal|command|tool|function|file|path)\b/i,
	    // Chinese action verbs — no \b, which does not apply to CJK text.
	    /(?:运行|执行|读取|查看|列出|查找|搜索|获取|修改|编辑|写入|修复|分析|调用|使用|拉取|下载|找到|看一下|看看|检查)/,
	    // Chinese tooling nouns.
	    /(?:文件|目录|路径|命令|工具|函数|参数|项目|代码|配置)/,
	  ];
	  return signals.some((re) => re.test(lastUserText));
	}

	/**
	 * Detect whether the model's narrative looks like it INTENDED to call
	 * a tool but never produced a usable extraction. Used to gate the
	 * retry-with-correction loop in chat.js — we only burn an extra
	 * cascade round-trip when there's clear tool intent we couldn't
	 * recover.
	 *
	 * Returns one of:
	 *  - the matched declared tool name (when the model named it inline)
	 *  - the FIRST declared tool name (when the narrative shows clear
	 *    action intent + user actionable prompt + an action verb,
	 *    even if the model didn't name a specific tool — GLM-5.1 will
	 *    say "Let me list the files" without saying "Bash")
	 *  - null when there's no usable signal
	 *
	 * v2.0.82 (#125 — proper translator layer beyond NLU).
	 *
	 * @param {string} text Model narrative.
	 * @param {Array<object>} tools OpenAI-style `tools[]` array.
	 * @param {{lastUserText?: string}} [opts] `lastUserText` is the most
	 *   recent user message; when it is missing or not actionable the
	 *   result is null.
	 * @returns {string|null} A declared tool name, or null.
	 */
	export function detectToolIntentInNarrative(text, tools, opts = {}) {
	  if (typeof text !== 'string' || !text.trim()) return null;
	  if (!Array.isArray(tools) || !tools.length) return null;
	  const lastUserText = opts?.lastUserText || '';
	  if (!userPromptLooksActionable(lastUserText)) return null;
	  const { names } = indexTools(tools);
	  if (!names.size) return null;
	  // Verb forms (English + Chinese) that signal "I'm about to call X".
	  const verbPattern = /\b(?:call|invoke|run|use|execute|exec|trigger|fire|going to|will|let me|i'?ll|i'?m going|need to|should)\b|(?:调用|使用|运行|执行|触发|启动|让我|我会|我将|准备|打算|想要|需要|应该)/i;
	  if (!verbPattern.test(text)) return null;
	  // Pass 1: specific tool name in narrative (most precise). The name is
	  // matched case-sensitively, either as a whole word or in backticks.
	  for (const fn of names) {
	    const fnRe = new RegExp(`\\b${escapeRe(fn)}\\b|\\\`${escapeRe(fn)}\\\``);
	    if (fnRe.test(text)) return fn;
	  }
	  // Action keywords (file ops, search, read, etc.) — these stand in
	  // for "the model is talking about USING tools generically".
	  // `analy[sz]e` replaces the bare stem `analyz`, which could never
	  // match "analyze" because of the trailing \b.
	  const actionVerbPattern = /\b(?:list|show|read|cat|grep|find|search|view|fetch|get|create|write|edit|run|execute|check|inspect|examine|analy[sz]e|browse|explore)\b|(?:列出|展示|读取|查看|查找|搜索|获取|拉取|下载|创建|写入|编辑|运行|执行|检查|检视|分析|浏览|探索|看一下|看看)/i;
	  // Pass 2: action keyword present (model said "let me list..." but
	  // didn't name the tool). Return the first declared tool — caller's
	  // correction prompt will name it explicitly so the retry knows
	  // which tool to emit.
	  if (actionVerbPattern.test(text)) return [...names][0];
	  return null;
	}

	/**
	 * Top-level extractor. Returns a deduped, confidence-sorted list of
	 * extracted tool_calls. Empty array when nothing is recoverable.
	 *
	 * Set the `WINDSURFAPI_NLU_RECOVERY=0` env to turn off entirely
	 * (default ON).
	 *
	 * @param {string} text Model narrative to scan.
	 * @param {Array<object>} tools OpenAI-style `tools[]` array.
	 * @param {{lastUserText?: string, minConfidence?: number, markers?: string[]}} [opts]
	 *   `lastUserText` gates Layer 3; `minConfidence` (default 0.65) drops
	 *   weaker candidates; `markers` lists structural markers seen in the
	 *   response.
	 * @returns {Array<{name: string, argumentsJson: string, layer: string, confidence: number}>}
	 *   Unique (name, argumentsJson) candidates, highest confidence first.
	 */
	export function extractIntentFromNarrative(text, tools, opts = {}) {
	  if (process.env.WINDSURFAPI_NLU_RECOVERY === '0') return [];
	  if (typeof text !== 'string' || !text.trim()) return [];
	  if (!Array.isArray(tools) || !tools.length) return [];
	  const lastUserText = opts.lastUserText || '';
	  const minConfidence = typeof opts.minConfidence === 'number' ? opts.minConfidence : 0.65;
	  // v2.0.78 (audit H-4): structural markers MAY indicate a malformed
	  // protocol attempt — Layer 3 narrative around it tends to be
	  // descriptive prose, not args. v2.0.79 narrowed the gate after
	  // GLM-4.7 e2e probe regressed: GLM emits `markers=bare_json`
	  // (because thinking text contains JSON-shaped fragments) AND a
	  // legitimate narrate; Layer 3 is exactly what catches the narrate.
	  // Now we only skip Layer 3 for `xml_tag` (Claude's tool_use shape)
	  // — that's where parser-failure → Layer 3 most often produces
	  // false positives. fenced_json / bare_json / openai_native still
	  // allow Layer 3 because models emitting those shapes (GLM, Kimi,
	  // some GPT) also reliably narrate the call in surrounding prose.
	  const markers = Array.isArray(opts.markers) ? opts.markers : [];
	  const skipLayer3 = markers.includes('xml_tag') && !markers.includes('natural_lang');

	  const { names, primaryParam } = indexTools(tools);
	  if (!names.size) return [];

	  // Layers 1 and 2 always run; Layer 3 additionally requires an
	  // actionable user prompt and no xml_tag-only marker set.
	  const runLayer3 = !skipLayer3 && userPromptLooksActionable(lastUserText);
	  const all = [
	    ...extractLayer1(text, names),
	    ...extractLayer2(text, names, primaryParam),
	    ...(runLayer3 ? extractLayer3(text, names, primaryParam) : []),
	  ];
	  if (!all.length) return [];

	  // Dedupe by (name, argumentsJson). Keep the highest-confidence pick.
	  const byKey = new Map();
	  for (const tc of all) {
	    if (tc.confidence < minConfidence) continue;
	    const key = `${tc.name}::${tc.argumentsJson}`;
	    const existing = byKey.get(key);
	    if (!existing || tc.confidence > existing.confidence) byKey.set(key, tc);
	  }
	  const recovered = [...byKey.values()].sort((a, b) => b.confidence - a.confidence);
	  if (recovered.length) {
	    log.info(`NLU recovery: extracted ${recovered.length} tool_call(s) from narrative — ${recovered.map(t => `${t.name}@${t.layer}/${t.confidence.toFixed(2)}`).join(', ')}${skipLayer3 ? ' (layer3-skipped: structural markers seen)' : ''}`);
	  }
	  return recovered;
	}