Spaces:

Ac66
/

W

Sleeping

App Files Files Community

W / src /sanitize.js

Ac66's picture

Upload folder using huggingface_hub

2b64d42 verified 10 days ago

history blame contribute delete

11.6 kB

	/**
	* Strip server-internal filesystem paths from model output before it reaches
	* the API caller.
	*
	* Background: Cascade's baked-in system context tells the model its workspace
	* lives at /tmp/windsurf-workspace. Even after we removed CascadeToolConfig
	* .run_command (see windsurf.js buildCascadeConfig) the model still
	* (a) narrates "I'll look at /tmp/windsurf-workspace/config.yaml" in plain
	* text, and
	* (b) occasionally emits built-in edit_file / view_file / list_directory
	* trajectory steps whose argumentsJson references these paths.
	* Both routes leak the proxy's internal filesystem layout to API callers.
	*
	* This module provides two scrubbers:
	* - sanitizeText(s) — one-shot, use on accumulated buffers
	* - PathSanitizeStream — incremental, use on streaming chunks
	*
	* The streaming version holds back any tail that could be an incomplete
	* prefix of a sensitive literal OR a match-in-progress whose path-tail hasn't
	* hit a terminator yet, so a path cannot slip through by straddling a chunk
	* boundary.
	*/

	// Detect the actual project root from this module's path so the sanitizer
	// covers deployments outside /root/WindsurfAPI (e.g. /srv/WindsurfAPI).
	import { fileURLToPath as _fileURLToPath } from 'url';

	/**
	 * Absolute path of the project root, derived from this module's location.
	 * sanitize.js lives in src/, so the root is one directory up; both `/` and
	 * `\` separators are accepted for cross-platform support. If the module path
	 * does not end in src/sanitize.js the full module path is kept unchanged.
	 */
	const _repoRoot = (() => {
	  try {
	    const thisFile = _fileURLToPath(import.meta.url);
	    return thisFile.replace(/[/\\]src[/\\]sanitize\.js$/, '');
	  } catch {
	    // Deliberate best effort: if the module URL cannot be converted to a
	    // path, fall back to the working directory instead of failing at import.
	    return process.cwd();
	  }
	})();

	// Placeholder history: every marker has to avoid becoming either a fake path
	// the model reuses in tool calls or a fake answer the model repeats to users.
	//   ./tail                    → LLM Reads ./src/main.py → ENOENT → loops
	//   [internal]                → LLM runs `ls [internal]` → ENOENT → loops
	//   <redacted-path>           → LLM passes to Read/Bash → ENOENT (Linux) /
	//                               Errno 22 (Windows) → loops
	//   (internal path redacted)  → zsh parses `cd (internal path redacted)`
	//                               as glob-qualifier syntax → cryptic
	//                               "unknown file attribute: i" error
	//   redacted internal path    → Opus 4.7 echoes it verbatim into bash
	//                               commands; reads to the model as a
	//                               plausible directory name and the
	//                               failure mode is `cd: too many arguments`
	//                               which still wastes 2-3 turns
	//   …                         → avoids shell loops, but Sonnet 4.6 can echo
	//                               it in prose as "your path is …", causing a
	//                               user-visible answer loop when asked for the
	//                               project path.
	// Current marker is structural and explicit: it tells the user/model the
	// workspace path is intentionally hidden, without looking like a real absolute
	// path or a literal ellipsis answer. The proto/tool preamble also tells the
	// model not to answer project-path questions by echoing this marker.
	// Verified with the drift probe (scripts/_agent_drift_probe.py).
	const REDACTED_PATH = '<workspace>';

	// Path body char class: anything that's not whitespace or syntax-terminator.
	// Used in patterns and in cut-point detection (PATH_BODY_RE) — must match.
	// Note: `\\` is INSIDE the char class so backslash-separated tails (Windows
	// style: `\home\user\projects\workspace-x\src\index.js`) keep extending the
	// match instead of terminating at the first backslash.
	// Each path pattern is `<root>(?:<separator><body chars>*)?`; the `*` is what
	// makes the whole tail (not just its first character) part of the redaction.
	const PATTERNS = [
	  [/\/tmp\/windsurf-workspace(?:[/\\][^\s"'`<>)}\],;]*)?/g, REDACTED_PATH],
	  // Unix and Windows-mixed forms — issue #86 reports of
	  // `C:\home\user\projects\workspace-devinxse` leaking despite the Unix-only
	  // regex catching `/home/user/projects/workspace-skxwsx01`. Cover:
	  //   /home/user/projects/workspace-x[/...]
	  //   \home\user\projects\workspace-x[\...]
	  //   C:\home\user\projects\workspace-x[\...]
	  //   C:\home/user/projects/workspace-x (mixed separators, GLM-style hallucination)
	  [/(?:[A-Za-z]:)?[/\\]home[/\\]user[/\\]projects[/\\]workspace-[a-z0-9]+(?:[/\\][^\s"'`<>)}\],;]*)?/g, REDACTED_PATH],
	  [/\/opt\/windsurf(?:[/\\][^\s"'`<>)}\],;]*)?/g, REDACTED_PATH],
	  // Repo root, with every regex metacharacter escaped. Skipped when the root
	  // is empty or a single character (e.g. the module sits at /src/sanitize.js,
	  // or cwd is `/`): such a pattern would match at every position or at every
	  // separator and shred unrelated text.
	  ...(_repoRoot.length > 1
	    ? [[new RegExp(_repoRoot.replace(/[.*+?^${}()|[\]\\]/g, '\\$&') + '(?:[/\\\\][^\\s"\'`<>)}\\],;]*)?', 'g'), REDACTED_PATH]]
	    : []),
	  // v2.0.78 (#108 zhangzhang-bit) — Cascade upstream injects these XML
	  // blocks into the system prompt to describe its sandbox state:
	  //   <workspace_information>...workspace path / metadata...</workspace_information>
	  //   <workspace_layout>...file tree...</workspace_layout>
	  //   <user_information>...account / config...</user_information>
	  // The model sometimes echoes them verbatim into its response, leaking
	  // server-internal sandbox state to API callers (the actual #108
	  // screenshot showed `workspace-devinxse` paths surrounded by these
	  // wrappers). Strip the entire block, across newlines, up to the FIRST
	  // matching close tag (lazy match) — these are upstream-injected and have
	  // no legitimate reason to surface in client-facing output.
	  [/<workspace_information>[\s\S]*?<\/workspace_information>/gi, ''],
	  [/<workspace_layout>[\s\S]*?<\/workspace_layout>/gi, ''],
	  [/<user_information>[\s\S]*?<\/user_information>/gi, ''],
	];

	// Tags whose ENTIRE block (open → close) is upstream-injected and must
	// be held back during streaming until we see the closing tag — otherwise
	// chunk N might emit `<workspace_information>file:///home/user/proj...`
	// before chunk N+1 arrives with the rest. Used by PathSanitizeStream
	// alongside SENSITIVE_LITERALS.
	const STRIP_BLOCK_TAGS = ['workspace_information', 'workspace_layout', 'user_information'];

	// Bare literals (no path tail) used by the streaming cut-point finder.
	// The home-directory root is listed once per separator combination (each of
	// its four separators may be `/` or `\`, 16 shapes in total) so that the
	// mixed-separator forms accepted by the PATTERNS regex are also held back
	// when they straddle a chunk boundary. Empty / single-character entries are
	// dropped because they would match almost everywhere.
	const SENSITIVE_LITERALS = [
	  '/tmp/windsurf-workspace',
	  ...Array.from({ length: 16 }, (_, mask) =>
	    ['home', 'user', 'projects', 'workspace-']
	      .map((segment, i) => ((mask >> i) & 1 ? '\\' : '/') + segment)
	      .join('')),
	  '/opt/windsurf',
	  _repoRoot,
	].filter((lit) => lit.length > 1);

	// Character class that counts as part of a path body. Mirrors the PATTERNS
	// regex char class so cut-point detection matches replacement behaviour.
	const PATH_BODY_RE = /[^\s"'`<>)}\],;]/;

	/**
	 * Apply every redaction in PATTERNS to `s`, in order, in one call.
	 *
	 * @param {*} s - Candidate text. Non-strings and the empty string are
	 *   returned unchanged, so the function is safe to call on any value.
	 * @returns {*} The input with each pattern match replaced by its
	 *   replacement, or the original value when it is not a non-empty string.
	 */
	export function sanitizeText(s) {
	  if (typeof s !== 'string' || s === '') return s;
	  // String.prototype.replace with a global regex always scans from index 0,
	  // so the shared /g patterns carry no state between calls.
	  return PATTERNS.reduce((out, [re, rep]) => out.replace(re, rep), s);
	}

	/**
	 * Incremental sanitizer for streamed deltas.
	 *
	 * Usage:
	 *   const stream = new PathSanitizeStream();
	 *   for (const chunk of deltas) emit(stream.feed(chunk));
	 *   emit(stream.flush());
	 *
	 * Every string returned by feed()/flush() has been passed through
	 * sanitizeText. Any trailing text that COULD still extend into a sensitive
	 * match (a partial prefix of a literal, an unterminated path tail, or an
	 * upstream-injected XML block whose close tag has not arrived) is held
	 * internally until a later feed() or the final flush().
	 */
	export class PathSanitizeStream {
	  constructor() {
	    // Text received so far that has not yet been released to the caller.
	    this.buffer = '';
	  }

	  /**
	   * Append a streamed chunk and release whatever prefix is safe to emit.
	   *
	   * @param {string} delta - Next chunk; falsy values are ignored.
	   * @returns {string} Sanitized text that can be emitted now ('' if none).
	   */
	  feed(delta) {
	    if (!delta) return '';
	    this.buffer += delta;
	    const cut = this._safeCutPoint();
	    if (cut === 0) return '';
	    const safeRegion = this.buffer.slice(0, cut);
	    this.buffer = this.buffer.slice(cut);
	    return sanitizeText(safeRegion);
	  }

	  /**
	   * Largest index into this.buffer such that buffer[0:cut] contains no
	   * match that could extend past `cut`. Three conditions back off the cut:
	   *  (1) a full sensitive literal was found but its path body ran to the
	   *      end of the buffer — the next delta might append more path chars,
	   *      in which case the fully-rendered path would differ. Hold from the
	   *      literal's start.
	   *  (2) the buffer tail is itself a proper prefix of a sensitive literal
	   *      (e.g., ends with "/tmp/win") — the next delta might complete it.
	   *      Hold from that tail start.
	   *  (3) a strip-block open tag (or a proper prefix of one at the very end
	   *      of the buffer) has no matching close tag yet. Hold from the tag
	   *      start.
	   *
	   * @returns {number} Index in [0, buffer.length]; text before it is safe.
	   */
	  _safeCutPoint() {
	    const buf = this.buffer;
	    const len = buf.length;
	    let cut = len;
	    // The conditions are independent; the final cut is the smallest hold.
	    const holdFrom = (idx) => {
	      if (idx < cut) cut = idx;
	    };

	    for (const lit of SENSITIVE_LITERALS) {
	      // An empty literal "occurs" at every index and would hold back every
	      // trailing run of path characters, so it is never a useful marker.
	      if (!lit) continue;

	      // (1) unterminated full literal
	      let searchFrom = 0;
	      while (searchFrom < len) {
	        const idx = buf.indexOf(lit, searchFrom);
	        if (idx === -1) break;
	        let end = idx + lit.length;
	        while (end < len && PATH_BODY_RE.test(buf[end])) end++;
	        if (end === len) {
	          holdFrom(idx);
	          break;
	        }
	        // Terminated inside the buffer: keep looking after the terminator.
	        searchFrom = end + 1;
	      }

	      // (2) partial-prefix tail (longest prefix wins)
	      const maxLen = Math.min(lit.length - 1, len);
	      for (let plen = maxLen; plen > 0; plen--) {
	        if (buf.endsWith(lit.slice(0, plen))) {
	          holdFrom(len - plen);
	          break;
	        }
	      }
	    }

	    // (3) v2.0.78 (#108) — XML block strip-tags. If the buffer contains
	    // an open `<workspace_information` (etc.) without its matching
	    // close tag yet, hold the cut at the open-tag start so the next
	    // delta can extend the block; we only emit it once we see </tag>.
	    // Matching ignores case because the block-stripping regexes applied by
	    // sanitizeText are case-insensitive as well.
	    for (const tag of STRIP_BLOCK_TAGS) {
	      const escapedTag = tag.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
	      const openRe = new RegExp(`<${escapedTag}`, 'gi');
	      const closeRe = new RegExp(`</${escapedTag}>`, 'gi');
	      let searchFrom = 0;
	      while (searchFrom < len) {
	        openRe.lastIndex = searchFrom;
	        const openMatch = openRe.exec(buf);
	        if (openMatch === null) break;
	        closeRe.lastIndex = openMatch.index + openMatch[0].length;
	        const closeMatch = closeRe.exec(buf);
	        if (closeMatch === null) {
	          // No close yet — hold from the open tag so the next feed can
	          // accumulate more of the block before we emit.
	          holdFrom(openMatch.index);
	          break;
	        }
	        searchFrom = closeMatch.index + closeMatch[0].length;
	      }

	      // Partial-prefix tail of the open tag (`<workspace_inform`), still
	      // being typed by the model.
	      const openLower = `<${tag}`.toLowerCase();
	      const openMax = Math.min(openLower.length - 1, len);
	      for (let plen = openMax; plen > 0; plen--) {
	        if (buf.slice(len - plen).toLowerCase() === openLower.slice(0, plen)) {
	          holdFrom(len - plen);
	          break;
	        }
	      }
	    }

	    return cut;
	  }

	  /**
	   * Release everything still held and reset the stream.
	   *
	   * The remaining buffer is sanitized as a whole; a strip-block that never
	   * received its close tag is not matched by the block patterns, so only
	   * the path redactions apply to it.
	   *
	   * @returns {string} Sanitized remainder ('' when nothing was held).
	   */
	  flush() {
	    const out = sanitizeText(this.buffer);
	    this.buffer = '';
	    return out;
	  }
	}

	/**
	 * Sanitize a tool call before surfacing to the client. Covers three carriers
	 * a leaked path can ride:
	 *   - argumentsJson  (OpenAI-emulated + legacy native)
	 *   - result         (native Cascade tool result)
	 *   - input          (Anthropic-format parsed input dict — the hot path
	 *                     used by Claude Code streaming, issue #38)
	 * Without the `input` scrub, the stream handler would emit a tool_use
	 * delta whose file_path still references /home/user/projects/workspace-x
	 * and Claude Code would try to Read a path that doesn't exist locally.
	 *
	 * String values are scrubbed at every depth of `input`: nested arrays and
	 * plain objects (e.g. a list of edits that each carry a file path) are
	 * copied with their strings sanitized. Keys and non-string leaves are left
	 * as they are. `input` is assumed to be JSON-derived, i.e. acyclic.
	 *
	 * @param {object} tc - Tool call; falsy values are returned unchanged.
	 * @returns {object} Shallow copy of `tc` with the three carriers scrubbed.
	 *   The argument itself is never mutated.
	 */
	export function sanitizeToolCall(tc) {
	  if (!tc) return tc;

	  // Object.fromEntries defines own properties, so an own `__proto__` key in
	  // the input is copied as data instead of changing the copy's prototype.
	  const scrubEntries = (obj) =>
	    Object.fromEntries(Object.entries(obj).map(([k, v]) => [k, scrubValue(v)]));

	  // Recursively sanitize strings inside arrays and plain objects; any other
	  // value (numbers, booleans, null, class instances) passes through as-is.
	  const scrubValue = (value) => {
	    if (typeof value === 'string') return sanitizeText(value);
	    if (Array.isArray(value)) return value.map((item) => scrubValue(item));
	    if (value !== null && typeof value === 'object') {
	      const proto = Object.getPrototypeOf(value);
	      if (proto === Object.prototype || proto === null) return scrubEntries(value);
	    }
	    return value;
	  };

	  const out = { ...tc };
	  if (typeof tc.argumentsJson === 'string') out.argumentsJson = sanitizeText(tc.argumentsJson);
	  if (typeof tc.result === 'string') out.result = sanitizeText(tc.result);
	  if (tc.input && typeof tc.input === 'object' && !Array.isArray(tc.input)) {
	    out.input = scrubEntries(tc.input);
	  }
	  return out;
	}