File size: 11,625 Bytes
/**
* Strip server-internal filesystem paths from model output before it reaches
* the API caller.
*
* Background: Cascade's baked-in system context tells the model its workspace
* lives at /tmp/windsurf-workspace. Even after we removed CascadeToolConfig
* .run_command (see windsurf.js buildCascadeConfig) the model still
* (a) narrates "I'll look at /tmp/windsurf-workspace/config.yaml" in plain
* text, and
* (b) occasionally emits built-in edit_file / view_file / list_directory
* trajectory steps whose argumentsJson references these paths.
* Both routes leak the proxy's internal filesystem layout to API callers.
*
* This module provides two scrubbers:
* - sanitizeText(s) — one-shot, use on accumulated buffers
* - PathSanitizeStream — incremental, use on streaming chunks
*
* The streaming version holds back any tail that could be an incomplete
* prefix of a sensitive literal OR a match-in-progress whose path-tail hasn't
* hit a terminator yet, so a path cannot slip through by straddling a chunk
* boundary.
*/
// Detect the actual project root from this module's path so the sanitizer
// covers deployments outside /root/WindsurfAPI (e.g. /srv/WindsurfAPI).
import { fileURLToPath as _fileURLToPath } from 'url';
const _repoRoot = (() => {
  // sanitize.js is expected to live in <root>/src/, so removing the trailing
  // "<sep>src<sep>sanitize.js" leaves the project root. Either separator is
  // accepted so Windows-style paths are handled as well.
  const moduleSuffix = /[/\\]src[/\\]sanitize\.js$/;
  let modulePath;
  try {
    modulePath = _fileURLToPath(import.meta.url);
  } catch {
    // The module URL could not be converted to a filesystem path; fall back
    // to the process working directory.
    return process.cwd();
  }
  return modulePath.replace(moduleSuffix, '');
})();
// Placeholder history: every marker has to avoid becoming either a fake path
// the model reuses in tool calls or a fake answer the model repeats to users.
// ./tail → LLM Reads ./src/main.py → ENOENT → loops
// [internal] → LLM runs `ls [internal]` → ENOENT → loops
// <redacted-path> → LLM passes to Read/Bash → ENOENT (Linux) /
// Errno 22 (Windows) → loops
// (internal path redacted) → zsh parses `cd (internal path redacted)`
// as glob-qualifier syntax → cryptic
// "unknown file attribute: i" error
// redacted internal path → Opus 4.7 echoes it verbatim into bash
// commands; reads to the model as a
// plausible directory name and the
// failure mode is `cd: too many arguments`
// which still wastes 2-3 turns
// … → avoids shell loops, but Sonnet 4.6 can echo
// it in prose as "your path is …", causing a
// user-visible answer loop when asked for the
// project path.
// Current marker is structural and explicit: it tells the user/model the
// workspace path is intentionally hidden, without looking like a real absolute
// path or a literal ellipsis answer. The proto/tool preamble also tells the
// model not to answer project-path questions by echoing this marker.
// Verified with the drift probe (scripts/_agent_drift_probe.py).
// REDACTED_PATH is the replacement text used by the path entries in PATTERNS.
const REDACTED_PATH = '<workspace>';
// Path body char class: anything that's not whitespace or syntax-terminator.
// Used in patterns and in cut-point detection — must match.
// Note: `\\` is INSIDE the char class so backslash-separated tails (Windows
// style: `\home\user\projects\workspace-x\src\index.js`) keep extending the
// match instead of terminating at the first backslash.
// Ordered list of [regex, replacement] pairs applied in sequence by
// sanitizeText(). Path entries collapse the whole path (prefix plus optional
// tail) to REDACTED_PATH; block entries delete the matched text entirely.
const PATTERNS = [
  [/\/tmp\/windsurf-workspace(?:[/\\][^\s"'`<>)}\],*;]*)?/g, REDACTED_PATH],
  // Unix and Windows-mixed forms — issue #86 reports of
  // `C:\home\user\projects\workspace-devinxse` leaking despite the Unix-only
  // regex catching `/home/user/projects/workspace-skxwsx01`. Cover:
  //   /home/user/projects/workspace-x[/...]
  //   \home\user\projects\workspace-x[\...]
  //   C:\home\user\projects\workspace-x[\...]
  //   C:\home/user/projects/workspace-x (mixed separators, GLM-style hallucination)
  [/(?:[A-Za-z]:)?[/\\]home[/\\]user[/\\]projects[/\\]workspace-[a-z0-9]+(?:[/\\][^\s"'`<>)}\],*;]*)?/g, REDACTED_PATH],
  [/\/opt\/windsurf(?:[/\\][^\s"'`<>)}\],*;]*)?/g, REDACTED_PATH],
  // Detected project root. Skipped when the root has no real path segment
  // ('' when the module sits at /src/sanitize.js, '/' from a cwd fallback, or
  // a bare drive such as 'C:'): a pattern built from such a root would match
  // the empty string or every separator and mangle unrelated output.
  ...(/[^/\\]/.test(_repoRoot.replace(/^[A-Za-z]:/, ''))
    ? [[new RegExp(_repoRoot.replace(/[.*+?^${}()|[\]\\]/g, '\\$&') + '(?:[/\\\\][^\\s"\'`<>)}\\],*;]*)?', 'g'), REDACTED_PATH]]
    : []),
  // v2.0.78 (#108 zhangzhang-bit) — Cascade upstream injects these XML
  // blocks into the system prompt to describe its sandbox state:
  //   <workspace_information>...workspace path / metadata...</workspace_information>
  //   <workspace_layout>...file tree...</workspace_layout>
  //   <user_information>...account / config...</user_information>
  // The model sometimes echoes them verbatim into its response, leaking
  // server-internal sandbox state to API callers (the actual #108
  // screenshot showed `workspace-devinxse` paths surrounded by these
  // wrappers). Strip each block from its open tag to the nearest close tag
  // (lazy match, spanning newlines, case-insensitive) — these are
  // upstream-injected and have no legitimate reason to surface in
  // client-facing output.
  [/<workspace_information>[\s\S]*?<\/workspace_information>/gi, ''],
  [/<workspace_layout>[\s\S]*?<\/workspace_layout>/gi, ''],
  [/<user_information>[\s\S]*?<\/user_information>/gi, ''],
];
// Tag names whose ENTIRE block (open → close) is upstream-injected. During
// streaming such a block must be held back until its closing tag has been
// seen — otherwise chunk N might emit
// `<workspace_information>file:///home/user/proj...` before chunk N+1 arrives
// with the rest. Consumed by PathSanitizeStream alongside SENSITIVE_LITERALS.
const STRIP_BLOCK_TAGS = [
  'workspace_information',
  'workspace_layout',
  'user_information',
];
// Bare literals (no path tail) used by the streaming cut-point finder.
// Listed once per separator/prefix shape so the partial-prefix detection
// can hold back the right tail length on stream chunks.
//
// The sandbox-home prefix is generated for every combination of `/` and `\`
// (16 shapes) because the matching PATTERNS regex accepts either separator at
// each position; listing only the two uniform shapes let a mixed-separator
// path (`C:\home/user/projects/workspace-x`) slip its tail through when it
// straddled a chunk boundary.
//
// Literals with no real path segment (an empty or root-only _repoRoot) are
// dropped: they cannot identify a path and would only make the stream hold
// back unrelated text.
const SENSITIVE_LITERALS = [
  '/tmp/windsurf-workspace',
  // Bit i of `mask` selects the separator placed before segment i.
  ...Array.from({ length: 16 }, (_, mask) =>
    ['home', 'user', 'projects', 'workspace-']
      .map((segment, bit) => `${(mask >> bit) & 1 ? '\\' : '/'}${segment}`)
      .join('')),
  '/opt/windsurf',
  _repoRoot,
].filter((literal) => /[^/\\]/.test(literal.replace(/^[A-Za-z]:/, '')));
// Character class that counts as part of a path body. Mirrors the PATTERNS
// regex char class so cut-point detection matches replacement behaviour.
// Matches a single character that is NOT whitespace and NOT one of the
// terminators " ' ` < > ) } ] , * ;
// Declared without the `g` flag, so repeated .test() calls keep no
// lastIndex state between characters.
const PATH_BODY_RE = /[^\s"'`<>)}\],*;]/;
/**
 * Run every PATTERNS entry over `s`, in list order, and return the result.
 *
 * @param {*} s - Text to scrub. Anything that is not a non-empty string is
 *   handed back untouched.
 * @returns {*} The scrubbed string, or the original value when it was not a
 *   non-empty string.
 */
export function sanitizeText(s) {
  const isNonEmptyString = typeof s === 'string' && s.length > 0;
  if (!isNonEmptyString) return s;
  return PATTERNS.reduce(
    (text, [pattern, replacement]) => text.replace(pattern, replacement),
    s,
  );
}
/**
 * Incremental sanitizer for streamed deltas.
 *
 * Usage:
 *   const stream = new PathSanitizeStream();
 *   for (const chunk of deltas) emit(stream.feed(chunk));
 *   emit(stream.flush());
 *
 * feed() emits only the prefix of the accumulated text that can no longer
 * grow into a redaction target. Any trailing text that COULD still extend
 * into one (a partial prefix of a sensitive literal, an unterminated path
 * tail, or an unclosed strip-block) stays in `buffer` until a later feed()
 * or the final flush().
 */
export class PathSanitizeStream {
  constructor() {
    // Text received so far that has not been emitted yet.
    this.buffer = '';
  }

  /**
   * Append one streamed delta and return the sanitized text that is now safe
   * to emit.
   *
   * @param {string} delta - Next chunk of model output; falsy values are ignored.
   * @returns {string} Sanitized safe prefix, or '' when everything is held back.
   */
  feed(delta) {
    if (!delta) return '';
    this.buffer += delta;
    const cut = this._safeCutPoint();
    if (cut === 0) return '';
    const safeRegion = this.buffer.slice(0, cut);
    this.buffer = this.buffer.slice(cut);
    return sanitizeText(safeRegion);
  }

  /**
   * Largest index into this.buffer such that buffer[0:cut] contains no match
   * that could extend past `cut`. Three conditions back off the cut:
   *  (1) a full sensitive literal was found but its path body ran to the end
   *      of the buffer — the next delta might append more path chars. Hold
   *      from the literal's start.
   *  (2) the buffer tail is a proper prefix of a sensitive literal (e.g. ends
   *      with "/tmp/win") — the next delta might complete it. Hold from that
   *      tail start.
   *  (3) a strip-block open tag is present without its close tag, or the
   *      buffer ends with a partial open tag. Hold from the tag start.
   *
   * @returns {number} Number of leading buffer characters that may be emitted.
   */
  _safeCutPoint() {
    const buf = this.buffer;
    const len = buf.length;
    let cut = len;

    // (1) unterminated full literal
    for (const lit of SENSITIVE_LITERALS) {
      let searchFrom = 0;
      while (searchFrom < len) {
        const idx = buf.indexOf(lit, searchFrom);
        if (idx === -1) break;
        let end = idx + lit.length;
        while (end < len && PATH_BODY_RE.test(buf[end])) end++;
        if (end === len) {
          if (idx < cut) cut = idx;
          break;
        }
        // buf[end] is a terminator, so the next search can start after it.
        searchFrom = end + 1;
      }
    }

    // (2) partial-prefix tail (longest proper prefix wins)
    for (const lit of SENSITIVE_LITERALS) {
      const maxLen = Math.min(lit.length - 1, len);
      for (let plen = maxLen; plen > 0; plen--) {
        if (buf.endsWith(lit.slice(0, plen))) {
          const start = len - plen;
          if (start < cut) cut = start;
          break;
        }
      }
    }

    // (3) v2.0.78 (#108) — XML block strip-tags. If the buffer contains an
    //     open `<workspace_information` (etc.) without its matching close
    //     tag yet, hold the cut at the open-tag start so the next delta can
    //     extend the block; it is only emitted once </tag> has been seen.
    //     Tag matching ignores case because the block-strip regexes in
    //     PATTERNS carry the `i` flag: a mixed-case block must be held back
    //     exactly like a lowercase one or it would be emitted before its
    //     close tag arrives and never stripped.
    const escapeForRegExp = (text) => text.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    // Index of the first case-insensitive occurrence of `needle` in `buf` at
    // or after `from`, or -1. A regex is used instead of toLowerCase() on the
    // whole buffer so indices always refer to the original string.
    const indexOfIgnoreCase = (needle, from) => {
      const re = new RegExp(escapeForRegExp(needle), 'gi');
      re.lastIndex = from;
      const match = re.exec(buf);
      return match === null ? -1 : match.index;
    };

    for (const tag of STRIP_BLOCK_TAGS) {
      const open = `<${tag}`;
      const close = `</${tag}>`;
      let searchFrom = 0;
      while (searchFrom < len) {
        const openIdx = indexOfIgnoreCase(open, searchFrom);
        if (openIdx === -1) break;
        const closeIdx = indexOfIgnoreCase(close, openIdx + open.length);
        if (closeIdx === -1) {
          // No close yet — hold from openIdx so the next feed can
          // accumulate more of the block before we emit.
          if (openIdx < cut) cut = openIdx;
          break;
        }
        searchFrom = closeIdx + close.length;
      }

      // Partial-prefix tail of the open tag (`<workspace_inform`).
      const openLower = open.toLowerCase();
      const openMax = Math.min(open.length - 1, len);
      for (let plen = openMax; plen > 0; plen--) {
        if (buf.slice(len - plen).toLowerCase() === openLower.slice(0, plen)) {
          const start = len - plen;
          if (start < cut) cut = start;
          break;
        }
      }
    }

    return cut;
  }

  /**
   * Emit whatever is still held back, sanitized, and reset the stream.
   *
   * @returns {string} Sanitized remainder of the buffer.
   */
  flush() {
    const out = sanitizeText(this.buffer);
    this.buffer = '';
    return out;
  }
}
/**
 * Sanitize a tool call before surfacing to the client. Covers three carriers
 * a leaked path can ride:
 *   - argumentsJson (OpenAI-emulated + legacy native)
 *   - result        (native Cascade tool result)
 *   - input         (Anthropic-format parsed input dict — the hot path
 *                    used by Claude Code streaming, issue #38)
 * Without the `input` scrub, the stream handler would emit a tool_use
 * delta whose file_path still references /home/user/projects/workspace-x
 * and Claude Code would try to Read a path that doesn't exist locally.
 *
 * String values inside `input` are scrubbed at every depth (nested plain
 * objects and arrays), not only at the top level, so a path inside e.g.
 * `edits[0].old_string` is redacted too. Non-plain objects are passed through
 * untouched. The input is never mutated; a new object is returned.
 *
 * @param {object|null|undefined} tc - Tool call; falsy values are returned as-is.
 * @returns {object|null|undefined} Shallow copy of `tc` with scrubbed carriers.
 */
export function sanitizeToolCall(tc) {
  if (!tc) return tc;

  // Containers currently being walked; guards against self-referencing input.
  const inProgress = new Set();

  // Return a scrubbed copy of `value`: strings are sanitized, arrays and plain
  // objects are rebuilt recursively, everything else is returned unchanged.
  const scrub = (value) => {
    if (typeof value === 'string') return sanitizeText(value);
    if (value === null || typeof value !== 'object') return value;
    if (inProgress.has(value)) return value;
    const isArray = Array.isArray(value);
    if (!isArray) {
      const proto = Object.getPrototypeOf(value);
      if (proto !== null && proto !== Object.prototype) return value;
    }
    inProgress.add(value);
    try {
      return isArray
        ? value.map((item) => scrub(item))
        : Object.fromEntries(Object.entries(value).map(([key, item]) => [key, scrub(item)]));
    } finally {
      inProgress.delete(value);
    }
  };

  const out = { ...tc };
  if (typeof tc.argumentsJson === 'string') out.argumentsJson = sanitizeText(tc.argumentsJson);
  if (typeof tc.result === 'string') out.result = sanitizeText(tc.result);
  if (tc.input && typeof tc.input === 'object' && !Array.isArray(tc.input)) {
    inProgress.add(tc.input);
    // Object.fromEntries defines own properties, so an own "__proto__" key
    // (possible after JSON.parse) is copied instead of hitting the prototype
    // setter as a plain `safe[k] = v` assignment would.
    out.input = Object.fromEntries(
      Object.entries(tc.input).map(([key, value]) => [key, scrub(value)]),
    );
    inProgress.delete(tc.input);
  }
  return out;
}
|