File size: 13,246 Bytes
//! Tool-result output compaction module.
//!
//! Provides content-aware compaction:
//! - browser snapshot compaction (keep head + tail)
//! - "output saved to file" notice compaction (extract the key information)
//! - generic truncation (200,000-byte limit; sizes are measured with `str::len`)
use regex::Regex;
use serde_json::Value;
use tracing::{debug, info};
/// Upper bound on the total size of tool-result text (about 200k), used to keep
/// the prompt from growing too long. It is compared against `str::len()` values,
/// i.e. byte lengths.
const MAX_TOOL_RESULT_CHARS: usize = 200_000;
/// Texts longer than this are checked for the browser-snapshot pattern.
const SNAPSHOT_DETECTION_THRESHOLD: usize = 20_000;
/// Maximum size of a browser snapshot after compaction.
const SNAPSHOT_MAX_CHARS: usize = 16_000;
/// Fraction of the snapshot budget given to the head.
const SNAPSHOT_HEAD_RATIO: f64 = 0.7;
/// Nominal fraction of the snapshot budget given to the tail.
///
/// Not read by the snapshot code in this module: the tail is sized from whatever
/// budget is left after the head, which is why `dead_code` is allowed here.
#[allow(dead_code)]
const SNAPSHOT_TAIL_RATIO: f64 = 0.3;
/// Compacts a tool-result text so that it fits (approximately) within `max_chars`.
///
/// Sizes are byte lengths (`str::len`). Text that already fits is returned
/// unchanged. Otherwise the strategies are tried in this order:
/// 1. If the text contains `<html`, `<body` or `<!DOCTYPE` (case-sensitive), it is
///    first passed through `deep_clean_html`; if the cleaned text fits, it is returned.
/// 2. "Output saved to file" notice -> only the key lines are kept.
/// 3. Browser snapshot (only tried above `SNAPSHOT_DETECTION_THRESHOLD`) -> head + tail.
/// 4. Anything else -> `truncate_text_safe`.
///
/// The truncation notice appended by `truncate_text_safe` is not counted against
/// `max_chars`, so the result may be slightly longer than the limit.
pub fn compact_tool_result_text(text: &str, max_chars: usize) -> String {
    // An empty string has length 0 and therefore always takes this fast path.
    if text.len() <= max_chars {
        return text.to_string();
    }

    // Deep pre-processing for content that looks like an HTML document.
    let looks_like_html = ["<html", "<body", "<!DOCTYPE"]
        .iter()
        .any(|marker| text.contains(marker));
    let working = if looks_like_html {
        let stripped = deep_clean_html(text);
        debug!("[ToolCompressor] Deep cleaned HTML, reduced {} -> {} chars", text.len(), stripped.len());
        stripped
    } else {
        text.to_string()
    };
    if working.len() <= max_chars {
        return working;
    }

    // Strategy 1: "output saved to file" notice.
    if let Some(notice) = compact_saved_output_notice(&working, max_chars) {
        debug!("[ToolCompressor] Detected saved output notice, compacted to {} chars", notice.len());
        return notice;
    }

    // Strategy 2: browser snapshot, only attempted for sufficiently large texts.
    let snapshot = if working.len() > SNAPSHOT_DETECTION_THRESHOLD {
        compact_browser_snapshot(&working, max_chars)
    } else {
        None
    };

    match snapshot {
        Some(summary) => {
            debug!("[ToolCompressor] Detected browser snapshot, compacted to {} chars", summary.len());
            summary
        }
        None => {
            // Strategy 3: structure-aware truncation.
            debug!("[ToolCompressor] Using structured truncation for {} chars", working.len());
            truncate_text_safe(&working, max_chars)
        }
    }
}
/// Compacts an "output has been saved to a file" notice.
///
/// Detected pattern (case-insensitive):
/// `result (N characters) exceeds maximum allowed tokens. Output [has been] saved to <path>`
///
/// When the pattern matches, the result consists of:
/// - the original notice line (or a synthesized one if no single line contains it),
/// - the first line describing the format/schema, if any,
/// - a marker telling the reader to open the saved file locally.
///
/// Returns `None` when the pattern does not match (or the regex fails to compile).
/// The assembled text is passed through `truncate_text_safe` with `max_chars`.
fn compact_saved_output_notice(text: &str, max_chars: usize) -> Option<String> {
    let pattern = Regex::new(
        r"(?i)result\s*\(\s*(?P<count>[\d,]+)\s*characters\s*\)\s*exceeds\s+maximum\s+allowed\s+tokens\.\s*Output\s+(?:has\s+been\s+)?saved\s+to\s+(?P<path>[^\r\n]+)"
    ).ok()?;
    let captures = pattern.captures(text)?;
    let char_count = captures.name("count")?.as_str();

    // Strip trailing brackets, quotes and full stops that are not part of the path.
    let saved_path = captures
        .name("path")?
        .as_str()
        .trim()
        .trim_end_matches(&[')', ']', '"', '\'', '.'][..])
        .trim();

    // Single pass over the non-empty, trimmed lines: remember the first notice
    // line and the first format/schema line.
    let mut notice: Option<&str> = None;
    let mut format_hint: Option<&str> = None;
    for line in text.lines().map(str::trim).filter(|line| !line.is_empty()) {
        if notice.is_none() {
            let lowered = line.to_lowercase();
            if lowered.contains("exceeds maximum allowed tokens") && lowered.contains("saved to") {
                notice = Some(line);
            }
        }
        if format_hint.is_none()
            && (line.starts_with("Format:")
                || line.contains("JSON array with schema")
                || line.to_lowercase().starts_with("schema:"))
        {
            format_hint = Some(line);
        }
    }

    let notice_line = match notice {
        Some(line) => line.to_string(),
        None => format!(
            "result ({} characters) exceeds maximum allowed tokens. Output has been saved to {}",
            char_count, saved_path
        ),
    };

    let mut output = Vec::with_capacity(3);
    // Skip the format line when it is the very same line as the notice.
    let extra_format = format_hint.filter(|hint| *hint != notice_line);
    output.push(notice_line);
    if let Some(hint) = extra_format {
        output.push(hint.to_string());
    }
    output.push(format!(
        "[tool_result omitted to reduce prompt size; read file locally if needed: {}]",
        saved_path
    ));

    Some(truncate_text_safe(&output.join("\n"), max_chars))
}
/// Compacts a browser page snapshot by keeping its head and tail.
///
/// Detection: the text contains "page snapshot" (case-insensitive) or "页面快照",
/// or has more than 30 `ref=` occurrences.
///
/// Strategy: the target size is `min(max_chars, SNAPSHOT_MAX_CHARS)`. After
/// reserving room for the summary header, `SNAPSHOT_HEAD_RATIO` of the budget
/// goes to the head (clamped to 500..=10_000 bytes) and the rest to the tail
/// (at most 3_000 bytes); everything in between is replaced by an "omitted"
/// marker. Both cut points are snapped to UTF-8 character boundaries so that
/// multi-byte text (e.g. CJK pages) cannot cause a slicing panic.
///
/// Returns `None` when the text is not a snapshot, already fits the target, or
/// the target is too small (< 2000 bytes, or < 1000 bytes after overhead).
fn compact_browser_snapshot(text: &str, max_chars: usize) -> Option<String> {
    // Every "[ref=" occurrence is also a "ref=" occurrence, so a single count
    // covers both spellings.
    let is_snapshot = text.to_lowercase().contains("page snapshot")
        || text.contains("页面快照")
        || text.matches("ref=").count() > 30;
    if !is_snapshot {
        return None;
    }

    let desired_max = max_chars.min(SNAPSHOT_MAX_CHARS);
    if desired_max < 2000 || text.len() <= desired_max {
        return None;
    }

    let meta = format!(
        "[page snapshot summarized to reduce prompt size; original {} chars]",
        text.len()
    );
    // 200 bytes of slack for the HEAD/TAIL/omitted separator lines.
    let overhead = meta.len() + 200;
    let budget = desired_max.saturating_sub(overhead);
    if budget < 1000 {
        return None;
    }

    // Nominal head and tail sizes in bytes.
    let head_len = ((budget as f64 * SNAPSHOT_HEAD_RATIO).floor() as usize).clamp(500, 10_000);
    let tail_len = budget.saturating_sub(head_len).min(3_000);

    // Snap the head cut backwards to a char boundary (offset 0 always is one).
    let mut head_end = head_len.min(text.len());
    while !text.is_char_boundary(head_end) {
        head_end -= 1;
    }
    // Snap the tail cut forwards to a char boundary (text.len() always is one),
    // and never let the tail overlap the head.
    let mut tail_start = text.len().saturating_sub(tail_len).max(head_end);
    while !text.is_char_boundary(tail_start) {
        tail_start += 1;
    }

    let head = &text[..head_end];
    let tail = &text[tail_start..];
    // Number of bytes actually dropped between head and tail.
    let omitted = tail_start - head_end;

    let summarized = if tail.is_empty() {
        format!("{}\n---[HEAD]---\n{}\n---[...omitted {} chars]---", meta, head, omitted)
    } else {
        format!(
            "{}\n---[HEAD]---\n{}\n---[...omitted {} chars]---\n---[TAIL]---\n{}",
            meta, head, omitted, tail
        )
    };
    Some(truncate_text_safe(&summarized, max_chars))
}
/// Truncates `text` to at most `max_chars` bytes, avoiding cuts in awkward places.
///
/// Text that already fits is returned unchanged. Otherwise the cut point is:
/// - moved back to a UTF-8 character boundary, so multi-byte text never causes
///   a slicing panic;
/// - moved back to the start of an HTML/XML tag if the cut would land between
///   `<` and `>`;
/// - moved back to the last `{` if that brace is still open (a later `}` has not
///   been seen, but an earlier one has) and lies within 100 bytes of the cut.
///
/// A `\n...[truncated N chars]` notice (N = number of bytes dropped) is appended,
/// so the returned string can be slightly longer than `max_chars`.
fn truncate_text_safe(text: &str, max_chars: usize) -> String {
    if text.len() <= max_chars {
        return text.to_string();
    }

    // Largest char boundary not exceeding max_chars (offset 0 is always one).
    let mut limit = max_chars;
    while !text.is_char_boundary(limit) {
        limit -= 1;
    }
    let window = &text[..limit];
    let mut split_pos = limit;

    // Avoid cutting inside a tag: back off to the last '<' that has no '>' after it.
    if let Some(open) = window.rfind('<') {
        let unclosed = window.rfind('>').map_or(true, |close| open > close);
        if unclosed {
            split_pos = open;
        }
    }

    // Avoid cutting just inside a JSON object: back off to a nearby unclosed '{'.
    if let (Some(open), Some(close)) = (window.rfind('{'), window.rfind('}')) {
        if open > close && limit - open < 100 {
            split_pos = split_pos.min(open);
        }
    }

    let omitted = text.len() - split_pos;
    format!("{}\n...[truncated {} chars]", &text[..split_pos], omitted)
}
/// 深度清理 HTML (移除 style, script, base64 等)
fn deep_clean_html(html: &str) -> String {
let mut result = html.to_string();
// 1. 移除 <style>...</style> 及其内容
if let Ok(re) = Regex::new(r"(?is)<style\b[^>]*>.*?</style>") {
result = re.replace_all(&result, "[style omitted]").to_string();
}
// 2. 移除 <script>...</script> 及其内容
if let Ok(re) = Regex::new(r"(?is)<script\b[^>]*>.*?</script>") {
result = re.replace_all(&result, "[script omitted]").to_string();
}
// 3. 移除 inline Base64 数据 (如 src="data:image/png;base64,...")
if let Ok(re) = Regex::new(r#"(?i)data:[^;/]+/[^;]+;base64,[A-Za-z0-9+/=]+"#) {
result = re.replace_all(&result, "[base64 omitted]").to_string();
}
// 4. 移除冗余的空白字符
if let Ok(re) = Regex::new(r"\n\s*\n") {
result = re.replace_all(&result, "\n").to_string();
}
result
}
/// Sanitizes the content blocks of a tool result in place.
///
/// Processing:
/// 1. Blocks with a string `"text"` field have that text compacted with
///    `compact_tool_result_text`, using whatever is left of the overall budget.
/// 2. All other blocks (e.g. images) are kept as they are and counted as an
///    estimated 100 bytes each.
/// 3. Once the running total reaches `MAX_TOOL_RESULT_CHARS`, processing stops
///    and every remaining block (text or not) is dropped.
///
/// Sizes are byte lengths (`str::len`). Because a truncated text carries a short
/// truncation notice, the final total may slightly exceed the limit.
pub fn sanitize_tool_result_blocks(blocks: &mut Vec<Value>) {
    let original_count = blocks.len();
    if original_count > 0 {
        info!(
            "[ToolCompressor] Processing {} blocks for truncation (MAX: {} chars)",
            original_count,
            MAX_TOOL_RESULT_CHARS
        );
    }

    let mut used_chars = 0;
    let mut kept = Vec::with_capacity(original_count);

    // Take ownership of the blocks so each one can be edited in place instead of
    // deep-cloning it (including its possibly huge original text).
    for mut block in std::mem::take(blocks) {
        // `remaining` is always > 0 here: the loop exits as soon as the budget is spent.
        let remaining = MAX_TOOL_RESULT_CHARS.saturating_sub(used_chars);
        let compacted = block
            .get("text")
            .and_then(Value::as_str)
            .map(|text| (text.len(), compact_tool_result_text(text, remaining)));

        match compacted {
            Some((before, text)) => {
                let after = text.len();
                block["text"] = Value::String(text);
                used_chars += after;
                debug!(
                    "[ToolCompressor] Compacted text block: {} → {} chars",
                    before,
                    after
                );
            }
            None => {
                // Non-text blocks (e.g. images) are kept untouched; they only
                // count against the overall budget with a rough size estimate.
                used_chars += 100;
            }
        }
        kept.push(block);

        if used_chars >= MAX_TOOL_RESULT_CHARS {
            break;
        }
    }

    info!(
        "[ToolCompressor] Sanitization complete: {} → {} blocks, {} chars used",
        original_count,
        kept.len(),
        used_chars
    );
    *blocks = kept;
}
// Unit tests for the compaction helpers in this module.
#[cfg(test)]
mod tests {
use super::*;
// Plain ASCII longer than the limit is cut and gets a truncation notice.
#[test]
fn test_truncate_text() {
let text = "a".repeat(300_000);
let result = truncate_text_safe(&text, 200_000);
assert!(result.len() < 210_000); // includes the truncation notice
assert!(result.contains("[truncated"));
assert!(result.contains("100000 chars]"));
}
// Text shorter than the limit is returned unchanged.
#[test]
fn test_truncate_text_no_truncation() {
let text = "short text";
let result = truncate_text_safe(text, 1000);
assert_eq!(result, text);
}
// A large snapshot-like text is summarized as head + tail.
#[test]
fn test_compact_browser_snapshot() {
let snapshot = format!("page snapshot: {}", "ref=abc ".repeat(10_000));
let result = compact_tool_result_text(&snapshot, 16_000);
assert!(result.len() <= 16_500); // allow some overhead
assert!(result.contains("[HEAD]"));
assert!(result.contains("[TAIL]"));
assert!(result.contains("page snapshot summarized"));
}
// NOTE(review): the input below is shorter than 500 bytes, so this appears to
// take the "already fits" fast path of compact_tool_result_text rather than
// exercising compact_saved_output_notice itself - confirm this is intended.
#[test]
fn test_compact_saved_output_notice() {
let text = r#"result (150000 characters) exceeds maximum allowed tokens. Output has been saved to /tmp/output.txt
Format: JSON array with schema
Please read the file locally."#;
let result = compact_tool_result_text(text, 500);
println!("Result: {}", result);
assert!(result.contains("150000 characters") || result.contains("150,000 characters"));
assert!(result.contains("/tmp/output.txt"));
assert!(result.contains("[tool_result omitted") || result.len() <= 500);
}
// Mixed text/image blocks run through the sanitizer.
#[test]
fn test_sanitize_tool_result_blocks() {
let mut blocks = vec![
serde_json::json!({
"type": "text",
"text": "a".repeat(100_000)
}),
serde_json::json!({
"type": "text",
"text": "b".repeat(150_000)
}),
serde_json::json!({
"type": "image",
"source": {
"type": "base64",
"data": "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg=="
}
}),
serde_json::json!({
"type": "text",
"text": "some text"
}),
];
// Confirm that images are no longer stripped from tool results.
// NOTE(review): the two text blocks above total 250_000 bytes, which looks like
// it exhausts MAX_TOOL_RESULT_CHARS (200_000) after the second block; the
// sanitizer stops at that point and drops later blocks, so this may yield 2
// blocks rather than 4 - verify the expected count against the budget logic.
sanitize_tool_result_blocks(&mut blocks);
assert_eq!(blocks.len(), 4);
}
}
|