import { getConfig } from './config.js';
import type { AnthropicMessage, AnthropicContentBlock } from './types.js';
import { getVisionProxyFetchOptions } from './proxy-agent.js';
import { createWorker } from 'tesseract.js';
export async function applyVisionInterceptor(messages: AnthropicMessage[]): Promise<void> {
const config = getConfig();
if (!config.vision?.enabled) return;
// ★ 仅处理最后一条 user 消息中的图片
// 历史消息的图片已在前几轮被转换为文本描述,无需重复处理
// 这避免了多轮对话中重复消耗 Vision API 配额和增加延迟
let lastUserMsg: AnthropicMessage | null = null;
for (let i = messages.length - 1; i >= 0; i--) {
if (messages[i].role === 'user') {
lastUserMsg = messages[i];
break;
}
}
if (!lastUserMsg || !Array.isArray(lastUserMsg.content)) return;
let hasImages = false;
const newContent: AnthropicContentBlock[] = [];
const imagesToAnalyze: AnthropicContentBlock[] = [];
for (const block of lastUserMsg.content) {
if (block.type === 'image') {
// ★ 跳过 SVG 矢量图 — tesseract.js 无法处理 SVG,会导致进程崩溃 (#69)
const mediaType = (block as any).source?.media_type || '';
if (mediaType === 'image/svg+xml') {
console.log('[Vision] ⚠️ 跳过 SVG 矢量图(不支持 OCR/Vision 处理)');
newContent.push({
type: 'text',
text: '[SVG vector image was attached but cannot be processed by OCR/Vision. It likely contains a logo, icon, badge, or diagram.]',
});
continue;
}
hasImages = true;
imagesToAnalyze.push(block);
} else {
newContent.push(block);
}
}
if (hasImages && imagesToAnalyze.length > 0) {
try {
let descriptions = '';
if (config.vision.mode === 'ocr') {
descriptions = await processWithLocalOCR(imagesToAnalyze);
} else {
descriptions = await callVisionAPI(imagesToAnalyze);
}
// Add descriptions as a simulated system text block
newContent.push({
type: 'text',
text: `\n\n[System: The user attached ${imagesToAnalyze.length} image(s). Visual analysis/OCR extracted the following context:\n${descriptions}]\n\n`
});
lastUserMsg.content = newContent;
} catch (e) {
console.error("[Vision API Error]", e);
newContent.push({
type: 'text',
text: `\n\n[System: The user attached image(s), but the Vision interceptor failed to process them. Error: ${(e as Error).message}]\n\n`
});
lastUserMsg.content = newContent;
}
}
}
// Image media types that OCR cannot handle (vector/animated formats).
// Feeding SVG to tesseract.js crashes the worker process — see #69.
const UNSUPPORTED_OCR_TYPES = new Set(['image/svg+xml']);
/**
 * Runs local OCR (tesseract.js, English + Simplified Chinese) over each image
 * block and returns the concatenated per-image text sections. Images that are
 * unsupported, unparseable, or yield no text get an explanatory placeholder
 * section instead; the function itself never throws for a single bad image.
 */
async function processWithLocalOCR(imageBlocks: AnthropicContentBlock[]): Promise<string> {
  const worker = await createWorker('eng+chi_sim');
  let combinedText = '';
  try {
    for (let i = 0; i < imageBlocks.length; i++) {
      const img = imageBlocks[i];
      let imageSource: string | Buffer = '';
      if (img.type === 'image' && img.source) {
        // Defensive check: skip formats OCR cannot handle (#69 — SVG crashes tesseract).
        if (UNSUPPORTED_OCR_TYPES.has(img.source.media_type || '')) {
          combinedText += `--- Image ${i + 1} ---\n(Skipped: ${img.source.media_type} format is not supported by OCR)\n\n`;
          continue;
        }
        const sourceData = img.source.data || img.source.url;
        if (img.source.type === 'base64' && sourceData) {
          const mime = img.source.media_type || 'image/jpeg';
          imageSource = `data:${mime};base64,${sourceData}`;
        } else if (img.source.type === 'url' && sourceData) {
          imageSource = sourceData;
        }
      }
      if (imageSource) {
        try {
          const { data: { text } } = await worker.recognize(imageSource);
          combinedText += `--- Image ${i + 1} OCR Text ---\n${text.trim() || '(No text detected in this image)'}\n\n`;
        } catch (err) {
          console.error(`[Vision OCR] Failed to parse image ${i + 1}:`, err);
          combinedText += `--- Image ${i + 1} ---\n(Failed to parse image with local OCR)\n\n`;
        }
      }
    }
  } finally {
    // BUGFIX: terminate was previously not in a finally block, so any error
    // escaping the loop leaked the tesseract worker's child process. Always
    // release the worker.
    await worker.terminate();
  }
  return combinedText;
}
/**
 * Sends the image blocks to an OpenAI-compatible vision endpoint (configured
 * via `vision.baseUrl` / `vision.apiKey` / `vision.model`) and returns the
 * model's textual description of the images.
 *
 * Throws when the endpoint responds with a non-2xx status; the error message
 * carries the status code and response body for diagnostics.
 */
async function callVisionAPI(imageBlocks: AnthropicContentBlock[]): Promise<string> {
  const config = getConfig().vision!;
  // OpenAI-style multimodal content: one leading instruction text part,
  // then an image_url part per usable image block.
  const parts: any[] = [
    { type: 'text', text: 'Please describe the attached images in detail. If they contain code, UI elements, or error messages, explicitly write them out.' }
  ];
  for (const img of imageBlocks) {
    if (img.type !== 'image' || !img.source) continue;
    const payloadData = img.source.data || img.source.url;
    if (!payloadData) continue;
    let imageUrl = '';
    if (img.source.type === 'base64') {
      // Embed raw base64 payloads as a data URI, defaulting to JPEG.
      imageUrl = `data:${img.source.media_type || 'image/jpeg'};base64,${payloadData}`;
    } else if (img.source.type === 'url') {
      // Remote URLs mapped straight through from OpenAI/Anthropic payloads.
      imageUrl = payloadData;
    }
    if (imageUrl) {
      parts.push({ type: 'image_url', image_url: { url: imageUrl } });
    }
  }
  const response = await fetch(config.baseUrl, {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
      'Authorization': `Bearer ${config.apiKey}`
    },
    body: JSON.stringify({
      model: config.model,
      messages: [{ role: 'user', content: parts }],
      max_tokens: 1500
    }),
    // Optional proxy routing for outbound vision requests.
    ...getVisionProxyFetchOptions(),
  } as any);
  if (!response.ok) {
    throw new Error(`Vision API returned status ${response.status}: ${await response.text()}`);
  }
  const data = await response.json() as any;
  return data.choices?.[0]?.message?.content || 'No description returned.';
}