import { getConfig } from './config.js';
import type { AnthropicMessage, AnthropicContentBlock, VisionProvider } from './types.js';
import { getProxyFetchOptions } from './proxy-agent.js';
import { createWorker } from 'tesseract.js';
import crypto from 'crypto';
// Global cache for image parsing results.
// Key: SHA-256 hash of the image data string, Value: extracted text/description.
const imageParsingCache = new Map<string, string>();
const MAX_CACHE_SIZE = 100;
/**
 * Store an OCR/vision result in the bounded cache, evicting the oldest
 * entry (FIFO — Map preserves insertion order) when the cache is full.
 */
function setCache(hash: string, text: string) {
// Only evict when inserting a NEW key: overwriting an existing key does not
// grow the cache, so evicting in that case would needlessly drop another entry.
if (!imageParsingCache.has(hash) && imageParsingCache.size >= MAX_CACHE_SIZE) {
const oldestKey = imageParsingCache.keys().next().value;
// Explicit undefined check: a falsy-but-valid key (e.g. '') must still be evictable.
if (oldestKey !== undefined) {
imageParsingCache.delete(oldestKey);
}
}
imageParsingCache.set(hash, text);
}
/** Stable cache key for an image: hex-encoded SHA-256 of its source string. */
function getImageHash(imageSource: string): string {
const hasher = crypto.createHash('sha256');
hasher.update(imageSource);
return hasher.digest('hex');
}
/**
 * Rewrites each message in place: image blocks are stripped out and replaced
 * with a single text block holding an OCR / vision-API description of the
 * images. No-op unless vision support is enabled in config. On failure, an
 * explanatory text block is inserted instead of the descriptions.
 */
export async function applyVisionInterceptor(messages: AnthropicMessage[]): Promise<void> {
const config = getConfig();
if (!config.vision?.enabled) return;
for (const msg of messages) {
if (!Array.isArray(msg.content)) continue;
// Partition content into image blocks (to analyze) and everything else (kept).
const imagesToAnalyze = msg.content.filter((b) => b.type === 'image');
const newContent: AnthropicContentBlock[] = msg.content.filter((b) => b.type !== 'image');
if (imagesToAnalyze.length === 0) continue;
try {
let descriptions: string;
if (config.vision.mode === 'ocr') {
console.log(`[Vision] 启用纯本地 OCR 模式,正在处理 ${imagesToAnalyze.length} 张图片... (无需 API Key)`);
descriptions = await processWithLocalOCR(imagesToAnalyze);
} else {
// API mode: try configured providers in order, with fallback.
descriptions = await processWithAPIFallback(imagesToAnalyze);
}
// Surface the extracted context as a simulated system text block.
newContent.push({
type: 'text',
text: `\n\n[System: The user attached ${imagesToAnalyze.length} image(s). Visual analysis/OCR extracted the following context:\n${descriptions}]\n\n`
});
} catch (e) {
console.error("[Vision API Error]", e);
newContent.push({
type: 'text',
text: `\n\n[System: The user attached image(s), but the Vision interceptor failed to process them. Error: ${(e as Error).message}]\n\n`
});
}
msg.content = newContent;
}
}
/**
 * Try each configured API provider in order; first success wins. If every
 * provider fails and fallbackToOcr is enabled, fall back to local OCR as the
 * last resort. Throws with an aggregated error summary when nothing succeeds.
 */
async function processWithAPIFallback(imagesToAnalyze: AnthropicContentBlock[]): Promise<string> {
const visionConfig = getConfig().vision!;
const providers = visionConfig.providers;
const errors: string[] = [];
if (providers.length > 0) {
// Try configured providers in declaration order.
let idx = 0;
for (const provider of providers) {
idx += 1;
const providerLabel = provider.name || `Provider #${idx} (${provider.model})`;
try {
console.log(`[Vision] 尝试 API ${providerLabel},正在处理 ${imagesToAnalyze.length} 张图片...`);
const result = await callVisionAPIWithProvider(imagesToAnalyze, provider);
console.log(`[Vision] ✅ ${providerLabel} 处理成功`);
return result;
} catch (err) {
const reason = (err as Error).message;
console.warn(`[Vision] ❌ ${providerLabel} 失败: ${reason}`);
errors.push(`${providerLabel}: ${reason}`);
// Fall through to the next provider.
}
}
} else if (visionConfig.baseUrl && visionConfig.apiKey) {
// Legacy configuration: a single provider described by top-level fields.
const legacyProvider: VisionProvider = {
name: 'default',
baseUrl: visionConfig.baseUrl,
apiKey: visionConfig.apiKey,
model: visionConfig.model,
};
try {
console.log(`[Vision] 启用外部 API 模式,正在处理 ${imagesToAnalyze.length} 张图片...`);
return await callVisionAPIWithProvider(imagesToAnalyze, legacyProvider);
} catch (err) {
const reason = (err as Error).message;
console.warn(`[Vision] ❌ API 调用失败: ${reason}`);
errors.push(`default: ${reason}`);
}
}
// Every API attempt failed (or none were configured) — OCR is the last resort.
if (visionConfig.fallbackToOcr) {
console.log(`[Vision] 所有 API 均失败 (${errors.length} 个错误),兜底使用本地 OCR...`);
try {
return await processWithLocalOCR(imagesToAnalyze);
} catch (ocrErr) {
throw new Error(
`All ${errors.length} API provider(s) failed AND local OCR fallback also failed. ` +
`API errors: [${errors.join(' | ')}]. OCR error: ${(ocrErr as Error).message}`
);
}
}
// fallbackToOcr is disabled and all providers failed.
throw new Error(
`All ${errors.length} API provider(s) failed and fallback_to_ocr is disabled. ` +
`Errors: [${errors.join(' | ')}]`
);
}
/**
 * Run Tesseract OCR locally over the given image blocks and return the
 * combined extracted text, one labelled section per image. Results are
 * cached by image hash so repeated images skip the (slow) OCR pass.
 * Per-image failures are reported inline rather than aborting the batch.
 */
async function processWithLocalOCR(imageBlocks: AnthropicContentBlock[]): Promise<string> {
let combinedText = '';
const imagesToProcess: { index: number, source: string, hash: string }[] = [];
// First pass: normalize each block into a data/HTTP URL and serve cache
// hits immediately; only cache misses are sent to the OCR worker.
for (let i = 0; i < imageBlocks.length; i++) {
const img = imageBlocks[i];
let imageSource: string = '';
if (img.type === 'image' && img.source?.data) {
if (img.source.type === 'base64') {
const mime = img.source.media_type || 'image/jpeg';
imageSource = `data:${mime};base64,${img.source.data}`;
} else if (img.source.type === 'url') {
imageSource = img.source.data;
}
}
if (imageSource) {
const hash = getImageHash(imageSource);
if (imageParsingCache.has(hash)) {
console.log(`[Vision] Image ${i + 1} found in cache, skipping OCR.`);
combinedText += `--- Image ${i + 1} OCR Text ---\n${imageParsingCache.get(hash)}\n\n`;
} else {
imagesToProcess.push({ index: i, source: imageSource, hash });
}
}
}
if (imagesToProcess.length > 0) {
const worker = await createWorker('eng+chi_sim');
try {
for (const { index, source, hash } of imagesToProcess) {
try {
const { data: { text } } = await worker.recognize(source);
const extractedText = text.trim() || '(No text detected in this image)';
setCache(hash, extractedText);
combinedText += `--- Image ${index + 1} OCR Text ---\n${extractedText}\n\n`;
} catch (err) {
console.error(`[Vision OCR] Failed to parse image ${index + 1}:`, err);
combinedText += `--- Image ${index + 1} ---\n(Failed to parse image with local OCR)\n\n`;
}
}
} finally {
// Always release the worker process, even if something throws
// unexpectedly — otherwise the Tesseract worker leaks.
await worker.terminate();
}
}
return combinedText;
}
/**
 * Call a specific Vision API provider (OpenAI-compatible chat-completions
 * endpoint) to describe each image. Images are processed individually so
 * results can be cached per image; cached images skip the network call.
 * Throws on any HTTP failure so the caller can try the next provider.
 */
async function callVisionAPIWithProvider(imageBlocks: AnthropicContentBlock[], provider: VisionProvider): Promise<string> {
let combinedText = '';
for (let i = 0; i < imageBlocks.length; i++) {
const img = imageBlocks[i];
// Normalize the block into a URL the API accepts: a data URL for base64
// payloads, or the raw URL for url-type sources.
let url = '';
if (img.type === 'image' && img.source?.data) {
if (img.source.type === 'base64') {
const mime = img.source.media_type || 'image/jpeg';
url = `data:${mime};base64,${img.source.data}`;
} else if (img.source.type === 'url') {
url = img.source.data;
}
}
// Skip blocks we could not convert into a usable URL.
if (!url) continue;
const hash = getImageHash(url);
if (imageParsingCache.has(hash)) {
console.log(`[Vision] Image ${i + 1} found in cache, skipping API call.`);
combinedText += `--- Image ${i + 1} Description ---\n${imageParsingCache.get(hash)}\n\n`;
continue;
}
const parts = [
{ type: 'text', text: 'Please describe this image in detail. If it contains code, UI elements, or error messages, explicitly write them out.' },
{ type: 'image_url', image_url: { url } }
];
const payload = {
model: provider.model,
messages: [{ role: 'user', content: parts }],
max_tokens: 1500
};
const res = await fetch(provider.baseUrl, {
method: 'POST',
headers: {
'Content-Type': 'application/json',
'Authorization': `Bearer ${provider.apiKey}`
},
body: JSON.stringify(payload),
...getProxyFetchOptions(),
} as any);
if (!res.ok) {
const errBody = await res.text();
throw new Error(`API returned status ${res.status}: ${errBody}`);
}
const data = await res.json() as any;
const description = data.choices?.[0]?.message?.content || 'No description returned.';
setCache(hash, description);
combinedText += `--- Image ${i + 1} Description ---\n${description}\n\n`;
}
return combinedText;
}