import { getConfig } from './config.js';
import type { AnthropicMessage, AnthropicContentBlock, VisionProvider } from './types.js';
import { getProxyFetchOptions } from './proxy-agent.js';
import { createWorker } from 'tesseract.js';
import crypto from 'crypto';

/**
 * Process-wide cache of image analysis results.
 * Key: SHA-256 hex digest of the image source string (data URI or URL).
 * Value: text extracted for that image (local OCR output or API description).
 */
const imageParsingCache = new Map<string, string>();
const MAX_CACHE_SIZE = 100;

/** Minimal shape of the chat-completions style response read by this module. */
interface VisionCompletionResponse {
  choices?: { message?: { content?: string } }[];
}

/**
 * Store an analysis result, evicting the oldest entry when the cache is full.
 *
 * @param hash Cache key produced by {@link getImageHash}.
 * @param text Extracted text to remember for that image.
 */
function setCache(hash: string, text: string): void {
  // Overwriting an existing key does not grow the map, so only evict when a
  // genuinely new key would push the cache past its limit.
  if (!imageParsingCache.has(hash) && imageParsingCache.size >= MAX_CACHE_SIZE) {
    // Map iterates in insertion order, so the first key is the oldest one.
    const oldestKey = imageParsingCache.keys().next().value;
    if (oldestKey !== undefined) {
      imageParsingCache.delete(oldestKey);
    }
  }
  imageParsingCache.set(hash, text);
}

/** Return the SHA-256 hex digest of an image source string. */
function getImageHash(imageSource: string): string {
  return crypto.createHash('sha256').update(imageSource).digest('hex');
}

/** Extract a printable message from anything that may have been thrown. */
function errorMessage(err: unknown): string {
  return err instanceof Error ? err.message : String(err);
}

/**
 * Turn an image content block into the string handed to OCR / the vision API.
 *
 * @returns A `data:` URI for base64 sources, the raw `data` value for `url`
 *          sources, or an empty string when the block is not a usable image.
 */
function resolveImageSource(block: AnthropicContentBlock): string {
  if (block.type === 'image' && block.source?.data) {
    if (block.source.type === 'base64') {
      const mime = block.source.media_type || 'image/jpeg';
      return `data:${mime};base64,${block.source.data}`;
    } else if (block.source.type === 'url') {
      return block.source.data;
    }
  }
  return '';
}

/**
 * Replace image blocks in each message with a text block describing them.
 *
 * Messages are mutated in place: for every message whose content is an array
 * containing image blocks, the images are removed and one text block holding
 * the OCR/API output (or an error note when processing failed) is appended.
 * Does nothing when vision is not enabled in the config.
 *
 * @param messages Messages to rewrite in place.
 */
export async function applyVisionInterceptor(messages: AnthropicMessage[]): Promise<void> {
  const config = getConfig();
  if (!config.vision?.enabled) return;

  for (const msg of messages) {
    if (!Array.isArray(msg.content)) continue;

    const newContent: AnthropicContentBlock[] = [];
    const imagesToAnalyze: AnthropicContentBlock[] = [];
    for (const block of msg.content) {
      if (block.type === 'image') {
        imagesToAnalyze.push(block);
      } else {
        newContent.push(block);
      }
    }

    // Messages without images are left untouched.
    if (imagesToAnalyze.length === 0) continue;

    try {
      let descriptions = '';
      if (config.vision.mode === 'ocr') {
        console.log(`[Vision] 启用纯本地 OCR 模式,正在处理 ${imagesToAnalyze.length} 张图片... (无需 API Key)`);
        descriptions = await processWithLocalOCR(imagesToAnalyze);
      } else {
        // API mode: try providers in order with fallback
        descriptions = await processWithAPIFallback(imagesToAnalyze);
      }

      // Add descriptions as a simulated system text block
      newContent.push({
        type: 'text',
        text: `\n\n[System: The user attached ${imagesToAnalyze.length} image(s). Visual analysis/OCR extracted the following context:\n${descriptions}]\n\n`
      });
    } catch (e) {
      console.error("[Vision API Error]", e);
      newContent.push({
        type: 'text',
        text: `\n\n[System: The user attached image(s), but the Vision interceptor failed to process them. Error: ${errorMessage(e)}]\n\n`
      });
    }
    // Either way the image blocks are dropped from the message.
    msg.content = newContent;
  }
}

/**
 * Try each API provider in order. If all fail and fallbackToOcr is enabled,
 * fall back to local OCR as the last resort.
 *
 * @param imagesToAnalyze Image blocks to describe.
 * @returns The combined description text of the first provider that succeeds,
 *          or the local OCR text when falling back.
 * @throws Error when no provider succeeds and the OCR fallback is disabled or
 *         fails as well.
 */
async function processWithAPIFallback(imagesToAnalyze: AnthropicContentBlock[]): Promise<string> {
  const visionConfig = getConfig().vision!;
  // Tolerate a config without a providers list instead of crashing on `.length`.
  const providers = visionConfig.providers ?? [];
  const errors: string[] = [];

  // If we have providers, try them in order
  if (providers.length > 0) {
    for (const [i, provider] of providers.entries()) {
      const providerLabel = provider.name || `Provider #${i + 1} (${provider.model})`;
      try {
        console.log(`[Vision] 尝试 API ${providerLabel},正在处理 ${imagesToAnalyze.length} 张图片...`);
        const result = await callVisionAPIWithProvider(imagesToAnalyze, provider);
        console.log(`[Vision] ✅ ${providerLabel} 处理成功`);
        return result;
      } catch (err) {
        const errMsg = errorMessage(err);
        console.warn(`[Vision] ❌ ${providerLabel} 失败: ${errMsg}`);
        errors.push(`${providerLabel}: ${errMsg}`);
        // Continue to next provider
      }
    }
  } else if (visionConfig.baseUrl && visionConfig.apiKey) {
    // Legacy fallback: single provider from top-level fields
    const legacyProvider: VisionProvider = {
      name: 'default',
      baseUrl: visionConfig.baseUrl,
      apiKey: visionConfig.apiKey,
      model: visionConfig.model,
    };
    try {
      console.log(`[Vision] 启用外部 API 模式,正在处理 ${imagesToAnalyze.length} 张图片...`);
      return await callVisionAPIWithProvider(imagesToAnalyze, legacyProvider);
    } catch (err) {
      const errMsg = errorMessage(err);
      console.warn(`[Vision] ❌ API 调用失败: ${errMsg}`);
      errors.push(`default: ${errMsg}`);
    }
  }

  // All API providers failed — try OCR fallback
  if (visionConfig.fallbackToOcr) {
    console.log(`[Vision] 所有 API 均失败 (${errors.length} 个错误),兜底使用本地 OCR...`);
    try {
      return await processWithLocalOCR(imagesToAnalyze);
    } catch (ocrErr) {
      throw new Error(
        `All ${errors.length} API provider(s) failed AND local OCR fallback also failed. ` +
        `API errors: [${errors.join(' | ')}]. OCR error: ${errorMessage(ocrErr)}`
      );
    }
  }

  // Nothing was attempted at all: report the configuration problem rather
  // than a confusing "All 0 API provider(s) failed".
  if (errors.length === 0) {
    throw new Error('No vision API provider is configured and fallback_to_ocr is disabled.');
  }

  // fallbackToOcr is disabled and all providers failed
  throw new Error(
    `All ${errors.length} API provider(s) failed and fallback_to_ocr is disabled. ` +
    `Errors: [${errors.join(' | ')}]`
  );
}

/**
 * Extract text from images with a local tesseract.js worker.
 *
 * Cached results are reused; only uncached images are sent to the worker.
 * A per-image OCR failure is logged and reported inline instead of aborting
 * the batch. Blocks that yield no usable image source are skipped.
 *
 * @param imageBlocks Image blocks to run OCR on.
 * @returns One labelled section per processed image, in input order.
 */
async function processWithLocalOCR(imageBlocks: AnthropicContentBlock[]): Promise<string> {
  // One slot per input image so the output keeps the input order even when
  // cache hits and fresh OCR results are interleaved. Skipped images stay ''.
  const sections = new Array<string>(imageBlocks.length).fill('');
  const imagesToProcess: { index: number; source: string; hash: string }[] = [];

  // Check cache first
  for (const [i, img] of imageBlocks.entries()) {
    const imageSource = resolveImageSource(img);
    if (!imageSource) continue;

    const hash = getImageHash(imageSource);
    const cached = imageParsingCache.get(hash);
    if (cached !== undefined) {
      console.log(`[Vision] Image ${i + 1} found in cache, skipping OCR.`);
      sections[i] = `--- Image ${i + 1} OCR Text ---\n${cached}\n\n`;
    } else {
      imagesToProcess.push({ index: i, source: imageSource, hash });
    }
  }

  if (imagesToProcess.length > 0) {
    const worker = await createWorker('eng+chi_sim');
    try {
      for (const { index, source, hash } of imagesToProcess) {
        try {
          const { data: { text } } = await worker.recognize(source);
          const extractedText = text.trim() || '(No text detected in this image)';
          setCache(hash, extractedText);
          sections[index] = `--- Image ${index + 1} OCR Text ---\n${extractedText}\n\n`;
        } catch (err) {
          console.error(`[Vision OCR] Failed to parse image ${index + 1}:`, err);
          sections[index] = `--- Image ${index + 1} ---\n(Failed to parse image with local OCR)\n\n`;
        }
      }
    } finally {
      // Always release the worker, even if something unexpected throws above.
      await worker.terminate();
    }
  }

  return sections.join('');
}

/**
 * Call a specific Vision API provider for image analysis.
 * Processes images individually for per-image caching.
 * Throws on failure so the caller can try the next provider.
 *
 * @param imageBlocks Image blocks to describe; blocks without a usable image
 *                    source are skipped.
 * @param provider Endpoint, API key and model to use for the requests.
 * @returns One labelled description section per processed image.
 * @throws Error when the provider answers with a non-OK HTTP status; network
 *         and JSON parsing errors propagate unchanged.
 */
async function callVisionAPIWithProvider(imageBlocks: AnthropicContentBlock[], provider: VisionProvider): Promise<string> {
  let combinedText = '';

  for (const [i, img] of imageBlocks.entries()) {
    const url = resolveImageSource(img);
    if (!url) continue;

    const hash = getImageHash(url);
    const cached = imageParsingCache.get(hash);
    if (cached !== undefined) {
      console.log(`[Vision] Image ${i + 1} found in cache, skipping API call.`);
      combinedText += `--- Image ${i + 1} Description ---\n${cached}\n\n`;
      continue;
    }

    const parts = [
      { type: 'text', text: 'Please describe this image in detail. If it contains code, UI elements, or error messages, explicitly write them out.' },
      { type: 'image_url', image_url: { url } }
    ];

    const payload = {
      model: provider.model,
      messages: [{ role: 'user', content: parts }],
      max_tokens: 1500
    };

    // The proxy options are spread into the fetch init as-is; the cast is kept
    // from the original because their shape is not visible in this file.
    const res = await fetch(provider.baseUrl, {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        'Authorization': `Bearer ${provider.apiKey}`
      },
      body: JSON.stringify(payload),
      ...getProxyFetchOptions(),
    } as any);

    if (!res.ok) {
      const errBody = await res.text();
      throw new Error(`API returned status ${res.status}: ${errBody}`);
    }

    const data = await res.json() as VisionCompletionResponse;
    const content = data.choices?.[0]?.message?.content;
    const description = content || 'No description returned.';
    // Only cache real descriptions: caching the placeholder would pin an empty
    // (possibly transient) response for the lifetime of the process.
    if (content) {
      setCache(hash, description);
    }
    combinedText += `--- Image ${i + 1} Description ---\n${description}\n\n`;
  }

  return combinedText;
}