| import { getConfig } from './config.js';
|
| import type { AnthropicMessage, AnthropicContentBlock, VisionProvider } from './types.js';
|
| import { getProxyFetchOptions } from './proxy-agent.js';
|
| import { createWorker } from 'tesseract.js';
|
| import crypto from 'crypto';
|
|
|
|
|
|
|
| const imageParsingCache = new Map<string, string>();
|
| const MAX_CACHE_SIZE = 100;
|
|
|
| function setCache(hash: string, text: string) {
|
| if (imageParsingCache.size >= MAX_CACHE_SIZE) {
|
|
|
| const firstKey = imageParsingCache.keys().next().value;
|
| if (firstKey) {
|
| imageParsingCache.delete(firstKey);
|
| }
|
| }
|
| imageParsingCache.set(hash, text);
|
| }
|
|
|
| function getImageHash(imageSource: string): string {
|
| return crypto.createHash('sha256').update(imageSource).digest('hex');
|
| }
|
|
|
| export async function applyVisionInterceptor(messages: AnthropicMessage[]): Promise<void> {
|
| const config = getConfig();
|
| if (!config.vision?.enabled) return;
|
|
|
| for (const msg of messages) {
|
| if (!Array.isArray(msg.content)) continue;
|
|
|
| let hasImages = false;
|
| const newContent: AnthropicContentBlock[] = [];
|
| const imagesToAnalyze: AnthropicContentBlock[] = [];
|
|
|
| for (const block of msg.content) {
|
| if (block.type === 'image') {
|
| hasImages = true;
|
| imagesToAnalyze.push(block);
|
| } else {
|
| newContent.push(block);
|
| }
|
| }
|
|
|
| if (hasImages && imagesToAnalyze.length > 0) {
|
| try {
|
| let descriptions = '';
|
| if (config.vision.mode === 'ocr') {
|
| console.log(`[Vision] 启用纯本地 OCR 模式,正在处理 ${imagesToAnalyze.length} 张图片... (无需 API Key)`);
|
| descriptions = await processWithLocalOCR(imagesToAnalyze);
|
| } else {
|
|
|
| descriptions = await processWithAPIFallback(imagesToAnalyze);
|
| }
|
|
|
|
|
| newContent.push({
|
| type: 'text',
|
| text: `\n\n[System: The user attached ${imagesToAnalyze.length} image(s). Visual analysis/OCR extracted the following context:\n${descriptions}]\n\n`
|
| });
|
|
|
| msg.content = newContent;
|
| } catch (e) {
|
| console.error("[Vision API Error]", e);
|
| newContent.push({
|
| type: 'text',
|
| text: `\n\n[System: The user attached image(s), but the Vision interceptor failed to process them. Error: ${(e as Error).message}]\n\n`
|
| });
|
| msg.content = newContent;
|
| }
|
| }
|
| }
|
| }
|
|
|
// ---------------------------------------------------------------------------
// External vision-API fallback chain
// ---------------------------------------------------------------------------
|
|
| async function processWithAPIFallback(imagesToAnalyze: AnthropicContentBlock[]): Promise<string> {
|
| const visionConfig = getConfig().vision!;
|
| const providers = visionConfig.providers;
|
| const errors: string[] = [];
|
|
|
|
|
| if (providers.length > 0) {
|
| for (let i = 0; i < providers.length; i++) {
|
| const provider = providers[i];
|
| const providerLabel = provider.name || `Provider #${i + 1} (${provider.model})`;
|
| try {
|
| console.log(`[Vision] 尝试 API ${providerLabel},正在处理 ${imagesToAnalyze.length} 张图片...`);
|
| const result = await callVisionAPIWithProvider(imagesToAnalyze, provider);
|
| console.log(`[Vision] ✅ ${providerLabel} 处理成功`);
|
| return result;
|
| } catch (err) {
|
| const errMsg = (err as Error).message;
|
| console.warn(`[Vision] ❌ ${providerLabel} 失败: ${errMsg}`);
|
| errors.push(`${providerLabel}: ${errMsg}`);
|
|
|
| }
|
| }
|
| } else if (visionConfig.baseUrl && visionConfig.apiKey) {
|
|
|
| const legacyProvider: VisionProvider = {
|
| name: 'default',
|
| baseUrl: visionConfig.baseUrl,
|
| apiKey: visionConfig.apiKey,
|
| model: visionConfig.model,
|
| };
|
| try {
|
| console.log(`[Vision] 启用外部 API 模式,正在处理 ${imagesToAnalyze.length} 张图片...`);
|
| const result = await callVisionAPIWithProvider(imagesToAnalyze, legacyProvider);
|
| return result;
|
| } catch (err) {
|
| const errMsg = (err as Error).message;
|
| console.warn(`[Vision] ❌ API 调用失败: ${errMsg}`);
|
| errors.push(`default: ${errMsg}`);
|
| }
|
| }
|
|
|
|
|
| if (visionConfig.fallbackToOcr) {
|
| console.log(`[Vision] 所有 API 均失败 (${errors.length} 个错误),兜底使用本地 OCR...`);
|
| try {
|
| return await processWithLocalOCR(imagesToAnalyze);
|
| } catch (ocrErr) {
|
| throw new Error(
|
| `All ${errors.length} API provider(s) failed AND local OCR fallback also failed. ` +
|
| `API errors: [${errors.join(' | ')}]. OCR error: ${(ocrErr as Error).message}`
|
| );
|
| }
|
| }
|
|
|
|
|
| throw new Error(
|
| `All ${errors.length} API provider(s) failed and fallback_to_ocr is disabled. ` +
|
| `Errors: [${errors.join(' | ')}]`
|
| );
|
| }
|
|
|
| async function processWithLocalOCR(imageBlocks: AnthropicContentBlock[]): Promise<string> {
|
| let combinedText = '';
|
| const imagesToProcess: { index: number, source: string, hash: string }[] = [];
|
|
|
|
|
| for (let i = 0; i < imageBlocks.length; i++) {
|
| const img = imageBlocks[i];
|
| let imageSource: string = '';
|
|
|
| if (img.type === 'image' && img.source?.data) {
|
| if (img.source.type === 'base64') {
|
| const mime = img.source.media_type || 'image/jpeg';
|
| imageSource = `data:${mime};base64,${img.source.data}`;
|
| } else if (img.source.type === 'url') {
|
| imageSource = img.source.data;
|
| }
|
| }
|
|
|
| if (imageSource) {
|
| const hash = getImageHash(imageSource);
|
| if (imageParsingCache.has(hash)) {
|
| console.log(`[Vision] Image ${i + 1} found in cache, skipping OCR.`);
|
| combinedText += `--- Image ${i + 1} OCR Text ---\n${imageParsingCache.get(hash)}\n\n`;
|
| } else {
|
| imagesToProcess.push({ index: i, source: imageSource, hash });
|
| }
|
| }
|
| }
|
|
|
| if (imagesToProcess.length > 0) {
|
| const worker = await createWorker('eng+chi_sim');
|
|
|
| for (const { index, source, hash } of imagesToProcess) {
|
| try {
|
| const { data: { text } } = await worker.recognize(source);
|
| const extractedText = text.trim() || '(No text detected in this image)';
|
| setCache(hash, extractedText);
|
| combinedText += `--- Image ${index + 1} OCR Text ---\n${extractedText}\n\n`;
|
| } catch (err) {
|
| console.error(`[Vision OCR] Failed to parse image ${index + 1}:`, err);
|
| combinedText += `--- Image ${index + 1} ---\n(Failed to parse image with local OCR)\n\n`;
|
| }
|
| }
|
| await worker.terminate();
|
| }
|
|
|
| return combinedText;
|
| }
|
|
|
// ---------------------------------------------------------------------------
// Single-provider vision API call (OpenAI-compatible chat completions)
// ---------------------------------------------------------------------------
|
|
| async function callVisionAPIWithProvider(imageBlocks: AnthropicContentBlock[], provider: VisionProvider): Promise<string> {
|
| let combinedText = '';
|
| let hasAnyFailure = false;
|
|
|
| for (let i = 0; i < imageBlocks.length; i++) {
|
| const img = imageBlocks[i];
|
| let url = '';
|
|
|
| if (img.type === 'image' && img.source?.data) {
|
| if (img.source.type === 'base64') {
|
| const mime = img.source.media_type || 'image/jpeg';
|
| url = `data:${mime};base64,${img.source.data}`;
|
| } else if (img.source.type === 'url') {
|
| url = img.source.data;
|
| }
|
| }
|
|
|
| if (url) {
|
| const hash = getImageHash(url);
|
| if (imageParsingCache.has(hash)) {
|
| console.log(`[Vision] Image ${i + 1} found in cache, skipping API call.`);
|
| combinedText += `--- Image ${i + 1} Description ---\n${imageParsingCache.get(hash)}\n\n`;
|
| continue;
|
| }
|
|
|
| const parts = [
|
| { type: 'text', text: 'Please describe this image in detail. If it contains code, UI elements, or error messages, explicitly write them out.' },
|
| { type: 'image_url', image_url: { url } }
|
| ];
|
|
|
| const payload = {
|
| model: provider.model,
|
| messages: [{ role: 'user', content: parts }],
|
| max_tokens: 1500
|
| };
|
|
|
| const res = await fetch(provider.baseUrl, {
|
| method: 'POST',
|
| headers: {
|
| 'Content-Type': 'application/json',
|
| 'Authorization': `Bearer ${provider.apiKey}`
|
| },
|
| body: JSON.stringify(payload),
|
| ...getProxyFetchOptions(),
|
| } as any);
|
|
|
| if (!res.ok) {
|
| const errBody = await res.text();
|
| throw new Error(`API returned status ${res.status}: ${errBody}`);
|
| }
|
|
|
| const data = await res.json() as any;
|
| const description = data.choices?.[0]?.message?.content || 'No description returned.';
|
|
|
| setCache(hash, description);
|
| combinedText += `--- Image ${i + 1} Description ---\n${description}\n\n`;
|
| }
|
| }
|
|
|
| return combinedText;
|
| }
|
|
|