File size: 10,524 Bytes
5844451
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
import { getConfig } from './config.js';
import type { AnthropicMessage, AnthropicContentBlock, VisionProvider } from './types.js';
import { getProxyFetchOptions } from './proxy-agent.js';
import { createWorker } from 'tesseract.js';
import crypto from 'crypto';

// Global cache for image parsing results, shared by both the local-OCR and
// external-API paths (both key by the same image-source string).
// Key: SHA-256 hash of the image data string, Value: Extracted text
const imageParsingCache = new Map<string, string>();
// Cap on cached entries; the oldest insertion is evicted once the cap is hit.
const MAX_CACHE_SIZE = 100;

/**
 * Store an extracted-text result in the image cache with FIFO eviction.
 *
 * @param hash SHA-256 hex digest of the image source (see getImageHash).
 * @param text Extracted/描述 text to associate with the image.
 */
function setCache(hash: string, text: string) {
    // Re-setting an existing key does not grow the Map, so only evict when
    // this insert would actually push the cache past its cap. (The original
    // code evicted an unrelated entry even on a pure overwrite.)
    if (!imageParsingCache.has(hash) && imageParsingCache.size >= MAX_CACHE_SIZE) {
        // Maps iterate in insertion order, so the first key is the oldest.
        const firstKey = imageParsingCache.keys().next().value;
        if (firstKey !== undefined) {
            imageParsingCache.delete(firstKey);
        }
    }
    imageParsingCache.set(hash, text);
}

/**
 * Derive a stable cache key for an image payload.
 *
 * @param imageSource Data URL (base64) or remote URL identifying the image.
 * @returns Lowercase hex SHA-256 digest of the input string.
 */
function getImageHash(imageSource: string): string {
    const hasher = crypto.createHash('sha256');
    hasher.update(imageSource);
    return hasher.digest('hex');
}

/**
 * Strip image blocks out of the given Anthropic messages and replace them
 * with a synthesized text block containing OCR / vision-API output.
 * Mutates `messages` in place; does nothing when vision support is disabled.
 *
 * On processing failure the images are still removed, and a text block
 * explaining the failure is appended instead.
 */
export async function applyVisionInterceptor(messages: AnthropicMessage[]): Promise<void> {
    const config = getConfig();
    if (!config.vision?.enabled) return;

    for (const msg of messages) {
        if (!Array.isArray(msg.content)) continue;

        // Partition the message content: images go off for analysis,
        // everything else is retained verbatim.
        const imagesToAnalyze: AnthropicContentBlock[] = msg.content.filter(b => b.type === 'image');
        const retainedBlocks: AnthropicContentBlock[] = msg.content.filter(b => b.type !== 'image');

        if (imagesToAnalyze.length === 0) continue;

        try {
            let descriptions: string;
            if (config.vision.mode === 'ocr') {
                console.log(`[Vision] 启用纯本地 OCR 模式,正在处理 ${imagesToAnalyze.length} 张图片... (无需 API Key)`);
                descriptions = await processWithLocalOCR(imagesToAnalyze);
            } else {
                // API mode: try providers in order with fallback
                descriptions = await processWithAPIFallback(imagesToAnalyze);
            }

            // Surface the extracted context as a simulated system text block.
            retainedBlocks.push({
                type: 'text',
                text: `\n\n[System: The user attached ${imagesToAnalyze.length} image(s). Visual analysis/OCR extracted the following context:\n${descriptions}]\n\n`
            });
        } catch (e) {
            console.error("[Vision API Error]", e);
            retainedBlocks.push({
                type: 'text',
                text: `\n\n[System: The user attached image(s), but the Vision interceptor failed to process them. Error: ${(e as Error).message}]\n\n`
            });
        }
        msg.content = retainedBlocks;
    }
}

/**
 * Try each API provider in order. If all fail and fallbackToOcr is enabled,
 * fall back to local OCR as the last resort.
 */
/**
 * Run the configured vision API providers in order until one succeeds.
 * If none succeed (or none are configured) and `fallbackToOcr` is set,
 * local OCR is used as the last resort.
 *
 * @param imagesToAnalyze Image content blocks to describe.
 * @returns Combined per-image description text.
 * @throws Error when every provider fails and either the OCR fallback is
 *         disabled or the OCR fallback itself fails.
 */
async function processWithAPIFallback(imagesToAnalyze: AnthropicContentBlock[]): Promise<string> {
    const visionConfig = getConfig().vision!;
    // Legacy configs may omit `providers` entirely — default to an empty
    // list so we reach the top-level baseUrl/apiKey fallback below instead
    // of throwing a TypeError on `providers.length`.
    const providers = visionConfig.providers ?? [];
    const errors: string[] = [];

    // If we have providers, try them in order
    if (providers.length > 0) {
        for (let i = 0; i < providers.length; i++) {
            const provider = providers[i];
            const providerLabel = provider.name || `Provider #${i + 1} (${provider.model})`;
            try {
                console.log(`[Vision] 尝试 API ${providerLabel},正在处理 ${imagesToAnalyze.length} 张图片...`);
                const result = await callVisionAPIWithProvider(imagesToAnalyze, provider);
                console.log(`[Vision] ✅ ${providerLabel} 处理成功`);
                return result;
            } catch (err) {
                const errMsg = (err as Error).message;
                console.warn(`[Vision] ❌ ${providerLabel} 失败: ${errMsg}`);
                errors.push(`${providerLabel}: ${errMsg}`);
                // Continue to next provider
            }
        }
    } else if (visionConfig.baseUrl && visionConfig.apiKey) {
        // Legacy fallback: single provider built from the top-level fields.
        const legacyProvider: VisionProvider = {
            name: 'default',
            baseUrl: visionConfig.baseUrl,
            apiKey: visionConfig.apiKey,
            model: visionConfig.model,
        };
        try {
            console.log(`[Vision] 启用外部 API 模式,正在处理 ${imagesToAnalyze.length} 张图片...`);
            const result = await callVisionAPIWithProvider(imagesToAnalyze, legacyProvider);
            return result;
        } catch (err) {
            const errMsg = (err as Error).message;
            console.warn(`[Vision] ❌ API 调用失败: ${errMsg}`);
            errors.push(`default: ${errMsg}`);
        }
    }

    // All API providers failed — try OCR fallback
    if (visionConfig.fallbackToOcr) {
        console.log(`[Vision] 所有 API 均失败 (${errors.length} 个错误),兜底使用本地 OCR...`);
        try {
            return await processWithLocalOCR(imagesToAnalyze);
        } catch (ocrErr) {
            throw new Error(
                `All ${errors.length} API provider(s) failed AND local OCR fallback also failed. ` +
                `API errors: [${errors.join(' | ')}]. OCR error: ${(ocrErr as Error).message}`
            );
        }
    }

    // fallbackToOcr is disabled and all providers failed
    throw new Error(
        `All ${errors.length} API provider(s) failed and fallback_to_ocr is disabled. ` +
        `Errors: [${errors.join(' | ')}]`
    );
}

/**
 * Extract text from image blocks using a local tesseract.js worker
 * (eng + simplified Chinese), consulting the shared cache first so each
 * unique image is only OCR'd once.
 *
 * @param imageBlocks Image content blocks (base64 data URLs or remote URLs).
 * @returns Concatenated per-image OCR sections; per-image failures are
 *          reported inline rather than thrown.
 */
async function processWithLocalOCR(imageBlocks: AnthropicContentBlock[]): Promise<string> {
    let combinedText = '';
    const imagesToProcess: { index: number, source: string, hash: string }[] = [];

    // First pass: normalize each block to a URL/data-URL string and
    // satisfy whatever we can from the cache.
    for (let i = 0; i < imageBlocks.length; i++) {
        const img = imageBlocks[i];
        let imageSource: string = '';

        if (img.type === 'image' && img.source?.data) {
            if (img.source.type === 'base64') {
                const mime = img.source.media_type || 'image/jpeg';
                imageSource = `data:${mime};base64,${img.source.data}`;
            } else if (img.source.type === 'url') {
                imageSource = img.source.data;
            }
        }

        if (imageSource) {
            const hash = getImageHash(imageSource);
            if (imageParsingCache.has(hash)) {
                console.log(`[Vision] Image ${i + 1} found in cache, skipping OCR.`);
                combinedText += `--- Image ${i + 1} OCR Text ---\n${imageParsingCache.get(hash)}\n\n`;
            } else {
                imagesToProcess.push({ index: i, source: imageSource, hash });
            }
        }
    }

    // Second pass: run the remaining images through a single shared worker.
    if (imagesToProcess.length > 0) {
        const worker = await createWorker('eng+chi_sim');
        try {
            for (const { index, source, hash } of imagesToProcess) {
                try {
                    const { data: { text } } = await worker.recognize(source);
                    const extractedText = text.trim() || '(No text detected in this image)';
                    setCache(hash, extractedText);
                    combinedText += `--- Image ${index + 1} OCR Text ---\n${extractedText}\n\n`;
                } catch (err) {
                    // Per-image failure is non-fatal: report it inline and move on.
                    console.error(`[Vision OCR] Failed to parse image ${index + 1}:`, err);
                    combinedText += `--- Image ${index + 1} ---\n(Failed to parse image with local OCR)\n\n`;
                }
            }
        } finally {
            // Always release the tesseract worker — without this, any error
            // escaping the loop would leak the worker process.
            await worker.terminate();
        }
    }

    return combinedText;
}

/**
 * Call a specific Vision API provider for image analysis.
 * Processes images individually for per-image caching.
 * Throws on failure so the caller can try the next provider.
 */
/**
 * Describe each image via one provider's OpenAI-compatible chat-completions
 * endpoint, consulting the shared cache first so each unique image is only
 * sent once.
 *
 * @param imageBlocks Image content blocks (base64 data URLs or remote URLs).
 * @param provider Endpoint, credentials, and model to use.
 * @returns Concatenated per-image description sections.
 * @throws Error when the provider responds with a non-2xx status (or the
 *         request itself fails), so the caller can try the next provider.
 */
async function callVisionAPIWithProvider(imageBlocks: AnthropicContentBlock[], provider: VisionProvider): Promise<string> {
    let combinedText = '';

    for (let i = 0; i < imageBlocks.length; i++) {
        const img = imageBlocks[i];
        let url = '';

        // Normalize the block to a URL/data-URL string the API accepts.
        if (img.type === 'image' && img.source?.data) {
            if (img.source.type === 'base64') {
                const mime = img.source.media_type || 'image/jpeg';
                url = `data:${mime};base64,${img.source.data}`;
            } else if (img.source.type === 'url') {
                url = img.source.data;
            }
        }

        if (url) {
            const hash = getImageHash(url);
            if (imageParsingCache.has(hash)) {
                console.log(`[Vision] Image ${i + 1} found in cache, skipping API call.`);
                combinedText += `--- Image ${i + 1} Description ---\n${imageParsingCache.get(hash)}\n\n`;
                continue;
            }

            const parts = [
                { type: 'text', text: 'Please describe this image in detail. If it contains code, UI elements, or error messages, explicitly write them out.' },
                { type: 'image_url', image_url: { url } }
            ];

            const payload = {
                model: provider.model,
                messages: [{ role: 'user', content: parts }],
                max_tokens: 1500
            };

            const res = await fetch(provider.baseUrl, {
                method: 'POST',
                headers: {
                    'Content-Type': 'application/json',
                    'Authorization': `Bearer ${provider.apiKey}`
                },
                body: JSON.stringify(payload),
                ...getProxyFetchOptions(),
            } as any);

            if (!res.ok) {
                const errBody = await res.text();
                throw new Error(`API returned status ${res.status}: ${errBody}`);
            }

            const data = await res.json() as any;
            const description = data.choices?.[0]?.message?.content || 'No description returned.';

            setCache(hash, description);
            combinedText += `--- Image ${i + 1} Description ---\n${description}\n\n`;
        }
    }

    return combinedText;
}