import { getConfig } from './config.js';
import type { AnthropicMessage, AnthropicContentBlock } from './types.js';
import { createWorker } from 'tesseract.js';
/**
 * Intercepts Anthropic-format messages and replaces image blocks with a text
 * block containing a vision description or OCR transcript, so a text-only
 * backend model can still "see" attached images.
 *
 * Mutates `messages` in place: for each message whose content array contains
 * image blocks, the images are removed and a synthetic `[System: ...]` text
 * block is appended instead. No-op when vision is disabled in config.
 *
 * Failures are reported to the model inline rather than thrown, so a broken
 * vision pipeline never fails the whole request.
 */
export async function applyVisionInterceptor(messages: AnthropicMessage[]): Promise<void> {
  const config = getConfig();
  if (!config.vision?.enabled) return;
  for (const msg of messages) {
    if (!Array.isArray(msg.content)) continue;
    // Partition the content: images go off for analysis, everything else is kept.
    const newContent: AnthropicContentBlock[] = [];
    const imagesToAnalyze: AnthropicContentBlock[] = [];
    for (const block of msg.content) {
      if (block.type === 'image') {
        imagesToAnalyze.push(block);
      } else {
        newContent.push(block);
      }
    }
    // Nothing to do for messages without images; leave content untouched.
    if (imagesToAnalyze.length === 0) continue;
    try {
      let descriptions: string;
      if (config.vision.mode === 'ocr') {
        console.log(`[Vision] 启用纯本地 OCR 模式,正在提取 ${imagesToAnalyze.length} 张图片上的文字... (无需 API Key)`);
        descriptions = await processWithLocalOCR(imagesToAnalyze);
      } else {
        console.log(`[Vision] 启用外部 API 模式,正在分析 ${imagesToAnalyze.length} 张图片...`);
        descriptions = await callVisionAPI(imagesToAnalyze);
      }
      // Add descriptions as a simulated system text block.
      newContent.push({
        type: 'text',
        text: `\n\n[System: The user attached ${imagesToAnalyze.length} image(s). Visual analysis/OCR extracted the following context:\n${descriptions}]\n\n`
      });
    } catch (e) {
      console.error("[Vision API Error]", e);
      // Narrow the unknown catch value instead of asserting `as Error`:
      // a non-Error throw would otherwise render as "undefined".
      const message = e instanceof Error ? e.message : String(e);
      newContent.push({
        type: 'text',
        text: `\n\n[System: The user attached image(s), but the Vision interceptor failed to process them. Error: ${message}]\n\n`
      });
    }
    msg.content = newContent;
  }
}
/**
 * Extracts text from image blocks using a local tesseract.js worker
 * (English + Simplified Chinese); no external API key required.
 *
 * Each image contributes a `--- Image N ... ---` section. Per-image failures
 * are logged and reported inline so one bad image does not abort the batch.
 *
 * @param imageBlocks Anthropic `image` content blocks (base64 or url source).
 * @returns Concatenated OCR text for all processable images.
 */
async function processWithLocalOCR(imageBlocks: AnthropicContentBlock[]): Promise<string> {
  const worker = await createWorker('eng+chi_sim');
  try {
    let combinedText = '';
    for (let i = 0; i < imageBlocks.length; i++) {
      const img = imageBlocks[i];
      let imageSource = '';
      if (img.type === 'image' && img.source?.data) {
        if (img.source.type === 'base64') {
          // Wrap the raw base64 payload as a data URI for tesseract.
          const mime = img.source.media_type || 'image/jpeg';
          imageSource = `data:${mime};base64,${img.source.data}`;
        } else if (img.source.type === 'url') {
          // NOTE(review): assumes remote URLs are carried in `source.data` —
          // confirm against the AnthropicContentBlock definition in types.js.
          imageSource = img.source.data;
        }
      }
      if (!imageSource) continue;
      try {
        const { data: { text } } = await worker.recognize(imageSource);
        combinedText += `--- Image ${i + 1} OCR Text ---\n${text.trim() || '(No text detected in this image)'}\n\n`;
      } catch (err) {
        console.error(`[Vision OCR] Failed to parse image ${i + 1}:`, err);
        combinedText += `--- Image ${i + 1} ---\n(Failed to parse image with local OCR)\n\n`;
      }
    }
    return combinedText;
  } finally {
    // Always release the worker — the original leaked it if anything threw
    // outside the inner per-image try/catch.
    await worker.terminate();
  }
}
/**
 * Sends the given image blocks to an external OpenAI-compatible vision
 * endpoint and returns the model's textual description of them.
 *
 * @param imageBlocks Anthropic `image` content blocks (base64 or url source).
 * @returns The assistant's description text, or a fallback message when the
 *          response carries no content.
 * @throws Error when the endpoint responds with a non-2xx status.
 */
async function callVisionAPI(imageBlocks: AnthropicContentBlock[]): Promise<string> {
  const config = getConfig().vision!;
  // OpenAI-format message parts: one leading instruction, then one
  // image_url part per usable image.
  const parts: any[] = [
    { type: 'text', text: 'Please describe the attached images in detail. If they contain code, UI elements, or error messages, explicitly write them out.' }
  ];
  for (const img of imageBlocks) {
    if (img.type !== 'image' || !img.source?.data) continue;
    let url = '';
    if (img.source.type === 'base64') {
      // Raw base64 payload becomes a data URI.
      url = `data:${img.source.media_type || 'image/jpeg'};base64,${img.source.data}`;
    } else if (img.source.type === 'url') {
      // Remote URLs mapped from OpenAI payloads pass through unchanged.
      url = img.source.data;
    }
    if (url) {
      parts.push({ type: 'image_url', image_url: { url } });
    }
  }
  const res = await fetch(config.baseUrl, {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
      'Authorization': `Bearer ${config.apiKey}`
    },
    body: JSON.stringify({
      model: config.model,
      messages: [{ role: 'user', content: parts }],
      max_tokens: 1500
    })
  });
  if (!res.ok) {
    throw new Error(`Vision API returned status ${res.status}: ${await res.text()}`);
  }
  const data = await res.json() as any;
  return data.choices?.[0]?.message?.content || 'No description returned.';
}