Spaces:
Paused
Paused
/**
 * vision-agent.js — Visión Local para Zelin
 * ===========================================
 * Moondream 2B — el mejor VLM pequeño para nuestro caso:
 * - 2B params, ~1.5GB RAM (Q4_K_M)
 * - ScreenSpot F1@0.5 = 80.4 (UI understanding)
 * - Soporta: captioning, VQA, OCR, detección de objetos
 * - Entrenado en UI/screenshots específicamente
 * - Via node-llama-cpp (ya en el proyecto)
 *
 * CAPACIDADES:
 * 1. Analizar screenshots del navegador (¿qué hay en la página?)
 * 2. Localizar elementos UI en pantalla (para clic preciso)
 * 3. Analizar imágenes del servidor Minecraft
 * 4. Analizar imágenes que los usuarios envían en Discord
 * 5. OCR de texto en imágenes
 *
 * NOTA DE RAM:
 * Con los modelos de texto ya cargados (~1.55GB):
 * Añadir Moondream 2B Q4 (~1.5GB) = ~3.05GB total
 * Esto supera el límite. SOLUCIÓN:
 * - Moondream se carga bajo demanda y se libera tras usar
 * - No se mantiene en memoria permanentemente
 * - O usar API de Gemini Vision (gratis, si los modelos de texto están ocupando RAM)
 */
| import path from 'path'; | |
| import { fileURLToPath } from 'url'; | |
| import { existsSync } from 'fs'; | |
| import { readConfig } from './utils.js'; | |
// ES modules have no built-in __dirname; derive it from this module's URL.
const __dirname = path.dirname(fileURLToPath(import.meta.url));

// Project configuration, read once when the module is loaded.
const config = readConfig();
const aiCfg = config.localAI ?? {};

// Model directory: the configured `localAI.modelDir` (default './models'),
// joined onto the parent of this module's directory.
const MODEL_DIR = path.join(__dirname, '..', aiCfg.modelDir ?? './models');

// Available vision models.
const VISION_MODELS = {
  moondream: {
    repo: 'vikhyat/moondream2-GGUF',
    file: 'moondream2-Q4_K_S.gguf',
    projector: 'moondream2-mmproj-f16.gguf',
    ramGB: 1.5,
    desc: 'Moondream 2B β UI understanding, captioning, VQA',
  },
};

// Vision model state (module-level; mutated by the load/unload functions).
let _visionModel = null;
let _visionContext = null;
let _visionReady = false;
let _lastUsed = null;
// In-flight load shared by concurrent initVisionModel() callers, so the model
// is never downloaded or loaded twice in parallel.
let _visionInitPromise = null;

/**
 * Download (when missing) and load the local vision model on demand.
 *
 * Concurrent callers share a single in-flight load. Every failure is logged
 * as a warning and reported through the return value; this function does not
 * reject.
 *
 * @returns {Promise<boolean>} `true` when the model and its context are ready,
 *   `false` when the local model could not be made available.
 */
export async function initVisionModel() {
  if (_visionReady) return true;
  if (!_visionInitPromise) {
    _visionInitPromise = loadVisionModel().finally(() => {
      _visionInitPromise = null;
    });
  }
  return _visionInitPromise;
}

// Performs one download-if-missing + load attempt and publishes the result
// into the module state only when every step succeeded.
async function loadVisionModel() {
  let model = null;
  try {
    const { getLlama, createModelDownloader } = await import('node-llama-cpp');
    const llama = await getLlama();
    const { repo, file } = VISION_MODELS.moondream;
    const modelPath = path.join(MODEL_DIR, file);

    // NOTE(review): the `projector` file declared in VISION_MODELS is neither
    // downloaded nor loaded here — confirm whether image input requires it.
    if (!existsSync(modelPath)) {
      console.log('[Vision] Descargando Moondream 2B (~1.5GB)...');
      const dl = await createModelDownloader({
        modelUri: `hf:${repo}/${file}`,
        dirPath: MODEL_DIR,
        onProgress: ({ downloadedSize, totalSize }) => {
          if (totalSize) process.stdout.write(`\r[Vision] ${Math.round(downloadedSize/totalSize*100)}%`);
        },
      });
      await dl.download();
      console.log('\n[Vision] Descargado β ');
    }

    model = await llama.loadModel({ modelPath, gpuLayers: 0 });
    const context = await model.createContext({ contextSize: 2048 });

    _visionModel = model;
    _visionContext = context;
    _visionReady = true;
    console.log('[Vision] β Moondream 2B listo');
    return true;
  } catch (e) {
    console.warn('[Vision] Modelo local no disponible:', e?.message ?? e);
    // A model that loaded but whose context creation failed would otherwise
    // stay in memory with nothing referencing it from the module state.
    if (model) {
      try {
        await model.dispose?.();
      } catch (disposeError) {
        console.warn('[Vision] Error liberando modelo parcial:', disposeError?.message ?? disposeError);
      }
    }
    _visionReady = false;
    return false;
  }
}
/**
 * Release the local vision model to recover RAM.
 *
 * Does nothing when no model is loaded. A failure while disposing is logged
 * as a warning rather than thrown; the module state is reset either way.
 *
 * @returns {Promise<void>}
 */
export async function unloadVisionModel() {
  if (!_visionModel) return;

  const model = _visionModel;
  // Reset the state before awaiting, so code that runs while dispose is
  // pending no longer sees the model as ready.
  _visionModel = null;
  _visionContext = null;
  _visionReady = false;

  try {
    await model.dispose?.();
  } catch (e) {
    console.warn('[Vision] Error al liberar el modelo:', e?.message ?? e);
  }
  console.log('[Vision] Modelo liberado de memoria');
}
// Splits an optional `data:<mime>;base64,` prefix off the payload and picks
// the MIME type: the data-URL type when present, otherwise a guess from the
// base64-encoded magic bytes, otherwise 'image/jpeg'.
function resolveInlineImage(base64Image) {
  const raw = String(base64Image ?? '');
  const dataUrl = /^data:([^;,]+);base64,/.exec(raw);
  if (dataUrl) {
    return { mimeType: dataUrl[1], data: raw.slice(dataUrl[0].length) };
  }
  let mimeType = 'image/jpeg';
  if (raw.startsWith('iVBORw0KGgo')) mimeType = 'image/png'; // 89 50 4E 47 0D 0A 1A 0A
  else if (raw.startsWith('UklGR')) mimeType = 'image/webp'; // "RIFF" container
  return { mimeType, data: raw };
}

/**
 * Ask Gemini Vision about an image (used when no local model is involved).
 *
 * The API key is the first entry of `config.ai.gemini.keys`, falling back to
 * `config.ai.gemini.apiKey` when the list is missing or empty.
 *
 * @param {string} base64Image - Base64 image data, optionally as a data URL.
 * @param {string} prompt - Instruction or question sent along with the image.
 * @returns {Promise<string>} Text of the first candidate, or '' when the
 *   response carries no text.
 * @throws {Error} When no key is configured, or the HTTP response is not ok.
 */
async function analyzeWithGeminiVision(base64Image, prompt) {
  const geminiCfg = config.ai?.gemini;
  const listedKeys = Array.isArray(geminiCfg?.keys) ? geminiCfg.keys : [];
  const key = [...listedKeys, geminiCfg?.apiKey].filter(Boolean)[0];
  if (!key) throw new Error('Sin Gemini key para visiΓ³n');

  const { mimeType, data } = resolveInlineImage(base64Image);

  const endpoint = new URL(
    'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent'
  );
  endpoint.searchParams.set('key', key);

  const res = await fetch(endpoint, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({
      contents: [{
        parts: [
          { text: prompt },
          { inline_data: { mime_type: mimeType, data } },
        ],
      }],
      generationConfig: { maxOutputTokens: 500, temperature: 0.3 },
    }),
    signal: AbortSignal.timeout(20000),
  });
  if (!res.ok) throw new Error(`Gemini Vision HTTP ${res.status}`);

  const payload = await res.json();
  const parts = payload.candidates?.[0]?.content?.parts ?? [];
  // A candidate may carry its answer in several parts; concatenate their text.
  return parts.map((part) => part?.text ?? '').join('');
}
// Chooses the prompt for a task. 'ocr' always uses its fixed prompt; every
// other task prefers the caller's question and falls back to a task default.
function buildVisionPrompt(task, question) {
  switch (task) {
    case 'ocr':
      return 'Transcribe todo el texto visible en esta imagen en orden de lectura natural.';
    case 'ui':
      return question || 'Describe los elementos de interfaz de usuario visibles: botones, menΓΊs, formularios, textos. ΒΏQuΓ© se puede hacer en esta pΓ‘gina?';
    case 'qa':
      return question || 'ΒΏQuΓ© muestra esta imagen?';
    case 'minecraft':
      return question || 'Describe lo que ves en esta captura de Minecraft: jugadores, bloques, estructuras, items, texto en pantalla.';
    default:
      return question || 'Describe esta imagen en detalle en espaΓ±ol.';
  }
}

/**
 * Main entry point for image analysis.
 *
 * Tries Gemini Vision first when a Gemini key is configured, then falls back
 * to the local model. Failures of either backend are logged as warnings; the
 * function resolves with a fixed "could not analyze" message when no backend
 * produced an answer.
 *
 * @param {string} base64Image - Base64-encoded image data.
 * @param {string} [question=''] - Optional question; overrides the task's
 *   default prompt (ignored for the 'ocr' task).
 * @param {object} [options]
 * @param {string} [options.task='describe'] - 'describe' | 'ocr' | 'ui' |
 *   'qa' | 'minecraft'.
 * @param {boolean} [options.unloadAfterUse=false] - When true, the local
 *   model is unloaded after a local-model attempt to give its RAM back.
 * @returns {Promise<string>} The answer text, or the fallback message.
 */
export async function analyzeImage(base64Image, question = '', options = {}) {
  const { task = 'describe', unloadAfterUse = false } = options;
  const unavailableMessage = 'No se pudo analizar la imagen (sin modelo de visiΓ³n disponible).';

  _lastUsed = Date.now();

  // Without image data there is nothing to analyze; do not let a backend
  // answer the prompt on its own.
  if (!base64Image) {
    console.warn('[Vision] Imagen vacia; no se analiza');
    return unavailableMessage;
  }

  const prompt = buildVisionPrompt(task, question);

  // Gemini goes first; the local model is only the fallback.
  const useGemini = !!config.ai?.gemini?.keys?.length || !!config.ai?.gemini?.apiKey;
  if (useGemini) {
    try {
      const result = await analyzeWithGeminiVision(base64Image, prompt);
      if (result?.trim()) {
        console.log('[Vision] β Gemini Vision respondiΓ³');
        return result;
      }
    } catch (e) {
      console.warn('[Vision] Gemini Vision fallΓ³:', e.message);
    }
  }

  // Fallback: local model, when it can be loaded.
  if (await initVisionModel()) {
    let session = null;
    let sequence = null;
    try {
      const { LlamaChatSession } = await import('node-llama-cpp');
      sequence = _visionContext.getSequence();
      session = new LlamaChatSession({ contextSequence: sequence });
      // NOTE(review): confirm that the installed node-llama-cpp version
      // honours an `images` option on prompt().
      const result = await session.prompt(prompt, {
        maxTokens: 400,
        images: [Buffer.from(base64Image, 'base64')],
      });
      return result?.trim() ?? '';
    } catch (e) {
      console.warn('[Vision] Modelo local fallΓ³:', e.message);
    } finally {
      // Release the session and its sequence on every path. The sequence is
      // disposed explicitly instead of relying on session.dispose() to do it
      // (assumes auto-disposal is off unless requested — TODO confirm), so a
      // later call can obtain a sequence from the same context.
      for (const resource of [session, sequence]) {
        try {
          resource?.dispose?.();
        } catch (disposeError) {
          console.warn('[Vision] Error liberando recursos:', disposeError?.message ?? disposeError);
        }
      }
      if (unloadAfterUse) await unloadVisionModel();
    }
  }

  return unavailableMessage;
}
/**
 * Analyze a browser screenshot using the 'ui' task.
 *
 * @param {string} base64Screenshot - Base64-encoded screenshot.
 * @param {string} [question=''] - Optional question; a generic "what does
 *   this page show" prompt is used when it is empty.
 * @returns {Promise<string>} Whatever analyzeImage() resolves with.
 */
export async function analyzeBrowserScreenshot(base64Screenshot, question = '') {
  const defaultQuestion = 'ΒΏQuΓ© muestra esta pΓ‘gina web? Describe el contenido principal, botones disponibles y cualquier texto importante.';
  const effectiveQuestion = question ? question : defaultQuestion;
  const options = { task: 'ui' };
  return analyzeImage(base64Screenshot, effectiveQuestion, options);
}
/**
 * Analyze an image from the Minecraft server using the 'minecraft' task.
 *
 * @param {string} base64Image - Base64-encoded image.
 * @param {string} [question=''] - Optional question, forwarded unchanged.
 * @returns {Promise<string>} Whatever analyzeImage() resolves with.
 */
export async function analyzeMinecraftImage(base64Image, question = '') {
  const options = { task: 'minecraft' };
  return analyzeImage(base64Image, question, options);
}
/**
 * Analyze an image sent by a user on Discord using the 'qa' task.
 *
 * @param {string} base64Image - Base64-encoded image.
 * @param {string} [question=''] - Optional question, forwarded unchanged.
 * @returns {Promise<string>} Whatever analyzeImage() resolves with.
 */
export async function analyzeUserImage(base64Image, question = '') {
  const options = { task: 'qa' };
  return analyzeImage(base64Image, question, options);
}
/**
 * Snapshot of the vision subsystem's current state.
 *
 * @returns {{localModelReady: boolean, model: string, ramGB: number,
 *   lastUsed: (number|null), geminiAvailable: boolean}} Whether the local
 *   model is loaded, the configured model file and its RAM figure, the
 *   timestamp of the last analyzeImage() call, and whether any Gemini key
 *   (`keys` list or `apiKey`) is configured.
 */
export function getVisionStats() {
  const { file, ramGB } = VISION_MODELS.moondream;
  const geminiCfg = config.ai?.gemini;
  const hasGeminiKey = Boolean(geminiCfg?.keys?.length || geminiCfg?.apiKey);

  return {
    localModelReady: _visionReady,
    model: file,
    ramGB,
    lastUsed: _lastUsed,
    geminiAvailable: hasGeminiKey,
  };
}