/**
 * Browser Agent - AI-Powered Browser Automation with Vision
 * Uses Qwen VL to see screenshots and decide actions like a human
 */

// Vision Model Configuration
export const VISION_MODEL = 'qwen/qwen-2.5-vl-7b-instruct:free';

// OpenRouter chat-completions endpoint used for every vision request.
const OPENROUTER_CHAT_URL = 'https://openrouter.ai/api/v1/chat/completions';

// Highest N probed for OPENROUTER_API_KEY_N environment variables.
const MAX_NUMBERED_KEYS = 10;

/**
 * Collect the OpenRouter API keys configured in the environment.
 *
 * Reads OPENROUTER_API_KEY_1 .. OPENROUTER_API_KEY_10 first, then the bare
 * OPENROUTER_API_KEY. Values are trimmed; empty or whitespace-only values are
 * dropped and duplicates are removed so the same key is never tried twice.
 *
 * @returns {string[]} Unique, non-empty keys in lookup order.
 */
function getOpenRouterKeys() {
  const candidates = [];
  for (let i = 1; i <= MAX_NUMBERED_KEYS; i++) {
    candidates.push(process.env[`OPENROUTER_API_KEY_${i}`]);
  }
  // Also check the base key
  candidates.push(process.env.OPENROUTER_API_KEY);

  const keys = candidates.map((value) => value?.trim()).filter(Boolean);
  return [...new Set(keys)];
}

/**
 * Build the instruction prompt sent alongside the screenshot.
 *
 * @param {string} task - The user's task description.
 * @param {string} stepHistory - Newline-separated summary of earlier steps ('' if none).
 * @param {string} currentUrl - URL of the page shown in the screenshot.
 * @returns {string} The full prompt text.
 */
function buildVisionPrompt(task, stepHistory, currentUrl) {
  return `You are a browser automation agent. You can see the current webpage screenshot.

TASK: ${task}
CURRENT URL: ${currentUrl}

PREVIOUS STEPS:
${stepHistory || 'None yet'}

Based on what you see in the screenshot, decide the NEXT ACTION to complete the task.

Respond in this exact JSON format:
{
  "observation": "Brief description of what you see on the page",
  "thinking": "Your reasoning about what to do next",
  "action": {
    "type": "click" | "type" | "scroll" | "goto" | "wait" | "done",
    "x": 500,
    "y": 300,
    "text": "text to type if action is type",
    "url": "url if action is goto",
    "direction": "up or down if scroll",
    "description": "human readable description of this action"
  },
  "taskComplete": false,
  "result": "Only fill this if taskComplete is true - the final answer/result"
}

IMPORTANT:
- If you see search results with the information needed, extract it and set taskComplete: true
- For click actions, estimate x,y coordinates based on where you see the element
- If you see a search box, type the search query
- If the page needs to scroll to see more, use scroll action
- Be efficient - don't take unnecessary steps`;
}

/**
 * Read the click target from an action.
 *
 * Accepts numbers or numeric strings. A coordinate of 0 is valid, so presence
 * is checked explicitly rather than by truthiness.
 *
 * @param {object} action - Action object produced by the vision model.
 * @returns {{x: number, y: number} | null} The point, or null if either coordinate is missing or not finite.
 */
function readCoordinates(action) {
  const { x, y } = action;
  if (x == null || y == null || x === '' || y === '') {
    return null;
  }
  const point = { x: Number(x), y: Number(y) };
  return Number.isFinite(point.x) && Number.isFinite(point.y) ? point : null;
}

/**
 * Analyze screenshot with Vision AI and decide next action.
 *
 * Each configured API key is tried in order until one yields a reply that
 * contains a parseable JSON object. The function never throws for API or
 * parsing failures; it returns a fallback 'wait' action instead.
 *
 * @param {string} screenshotBase64 - PNG screenshot, base64-encoded (no data: prefix).
 * @param {string} task - The user's task description.
 * @param {Array<{action?: {type?: string, description?: string}}>} [previousSteps=[]] - Steps taken so far.
 * @param {string} [currentUrl=''] - URL of the page shown in the screenshot.
 * @returns {Promise<object>} The model's parsed JSON reply (expected to carry
 *   observation, thinking, action, taskComplete, result), or a fallback object
 *   with a 'wait' action when no key is configured or every key fails.
 */
async function analyzeWithVision(screenshotBase64, task, previousSteps = [], currentUrl = '') {
  // A previous step may have no action if the model's reply omitted it;
  // optional chaining keeps the history builder from throwing in that case.
  const stepHistory = previousSteps
    .map(
      (s, i) => `Step ${i + 1}: ${s?.action?.type ?? 'unknown'} - ${s?.action?.description || ''}`,
    )
    .join('\n');

  const prompt = buildVisionPrompt(task, stepHistory, currentUrl);

  const keys = getOpenRouterKeys();
  if (keys.length === 0) {
    console.error('[BrowserAgent] No OpenRouter API keys found!');
    return {
      observation: 'No API keys configured',
      thinking: 'Cannot analyze without API keys',
      action: { type: 'wait', description: 'Waiting - no API keys' },
      taskComplete: false,
    };
  }

  // The payload does not depend on the key, so serialise it (including the
  // potentially large screenshot) once rather than once per attempt.
  const requestBody = JSON.stringify({
    model: VISION_MODEL,
    messages: [
      {
        role: 'user',
        content: [
          { type: 'text', text: prompt },
          {
            type: 'image_url',
            image_url: { url: `data:image/png;base64,${screenshotBase64}` },
          },
        ],
      },
    ],
    max_tokens: 1000,
  });

  // Try each key until one works
  for (const apiKey of keys) {
    try {
      const response = await fetch(OPENROUTER_CHAT_URL, {
        method: 'POST',
        headers: {
          'Authorization': `Bearer ${apiKey}`,
          'Content-Type': 'application/json',
          'HTTP-Referer': 'https://luks-pied.vercel.app',
          'X-Title': 'Lukas Browser Agent',
        },
        body: requestBody,
      });

      if (response.status === 429) {
        console.log('[BrowserAgent] Rate limited, trying next key...');
        continue;
      }

      const data = await response.json();
      const content = data.choices?.[0]?.message?.content;

      if (typeof content === 'string' && content) {
        // Extract JSON from response (handle markdown code blocks)
        const jsonMatch = content.match(/\{[\s\S]*\}/);
        if (jsonMatch) {
          // A malformed object throws here and is handled by the catch below,
          // which moves on to the next key.
          return JSON.parse(jsonMatch[0]);
        }
      }

      if (data.error) {
        console.log(`[BrowserAgent] API error: ${data.error.message}, trying next key...`);
        continue;
      }

      console.log('[BrowserAgent] Response contained no JSON object, trying next key...');
    } catch (error) {
      console.log(`[BrowserAgent] Key failed: ${error.message}, trying next...`);
      continue;
    }
  }

  console.error('[BrowserAgent] All API keys failed');
  return {
    observation: 'Error analyzing screenshot',
    thinking: 'All API keys failed',
    action: { type: 'wait', description: 'Waiting due to error' },
    taskComplete: false,
  };
}

/**
 * Execute a browser action.
 *
 * Supported action types: 'click', 'type', 'scroll', 'goto', 'wait', 'done'.
 * Unknown types are logged and treated as a no-op.
 *
 * @param {object} page - Browser page exposing mouse, keyboard, goto and waitForTimeout.
 * @param {object} action - Action object produced by the vision model.
 * @returns {Promise<boolean>} true if the action ran (or was a no-op), false if
 *   it was missing required fields or the page call threw.
 */
async function executeAction(page, action) {
  try {
    switch (action.type) {
      case 'click': {
        const point = readCoordinates(action);
        if (!point) {
          console.error('[BrowserAgent] Action execution error:', 'click requires numeric x and y');
          return false;
        }
        await page.mouse.click(point.x, point.y);
        await page.waitForTimeout(1000);
        break;
      }

      case 'type': {
        if (action.text == null) {
          console.error('[BrowserAgent] Action execution error:', 'type requires text');
          return false;
        }
        // Coordinates are optional: without them, type into whatever has focus.
        const point = readCoordinates(action);
        if (point) {
          await page.mouse.click(point.x, point.y);
          await page.waitForTimeout(300);
        }
        await page.keyboard.type(String(action.text), { delay: 50 });
        await page.keyboard.press('Enter');
        await page.waitForTimeout(2000);
        break;
      }

      case 'scroll': {
        const amount = action.direction === 'up' ? -400 : 400;
        await page.mouse.wheel(0, amount);
        await page.waitForTimeout(500);
        break;
      }

      case 'goto': {
        if (!action.url) {
          console.error('[BrowserAgent] Action execution error:', 'goto requires url');
          return false;
        }
        await page.goto(action.url, { waitUntil: 'domcontentloaded', timeout: 30000 });
        await page.waitForTimeout(2000);
        break;
      }

      case 'wait':
        await page.waitForTimeout(2000);
        break;

      case 'done':
        // Task is complete, no action needed
        break;

      default:
        console.log('[BrowserAgent] Unknown action:', action.type);
    }
    return true;
  } catch (error) {
    console.error('[BrowserAgent] Action execution error:', error.message);
    return false;
  }
}

/**
 * Run the Browser Agent loop.
 *
 * Navigates to Google, then repeats screenshot -> vision analysis -> action
 * until the model reports the task complete or maxSteps is reached. After each
 * analysis an 'agent:step' event is emitted on the socket, if one is given.
 *
 * @param {object} page - Browser page (goto, screenshot, url, mouse, keyboard, waitForTimeout).
 * @param {string} task - The user's task description.
 * @param {{emit: Function} | null | undefined} socket - Optional socket for progress updates.
 * @param {number} [maxSteps=10] - Maximum number of analysis/action iterations.
 * @returns {Promise<{success: boolean, steps: object[], result: *, finalScreenshot: string, totalSteps: number}>}
 *   Outcome of the run; screenshots are base64-encoded PNGs.
 */
async function runBrowserAgent(page, task, socket, maxSteps = 10) {
  console.log(`[BrowserAgent] Starting task: "${task}"`);

  const steps = [];
  let taskComplete = false;
  let finalResult = null;

  // Start by going to Google
  await page.goto('https://www.google.com', { waitUntil: 'domcontentloaded' });
  await page.waitForTimeout(1000);

  for (let i = 0; i < maxSteps && !taskComplete; i++) {
    console.log(`[BrowserAgent] Step ${i + 1}/${maxSteps}`);

    // 1. Take screenshot
    const screenshotBuffer = await page.screenshot({ type: 'png' });
    const screenshotBase64 = screenshotBuffer.toString('base64');

    // 2. Get current URL
    const currentUrl = page.url();

    // 3. Analyze with Vision AI
    console.log('[BrowserAgent] Analyzing screenshot with Vision AI...');
    const analysis = await analyzeWithVision(screenshotBase64, task, steps, currentUrl);

    console.log(`[BrowserAgent] Observation: ${analysis.observation}`);
    console.log(`[BrowserAgent] Thinking: ${analysis.thinking}`);
    console.log(`[BrowserAgent] Action: ${analysis.action?.type} - ${analysis.action?.description}`);

    // 4. Record step
    steps.push({
      stepNumber: i + 1,
      screenshot: screenshotBase64,
      observation: analysis.observation,
      thinking: analysis.thinking,
      action: analysis.action,
      timestamp: Date.now(),
    });

    // 5. Send update to frontend
    if (socket) {
      socket.emit('agent:step', {
        step: i + 1,
        total: maxSteps,
        screenshot: screenshotBase64,
        observation: analysis.observation,
        action: analysis.action?.description || analysis.action?.type,
        taskComplete: analysis.taskComplete,
      });
    }

    // 6. Check if task is complete
    if (analysis.taskComplete) {
      taskComplete = true;
      finalResult = analysis.result;
      console.log('[BrowserAgent] Task completed!');
      console.log('[BrowserAgent] Result:', finalResult);
      break;
    }

    // 7. Execute the action
    if (analysis.action && analysis.action.type !== 'done') {
      await executeAction(page, analysis.action);
    }
  }

  // Take final screenshot
  const finalScreenshot = await page.screenshot({ type: 'png' });

  return {
    success: taskComplete,
    steps,
    result: finalResult,
    finalScreenshot: finalScreenshot.toString('base64'),
    totalSteps: steps.length,
  };
}

export { runBrowserAgent, analyzeWithVision, executeAction };