// Source: lukas-worker / browser-agent.js (uploaded by yusef75, commit 033af1d, "Upload 2 files")
/**
* Browser Agent - AI-Powered Browser Automation with Vision
* Uses Qwen VL to see screenshots and decide actions like a human
*/
// Vision Model Configuration
// Model identifier sent as the `model` field of the chat-completion request
// made by analyzeWithVision.
export const VISION_MODEL = 'qwen/qwen-2.5-vl-7b-instruct:free';
/**
 * Collect the OpenRouter API keys configured in the environment.
 *
 * Reads `OPENROUTER_API_KEY_1` through `OPENROUTER_API_KEY_10` in order,
 * followed by the un-numbered `OPENROUTER_API_KEY`. Every value is trimmed;
 * blank values and duplicates are dropped so callers never send an empty
 * bearer token or retry the same key twice.
 *
 * @returns {string[]} Usable keys, in the order they should be tried.
 */
function getOpenRouterKeys() {
  const variableNames = [
    ...Array.from({ length: 10 }, (_, i) => `OPENROUTER_API_KEY_${i + 1}`),
    'OPENROUTER_API_KEY',
  ];

  // A Set keeps first-seen order while discarding repeated keys.
  const keys = new Set();
  for (const name of variableNames) {
    const key = process.env[name]?.trim();
    if (key) keys.add(key);
  }
  return [...keys];
}
/**
 * Analyze a screenshot with the vision model and decide the next action.
 *
 * Builds a prompt from the task, the current URL and the history of previous
 * steps, then posts it together with the screenshot to the OpenRouter
 * chat-completions endpoint. Each configured API key is tried in turn until
 * one yields a reply containing a JSON object.
 *
 * @param {string} screenshotBase64 - PNG screenshot, base64-encoded (embedded as a data URL).
 * @param {string} task - Natural-language description of the task.
 * @param {Array<object>} [previousSteps=[]] - Earlier steps; each step's
 *   `action.type` / `action.description` is summarized in the prompt. Steps
 *   without an `action` are listed as `unknown`.
 * @param {string} [currentUrl=''] - URL of the page shown in the screenshot.
 * @returns {Promise<object>} The JSON object parsed from the model reply, or a
 *   fallback `{ observation, thinking, action: { type: 'wait' }, taskComplete: false }`
 *   when no keys are configured or every key fails. Does not reject for
 *   request or parse failures.
 */
async function analyzeWithVision(screenshotBase64, task, previousSteps = [], currentUrl = '') {
  // A recorded step may have no action (the model can reply without one), so
  // the lookups are guarded; an unguarded access here would abort the whole run.
  const stepHistory = previousSteps.map((s, i) =>
    `Step ${i + 1}: ${s?.action?.type ?? 'unknown'} - ${s?.action?.description || ''}`
  ).join('\n');
  const prompt = `You are a browser automation agent. You can see the current webpage screenshot.
TASK: ${task}
CURRENT URL: ${currentUrl}
PREVIOUS STEPS:
${stepHistory || 'None yet'}
Based on what you see in the screenshot, decide the NEXT ACTION to complete the task.
Respond in this exact JSON format:
{
"observation": "Brief description of what you see on the page",
"thinking": "Your reasoning about what to do next",
"action": {
"type": "click" | "type" | "scroll" | "goto" | "wait" | "done",
"x": 500,
"y": 300,
"text": "text to type if action is type",
"url": "url if action is goto",
"direction": "up or down if scroll",
"description": "human readable description of this action"
},
"taskComplete": false,
"result": "Only fill this if taskComplete is true - the final answer/result"
}
IMPORTANT:
- If you see search results with the information needed, extract it and set taskComplete: true
- For click actions, estimate x,y coordinates based on where you see the element
- If you see a search box, type the search query
- If the page needs to scroll to see more, use scroll action
- Be efficient - don't take unnecessary steps`;

  const keys = getOpenRouterKeys();
  if (keys.length === 0) {
    console.error('[BrowserAgent] No OpenRouter API keys found!');
    return {
      observation: 'No API keys configured',
      thinking: 'Cannot analyze without API keys',
      action: { type: 'wait', description: 'Waiting - no API keys' },
      taskComplete: false
    };
  }

  // Try each key until one works
  for (const apiKey of keys) {
    try {
      const response = await fetch('https://openrouter.ai/api/v1/chat/completions', {
        method: 'POST',
        headers: {
          'Authorization': `Bearer ${apiKey}`,
          'Content-Type': 'application/json',
          'HTTP-Referer': 'https://luks-pied.vercel.app',
          'X-Title': 'Lukas Browser Agent'
        },
        body: JSON.stringify({
          model: VISION_MODEL,
          messages: [{
            role: 'user',
            content: [
              { type: 'text', text: prompt },
              {
                type: 'image_url',
                image_url: {
                  url: `data:image/png;base64,${screenshotBase64}`
                }
              }
            ]
          }],
          max_tokens: 1000
        })
      });

      if (response.status === 429) {
        console.log('[BrowserAgent] Rate limited, trying next key...');
        continue;
      }

      const data = await response.json();
      const content = data?.choices?.[0]?.message?.content;
      if (typeof content === 'string' && content) {
        // Extract JSON from response (handle markdown code blocks)
        const jsonMatch = content.match(/\{[\s\S]*\}/);
        if (jsonMatch) {
          try {
            return JSON.parse(jsonMatch[0]);
          } catch (parseError) {
            // Malformed model output is not a key failure; report it as such.
            console.log(`[BrowserAgent] Could not parse model reply as JSON: ${parseError.message}, trying next key...`);
            continue;
          }
        }
      }

      if (data?.error) {
        console.log(`[BrowserAgent] API error: ${data.error.message}, trying next key...`);
        continue;
      }

      console.log('[BrowserAgent] Response contained no usable JSON, trying next key...');
    } catch (error) {
      console.log(`[BrowserAgent] Key failed: ${error.message}, trying next...`);
      continue;
    }
  }

  console.error('[BrowserAgent] All API keys failed');
  return {
    observation: 'Error analyzing screenshot',
    thinking: 'All API keys failed',
    action: { type: 'wait', description: 'Waiting due to error' },
    taskComplete: false
  };
}
/**
 * Execute a single browser action on the given page.
 *
 * Supported `action.type` values:
 * - `click`  – click at (`x`, `y`), then pause 1s.
 * - `type`   – optionally click at (`x`, `y`) to focus, type `text`, press Enter, pause 2s.
 * - `scroll` – wheel 400px up when `direction === 'up'`, otherwise down; pause 0.5s.
 * - `goto`   – navigate to `url` (waits for DOMContentLoaded, 30s timeout), pause 2s.
 * - `wait`   – pause 2s.
 * - `done`   – no-op.
 * Any other type is logged and ignored.
 *
 * @param {object} page - Browser page exposing `mouse`, `keyboard`, `goto` and `waitForTimeout`.
 * @param {object} action - Action descriptor as produced by the vision model.
 * @returns {Promise<boolean>} `false` if executing the action threw (the error
 *   is logged, never rethrown); `true` otherwise, including for unknown types.
 */
async function executeAction(page, action) {
  try {
    switch (action.type) {
      case 'click':
        await page.mouse.click(action.x, action.y);
        await page.waitForTimeout(1000);
        break;

      case 'type':
        // Null-check rather than truthiness: 0 is a valid coordinate and must
        // still trigger the focusing click.
        if (action.x != null && action.y != null) {
          await page.mouse.click(action.x, action.y);
          await page.waitForTimeout(300);
        }
        await page.keyboard.type(action.text, { delay: 50 });
        await page.keyboard.press('Enter');
        await page.waitForTimeout(2000);
        break;

      case 'scroll': {
        // Braces give the declaration its own scope instead of leaking into
        // the other case clauses.
        const amount = action.direction === 'up' ? -400 : 400;
        await page.mouse.wheel(0, amount);
        await page.waitForTimeout(500);
        break;
      }

      case 'goto':
        await page.goto(action.url, { waitUntil: 'domcontentloaded', timeout: 30000 });
        await page.waitForTimeout(2000);
        break;

      case 'wait':
        await page.waitForTimeout(2000);
        break;

      case 'done':
        // Task is complete, no action needed
        break;

      default:
        console.log('[BrowserAgent] Unknown action:', action.type);
    }
    return true;
  } catch (error) {
    console.error('[BrowserAgent] Action execution error:', error.message);
    return false;
  }
}
/**
 * Run the Browser Agent loop: screenshot, analyze, act, repeat.
 *
 * Starts at https://www.google.com, then for up to `maxSteps` iterations takes
 * a screenshot, asks `analyzeWithVision` for the next action, records the
 * step, emits an `agent:step` event when a socket is provided, and executes
 * the action. The loop ends when the model sets `taskComplete` or chooses the
 * `done` action.
 *
 * @param {object} page - Browser page (`goto`, `waitForTimeout`, `screenshot`, `url`).
 * @param {string} task - Natural-language description of the task.
 * @param {object} [socket] - Optional emitter; receives `agent:step` events.
 * @param {number} [maxSteps=10] - Maximum number of analyze/act iterations.
 * @returns {Promise<{success: boolean, steps: object[], result: *, finalScreenshot: string, totalSteps: number}>}
 *   Outcome of the run; `finalScreenshot` and each step's `screenshot` are base64 PNGs.
 */
async function runBrowserAgent(page, task, socket, maxSteps = 10) {
  console.log(`[BrowserAgent] Starting task: "${task}"`);
  const steps = [];
  let taskComplete = false;
  let finalResult = null;

  // Start by going to Google
  await page.goto('https://www.google.com', { waitUntil: 'domcontentloaded' });
  await page.waitForTimeout(1000);

  for (let i = 0; i < maxSteps && !taskComplete; i++) {
    console.log(`[BrowserAgent] Step ${i + 1}/${maxSteps}`);

    // 1. Take screenshot
    const screenshotBuffer = await page.screenshot({ type: 'png' });
    const screenshotBase64 = screenshotBuffer.toString('base64');

    // 2. Get current URL
    const currentUrl = page.url();

    // 3. Analyze with Vision AI
    console.log('[BrowserAgent] Analyzing screenshot with Vision AI...');
    const analysis = await analyzeWithVision(screenshotBase64, task, steps, currentUrl);

    // The model may reply without a usable action. Fall back to a wait so the
    // recorded history always holds a well-formed action for the next prompt.
    const hasAction = typeof analysis.action === 'object' && analysis.action !== null;
    const action = hasAction
      ? analysis.action
      : { type: 'wait', description: 'No action returned by vision model' };

    // Choosing "done" means the model considers the task finished even if it
    // forgot to set taskComplete; without this the loop would idle to maxSteps.
    const isDone = Boolean(analysis.taskComplete) || action.type === 'done';

    console.log(`[BrowserAgent] Observation: ${analysis.observation}`);
    console.log(`[BrowserAgent] Thinking: ${analysis.thinking}`);
    console.log(`[BrowserAgent] Action: ${action.type} - ${action.description}`);

    // 4. Record step
    steps.push({
      stepNumber: i + 1,
      screenshot: screenshotBase64,
      observation: analysis.observation,
      thinking: analysis.thinking,
      action,
      timestamp: Date.now()
    });

    // 5. Send update to frontend
    if (socket) {
      socket.emit('agent:step', {
        step: i + 1,
        total: maxSteps,
        screenshot: screenshotBase64,
        observation: analysis.observation,
        action: action.description || action.type,
        taskComplete: isDone
      });
    }

    // 6. Check if task is complete
    if (isDone) {
      taskComplete = true;
      finalResult = analysis.result ?? null;
      console.log('[BrowserAgent] Task completed!');
      console.log('[BrowserAgent] Result:', finalResult);
      break;
    }

    // 7. Execute the action
    await executeAction(page, action);
  }

  // Take final screenshot
  const finalScreenshot = await page.screenshot({ type: 'png' });
  return {
    success: taskComplete,
    steps: steps,
    result: finalResult,
    finalScreenshot: finalScreenshot.toString('base64'),
    totalSteps: steps.length
  };
}
// Named exports of this module (VISION_MODEL is exported at its declaration).
export {
runBrowserAgent,
analyzeWithVision,
executeAction
};