Spaces:

yusef75
/

lukas-worker

Sleeping

App Files Files Community

lukas-worker / browser-agent.js

yusef75's picture

Upload 2 files

033af1d verified 21 days ago

history blame contribute delete

9.57 kB

	/**
	* Browser Agent - AI-Powered Browser Automation with Vision
	* Uses Qwen VL to see screenshots and decide actions like a human
	*/

	// Vision Model Configuration
	// Model identifier sent as the `model` field of the OpenRouter chat-completions
	// request built in analyzeWithVision.
	export const VISION_MODEL = 'qwen/qwen-2.5-vl-7b-instruct:free';

	/**
	 * Collect the OpenRouter API keys configured in the environment.
	 *
	 * Reads OPENROUTER_API_KEY_1 .. OPENROUTER_API_KEY_10 first, then the
	 * un-numbered OPENROUTER_API_KEY. Values are trimmed; empty or
	 * whitespace-only values are ignored, and a key that appears under several
	 * names is returned only once (first occurrence wins) so callers that fall
	 * back from key to key never retry the same credential.
	 *
	 * @returns {string[]} Usable keys in lookup order; empty when none are set.
	 */
	function getOpenRouterKeys() {
	  const envNames = Array.from({ length: 10 }, (_, i) => `OPENROUTER_API_KEY_${i + 1}`);
	  // The base key is checked last so numbered keys keep priority.
	  envNames.push('OPENROUTER_API_KEY');

	  // A Set preserves insertion order while dropping duplicates.
	  const uniqueKeys = new Set();
	  for (const envName of envNames) {
	    const value = process.env[envName]?.trim();
	    if (value) uniqueKeys.add(value);
	  }
	  return [...uniqueKeys];
	}

	/**
	 * Analyze a screenshot with the vision model and decide the next action.
	 *
	 * Builds a prompt from the task, the current URL and the actions of the
	 * previous steps, then posts it together with the screenshot to the
	 * OpenRouter chat-completions endpoint. Each key from getOpenRouterKeys()
	 * is tried in order; a 429 status, an API error payload, a reply without a
	 * JSON object, or a thrown error all move on to the next key.
	 *
	 * @param {string} screenshotBase64 - Base64 PNG data (embedded in a data: URL).
	 * @param {string} task - Natural-language description of the task.
	 * @param {Array<{action?: {type?: string, description?: string}}>} [previousSteps=[]]
	 *   Steps taken so far; only each step's `action` is read.
	 * @param {string} [currentUrl=''] - URL of the page shown in the screenshot.
	 * @returns {Promise<object>} The JSON object parsed from the model reply
	 *   (requested shape: observation, thinking, action, taskComplete, result),
	 *   or a fallback `{ observation, thinking, action: { type: 'wait' },
	 *   taskComplete: false }` when no key is configured or every key failed.
	 */
	async function analyzeWithVision(screenshotBase64, task, previousSteps = [], currentUrl = '') {
	  // A step recorded from a malformed model reply may have no `action`;
	  // describe it as "unknown" instead of throwing while building the history.
	  const stepHistory = previousSteps.map((s, i) =>
	    `Step ${i + 1}: ${s?.action?.type ?? 'unknown'} - ${s?.action?.description || ''}`
	  ).join('\n');

	  const prompt = `You are a browser automation agent. You can see the current webpage screenshot.

	TASK: ${task}

	CURRENT URL: ${currentUrl}

	PREVIOUS STEPS:
	${stepHistory || 'None yet'}

	Based on what you see in the screenshot, decide the NEXT ACTION to complete the task.

	Respond in this exact JSON format:
	{
	"observation": "Brief description of what you see on the page",
	"thinking": "Your reasoning about what to do next",
	"action": {
	"type": "click" | "type" | "scroll" | "goto" | "wait" | "done",
	"x": 500,
	"y": 300,
	"text": "text to type if action is type",
	"url": "url if action is goto",
	"direction": "up or down if scroll",
	"description": "human readable description of this action"
	},
	"taskComplete": false,
	"result": "Only fill this if taskComplete is true - the final answer/result"
	}

	IMPORTANT:
	- If you see search results with the information needed, extract it and set taskComplete: true
	- For click actions, estimate x,y coordinates based on where you see the element
	- If you see a search box, type the search query
	- If the page needs to scroll to see more, use scroll action
	- Be efficient - don't take unnecessary steps`;

	  const keys = getOpenRouterKeys();
	  if (keys.length === 0) {
	    console.error('[BrowserAgent] No OpenRouter API keys found!');
	    return {
	      observation: 'No API keys configured',
	      thinking: 'Cannot analyze without API keys',
	      action: { type: 'wait', description: 'Waiting - no API keys' },
	      taskComplete: false
	    };
	  }

	  // The request body is identical for every key, so serialize it once.
	  const requestBody = JSON.stringify({
	    model: VISION_MODEL,
	    messages: [{
	      role: 'user',
	      content: [
	        { type: 'text', text: prompt },
	        {
	          type: 'image_url',
	          image_url: {
	            url: `data:image/png;base64,${screenshotBase64}`
	          }
	        }
	      ]
	    }],
	    max_tokens: 1000
	  });

	  // Try each key until one works
	  for (const apiKey of keys) {
	    try {
	      const response = await fetch('https://openrouter.ai/api/v1/chat/completions', {
	        method: 'POST',
	        headers: {
	          'Authorization': `Bearer ${apiKey}`,
	          'Content-Type': 'application/json',
	          'HTTP-Referer': 'https://luks-pied.vercel.app',
	          'X-Title': 'Lukas Browser Agent'
	        },
	        body: requestBody
	      });

	      if (response.status === 429) {
	        console.log('[BrowserAgent] Rate limited, trying next key...');
	        continue;
	      }

	      const data = await response.json();

	      const content = data?.choices?.[0]?.message?.content;
	      if (typeof content === 'string') {
	        // Grab everything from the first "{" to the last "}" so a reply
	        // wrapped in a markdown code fence or surrounding prose still parses.
	        const jsonMatch = content.match(/\{[\s\S]*\}/);
	        if (jsonMatch) {
	          // A JSON.parse failure is caught below and the next key is tried.
	          return JSON.parse(jsonMatch[0]);
	        }
	      }

	      if (data?.error) {
	        console.log(`[BrowserAgent] API error: ${data.error.message}, trying next key...`);
	        continue;
	      }

	      // Neither a usable reply nor an error payload: say so instead of
	      // silently moving on.
	      console.log('[BrowserAgent] Response contained no JSON decision, trying next key...');
	    } catch (error) {
	      console.log(`[BrowserAgent] Key failed: ${error.message}, trying next...`);
	    }
	  }

	  console.error('[BrowserAgent] All API keys failed');
	  return {
	    observation: 'Error analyzing screenshot',
	    thinking: 'All API keys failed',
	    action: { type: 'wait', description: 'Waiting due to error' },
	    taskComplete: false
	  };
	}

	/**
	 * Execute a single browser action on the given page.
	 *
	 * Supported `action.type` values:
	 *  - 'click'  : click at (x, y), then wait 1000 ms.
	 *  - 'type'   : optionally click at (x, y) to focus, type `text` with a
	 *               50 ms per-key delay, press Enter, then wait 2000 ms.
	 *  - 'scroll' : wheel 400 px down, or up when `direction` is 'up', then
	 *               wait 500 ms.
	 *  - 'goto'   : navigate to `url` (30 s timeout), then wait 2000 ms.
	 *  - 'wait'   : wait 2000 ms.
	 *  - 'done'   : no-op.
	 * Any other type is logged and treated as a no-op.
	 *
	 * @param {object} page - Browser page exposing mouse.click, mouse.wheel,
	 *   keyboard.type, keyboard.press, goto and waitForTimeout (presumably a
	 *   Playwright Page — confirm against the caller).
	 * @param {{type: string, x?: number, y?: number, text?: string,
	 *   url?: string, direction?: string}} action - Action to perform.
	 * @returns {Promise<boolean>} true when no error occurred (including for
	 *   unknown action types); false when the action was invalid or threw.
	 */
	async function executeAction(page, action) {
	  try {
	    // 0 is a valid coordinate, so test for finite numbers, not truthiness.
	    const hasPoint = Number.isFinite(action.x) && Number.isFinite(action.y);

	    switch (action.type) {
	      case 'click': {
	        if (!hasPoint) {
	          throw new Error('click action requires numeric x and y');
	        }
	        await page.mouse.click(action.x, action.y);
	        await page.waitForTimeout(1000);
	        break;
	      }

	      case 'type': {
	        // Validate before clicking so a bad action has no side effects.
	        if (action.text == null) {
	          throw new Error('type action requires text');
	        }
	        if (hasPoint) {
	          // Focus the target field first.
	          await page.mouse.click(action.x, action.y);
	          await page.waitForTimeout(300);
	        }
	        await page.keyboard.type(String(action.text), { delay: 50 });
	        await page.keyboard.press('Enter');
	        await page.waitForTimeout(2000);
	        break;
	      }

	      case 'scroll': {
	        const amount = action.direction === 'up' ? -400 : 400;
	        await page.mouse.wheel(0, amount);
	        await page.waitForTimeout(500);
	        break;
	      }

	      case 'goto': {
	        if (typeof action.url !== 'string' || action.url.trim() === '') {
	          throw new Error('goto action requires a url');
	        }
	        await page.goto(action.url, { waitUntil: 'domcontentloaded', timeout: 30000 });
	        await page.waitForTimeout(2000);
	        break;
	      }

	      case 'wait':
	        await page.waitForTimeout(2000);
	        break;

	      case 'done':
	        // Task is complete, no action needed
	        break;

	      default:
	        console.log('[BrowserAgent] Unknown action:', action.type);
	    }
	    return true;
	  } catch (error) {
	    console.error('[BrowserAgent] Action execution error:', error.message);
	    return false;
	  }
	}

	/**
	 * Run the browser agent loop: screenshot -> vision analysis -> action.
	 *
	 * Opens https://www.google.com, then repeats up to `maxSteps` times:
	 * capture a PNG screenshot, ask analyzeWithVision for the next action,
	 * record the step, emit an 'agent:step' event on `socket` (when given), and
	 * run the action through executeAction. The loop stops as soon as the model
	 * sets `taskComplete` or returns a 'done' action. Errors thrown by the
	 * initial navigation or by taking screenshots are not caught here and
	 * propagate to the caller.
	 *
	 * @param {object} page - Browser page exposing goto, waitForTimeout,
	 *   screenshot and url (presumably a Playwright Page — confirm against the
	 *   caller).
	 * @param {string} task - Natural-language description of the task.
	 * @param {{emit: Function}|null|undefined} socket - Optional emitter used
	 *   for progress updates; skipped when falsy.
	 * @param {number} [maxSteps=10] - Maximum number of analysis steps.
	 * @returns {Promise<{success: boolean, steps: object[], result: *,
	 *   finalScreenshot: string, totalSteps: number}>} Outcome of the run;
	 *   `result` is null when the model supplied none.
	 */
	async function runBrowserAgent(page, task, socket, maxSteps = 10) {
	  console.log(`[BrowserAgent] Starting task: "${task}"`);

	  const steps = [];
	  let taskComplete = false;
	  let finalResult = null;

	  // Start by going to Google
	  await page.goto('https://www.google.com', { waitUntil: 'domcontentloaded' });
	  await page.waitForTimeout(1000);

	  for (let i = 0; i < maxSteps; i++) {
	    const stepNumber = i + 1;
	    console.log(`[BrowserAgent] Step ${stepNumber}/${maxSteps}`);

	    // 1. Take screenshot
	    const screenshotBuffer = await page.screenshot({ type: 'png' });
	    const screenshotBase64 = screenshotBuffer.toString('base64');

	    // 2. Get current URL
	    const currentUrl = page.url();

	    // 3. Analyze with Vision AI
	    console.log('[BrowserAgent] Analyzing screenshot with Vision AI...');
	    const analysis = await analyzeWithVision(screenshotBase64, task, steps, currentUrl);
	    const action = analysis.action;

	    console.log(`[BrowserAgent] Observation: ${analysis.observation}`);
	    console.log(`[BrowserAgent] Thinking: ${analysis.thinking}`);
	    console.log(`[BrowserAgent] Action: ${action?.type} - ${action?.description}`);

	    // A 'done' action means the model considers the task finished even if
	    // it forgot to set taskComplete. Without this the loop would keep
	    // re-analyzing the same page (nothing is executed for 'done') until
	    // maxSteps and then report failure.
	    const isComplete = Boolean(analysis.taskComplete) || action?.type === 'done';

	    // 4. Record step
	    steps.push({
	      stepNumber,
	      screenshot: screenshotBase64,
	      observation: analysis.observation,
	      thinking: analysis.thinking,
	      action,
	      timestamp: Date.now()
	    });

	    // 5. Send update to frontend
	    if (socket) {
	      socket.emit('agent:step', {
	        step: stepNumber,
	        total: maxSteps,
	        screenshot: screenshotBase64,
	        observation: analysis.observation,
	        action: action?.description || action?.type,
	        taskComplete: isComplete
	      });
	    }

	    // 6. Check if task is complete
	    if (isComplete) {
	      taskComplete = true;
	      finalResult = analysis.result ?? null;
	      console.log('[BrowserAgent] Task completed!');
	      console.log('[BrowserAgent] Result:', finalResult);
	      break;
	    }

	    // 7. Execute the action (a reply without an action just burns a step)
	    if (action) {
	      await executeAction(page, action);
	    }
	  }

	  // Take final screenshot
	  const finalScreenshot = await page.screenshot({ type: 'png' });

	  return {
	    success: taskComplete,
	    steps,
	    result: finalResult,
	    finalScreenshot: finalScreenshot.toString('base64'),
	    totalSteps: steps.length
	  };
	}

	// Named exports of this module's functions (VISION_MODEL is exported at
	// its declaration).
	export {
	runBrowserAgent,
	analyzeWithVision,
	executeAction
	};