| import { NextRequest, NextResponse } from "next/server"; |
|
|
/** Next.js route segment config: run this handler on the Node.js runtime. */
export const runtime = "nodejs";

/** Next.js route segment config: evaluate this route on every request. */
export const dynamic = "force-dynamic";
|
|
/**
 * One interactive element reported by the client alongside the screenshot.
 *
 * `x`/`y`/`width`/`height` describe the element's bounding box; the summary
 * builder derives the centre as `x + width / 2`, `y + height / 2`.
 * NOTE(review): presumably pixel coordinates with a top-left origin — confirm
 * against the client that collects these.
 */
type ClickableElement = {
  /** Tag name of the element, printed as `tag=...` in the prompt. */
  tag: string;
  /** Visible text of the element (may be empty). */
  text: string;

  // Bounding box.
  x: number;
  y: number;
  width: number;
  height: number;

  /** Optional `type` value, printed as `type=...` when present. */
  type?: string;
  /** Optional link target, printed as `href="..."` when present. */
  href?: string;
};
|
|
/** Every action kind the model is allowed to return (mirrors the prompt schema). */
type ActionType =
  // Pointer actions
  | "click"
  | "hover"
  // Keyboard actions
  | "type"
  | "keypress"
  // Page-level actions
  | "scroll"
  | "navigate"
  | "wait"
  // Reply to the user without touching the page
  | "answer";
|
|
/**
 * Normalized action returned to the client.
 *
 * Only `type` and `description` are always present; the remaining fields are
 * populated depending on `type` (see `normalizeAction`).
 */
type ActionResult = {
  type: ActionType;
  /** Short human-readable summary of the step. */
  description: string;

  /** Target coordinates for click / hover / type. */
  x?: number;
  y?: number;

  /** Text to enter for `type` actions. */
  text?: string;
  /** Key name for `keypress` actions. */
  key?: string;
  /** Destination for `navigate` actions. */
  url?: string;

  /** Scroll deltas for `scroll` actions. */
  scrollX?: number;
  scrollY?: number;

  /** Reply text for `answer` actions. */
  answer?: string;
  /** Delay for `wait` actions. */
  ms?: number;
};
|
|
/** Host of the Gemini REST API. */
const GEMINI_BASE_URL = "https://generativelanguage.googleapis.com";

/** Model identifier used for every request and echoed back in the response. */
const GEMINI_MODEL = "gemini-2.5-flash";

/** Full `generateContent` URL for the configured model. */
const GEMINI_ENDPOINT = [
  GEMINI_BASE_URL,
  "v1beta",
  "models",
  `${GEMINI_MODEL}:generateContent`,
].join("/");
|
|
/**
 * Instructions sent to the model with every request: what it receives, the
 * JSON action schema it must answer with, and the rules for choosing an action.
 * Kept as one entry per line so the prompt text has no leading/trailing blanks.
 */
const SYSTEM_PROMPT = [
  "You are an agentic browser controller.",
  "",
  "You receive:",
  "1. The user's instruction",
  "2. A current browser screenshot",
  "3. A list of visible interactive elements with bounding boxes and center coordinates",
  "",
  "Your task:",
  "Return the SINGLE best next action as JSON only.",
  "",
  "Allowed action schema:",
  "{",
  '"type": "click" | "type" | "scroll" | "navigate" | "keypress" | "hover" | "answer" | "wait",',
  '"description": "short human-readable description",',
  '"x": number,',
  '"y": number,',
  '"text": string,',
  '"key": string,',
  '"url": string,',
  '"scrollX": number,',
  '"scrollY": number,',
  '"answer": string,',
  '"ms": number',
  "}",
  "",
  "Rules:",
  "- Output ONLY valid JSON. No markdown. No explanation.",
  "- Use exactly one action.",
  "- For click/hover/type actions, prefer the provided center coordinates.",
  '- If the user is asking a question about the current page, use type="answer".',
  '- If text must be entered into an input, prefer type="type" with x, y, and text.',
  '- If the page likely needs more content to appear, use type="scroll" or type="wait".',
  '- If you do not need to interact and can directly answer from the screenshot/page context, use type="answer".',
  "- Keep descriptions short and clear.",
  "- Never invent invisible elements if the element list already gives a better target.",
].join("\n");
|
|
/**
 * Shortens `value` to at most `max` UTF-16 code units, marking the cut with "…".
 *
 * @param value - Text to shorten.
 * @param max - Maximum length of the result, ellipsis included (default 120).
 * @returns `value` unchanged when it fits; otherwise a prefix followed by "…".
 *   Returns an empty string when `max` leaves no room for any character.
 */
function truncate(value: string, max = 120): string {
  if (!(value.length > max)) return value;

  // A limit below one cannot even hold the ellipsis; without this guard a
  // negative slice end would keep almost the whole string.
  if (max < 1) return "";

  let head = value.slice(0, max - 1);

  // Do not leave half of a surrogate pair dangling in front of the ellipsis.
  const lastUnit = head.charCodeAt(head.length - 1);
  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
    head = head.slice(0, -1);
  }

  return `${head}…`;
}
|
|
/**
 * Coerces an untrusted value to a finite number.
 *
 * @param value - A number, a numeric string, or anything else.
 * @returns The finite number, or `undefined` for non-finite numbers, blank or
 *   non-numeric strings, and every other type.
 */
function safeNum(value: unknown): number | undefined {
  switch (typeof value) {
    case "number":
      return Number.isFinite(value) ? value : undefined;

    case "string": {
      // A blank string would otherwise convert to 0.
      if (value.trim() === "") return undefined;
      const candidate = Number(value);
      return Number.isFinite(candidate) ? candidate : undefined;
    }

    default:
      return undefined;
  }
}
|
|
/**
 * Returns the trimmed text of `value` when it is a non-blank string.
 *
 * @param value - Untrusted input of any type.
 * @returns The trimmed string, or `undefined` for non-strings and blank strings.
 */
function safeString(value: unknown): string | undefined {
  if (typeof value !== "string") return undefined;

  const cleaned = value.trim();
  return cleaned.length > 0 ? cleaned : undefined;
}
|
|
/**
 * Finds the index of the `}` that closes the `{` at `start`.
 * Braces inside double-quoted JSON strings (including escaped quotes) are ignored.
 * Returns -1 when the object never closes.
 */
function findBalancedObjectEnd(text: string, start: number): number {
  let depth = 0;
  let inString = false;
  let escaped = false;

  for (let i = start; i < text.length; i++) {
    const ch = text.charAt(i);

    if (inString) {
      if (escaped) escaped = false;
      else if (ch === "\\") escaped = true;
      else if (ch === '"') inString = false;
      continue;
    }

    if (ch === '"') {
      inString = true;
    } else if (ch === "{") {
      depth++;
    } else if (ch === "}") {
      depth--;
      if (depth === 0) return i;
    }
  }

  return -1;
}

/** True when `candidate` parses as JSON and the result is a plain object. */
function parsesAsJsonObject(candidate: string): boolean {
  try {
    const value: unknown = JSON.parse(candidate);
    return typeof value === "object" && value !== null && !Array.isArray(value);
  } catch {
    return false;
  }
}

/**
 * Pulls the JSON object out of a model reply.
 *
 * The reply is scanned for the first brace-balanced `{...}` span that actually
 * parses as a JSON object, so surrounding prose, markdown fences, trailing
 * text containing braces, or a second object after the first do not spoil the
 * result. When no span parses, the looser heuristics are applied in order:
 * whole text if it starts with `{` and ends with `}`, then the contents of a
 * ``` fence, then everything from the first `{` to the last `}`.
 *
 * @param text - Raw text returned by the model.
 * @returns The text of the extracted object (not guaranteed to parse when only
 *   the fallback heuristics matched).
 * @throws Error when the text contains no `{...}` span and no fenced content.
 */
function extractJsonObject(text: string): string {
  const trimmed = text.trim();

  // Preferred path: first balanced span that is valid JSON.
  let start = trimmed.indexOf("{");
  while (start !== -1) {
    const end = findBalancedObjectEnd(trimmed, start);
    if (end === -1) {
      // This brace never closes; a later one still might.
      start = trimmed.indexOf("{", start + 1);
      continue;
    }

    const candidate = trimmed.slice(start, end + 1);
    if (parsesAsJsonObject(candidate)) return candidate;

    // Skip past the rejected span rather than descending into it.
    start = trimmed.indexOf("{", end + 1);
  }

  // Fallback heuristics for replies with no parsable span.
  if (trimmed.startsWith("{") && trimmed.endsWith("}")) {
    return trimmed;
  }

  const fenced = trimmed.match(/```(?:json)?\s*([\s\S]*?)\s*```/i);
  if (fenced?.[1]) {
    return fenced[1].trim();
  }

  const firstBrace = trimmed.indexOf("{");
  const lastBrace = trimmed.lastIndexOf("}");
  if (firstBrace !== -1 && lastBrace !== -1 && lastBrace > firstBrace) {
    return trimmed.slice(firstBrace, lastBrace + 1);
  }

  throw new Error("Gemini response did not contain a JSON object");
}
|
|
/**
 * Converts the model's parsed JSON into a well-formed `ActionResult`.
 *
 * Only the fields relevant to the action type are kept. Coordinates that are
 * missing or non-numeric are left `undefined`; scroll, wait and keypress get
 * defaults (0/400 px, 1000 ms, "Enter").
 *
 * @param input - Parsed model output; anything that is not an object is
 *   treated as an empty object.
 * @param rawText - Original model text, used as the answer when the output
 *   cannot be interpreted as an action.
 * @returns The normalized action, or a fallback `answer` action when the type
 *   is missing or unsupported.
 */
function normalizeAction(input: unknown, rawText?: string): ActionResult {
  const obj = (input && typeof input === "object" ? input : {}) as Record<
    string,
    unknown
  >;

  // Match the action type case-insensitively so "Click" is not discarded.
  const type = safeString(obj.type)?.toLowerCase() as ActionType | undefined;
  const description = safeString(obj.description) || "Execute next browser step";

  if (!type) {
    return {
      type: "answer",
      description: "Fallback answer",
      answer: rawText || "No valid action returned by model.",
    };
  }

  switch (type) {
    case "click":
    case "hover":
      return {
        type,
        description,
        x: safeNum(obj.x),
        y: safeNum(obj.y),
      };

    case "type": {
      // Typed text is payload: keep surrounding whitespace and accept numbers
      // (e.g. a PIN the model emitted as a JSON number).
      let text = "";
      if (typeof obj.text === "string") {
        text = obj.text;
      } else if (typeof obj.text === "number" && Number.isFinite(obj.text)) {
        text = String(obj.text);
      }

      return {
        type,
        description,
        x: safeNum(obj.x),
        y: safeNum(obj.y),
        text,
      };
    }

    case "scroll":
      return {
        type,
        description,
        scrollX: safeNum(obj.scrollX) ?? 0,
        scrollY: safeNum(obj.scrollY) ?? 400,
      };

    case "navigate":
      return {
        type,
        description,
        url: safeString(obj.url),
      };

    case "keypress": {
      // Trim padded key names, but keep a whitespace-only key such as " "
      // instead of silently turning it into "Enter".
      const rawKey = typeof obj.key === "string" ? obj.key : "";

      return {
        type,
        description,
        key: rawKey.trim() || rawKey || "Enter",
      };
    }

    case "wait":
      return {
        type,
        description,
        ms: safeNum(obj.ms) ?? 1000,
      };

    case "answer":
      return {
        type,
        description,
        answer: safeString(obj.answer) || rawText || "",
      };

    default:
      // Reachable at runtime: the model may return a type outside ActionType.
      return {
        type: "answer",
        description: "Fallback answer",
        answer: rawText || "Unsupported action type returned by model.",
      };
  }
}
|
|
/**
 * Renders the interactive elements as one line per element for the prompt:
 * `[index] tag=… type=… box=(x,y,w,h) center=(cx,cy) text="…" href="…"`.
 *
 * Only the first 150 elements are considered. Because the list comes from the
 * request body, entries that are not objects or whose geometry is not finite
 * are skipped (the index of the remaining entries is unchanged), and
 * whitespace in text/href is collapsed so each element stays on a single line.
 *
 * @param elements - Elements reported by the client.
 * @returns The multi-line listing, or "No clickable elements detected." when
 *   there is nothing usable to list.
 */
function buildElementSummary(elements: ClickableElement[]): string {
  const emptyMessage = "No clickable elements detected.";

  if (!Array.isArray(elements) || elements.length === 0) {
    return emptyMessage;
  }

  const oneLine = (value: unknown): string =>
    typeof value === "string" ? value.replace(/\s+/g, " ").trim() : "";

  const lines = elements.slice(0, 150).flatMap((el, index) => {
    if (!el || typeof el !== "object") return [];

    const { x, y, width, height } = el;
    const geometryOk = [x, y, width, height].every(
      (n) => typeof n === "number" && Number.isFinite(n)
    );
    if (!geometryOk) return [];

    const centerX = Math.round(x + width / 2);
    const centerY = Math.round(y + height / 2);

    const tag = oneLine(el.tag) || "unknown";
    const inputType = oneLine(el.type);
    const text = oneLine(el.text);
    const href = oneLine(el.href);

    const fields = [
      `[${index}]`,
      `tag=${tag}`,
      inputType ? `type=${inputType}` : null,
      `box=(${x},${y},${width},${height})`,
      `center=(${centerX},${centerY})`,
      text ? `text="${truncate(text)}"` : null,
      href ? `href="${truncate(href, 140)}"` : null,
    ];

    return [fields.filter(Boolean).join(" ")];
  });

  return lines.length > 0 ? lines.join("\n") : emptyMessage;
}
|
|
/**
 * Separates an optional `data:<mime>;base64,` prefix from a screenshot payload.
 * Payloads without the prefix are passed through and treated as PNG.
 */
function splitScreenshotPayload(screenshot: string): {
  mimeType: string;
  data: string;
} {
  const prefix = /^data:([\w.+-]+\/[\w.+-]+);base64,/i.exec(screenshot);
  if (!prefix) {
    return { mimeType: "image/png", data: screenshot };
  }

  return {
    mimeType: (prefix[1] ?? "image/png").toLowerCase(),
    data: screenshot.slice(prefix[0].length).trim(),
  };
}

/**
 * POST handler: asks Gemini for the single next browser action.
 *
 * Request body (JSON): `prompt` (required), `screenshot` (required; base64
 * image data, optionally as a `data:` URL), `clickableElements` (optional array).
 *
 * Responses:
 * - 200 `{ success, model, action, raw }` with the normalized action;
 * - 400 when the body is not valid JSON or `prompt` / `screenshot` is missing;
 * - the upstream status when Gemini answers with an error;
 * - 504 when the Gemini request times out (30 s);
 * - 500 for a missing API key, an empty model reply, or any other failure.
 */
export async function POST(req: NextRequest): Promise<NextResponse> {
  try {
    const apiKey = process.env.GEMINI_API_KEY;
    if (!apiKey) {
      return NextResponse.json(
        {
          error:
            "Missing GEMINI_API_KEY. Add it in Hugging Face Space secrets.",
        },
        { status: 500 }
      );
    }

    // A malformed body is the caller's mistake, not a server failure.
    let body: unknown;
    try {
      body = await req.json();
    } catch {
      return NextResponse.json(
        { error: "Request body must be valid JSON" },
        { status: 400 }
      );
    }

    const payload = (body && typeof body === "object" ? body : {}) as Record<
      string,
      unknown
    >;

    const prompt = safeString(payload.prompt) || "";
    const screenshotInput = safeString(payload.screenshot);
    const clickableElements = Array.isArray(payload.clickableElements)
      ? (payload.clickableElements as ClickableElement[])
      : [];

    if (!prompt) {
      return NextResponse.json(
        { error: "Prompt is required" },
        { status: 400 }
      );
    }

    // Gemini expects bare base64 in inline_data, so strip a data-URL prefix
    // and forward its mime type instead of always claiming PNG.
    const screenshot = screenshotInput
      ? splitScreenshotPayload(screenshotInput)
      : undefined;

    if (!screenshot || !screenshot.data) {
      return NextResponse.json(
        { error: "Screenshot is required" },
        { status: 400 }
      );
    }

    const elementSummary = buildElementSummary(clickableElements);

    const userInstruction = [
      "USER TASK:",
      prompt,
      "",
      "INTERACTIVE ELEMENTS:",
      elementSummary,
      "",
      "Remember:",
      "- Return exactly one JSON object.",
      "- If clicking or hovering, prefer the center coordinates.",
      '- If answering a question, use type="answer".',
    ]
      .join("\n")
      .trim();

    const geminiRequestBody = {
      contents: [
        {
          role: "user",
          parts: [
            { text: SYSTEM_PROMPT },
            { text: userInstruction },
            {
              inline_data: {
                mime_type: screenshot.mimeType,
                data: screenshot.data,
              },
            },
          ],
        },
      ],
      generationConfig: {
        temperature: 0.1,
        topP: 0.8,
        // NOTE(review): on thinking-enabled models the output budget may be
        // shared with reasoning tokens — confirm 1024 leaves room for the JSON.
        maxOutputTokens: 1024,
        responseMimeType: "application/json",
      },
    };

    // Send the key as a header so it does not end up in URLs that proxies or
    // error messages may log.
    const response = await fetch(GEMINI_ENDPOINT, {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        "x-goog-api-key": apiKey,
      },
      body: JSON.stringify(geminiRequestBody),
      signal: AbortSignal.timeout(30000),
    });

    if (!response.ok) {
      const errorText = await response.text();
      console.error("[/api/gemini] Gemini API error:", errorText);

      return NextResponse.json(
        {
          error: `Gemini API returned ${response.status}: ${errorText}`,
        },
        { status: response.status }
      );
    }

    const data = await response.json();

    // Concatenate the text of every part the model returned.
    type GeminiPart = { text?: string };
    type GeminiCandidate = { content?: { parts?: GeminiPart[] } };
    const candidates: GeminiCandidate[] = Array.isArray(data?.candidates)
      ? data.candidates
      : [];
    const rawText = candidates
      .flatMap((candidate) => candidate?.content?.parts ?? [])
      .map((part) => part?.text ?? "")
      .join("")
      .trim();

    if (!rawText) {
      return NextResponse.json(
        { error: "Gemini returned an empty response" },
        { status: 500 }
      );
    }

    // Unparsable output is surfaced to the user as a plain answer.
    let parsed: unknown;
    try {
      parsed = JSON.parse(extractJsonObject(rawText));
    } catch {
      parsed = {
        type: "answer",
        description: "Fallback answer",
        answer: rawText,
      };
    }

    const action = normalizeAction(parsed, rawText);

    return NextResponse.json({
      success: true,
      model: GEMINI_MODEL,
      action,
      raw: rawText,
    });
  } catch (e: unknown) {
    // AbortSignal.timeout() rejects the fetch with a "TimeoutError".
    if (e instanceof Error && e.name === "TimeoutError") {
      console.error("[/api/gemini] Gemini request timed out");
      return NextResponse.json(
        { error: "Gemini request timed out" },
        { status: 504 }
      );
    }

    const message = e instanceof Error ? e.message : "Unknown error";
    console.error("[/api/gemini]", message);

    return NextResponse.json({ error: message }, { status: 500 });
  }
}