agent-go / src /app /api /gemini /route.ts
arudradey's picture
Update src/app/api/gemini/route.ts
c77a840 verified
import { NextRequest, NextResponse } from "next/server";
// Route-level options exported for Next.js (see the Next.js route segment config docs):
// `runtime` requests the Node.js runtime for this handler.
export const runtime = "nodejs";
// `dynamic` is set to "force-dynamic" so the handler is evaluated per request
// rather than being treated as static output.
export const dynamic = "force-dynamic";
/**
 * One interactive element sent by the client in `clickableElements`.
 *
 * `x`, `y`, `width` and `height` form the element's bounding box; the prompt
 * builder derives the center as `(x + width / 2, y + height / 2)`.
 * NOTE(review): coordinates are presumably pixels in the screenshot's
 * coordinate space, with (x, y) the top-left corner - confirm against the client.
 */
type ClickableElement = {
// Element tag name, emitted into the prompt as `tag=...`.
tag: string;
// Text of the element; truncated before it is placed in the prompt.
text: string;
x: number;
y: number;
width: number;
height: number;
// Optional `type` value, emitted as `type=...` when present.
type?: string;
// Optional link target, truncated and emitted as `href="..."` when present.
href?: string;
};
/**
 * Every action kind this route can return to the caller.
 * The members match the `"type"` union advertised to the model in SYSTEM_PROMPT;
 * keep the two lists in sync when adding or removing an action.
 */
type ActionType =
| "click"
| "type"
| "scroll"
| "navigate"
| "keypress"
| "hover"
| "answer"
| "wait";
/**
 * The normalized action returned in the `action` field of the response.
 *
 * Only `type` and `description` are always present; the remaining fields are
 * populated by `normalizeAction` depending on `type`, as noted per field.
 */
type ActionResult = {
type: ActionType;
// Short human-readable summary of the step.
description: string;
// Target coordinates; populated for "click", "type" and "hover" (may be undefined
// when the model supplied no usable number).
x?: number;
y?: number;
// Text to enter; populated for "type" (empty string when the model gave none).
text?: string;
// Key name; populated for "keypress" (defaults to "Enter").
key?: string;
// Destination; populated for "navigate".
url?: string;
// Scroll amounts; populated for "scroll" (default 0 and 400 respectively).
scrollX?: number;
scrollY?: number;
// Answer text; populated for "answer", including the fallback answer.
answer?: string;
// Wait duration; populated for "wait" (defaults to 1000).
// NOTE(review): presumably milliseconds, given the name - confirm with the client.
ms?: number;
};
// Host of the Gemini REST API.
const GEMINI_BASE_URL = "https://generativelanguage.googleapis.com";
// Model identifier; also echoed back to the caller in the `model` response field.
const GEMINI_MODEL = "gemini-2.5-flash";
// Full `generateContent` URL for the configured model (v1beta API path).
const GEMINI_ENDPOINT = `${GEMINI_BASE_URL}/v1beta/models/${GEMINI_MODEL}:generateContent`;
// Instructions sent to the model as the first text part of every request.
// The action schema described here is what `normalizeAction` expects to parse,
// and its "type" list matches the ActionType union; change them together.
// The template is trimmed so the prompt carries no leading/trailing blank lines.
const SYSTEM_PROMPT = `
You are an agentic browser controller.
You receive:
1. The user's instruction
2. A current browser screenshot
3. A list of visible interactive elements with bounding boxes and center coordinates
Your task:
Return the SINGLE best next action as JSON only.
Allowed action schema:
{
"type": "click" | "type" | "scroll" | "navigate" | "keypress" | "hover" | "answer" | "wait",
"description": "short human-readable description",
"x": number,
"y": number,
"text": string,
"key": string,
"url": string,
"scrollX": number,
"scrollY": number,
"answer": string,
"ms": number
}
Rules:
- Output ONLY valid JSON. No markdown. No explanation.
- Use exactly one action.
- For click/hover/type actions, prefer the provided center coordinates.
- If the user is asking a question about the current page, use type="answer".
- If text must be entered into an input, prefer type="type" with x, y, and text.
- If the page likely needs more content to appear, use type="scroll" or type="wait".
- If you do not need to interact and can directly answer from the screenshot/page context, use type="answer".
- Keep descriptions short and clear.
- Never invent invisible elements if the element list already gives a better target.
`.trim();
/**
 * Shortens `value` to at most `max` UTF-16 code units, marking the cut with "…".
 *
 * Strings that already fit are returned unchanged. When a cut is needed the
 * result is the leading part of `value` followed by a single ellipsis
 * character, so the ellipsis counts toward the `max` budget.
 *
 * @param value Text to shorten.
 * @param max Maximum length of the result in UTF-16 code units (default 120).
 * @returns The original string, or a shortened copy ending in "…".
 */
function truncate(value: string, max = 120): string {
  if (!(value.length > max)) return value;

  // A non-positive budget has no room even for the ellipsis; without this guard
  // the negative slice index below would return almost the whole string.
  if (max <= 0) return "";

  let end = max - 1;
  if (end > 0) {
    // Do not cut between the two halves of a surrogate pair: a dangling high
    // surrogate is not valid text and would be sent to the model as garbage.
    const lastKept = value.charCodeAt(end - 1);
    if (lastKept >= 0xd800 && lastKept <= 0xdbff) end -= 1;
  }
  return `${value.slice(0, end)}…`;
}
/**
 * Reads a finite number out of an untrusted value.
 *
 * Accepts real numbers and non-blank numeric strings (converted with
 * `Number`). Anything else, including NaN, ±Infinity and blank strings,
 * produces `undefined`.
 *
 * @param value Arbitrary value, typically a field of model-produced JSON.
 * @returns The finite number, or `undefined` when none can be derived.
 */
function safeNum(value: unknown): number | undefined {
  let candidate: number;

  switch (typeof value) {
    case "number":
      candidate = value;
      break;
    case "string":
      // Number("") and Number("   ") are 0, so blank strings are rejected up front.
      if (value.trim() === "") return undefined;
      candidate = Number(value);
      break;
    default:
      return undefined;
  }

  return Number.isFinite(candidate) ? candidate : undefined;
}
/**
 * Reads a non-blank string out of an untrusted value.
 *
 * @param value Arbitrary value, typically a field of parsed JSON.
 * @returns The string with surrounding whitespace removed, or `undefined`
 *   when `value` is not a string or contains only whitespace.
 */
function safeString(value: unknown): string | undefined {
  if (typeof value !== "string") return undefined;

  const cleaned = value.trim();
  return cleaned ? cleaned : undefined;
}
/**
 * Pulls the text of a JSON object out of a model reply.
 *
 * Three strategies are tried in order:
 *  1. the trimmed reply already starts with "{" and ends with "}";
 *  2. the reply contains a Markdown code fence (optionally tagged `json`) with
 *     non-empty content, in which case the first fence's content is used;
 *  3. the span from the first "{" to the last "}" in the reply.
 *
 * The result is not parsed here; callers are expected to `JSON.parse` it.
 *
 * @param text Raw text returned by the model.
 * @returns The candidate JSON text.
 * @throws Error when none of the strategies finds a candidate.
 */
function extractJsonObject(text: string): string {
  const body = text.trim();

  const isBareObject = body.startsWith("{") && body.endsWith("}");
  if (isBareObject) return body;

  const fenceMatch = /```(?:json)?\s*([\s\S]*?)\s*```/i.exec(body);
  const fencedPayload = fenceMatch?.[1];
  if (fencedPayload) return fencedPayload.trim();

  const open = body.indexOf("{");
  const close = body.lastIndexOf("}");
  if (open === -1 || close === -1 || close <= open) {
    throw new Error("Gemini response did not contain a JSON object");
  }
  return body.slice(open, close + 1);
}
/**
 * Converts an untrusted, model-produced value into a well-formed `ActionResult`.
 *
 * Only the fields relevant to the recognized action type are copied, each
 * passed through `safeNum` / `safeString`, with these defaults:
 *  - "type":     `text` falls back to "".
 *  - "scroll":   `scrollX` falls back to 0 and `scrollY` to 400.
 *  - "keypress": `key` falls back to "Enter".
 *  - "wait":     `ms` falls back to 1000.
 *  - "answer":   `answer` falls back to `rawText`, then to "".
 *
 * A missing or unrecognized `type` yields a fallback "answer" action whose
 * answer is `rawText` (or a fixed explanatory message when `rawText` is empty).
 *
 * @param input Parsed JSON from the model; any shape is tolerated.
 * @param rawText The model's raw reply, used as the fallback answer text.
 * @returns A normalized action.
 */
function normalizeAction(input: unknown, rawText?: string): ActionResult {
  const fields: Record<string, unknown> =
    input && typeof input === "object" ? (input as Record<string, unknown>) : {};

  const kind = safeString(fields.type) as ActionType | undefined;
  const description =
    safeString(fields.description) || "Execute next browser step";

  // Shared shape for both "nothing usable" outcomes; only the message differs.
  const fallback = (reason: string): ActionResult => ({
    type: "answer",
    description: "Fallback answer",
    answer: rawText || reason,
  });

  if (!kind) {
    return fallback("No valid action returned by model.");
  }

  // Pointer actions carry nothing but a target position.
  if (kind === "click" || kind === "hover") {
    return {
      type: kind,
      description,
      x: safeNum(fields.x),
      y: safeNum(fields.y),
    };
  }

  if (kind === "type") {
    return {
      type: kind,
      description,
      x: safeNum(fields.x),
      y: safeNum(fields.y),
      text: safeString(fields.text) || "",
    };
  }

  if (kind === "scroll") {
    return {
      type: kind,
      description,
      scrollX: safeNum(fields.scrollX) ?? 0,
      scrollY: safeNum(fields.scrollY) ?? 400,
    };
  }

  if (kind === "navigate") {
    return { type: kind, description, url: safeString(fields.url) };
  }

  if (kind === "keypress") {
    return { type: kind, description, key: safeString(fields.key) || "Enter" };
  }

  if (kind === "wait") {
    return { type: kind, description, ms: safeNum(fields.ms) ?? 1000 };
  }

  if (kind === "answer") {
    return {
      type: kind,
      description,
      answer: safeString(fields.answer) || rawText || "",
    };
  }

  // The cast above is unchecked, so the model may have sent a string outside ActionType.
  return fallback("Unsupported action type returned by model.");
}
/**
 * Renders the interactive-element list as plain text for the prompt.
 *
 * Each of the first 150 elements becomes one line of space-separated fields:
 * `[index] tag=… type=… box=(x,y,width,height) center=(cx,cy) text="…" href="…"`,
 * where `type`, `text` and `href` are omitted when falsy, text is truncated to
 * the default limit and href to 140 characters, and the center is the rounded
 * midpoint of the bounding box.
 *
 * @param elements Elements reported by the client.
 * @returns One line per element joined with newlines, or a fixed notice when
 *   the input is not a non-empty array.
 */
function buildElementSummary(elements: ClickableElement[]): string {
  const hasElements = Array.isArray(elements) && elements.length > 0;
  if (!hasElements) {
    return "No clickable elements detected.";
  }

  const describe = (el: ClickableElement, index: number): string => {
    const centerX = Math.round(el.x + el.width / 2);
    const centerY = Math.round(el.y + el.height / 2);

    const fields: string[] = [`[${index}]`, `tag=${el.tag}`];
    if (el.type) fields.push(`type=${el.type}`);
    fields.push(`box=(${el.x},${el.y},${el.width},${el.height})`);
    fields.push(`center=(${centerX},${centerY})`);
    if (el.text) fields.push(`text="${truncate(el.text)}"`);
    if (el.href) fields.push(`href="${truncate(el.href, 140)}"`);

    return fields.join(" ");
  };

  // Cap the list so a very busy page cannot blow up the prompt size.
  const visible = elements.slice(0, 150);
  return visible.map(describe).join("\n");
}
/**
 * Splits a client-supplied screenshot into a MIME type and bare base64 payload.
 *
 * Clients may send either raw base64 or a full `data:image/...;base64,` URL.
 * The Gemini `inline_data.data` field takes only the base64 payload, so a data
 * URL prefix is stripped and its MIME type reused. Raw base64 is passed
 * through unchanged and labelled "image/png", as before.
 */
function parseScreenshotPayload(screenshot: string): {
  mimeType: string;
  data: string;
} {
  const prefix = /^data:(image\/[a-z0-9.+-]+);base64,/i.exec(screenshot);
  const declaredMime = prefix?.[1];
  if (!prefix || !declaredMime) {
    return { mimeType: "image/png", data: screenshot };
  }
  return {
    mimeType: declaredMime.toLowerCase(),
    data: screenshot.slice(prefix[0].length).trim(),
  };
}

/**
 * POST /api/gemini - asks Gemini for the single next browser action.
 *
 * Expects a JSON body with:
 *  - `prompt` (string, required): the user's instruction;
 *  - `screenshot` (string, required): base64 image data, with or without a
 *    `data:image/...;base64,` prefix;
 *  - `clickableElements` (array, optional): elements visible on the page.
 *
 * Responses:
 *  - 200 `{ success, model, action, raw }` on success; a reply that cannot be
 *    parsed as an action is returned as a fallback "answer" action;
 *  - 400 when the body is not valid JSON or a required field is missing;
 *  - the upstream status when Gemini answers with a non-2xx response;
 *  - 504 when the upstream call times out;
 *  - 500 when the API key is not configured, Gemini returns no text, or any
 *    other error occurs.
 */
export async function POST(req: NextRequest) {
  try {
    const apiKey = process.env.GEMINI_API_KEY;
    if (!apiKey) {
      return NextResponse.json(
        {
          error:
            "Missing GEMINI_API_KEY. Add it in Hugging Face Space secrets.",
        },
        { status: 500 }
      );
    }

    // A malformed body is the caller's mistake, so report it as 400 rather
    // than letting the parse error fall through to the generic 500 handler.
    let body: unknown;
    try {
      body = await req.json();
    } catch {
      return NextResponse.json(
        { error: "Request body must be valid JSON" },
        { status: 400 }
      );
    }
    const fields: Record<string, unknown> =
      body !== null && typeof body === "object"
        ? (body as Record<string, unknown>)
        : {};

    const prompt = safeString(fields.prompt) || "";
    const screenshot = safeString(fields.screenshot);
    // Drop null / primitive entries: the summary builder dereferences every
    // element and would otherwise throw on them. Field types inside each
    // element are still trusted, as they were before.
    const clickableElements = Array.isArray(fields.clickableElements)
      ? (fields.clickableElements as unknown[]).filter(
          (el): el is ClickableElement => el !== null && typeof el === "object"
        )
      : [];

    if (!prompt) {
      return NextResponse.json(
        { error: "Prompt is required" },
        { status: 400 }
      );
    }
    if (!screenshot) {
      return NextResponse.json(
        { error: "Screenshot is required" },
        { status: 400 }
      );
    }

    const image = parseScreenshotPayload(screenshot);
    if (!image.data) {
      // A data URL with an empty payload is equivalent to no screenshot.
      return NextResponse.json(
        { error: "Screenshot is required" },
        { status: 400 }
      );
    }

    const elementSummary = buildElementSummary(clickableElements);
    const userInstruction = `
USER TASK:
${prompt}
INTERACTIVE ELEMENTS:
${elementSummary}
Remember:
- Return exactly one JSON object.
- If clicking or hovering, prefer the center coordinates.
- If answering a question, use type="answer".
`.trim();

    const geminiRequestBody = {
      contents: [
        {
          role: "user",
          parts: [
            { text: SYSTEM_PROMPT },
            { text: userInstruction },
            {
              inline_data: {
                mime_type: image.mimeType,
                data: image.data,
              },
            },
          ],
        },
      ],
      // NOTE(review): for gemini-2.5 models, internal "thinking" tokens may
      // count against maxOutputTokens and leave the visible reply empty;
      // confirm whether a thinking budget should be configured here.
      generationConfig: {
        temperature: 0.1,
        topP: 0.8,
        maxOutputTokens: 1024,
        responseMimeType: "application/json",
      },
    };

    // Authenticate with a header rather than a `?key=` query parameter so the
    // secret never becomes part of a URL that proxies or logs might record.
    const response = await fetch(GEMINI_ENDPOINT, {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        "x-goog-api-key": apiKey,
      },
      body: JSON.stringify(geminiRequestBody),
      signal: AbortSignal.timeout(30000),
    });

    if (!response.ok) {
      const errorText = await response.text();
      console.error("[/api/gemini] Gemini API error:", errorText);
      return NextResponse.json(
        {
          error: `Gemini API returned ${response.status}: ${errorText}`,
        },
        { status: response.status }
      );
    }

    const data = await response.json();
    // Concatenate the text of every part of every candidate into one string.
    const rawText =
      data?.candidates
        ?.flatMap(
          (candidate: { content?: { parts?: Array<{ text?: string }> } }) =>
            candidate?.content?.parts || []
        )
        ?.map((part: { text?: string }) => part.text || "")
        ?.join("")
        ?.trim() || "";

    if (!rawText) {
      return NextResponse.json(
        { error: "Gemini returned an empty response" },
        { status: 500 }
      );
    }

    let parsed: unknown;
    try {
      parsed = JSON.parse(extractJsonObject(rawText));
    } catch {
      // Not JSON after all: surface the model's text to the user as an answer.
      parsed = {
        type: "answer",
        description: "Fallback answer",
        answer: rawText,
      };
    }

    const action = normalizeAction(parsed, rawText);
    return NextResponse.json({
      success: true,
      model: GEMINI_MODEL,
      action,
      raw: rawText,
    });
  } catch (e: unknown) {
    const message = e instanceof Error ? e.message : "Unknown error";
    console.error("[/api/gemini]", message);
    // AbortSignal.timeout() aborts with an error named "TimeoutError"; report
    // it as a gateway timeout so callers can tell it apart from a server bug.
    const timedOut = e instanceof Error && e.name === "TimeoutError";
    return NextResponse.json(
      { error: message },
      { status: timedOut ? 504 : 500 }
    );
  }
}