arudradey committed on
Commit
c77a840
·
verified ·
1 Parent(s): 2377fa6

Update src/app/api/gemini/route.ts

Browse files
Files changed (1) hide show
  1. src/app/api/gemini/route.ts +299 -87
src/app/api/gemini/route.ts CHANGED
@@ -1,54 +1,235 @@
1
  import { NextRequest, NextResponse } from "next/server";
2
 
3
- // ─── Gemini API config ────────────────────────────────────────────────────────
4
- // Base URL: https://generativelanguage.googleapis.com
5
- // Endpoint: /v1beta/models/{model}:generateContent
6
- // Model: gemini-2.5-flash (multimodal: vision + text)
7
- // API Key: Set GEMINI_API_KEY in HuggingFace Space → Settings → Repository secrets
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  const GEMINI_BASE_URL = "https://generativelanguage.googleapis.com";
9
  const GEMINI_MODEL = "gemini-2.5-flash";
10
- const GEMINI_ENDPOINT = `/v1beta/models/${GEMINI_MODEL}:generateContent`;
 
 
 
11
 
12
- const SYSTEM_PROMPT = `You are an agentic browser AI assistant. You control a real Chromium browser.
13
- You are given:
14
- 1. A screenshot of the current browser viewport (PNG, base64)
15
- 2. A list of all detected clickable/interactive elements with their tag, text, and pixel coordinates (x, y, width, height)
16
- 3. A user instruction/prompt
17
 
18
- Your job is to decide the SINGLE BEST next action to take.
 
19
 
20
- Respond with ONLY valid JSON in this exact schema (no markdown, no explanation):
21
  {
22
  "type": "click" | "type" | "scroll" | "navigate" | "keypress" | "hover" | "answer" | "wait",
23
- "description": "Human-readable description of what you are doing",
24
- "x": <number, pixel x for click/hover/type>,
25
- "y": <number, pixel y for click/hover/type>,
26
- "text": "<string, text to type if type action>",
27
- "key": "<string, key name if keypress, e.g. Enter, Tab, ArrowDown, Escape>",
28
- "url": "<string, full URL if navigate action>",
29
- "scrollX": <number, horizontal scroll delta in pixels>,
30
- "scrollY": <number, vertical scroll delta in pixels, positive = down>,
31
- "answer": "<string, your answer if the user asked a question about the page>",
32
- "ms": <number, milliseconds to wait if wait action>
33
  }
34
 
35
  Rules:
36
- - Use pixel coordinates from the provided element list for clicks - be precise.
37
- - If the user asks a question about the page, use type="answer" and put your response in "answer".
38
- - Only include keys relevant to the chosen action type.
39
- - For scroll, typical scrollY is 300-600 pixels.
40
- - Do NOT use WebSockets, VNC, or any streaming protocol.
41
- - Prioritize visible, accessible elements from the element list.`;
42
-
43
- interface ClickableElement {
44
- tag: string;
45
- text: string;
46
- x: number;
47
- y: number;
48
- width: number;
49
- height: number;
50
- type?: string;
51
- href?: string;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
  }
53
 
54
  export async function POST(req: NextRequest) {
@@ -58,103 +239,134 @@ export async function POST(req: NextRequest) {
58
  return NextResponse.json(
59
  {
60
  error:
61
- "GEMINI_API_KEY is not set. Add it in HuggingFace Space → Settings → Repository secrets.",
62
  },
63
  { status: 500 }
64
  );
65
  }
66
 
67
- const { prompt, screenshot, clickableElements } = await req.json();
 
 
 
 
 
 
 
 
 
 
 
 
68
 
69
  if (!screenshot) {
70
  return NextResponse.json(
71
- { error: "No screenshot provided" },
72
  { status: 400 }
73
  );
74
  }
75
 
76
- // Format clickable elements as compact text summary for the model
77
- const elementSummary = (clickableElements as ClickableElement[])
78
- .slice(0, 100)
79
- .map(
80
- (el, i) =>
81
- `[${i}] ${el.tag.toUpperCase()}${el.type ? `[${el.type}]` : ""} @ (${el.x},${el.y}) ${el.width}×${el.height}px — "${el.text || el.href || "(no text)"}"`
82
- )
83
- .join("\n");
84
 
85
- const userText = `USER INSTRUCTION: ${prompt}
 
 
86
 
87
- DETECTED INTERACTIVE ELEMENTS (use these coordinates for clicks):
88
- ${elementSummary || "None detected"}
89
 
90
- Analyze the screenshot and return the best action JSON.`;
 
 
 
 
91
 
92
- const body = {
93
- system_instruction: {
94
- parts: [{ text: SYSTEM_PROMPT }],
95
- },
96
  contents: [
97
  {
98
  role: "user",
99
  parts: [
 
 
100
  {
101
  inline_data: {
102
  mime_type: "image/png",
103
  data: screenshot,
104
  },
105
  },
106
- { text: userText },
107
  ],
108
  },
109
  ],
110
  generationConfig: {
111
- temperature: 0.2,
112
  topP: 0.8,
113
- maxOutputTokens: 512,
114
  responseMimeType: "application/json",
115
  },
116
  };
117
 
118
- const response = await fetch(
119
- `${GEMINI_BASE_URL}${GEMINI_ENDPOINT}?key=${apiKey}`,
120
- {
121
- method: "POST",
122
- headers: { "Content-Type": "application/json" },
123
- body: JSON.stringify(body),
124
- signal: AbortSignal.timeout(30000),
125
- }
126
- );
127
 
128
  if (!response.ok) {
129
- const errText = await response.text();
130
- console.error("[Gemini] API error:", errText);
 
131
  return NextResponse.json(
132
- { error: `Gemini API returned ${response.status}: ${errText}` },
 
 
133
  { status: response.status }
134
  );
135
  }
136
 
137
  const data = await response.json();
 
138
  const rawText =
139
- data.candidates?.[0]?.content?.parts?.[0]?.text || "";
 
 
 
 
 
 
140
 
141
- let action;
 
 
 
 
 
 
 
142
  try {
143
- action = JSON.parse(rawText);
144
  } catch {
145
- // Fallback: try extracting JSON from markdown code block
146
- const match = rawText.match(/```(?:json)?\s*([\s\S]*?)```/);
147
- if (match) {
148
- action = JSON.parse(match[1]);
149
- } else {
150
- throw new Error(`Could not parse Gemini response as JSON: ${rawText}`);
151
- }
152
  }
153
 
154
- return NextResponse.json({ action });
 
 
 
 
 
 
 
155
  } catch (e: unknown) {
156
- const msg = e instanceof Error ? e.message : "Unknown error";
157
- console.error("[/api/gemini] Error:", msg);
158
- return NextResponse.json({ error: msg }, { status: 500 });
 
159
  }
160
  }
 
1
  import { NextRequest, NextResponse } from "next/server";
2
 
3
+ export const runtime = "nodejs";
4
+ export const dynamic = "force-dynamic";
5
+
6
+ type ClickableElement = {
7
+ tag: string;
8
+ text: string;
9
+ x: number;
10
+ y: number;
11
+ width: number;
12
+ height: number;
13
+ type?: string;
14
+ href?: string;
15
+ };
16
+
17
+ type ActionType =
18
+ | "click"
19
+ | "type"
20
+ | "scroll"
21
+ | "navigate"
22
+ | "keypress"
23
+ | "hover"
24
+ | "answer"
25
+ | "wait";
26
+
27
+ type ActionResult = {
28
+ type: ActionType;
29
+ description: string;
30
+ x?: number;
31
+ y?: number;
32
+ text?: string;
33
+ key?: string;
34
+ url?: string;
35
+ scrollX?: number;
36
+ scrollY?: number;
37
+ answer?: string;
38
+ ms?: number;
39
+ };
40
+
41
  const GEMINI_BASE_URL = "https://generativelanguage.googleapis.com";
42
  const GEMINI_MODEL = "gemini-2.5-flash";
43
+ const GEMINI_ENDPOINT = `${GEMINI_BASE_URL}/v1beta/models/${GEMINI_MODEL}:generateContent`;
44
+
45
+ const SYSTEM_PROMPT = `
46
+ You are an agentic browser controller.
47
 
48
+ You receive:
49
+ 1. The user's instruction
50
+ 2. A current browser screenshot
51
+ 3. A list of visible interactive elements with bounding boxes and center coordinates
 
52
 
53
+ Your task:
54
+ Return the SINGLE best next action as JSON only.
55
 
56
+ Allowed action schema:
57
  {
58
  "type": "click" | "type" | "scroll" | "navigate" | "keypress" | "hover" | "answer" | "wait",
59
+ "description": "short human-readable description",
60
+ "x": number,
61
+ "y": number,
62
+ "text": string,
63
+ "key": string,
64
+ "url": string,
65
+ "scrollX": number,
66
+ "scrollY": number,
67
+ "answer": string,
68
+ "ms": number
69
  }
70
 
71
  Rules:
72
+ - Output ONLY valid JSON. No markdown. No explanation.
73
+ - Use exactly one action.
74
+ - For click/hover/type actions, prefer the provided center coordinates.
75
+ - If the user is asking a question about the current page, use type="answer".
76
+ - If text must be entered into an input, prefer type="type" with x, y, and text.
77
+ - If the page likely needs more content to appear, use type="scroll" or type="wait".
78
+ - If you do not need to interact and can directly answer from the screenshot/page context, use type="answer".
79
+ - Keep descriptions short and clear.
80
+ - Never invent invisible elements if the element list already gives a better target.
81
+ `.trim();
82
+
83
+ function truncate(value: string, max = 120): string {
84
+ return value.length > max ? `${value.slice(0, max - 1)}…` : value;
85
+ }
86
+
87
+ function safeNum(value: unknown): number | undefined {
88
+ if (typeof value === "number" && Number.isFinite(value)) return value;
89
+ if (typeof value === "string" && value.trim() !== "") {
90
+ const parsed = Number(value);
91
+ if (Number.isFinite(parsed)) return parsed;
92
+ }
93
+ return undefined;
94
+ }
95
+
96
+ function safeString(value: unknown): string | undefined {
97
+ return typeof value === "string" && value.trim() ? value.trim() : undefined;
98
+ }
99
+
100
+ function extractJsonObject(text: string): string {
101
+ const trimmed = text.trim();
102
+
103
+ if (trimmed.startsWith("{") && trimmed.endsWith("}")) {
104
+ return trimmed;
105
+ }
106
+
107
+ const fenced = trimmed.match(/```(?:json)?\s*([\s\S]*?)\s*```/i);
108
+ if (fenced?.[1]) {
109
+ return fenced[1].trim();
110
+ }
111
+
112
+ const firstBrace = trimmed.indexOf("{");
113
+ const lastBrace = trimmed.lastIndexOf("}");
114
+ if (firstBrace !== -1 && lastBrace !== -1 && lastBrace > firstBrace) {
115
+ return trimmed.slice(firstBrace, lastBrace + 1);
116
+ }
117
+
118
+ throw new Error("Gemini response did not contain a JSON object");
119
+ }
120
+
121
+ function normalizeAction(input: unknown, rawText?: string): ActionResult {
122
+ const obj = (input && typeof input === "object" ? input : {}) as Record<
123
+ string,
124
+ unknown
125
+ >;
126
+
127
+ const type = safeString(obj.type) as ActionType | undefined;
128
+ const description = safeString(obj.description) || "Execute next browser step";
129
+
130
+ if (!type) {
131
+ return {
132
+ type: "answer",
133
+ description: "Fallback answer",
134
+ answer: rawText || "No valid action returned by model.",
135
+ };
136
+ }
137
+
138
+ switch (type) {
139
+ case "click":
140
+ return {
141
+ type,
142
+ description,
143
+ x: safeNum(obj.x),
144
+ y: safeNum(obj.y),
145
+ };
146
+
147
+ case "type":
148
+ return {
149
+ type,
150
+ description,
151
+ x: safeNum(obj.x),
152
+ y: safeNum(obj.y),
153
+ text: safeString(obj.text) || "",
154
+ };
155
+
156
+ case "scroll":
157
+ return {
158
+ type,
159
+ description,
160
+ scrollX: safeNum(obj.scrollX) ?? 0,
161
+ scrollY: safeNum(obj.scrollY) ?? 400,
162
+ };
163
+
164
+ case "navigate":
165
+ return {
166
+ type,
167
+ description,
168
+ url: safeString(obj.url),
169
+ };
170
+
171
+ case "keypress":
172
+ return {
173
+ type,
174
+ description,
175
+ key: safeString(obj.key) || "Enter",
176
+ };
177
+
178
+ case "hover":
179
+ return {
180
+ type,
181
+ description,
182
+ x: safeNum(obj.x),
183
+ y: safeNum(obj.y),
184
+ };
185
+
186
+ case "wait":
187
+ return {
188
+ type,
189
+ description,
190
+ ms: safeNum(obj.ms) ?? 1000,
191
+ };
192
+
193
+ case "answer":
194
+ return {
195
+ type,
196
+ description,
197
+ answer: safeString(obj.answer) || rawText || "",
198
+ };
199
+
200
+ default:
201
+ return {
202
+ type: "answer",
203
+ description: "Fallback answer",
204
+ answer: rawText || "Unsupported action type returned by model.",
205
+ };
206
+ }
207
+ }
208
+
209
+ function buildElementSummary(elements: ClickableElement[]): string {
210
+ if (!Array.isArray(elements) || elements.length === 0) {
211
+ return "No clickable elements detected.";
212
+ }
213
+
214
+ return elements
215
+ .slice(0, 150)
216
+ .map((el, index) => {
217
+ const centerX = Math.round(el.x + el.width / 2);
218
+ const centerY = Math.round(el.y + el.height / 2);
219
+
220
+ return [
221
+ `[${index}]`,
222
+ `tag=${el.tag}`,
223
+ el.type ? `type=${el.type}` : null,
224
+ `box=(${el.x},${el.y},${el.width},${el.height})`,
225
+ `center=(${centerX},${centerY})`,
226
+ el.text ? `text="${truncate(el.text)}"` : null,
227
+ el.href ? `href="${truncate(el.href, 140)}"` : null,
228
+ ]
229
+ .filter(Boolean)
230
+ .join(" ");
231
+ })
232
+ .join("\n");
233
  }
234
 
235
  export async function POST(req: NextRequest) {
 
239
  return NextResponse.json(
240
  {
241
  error:
242
+ "Missing GEMINI_API_KEY. Add it in Hugging Face Space secrets.",
243
  },
244
  { status: 500 }
245
  );
246
  }
247
 
248
+ const body = await req.json();
249
+ const prompt = safeString(body?.prompt) || "";
250
+ const screenshot = safeString(body?.screenshot);
251
+ const clickableElements = Array.isArray(body?.clickableElements)
252
+ ? (body.clickableElements as ClickableElement[])
253
+ : [];
254
+
255
+ if (!prompt) {
256
+ return NextResponse.json(
257
+ { error: "Prompt is required" },
258
+ { status: 400 }
259
+ );
260
+ }
261
 
262
  if (!screenshot) {
263
  return NextResponse.json(
264
+ { error: "Screenshot is required" },
265
  { status: 400 }
266
  );
267
  }
268
 
269
+ const elementSummary = buildElementSummary(clickableElements);
 
 
 
 
 
 
 
270
 
271
+ const userInstruction = `
272
+ USER TASK:
273
+ ${prompt}
274
 
275
+ INTERACTIVE ELEMENTS:
276
+ ${elementSummary}
277
 
278
+ Remember:
279
+ - Return exactly one JSON object.
280
+ - If clicking or hovering, prefer the center coordinates.
281
+ - If answering a question, use type="answer".
282
+ `.trim();
283
 
284
+ const geminiRequestBody = {
 
 
 
285
  contents: [
286
  {
287
  role: "user",
288
  parts: [
289
+ { text: SYSTEM_PROMPT },
290
+ { text: userInstruction },
291
  {
292
  inline_data: {
293
  mime_type: "image/png",
294
  data: screenshot,
295
  },
296
  },
 
297
  ],
298
  },
299
  ],
300
  generationConfig: {
301
+ temperature: 0.1,
302
  topP: 0.8,
303
+ maxOutputTokens: 1024,
304
  responseMimeType: "application/json",
305
  },
306
  };
307
 
308
+ const response = await fetch(`${GEMINI_ENDPOINT}?key=${apiKey}`, {
309
+ method: "POST",
310
+ headers: {
311
+ "Content-Type": "application/json",
312
+ },
313
+ body: JSON.stringify(geminiRequestBody),
314
+ signal: AbortSignal.timeout(30000),
315
+ });
 
316
 
317
  if (!response.ok) {
318
+ const errorText = await response.text();
319
+ console.error("[/api/gemini] Gemini API error:", errorText);
320
+
321
  return NextResponse.json(
322
+ {
323
+ error: `Gemini API returned ${response.status}: ${errorText}`,
324
+ },
325
  { status: response.status }
326
  );
327
  }
328
 
329
  const data = await response.json();
330
+
331
  const rawText =
332
+ data?.candidates
333
+ ?.flatMap((candidate: { content?: { parts?: Array<{ text?: string }> } }) =>
334
+ candidate?.content?.parts || []
335
+ )
336
+ ?.map((part: { text?: string }) => part.text || "")
337
+ ?.join("")
338
+ ?.trim() || "";
339
 
340
+ if (!rawText) {
341
+ return NextResponse.json(
342
+ { error: "Gemini returned an empty response" },
343
+ { status: 500 }
344
+ );
345
+ }
346
+
347
+ let parsed: unknown;
348
  try {
349
+ parsed = JSON.parse(extractJsonObject(rawText));
350
  } catch {
351
+ parsed = {
352
+ type: "answer",
353
+ description: "Fallback answer",
354
+ answer: rawText,
355
+ };
 
 
356
  }
357
 
358
+ const action = normalizeAction(parsed, rawText);
359
+
360
+ return NextResponse.json({
361
+ success: true,
362
+ model: GEMINI_MODEL,
363
+ action,
364
+ raw: rawText,
365
+ });
366
  } catch (e: unknown) {
367
+ const message = e instanceof Error ? e.message : "Unknown error";
368
+ console.error("[/api/gemini]", message);
369
+
370
+ return NextResponse.json({ error: message }, { status: 500 });
371
  }
372
  }