Spaces:
Sleeping
Sleeping
| from fastapi import FastAPI, UploadFile, File, Form | |
| from fastapi.responses import JSONResponse | |
| from PIL import Image | |
| import io, os, json, re | |
| from google import genai | |
# ---------------- CONFIG ----------------
# ASGI application object for this service.
app = FastAPI()

# The Gemini credential is read from the GEMINI_API environment variable;
# it is None when the variable is not set.
API_KEY = os.environ.get("GEMINI_API")

# Gemini model identifier passed to every generate_content call.
MODEL = "gemini-2.5-flash"

# Build the SDK client only when a key is configured; otherwise leave it as
# None so request handlers can detect the missing configuration.
if API_KEY:
    client = genai.Client(api_key=API_KEY)
else:
    client = None
# ---------------- HELPERS ----------------
def clean_json(text: str) -> str:
    """Extract the JSON object embedded in a model reply.

    The reply may wrap the object in Markdown code fences or surround it
    with prose. The function first tries to decode exactly one complete
    JSON object starting at the first ``{``; this ignores any trailing text
    (even text that itself contains braces) and leaves backticks inside
    string values untouched. If that fails, it falls back to stripping
    fences and taking the span from the first ``{`` to the last ``}``.

    Args:
        text: Raw reply text; may be empty or ``None``.

    Returns:
        The extracted object as a string, or ``"{}"`` when the input is
        empty or contains no brace-delimited span. The fallback span is not
        guaranteed to be valid JSON; callers must still handle parse errors.
    """
    if not text:
        return "{}"

    # Preferred path: let the JSON decoder find where the object ends, so
    # braces in trailing prose cannot be swallowed into the result.
    start = text.find("{")
    if start != -1:
        try:
            _, end = json.JSONDecoder().raw_decode(text, start)
        except ValueError:
            # Not a complete object at this position; use the lenient path.
            pass
        else:
            return text[start:end]

    # Lenient path: drop code fences and grab the widest {...} span.
    stripped = text.replace("```json", "").replace("```", "").strip()
    match = re.search(r"\{.*\}", stripped, re.DOTALL)
    return match.group(0) if match else "{}"
# ---------------- ROUTES ----------------
def home():
    """Return a small status payload.

    The payload carries a fixed ``"status": "ok"`` plus ``has_key``, which
    is true when a non-empty API key was read into ``API_KEY``.
    """
    # NOTE(review): no route decorator is visible on this function in this
    # view of the file; confirm it is registered on `app`.
    key_configured = bool(API_KEY)
    return {"status": "ok", "has_key": key_configured}
def _build_prompt(user_request: str) -> str:
    """Return the agent instructions with the user's request embedded."""
    # json.dumps quotes the request so the model receives it as a single,
    # clearly delimited value regardless of the characters it contains.
    request_literal = json.dumps(user_request, ensure_ascii=False)
    return f"""
You are an Android UI automation agent.
INPUTS:
1) user_request — what the user wants
2) screenshot — current app screen
USER REQUEST:
{request_literal}
GOAL:
Return the NEXT UI step as JSON.
================ INTENT DETECTION =================
Decide intent FIRST.
INTENT RULES:
- If user_request mentions food, restaurant, dish, eat, Swiggy, Zomato → intent = order_food
- If user_request mentions ride, cab, bike, auto, Uber, Ola, Rapido → intent = book_ride
- Otherwise → intent = unknown
Allowed intents:
order_food | book_ride | browse | unknown
================ GENERAL RULES =================
- Return ONLY valid JSON
- NO explanation
- NO markdown
- Think ONE screen only
================ FOOD RULES =================
1) If restaurant name exists:
- Open restaurant using search
2) Search behavior:
- After food_type → DO NOT press enter
- ALWAYS click first visible result
- Use food_click_first_result
3) Menu:
- If item visible → click
- Else → scroll_down
4) Add:
- Click ADD / Add / + Add / Customize
5) Cart:
- If cart visible → open cart → done = true
FOOD STEPS:
food_open_search
food_type
food_click_first_result
food_open_restaurant
food_find_item
food_add_first
food_open_cart
================ RIDE RULES =================
1) If intent = book_ride:
- Look for pickup field → type pickup
- Look for drop field → type drop
2) Ride selection:
- If bike/auto/cab visible → select cheapest or requested
3) Confirm:
- If Book / Confirm button visible → click → done = true
RIDE STEPS (STRICT ORDER):
1) ride_open_drop_search
2) ride_set_drop
3) ride_select_suggestion
4) ride_select_vehicle
5) ride_confirm
CRITICAL RIDE RULE (STRICT):
- NEVER use ride_set_drop without ride_open_drop_search first
- Pickup and drop text are ALREADY known
- NEVER send pickup, drop, destination names
- NEVER include address, text, value, pickup, drop fields
- ONLY send step types
- Pickup and drop are already known from user_request
- NEVER infer pickup or drop from screenshot suggestions
- NEVER use recent places, railway stations, or history
- ONLY return UI steps, not new locations
- If suggestion list is visible:
→ type text
→ then select first suggestion
================ WAIT =================
- If screen loading → wait
- wait MUST include ms (1500–3000)
================ RESPONSE FORMAT =================
{{
  "done": false,
  "intent": "order_food | book_ride | browse | unknown",
  "steps": [
    {{
      "type": "wait",
      "ms": 1500
    }}
  ],
  "next": "short hint"
}}
"""


def _fallback_response() -> dict:
    """Return the wait-and-retry payload used when the reply is not valid JSON."""
    return {
        "done": False,
        "intent": "unknown",
        "steps": [{"type": "wait", "ms": 1500}],
        "next": "retry",
    }


async def next_step(
    user_request: str = Form(...),
    image: UploadFile = File(...)
):
    """Ask Gemini for the next UI automation step for one screenshot.

    Args:
        user_request: Free-text description of what the user wants; it is
            embedded in the prompt sent to the model.
        image: Uploaded screenshot of the current app screen.

    Returns:
        A ``JSONResponse`` containing the object parsed from the model
        reply. If the reply cannot be parsed as JSON, a fixed
        wait-and-retry payload is returned instead. If the Gemini client is
        not configured, or any other exception occurs, the response has
        status 500 and an ``error`` field.
    """
    # NOTE(review): no route decorator is visible on this function in this
    # view of the file; confirm it is registered on `app`.
    try:
        if not client:
            # The key is read from the GEMINI_API environment variable, so
            # the message names that variable.
            return JSONResponse(
                {"error": "GEMINI_API missing"},
                status_code=500
            )

        # Decode the upload and normalise it to RGB before sending it on.
        img_bytes = await image.read()
        screenshot = Image.open(io.BytesIO(img_bytes)).convert("RGB")

        # Use the SDK's async surface so the network round-trip does not
        # block the event loop while other requests are waiting.
        resp = await client.aio.models.generate_content(
            model=MODEL,
            contents=[_build_prompt(user_request), screenshot],
        )
        cleaned = clean_json(resp.text or "")

        # Only a malformed reply triggers the fallback; anything else is
        # handled by the outer handler below.
        try:
            data = json.loads(cleaned)
        except json.JSONDecodeError:
            data = _fallback_response()

        return JSONResponse(data)
    except Exception as e:
        # Top-level boundary: report the failure to the caller as a 500.
        return JSONResponse({"error": str(e)}, status_code=500)