Spaces:
Sleeping
Sleeping
| from fastapi import FastAPI, UploadFile, File, Form | |
| from fastapi.responses import JSONResponse | |
| from PIL import Image | |
| import io, os, json, re | |
| from google import genai | |
| # ---------------- CONFIG ---------------- | |
# FastAPI application object for this service.
app = FastAPI()

# Gemini API key, read from the environment at import time.
# GEMINI_API is the variable this service has always read and still takes
# precedence. GEMINI_API_KEY is accepted as a fallback because the service's
# own "missing key" error names that variable, so operators following the
# error text would otherwise set a name that was never read.
API_KEY = os.getenv("GEMINI_API") or os.getenv("GEMINI_API_KEY")

# Identifier of the Gemini model used for every request.
MODEL = "gemini-2.5-flash"

# Shared Gemini client; stays None when no key is configured so request
# handlers can report the misconfiguration instead of failing at import.
client = genai.Client(api_key=API_KEY) if API_KEY else None
| # ---------------- HELPERS ---------------- | |
def clean_json(text: str) -> str:
    """Extract the text of a JSON object from a raw model reply.

    Markdown code fences are removed first. The reply is then scanned for
    the first ``{`` from which a complete JSON object can be decoded, and
    exactly that object's text is returned, so prose (even prose containing
    braces) before or after the object no longer corrupts the result.

    Args:
        text: Raw reply text; may be empty or ``None``.

    Returns:
        The text of the first decodable JSON object. If none decodes, the
        widest ``{...}`` span is returned as a best effort (the caller's
        ``json.loads`` will reject it), or ``"{}"`` when the reply contains
        no braces at all or is empty.
    """
    if not text:
        return "{}"

    stripped = text.replace("```json", "").replace("```", "").strip()

    # Try each "{" in turn: raw_decode stops at the end of the first complete
    # value, so trailing commentary after the object is ignored.
    decoder = json.JSONDecoder()
    start = stripped.find("{")
    while start != -1:
        try:
            _, end = decoder.raw_decode(stripped, start)
        except ValueError:
            start = stripped.find("{", start + 1)
        else:
            return stripped[start:end]

    # Nothing decodable: keep the historical greedy match as a last resort.
    match = re.search(r"\{.*\}", stripped, re.DOTALL)
    return match.group(0) if match else "{}"
def safe_fallback():
    """Build the default plan returned when the model reply is unusable.

    Returns:
        A freshly built dict describing a single 1500 ms ``wait`` step with
        ``done`` false, intent ``"unknown"``, confidence 0.3, no vision tap
        and a ``"retry"`` hint. A new dict (and step list) is created on
        every call.
    """
    pause_step = {"type": "wait", "ms": 1500}
    plan = dict(done=False, intent="unknown", confidence=0.3, vision_tap=None)
    plan["steps"] = [pause_step]
    plan["next"] = "retry"
    return plan
| # ---------------- ROUTES ---------------- | |
# NOTE(review): no route decorator is visible on this handler in this view;
# confirm it is registered with the FastAPI app.
def home():
    """Return a status payload that also reports whether an API key is set.

    Returns:
        A dict with ``"status"`` fixed to ``"ok"`` and ``"has_key"`` set to
        the truthiness of the module-level ``API_KEY``.
    """
    key_configured = bool(API_KEY)
    return {"status": "ok", "has_key": key_configured}
# NOTE(review): no route decorator is visible on this handler in this view;
# confirm it is registered with the FastAPI app.
async def next_step(
    user_request: str = Form(...),
    image: UploadFile = File(...)
):
    """Ask Gemini for the next UI automation step for a screenshot.

    The uploaded image is decoded to an RGB PIL image and sent to the model
    together with a prompt built from ``user_request``. The reply is reduced
    to JSON text with ``clean_json``, parsed and validated.

    Args:
        user_request: The user's request, taken from the form field.
        image: The uploaded screenshot file.

    Returns:
        A ``JSONResponse`` containing one of:
        * the validated plan from the model (confidence clamped to [0, 1],
          ``vision_tap`` always present);
        * the ``safe_fallback()`` plan when the reply cannot be parsed or has
          no non-empty ``steps`` list;
        * ``{"error": ...}`` with status 500 when no client is configured or
          any other exception occurs.
    """
    try:
        if not client:
            return JSONResponse(
                {"error": "GEMINI_API_KEY missing"},
                status_code=500
            )

        # -------- read image --------
        img_bytes = await image.read()
        screenshot = Image.open(io.BytesIO(img_bytes)).convert("RGB")

        prompt = _build_prompt(user_request)

        # ---------------- GEMINI CALL ----------------
        # Use the SDK's async client so the network round trip does not
        # block the event loop while this coroutine waits for the model.
        resp = await client.aio.models.generate_content(
            model=MODEL,
            contents=[prompt, screenshot]
        )
        cleaned = clean_json(resp.text or "")

        # ---------------- VALIDATE ----------------
        try:
            data = json.loads(cleaned)
        except (ValueError, RecursionError):
            return JSONResponse(safe_fallback())

        return JSONResponse(_validate_plan(data))
    except Exception as e:
        # Top-level boundary: report any failure as a 500 with its message.
        return JSONResponse({"error": str(e)}, status_code=500)


def _build_prompt(user_request: str) -> str:
    """Return the instruction prompt with ``user_request`` interpolated."""
    # The separators, bullets, arrows and range dashes are plain ASCII; the
    # previous text carried garbled non-ASCII glyphs in those positions.
    return f"""
You are an expert Android UI automation agent.
Your job: return the NEXT BEST UI action.
USER REQUEST:
{user_request}
--------------------------
SUPPORTED STEP TYPES (STRICT)
--------------------------
You MUST use ONLY these:
click_text
click_desc
click_id
click_any
type_text
scroll_down
wait
FOOD:
food_open_search
food_type
food_click_first_result
food_open_restaurant
food_find_item
food_add_first
food_open_cart
RIDE:
ride_open_drop_search
ride_set_drop
ride_select_suggestion
ride_select_vehicle
ride_confirm
--------------------------
CRITICAL RULES
--------------------------
- Only one logical next step
- Only visible UI
- Never hallucinate
- Prefer click_text first
- Prefer click_desc second
- Use click_any if unsure
- Use wait if loading
- Use scroll_down if item missing
--------------------------
SELF-REFLECTION (VERY IMPORTANT)
--------------------------
Before answering, check:
1. Is the element clearly visible?
2. Am I repeating the same step?
3. Is the app still loading?
4. Is there a simpler action?
5. Am I guessing?
If unsure -> use wait or scroll_down.
ANTI-LOOP:
If previous action likely failed,
DO NOT repeat same click again.
--------------------------
CONFIDENCE RULE
--------------------------
Return confidence (0-1).
If confidence < 0.7:
-> prefer wait
-> or scroll_down
--------------------------
VISION TAP (ONLY WHEN NEEDED)
--------------------------
If button is visible but has NO text:
Return:
"vision_tap": {{
"x": 0.0-1.0,
"y": 0.0-1.0
}}
Otherwise vision_tap = null.
--------------------------
FOOD FLOW
--------------------------
Order:
1) open search
2) type restaurant
3) click first result
4) click food or find item
5) add
6) open cart -> done=true
--------------------------
RIDE FLOW (STRICT ORDER)
--------------------------
1) ride_open_drop_search
2) ride_set_drop
3) ride_select_suggestion
4) ride_select_vehicle
5) ride_confirm
NEVER send pickup/drop text.
--------------------------
WAIT RULE
--------------------------
Use wait if:
- spinner
- shimmer
- loading
- transition
- keyboard opening
Range: 1200-2500 ms
--------------------------
OUTPUT FORMAT (STRICT JSON)
--------------------------
{{
"done": false,
"intent": "order_food | book_ride | browse | unknown",
"confidence": 0.0,
"vision_tap": null,
"steps": [
{{
"type": "click_text",
"value": "Search"
}}
],
"next": "short hint"
}}
"""


def _validate_plan(data):
    """Return ``data`` normalised for the response, or the fallback plan.

    The fallback from ``safe_fallback()`` is returned when ``data`` is not a
    dict or lacks a non-empty ``steps`` list. Otherwise ``confidence`` is
    coerced to a float clamped to [0, 1] (0.5 when it cannot be converted or
    is NaN) and ``vision_tap`` defaults to ``None``.
    """
    import math

    # -------- HARD SAFETY --------
    if not isinstance(data, dict):
        return safe_fallback()
    steps = data.get("steps")
    if not isinstance(steps, list) or not steps:
        return safe_fallback()

    # ensure confidence
    try:
        conf = float(data.get("confidence", 0.5))
    except (TypeError, ValueError, OverflowError):
        conf = 0.5
    # NaN compares false against everything, so clamping alone would turn it
    # into 1.0; treat it like any other unusable value instead.
    if math.isnan(conf):
        conf = 0.5
    data["confidence"] = max(0.0, min(1.0, conf))

    # ensure vision_tap exists
    data.setdefault("vision_tap", None)
    return data