# agent / main.py
# Shiva-teja-chary's picture
# Update main.py
# 40954af verified
from fastapi import FastAPI, UploadFile, File, Form
from fastapi.responses import JSONResponse
from PIL import Image
import io, os, json, re
from google import genai
# ---------------- CONFIG ----------------
# ASGI application object; the route decorators below register on it.
app = FastAPI()

# Gemini credentials are read from the GEMINI_API environment variable.
API_KEY = os.environ.get("GEMINI_API")

# Model identifier sent with every generate_content request.
MODEL = "gemini-2.5-flash"

# Build the client only when a key is present; handlers must check for None.
client = None
if API_KEY:
    client = genai.Client(api_key=API_KEY)
# ---------------- HELPERS ----------------
def clean_json(text: str) -> str:
    """Extract a JSON object literal from raw model output.

    Markdown code fences are removed first. The text is then scanned for the
    first ``{`` that starts a decodable JSON object, and exactly that
    substring is returned, so braces in surrounding prose do not corrupt the
    result.

    Args:
        text: Raw model output; may be empty or ``None``.

    Returns:
        The substring holding the first decodable JSON object. If none
        decodes, the span from the first ``{`` to the last ``}`` is returned
        (it may not be valid JSON). If the text has no such span, or is
        empty, ``"{}"`` is returned.
    """
    if not text:
        return "{}"

    t = text.replace("```json", "").replace("```", "").strip()

    # Try each "{" as the start of an object. raw_decode stops at the end of
    # the first complete value, so trailing text is ignored.
    decoder = json.JSONDecoder()
    start = t.find("{")
    while start != -1:
        try:
            _, length = decoder.raw_decode(t[start:])
        except ValueError:
            start = t.find("{", start + 1)
        else:
            return t[start:start + length]

    # Nothing decoded: fall back to the widest brace-delimited span so the
    # caller still receives the model's attempt.
    m = re.search(r"\{.*\}", t, re.DOTALL)
    return m.group(0) if m else "{}"
# ---------------- ROUTES ----------------
@app.get("/")
def home():
    """Report that the service is up and whether an API key was configured."""
    key_present = True if API_KEY else False
    return dict(status="ok", has_key=key_present)
def _build_prompt(user_request: str) -> str:
    """Return the model instructions followed by the user's request text."""
    return f"""
You are an Android UI automation agent.
INPUTS:
1) user_request – what the user wants
2) screenshot – current app screen
GOAL:
Return the NEXT UI step as JSON.
================ INTENT DETECTION =================
Decide intent FIRST.
INTENT RULES:
- If user_request mentions food, restaurant, dish, eat, Swiggy, Zomato → intent = order_food
- If user_request mentions ride, cab, bike, auto, Uber, Ola, Rapido → intent = book_ride
- Otherwise → intent = unknown
Allowed intents:
order_food | book_ride | browse | unknown
================ GENERAL RULES =================
- Return ONLY valid JSON
- NO explanation
- NO markdown
- Think ONE screen only
================ FOOD RULES =================
1) If restaurant name exists:
- Open restaurant using search
2) Search behavior:
- After food_type → DO NOT press enter
- ALWAYS click first visible result
- Use food_click_first_result
3) Menu:
- If item visible → click
- Else → scroll_down
4) Add:
- Click ADD / Add / + Add / Customize
5) Cart:
- If cart visible → open cart → done = true
FOOD STEPS:
food_open_search
food_type
food_click_first_result
food_open_restaurant
food_find_item
food_add_first
food_open_cart
================ RIDE RULES =================
1) If intent = book_ride:
- Look for pickup field → type pickup
- Look for drop field → type drop
2) Ride selection:
- If bike/auto/cab visible → select cheapest or requested
3) Confirm:
- If Book / Confirm button visible → click → done = true
RIDE STEPS (STRICT ORDER):
1) ride_open_drop_search
2) ride_set_drop
3) ride_select_suggestion
4) ride_select_vehicle
5) ride_confirm
CRITICAL RIDE RULE (STRICT):
- NEVER use ride_set_drop without ride_open_drop_search first
- Pickup and drop text are ALREADY known
- NEVER send pickup, drop, destination names
- NEVER include address, text, value, pickup, drop fields
- ONLY send step types
- Pickup and drop are already known from user_request
- NEVER infer pickup or drop from screenshot suggestions
- NEVER use recent places, railway stations, or history
- ONLY return UI steps, not new locations
- If suggestion list is visible:
→ type text
→ then select first suggestion
================ WAIT =================
- If screen loading → wait
- wait MUST include ms (1500–3000)
================ RESPONSE FORMAT =================
{{
"done": false,
"intent": "order_food | book_ride | browse | unknown",
"steps": [
{{
"type": "wait",
"ms": 1500
}}
],
"next": "short hint"
}}
================ USER REQUEST =================
user_request: {user_request}
"""


@app.post("/next_step")
async def next_step(
    user_request: str = Form(...),
    image: UploadFile = File(...)
):
    """Ask the model for the next UI automation step for a screenshot.

    Args:
        user_request: Form field with the user's request. It is embedded in
            the prompt so the model can see it.
        image: Uploaded screenshot; decoded with Pillow and converted to RGB.

    Returns:
        A ``JSONResponse`` carrying the JSON object parsed from the model's
        reply. If the reply cannot be parsed, a fallback "wait 1500 ms" step
        is returned. Errors are reported as ``{"error": ...}`` with status
        500 (no client configured, or any unexpected exception) or 400 (the
        upload could not be decoded as an image).
    """
    # Function-scope import keeps this block self-contained.
    import asyncio

    try:
        if not client:
            # The key is read from the GEMINI_API environment variable, so
            # the message names that variable.
            return JSONResponse(
                {"error": "GEMINI_API environment variable missing"},
                status_code=500
            )

        # Read and decode the image; a bad upload is a client error, not 500.
        img_bytes = await image.read()
        try:
            screenshot = Image.open(io.BytesIO(img_bytes)).convert("RGB")
        except (OSError, ValueError):
            return JSONResponse(
                {"error": "image could not be decoded"},
                status_code=400
            )

        prompt = _build_prompt(user_request)

        # generate_content is a blocking call; run it in a worker thread so
        # the event loop can serve other requests while waiting.
        resp = await asyncio.to_thread(
            client.models.generate_content,
            model=MODEL,
            contents=[prompt, screenshot],
        )
        raw = resp.text or ""
        cleaned = clean_json(raw)

        # Only parse failures trigger the fallback step; anything else goes
        # to the outer handler.
        try:
            data = json.loads(cleaned)
        except (ValueError, RecursionError):
            data = {
                "done": False,
                "intent": "unknown",
                "steps": [{"type": "wait", "ms": 1500}],
                "next": "retry"
            }
        return JSONResponse(data)
    except Exception as e:
        # Last-resort boundary: report the failure as JSON.
        return JSONResponse({"error": str(e)}, status_code=500)