agent / app.py
Shiva-teja-chary's picture
Update app.py
466c81d verified
from fastapi import FastAPI, UploadFile, File, Form
from fastapi.responses import JSONResponse
from PIL import Image
import io, os, json, re
from google import genai
# ---------------- CONFIG ----------------
# ASGI application object that the route decorators in this file attach to.
app = FastAPI()

# Gemini credential, read from the GEMINI_API environment variable.
API_KEY = os.environ.get("GEMINI_API")

# Model identifier passed on every generate_content call.
MODEL = "gemini-2.5-flash"

# No client is built without a key; request handlers check for None.
if API_KEY:
    client = genai.Client(api_key=API_KEY)
else:
    client = None
# ---------------- HELPERS ----------------
def clean_json(text: str) -> str:
    """Extract the JSON object embedded in a model reply.

    Markdown code fences are removed, then the text is scanned for the
    first ``{`` that starts a syntactically valid JSON object and exactly
    that object's text is returned. Scanning this way tolerates braces in
    prose before or after the payload, which a single greedy
    first-``{``-to-last-``}`` match cannot.

    Args:
        text: Raw model output; may be empty or ``None``.

    Returns:
        The text of the first decodable JSON object. If none decodes, the
        greedy brace-to-brace span is returned (the previous behaviour),
        or ``"{}"`` when the text holds no such span.
    """
    if not text:
        return "{}"

    stripped = text.replace("```json", "").replace("```", "").strip()

    decoder = json.JSONDecoder()
    start = stripped.find("{")
    while start != -1:
        try:
            _, end = decoder.raw_decode(stripped, start)
        except (ValueError, RecursionError):
            # Not a valid object at this brace; try the next candidate.
            start = stripped.find("{", start + 1)
        else:
            return stripped[start:end]

    # Nothing decodable: keep the legacy result so callers see the same
    # string they always did for malformed replies.
    greedy = re.search(r"\{.*\}", stripped, re.DOTALL)
    return greedy.group(0) if greedy else "{}"
def safe_fallback():
    """Build the default plan returned when the model reply is unusable.

    The plan is not done, has unknown intent and low confidence, carries
    no vision tap, and holds a single 1500 ms wait step with a "retry"
    hint. A new dict (and a new steps list) is created on every call.
    """
    pause_step = {"type": "wait", "ms": 1500}
    plan = dict(
        done=False,
        intent="unknown",
        confidence=0.3,
        vision_tap=None,
        steps=[pause_step],
        next="retry",
    )
    return plan
# ---------------- ROUTES ----------------
@app.get("/")
def home():
    """Report service status and whether an API key was configured."""
    status = {"status": "ok"}
    status["has_key"] = bool(API_KEY)
    return status
def _build_prompt(user_request: str) -> str:
    """Return the instruction prompt sent to the model with the screenshot."""
    # NOTE(review): sequences such as "βœ…" and "β†’" below look like
    # mis-decoded UTF-8 symbols. They are kept byte-for-byte because this
    # text is sent to the model; confirm against the intended prompt.
    return f"""
You are an expert Android UI automation agent.
Your job: return the NEXT BEST UI action.
USER REQUEST:
{user_request}
━━━━━━━━━━━━━━━━━━━━━━━━━━
SUPPORTED STEP TYPES (STRICT)
━━━━━━━━━━━━━━━━━━━━━━━━━━
You MUST use ONLY these:
click_text
click_desc
click_id
click_any
type_text
scroll_down
wait
FOOD:
food_open_search
food_type
food_click_first_result
food_open_restaurant
food_find_item
food_add_first
food_open_cart
RIDE:
ride_open_drop_search
ride_set_drop
ride_select_suggestion
ride_select_vehicle
ride_confirm
━━━━━━━━━━━━━━━━━━━━━━━━━━
CRITICAL RULES
━━━━━━━━━━━━━━━━━━━━━━━━━━
βœ… Only one logical next step
βœ… Only visible UI
βœ… Never hallucinate
βœ… Prefer click_text first
βœ… Prefer click_desc second
βœ… Use click_any if unsure
βœ… Use wait if loading
βœ… Use scroll_down if item missing
━━━━━━━━━━━━━━━━━━━━━━━━━━
SELF-REFLECTION (VERY IMPORTANT)
━━━━━━━━━━━━━━━━━━━━━━━━━━
Before answering, check:
1. Is the element clearly visible?
2. Am I repeating the same step?
3. Is the app still loading?
4. Is there a simpler action?
5. Am I guessing?
If unsure β†’ use wait or scroll_down.
ANTI-LOOP:
If previous action likely failed,
DO NOT repeat same click again.
━━━━━━━━━━━━━━━━━━━━━━━━━━
CONFIDENCE RULE
━━━━━━━━━━━━━━━━━━━━━━━━━━
Return confidence (0–1).
If confidence < 0.7:
β†’ prefer wait
β†’ or scroll_down
━━━━━━━━━━━━━━━━━━━━━━━━━━
VISION TAP (ONLY WHEN NEEDED)
━━━━━━━━━━━━━━━━━━━━━━━━━━
If button is visible but has NO text:
Return:
"vision_tap": {{
"x": 0.0–1.0,
"y": 0.0–1.0
}}
Otherwise vision_tap = null.
━━━━━━━━━━━━━━━━━━━━━━━━━━
FOOD FLOW
━━━━━━━━━━━━━━━━━━━━━━━━━━
Order:
1) open search
2) type restaurant
3) click first result
4) click food or find item
5) add
6) open cart β†’ done=true
━━━━━━━━━━━━━━━━━━━━━━━━━━
RIDE FLOW (STRICT ORDER)
━━━━━━━━━━━━━━━━━━━━━━━━━━
1) ride_open_drop_search
2) ride_set_drop
3) ride_select_suggestion
4) ride_select_vehicle
5) ride_confirm
NEVER send pickup/drop text.
━━━━━━━━━━━━━━━━━━━━━━━━━━
WAIT RULE
━━━━━━━━━━━━━━━━━━━━━━━━━━
Use wait if:
- spinner
- shimmer
- loading
- transition
- keyboard opening
Range: 1200–2500 ms
━━━━━━━━━━━━━━━━━━━━━━━━━━
OUTPUT FORMAT (STRICT JSON)
━━━━━━━━━━━━━━━━━━━━━━━━━━
{{
"done": false,
"intent": "order_food | book_ride | browse | unknown",
"confidence": 0.0,
"vision_tap": null,
"steps": [
{{
"type": "click_text",
"value": "Search"
}}
],
"next": "short hint"
}}
"""


def _normalise_plan(data):
    """Validate a decoded model reply; return the plan dict or ``None``.

    ``None`` means the reply is unusable (not an object, or ``steps`` is
    missing, not a list, or empty). Otherwise ``confidence`` is coerced to
    a float clamped to [0.0, 1.0] and ``vision_tap`` is guaranteed present.
    """
    if not isinstance(data, dict):
        return None

    steps = data.get("steps")
    if not isinstance(steps, list) or not steps:
        return None

    try:
        conf = float(data.get("confidence", 0.5))
    except (TypeError, ValueError, OverflowError):
        # Non-numeric value, or an integer too large for a float.
        conf = 0.5
    else:
        if conf != conf:
            # NaN compares unequal to itself; clamping alone would turn
            # it into 1.0, so give it the neutral default instead.
            conf = 0.5
        else:
            conf = max(0.0, min(1.0, conf))
    data["confidence"] = conf

    data.setdefault("vision_tap", None)
    return data


@app.post("/next_step")
async def next_step(
    user_request: str = Form(...),
    image: UploadFile = File(...),
):
    """Ask the model for the next UI action for a screenshot.

    Args:
        user_request: The user's goal, inserted into the prompt.
        image: Uploaded screenshot; decoded with PIL and converted to RGB.

    Returns:
        A JSONResponse holding the validated plan from the model, the
        ``safe_fallback()`` plan when the reply cannot be parsed or lacks a
        non-empty ``steps`` list, or ``{"error": ...}`` with status 500 when
        no client is configured or any other exception occurs.
    """
    try:
        if not client:
            # The key is read from the GEMINI_API environment variable, so
            # the message names that variable.
            return JSONResponse(
                {"error": "GEMINI_API missing"},
                status_code=500,
            )

        # -------- read image --------
        img_bytes = await image.read()
        screenshot = Image.open(io.BytesIO(img_bytes)).convert("RGB")

        prompt = _build_prompt(user_request)

        # ---------------- GEMINI CALL ----------------
        # Use the SDK's async surface: a synchronous call inside this
        # coroutine would block the event loop for the whole round-trip.
        resp = await client.aio.models.generate_content(
            model=MODEL,
            contents=[prompt, screenshot],
        )
        cleaned = clean_json(resp.text or "")

        # ---------------- VALIDATE ----------------
        try:
            data = json.loads(cleaned)
        except (ValueError, RecursionError):
            return JSONResponse(safe_fallback())

        plan = _normalise_plan(data)
        if plan is None:
            return JSONResponse(safe_fallback())
        return JSONResponse(plan)
    except Exception as e:
        # Top-level boundary: report any failure as a JSON 500.
        return JSONResponse({"error": str(e)}, status_code=500)