Spaces:
Sleeping
Sleeping
| from fastapi import FastAPI, HTTPException | |
| from pydantic import BaseModel | |
| from typing import List, Optional | |
| from openai import OpenAI | |
| import os | |
| import json | |
| import re | |
| # ====================================================== | |
| # NVIDIA OPENAI-COMPATIBLE CLIENT | |
| # ====================================================== | |
# Model identifier sent with every chat-completion request.
MODEL = "deepseek-ai/deepseek-v3.2"

# FastAPI application object for this service.
app = FastAPI()

# OpenAI SDK client pointed at NVIDIA's OpenAI-compatible endpoint. The key is
# read from the NVIDIA_API_KEY environment variable (None when it is unset).
client = OpenAI(
    api_key=os.environ.get("NVIDIA_API_KEY"),
    base_url="https://integrate.api.nvidia.com/v1",
)
| # ====================================================== | |
| # HELPER: CALL LLM AND RETURN PURE JSON | |
| # ====================================================== | |
def _extract_json_object(text: str) -> dict:
    """Return the JSON object embedded in *text*.

    Args:
        text: Raw model output, possibly with prose around the JSON.

    Returns:
        The decoded JSON object.

    Raises:
        ValueError: If *text* contains no parsable JSON object.
    """
    span = re.search(r"\{[\s\S]*\}", text)
    if span is None:
        raise ValueError("No JSON found in LLM output")

    # Fast path: everything from the first "{" to the last "}" is one object.
    try:
        return json.loads(span.group())
    except json.JSONDecodeError:
        pass

    # Fallback: the surrounding text contains braces of its own, so the widest
    # span is not valid JSON. Decode each top-level object in turn and keep
    # the last one, on the assumption that any reasoning or draft output
    # precedes the final answer.
    decoder = json.JSONDecoder()
    found = None
    pos = text.find("{")
    while pos != -1:
        try:
            found, end = decoder.raw_decode(text, pos)
        except json.JSONDecodeError:
            # Not an object start; resume scanning just past this brace.
            end = pos + 1
        pos = text.find("{", end)

    if found is None:
        raise ValueError("No valid JSON object found in LLM output")
    return found


def call_llm_json(prompt: str) -> dict:
    """Send *prompt* to the chat model and return the JSON object in its reply.

    The prompt is sent as a single user message with temperature 0 and
    streaming disabled. Any text around the JSON object in the reply is
    discarded.

    Args:
        prompt: Full prompt text.

    Returns:
        The JSON object decoded from the model's reply.

    Raises:
        HTTPException: Status 500, with the underlying error message as
            ``detail``, if the request fails or no JSON object can be
            extracted from the reply.
    """
    try:
        completion = client.chat.completions.create(
            model=MODEL,
            messages=[{"role": "user", "content": prompt}],
            temperature=0,
            top_p=0.95,
            max_tokens=4096,
            extra_body={"chat_template_kwargs": {"thinking": True}},
            stream=False,
        )
        if not completion.choices:
            raise ValueError("LLM returned no choices")

        content = completion.choices[0].message.content
        if not content:
            # Fail with a clear message instead of a TypeError from ``re``.
            raise ValueError("LLM returned empty content")

        return _extract_json_object(content)
    except Exception as e:
        # Boundary handler: every failure is reported to the caller as a 500.
        # Chaining keeps the original traceback available in server logs.
        raise HTTPException(status_code=500, detail=str(e)) from e
| # ====================================================== | |
# 1️⃣ GENERATE STEPS (HIGH-LEVEL PLAN)
| # ====================================================== | |
class GenerateStepsRequest(BaseModel):
    """Input payload for ``generate_steps``."""

    # Free-form text of what the user asked for; it is interpolated verbatim
    # into the planning prompt.
    user_request: str
class GenerateStepsResponse(BaseModel):
    """Shape of the plan the ``generate_steps`` prompt asks the model to emit."""

    # The prompt restricts this to "order_food", "book_ride" or "unknown".
    intent: str

    # Explicit ``None`` defaults: under pydantic v2 an ``Optional[...]`` field
    # without a default is still required, which would reject a plan that
    # omits these keys. This also matches how ``UiStep`` declares its
    # optional fields.
    restaurant: Optional[str] = None
    food_item: Optional[str] = None

    # Ordered high-level steps (the prompt asks for at most 10).
    steps: List[str]
def generate_steps(req: GenerateStepsRequest):
    """Ask the LLM for the intent, entities and high-level plan of a request.

    Builds a planning prompt around ``req.user_request`` and returns whatever
    JSON object ``call_llm_json`` extracts from the reply. The result is not
    validated against ``GenerateStepsResponse`` here.

    NOTE(review): no route decorator is visible on this function in the
    reviewed text; confirm it is registered on ``app``.

    Args:
        req: Request carrying the user's free-form text.

    Returns:
        The JSON object produced by the model; the prompt asks for the keys
        ``intent``, ``restaurant``, ``food_item`` and ``steps``.

    Raises:
        HTTPException: Propagated from ``call_llm_json`` on any failure.
    """
    # The prompt text is kept flush-left so no indentation leaks into the
    # string sent to the model.
    prompt = f"""
You are an intent and planning engine.
USER REQUEST:
{req.user_request}
TASKS:
1. Detect intent
2. Extract restaurant name (if food)
3. Extract food item (if food)
4. Generate HIGH-LEVEL steps ONLY
INTENT RULES:
- food, restaurant, dish, eat, Swiggy, Zomato → order_food
- ride, cab, bike, auto, Uber, Ola, Rapido → book_ride
- otherwise → unknown
STEP RULES:
- App is ALREADY open
- DO NOT include "open app"
- Steps must be GENERIC (no UI clicks)
- Max 10 steps
- Order must be logical
OUTPUT JSON FORMAT (ONLY JSON):
{{
"intent": "order_food | book_ride | unknown",
"restaurant": "string | null",
"food_item": "string | null",
"steps": [
"step 1",
"step 2"
]
}}
"""
    return call_llm_json(prompt)
| # ====================================================== | |
# 2️⃣ NEXT UI STEP (SCREEN → ACTION)
| # ====================================================== | |
class NextUiStepRequest(BaseModel):
    """Input payload for ``next_ui_step``."""

    # Original free-form user request.
    user_request: str

    # Intent label; the prompt tells the model to use it as given.
    intent: str

    # Explicit ``None`` defaults: under pydantic v2 an ``Optional[...]`` field
    # without a default is still required, so callers would have to send
    # these keys even when they do not apply. This also matches how
    # ``UiStep`` declares its optional fields.
    restaurant: Optional[str] = None
    food_item: Optional[str] = None

    # High-level step currently being worked on.
    current_step: str

    # Text of the current app screen, inserted under "SCREEN TEXT" in the
    # prompt.
    screen: str
class UiStep(BaseModel):
    """One UI action within a ``NextUiStepResponse``."""

    # Action identifier, e.g. "food_open_search" as in the prompt's example.
    type: str

    # Optional payload for the action; presumably the text to type (confirm
    # against the client that consumes these steps).
    value: Optional[str] = None

    # Optional duration; presumably the wait time in milliseconds, given the
    # prompt's WAIT rule (confirm against the client).
    ms: Optional[int] = None
class NextUiStepResponse(BaseModel):
    """Shape of the answer the ``next_ui_step`` prompt asks the model to emit."""

    # Completion flag; the prompt's food rules set it to true once the cart
    # is opened.
    done: bool

    # Intent label; the prompt's response format echoes the request's intent.
    intent: str

    # UI actions to perform on the current screen.
    steps: List[UiStep]

    # Short hint, as requested by the prompt's response format.
    next: str
def next_ui_step(req: NextUiStepRequest):
    """Ask the LLM which UI action to take next on the current screen.

    Builds a prompt from the user's request, the current high-level step, the
    screen text and the known context (intent, restaurant, food item), then
    returns whatever JSON object ``call_llm_json`` extracts from the reply.
    The result is not validated against ``NextUiStepResponse`` here.

    NOTE(review): no route decorator is visible on this function in the
    reviewed text; confirm it is registered on ``app``.

    Args:
        req: Request describing the current automation state.

    Returns:
        The JSON object produced by the model; the prompt asks for the keys
        ``done``, ``intent``, ``steps`` and ``next``.

    Raises:
        HTTPException: Propagated from ``call_llm_json`` on any failure.
    """
    # The prompt text is kept flush-left so no indentation leaks into the
    # string sent to the model.
    prompt = f"""
You are an Android UI automation agent.
INPUTS:
1) user_request → what the user wants
2) screenshot → current app screen text
3) current_step → current high-level step
USER REQUEST:
{req.user_request}
CURRENT STEP:
{req.current_step}
SCREEN TEXT:
{req.screen}
KNOWN CONTEXT:
- intent: {req.intent}
- restaurant: {req.restaurant}
- food_item: {req.food_item}
GOAL:
Return the NEXT UI step as JSON.
================ INTENT =================
Use provided intent ONLY.
================ GENERAL RULES =================
- Return ONLY valid JSON
- NO explanation
- Think ONE screen only
================ FOOD RULES =================
1) If restaurant exists → search & open
2) After typing → ALWAYS click first result
3) If item not visible → scroll_down
4) Add item → click ADD / Add / + Add
5) If cart visible → open cart → done=true
FOOD STEPS:
food_open_search
food_type
food_click_first_result
food_open_restaurant
food_find_item
food_add_first
food_open_cart
================ WAIT =================
- If loading → wait (1500–3000 ms)
================ RESPONSE FORMAT =================
{{
"done": false,
"intent": "{req.intent}",
"steps": [
{{ "type": "food_open_search" }}
],
"next": "short hint"
}}
"""
    return call_llm_json(prompt)