kayanirani committed on
Commit
a633786
·
1 Parent(s): 7dd821e
Files changed (3) hide show
  1. Dockerfile +16 -0
  2. main_back.py +314 -0
  3. requirements.txt +10 -0
Dockerfile ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
# you will also find guides on how best to write your Dockerfile

FROM python:3.9

# Run as a non-root user with UID 1000 (required by Hugging Face Spaces).
RUN useradd -m -u 1000 user
USER user
# User-level pip installs land in ~/.local/bin; make them resolvable.
ENV PATH="/home/user/.local/bin:$PATH"

WORKDIR /app

# Copy and install requirements first so the dependency layer is cached
# independently of application-code changes.
COPY --chown=user ./requirements.txt requirements.txt
RUN pip install --no-cache-dir --upgrade -r requirements.txt

COPY --chown=user . /app
# Port 7860 is the standard port expected by Hugging Face Spaces.
CMD ["uvicorn", "main_back:app", "--host", "0.0.0.0", "--port", "7860"]
main_back.py ADDED
@@ -0,0 +1,314 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from fastapi import FastAPI, File, UploadFile, Form
from fastapi.middleware.cors import CORSMiddleware
from PIL import Image
import io
import logging
import asyncio
from typing import List
import google.generativeai as genai  # Gemini client used for all inference
from dotenv import load_dotenv
import os

# Load environment variables from a local .env file if present.
# The app reads KEY (the Gemini API key) from the environment.
load_dotenv()
13
+
14
# Set up logging: module-level logger shared by the whole service.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# FastAPI application instance picked up by uvicorn (see Dockerfile CMD).
app = FastAPI(title="Sequential Test Step Generator API")
19
+
20
+ # Add CORS middleware to allow frontend requests
21
+ app.add_middleware(
22
+ CORSMiddleware,
23
+ allow_origins=["*"],
24
+ allow_credentials=True,
25
+ allow_methods=["*"],
26
+ allow_headers=["*"],
27
+ )
28
+
29
+
30
class ModelInference:
    """Generates UI test steps with the Gemini generative model.

    All inference happens remotely through the ``google.generativeai``
    client; this class only prepares prompts, forwards a screenshot, and
    parses the model's text response into discrete test steps.
    """

    # A response line must begin with one of these (case-insensitive)
    # to be accepted as a test step.
    _ACTION_PREFIXES = (
        "CLICK:",
        "TYPE:",
        "SCROLL:",
        "WAIT:",
        "VERIFY:",
        "SELECT:",
        "DRAG:",
    )

    def __init__(self):
        """Configure the Gemini client from the ``KEY`` environment variable."""
        api_key = os.getenv("KEY")
        if not api_key:
            # Surface a misconfiguration at startup instead of failing on
            # the first request with a cryptic API error.
            logger.warning(
                "Environment variable 'KEY' is not set; Gemini requests will fail."
            )
        genai.configure(api_key=api_key)
        self.model = genai.GenerativeModel("gemini-2.5-flash")
        # NOTE(review): inference is remote — this attribute is only echoed
        # by the health endpoints and does not reflect local GPU usage.
        self.device = "cuda"
        logger.info("Model loaded successfully!")

    def process_single_image(self, image: Image.Image) -> Image.Image:
        """Return *image* converted to RGB if it is in any other mode."""
        if image.mode != "RGB":
            image = image.convert("RGB")
        return image

    @staticmethod
    def _parse_history(history: str) -> List[str]:
        """Split a serialized action history into individual steps.

        Accepts '→'-separated or comma-separated lists; any other
        non-empty string is treated as a single step.
        """
        if not history or not history.strip():
            return []
        for separator in ("→", ","):
            if separator in history:
                return [s.strip() for s in history.split(separator) if s.strip()]
        return [history.strip()]

    @staticmethod
    def _strip_numbering(line: str) -> str:
        """Remove a leading ``1.`` / ``1)`` style prefix from *line*.

        The previous split-based approach also split on the first '.' or
        ')' appearing anywhere in the line, which mangled steps whose
        description contained either character (e.g. "TYPE: Enter
        user.name"). Only a numeric prefix is stripped here.
        """
        rest = line.lstrip("0123456789")
        if len(rest) < len(line) and rest[:1] in (".", ")"):
            return rest[1:].strip()
        return line

    def predict_next_step_with_history(
        self, image: Image.Image, goal: str, completed_steps: List[str] = None
    ) -> str:
        """Predict the single next test step given the completed steps.

        Args:
            image: UI screenshot to analyze.
            goal: Natural-language task description.
            completed_steps: Steps already executed (None means none).

        Returns:
            The model's raw prediction text, stripped.

        Raises:
            Exception: re-raised from the Gemini call after logging.
        """
        try:
            if completed_steps is None:
                completed_steps = []

            image = self.process_single_image(image)

            if completed_steps:
                history_str = "\n".join(
                    [f"{i + 1}. {step}" for i, step in enumerate(completed_steps)]
                )
                prompt = f"""Analyze this UI and generate the next test step.

Task: {goal}

Completed:
{history_str}

Output format: "ACTION: description [x1, y1, x2, y2]"
Actions: CLICK, TYPE, SCROLL, WAIT, VERIFY, SELECT, DRAG
Coordinates: normalized 0.0-1.0

Next step only:"""
            else:
                prompt = f"""Analyze this UI and generate the first test step.

Task: {goal}

Output format: "ACTION: description [x1, y1, x2, y2]"
Actions: CLICK, TYPE, SCROLL, WAIT, VERIFY, SELECT, DRAG
Coordinates: normalized 0.0-1.0

First step only:"""

            logger.info(
                f"Generating prediction with {len(completed_steps)} history steps"
            )
            response = self.model.generate_content([prompt, image])
            prediction = response.text.strip()
            logger.info(f"Generated prediction: {prediction}")
            return prediction

        except Exception as e:
            logger.error(f"Error during prediction: {str(e)}")
            raise

    def generate_step_sequence(
        self,
        image: Image.Image,
        task_description: str,
        action_history: str = "",
        max_steps: int = 10,
    ) -> List[str]:
        """Generate a sequence of steps (delegates to the one-shot workflow)."""
        logger.info("Using recursive history-aware workflow generation")
        return self.generate_recursive_workflow(
            image=image,
            goal=task_description,
            initial_history=action_history,
            max_steps=max_steps,
        )

    def generate_recursive_workflow(
        self,
        image: Image.Image,
        goal: str,
        initial_history: str = "",
        max_steps: int = 10,
    ) -> List[str]:
        """Generate all workflow steps in a single model call.

        Despite the name (kept for backward compatibility), this is not
        recursive: it asks the model for every remaining step at once,
        which is much faster than one call per step.

        Returns the already-completed steps (parsed from
        *initial_history*) followed by the newly generated steps.
        """
        completed_steps = self._parse_history(initial_history)

        logger.info(f"Generating all workflow steps at once for goal: {goal}")
        logger.info(f"Initial history: {completed_steps}")

        # Generate all steps in one call
        image = self.process_single_image(image)

        if completed_steps:
            history_str = "\n".join(
                [f"{i + 1}. {step}" for i, step in enumerate(completed_steps)]
            )
            prompt = f"""Analyze this UI and generate ALL remaining test steps to complete the task.

Task: {goal}

Already completed steps:
{history_str}

Generate the REMAINING steps needed to complete the task.

CRITICAL RULES:
- Output ONLY the steps, NO explanations, NO reasoning, NO extra text
- One step per line
- Format: "ACTION: description [x1, y1, x2, y2]"
- Actions: CLICK, TYPE, SCROLL, WAIT, VERIFY, SELECT, DRAG
- Coordinates: normalized 0.0-1.0
- For TYPE actions, describe what to type WITHOUT providing example values (e.g., "TYPE: Enter username in email field" NOT "TYPE: test@example.com")
- For CLICK actions, describe what to click (e.g., "CLICK: Click on the username input field")
- Maximum {max_steps} steps

Steps:"""
        else:
            prompt = f"""Analyze this UI and generate ALL test steps to complete the task.

Task: {goal}

Generate a complete sequence of steps to accomplish this task.

CRITICAL RULES:
- Output ONLY the steps, NO explanations, NO reasoning, NO extra text
- One step per line
- Format: "ACTION: description [x1, y1, x2, y2]"
- Actions: CLICK, TYPE, SCROLL, WAIT, VERIFY, SELECT, DRAG
- Coordinates: normalized 0.0-1.0
- For TYPE actions, describe what to type WITHOUT providing example values (e.g., "TYPE: Enter username in email field" NOT "TYPE: test@example.com")
- For CLICK actions, describe what to click (e.g., "CLICK: Click on the username input field")
- Maximum {max_steps} steps

Steps:"""

        try:
            logger.info("Generating all steps in single API call...")
            response = self.model.generate_content([prompt, image])
            all_steps_text = response.text.strip()

            # Keep only well-formed action lines; the model sometimes adds
            # numbering or commentary despite the prompt rules.
            new_steps = []
            for raw_line in all_steps_text.split("\n"):
                line = raw_line.strip()
                if not line:
                    continue
                # Remove a leading "1. " / "1) " prefix if present.
                if line[0].isdigit():
                    line = self._strip_numbering(line)
                # Only keep lines that start with a known action keyword.
                if line.upper().startswith(self._ACTION_PREFIXES):
                    new_steps.append(line)

            logger.info(f"Generated {len(new_steps)} steps in one call")
            for i, step in enumerate(new_steps):
                logger.info(f"Step {len(completed_steps) + i + 1}: {step}")

            return completed_steps + new_steps

        except Exception as e:
            logger.error(f"Error generating all steps: {str(e)}")
            raise
220
+
221
+
222
# Initialize model at import time (module-level side effect when uvicorn
# imports the app) so the first request does not pay the startup cost.
logger.info("Initializing model inference...")
model_inference = ModelInference()
logger.info("Model inference ready!")
226
+
227
+
228
+ @app.get("/")
229
+ async def root():
230
+ """Health check endpoint."""
231
+ return {
232
+ "status": "running",
233
+ "message": "Sequential Test Step Generator API",
234
+ "device": str(model_inference.device),
235
+ "model_loaded": True,
236
+ }
237
+
238
+
239
+ @app.post("/predict")
240
+ async def predict(
241
+ image: UploadFile = File(..., description="UI screenshot image"),
242
+ action_history: str = Form(default="", description="Previous action history"),
243
+ task_description: str = Form(..., description="Task description"),
244
+ generate_sequence: bool = Form(
245
+ default=True, description="Generate full sequence or single action"
246
+ ),
247
+ ):
248
+ """Generate test steps based on UI image, action history, and task description."""
249
+ try:
250
+ await asyncio.sleep(0.5)
251
+
252
+ image_data = await image.read()
253
+ pil_image = Image.open(io.BytesIO(image_data))
254
+
255
+ logger.info(f"Received image: {pil_image.size}, mode: {pil_image.mode}")
256
+ logger.info(f"Task description: {task_description}")
257
+ logger.info(
258
+ f"Action history: {action_history[:100]}..."
259
+ if action_history
260
+ else "No history"
261
+ )
262
+
263
+ if generate_sequence:
264
+ predicted_steps = model_inference.generate_step_sequence(
265
+ image=pil_image,
266
+ task_description=task_description,
267
+ action_history=action_history,
268
+ max_steps=10,
269
+ )
270
+ else:
271
+ completed_steps = []
272
+ if action_history and action_history.strip():
273
+ if "→" in action_history:
274
+ completed_steps = [
275
+ s.strip() for s in action_history.split("→") if s.strip()
276
+ ]
277
+ elif "," in action_history:
278
+ completed_steps = [
279
+ s.strip() for s in action_history.split(",") if s.strip()
280
+ ]
281
+ else:
282
+ completed_steps = [action_history.strip()]
283
+
284
+ predicted_action = model_inference.predict_next_step_with_history(
285
+ image=pil_image, goal=task_description, completed_steps=completed_steps
286
+ )
287
+ predicted_steps = [predicted_action]
288
+
289
+ return {
290
+ "success": True,
291
+ "steps": predicted_steps,
292
+ "image_size": pil_image.size,
293
+ "num_steps": len(predicted_steps),
294
+ }
295
+
296
+ except Exception as e:
297
+ logger.error(f"Error processing request: {str(e)}", exc_info=True)
298
+ return {"success": False, "error": "ERROR", "steps": []}
299
+
300
+
301
+ @app.get("/health")
302
+ async def health():
303
+ """Detailed health check."""
304
+ return {
305
+ "status": "healthy",
306
+ "device": str(model_inference.device),
307
+ "model_loaded": model_inference.model is not None,
308
+ }
309
+
310
+
311
+ if __name__ == "__main__":
312
+ import uvicorn
313
+
314
+ uvicorn.run(app, host="0.0.0.0", port=8000)
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
# Web framework, ASGI server, and multipart form parsing
fastapi
uvicorn
python-multipart
# NOTE(review): torch/torchvision/transformers/numpy are not imported by
# main_back.py — presumably leftovers from a local-model version; confirm
# before removing.
torch
torchvision
transformers
# Image handling and configuration
Pillow
numpy
python-dotenv
# Gemini API client
google-generativeai