1ForrestW1
/

gta1-endpoint

Model card Files Files and versions

Forrest Wargo committed on Oct 8, 2025

Commit

582f8ae

·

1 Parent(s): cde6e20

unification

Files changed (1) hide show

handler.py +29 -11

handler.py CHANGED Viewed

@@ -191,6 +191,8 @@ class EndpointHandler:
         )
         inputs = {k: (v.to(self.model.device) if hasattr(v, "to") else v) for k, v in inputs.items()}
         with torch.no_grad():
             out_ids = self.model.generate(
                 **inputs,
@@ -202,6 +204,7 @@ class EndpointHandler:
         out_text = self.processor.batch_decode(
             trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
         )[0]
         # Extract coordinates from model output and rescale to original image
         def _extract_xy(s: str):
@@ -215,16 +218,31 @@ class EndpointHandler:
                 return None
         pred = _extract_xy(out_text)
-        payload: Dict[str, Any] = {"raw": out_text}
-        if width and height:
-            payload.update({"width": width, "height": height})
-        if pred is not None and width and height:
-            # The model returns pixel coordinates on the input image; we did not pre-resize
-            px = max(0.0, min(float(pred[0]), float(width)))
-            py = max(0.0, min(float(pred[1]), float(height)))
-            nx = px / float(width)
-            ny = py / float(height)
-            payload.update({"x": nx, "y": ny})
-        return payload

         )
         inputs = {k: (v.to(self.model.device) if hasattr(v, "to") else v) for k, v in inputs.items()}
+        import time
+        t0 = time.time()
         with torch.no_grad():
             out_ids = self.model.generate(
                 **inputs,
         out_text = self.processor.batch_decode(
             trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
         )[0]
+        t1 = time.time()
         # Extract coordinates from model output and rescale to original image
         def _extract_xy(s: str):
                 return None
         pred = _extract_xy(out_text)
+        if user_text is not None:
+            try:
+                print(f"[gta1-endpoint] Prompt: {user_text}")
+            except Exception:
+                pass
+        try:
+            print(f"[gta1-endpoint] Raw output: {out_text}")
+        except Exception:
+            pass
+        try:
+            print(f"[gta1-endpoint] Inference time: {t1 - t0:.3f}s")
+        except Exception:
+            pass
+        if pred is None or not (width and height):
+            return {"error": "Failed to parse coordinates or missing image dimensions."}
+        # The model returns pixel coordinates on the input image; we did not pre-resize
+        px = max(0.0, min(float(pred[0]), float(width)))
+        py = max(0.0, min(float(pred[1]), float(height)))
+        nx = px / float(width)
+        ny = py / float(height)
+        return {
+            "points": [{"x": nx, "y": ny}],
+            "raw": out_text,
+        }