Spaces:

e1250
/

tracking_system_backend

Runtime error

App Files Files Community

e1250 committed 5 days ago

Commit

c303abd

1 Parent(s): c1c755a

feat: adjusting monitoring and profiling, adding mlflow

Browse files

Files changed (7) hide show

README.md +3 -1
api/routers/camera_stream.py +27 -17
api/routers/dashboard_stream.py +3 -0
api/routers/metrics.py +25 -0
contracts/camera_metadata.py +4 -3
main.py +6 -1
utils/experiment.py +13 -0

README.md CHANGED Viewed

@@ -3,4 +3,6 @@ title: Tracking System Backend
 sdk: docker
 app_port: 7960
 pinned: false
----

 sdk: docker
 app_port: 7960
 pinned: false
+---
+This project is part of a larger project that provides a real-time indoor tracking system.

api/routers/camera_stream.py CHANGED Viewed

@@ -3,7 +3,7 @@ import itertools
 from fastapi import APIRouter, WebSocket, WebSocketDisconnect
 from ai.contracts.detector import DetectionResults
 from backend.api.routers.metrics import active_cameras, frame_processing_duration_seconds
-from backend.contracts.camera_metadata import CameraMetadata
 import traceback
 import cv2 as cv
@@ -26,6 +26,7 @@ async def websocket_detect(websocket: WebSocket, camera_id:str):
     safety_detector = state.safety_detection_model
     depth_model = state.depth_model
     # Accepting the connection from the client
     await websocket.accept()
@@ -34,6 +35,7 @@ async def websocket_detect(websocket: WebSocket, camera_id:str):
     logger.info(f"Client ID >>{camera_id}<< Connected...")
     loop = asyncio.get_running_loop()
     try:
         # What are the info you aim to collect from the camera?
@@ -62,17 +64,32 @@ async def websocket_detect(websocket: WebSocket, camera_id:str):
         # Keep receiving messages in a loop until disconnection.
         while True:
-            # Profiling
-            time_start = time.time()
             frame_bytes = await websocket.receive_bytes()
             image_array = await loop.run_in_executor(None, decode_frame)
             detection_task = loop.run_in_executor(None, run_detection, image_array)
             safety_task = loop.run_in_executor(None, run_safety, image_array)
             detections, safety_detection = await asyncio.gather(detection_task, safety_task)
             boxes_center = []
             boxes_center_ratio = []
             for box in detections.detections:
@@ -82,34 +99,27 @@ async def websocket_detect(websocket: WebSocket, camera_id:str):
                 ycenter = (ymax + ymin) / 2
                 boxes_center.append((int(xcenter), int(ycenter)))
                 boxes_center_ratio.append(xcenter / image_array.shape[1])
-            depth_points = await loop.run_in_executor(None, run_depth, image_array, boxes_center) if boxes_center else []
-            detection_metadata = [{"depth": depth, "xRatio": xRatio} for depth, xRatio in zip(depth_points, boxes_center_ratio)]
             metadata = CameraMetadata(camera_id=camera_id, is_danger = True if safety_detection else False, detection_metadata=detection_metadata)
-            print(metadata)
             state.camera_metadata[camera_id] = metadata.model_dump()
-            # Profiling
-            duration = time.time() - time_start
-            frame_processing_duration_seconds.labels(camera_id).observe(round(duration, 3))
-            logger.debug("Frame processed", camera_id=camera_id)
             # Note that JSONResponse doesn't work here, as it is for HTTP
             await websocket.send_json({"status": 200, "camera_id": camera_id})
     except WebSocketDisconnect:
         logger.warn(f"Client ID >>{camera_id}<< Disconnected Normally...")
-        traceback.print_exc()  # This one is actually really better, it shows more details about the issue happened.
-        # Also work on and create the logger.exception, as it directly controls printing more details about the issue happened.
         state.camera_metadata.pop(camera_id, None)
     except Exception as e:
         logger.error(f"Error in websocker, Client ID: >>{camera_id}<<: {e}")
-        traceback.print_exc()
         await websocket.close()
     finally:
         active_cameras.dec()
 # Uncomment this when needed, It is the same but using HTTP, which is Request Response only. could be used for testing.

 from fastapi import APIRouter, WebSocket, WebSocketDisconnect
 from ai.contracts.detector import DetectionResults
 from backend.api.routers.metrics import active_cameras, frame_processing_duration_seconds
+from backend.contracts.camera_metadata import CameraMetadata, DetectionMetadata
 import traceback
 import cv2 as cv
     safety_detector = state.safety_detection_model
     depth_model = state.depth_model
     # Accepting the connection from the client
     await websocket.accept()
     logger.info(f"Client ID >>{camera_id}<< Connected...")
     loop = asyncio.get_running_loop()
+    run = mlflow.start_run(run_name=f'camera_{camera_id}')
     try:
         # What are the info you aim to collect from the camera?
         # Keep receiving messages in a loop until disconnection.
         while True:
             frame_bytes = await websocket.receive_bytes()
+            # Profiling
+            t0 = time.time()
             image_array = await loop.run_in_executor(None, decode_frame)
+            decode_duration_seconds.labels(camera_id).observe(round(time.time() - t0, 3))
+            mlflow.log_metric("frame_processing_time", round(time.time() - t0, 3))
             detection_task = loop.run_in_executor(None, run_detection, image_array)
             safety_task = loop.run_in_executor(None, run_safety, image_array)
             detections, safety_detection = await asyncio.gather(detection_task, safety_task)
+            detection_duration_seconds.labels(camera_id).observe(round(time.time() - t0, 3))
+            mlflow.log_metric("detection_duration_seconds", round(time.time() - t0, 3))
+            depth_points = await loop.run_in_executor(None, run_depth, image_array, boxes_center) if boxes_center else []
+            depth_duration_seconds.labels(camera_id).observe(round(time.time() - t0, 3))
+            mlflow.log_metric("depth_duration_seconds", round(time.time() - t0, 3))
+            # Profiling
+            frame_processing_duration_seconds.labels(camera_id).observe(round(time.time() - t0, 3))
+            logger.debug("Frame processed", camera_id=camera_id)
+            mlflow.log_metric("frame_processing duration time", round(time.time() - t0, 3))
             boxes_center = []
             boxes_center_ratio = []
             for box in detections.detections:
                 ycenter = (ymax + ymin) / 2
                 boxes_center.append((int(xcenter), int(ycenter)))
                 boxes_center_ratio.append(xcenter / image_array.shape[1])
+            detection_metadata = [DetectionMetadata(depth=depth, xRatio=xRatio) for depth, xRatio in zip(depth_points, boxes_center_ratio)]
             metadata = CameraMetadata(camera_id=camera_id, is_danger = True if safety_detection else False, detection_metadata=detection_metadata)
             state.camera_metadata[camera_id] = metadata.model_dump()
             # Note that JSONResponse doesn't work here, as it is for HTTP
             await websocket.send_json({"status": 200, "camera_id": camera_id})
     except WebSocketDisconnect:
         logger.warn(f"Client ID >>{camera_id}<< Disconnected Normally...")
         state.camera_metadata.pop(camera_id, None)
     except Exception as e:
         logger.error(f"Error in websocker, Client ID: >>{camera_id}<<: {e}")
+        traceback.print_exc() # This one is actually really better, it shows more details about the issue happened.
+        # TODO: add a logger.exception method to the project logger so the traceback is logged together with the error.
         await websocket.close()
     finally:
         active_cameras.dec()
+        mlflow.end_run()
 # Uncomment this when needed, It is the same but using HTTP, which is Request Response only. could be used for testing.

api/routers/dashboard_stream.py CHANGED Viewed

@@ -1,6 +1,7 @@
 from fastapi import APIRouter, WebSocket, WebSocketDisconnect
 from backend.api.routers.metrics import active_dashboards
 import asyncio
 router = APIRouter()
@@ -35,5 +36,7 @@ async def dashboard_websocket(websocket: WebSocket):
     except Exception as e:
         logger.error(f"Dashboard Error: {e}")
     finally:
         active_dashboards.dec()

 from fastapi import APIRouter, WebSocket, WebSocketDisconnect
 from backend.api.routers.metrics import active_dashboards
 import asyncio
+import traceback
 router = APIRouter()
     except Exception as e:
         logger.error(f"Dashboard Error: {e}")
+        traceback.print_exc()
     finally:
         active_dashboards.dec()

api/routers/metrics.py CHANGED Viewed

@@ -1,7 +1,11 @@
 from prometheus_client import Counter, Histogram, Gauge, make_asgi_app
 metrics_asgi_app = make_asgi_app()
 active_cameras = Gauge(
     "active_camera_connections",
     "Number of Currently Connected camera websockets"
@@ -18,3 +22,24 @@ frame_processing_duration_seconds = Histogram(
     ["camera_id"]
 )

+# Prometheus collects real-time system-health metrics.
+# Grafana visualizes the metrics that Prometheus collects.
+# Together they cover monitoring.
 from prometheus_client import Counter, Histogram, Gauge, make_asgi_app
 metrics_asgi_app = make_asgi_app()
 active_cameras = Gauge(
     "active_camera_connections",
     "Number of Currently Connected camera websockets"
     ["camera_id"]
 )
+decode_duration_seconds = Histogram(
+    "decode_duration_seconds",
+    "Time to decode one image",
+    ["camera_id"]
+)
+detection_duration_seconds = Histogram(
+    "detection_duration_seconds",
+    "Time to detect",
+    ["camera_id"]
+)
+depth_duration_seconds = Histogram(
+    "depth_duration_seconds",
+    "Time to calculate the depth",
+    ["camera_id"]
+)
+cpu_usage = Gauge("cpu_usage_percent", "CPU usage %")
+mem_usage = Gauge("mem_usage_percent", "mem usage %")
+active_workers = Gauge("active_workers", "Active threads")

contracts/camera_metadata.py CHANGED Viewed

@@ -1,10 +1,11 @@
 from typing import List
 from pydantic import BaseModel
 class CameraMetadata(BaseModel):
     camera_id: str
     is_danger: bool = False
-    detection_metadata: List
-    # depth_points: List = []
-    # box_detections_ratio: List = []

 from typing import List
 from pydantic import BaseModel
+class DetectionMetadata(BaseModel):
+    depth: float
+    xRatio: float
 class CameraMetadata(BaseModel):
     camera_id: str
     is_danger: bool = False
+    detection_metadata: List

main.py CHANGED Viewed

@@ -11,6 +11,8 @@ from backend.api.routers import health
 from contextlib import asynccontextmanager
 from infra.logger_structlog import StructLogger
 import asyncio
 @asynccontextmanager
 async def lifespan(app: FastAPI):
@@ -47,7 +49,10 @@ async def lifespan(app: FastAPI):
     logger.warn("Shutting down the server....")
     # You can remove connections and release gpu here .
 app = FastAPI(
     title="Tracking System Backend",
     description="real-time frame processing API",

 from contextlib import asynccontextmanager
 from infra.logger_structlog import StructLogger
 import asyncio
+import mlflow
+from backend.utils.experiment import log_config()
 @asynccontextmanager
 async def lifespan(app: FastAPI):
     logger.warn("Shutting down the server....")
     # You can remove connections and release gpu here .
+mlflow.set_experiment("realtime-detection-system")
+log_config()
 app = FastAPI(
     title="Tracking System Backend",
     description="real-time frame processing API",

utils/experiment.py ADDED Viewed

	@@ -0,0 +1,13 @@

+import mlflow
+def start_run(camera_id: str):
+    return mlflow.start_run(run_name=f"camera_{camera_id}")
+def log_config():
+    mlflow.log_param("detector", "yolov26_n")
+    mlflow.log_param("safety_model", "custom YOLO26_n")
+    mlflow.log_param("depth_model", "depthAnything_n")
+def log_metrics(metrics:dict):
+    for k, v in metrics.items():
+        mlflow.log_metric(k, v)