Spaces:

thienphuc12339
/

SignLanguage-pro

Runtime error

App Files Files Community

thienphuc12339 committed on Dec 20, 2024

Commit

a7eca0b

1 Parent(s): c1a2a6d

Add all source code

Browse files

Files changed (37) hide show

.dockerignore +12 -0
Dockerfile +27 -0
__init__.py +3 -0
app.py +112 -0
configs/__init__.py +1 -0
configs/arguments.py +176 -0
configs/dsta_slr.yaml +13 -0
configs/sl_gcn.yaml +13 -0
configs/spoter.yaml +11 -0
data/__init__.py +1 -0
data/utils.py +159 -0
inference.py +215 -0
models/dsta_slr_joint_motion_v3_0.onnx +3 -0
models/sl_gcn_joint_v3_0.onnx +3 -0
models/spoter_v3.0.onnx +3 -0
request.py +15 -0
requirements.txt +27 -0
tools/__init__.py +3 -0
tools/__pycache__/__init__.cpython-312.pyc +0 -0
tools/__pycache__/__init__.cpython-39.pyc +0 -0
tools/__pycache__/features.cpython-39.pyc +0 -0
tools/__pycache__/models.cpython-312.pyc +0 -0
tools/__pycache__/models.cpython-39.pyc +0 -0
tools/features.py +31 -0
tools/models.py +443 -0
utils/__init__.py +2 -0
utils/__pycache__/__init__.cpython-312.pyc +0 -0
utils/__pycache__/constants.cpython-312.pyc +0 -0
utils/__pycache__/loggers.cpython-312.pyc +0 -0
utils/constants.py +160 -0
utils/loggers.py +26 -0
visualization/__init__.py +1 -0
visualization/__pycache__/__init__.cpython-312.pyc +0 -0
visualization/__pycache__/__init__.cpython-39.pyc +0 -0
visualization/__pycache__/utils.cpython-312.pyc +0 -0
visualization/__pycache__/utils.cpython-39.pyc +0 -0
visualization/utils.py +57 -0

.dockerignore ADDED Viewed

	@@ -0,0 +1,12 @@

+# Ignore build artifacts
+*.log
+*.tmp
+# Ignore compiled Python files
+__pycache__/
+*.pyc
+*.pyo
+*.pyd
+# Ignore files/directories
+# engines/data/

Dockerfile ADDED Viewed

	@@ -0,0 +1,27 @@

+FROM python:3.10-slim
+# Disable output buffering so logs reach the terminal immediately
+ENV PYTHONUNBUFFERED=1
+# Install the system libraries required by the Python dependencies.
+# `libgl1` replaces `libgl1-mesa-glx`, a transitional package that newer
+# Debian releases no longer ship (apt fails with "no installation candidate").
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    libgl1 \
+    libglib2.0-0 \
+    && rm -rf /var/lib/apt/lists/*
+WORKDIR /app
+# Copy requirements.txt into the container and install the dependencies
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+# Copy the whole source tree into the container
+COPY . .
+# Set the PORT environment variable (Hugging Face routes traffic to this port)
+ENV PORT=7860
+EXPOSE 7860
+# Run the FastAPI application with uvicorn
+# This assumes the main application file is app.py and the FastAPI instance is named `app`
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]

__init__.py ADDED Viewed

	@@ -0,0 +1,3 @@

+# WRITER: PhucNTT2 # EMAIL: thienphuc12339@gmail.com # DATE: 11/2023
+# FROM: akaOCR Team
+# ALL USE CASES MUST BE APPROVED BY AKAOCR TEAM

app.py ADDED Viewed

	@@ -0,0 +1,112 @@

+from fastapi import FastAPI, HTTPException, UploadFile, File, Form
+from fastapi.responses import JSONResponse
+from pydantic import BaseModel
+from pathlib import Path
+import shutil
+import logging
+import uvicorn
+import asyncio
+from typing import Optional
+from configs import ModelConfig, InferenceConfig
+from tools.models import load_pipeline
+from inference import inference as run_inference
+# Initialize FastAPI app
+app = FastAPI(title="Sign Language Recognition API")
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+# Define a Pydantic model for the response
+class InferenceResponse(BaseModel):
+    """Response body returned by the /inference endpoint."""
+    # Outcome marker; the endpoint sets it to "success"
+    status: str
+    # Rows read from results.csv as dicts, or an empty list when the file is absent
+    predictions: Optional[list] = None
+    # Optional free-form message
+    message: Optional[str] = None
+@app.post("/inference", response_model=InferenceResponse)
+async def inference_endpoint(
+    file: UploadFile = File(...),
+    model_name: str = Form(...),
+    output_dir: Optional[str] = Form("output")
+):
+    """
+    Endpoint để xử lý yêu cầu nhận diện ngôn ngữ ký hiệu từ video.
+    Args:
+        file (UploadFile): Video file được tải lên.
+        model_name (str): Tên mô hình sẽ sử dụng (ví dụ: 'spoter', 'sl_gcn', 'dsta_slr').
+        output_dir (str, optional): Thư mục để lưu kết quả. Mặc định là 'output'.
+    Returns:
+        InferenceResponse: Kết quả nhận diện.
+    """
+    # Kiểm tra file có hợp lệ không
+    if not file.filename.endswith((".mp4", ".avi", ".mov", ".mkv")):
+        raise HTTPException(status_code=400, detail="Unsupported file type.")
+    # Tạo thư mục output nếu không tồn tại
+    output_path = Path(output_dir)
+    output_path.mkdir(parents=True, exist_ok=True)
+    # Lưu video tạm thời
+    video_path = output_path / file.filename
+    with open(video_path, "wb") as buffer:
+        shutil.copyfileobj(file.file, buffer)
+    logger.info(f"Video saved to {video_path}")
+    # Tải cấu hình mô hình dựa trên model_name
+    try:
+        if model_name == "spoter":
+            model_config = ModelConfig(arch="spoter", pretrained="vsltranslation/spoter_v3.0")
+        elif model_name == "sl_gcn":
+            model_config = ModelConfig(arch="sl_gcn", pretrained="vsltranslation/sl_gcn_joint_v3_0")
+        elif model_name == "dsta_slr":
+            model_config = ModelConfig(arch="dsta_slr", pretrained="vsltranslation/dsta_slr_joint_motion_v3_0")
+        else:
+            raise ValueError("Unsupported model name.")
+        inference_config = InferenceConfig(
+            source=str(video_path),
+            output_dir=str(output_path),
+            use_onnx=False,
+            device="cpu",  # Bạn có thể thay đổi thành "cuda" nếu sử dụng GPU
+            cache_dir="models/huggingface",
+            visualize=False,
+            show_skeleton=False,
+            visibility=0.5,
+            angle_threshold=140,
+            min_num_up_frames=10,
+            min_num_down_frames=10,
+            delay=400,
+            top_k=3,
+            bone_stream=False,
+            motion_stream=False
+        )
+        # Tải pipeline
+        pipeline = load_pipeline(model_config, inference_config)
+        logger.info("Pipeline loaded successfully.")
+        # Chạy inference
+        run_inference(model_config, inference_config, pipeline)
+        logger.info("Inference completed successfully.")
+        # Đọc kết quả từ file CSV
+        results_csv = output_path / "results.csv"
+        if results_csv.exists():
+            import pandas as pd
+            df = pd.read_csv(results_csv)
+            predictions = df.to_dict(orient="records")
+        else:
+            predictions = []
+        return InferenceResponse(status="success", predictions=predictions)
+    except Exception as e:
+        logger.exception("Error during inference.")
+        raise HTTPException(status_code=500, detail=str(e))

configs/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ from .arguments import *

configs/arguments.py ADDED Viewed

	@@ -0,0 +1,176 @@

+#configs/arguments.py
+from pathlib import Path
+from typing import Any
+from dataclasses import dataclass, field
+from utils import MODELS, VIDEO_EXTENSIONS
+@dataclass
+class TransformConfig:
+    # RGB specific
+    horizontal_flip_prob: float = 0.5
+    aug_type: str = "augmix"
+    aug_paras: dict = field(
+        default_factory=lambda: {
+            "magnitude": 3,
+            "alpha": 1.0,
+            "width": 5,
+            "depth": -1,
+        }
+    )
+    sample_rate: int = 4
+    # Pose specific
+    normalization: bool = True
+    # SL-GCN, DSTA-SLR specific
+    random_choose: bool = False
+    random_shift: bool = False
+    random_move: bool = False
+    random_mirror: bool = False
+    random_mirror_p: float = 0.5
+    bone_stream: bool = False
+    motion_stream: bool = False
+    # SPOTER specific
+    augmentation: bool = True
+    aug_prob: float = 0.5
+    noise: bool = True
+    def __post_init__(self):
+        assert self.aug_type in ["augmix", "mixup"], \
+            "Only AugMix and MixUp are supported for now"
+@dataclass
+class DataConfig:
+    dataset: str = "vsl"
+    modality: str = "rgb"
+    subset: str = None
+    data_dir: str = "data/processed/vsl"
+    transform: Any = None
+    fps: int = 30
+    debug: bool = False
+    # transform: TransformConfig = TransformConfig()
+    transform: TransformConfig = field(default_factory=TransformConfig)
+    def __post_init__(self):
+        assert self.dataset in ["vsl_98", "vsl_400"], \
+            "Only VSL dataset is supported for now"
+        assert self.modality in ["rgb", "pose"], \
+            "Only RGB and Pose modalities are supported for now"
+@dataclass
+class ModelConfig:
+    arch: str = "sl_gcn"
+    pretrained: str = "vsltranslation/sl_gcn_joint_v3_0"
+    num_frozen_layers: int = 0
+    ignored_weights: list = field(default_factory=lambda: [])
+    num_frames: int = 16
+    # SL-GCN specific
+    num_points: int = 27
+    groups: int = 8
+    block_size: int = 41
+    in_channels: int = 3
+    labeling_mode: str = "spatial"
+    is_vector: bool = False
+    # DSTA-SLR specific
+    graph: str = "wlasl"
+    inner_dim: int = 64
+    drop_layers: int = 2
+    depth: int = 4
+    s_num_heads: int = 1
+    window_size: int = 120
+    # SPOTER specific
+    hidden_dim: int = 108
+    def __post_init__(self):
+        assert self.arch in MODELS, f"Model {self.arch} is not supported"
+@dataclass
+class TrainingConfig:
+    output_dir: str = "experiments"
+    remove_unused_columns: bool = False
+    do_train: bool = True
+    use_cpu: bool = False
+    eval_strategy: str = "epoch"
+    logging_strategy: str = "epoch"
+    save_strategy: str = "epoch"
+    logging_steps: int = 1
+    save_steps: int = 1
+    eval_steps: int = 1
+    learning_rate: float = 5e-5
+    weight_decay: float = 0
+    adam_beta1: float = 0.9
+    adam_beta2: float = 0.999
+    adam_epsilon: float = 1e-8
+    warmup_ratio: float = 0.1
+    num_train_epochs: int = 10
+    per_device_train_batch_size: int = 8
+    per_device_eval_batch_size: int = 8
+    dataloader_num_workers: int = 0
+    load_best_model_at_end: bool = True
+    metric_for_best_model: str = "accuracy"
+    resume_from_checkpoint: str = None
+    run_name: str = "swin3d"
+    report_to: str = None
+    push_to_hub: bool = False
+    hub_model_id: str = None
+    hub_strategy: str = "checkpoint"
+    hub_private_repo: bool = True
+    def __post_init__(self):
+        self.output_dir = Path(self.output_dir)
+        if str(self.output_dir) == "experiments":
+            self.output_dir = self.output_dir / self.run_name
+        self.output_dir.mkdir(parents=True, exist_ok=True)
+        if self.hub_model_id is not None:
+            self.push_to_hub = True
+            if len(self.hub_model_id.split("/")) == 1:
+                self.hub_model_id = f"{self.hub_model_id}/{self.run_name}"
+@dataclass
+class InferenceConfig:
+    source: str = "webcam"
+    output_dir: str = "demo"
+    use_onnx: bool = False
+    device: str = "cpu"
+    cache_dir: str = "models/huggingface"
+    visualize: bool = False
+    show_skeleton: bool = False
+    visibility: float = 0.5
+    angle_threshold: int = 140
+    min_num_up_frames: int = 10
+    min_num_down_frames: int = 10
+    delay: int = 400
+    top_k: int = 3
+    # SL-GCN, DSTA-SLR specific
+    bone_stream: bool = False
+    motion_stream: bool = False
+    def __post_init__(self):
+        self.source = Path(self.source)
+        assert any((
+            str(self.source) == "webcam",
+            (self.source.exists() and str(self.source).endswith(VIDEO_EXTENSIONS))
+        )), \
+            f"Only Webcam and Video sources are supported for now (got {self.source})"
+        self.output_dir = Path(self.output_dir)
+        self.output_dir.mkdir(parents=True, exist_ok=True)

configs/dsta_slr.yaml ADDED Viewed

	@@ -0,0 +1,13 @@

+#configs/dsta_slr.yaml
+model:
+  arch: dsta_slr
+  pretrained: vsltranslation/dsta_slr_joint_motion_v3_0
+inference:
+  source: webcam
+  output_dir: demo/run_1
+  use_onnx: True
+  show_skeleton: True
+  visualize: True
+  bone_stream: False
+  motion_stream: True

configs/sl_gcn.yaml ADDED Viewed

	@@ -0,0 +1,13 @@

+#configs/sl_gcn.yaml
+model:
+  arch: sl_gcn
+  pretrained: vsltranslation/sl_gcn_joint_v3_0
+inference:
+  source: webcam
+  output_dir: demo/run_1
+  use_onnx: True
+  show_skeleton: True
+  visualize: True
+  bone_stream: True
+  motion_stream: False

configs/spoter.yaml ADDED Viewed

	@@ -0,0 +1,11 @@

+#configs/spoter.yaml
+model:
+  arch: spoter
+  pretrained: vsltranslation/spoter_v3.0
+inference:
+  source: webcam
+  output_dir: demo/run_1
+  use_onnx: True
+  show_skeleton: True
+  visualize: True

data/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ from .utils import *

data/utils.py ADDED Viewed

	@@ -0,0 +1,159 @@

+#data/utils.py
+import numpy as np
+from mediapipe.python.solutions import pose
+from visualization import draw_text_on_image
+class Arm:
+    def __init__(
+        self,
+        side: str,
+        visibility: float = 0.5,
+    ) -> None:
+        if side == "left":
+            self.shoulde_idx = pose.PoseLandmark.LEFT_SHOULDER.value
+            self.elbow_idx = pose.PoseLandmark.LEFT_ELBOW.value
+            self.wrist_idx = pose.PoseLandmark.LEFT_WRIST.value
+        elif side == "right":
+            self.shoulde_idx = pose.PoseLandmark.RIGHT_SHOULDER.value
+            self.elbow_idx = pose.PoseLandmark.RIGHT_ELBOW.value
+            self.wrist_idx = pose.PoseLandmark.RIGHT_WRIST.value
+        else:
+            raise ValueError("Side must be either 'left' or 'right'")
+        self.visibility = visibility
+        self.is_up = False
+        self.num_up_frames = 0
+        self.num_down_frames = 0
+        self.start_time = 0
+        self.end_time = 0
+        self.shoulder = None
+        self.elbow = None
+        self.wrist = None
+        self.angle = 0
+    def reset_state(self) -> None:
+        self.is_up = False
+        self.num_up_frames = 0
+        self.num_down_frames = 0
+        self.start_time = 0
+        self.end_time = 0
+        self.shoulder = None
+        self.elbow = None
+        self.wrist = None
+        self.angle = 0
+    def set_pose(self, landmarks) -> bool:
+        if landmarks[self.shoulde_idx].visibility < self.visibility:
+            return False
+        self.shoulder = (
+            landmarks[self.shoulde_idx].x,
+            landmarks[self.shoulde_idx].y,
+        )
+        if landmarks[self.elbow_idx].visibility < self.visibility:
+            return False
+        self.elbow = (
+            landmarks[self.elbow_idx].x,
+            landmarks[self.elbow_idx].y,
+        )
+        if landmarks[self.wrist_idx].visibility < self.visibility:
+            return False
+        self.wrist = (
+            landmarks[self.wrist_idx].x,
+            landmarks[self.wrist_idx].y,
+        )
+        self.angle = calculate_angle(self.shoulder, self.elbow, self.wrist)
+        return True
+    def visualize(
+        self,
+        frame: np.ndarray,
+        position: tuple = (20, 50),
+        prefix: str = "Angle",
+        color: tuple = (0, 0, 255),
+    ) -> np.ndarray:
+        text = prefix + ": " + str(round(self.angle, 2))
+        return draw_text_on_image(
+            image=frame,
+            text=text,
+            position=position,
+            color=color,
+            font_size=20,
+        )
+def get_sample_timestamp(left_arm: Arm, right_arm: Arm) -> tuple:
+    start_time, end_time = 0, 0
+    left_arm_available = left_arm.start_time > 0 and left_arm.end_time > 0
+    right_arm_available = right_arm.start_time > 0 and right_arm.end_time > 0
+    if left_arm_available and right_arm.start_time == 0:
+        start_time = left_arm.start_time
+        end_time = left_arm.end_time
+    if right_arm_available and left_arm.start_time == 0:
+        start_time = right_arm.start_time
+        end_time = right_arm.end_time
+    if all((
+        left_arm_available, not left_arm.is_up,
+        right_arm_available, not right_arm.is_up,
+    )):
+        start_time = min(left_arm.start_time, right_arm.start_time)
+        end_time = max(left_arm.end_time, right_arm.end_time)
+    # Convert seconds to milliseconds
+    start_time /= 1000
+    end_time /= 1000
+    return start_time, end_time
+def calculate_angle(a: tuple, b: tuple, c: tuple) -> float:
+    a = np.array(a)     # First
+    b = np.array(b)     # Mid
+    c = np.array(c)     # End
+    radians = np.arctan2(c[1] - b[1], c[0] - b[0]) - np.arctan2(a[1] - b[1], a[0] - b[0])
+    angle = np.abs(radians * 180.0 / np.pi)
+    return 360 - angle if angle > 180 else angle
+def ok_to_get_frame(
+    arm: Arm,
+    angle_threshold: int,
+    min_num_up_frames: int,
+    min_num_down_frames: int,
+    current_time: int,
+    delay: int,
+) -> bool:
+    if 0 < arm.angle < angle_threshold:
+        if arm.is_up:
+            arm.num_down_frames = 0
+            arm.end_time = 0
+        else:
+            if arm.num_up_frames == min_num_up_frames:
+                arm.is_up = True
+                arm.num_up_frames = 0
+            else:
+                if arm.num_up_frames == 0:
+                    arm.start_time = current_time - delay
+                arm.num_up_frames += 1
+                return False
+    else:
+        if arm.is_up:
+            if arm.num_down_frames == min_num_down_frames:
+                arm.is_up = False
+                arm.num_down_frames = 0
+            else:
+                if arm.num_down_frames == 0:
+                    arm.end_time = current_time + delay
+                arm.num_down_frames += 1
+                return True
+        else:
+            arm.num_up_frames = 0
+            arm.start_time = 0
+    return arm.is_up

inference.py ADDED Viewed

	@@ -0,0 +1,215 @@

+# inference.py
+import logging
+import pandas as pd
+import cv2
+import numpy as np
+from pathlib import Path
+import time
+from configs import ModelConfig, InferenceConfig
+from tools.models import load_pipeline
+from utils import POSE_BASED_MODELS
+from data import Arm, get_sample_timestamp, ok_to_get_frame
+from visualization.utils import draw_text_on_image
+from tools.models import Predictions
+def inference(model_config: ModelConfig, inference_config: InferenceConfig, pipeline) -> dict:
+    """
+    Thực hiện quá trình suy luận trên video.
+    Args:
+        model_config (ModelConfig): Cấu hình mô hình.
+        inference_config (InferenceConfig): Cấu hình suy luận.
+        pipeline: Pipeline đã được tải.
+    Returns:
+        dict: Kết quả nhận diện.
+    """
+    # Load video
+    source = str(inference_config.source) if Path(inference_config.source).is_file() else 0
+    cap = cv2.VideoCapture(source)
+    if inference_config.output_dir is not None:
+        output_dir = Path(inference_config.output_dir)
+        output_dir.mkdir(parents=True, exist_ok=True)
+        writer = cv2.VideoWriter(
+            str(output_dir / "output.mp4"),
+            cv2.VideoWriter_fourcc(*"mp4v"),
+            cap.get(cv2.CAP_PROP_FPS),
+            (int(cap.get(3)), int(cap.get(4))),
+        )
+    else:
+        writer = None
+    # Init Mediapipe
+    import mediapipe as mp
+    from mediapipe.python.solutions.pose import PoseLandmark
+    from mediapipe.python.solutions.hands import HandLandmark
+    from mediapipe.python.solutions.drawing_utils import DrawingSpec
+    mp_holistic = mp.solutions.holistic
+    mp_drawing = mp.solutions.drawing_utils
+    mp_drawing_styles = mp.solutions.drawing_styles
+    custom_pose_style = mp_drawing_styles.get_default_pose_landmarks_style()
+    custom_right_hand_style = mp_drawing_styles.get_default_hand_landmarks_style()
+    custom_left_hand_style = mp_drawing_styles.get_default_hand_landmarks_style()
+    custom_pose_connections = list(mp_holistic.POSE_CONNECTIONS)
+    custom_hand_connections = list(mp_holistic.HAND_CONNECTIONS)
+    if inference_config.show_skeleton:
+        pose_landmarks = [
+            PoseLandmark.NOSE,
+            PoseLandmark.LEFT_EYE,
+            PoseLandmark.RIGHT_EYE,
+            PoseLandmark.LEFT_SHOULDER,
+            PoseLandmark.RIGHT_SHOULDER,
+            PoseLandmark.LEFT_ELBOW,
+            PoseLandmark.RIGHT_ELBOW,
+            PoseLandmark.LEFT_WRIST,
+            PoseLandmark.RIGHT_WRIST
+        ]
+        hand_landmarks = [
+            HandLandmark.WRIST,
+            HandLandmark.INDEX_FINGER_TIP, HandLandmark.INDEX_FINGER_DIP, HandLandmark.INDEX_FINGER_PIP, HandLandmark.INDEX_FINGER_MCP,
+            HandLandmark.MIDDLE_FINGER_TIP, HandLandmark.MIDDLE_FINGER_DIP, HandLandmark.MIDDLE_FINGER_PIP, HandLandmark.MIDDLE_FINGER_MCP,
+            HandLandmark.RING_FINGER_TIP, HandLandmark.RING_FINGER_DIP, HandLandmark.RING_FINGER_PIP, HandLandmark.RING_FINGER_MCP,
+            HandLandmark.PINKY_TIP, HandLandmark.PINKY_DIP, HandLandmark.PINKY_PIP, HandLandmark.PINKY_MCP,
+            HandLandmark.THUMB_TIP, HandLandmark.THUMB_IP, HandLandmark.THUMB_MCP, HandLandmark.THUMB_CMC,
+        ]
+        for landmark in PoseLandmark:
+            if landmark in pose_landmarks:
+                custom_pose_style[landmark] = DrawingSpec(color=(0,255,0), thickness=2, circle_radius=2)
+            else:
+                custom_pose_style[landmark] = DrawingSpec(color=(0,0,0), thickness=0, circle_radius=0)
+                custom_pose_connections = [conn for conn in custom_pose_connections if landmark.value not in conn]
+        for landmark in HandLandmark:
+            if landmark in hand_landmarks:
+                custom_right_hand_style[landmark] = DrawingSpec(color=(0,0,255), thickness=2, circle_radius=2)
+                custom_left_hand_style[landmark] = DrawingSpec(color=(255,0,0), thickness=2, circle_radius=2)
+            else:
+                custom_right_hand_style[landmark] = DrawingSpec(color=(0,0,0), thickness=0, circle_radius=0)
+                custom_left_hand_style[landmark] = DrawingSpec(color=(0,0,0), thickness=0, circle_radius=0)
+                custom_hand_connections = [conn for conn in custom_hand_connections if landmark.value not in conn]
+    # Init variables
+    right_arm = Arm("right", inference_config.visibility)
+    left_arm = Arm("left", inference_config.visibility)
+    data = []
+    results = None
+    predictions = Predictions()
+    with mp_holistic.Holistic(min_detection_confidence=0.9, min_tracking_confidence=0.5) as holistic:
+        while cap.isOpened():
+            success, frame = cap.read()
+            if not success:
+                break
+            # Recolor image to RGB, because mp processes on RGB image
+            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+            frame.flags.writeable = False
+            # Make detections
+            detection_results = holistic.process(frame)
+            # Recolor image back to BGR, because cv2 processes on BGR image
+            frame.flags.writeable = True
+            frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
+            # Extract landmarks
+            try:
+                landmarks = detection_results.pose_landmarks.landmark
+            except Exception:
+                continue
+            left_arm.set_pose(landmarks)
+            right_arm.set_pose(landmarks)
+            # Check if arms are up or down
+            left_arm_ok_to_get_frame = ok_to_get_frame(
+                arm=left_arm,
+                angle_threshold=inference_config.angle_threshold,
+                min_num_up_frames=inference_config.min_num_up_frames,
+                min_num_down_frames=inference_config.min_num_down_frames,
+                current_time=cap.get(cv2.CAP_PROP_POS_MSEC),
+                delay=inference_config.delay,
+            )
+            right_arm_ok_to_get_frame = ok_to_get_frame(
+                arm=right_arm,
+                angle_threshold=inference_config.angle_threshold,
+                min_num_up_frames=inference_config.min_num_up_frames,
+                min_num_down_frames=inference_config.min_num_down_frames,
+                current_time=cap.get(cv2.CAP_PROP_POS_MSEC),
+                delay=inference_config.delay,
+            )
+            if left_arm_ok_to_get_frame or right_arm_ok_to_get_frame:
+                predictions = Predictions()
+                data.append(detection_results.pose_landmarks if inference_config.use_pose_model else frame)
+            # Calculate the start and end time of sign
+            start_time, end_time = get_sample_timestamp(left_arm, right_arm)
+            # Convert from miliseconds to seconds
+            start_time /= 1_000
+            end_time /= 1_000
+            if start_time != 0 and end_time != 0:
+                # Run inference
+                start_inference_time = time.time()
+                predictions = Predictions(predictions=pipeline(np.array(data)))
+                predictions.inference_time = time.time() - start_inference_time
+                predictions.start_time = start_time
+                predictions.end_time = end_time
+                logging.info(str(predictions))
+                results = predictions.merge_results(results)
+                # Reset variables
+                start_time = 0
+                end_time = 0
+                left_arm.reset_state()
+                right_arm.reset_state()
+                data = []
+            # Render detections
+            frame = left_arm.visualize(frame, (20, 10), "Left arm angle")
+            frame = right_arm.visualize(frame, (20, 40), "Right arm angle")
+            frame = predictions.visualize(frame, (20, 70))
+            if inference_config.show_skeleton:
+                mp_drawing.draw_landmarks(
+                    frame,
+                    detection_results.pose_landmarks,
+                    connections = custom_pose_connections,
+                    landmark_drawing_spec=custom_pose_style
+                )
+                mp_drawing.draw_landmarks(
+                    frame,
+                    detection_results.right_hand_landmarks,
+                    connections = custom_hand_connections,
+                    landmark_drawing_spec=custom_right_hand_style
+                )
+                mp_drawing.draw_landmarks(
+                    frame,
+                    detection_results.left_hand_landmarks,
+                    connections = custom_hand_connections,
+                    landmark_drawing_spec=custom_left_hand_style
+                )
+            if writer:
+                writer.write(frame)
+    cap.release()
+    if writer:
+        writer.release()
+        logging.info(f"Video is recorded and saved to {inference_config.output_dir / 'output.mp4'}")
+        pd.DataFrame(results).to_csv(inference_config.output_dir / "results.csv", index=False)
+        logging.info(f"Results saved to {inference_config.output_dir / 'results.csv'}")
+    return {
+        "video_path": str(output_path / "output.mp4"),
+        "results": results
+    }

models/dsta_slr_joint_motion_v3_0.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ecfcb2b459fd68bfe838569d41bdb502f7cd21ddd675790146034cf0e6f71632
+size 29678372

models/sl_gcn_joint_v3_0.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3ab4e3b86ec2a828c9e8f72f1f80ca131c0b7439539412fe15244dbcb64fb2a1
+size 17046336

models/spoter_v3.0.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:38c21cd96446475cdc110f7748b11ad58b84cd055133379684f9f463dea8fcbd
+size 24208453

request.py ADDED Viewed

	@@ -0,0 +1,15 @@

+import requests
+url = 'https://<your-hf-space-url>.hf.space/inference'  # URL thực tế sau khi deploy lên HF
+video_path = '/path/to/your_video.mp4'
+params = {
+    'model_name': 'spoter',
+    'output_option': 'all',
+    'output_dir': 'custom_output_folder'  # người dùng có thể chọn folder output
+}
+files = {
+    'file': open(video_path, 'rb')
+}
+response = requests.post(url=url, files=files, params=params)
+print(response.json())

requirements.txt ADDED Viewed

	@@ -0,0 +1,27 @@

+transformers
+pandas
+evaluate
+simple-parsing
+torch
+torchvision
+hf-transfer
+decord
+accelerate
+scikit-learn
+wandb
+pose-format
+torchsummary
+mediapipe
+opencv-python
+onnxruntime
+onnx
+imageio
+tk
+timm
+einops
+fastapi
+uvicorn
+pydantic
+numpy
+opencv-python
+simple_parsing

tools/__init__.py ADDED Viewed

	@@ -0,0 +1,3 @@

+from .models import *
+from .features import *
+# from .utils import exists_on_hf

tools/__pycache__/__init__.cpython-312.pyc ADDED Viewed

Binary file (203 Bytes). View file

tools/__pycache__/__init__.cpython-39.pyc ADDED Viewed

Binary file (234 Bytes). View file

tools/__pycache__/features.cpython-39.pyc ADDED Viewed

Binary file (1.51 kB). View file

tools/__pycache__/models.cpython-312.pyc ADDED Viewed

Binary file (15.4 kB). View file

tools/__pycache__/models.cpython-39.pyc ADDED Viewed

Binary file (9.63 kB). View file

tools/features.py ADDED Viewed

	@@ -0,0 +1,31 @@

+#tools/features.py
+import torch
+from configs import DataConfig
+from features import BaseDataset, VSL98Dataset, VSL400Dataset
+def load_dataset(data_config: DataConfig) -> BaseDataset:
+    '''
+    '''
+    datasets = {
+        'vsl_98': VSL98Dataset,
+        "vsl_400": VSL400Dataset,
+    }
+    return datasets[data_config.dataset](data_config)
+def rgb_collate_fn(examples) -> dict:
+    # permute to (num_frames, num_channels, height, width)
+    pixel_values = torch.stack(
+        [example["video"].permute(1, 0, 2, 3) for example in examples]
+    )
+    labels = torch.tensor([example["label"] for example in examples])
+    return {"pixel_values": pixel_values, "labels": labels}
+def pose_collate_fn(examples) -> dict:
+    # permute to (num_frames, num_channels, height, width)
+    poses = torch.stack([example["pose"] for example in examples])
+    labels = torch.tensor([example["label"] for example in examples])
+    return {"poses": poses, "labels": labels}

tools/models.py ADDED Viewed

	@@ -0,0 +1,443 @@

+#tools/models.py
+import torch
+import logging
+import onnxruntime as ort
+from time import time
+from typing import Union
+from configs import ModelConfig, InferenceConfig
+from utils import (
+    POSE_BASED_MODELS,
+    RGB_BASED_MODELS,
+    HUGGINGFACE_RGB_BASED_MODELS,
+    TORCHHUB_RGB_BASED_MODELS,
+)
+from transformers import (
+    ImageProcessingMixin,
+    FeatureExtractionMixin,
+    AutoModelForVideoClassification,
+    AutoModel,
+    Pipeline,
+    pipeline,
+)
+from transformers.pipelines import PIPELINE_REGISTRY
+from visualization import draw_text_on_image
+from utils import exists_on_hf
+from models import (
+    Swin3DConfig, Swin3DImageProcessor, Swin3DForVideoClassification,
+    S3DConfig, S3DImageProcessor, S3DForVideoClassification,
+    VideoResNetConfig, VideoResNetImageProcessor, VideoResNetForVideoClassification,
+    MViTConfig, MViTImageProcessor, MViTForVideoClassification,
+    SLGCNConfig, SLGCNFeatureExtractor, SLGCNForGraphClassification,
+    SPOTERConfig, SPOTERFeatureExtractor, SPOTERForGraphClassification,
+    DSTASLRConfig, DSTASLRFeatureExtractor, DSTASLRForGraphClassification,
+    VideoMAEConfig, VideoMAEImageProcessor, VideoMAEForVideoClassification
+)
+from pipelines import (
+    VideoClassificationPipeline,
+    SLGCNGraphClassificationPipeline,
+    SPOTERGraphClassificationPipeline,
+)
+def load_model(
+    model_config: ModelConfig,
+    label2id: dict = None,
+    id2label: dict = None,
+    do_train: bool = False,
+) -> tuple:
+    '''
+    '''
+    if do_train:
+        if model_config.arch in POSE_BASED_MODELS:
+            return load_pose_model_for_training(model_config, label2id, id2label)
+        return load_rgb_model_for_training(model_config, label2id, id2label)
+    if model_config.arch in POSE_BASED_MODELS:
+        processor = FeatureExtractionMixin.from_pretrained(
+            model_config.pretrained,
+            trust_remote_code=True,
+            cache_dir="models/huggingface",
+        )
+        model = AutoModel.from_pretrained(
+            model_config.pretrained,
+            trust_remote_code=True,
+            cache_dir="models/huggingface",
+        )
+    else:
+        processor = ImageProcessingMixin.from_pretrained(
+            model_config.pretrained,
+            trust_remote_code=True,
+            cache_dir="models/huggingface",
+        )
+        model = AutoModelForVideoClassification.from_pretrained(
+            model_config.pretrained,
+            trust_remote_code=True,
+            cache_dir="models/huggingface",
+        )
+    model.eval()
+    return model.config, processor, model
+def load_rgb_model_for_training(
+    model_config: ModelConfig,
+    label2id: dict = None,
+    id2label: dict = None,
+) -> tuple:
+    '''
+    '''
+    if model_config.arch in HUGGINGFACE_RGB_BASED_MODELS:
+        if model_config.arch == "videomae":
+            config_class = VideoMAEConfig
+            processor_class = VideoMAEImageProcessor
+            model_class = VideoMAEForVideoClassification
+    elif exists_on_hf(model_config.pretrained):
+        processor = ImageProcessingMixin.from_pretrained(
+            model_config.pretrained,
+            trust_remote_code=True,
+            cache_dir="models/huggingface",
+        )
+        model = AutoModelForVideoClassification.from_pretrained(
+            model_config.pretrained,
+            label2id,
+            id2label,
+            ignore_mismatched_sizes=True,
+            trust_remote_code=True,
+            cache_dir="models/huggingface",
+        )
+        return model.config, processor, model
+    elif model_config.arch in TORCHHUB_RGB_BASED_MODELS:
+        if model_config.arch in ['swin3d_t', 'swin3d_s', 'swin3d_b']:
+            config_class = Swin3DConfig
+            processor_class = Swin3DImageProcessor
+            model_class = Swin3DForVideoClassification
+        elif model_config.arch in ['r3d_18', 'mc3_18', 'r2plus1d_18']:
+            config_class = VideoResNetConfig
+            processor_class = VideoResNetImageProcessor
+            model_class = VideoResNetForVideoClassification
+        elif model_config.arch in ['s3d']:
+            config_class = S3DConfig
+            processor_class = S3DImageProcessor
+            model_class = S3DForVideoClassification
+        elif model_config.arch in ['mvit_v1_b', 'mvit_v2_s']:
+            config_class = MViTConfig
+            processor_class = MViTImageProcessor
+            model_class = MViTForVideoClassification
+    else:
+        logging.error(f"Model {model_config.arch} is not supported")
+        exit(1)
+    config_class.register_for_auto_class()
+    processor_class.register_for_auto_class("AutoImageProcessor")
+    model_class.register_for_auto_class("AutoModel")
+    model_class.register_for_auto_class("AutoModelForVideoClassification")
+    logging.info(f"{model_config.arch} classes registered")
+    config = config_class(**vars(model_config))
+    processor = processor_class(config=config)
+    model = model_class(config=config, label2id=label2id, id2label=id2label)
+    return config, processor, model
+def load_pose_model_for_training(
+    model_config: ModelConfig,
+    label2id: dict = None,
+    id2label: dict = None,
+) -> tuple:
+    '''
+    '''
+    if exists_on_hf(model_config.pretrained):
+        processor = FeatureExtractionMixin.from_pretrained(
+            model_config.pretrained,
+            trust_remote_code=True,
+            cache_dir="models/huggingface",
+        )
+        model = AutoModel.from_pretrained(
+            model_config.pretrained,
+            label2id=label2id,
+            id2label=id2label,
+            ignore_mismatched_sizes=True,
+            trust_remote_code=True,
+            cache_dir="models/huggingface",
+        )
+        return model.config, processor, model
+    elif model_config.arch in POSE_BASED_MODELS:
+        if model_config.arch == "spoter":
+            config_class = SPOTERConfig
+            processor_class = SPOTERFeatureExtractor
+            model_class = SPOTERForGraphClassification
+        elif model_config.arch == "sl_gcn":
+            config_class = SLGCNConfig
+            processor_class = SLGCNFeatureExtractor
+            model_class = SLGCNForGraphClassification
+        elif model_config.arch == "dsta_slr":
+            config_class = DSTASLRConfig
+            processor_class = DSTASLRFeatureExtractor
+            model_class = DSTASLRForGraphClassification
+    else:
+        logging.error(f"Model {model_config.arch} is not supported")
+        exit(1)
+    config_class.register_for_auto_class()
+    processor_class.register_for_auto_class("AutoFeatureExtractor")
+    model_class.register_for_auto_class("AutoModel")
+    logging.info(F"Registering {model_config.arch} classes")
+    config = config_class(**vars(model_config))
+    processor = processor_class(config=config)
+    model = model_class(config=config, label2id=label2id, id2label=id2label)
+    return config, processor, model
+class Predictions:
+    def __init__(
+        self,
+        predictions: list[dict] = None,
+        inference_time: float = 0,
+        start_time: float = 0,
+        end_time: float = 0,
+    ) -> None:
+        self.predictions = predictions
+        self.inference_time = inference_time
+        self.start_time = start_time
+        self.end_time = end_time
+    def visualize(
+        self,
+        frame: torch.Tensor,
+        position: tuple = (20, 100),
+        prefix: str = "Predictions",
+        color: tuple = (0, 0, 255),
+    ) -> None:
+        text = prefix + ": " + self.get_pred_message()
+        return draw_text_on_image(
+            image=frame,
+            text=text,
+            position=position,
+            color=color,
+            font_size=20,
+        )
+    def get_pred_message(self) -> str:
+        if not any((
+            self.start_time,
+            self.end_time,
+            self.inference_time,
+            self.predictions
+        )):
+            return ""
+        return ', '.join(
+            [
+                f"{pred['gloss']} ({pred['score']*100:.2f}%)"
+                for pred in self.predictions
+            ]
+        )
+    def __str__(self) -> str:
+        if not any((
+            self.start_time,
+            self.end_time,
+            self.inference_time,
+            self.predictions
+        )):
+            return ""
+        predictions = self.get_pred_message()
+        message = "Sample start: {:.2f}s - end: {:.2f}s | Runtime: {:.2f}s | Predictions: {}"
+        return message.format(self.start_time, self.end_time, self.inference_time, predictions)
+    def merge_results(self, results: dict = None) -> dict:
+        if results is None:
+            results = {
+                "start_time": [],
+                "end_time": [],
+                "inference_time": [],
+                "prediction": [],
+            }
+        results["start_time"].append(self.start_time)
+        results["end_time"].append(self.end_time)
+        results["inference_time"].append(self.inference_time)
+        results["prediction"].append(self.predictions)
+        return results
+def get_predictions(
+    inputs: torch.Tensor,
+    model: Union[ort.InferenceSession, AutoModel],
+    id2gloss: dict,
+    k: int = 3,
+) -> Predictions:
+    '''
+    Get the top-k predictions.
+    Parameters
+    ----------
+    inputs : torch.Tensor
+        Model inputs (Time, Height, Width, Channels).
+    model : Union[ort.InferenceSession, AutoModel]
+        Model to get predictions from.
+    id2gloss : dict
+        Mapping of class indices to glosses.
+    k : int, optional
+        Number of predictions to return, by default 3.
+    Returns
+    -------
+    tuple
+        List of top-k predictions and inference time.
+    '''
+    if inputs is None:
+        return Predictions()
+    # Get logits
+    start_time = time()
+    if isinstance(model, ort.InferenceSession):
+        inputs = inputs.cpu().numpy()
+        logits = torch.from_numpy(model.run(None, {"pixel_values": inputs})[0])
+    else:
+        logits = model(inputs.to(model.device)).logits
+    inference_time = time() - start_time
+    # Get top-3 predictions
+    topk_scores, topk_indices = torch.topk(logits, k, dim=1)
+    topk_scores = torch.nn.functional.softmax(topk_scores, dim=1).squeeze().detach().numpy()
+    topk_indices = topk_indices.squeeze().detach().numpy()
+    predictions = [
+        {
+            'gloss': id2gloss[str(topk_indices[i])],
+            'score': topk_scores[i],
+        }
+        for i in range(k)
+    ]
+    return Predictions(predictions=predictions, inference_time=inference_time)
+def register_pipeline(model_config: ModelConfig) -> Pipeline:
+    '''
+    '''
+    _, processor, model = load_model(model_config)
+    if model_config.arch == "spoter":
+        PIPELINE_REGISTRY.register_pipeline(
+            "video-classification",
+            pipeline_class=SPOTERGraphClassificationPipeline,
+            pt_model=AutoModel,
+            default={"pt": ("vsltranslation/spoter_v3.0", "main")},
+            type="multimodal",
+        )
+        return SPOTERGraphClassificationPipeline(
+            model=model,
+            feature_extractor=processor,
+        )
+    elif model_config.arch in ["sl_gcn", "dsta_slr"]:
+        PIPELINE_REGISTRY.register_pipeline(
+            "video-classification",
+            pipeline_class=SLGCNGraphClassificationPipeline,
+            pt_model=AutoModel,
+            default={"pt": ("vsltranslation/sl_gcn_joint_v1.0", "main")},
+            type="multimodal",
+        )
+        return SLGCNGraphClassificationPipeline(
+            model=model,
+            feature_extractor=processor,
+        )
+    PIPELINE_REGISTRY.register_pipeline(
+        "video-classification",
+        pipeline_class=VideoClassificationPipeline,
+        pt_model=AutoModelForVideoClassification,
+        default={"pt": ("vsltranslation/swin3d_t_v1.0", "main")},
+        type="multimodal",
+    )
+    return VideoClassificationPipeline(
+        model=model,
+        image_processor=processor,
+    )
+def load_pipeline(
+    model_config: ModelConfig,
+    inference_config: InferenceConfig,
+) -> Pipeline:
+    '''
+    '''
+    if model_config.arch in POSE_BASED_MODELS:
+        return pipeline(
+            "video-classification",
+            model=model_config.pretrained,
+            feature_extractor=model_config.pretrained,
+            device=inference_config.device,
+            model_kwargs={
+                "cache_dir": inference_config.cache_dir,
+            },
+            trust_remote_code=True,
+            use_onnx=inference_config.use_onnx,
+            top_k=inference_config.top_k,
+            bone_stream=inference_config.bone_stream,
+            motion_stream=inference_config.motion_stream,
+        )
+    return pipeline(
+        "video-classification",
+        model=model_config.pretrained,
+        image_processor=model_config.pretrained,
+        device=inference_config.device,
+        model_kwargs={
+            "cache_dir": inference_config.cache_dir,
+        },
+        trust_remote_code=True,
+        use_onnx=inference_config.use_onnx,
+        top_k=inference_config.top_k,
+    )
+def get_input_shape(
+    arch: str,
+    processor: Union[ImageProcessingMixin, FeatureExtractionMixin],
+    batch_size: int = 1,
+) -> tuple:
+    '''
+    Get the input shape for the model.
+    Parameters
+    ----------
+    processor : Union[ImageProcessingMixin, FeatureExtractionMixin]
+        Model processor.
+    batch_size : int, optional
+        Batch size, by default 1.
+    Returns
+    -------
+    tuple
+        Input shape.
+    '''
+    if arch in RGB_BASED_MODELS:
+        return (
+            batch_size,
+            processor.num_frames,
+            3,
+            processor.size["height"],
+            processor.size["width"]
+        )
+    elif arch in POSE_BASED_MODELS:
+        if arch == "spoter":
+            return (
+                batch_size,
+                processor.num_frames,
+                processor.num_points,
+                processor.in_channels,
+            )
+        elif arch in ["sl_gcn", "dsta_slr"]:
+            return (
+                batch_size,
+                processor.in_channels,
+                processor.window_size,
+                processor.num_points,
+                processor.num_people,
+            )
+        else:
+            logging.error(f"Model {arch} is not supported")
+            exit(1)
+    else:
+        logging.error(f"Model {arch} is not supported")
+        exit(1)

utils/__init__.py ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ from .loggers import *
2	+ from .constants import *

utils/__pycache__/__init__.cpython-312.pyc ADDED Viewed

Binary file (205 Bytes). View file

utils/__pycache__/constants.cpython-312.pyc ADDED Viewed

Binary file (4.35 kB). View file

utils/__pycache__/loggers.cpython-312.pyc ADDED Viewed

Binary file (1.59 kB). View file

utils/constants.py ADDED Viewed

	@@ -0,0 +1,160 @@

+#utils/constants.py
+import numpy as np
+# File suffixes listed as video extensions.
+VIDEO_EXTENSIONS = (".mp4", ".avi", ".mov", ".mkv")
+# RGB architecture names grouped under the TORCHHUB prefix.
+TORCHHUB_RGB_BASED_MODELS = (
+    'swin3d_t',
+    'swin3d_s',
+    'swin3d_b',
+    "r3d_18",
+    "mc3_18",
+    "r2plus1d_18",
+    "s3d",
+    "mvit_v1_b",
+    "mvit_v2_s",
+)
+# RGB architecture names grouped under the HUGGINGFACE prefix.
+HUGGINGFACE_RGB_BASED_MODELS = (
+    "videomae",
+)
+# All RGB architectures: the Hugging Face group followed by the torch hub group.
+RGB_BASED_MODELS = HUGGINGFACE_RGB_BASED_MODELS + TORCHHUB_RGB_BASED_MODELS
+# Pose (keypoint) based architecture names.
+POSE_BASED_MODELS = (
+    "spoter",
+    "sl_gcn",
+    "dsta_slr"
+)
+# Every supported architecture name, RGB first and pose-based last.
+MODELS = RGB_BASED_MODELS + POSE_BASED_MODELS
+# 21 hand landmark names: the wrist, then four joints per finger listed from
+# the tip towards the base (index, middle, ring, little, thumb).
+HAND_LANDMARKS = [
+    "wrist",
+    "indexTip",
+    "indexDIP",
+    "indexPIP",
+    "indexMCP",
+    "middleTip",
+    "middleDIP",
+    "middlePIP",
+    "middleMCP",
+    "ringTip",
+    "ringDIP",
+    "ringPIP",
+    "ringMCP",
+    "littleTip",
+    "littleDIP",
+    "littlePIP",
+    "littleMCP",
+    "thumbTip",
+    "thumbIP",
+    "thumbMP",
+    "thumbCMC",
+]
+# 12 upper-body landmark names; paired landmarks are listed right before left.
+BODY_LANDMARKS = [
+    "nose",
+    "neck",
+    "rightEye",
+    "leftEye",
+    "rightEar",
+    "leftEar",
+    "rightShoulder",
+    "leftShoulder",
+    "rightElbow",
+    "leftElbow",
+    "rightWrist",
+    "leftWrist",
+]
+# Arm chain from the neck out to the wrist. "$side$" is a placeholder in the
+# names; presumably it is replaced by "left"/"right" by the consumer -- TODO confirm.
+ARM_LANDMARKS_ORDER = ["neck", "$side$Shoulder", "$side$Elbow", "$side$Wrist"]
+# Permutation of 27 joint positions: position 0 stays, 1<->2, 3<->4 and 5<->6
+# are swapped, and the blocks 7-16 and 17-26 trade places.
+# NOTE(review): looks like a left/right mirror for the 27-joint layout below -- confirm.
+FLIP_IDXS = np.concatenate(
+    (
+        [0, 2, 1, 4, 3, 6, 5],
+        [17, 18, 19, 20, 21, 22, 23, 24, 25, 26],
+        [7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
+    ),
+    axis=0,
+)
+# Joint subsets keyed by how many joints they contain; each value is an array
+# of keypoint indices drawn from the ranges 0-16 and 91-132.
+SLGCN_JOINTS = {
+    59: np.concatenate((np.arange(0, 17), np.arange(91, 133)), axis=0),  # 17 + 42 = 59 joints
+    31: np.concatenate(
+        (
+            np.arange(0, 11),
+            [91, 95, 96, 99, 100, 103, 104, 107, 108, 111],
+            [112, 116, 117, 120, 121, 124, 125, 128, 129, 132],
+        ),
+        axis=0,
+    ),  # 11 + 10 + 10 = 31 joints
+    27: np.concatenate(
+        (
+            [0, 5, 6, 7, 8, 9, 10],
+            [91, 95, 96, 99, 100, 103, 104, 107, 108, 111],
+            [112, 116, 117, 120, 121, 124, 125, 128, 129, 132],
+        ),
+        axis=0,
+    ),  # 7 + 10 + 10 = 27 joints
+}
+# Maps an integer keypoint index to a (landmark group, landmark name) pair.
+# Indices 0-16 belong to POSE_LANDMARKS, 91-111 to LEFT_HAND_LANDMARKS and
+# 112-132 to RIGHT_HAND_LANDMARKS; indices 17-90 have no entry.
+# NOTE(review): the name suggests the keys are COCO keypoint indices -- confirm.
+COCO_TO_POSE_FORMAT = {
+    0: ("POSE_LANDMARKS", "NOSE"),
+    1: ("POSE_LANDMARKS", "LEFT_EYE"),
+    2: ("POSE_LANDMARKS", "RIGHT_EYE"),
+    3: ("POSE_LANDMARKS", "LEFT_EAR"),
+    4: ("POSE_LANDMARKS", "RIGHT_EAR"),
+    5: ("POSE_LANDMARKS", "LEFT_SHOULDER"),
+    6: ("POSE_LANDMARKS", "RIGHT_SHOULDER"),
+    7: ("POSE_LANDMARKS", "LEFT_ELBOW"),
+    8: ("POSE_LANDMARKS", "RIGHT_ELBOW"),
+    9: ("POSE_LANDMARKS", "LEFT_WRIST"),
+    10: ("POSE_LANDMARKS", "RIGHT_WRIST"),
+    11: ("POSE_LANDMARKS", "LEFT_HIP"),
+    12: ("POSE_LANDMARKS", "RIGHT_HIP"),
+    13: ("POSE_LANDMARKS", "LEFT_KNEE"),
+    14: ("POSE_LANDMARKS", "RIGHT_KNEE"),
+    15: ("POSE_LANDMARKS", "LEFT_ANKLE"),
+    16: ("POSE_LANDMARKS", "RIGHT_ANKLE"),
+    91: ("LEFT_HAND_LANDMARKS", "WRIST"),
+    92: ("LEFT_HAND_LANDMARKS", "THUMB_CMC"),
+    93: ("LEFT_HAND_LANDMARKS", "THUMB_MCP"),
+    94: ("LEFT_HAND_LANDMARKS", "THUMB_IP"),
+    95: ("LEFT_HAND_LANDMARKS", "THUMB_TIP"),
+    96: ("LEFT_HAND_LANDMARKS", "INDEX_FINGER_MCP"),
+    97: ("LEFT_HAND_LANDMARKS", "INDEX_FINGER_PIP"),
+    98: ("LEFT_HAND_LANDMARKS", "INDEX_FINGER_DIP"),
+    99: ("LEFT_HAND_LANDMARKS", "INDEX_FINGER_TIP"),
+    100: ("LEFT_HAND_LANDMARKS", "MIDDLE_FINGER_MCP"),
+    101: ("LEFT_HAND_LANDMARKS", "MIDDLE_FINGER_PIP"),
+    102: ("LEFT_HAND_LANDMARKS", "MIDDLE_FINGER_DIP"),
+    103: ("LEFT_HAND_LANDMARKS", "MIDDLE_FINGER_TIP"),
+    104: ("LEFT_HAND_LANDMARKS", "RING_FINGER_MCP"),
+    105: ("LEFT_HAND_LANDMARKS", "RING_FINGER_PIP"),
+    106: ("LEFT_HAND_LANDMARKS", "RING_FINGER_DIP"),
+    107: ("LEFT_HAND_LANDMARKS", "RING_FINGER_TIP"),
+    108: ("LEFT_HAND_LANDMARKS", "PINKY_MCP"),
+    109: ("LEFT_HAND_LANDMARKS", "PINKY_PIP"),
+    110: ("LEFT_HAND_LANDMARKS", "PINKY_DIP"),
+    111: ("LEFT_HAND_LANDMARKS", "PINKY_TIP"),
+    112: ("RIGHT_HAND_LANDMARKS", "WRIST"),
+    113: ("RIGHT_HAND_LANDMARKS", "THUMB_CMC"),
+    114: ("RIGHT_HAND_LANDMARKS", "THUMB_MCP"),
+    115: ("RIGHT_HAND_LANDMARKS", "THUMB_IP"),
+    116: ("RIGHT_HAND_LANDMARKS", "THUMB_TIP"),
+    117: ("RIGHT_HAND_LANDMARKS", "INDEX_FINGER_MCP"),
+    118: ("RIGHT_HAND_LANDMARKS", "INDEX_FINGER_PIP"),
+    119: ("RIGHT_HAND_LANDMARKS", "INDEX_FINGER_DIP"),
+    120: ("RIGHT_HAND_LANDMARKS", "INDEX_FINGER_TIP"),
+    121: ("RIGHT_HAND_LANDMARKS", "MIDDLE_FINGER_MCP"),
+    122: ("RIGHT_HAND_LANDMARKS", "MIDDLE_FINGER_PIP"),
+    123: ("RIGHT_HAND_LANDMARKS", "MIDDLE_FINGER_DIP"),
+    124: ("RIGHT_HAND_LANDMARKS", "MIDDLE_FINGER_TIP"),
+    125: ("RIGHT_HAND_LANDMARKS", "RING_FINGER_MCP"),
+    126: ("RIGHT_HAND_LANDMARKS", "RING_FINGER_PIP"),
+    127: ("RIGHT_HAND_LANDMARKS", "RING_FINGER_DIP"),
+    128: ("RIGHT_HAND_LANDMARKS", "RING_FINGER_TIP"),
+    129: ("RIGHT_HAND_LANDMARKS", "PINKY_MCP"),
+    130: ("RIGHT_HAND_LANDMARKS", "PINKY_PIP"),
+    131: ("RIGHT_HAND_LANDMARKS", "PINKY_DIP"),
+    132: ("RIGHT_HAND_LANDMARKS", "PINKY_TIP"),
+}

utils/loggers.py ADDED Viewed

	@@ -0,0 +1,26 @@

+#utils/loggers.py
+import sys
+import logging
+from pathlib import Path
+from transformers import TrainerCallback
+class TrainingCallback(TrainerCallback):
+    '''Trainer callback that writes every logged payload to the root logger.'''
+    def on_log(self, args, state, control, logs=None, **kwargs):
+        '''Log ``logs`` as-is at INFO level; the other arguments are unused.'''
+        logging.info(logs)
+def config_logger(log_file: str = None) -> None:
+    handlers = [logging.StreamHandler(sys.stdout)]
+    if log_file is not None:
+        log_dir = Path(log_file).parent
+        if not log_dir.exists():
+            log_dir.mkdir(parents=True, exist_ok=True)
+        handlers.append(logging.FileHandler(filename=log_file))
+    logging.basicConfig(
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+        format="[%(asctime)s] {%(filename)s:%(lineno)d} %(levelname)s - %(message)s",
+        handlers=handlers
+    )

visualization/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ from .utils import *

visualization/__pycache__/__init__.cpython-312.pyc ADDED Viewed

Binary file (187 Bytes). View file

visualization/__pycache__/__init__.cpython-39.pyc ADDED Viewed

Binary file (221 Bytes). View file

visualization/__pycache__/utils.cpython-312.pyc ADDED Viewed

Binary file (2.44 kB). View file

visualization/__pycache__/utils.cpython-39.pyc ADDED Viewed

Binary file (1.7 kB). View file

visualization/utils.py ADDED Viewed

	@@ -0,0 +1,57 @@

+#visualization/utils.py
+import torch
+import numpy as np
+from imageio import mimsave
+from PIL import Image, ImageDraw, ImageFont
+def unnormalize_img(image: np.ndarray, std: tuple, mean: tuple) -> np.ndarray:
+    image = (image * std) + mean
+    image = (image * 255).astype('uint8')
+    return image.clip(0, 255)
+def save_as_gif(
+    video_tensor: torch.Tensor,
+    save_path: str = 'sample.gif',
+    std: tuple = None,
+    mean: tuple = None,
+):
+    frames = []
+    for video_frame in video_tensor:
+        frame_unnormalized = unnormalize_img(
+            image=video_frame.permute(1, 2, 0).numpy(),
+            std=std,
+            mean=mean,
+        )
+        frames.append(frame_unnormalized)
+    kargs = {'duration': 0.25}
+    mimsave(save_path, frames, 'GIF', **kargs)
+    return save_path
+def display_gif(gif_path: str) -> Image:
+    return Image(filename=gif_path)
+def draw_text_on_image(
+    image: np.ndarray,
+    text: str,
+    position: tuple = (20, 20),
+    color: tuple = (0, 0, 255),
+    font_size: int = 20,
+) -> np.ndarray:
+    font = ImageFont.truetype(
+        font="fonts/OpenSans-Regular.ttf",
+        size=font_size,
+    )
+    pil_image = Image.fromarray(image)
+    draw = ImageDraw.Draw(pil_image)
+    draw.text(
+        xy=position,
+        text=text,
+        fill=color,
+        font=font,
+    )
+    return np.array(pil_image)