Spaces:
Running
Running
Commit ·
061439f
0
Parent(s):
Deploy MotionBench Space
Browse files- .dockerignore +23 -0
- .gitignore +26 -0
- Dockerfile +34 -0
- README.md +192 -0
- assets/motionbench.png +0 -0
- requirements.txt +15 -0
- scripts/app/motionbench.py +244 -0
- scripts/benchmark/benchmark_inference.py +377 -0
- scripts/evaluate/benchmark_rep_counting.py +202 -0
- scripts/evaluate/evaluate_home_set.py +399 -0
- scripts/evaluate/rep_counting_methods.py +148 -0
- scripts/preprocess/build_similarity_assets.py +116 -0
- scripts/preprocess/create_fixed_splits.py +127 -0
- scripts/preprocess/create_sequence_of_features.py +132 -0
- scripts/preprocess/extract_features.py +301 -0
- scripts/realtime_eval/evaluate_realtime_webcam.py +412 -0
.dockerignore
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.git
|
| 2 |
+
.github
|
| 3 |
+
__pycache__/
|
| 4 |
+
*.pyc
|
| 5 |
+
*.pyo
|
| 6 |
+
*.pyd
|
| 7 |
+
|
| 8 |
+
.venv/
|
| 9 |
+
venv/
|
| 10 |
+
env/
|
| 11 |
+
.env
|
| 12 |
+
|
| 13 |
+
data/
|
| 14 |
+
archive/
|
| 15 |
+
models/
|
| 16 |
+
results/
|
| 17 |
+
checkpoints/
|
| 18 |
+
wandb/
|
| 19 |
+
runs/
|
| 20 |
+
|
| 21 |
+
.ipynb_checkpoints/
|
| 22 |
+
.DS_Store
|
| 23 |
+
.vscode/
|
.gitignore
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__pycache__/
|
| 2 |
+
.ipynb_checkpoints/
|
| 3 |
+
__pycache__/
|
| 4 |
+
*.pyc
|
| 5 |
+
*.pyo
|
| 6 |
+
*.pyd
|
| 7 |
+
|
| 8 |
+
.venv/
|
| 9 |
+
venv/
|
| 10 |
+
env/
|
| 11 |
+
|
| 12 |
+
.vscode/
|
| 13 |
+
.idea/
|
| 14 |
+
.DS_Store
|
| 15 |
+
Thumbs.db
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
data/**
|
| 19 |
+
!data/.gitkeep
|
| 20 |
+
|
| 21 |
+
models/**
|
| 22 |
+
!models/.gitkeep
|
| 23 |
+
!models/*/
|
| 24 |
+
!models/*/train.py
|
| 25 |
+
|
| 26 |
+
archive/**
|
Dockerfile
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.11-slim
|
| 2 |
+
|
| 3 |
+
WORKDIR /app
|
| 4 |
+
|
| 5 |
+
RUN apt-get update && apt-get install -y --no-install-recommends \
|
| 6 |
+
libglib2.0-0 \
|
| 7 |
+
libgl1 \
|
| 8 |
+
libsm6 \
|
| 9 |
+
libxext6 \
|
| 10 |
+
libxrender1 \
|
| 11 |
+
ffmpeg \
|
| 12 |
+
git \
|
| 13 |
+
git-lfs \
|
| 14 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 15 |
+
|
| 16 |
+
ENV PYTHONUNBUFFERED=1 \
|
| 17 |
+
STREAMLIT_BROWSER_GATHER_USAGE_STATS=false
|
| 18 |
+
|
| 19 |
+
COPY requirements.txt ./
|
| 20 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 21 |
+
|
| 22 |
+
# scripts to run the app and utilities
|
| 23 |
+
COPY scripts ./scripts
|
| 24 |
+
|
| 25 |
+
# Clone the models from my Hugging Face repository
|
| 26 |
+
RUN git clone --depth 1 https://huggingface.co/johnamit/motionbench-models /tmp/motionbench-models \
|
| 27 |
+
&& cd /tmp/motionbench-models \
|
| 28 |
+
&& git lfs pull \
|
| 29 |
+
&& mv /tmp/motionbench-models/models /app/models \
|
| 30 |
+
&& rm -rf /tmp/motionbench-models
|
| 31 |
+
|
| 32 |
+
EXPOSE 7860
|
| 33 |
+
|
| 34 |
+
ENTRYPOINT ["streamlit", "run", "scripts/app/motionbench.py", "--server.port=7860", "--server.address=0.0.0.0"]
|
README.md
ADDED
|
@@ -0,0 +1,192 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: MotionBench
|
| 3 |
+
emoji: 🏋️
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: indigo
|
| 6 |
+
sdk: docker
|
| 7 |
+
app_port: 7860
|
| 8 |
+
pinned: false
|
| 9 |
+
---
|
| 10 |
+
|
| 11 |
+
<img src="assets/motionbench.png" alt="MotionBench" width="900"><br>
|
| 12 |
+
|
| 13 |
+
[<img src="https://img.shields.io/badge/HuggingFace-Dataset-black?style=for-the-badge&logo=huggingface&logoColor=FFD21E&labelColor=ff7f1e" alt="View Dataset on Hugging Face"/>](https://huggingface.co/datasets/johnamit/motionbench-data)
|
| 14 |
+
|
| 15 |
+
[<img src="https://img.shields.io/badge/HuggingFace-Models-black?style=for-the-badge&logo=huggingface&logoColor=FFD21E&labelColor=ff7f1e" alt="View Models on Hugging Face"/>](https://huggingface.co/johnamit/motionbench-models)
|
| 16 |
+
|
| 17 |
+
MotionBench is a real-time pose-based exercise recognition project designed for practical local usage. It classifies exercise motion from short temporal windows, estimates repetition counts with a deterministic finite-state method, and reports similarity against class-level motion centroids.
|
| 18 |
+
|
| 19 |
+
<p>
|
| 20 |
+
<a href="#overview"><img src="https://img.shields.io/badge/Overview-111111?style=for-the-badge" alt="Overview"></a>
|
| 21 |
+
<a href="#dataset"><img src="https://img.shields.io/badge/Dataset-111111?style=for-the-badge" alt="Dataset"></a>
|
| 22 |
+
<a href="#models"><img src="https://img.shields.io/badge/Models-111111?style=for-the-badge" alt="Models"></a>
|
| 23 |
+
<a href="#training"><img src="https://img.shields.io/badge/Training-111111?style=for-the-badge" alt="Training"></a>
|
| 24 |
+
<a href="#inference-local"><img src="https://img.shields.io/badge/Inference-111111?style=for-the-badge" alt="Inference"></a>
|
| 25 |
+
<a href="#streamlit-app"><img src="https://img.shields.io/badge/Streamlit-111111?style=for-the-badge" alt="Streamlit"></a>
|
| 26 |
+
<a href="#citations"><img src="https://img.shields.io/badge/Citations-111111?style=for-the-badge" alt="Citations"></a>
|
| 27 |
+
<a href="#license"><img src="https://img.shields.io/badge/License-111111?style=for-the-badge" alt="License"></a>
|
| 28 |
+
</p>
|
| 29 |
+
|
| 30 |
+
## Overview
|
| 31 |
+
MotionBench is built to run the full workflow from start to finish. You can prepare sequence data, train models, benchmark inference, and run real-time prediction from a webcam.
|
| 32 |
+
|
| 33 |
+
The runtime pipeline is simple. It captures frames, extracts pose-based features, builds rolling windows, and predicts one of six exercise classes. It also estimates repetitions with a deterministic finite-state method and reports a centroid similarity score for live feedback.
|
| 34 |
+
|
| 35 |
+
Core work happens in `data/`, `models/`, `scripts/`, and `results/`. Older or non-essential files are moved to `archive/` to keep the main repository clear and easy to review.
|
| 36 |
+
|
| 37 |
+
## Dataset
|
| 38 |
+
This repo stays lightweight on GitHub. Download the dataset files from Hugging Face and place them in `data/`.
|
| 39 |
+
|
| 40 |
+
```bash
|
| 41 |
+
git clone https://huggingface.co/datasets/johnamit/motionbench-data data
|
| 42 |
+
```
|
| 43 |
+
|
| 44 |
+
For local usage, keep split files under `data/`.
|
| 45 |
+
|
| 46 |
+
The workflow expects fixed sequence splits (`train`, `val`, `test_internal`) and optionally a separate home/generalization test split.
|
| 47 |
+
|
| 48 |
+
Expected split files:
|
| 49 |
+
- `data/train_sequences.csv`
|
| 50 |
+
- `data/val_sequences.csv`
|
| 51 |
+
- `data/test_internal_sequences.csv`
|
| 52 |
+
- `data/test_home_sequences.csv` (optional for home/generalization evaluation)
|
| 53 |
+
|
| 54 |
+
To regenerate centralized fixed splits:
|
| 55 |
+
|
| 56 |
+
```bash
|
| 57 |
+
python scripts/preprocess/create_fixed_splits.py --input-file data/train_sequences_full.csv --output-dir data
|
| 58 |
+
```
|
| 59 |
+
|
| 60 |
+
## Models
|
| 61 |
+
Download trained model files from Hugging Face and place them in `models/`.
|
| 62 |
+
|
| 63 |
+
```bash
|
| 64 |
+
git clone https://huggingface.co/johnamit/motionbench-models models
|
| 65 |
+
```
|
| 66 |
+
|
| 67 |
+
This project includes six sequence models with different strengths. Some are strong on temporal memory, some are better for latency, and some are better at capturing structured feature relationships.
|
| 68 |
+
|
| 69 |
+
**BiLSTM:**
|
| 70 |
+
The bidirectional LSTM processes each sequence in forward and backward directions within the input window, so the classifier can use context from both ends of the motion segment. This helps when important movement details are spread across the whole sequence, not just a single frame.
|
| 71 |
+
|
| 72 |
+
**LSTM:**
|
| 73 |
+
Unidirectional LSTM reads movement step by step in time. It is a simple and reliable sequence model, so it works well as a strong baseline for exercise classification while keeping runtime reasonable.
|
| 74 |
+
|
| 75 |
+
**GRU:**
|
| 76 |
+
The GRU uses gating similar to LSTM but with fewer internal components, which can reduce parameter count and improve efficiency. In practice, it is a strong candidate when you want robust sequence modeling with lighter recurrent overhead.
|
| 77 |
+
|
| 78 |
+
**TCN:**
|
| 79 |
+
The temporal convolutional network uses dilated 1D convolutions and residual blocks to learn patterns over short and long time ranges. Because convolutional operations are parallelizable, it is often fast at inference, which makes it a good option when responsiveness matters.
|
| 80 |
+
|
| 81 |
+
**CNN-BiLSTM:**
|
| 82 |
+
This hybrid architecture first applies temporal convolutions to capture short local motion patterns, then a BiLSTM models how those patterns evolve over time. This gives both local detail and sequence context.
|
| 83 |
+
|
| 84 |
+
**ST-GCN-inspired (feature-graph variant):**
|
| 85 |
+
This ST-GCN-style model treats features as connected nodes and learns both their relationships and how they change over time. It can help when interactions between pose features are important for classification.
|
| 86 |
+
|
| 87 |
+
## Training
|
| 88 |
+
Train each model from the shared sequence splits in `data/`.
|
| 89 |
+
|
| 90 |
+
```bash
|
| 91 |
+
python models/bilstm/train.py --train-file data/train_sequences.csv --val-file data/val_sequences.csv --test-file data/test_internal_sequences.csv --output-dir models/bilstm/results
|
| 92 |
+
python models/lstm/train.py --train-file data/train_sequences.csv --val-file data/val_sequences.csv --test-file data/test_internal_sequences.csv --output-dir models/lstm/results
|
| 93 |
+
python models/gru/train.py --train-file data/train_sequences.csv --val-file data/val_sequences.csv --test-file data/test_internal_sequences.csv --output-dir models/gru/results
|
| 94 |
+
python models/tcn/train.py --train-file data/train_sequences.csv --val-file data/val_sequences.csv --test-file data/test_internal_sequences.csv --output-dir models/tcn/results
|
| 95 |
+
python models/cnn_bilstm/train.py --train-file data/train_sequences.csv --val-file data/val_sequences.csv --test-file data/test_internal_sequences.csv --output-dir models/cnn_bilstm/results
|
| 96 |
+
python models/st_gcn/train.py --train-file data/train_sequences.csv --val-file data/val_sequences.csv --test-file data/test_internal_sequences.csv --output-dir models/st_gcn/results
|
| 97 |
+
```
|
| 98 |
+
|
| 99 |
+
If centroid assets are missing or if models were retrained, rebuild similarity assets:
|
| 100 |
+
|
| 101 |
+
```bash
|
| 102 |
+
python scripts/preprocess/build_similarity_assets.py --train-file data/train_sequences.csv --models-root models
|
| 103 |
+
```
|
| 104 |
+
|
| 105 |
+
## Inference (Local)
|
| 106 |
+
Run offline evaluation on home/generalization test data:
|
| 107 |
+
|
| 108 |
+
```bash
|
| 109 |
+
python scripts/evaluate/evaluate_home_set.py --test-file data/test_home_sequences.csv --models-root models --output-dir results/eval_offline_home
|
| 110 |
+
```
|
| 111 |
+
|
| 112 |
+
Run inference benchmarking:
|
| 113 |
+
|
| 114 |
+
```bash
|
| 115 |
+
python scripts/benchmark/benchmark_inference.py --input-file data/test_home_sequences.csv --models-root models --output-dir results/benchmark_inference
|
| 116 |
+
```
|
| 117 |
+
|
| 118 |
+
Run realtime webcam evaluation (CLI):
|
| 119 |
+
|
| 120 |
+
```bash
|
| 121 |
+
python scripts/realtime_eval/evaluate_realtime_webcam.py --model-name bilstm --models-root models --output-dir results/eval_realtime
|
| 122 |
+
```
|
| 123 |
+
|
| 124 |
+
## Streamlit App
|
| 125 |
+
Launch the local Streamlit interface:
|
| 126 |
+
|
| 127 |
+
```bash
|
| 128 |
+
streamlit run scripts/app/motionbench.py
|
| 129 |
+
```
|
| 130 |
+
|
| 131 |
+
In the app you can select a model, test camera capture, start a live session, and monitor predicted class, repetition count, and similarity score.
|
| 132 |
+
|
| 133 |
+
## Citations
|
| 134 |
+
|
| 135 |
+
**Bidirectional Long Short-Term Memory (BiLSTM)**
|
| 136 |
+
```bibtex
|
| 137 |
+
@article{riccio2024real,
|
| 138 |
+
title={Real-time fitness exercise classification and counting from video frames},
|
| 139 |
+
author={Riccio, Riccardo},
|
| 140 |
+
journal={arXiv preprint arXiv:2411.11548},
|
| 141 |
+
year={2024}
|
| 142 |
+
}
|
| 143 |
+
```
|
| 144 |
+
|
| 145 |
+
**Gated Recurrent Unit (GRU)**
|
| 146 |
+
```bibtex
|
| 147 |
+
@article{chung2014empirical,
|
| 148 |
+
title={Empirical evaluation of gated recurrent neural networks on sequence modeling},
|
| 149 |
+
author={Chung, Junyoung and Gulcehre, Caglar and Cho, KyungHyun and Bengio, Yoshua},
|
| 150 |
+
journal={arXiv preprint arXiv:1412.3555},
|
| 151 |
+
year={2014}
|
| 152 |
+
}
|
| 153 |
+
```
|
| 154 |
+
|
| 155 |
+
**Temporal Convolutional Network (TCN)**
|
| 156 |
+
```bibtex
|
| 157 |
+
@inproceedings{lea2017temporal,
|
| 158 |
+
title={Temporal convolutional networks for action segmentation and detection},
|
| 159 |
+
author={Lea, Colin and Flynn, Michael D and Vidal, Rene and Reiter, Austin and Hager, Gregory D},
|
| 160 |
+
booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
|
| 161 |
+
pages={156--165},
|
| 162 |
+
year={2017}
|
| 163 |
+
}
|
| 164 |
+
```
|
| 165 |
+
|
| 166 |
+
**Spatial Temporal Graph Convolutional Network**
|
| 167 |
+
```bibtex
|
| 168 |
+
@inproceedings{yan2018spatial,
|
| 169 |
+
title={Spatial temporal graph convolutional networks for skeleton-based action recognition},
|
| 170 |
+
author={Yan, Sijie and Xiong, Yuanjun and Lin, Dahua},
|
| 171 |
+
booktitle={Proceedings of the AAAI conference on artificial intelligence},
|
| 172 |
+
volume={32},
|
| 173 |
+
number={1},
|
| 174 |
+
year={2018}
|
| 175 |
+
}
|
| 176 |
+
```
|
| 177 |
+
|
| 178 |
+
**CNN BiLSTM Hybrid**
|
| 179 |
+
```bibtex
|
| 180 |
+
@online{dhomane2024cnnbilstm,
|
| 181 |
+
author = {Shreyas Dhomane},
|
| 182 |
+
title = {CNN + BiLSTM Architecture: A Practical Guide},
|
| 183 |
+
year = {2024},
|
| 184 |
+
month = oct,
|
| 185 |
+
day = {23},
|
| 186 |
+
url = {https://medium.com/@shreyas.dhomane22/cnn-bilstm-architecture-a-practical-guide-c81829022820},
|
| 187 |
+
note = {Medium article. Accessed: 2026-04-22}
|
| 188 |
+
}
|
| 189 |
+
```
|
| 190 |
+
|
| 191 |
+
## License
|
| 192 |
+
This project is released under the MIT License.
|
assets/motionbench.png
ADDED
|
requirements.txt
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
numpy==2.2.6
|
| 2 |
+
pandas==2.3.3
|
| 3 |
+
scikit-learn==1.7.2
|
| 4 |
+
scipy==1.15.3
|
| 5 |
+
joblib==1.5.3
|
| 6 |
+
matplotlib==3.10.9
|
| 7 |
+
tqdm==4.67.3
|
| 8 |
+
|
| 9 |
+
opencv-python==4.13.0.92
|
| 10 |
+
mediapipe==0.10.14
|
| 11 |
+
|
| 12 |
+
streamlit==1.57.0
|
| 13 |
+
|
| 14 |
+
torch==2.11.0
|
| 15 |
+
torchvision==0.26.0
|
scripts/app/motionbench.py
ADDED
|
@@ -0,0 +1,244 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from collections import Counter, deque
|
| 2 |
+
import sys
|
| 3 |
+
import time
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
from types import SimpleNamespace
|
| 6 |
+
|
| 7 |
+
import cv2
|
| 8 |
+
import joblib
|
| 9 |
+
import mediapipe as mp
|
| 10 |
+
import numpy as np
|
| 11 |
+
import streamlit as st
|
| 12 |
+
import torch
|
| 13 |
+
|
| 14 |
+
PROJECT_ROOT = Path(__file__).resolve().parents[2]
|
| 15 |
+
if str(PROJECT_ROOT) not in sys.path:
|
| 16 |
+
sys.path.insert(0, str(PROJECT_ROOT))
|
| 17 |
+
|
| 18 |
+
from scripts.evaluate.rep_counting_methods import EXERCISE_CONFIGS, FixedThresholdFSMCounter, SmoothingBuffer, extract_primary_angle, normalize_exercise_name
|
| 19 |
+
from scripts.realtime_eval.evaluate_realtime_webcam import (
|
| 20 |
+
MODEL_SPECS,
|
| 21 |
+
build_landmark_indices,
|
| 22 |
+
build_model_and_tools,
|
| 23 |
+
extract_frame_features,
|
| 24 |
+
get_angle_triplets,
|
| 25 |
+
load_pose_module,
|
| 26 |
+
)
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
SEQUENCE_LENGTH = 30
|
| 30 |
+
FEATURE_COUNT = 78
|
| 31 |
+
LABEL_SMOOTHING_WINDOW = 5
|
| 32 |
+
DEFAULT_MODELS_ROOT = "models"
|
| 33 |
+
DEFAULT_PREDICTION_INTERVAL = 1.0
|
| 34 |
+
CAMERA_INDEX_CANDIDATES = [0, 1, 2]
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def load_runtime(model_name: str, models_root: str, feature_count: int):
    """Build the classifier and its preprocessing tools for one model.

    The device is CUDA when ``torch.cuda.is_available()`` reports it and CPU
    otherwise. Model construction is delegated to ``build_model_and_tools``,
    which receives the three settings bundled in a ``SimpleNamespace``.

    Args:
        model_name: Name of the model to load.
        models_root: Root directory that holds the model folders.
        feature_count: Number of features per frame.

    Returns:
        The tuple ``(device, model, scaler, label_encoder)``.
    """
    device_name = "cuda" if torch.cuda.is_available() else "cpu"
    device = torch.device(device_name)
    runtime_args = SimpleNamespace(
        model_name=model_name,
        models_root=models_root,
        feature_count=feature_count,
    )
    model, scaler, label_encoder = build_model_and_tools(runtime_args, device)
    return device, model, scaler, label_encoder
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def load_similarity_asset(model_name: str, models_root: str):
    """Load the similarity-centroid asset stored next to a model's weights.

    The asset is looked up at
    ``<models_root>/<model_name>/weights/similarity_centroids.pkl``.

    Returns:
        The object produced by ``joblib.load``, or ``None`` when the file
        does not exist.
    """
    weights_dir = Path(models_root) / model_name / "weights"
    asset_path = weights_dir / "similarity_centroids.pkl"
    if asset_path.exists():
        # NOTE: joblib.load unpickles the file, so it must only be pointed at
        # model directories that come from a trusted source.
        return joblib.load(asset_path)
    return None
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
def cosine_similarity_percent(vector_a: np.ndarray, vector_b: np.ndarray) -> float:
    """Return the cosine similarity of two vectors mapped onto 0-100.

    The cosine is clamped to ``[-1, 1]`` and rescaled linearly, so -1 maps to
    0.0, 0 maps to 50.0 and +1 maps to 100.0. When the product of the two
    norms is at most ``1e-8`` the function returns 0.0 instead of dividing.
    """
    norm_product = float(np.linalg.norm(vector_a) * np.linalg.norm(vector_b))
    if norm_product <= 1e-8:
        return 0.0

    cosine = float(np.dot(vector_a, vector_b) / norm_product)
    # Rounding can push the ratio slightly outside the valid cosine range.
    bounded = max(-1.0, min(1.0, cosine))
    unit_interval = (bounded + 1.0) / 2.0
    return unit_interval * 100.0
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def smooth_label(label_history: deque[str]) -> str:
    """Return the most frequent label in ``label_history``.

    An empty history yields the string ``"none"``. Ties are resolved by
    ``Counter.most_common``.
    """
    if len(label_history) == 0:
        return "none"
    [(winning_label, _occurrences)] = Counter(label_history).most_common(1)
    return winning_label
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
def read_valid_frame(
    capture: cv2.VideoCapture,
    max_reads: int = 20,
    min_mean_brightness: float = 5.0,
) -> np.ndarray | None:
    """Read frames until one is bright enough to count as a real image.

    Failed reads and frames whose mean pixel value is at or below the
    threshold are skipped.

    Args:
        capture: Object exposing ``read() -> (ok, frame)``.
        max_reads: Maximum number of ``read()`` attempts.
        min_mean_brightness: A frame is accepted only when its mean pixel
            value is strictly greater than this threshold.

    Returns:
        The first accepted frame, or ``None`` if no attempt produced one.
    """
    # The previous version kept the last rejected frame and re-tested it after
    # the loop. That frame had already failed the same brightness test, so the
    # fallback could never succeed and has been removed.
    for _ in range(max_reads):
        ok, candidate = capture.read()
        if ok and float(np.mean(candidate)) > min_mean_brightness:
            return candidate
    return None
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
def open_camera_with_fallback() -> cv2.VideoCapture | None:
    """Open the first camera index that delivers a usable frame.

    Every index in ``CAMERA_INDEX_CANDIDATES`` is tried in order. A device is
    accepted only if it opens and ``read_valid_frame`` returns a frame within
    10 reads; rejected devices are released before the next index is tried.

    Returns:
        The opened capture, or ``None`` when no candidate qualifies.
    """
    for camera_index in CAMERA_INDEX_CANDIDATES:
        capture = cv2.VideoCapture(camera_index)
        usable = capture.isOpened() and read_valid_frame(capture, max_reads=10) is not None
        if usable:
            return capture
        capture.release()
    return None
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
def main():
    """Run the MotionBench Streamlit page.

    The page shows a model selector and a start button. Once a session is
    active it loads the selected model, opens a camera, and loops over frames.
    For each frame it runs pose estimation, keeps a rolling window of
    ``SEQUENCE_LENGTH`` feature vectors, classifies the window at most once
    per ``DEFAULT_PREDICTION_INTERVAL`` seconds, and updates the repetition
    counter and the live metrics panel.
    """
    st.set_page_config(page_title="MotionBench", layout="wide")
    st.title("MotionBench Live")

    if "session_active" not in st.session_state:
        st.session_state.session_active = False

    model_name = st.selectbox("Select Model", options=list(MODEL_SPECS.keys()), index=0)
    start_clicked = st.button("Start Session", width="stretch")

    if start_clicked:
        st.session_state.session_active = True

    if not st.session_state.session_active:
        st.info("Select a model, then start session.")
        return

    device, model, scaler, label_encoder = load_runtime(model_name, DEFAULT_MODELS_ROOT, feature_count=FEATURE_COUNT)
    similarity_asset = load_similarity_asset(model_name, DEFAULT_MODELS_ROOT)
    pose_module = load_pose_module()
    landmark_indices = build_landmark_indices(pose_module)
    angle_triplets = get_angle_triplets()

    capture = open_camera_with_fallback()
    if capture is None:
        st.error("Could not capture a valid camera frame.")
        st.session_state.session_active = False
        return

    left_col, right_col = st.columns([2, 1])
    with right_col:
        stop_clicked = st.button("Stop Session", width="stretch")
    if stop_clicked:
        st.session_state.session_active = False
        capture.release()
        st.info("Session stopped.")
        return

    with left_col:
        frame_slot = st.empty()
    with right_col:
        metrics_slot = st.empty()

    counter = None
    smoother = None
    active_exercise = None
    current_label = "none"
    current_similarity = None
    last_prediction_time = 0.0
    label_history = deque(maxlen=LABEL_SMOOTHING_WINDOW)
    # A bounded deque drops the oldest frame automatically once the window is full.
    window = deque(maxlen=SEQUENCE_LENGTH)
    prediction_interval = DEFAULT_PREDICTION_INTERVAL

    # try/finally guarantees the camera is released even when the loop is left
    # through an exception (for example if the script run is interrupted)
    # rather than through one of the `break` statements.
    try:
        with pose_module.Pose(
            static_image_mode=False,
            model_complexity=1,
            min_detection_confidence=0.5,
            min_tracking_confidence=0.5,
        ) as pose_estimator:
            drawing_utils = mp.solutions.drawing_utils

            while True:
                ok, frame_bgr = capture.read()
                if not ok:
                    break
                frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
                pose = pose_estimator.process(frame_rgb)

                if drawing_utils is not None and pose.pose_landmarks:
                    drawing_utils.draw_landmarks(
                        frame_bgr,
                        pose.pose_landmarks,
                        pose_module.POSE_CONNECTIONS,
                    )

                frame_features = extract_frame_features(pose, landmark_indices, angle_triplets)

                if frame_features is not None:
                    window.append(frame_features)

                if len(window) == SEQUENCE_LENGTH and (time.time() - last_prediction_time) >= prediction_interval:
                    # The scaler is applied to the flattened window; the result
                    # is reshaped to (1, SEQUENCE_LENGTH, FEATURE_COUNT) for the model.
                    sequence_flat = np.array(list(window), dtype=np.float32).reshape(1, -1)
                    scaled_flat = scaler.transform(sequence_flat)
                    scaled = scaled_flat.reshape(1, SEQUENCE_LENGTH, FEATURE_COUNT)
                    input_tensor = torch.tensor(scaled, dtype=torch.float32, device=device)
                    with torch.inference_mode():
                        logits = model(input_tensor)
                    prediction_index = int(torch.argmax(logits, dim=1).item())
                    predicted_label = label_encoder.classes_[prediction_index]
                    label_history.append(predicted_label)
                    current_label = smooth_label(label_history)
                    if similarity_asset is not None:
                        scaled_vector = scaled_flat[0]
                        centroids = similarity_asset.get("centroids", {})
                        centroid_vector = centroids.get(current_label)
                        if centroid_vector is not None:
                            current_similarity = cosine_similarity_percent(
                                scaled_vector.astype(np.float32),
                                np.asarray(centroid_vector, dtype=np.float32),
                            )
                        else:
                            current_similarity = None
                    last_prediction_time = time.time()

                normalized_label = normalize_exercise_name(current_label)
                current_reps = 0
                if pose.pose_landmarks and normalized_label in EXERCISE_CONFIGS:
                    if normalized_label != active_exercise:
                        # A new exercise starts with a fresh counter and smoother.
                        config = EXERCISE_CONFIGS[normalized_label]
                        counter = FixedThresholdFSMCounter(config.fixed_low, config.fixed_high, config.min_state_frames)
                        smoother = SmoothingBuffer(config.smoothing_window)
                        active_exercise = normalized_label

                    landmarks = {}
                    for name, index in landmark_indices.items():
                        lm = pose.pose_landmarks.landmark[index]
                        # Landmarks with visibility below 0.5 are replaced by the origin.
                        landmarks[name] = np.array([lm.x, lm.y, lm.z], dtype=np.float32) if lm.visibility >= 0.5 else np.array([0.0, 0.0, 0.0], dtype=np.float32)

                    config = EXERCISE_CONFIGS[normalized_label]
                    raw_angle = extract_primary_angle(landmarks, config)
                    smoothed_angle = smoother.update(raw_angle)
                    counter.update(smoothed_angle)
                    current_reps = counter.reps
                else:
                    # NOTE(review): the counter is discarded whenever landmarks
                    # are missing or the label has no config, so the displayed
                    # rep count restarts from zero afterwards - confirm intended.
                    active_exercise = None
                    counter = None
                    smoother = None

                show_frame = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
                frame_slot.image(show_frame, channels="RGB", width="stretch")
                similarity_text = f"{current_similarity:0.1f}%" if current_similarity is not None else "N/A"
                metrics_slot.markdown(
                    f"### Live Metrics\n"
                    f"Model: `{model_name}`\n\n"
                    f"Exercise: `{current_label}`\n\n"
                    f"Reps: `{current_reps}`\n\n"
                    f"Similarity: `{similarity_text}`"
                )

                if not st.session_state.session_active:
                    break
    finally:
        capture.release()

    st.session_state.session_active = False
    st.success("Session finished.")


if __name__ == "__main__":
    main()
|
scripts/benchmark/benchmark_inference.py
ADDED
|
@@ -0,0 +1,377 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
import time
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
|
| 5 |
+
import joblib
|
| 6 |
+
import numpy as np
|
| 7 |
+
import pandas as pd
|
| 8 |
+
import torch
|
| 9 |
+
from torch import nn
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def parse_args():
    """Parse the command-line options of the inference benchmark.

    Returns:
        An ``argparse.Namespace`` with ``input_file``, ``models_root``,
        ``output_dir``, ``sequence_length``, ``feature_count``,
        ``warmup_runs``, ``timed_runs`` and ``seed``.
    """
    # (flag, add_argument keyword arguments), registered in this order.
    option_table = (
        ("--input-file", {"default": "data/test_home_sequences.csv"}),
        ("--models-root", {"default": "models"}),
        ("--output-dir", {"default": "results/benchmark_inference"}),
        ("--sequence-length", {"type": int, "default": 30}),
        ("--feature-count", {"type": int, "default": 78}),
        ("--warmup-runs", {"type": int, "default": 50}),
        ("--timed-runs", {"type": int, "default": 500}),
        ("--seed", {"type": int, "default": 42}),
    )
    parser = argparse.ArgumentParser()
    for flag, settings in option_table:
        parser.add_argument(flag, **settings)
    return parser.parse_args()
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
class BidirectionalLstmClassifier(nn.Module):
    """Two-layer bidirectional LSTM followed by dropout and a linear head.

    The head consumes the recurrent output at the last timestep, which has
    ``2 * hidden_size`` channels because both directions are concatenated.
    """

    def __init__(self, feature_count, hidden_size, class_count, dropout_probability):
        super().__init__()
        # Attribute names define the state_dict keys; presumably saved
        # checkpoints depend on them, so they are kept as-is.
        self.bilstm = nn.LSTM(
            input_size=feature_count,
            hidden_size=hidden_size,
            num_layers=2,
            batch_first=True,
            dropout=dropout_probability,
            bidirectional=True,
        )
        self.dropout = nn.Dropout(dropout_probability)
        self.classifier = nn.Linear(hidden_size * 2, class_count)

    def forward(self, input_sequence):
        """Return class logits for a batch-first input sequence."""
        per_step_states = self.bilstm(input_sequence)[0]
        last_step = per_step_states[:, -1, :]
        return self.classifier(self.dropout(last_step))
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
class LstmClassifier(nn.Module):
    """Two-layer unidirectional LSTM followed by dropout and a linear head.

    The recurrent layer is batch-first. Logits are computed from the recurrent
    output at the last time step.
    """

    def __init__(self, feature_count, hidden_size, class_count, dropout_probability):
        super().__init__()
        recurrent_options = dict(
            input_size=feature_count,
            hidden_size=hidden_size,
            num_layers=2,
            batch_first=True,
            dropout=dropout_probability,
            bidirectional=False,
        )
        self.lstm = nn.LSTM(**recurrent_options)
        self.dropout = nn.Dropout(dropout_probability)
        self.classifier = nn.Linear(hidden_size, class_count)

    def forward(self, input_sequence):
        """Return class logits for a batch of input sequences."""
        sequence_features = self.lstm(input_sequence)[0]
        last_step_features = sequence_features[:, -1, :]
        return self.classifier(self.dropout(last_step_features))
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
class GruClassifier(nn.Module):
    """Two-layer unidirectional GRU followed by dropout and a linear head.

    The recurrent layer is batch-first. Logits are computed from the recurrent
    output at the last time step.
    """

    def __init__(self, feature_count, hidden_size, class_count, dropout_probability):
        super().__init__()
        recurrent_options = dict(
            input_size=feature_count,
            hidden_size=hidden_size,
            num_layers=2,
            batch_first=True,
            dropout=dropout_probability,
            bidirectional=False,
        )
        self.gru = nn.GRU(**recurrent_options)
        self.dropout = nn.Dropout(dropout_probability)
        self.classifier = nn.Linear(hidden_size, class_count)

    def forward(self, input_sequence):
        """Return class logits for a batch of input sequences."""
        sequence_features = self.gru(input_sequence)[0]
        last_step_features = sequence_features[:, -1, :]
        return self.classifier(self.dropout(last_step_features))
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
class Chomp1d(nn.Module):
    """Remove the last ``chomp_size`` positions along the third tensor axis."""

    def __init__(self, chomp_size):
        super().__init__()
        self.chomp_size = chomp_size

    def forward(self, input_tensor):
        """Return ``input_tensor`` without its trailing ``chomp_size`` steps."""
        trim = self.chomp_size
        if trim != 0:
            # Slicing yields a view; make it contiguous for later layers.
            return input_tensor[:, :, :-trim].contiguous()
        # A zero trim would make the slice empty, so pass the input through.
        return input_tensor
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
class TemporalBlock(nn.Module):
    """Residual block of two dilated 1-D convolutions.

    Each convolution pads by ``(kernel_size - 1) * dilation`` and the matching
    ``Chomp1d`` trims the same amount from the end, so the sequence length is
    unchanged. A 1x1 convolution adapts the residual path when the channel
    counts differ.
    """

    def __init__(self, input_channels, output_channels, kernel_size, dilation, dropout):
        super().__init__()
        padding = dilation * (kernel_size - 1)
        conv_options = dict(padding=padding, dilation=dilation)

        # First convolution stage.
        self.conv1 = nn.Conv1d(input_channels, output_channels, kernel_size, **conv_options)
        self.chomp1 = Chomp1d(padding)
        self.relu1 = nn.ReLU()
        self.dropout1 = nn.Dropout(dropout)

        # Second convolution stage.
        self.conv2 = nn.Conv1d(output_channels, output_channels, kernel_size, **conv_options)
        self.chomp2 = Chomp1d(padding)
        self.relu2 = nn.ReLU()
        self.dropout2 = nn.Dropout(dropout)

        # Residual projection is only needed when the channel count changes.
        if input_channels == output_channels:
            self.downsample = None
        else:
            self.downsample = nn.Conv1d(input_channels, output_channels, kernel_size=1)
        self.final_relu = nn.ReLU()

    def forward(self, input_tensor):
        """Apply both convolution stages and add the residual connection."""
        branch = self.dropout1(self.relu1(self.chomp1(self.conv1(input_tensor))))
        branch = self.dropout2(self.relu2(self.chomp2(self.conv2(branch))))

        if self.downsample is None:
            shortcut = input_tensor
        else:
            shortcut = self.downsample(input_tensor)
        return self.final_relu(branch + shortcut)
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
class TcnClassifier(nn.Module):
    """Temporal convolutional classifier with three dilated residual blocks.

    The input is projected to ``channel_width`` channels, passed through
    blocks with dilations 1, 2 and 4, and classified from the last time step.
    """

    def __init__(self, feature_count, class_count, channel_width, kernel_size, dropout):
        super().__init__()

        def make_block(dilation):
            return TemporalBlock(channel_width, channel_width, kernel_size, dilation=dilation, dropout=dropout)

        self.input_projection = nn.Conv1d(feature_count, channel_width, kernel_size=1)
        self.block1 = make_block(1)
        self.block2 = make_block(2)
        self.block3 = make_block(4)
        self.classifier = nn.Linear(channel_width, class_count)

    def forward(self, input_sequence):
        """Return class logits for a batch of input sequences."""
        # Conv1d expects channels on axis 1, so swap the last two axes.
        temporal = self.input_projection(input_sequence.transpose(1, 2))
        for block in (self.block1, self.block2, self.block3):
            temporal = block(temporal)
        return self.classifier(temporal[:, :, -1])
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
class CnnBiLstmClassifier(nn.Module):
    """1-D convolution front end feeding a two-layer bidirectional LSTM.

    The convolution pads by ``cnn_kernel_size // 2``. Logits are computed from
    the recurrent output at the last time step.
    """

    def __init__(self, feature_count, class_count, cnn_filters, cnn_kernel_size, lstm_units, dropout_probability):
        super().__init__()
        self.conv1d = nn.Conv1d(
            in_channels=feature_count,
            out_channels=cnn_filters,
            kernel_size=cnn_kernel_size,
            padding=cnn_kernel_size // 2,
        )
        self.relu = nn.ReLU()
        self.dropout1 = nn.Dropout(dropout_probability)
        self.bilstm = nn.LSTM(
            input_size=cnn_filters,
            hidden_size=lstm_units,
            num_layers=2,
            batch_first=True,
            dropout=dropout_probability,
            bidirectional=True,
        )
        self.dropout2 = nn.Dropout(dropout_probability)
        # Both LSTM directions are concatenated, hence twice the width.
        self.classifier = nn.Linear(2 * lstm_units, class_count)

    def forward(self, input_sequence):
        """Return class logits for a batch of input sequences."""
        # Conv1d wants channels on axis 1; the LSTM wants them back on axis 2.
        channel_first = input_sequence.transpose(1, 2)
        convolved = self.dropout1(self.relu(self.conv1d(channel_first)))
        recurrent_output, _ = self.bilstm(convolved.transpose(1, 2))
        return self.classifier(self.dropout2(recurrent_output[:, -1, :]))
|
| 146 |
+
|
| 147 |
+
|
| 148 |
+
class GraphConvolution(nn.Module):
    """1x1 channel projection followed by mixing along the last axis.

    The mixing is ``einsum("nctv,vw->nctw")``: the final axis of the projected
    tensor is contracted with the rows of ``adjacency_matrix``.
    """

    def __init__(self, input_channels, output_channels):
        super().__init__()
        self.projection = nn.Conv2d(input_channels, output_channels, kernel_size=1)

    def forward(self, input_tensor, adjacency_matrix):
        """Project the channels, then aggregate with the adjacency matrix."""
        return torch.einsum("nctv,vw->nctw", self.projection(input_tensor), adjacency_matrix)
|
| 156 |
+
|
| 157 |
+
|
| 158 |
+
class StGcnBlock(nn.Module):
    """Graph convolution plus a 9x1 temporal convolution with a residual path.

    The residual is an identity when stride is 1 and the channel count is
    unchanged; otherwise it is a strided 1x1 convolution with batch norm.
    """

    def __init__(self, input_channels, output_channels, dropout, stride=1):
        super().__init__()
        self.graph_convolution = GraphConvolution(input_channels, output_channels)

        temporal_layers = [
            nn.BatchNorm2d(output_channels),
            nn.ReLU(inplace=True),
            nn.Conv2d(output_channels, output_channels, kernel_size=(9, 1), stride=(stride, 1), padding=(4, 0)),
            nn.BatchNorm2d(output_channels),
            nn.Dropout(dropout),
        ]
        self.temporal_convolution = nn.Sequential(*temporal_layers)

        shape_preserved = stride == 1 and input_channels == output_channels
        if shape_preserved:
            self.residual = nn.Identity()
        else:
            # Match the main branch's channel count and temporal stride.
            self.residual = nn.Sequential(
                nn.Conv2d(input_channels, output_channels, kernel_size=1, stride=(stride, 1)),
                nn.BatchNorm2d(output_channels),
            )
        self.activation = nn.ReLU(inplace=True)

    def forward(self, input_tensor, adjacency_matrix):
        """Return the activated sum of the main branch and the residual."""
        shortcut = self.residual(input_tensor)
        main_branch = self.temporal_convolution(self.graph_convolution(input_tensor, adjacency_matrix))
        return self.activation(main_branch + shortcut)
|
| 180 |
+
|
| 181 |
+
|
| 182 |
+
class StGcnClassifier(nn.Module):
    """Three-block ST-GCN classifier with a learned adjacency matrix.

    Every input feature is one graph node. The adjacency is a learnable
    ``feature_count x feature_count`` parameter initialised to the identity
    and row-normalised with a softmax before use.
    """

    def __init__(self, feature_count, class_count, dropout):
        super().__init__()
        self.input_batch_norm = nn.BatchNorm1d(feature_count)
        # Assigning an nn.Parameter registers it under the attribute name.
        self.adjacency_logits = nn.Parameter(torch.eye(feature_count))
        self.block1 = StGcnBlock(1, 64, dropout=dropout, stride=1)
        self.block2 = StGcnBlock(64, 64, dropout=dropout, stride=1)
        self.block3 = StGcnBlock(64, 128, dropout=dropout, stride=1)
        self.classifier = nn.Linear(128, class_count)

    def get_normalized_adjacency(self):
        """Return the adjacency logits with a softmax applied over dim 1."""
        return torch.softmax(self.adjacency_logits, dim=1)

    def forward(self, input_sequence):
        """Return class logits for a ``(batch, sequence, feature)`` tensor."""
        batch_size, sequence_length, feature_count = input_sequence.shape

        # BatchNorm1d normalises per feature, so fold time into the batch axis.
        flattened = input_sequence.reshape(batch_size * sequence_length, feature_count)
        normalized = self.input_batch_norm(flattened).reshape(batch_size, sequence_length, feature_count)

        # Add a singleton channel axis: (batch, 1, sequence, feature).
        graph_tensor = normalized.unsqueeze(1)
        adjacency = self.get_normalized_adjacency()
        for block in (self.block1, self.block2, self.block3):
            graph_tensor = block(graph_tensor, adjacency)

        # Average over time, then over nodes, leaving (batch, channels).
        pooled = graph_tensor.mean(dim=2).mean(dim=2)
        return self.classifier(pooled)
|
| 207 |
+
|
| 208 |
+
|
| 209 |
+
# Registry of benchmarked models. Each entry names the artifact files stored
# under <models-root>/<model name>/weights and a builder that recreates the
# network from (feature_count, class_count) with fixed hyperparameters.
# NOTE(review): the hyperparameters presumably mirror those used at training
# time, since the saved state dicts must match — confirm against the train scripts.
MODEL_SPECS = {}

MODEL_SPECS["bilstm"] = dict(
    weight="bidirectionallstm_model.pt",
    scaler="bidirectionallstm_scaler.pkl",
    encoder="bidirectionallstm_label_encoder.pkl",
    builder=lambda feature_count, class_count: BidirectionalLstmClassifier(feature_count, 73, class_count, 0.2174),
)

MODEL_SPECS["lstm"] = dict(
    weight="lstm_model.pt",
    scaler="lstm_scaler.pkl",
    encoder="lstm_label_encoder.pkl",
    builder=lambda feature_count, class_count: LstmClassifier(feature_count, 117, class_count, 0.3829),
)

MODEL_SPECS["gru"] = dict(
    weight="gru_model.pt",
    scaler="gru_scaler.pkl",
    encoder="gru_label_encoder.pkl",
    builder=lambda feature_count, class_count: GruClassifier(feature_count, 96, class_count, 0.2),
)

MODEL_SPECS["tcn"] = dict(
    weight="tcn_model.pt",
    scaler="tcn_scaler.pkl",
    encoder="tcn_label_encoder.pkl",
    builder=lambda feature_count, class_count: TcnClassifier(feature_count, class_count, 128, 3, 0.2),
)

MODEL_SPECS["cnn_bilstm"] = dict(
    weight="cnn_bilstm_model.pt",
    scaler="cnn_bilstm_scaler.pkl",
    encoder="cnn_bilstm_label_encoder.pkl",
    builder=lambda feature_count, class_count: CnnBiLstmClassifier(feature_count, class_count, 128, 3, 73, 0.2),
)

MODEL_SPECS["st_gcn"] = dict(
    weight="st_gcn_model.pt",
    scaler="st_gcn_scaler.pkl",
    encoder="st_gcn_label_encoder.pkl",
    builder=lambda feature_count, class_count: StGcnClassifier(feature_count, class_count, 0.2),
)
|
| 247 |
+
|
| 248 |
+
|
| 249 |
+
def set_seed(seed):
    """Seed NumPy's global RNG and PyTorch's CPU and CUDA generators."""
    seeders = (np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all)
    for apply_seed in seeders:
        apply_seed(seed)
|
| 253 |
+
|
| 254 |
+
|
| 255 |
+
def load_features(input_file_path):
    """Load the feature columns of a sequence CSV as a float32 matrix.

    Args:
        input_file_path: Path of the CSV file to read.

    Returns:
        2-D ``numpy.float32`` array with one row per CSV row and one column
        per non-metadata column, in file order.
    """
    non_feature_columns = ["video_id", "exercise_label", "start_frame_index", "end_frame_index"]
    frame = pd.read_csv(input_file_path)
    # errors="ignore" so files lacking some metadata columns still load.
    feature_frame = frame.drop(columns=non_feature_columns, errors="ignore")
    return feature_frame.to_numpy(dtype=np.float32)
|
| 260 |
+
|
| 261 |
+
|
| 262 |
+
def run_device_benchmark(model, input_tensor, device, warmup_runs, timed_runs):
    """Measure forward-pass latency of ``model`` on ``device``.

    Args:
        model: Module to benchmark; it is moved to ``device`` and put in
            eval mode.
        input_tensor: Input passed unchanged to every forward call.
        device: ``torch.device`` to run on.
        warmup_runs: Number of untimed forward passes run first.
        timed_runs: Number of individually timed forward passes.

    Returns:
        Dict with ``mean_latency_ms``, ``p95_latency_ms`` and
        ``peak_memory_mb`` (peak allocated CUDA memory in MiB, or ``None``
        when the device is not CUDA).
    """
    on_cuda = device.type == "cuda"

    model = model.to(device)
    input_tensor = input_tensor.to(device)
    model.eval()

    if on_cuda:
        # Start from a clean slate so the peak reflects only this benchmark.
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats(device)

    latencies_ms = []
    with torch.inference_mode():
        for _ in range(warmup_runs):
            model(input_tensor)
        if on_cuda:
            torch.cuda.synchronize(device)

        for _ in range(timed_runs):
            started_at = time.perf_counter()
            model(input_tensor)
            if on_cuda:
                # CUDA kernels run asynchronously; wait before stopping the clock.
                torch.cuda.synchronize(device)
            latencies_ms.append((time.perf_counter() - started_at) * 1000.0)

    if on_cuda:
        peak_memory_mb = float(torch.cuda.max_memory_allocated(device) / (1024.0 * 1024.0))
    else:
        peak_memory_mb = None

    return {
        "mean_latency_ms": float(np.mean(latencies_ms)),
        "p95_latency_ms": float(np.percentile(latencies_ms, 95)),
        "peak_memory_mb": peak_memory_mb,
    }
|
| 297 |
+
|
| 298 |
+
|
| 299 |
+
def main():
    """Benchmark every model in MODEL_SPECS and write per-device CSV reports.

    Each model is timed on CPU, and on CUDA when available, using the first
    row of the input file as a single sequence window. Results are printed and
    saved as ``inference_benchmark_<device>.csv`` in the output directory.
    """
    args = parse_args()
    set_seed(args.seed)

    models_root_path = Path(args.models_root)
    output_directory_path = Path(args.output_dir)
    output_directory_path.mkdir(parents=True, exist_ok=True)

    # Only the first row is benchmarked; the slice keeps it 2-D for the scaler.
    single_window_features = load_features(Path(args.input_file))[0:1]

    has_cuda = torch.cuda.is_available()
    benchmark_devices = [("cpu", torch.device("cpu"))]
    if has_cuda:
        benchmark_devices.append(("cuda", torch.device("cuda")))

    benchmark_rows = []
    for model_name, spec in MODEL_SPECS.items():
        weights_root = models_root_path / model_name / "weights"
        weight_file_path = weights_root / spec["weight"]

        scaler = joblib.load(weights_root / spec["scaler"])
        label_encoder = joblib.load(weights_root / spec["encoder"])
        class_count = len(label_encoder.classes_)

        # The flat feature row is scaled, then unfolded into one sequence window.
        scaled_window = scaler.transform(single_window_features)
        scaled_window = scaled_window.reshape(1, args.sequence_length, args.feature_count)
        input_tensor = torch.tensor(scaled_window, dtype=torch.float32)

        model = spec["builder"](args.feature_count, class_count)
        model.load_state_dict(torch.load(weight_file_path, map_location="cpu"))

        # Model size is the on-disk size of the weight file, in MiB.
        model_size_mb = float(weight_file_path.stat().st_size / (1024.0 * 1024.0))

        for device_name, device in benchmark_devices:
            device_stats = run_device_benchmark(model, input_tensor, device, args.warmup_runs, args.timed_runs)
            benchmark_rows.append(
                {
                    "model": model_name,
                    "device": device_name,
                    "model_size_mb": model_size_mb,
                    **device_stats,
                }
            )

        print(f"Benchmarked: {model_name}")

    benchmark_table = pd.DataFrame(benchmark_rows)

    for device_name, _ in benchmark_devices:
        device_rows = benchmark_table[benchmark_table["device"] == device_name]
        device_table = device_rows.sort_values("mean_latency_ms", ascending=True)
        csv_output_path = output_directory_path / f"inference_benchmark_{device_name}.csv"
        device_table.to_csv(csv_output_path, index=False)

        print(f"\nInference Benchmark ({device_name.upper()})")
        print(device_table.to_string(index=False))
        print(f"\nSaved: {csv_output_path}")
|
| 374 |
+
|
| 375 |
+
|
| 376 |
+
if __name__ == "__main__":
|
| 377 |
+
main()
|
scripts/evaluate/benchmark_rep_counting.py
ADDED
|
@@ -0,0 +1,202 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
import sys
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
|
| 5 |
+
import cv2
|
| 6 |
+
import mediapipe as mp
|
| 7 |
+
import numpy as np
|
| 8 |
+
import pandas as pd
|
| 9 |
+
|
| 10 |
+
PROJECT_ROOT = Path(__file__).resolve().parents[2]
|
| 11 |
+
if str(PROJECT_ROOT) not in sys.path:
|
| 12 |
+
sys.path.insert(0, str(PROJECT_ROOT))
|
| 13 |
+
|
| 14 |
+
from scripts.evaluate.rep_counting_methods import EXERCISE_CONFIGS, FixedThresholdFSMCounter, SmoothingBuffer, extract_primary_angle, normalize_exercise_name
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def parse_args():
    """Parse the command-line options of the rep-counting benchmark.

    Returns:
        argparse.Namespace with ``manifest_file`` (required), ``output_dir``
        and ``min_visibility`` (float).
    """
    option_table = {
        "--manifest-file": {"required": True},
        "--output-dir": {"default": "results/eval_rep_counting"},
        "--min-visibility": {"type": float, "default": 0.5},
    }
    cli = argparse.ArgumentParser()
    for flag, settings in option_table.items():
        cli.add_argument(flag, **settings)
    return cli.parse_args()
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def load_pose_module():
    """Return MediaPipe's ``solutions.pose`` module."""
    pose_solution = mp.solutions.pose
    return pose_solution
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def build_landmark_indices(mp_pose):
    """Map the twelve tracked joint names to their ``PoseLandmark`` values.

    Args:
        mp_pose: Object exposing a ``PoseLandmark`` enum indexable by name.

    Returns:
        Dict from landmark name to ``PoseLandmark[name].value``, ordered
        left/right for shoulder, hip, knee, elbow, wrist and ankle.
    """
    tracked_names = (
        "LEFT_SHOULDER",
        "RIGHT_SHOULDER",
        "LEFT_HIP",
        "RIGHT_HIP",
        "LEFT_KNEE",
        "RIGHT_KNEE",
        "LEFT_ELBOW",
        "RIGHT_ELBOW",
        "LEFT_WRIST",
        "RIGHT_WRIST",
        "LEFT_ANKLE",
        "RIGHT_ANKLE",
    )
    landmark_enum = mp_pose.PoseLandmark
    indices = {}
    for landmark_name in tracked_names:
        indices[landmark_name] = landmark_enum[landmark_name].value
    return indices
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def extract_landmark_points(results, landmark_indices, min_visibility):
    """Collect the requested landmarks as float32 ``[x, y, z]`` arrays.

    Args:
        results: Pose result exposing ``pose_landmarks.landmark``.
        landmark_indices: Dict from landmark name to index in that list.
        min_visibility: Landmarks with lower visibility become zero vectors.

    Returns:
        Dict from landmark name to a 3-element array, or ``None`` when
        ``results.pose_landmarks`` is falsy.
    """
    pose_landmarks = results.pose_landmarks
    if not pose_landmarks:
        return None

    def as_point(landmark):
        # Low-visibility joints are zeroed instead of dropped, so every key exists.
        visible = landmark.visibility >= min_visibility
        coordinates = [landmark.x, landmark.y, landmark.z] if visible else [0.0, 0.0, 0.0]
        return np.array(coordinates, dtype=np.float32)

    landmark_list = pose_landmarks.landmark
    return {name: as_point(landmark_list[index]) for name, index in landmark_indices.items()}
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def evaluate_video(video_path, exercise_label, pose_estimator, landmark_indices, min_visibility):
    """Count repetitions in one video with the fixed-threshold FSM counter.

    Each decoded frame goes through the pose estimator. The primary joint
    angle is extracted and smoothed, and every non-NaN smoothed angle is fed
    to the counter.

    Args:
        video_path: Path of the video file to read.
        exercise_label: Key into ``EXERCISE_CONFIGS`` (already normalised).
        pose_estimator: Object with a ``process(rgb_frame)`` method.
        landmark_indices: Landmark name-to-index mapping passed on to
            ``extract_landmark_points``.
        min_visibility: Visibility threshold passed on to
            ``extract_landmark_points``.

    Returns:
        Dict with ``paper_fsm`` (counted reps), ``processed_frames`` (frames
        decoded) and ``valid_angle_frames`` (frames with a non-NaN smoothed
        angle).

    Raises:
        KeyError: If ``exercise_label`` is not in ``EXERCISE_CONFIGS``.
    """
    config = EXERCISE_CONFIGS[exercise_label]
    fixed_counter = FixedThresholdFSMCounter(config.fixed_low, config.fixed_high, config.min_state_frames)
    smoothing = SmoothingBuffer(window_size=config.smoothing_window)

    processed_frames = 0
    valid_angle_frames = 0

    capture = cv2.VideoCapture(str(video_path))
    try:
        # A video that fails to open skips the loop and yields zero counts.
        while capture.isOpened():
            read_ok, frame_bgr = capture.read()
            if not read_ok:
                break

            # Every decoded frame counts as processed, with or without a pose.
            processed_frames += 1

            frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
            pose_result = pose_estimator.process(frame_rgb)
            points = extract_landmark_points(pose_result, landmark_indices, min_visibility)
            if points is None:
                continue

            raw_angle = extract_primary_angle(points, config)
            smoothed_angle = smoothing.update(raw_angle)
            if not np.isnan(smoothed_angle):
                valid_angle_frames += 1
                fixed_counter.update(smoothed_angle)
    finally:
        # Release the capture even if pose estimation or angle extraction raises.
        capture.release()

    return {
        "paper_fsm": fixed_counter.reps,
        "processed_frames": processed_frames,
        "valid_angle_frames": valid_angle_frames,
    }
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
def compute_error_metrics(predicted_reps, true_reps):
    """Compare a predicted repetition count with the ground truth.

    Returns:
        Tuple ``(absolute_error, relative_error_percent, missed_reps,
        false_reps)``. The relative error is reported as ``0.0`` when
        ``true_reps`` is not positive.
    """
    absolute_error = abs(predicted_reps - true_reps)

    if true_reps > 0:
        relative_error_percent = (absolute_error / true_reps) * 100.0
    else:
        # Avoid dividing by zero when no repetitions were performed.
        relative_error_percent = 0.0

    undercount = max(0, true_reps - predicted_reps)
    overcount = max(0, predicted_reps - true_reps)
    return absolute_error, relative_error_percent, undercount, overcount
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
def validate_manifest_columns(dataframe):
    """Ensure the manifest has ``video_path`` and ``exercise_label`` columns.

    Raises:
        ValueError: Listing the missing column names in sorted order.
    """
    available = set(dataframe.columns)
    missing = sorted(name for name in ("video_path", "exercise_label") if name not in available)
    if not missing:
        return
    missing_text = ", ".join(missing)
    raise ValueError(f"Manifest is missing required columns: {missing_text}")
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
def _summarize_rep_counts(per_video_rows):
    # Build the per-video table and its per-method summary from the collected rows.
    if not per_video_rows:
        # Without rows the frame would have no columns and groupby("method")
        # would raise KeyError; return empty tables with the expected headers.
        base_columns = [
            "video_path",
            "exercise_label",
            "method",
            "predicted_reps",
            "processed_frames",
            "valid_angle_frames",
        ]
        return pd.DataFrame(columns=base_columns), pd.DataFrame(columns=["method", "videos"])

    per_video_df = pd.DataFrame(per_video_rows)
    if "absolute_count_error" in per_video_df.columns:
        summary_df = (
            per_video_df.groupby("method", as_index=False)
            .agg(
                videos=("video_path", "count"),
                mean_absolute_count_error=("absolute_count_error", "mean"),
                mean_relative_error_percent=("relative_error_percent", "mean"),
                total_missed_reps=("missed_reps", "sum"),
                total_false_reps=("false_reps", "sum"),
            )
            .sort_values(by=["mean_absolute_count_error", "total_false_reps"], ascending=[True, True])
        )
    else:
        summary_df = per_video_df.groupby("method", as_index=False).agg(videos=("video_path", "count"))
    return per_video_df, summary_df


def main():
    """Run the FSM rep counter over every manifest video and save the results.

    Reads the manifest CSV, counts repetitions for each listed video, and
    writes ``rep_counting_per_video.csv`` and ``rep_counting_summary.csv`` to
    the output directory. Error metrics are added for rows that have a
    non-missing ``true_reps`` value. An empty manifest produces empty tables
    instead of failing.

    Raises:
        ValueError: If the manifest lacks required columns or names an
            unsupported exercise.
        FileNotFoundError: If a listed video file does not exist.
    """
    args = parse_args()
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    manifest = pd.read_csv(args.manifest_file)
    validate_manifest_columns(manifest)

    mp_pose = load_pose_module()
    landmark_indices = build_landmark_indices(mp_pose)
    per_video_rows = []

    # Ground truth is optional; without it only predictions are reported.
    has_true_reps = "true_reps" in manifest.columns

    with mp_pose.Pose(
        static_image_mode=False,
        model_complexity=1,
        enable_segmentation=False,
        min_detection_confidence=0.5,
        min_tracking_confidence=0.5,
    ) as pose_estimator:
        for row in manifest.itertuples(index=False):
            video_path = Path(row.video_path)
            raw_exercise = str(row.exercise_label)
            exercise_label = normalize_exercise_name(raw_exercise)
            true_reps = int(row.true_reps) if has_true_reps and not pd.isna(row.true_reps) else None

            if exercise_label not in EXERCISE_CONFIGS:
                raise ValueError(f"Unsupported exercise '{raw_exercise}' from manifest row: {video_path}")
            if not video_path.exists():
                raise FileNotFoundError(f"Video file not found: {video_path}")

            counts = evaluate_video(
                video_path=video_path,
                exercise_label=exercise_label,
                pose_estimator=pose_estimator,
                landmark_indices=landmark_indices,
                min_visibility=args.min_visibility,
            )
            pred_reps = counts["paper_fsm"]
            row_data = {
                "video_path": str(video_path),
                "exercise_label": exercise_label,
                "method": "paper_fsm",
                "predicted_reps": pred_reps,
                "processed_frames": counts["processed_frames"],
                "valid_angle_frames": counts["valid_angle_frames"],
            }
            if true_reps is not None:
                abs_err, rel_err, missed, false = compute_error_metrics(pred_reps, true_reps)
                row_data["true_reps"] = true_reps
                row_data["absolute_count_error"] = abs_err
                row_data["relative_error_percent"] = rel_err
                row_data["missed_reps"] = missed
                row_data["false_reps"] = false
            per_video_rows.append(row_data)

            print(
                f"{video_path.name} | {exercise_label} | "
                f"predicted_reps={counts['paper_fsm']}"
            )

    per_video_df, summary_df = _summarize_rep_counts(per_video_rows)

    per_video_file = output_dir / "rep_counting_per_video.csv"
    summary_file = output_dir / "rep_counting_summary.csv"
    per_video_df.to_csv(per_video_file, index=False)
    summary_df.to_csv(summary_file, index=False)

    print("\nRep counting summary")
    print(summary_df.to_string(index=False))
    print(f"\nSaved: {per_video_file}")
    print(f"Saved: {summary_file}")
|
| 199 |
+
|
| 200 |
+
|
| 201 |
+
if __name__ == "__main__":
|
| 202 |
+
main()
|
scripts/evaluate/evaluate_home_set.py
ADDED
|
@@ -0,0 +1,399 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
|
| 4 |
+
import joblib
|
| 5 |
+
import numpy as np
|
| 6 |
+
import pandas as pd
|
| 7 |
+
import torch
|
| 8 |
+
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score, precision_score, recall_score
|
| 9 |
+
from torch import nn
|
| 10 |
+
from torch.utils.data import DataLoader, Dataset
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def parse_args():
    """Parse the command-line options of the offline home-set evaluation.

    Returns:
        argparse.Namespace with ``test_file``, ``models_root``, ``output_dir``
        (strings) and ``sequence_length``, ``feature_count``, ``batch_size``
        (integers).
    """
    cli = argparse.ArgumentParser()

    string_options = (
        ("--test-file", "data/test_home_sequences.csv"),
        ("--models-root", "models"),
        ("--output-dir", "results/eval_offline_home"),
    )
    for flag, default_value in string_options:
        cli.add_argument(flag, default=default_value)

    integer_options = (
        ("--sequence-length", 30),
        ("--feature-count", 78),
        ("--batch-size", 256),
    )
    for flag, default_value in integer_options:
        cli.add_argument(flag, type=int, default=default_value)

    return cli.parse_args()
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
class SequenceDataset(Dataset):
    """Dataset pairing each entry of a feature tensor with its label."""

    def __init__(self, feature_tensor, label_tensor):
        self.feature_tensor, self.label_tensor = feature_tensor, label_tensor

    def __len__(self):
        """Return the number of labels, i.e. the number of samples."""
        sample_count = len(self.label_tensor)
        return sample_count

    def __getitem__(self, index):
        """Return the ``(features, label)`` pair stored at ``index``."""
        features = self.feature_tensor[index]
        label = self.label_tensor[index]
        return features, label
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
class BidirectionalLstmClassifier(nn.Module):
    """Two-layer bidirectional LSTM followed by dropout and a linear head.

    The recurrent layer is batch-first. The classifier consumes the recurrent
    output at the last time step only.
    """

    def __init__(self, feature_count, hidden_size, class_count, dropout_probability):
        super().__init__()
        recurrent_options = dict(
            input_size=feature_count,
            hidden_size=hidden_size,
            num_layers=2,
            batch_first=True,
            dropout=dropout_probability,
            bidirectional=True,
        )
        self.bilstm = nn.LSTM(**recurrent_options)
        self.dropout = nn.Dropout(dropout_probability)
        # Both directions are concatenated in the output, hence twice the width.
        self.classifier = nn.Linear(2 * hidden_size, class_count)

    def forward(self, input_sequence):
        """Return class logits for a batch of input sequences."""
        sequence_features = self.bilstm(input_sequence)[0]
        last_step_features = sequence_features[:, -1, :]
        return self.classifier(self.dropout(last_step_features))
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
class LstmClassifier(nn.Module):
    """Two-layer unidirectional LSTM followed by dropout and a linear head.

    The recurrent layer is batch-first. Logits are computed from the recurrent
    output at the last time step.
    """

    def __init__(self, feature_count, hidden_size, class_count, dropout_probability):
        super().__init__()
        recurrent_options = dict(
            input_size=feature_count,
            hidden_size=hidden_size,
            num_layers=2,
            batch_first=True,
            dropout=dropout_probability,
            bidirectional=False,
        )
        self.lstm = nn.LSTM(**recurrent_options)
        self.dropout = nn.Dropout(dropout_probability)
        self.classifier = nn.Linear(hidden_size, class_count)

    def forward(self, input_sequence):
        """Return class logits for a batch of input sequences."""
        sequence_features = self.lstm(input_sequence)[0]
        last_step_features = sequence_features[:, -1, :]
        return self.classifier(self.dropout(last_step_features))
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
class GruClassifier(nn.Module):
    """Two-layer unidirectional GRU followed by dropout and a linear head.

    The recurrent layer is batch-first. Logits are computed from the recurrent
    output at the last time step.
    """

    def __init__(self, feature_count, hidden_size, class_count, dropout_probability):
        super().__init__()
        recurrent_options = dict(
            input_size=feature_count,
            hidden_size=hidden_size,
            num_layers=2,
            batch_first=True,
            dropout=dropout_probability,
            bidirectional=False,
        )
        self.gru = nn.GRU(**recurrent_options)
        self.dropout = nn.Dropout(dropout_probability)
        self.classifier = nn.Linear(hidden_size, class_count)

    def forward(self, input_sequence):
        """Return class logits for a batch of input sequences."""
        sequence_features = self.gru(input_sequence)[0]
        last_step_features = sequence_features[:, -1, :]
        return self.classifier(self.dropout(last_step_features))
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
class Chomp1d(nn.Module):
    """Drop the last ``chomp_size`` positions along the final axis."""

    def __init__(self, chomp_size):
        super().__init__()
        self.chomp_size = chomp_size

    def forward(self, input_tensor):
        """Return ``input_tensor`` without its trailing ``chomp_size`` steps."""
        if self.chomp_size != 0:
            # Slicing yields a strided view; make it contiguous for later layers.
            trimmed = input_tensor[:, :, :-self.chomp_size]
            return trimmed.contiguous()
        # Nothing to trim: hand the input back untouched.
        return input_tensor
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
class TemporalBlock(nn.Module):
    """Residual block of two dilated 1-D convolutions, each followed by a chomp.

    Every convolution pads by ``(kernel_size - 1) * dilation`` and the matching
    ``Chomp1d`` removes the same number of trailing steps, so the sequence
    length is preserved. A 1x1 convolution adapts the shortcut when the channel
    counts differ.
    """

    def __init__(self, input_channels, output_channels, kernel_size, dilation, dropout):
        super().__init__()
        padding = (kernel_size - 1) * dilation
        conv_options = dict(padding=padding, dilation=dilation)

        # Registration order and attribute names define the state-dict layout.
        self.conv1 = nn.Conv1d(input_channels, output_channels, kernel_size, **conv_options)
        self.chomp1 = Chomp1d(padding)
        self.relu1 = nn.ReLU()
        self.dropout1 = nn.Dropout(dropout)
        self.conv2 = nn.Conv1d(output_channels, output_channels, kernel_size, **conv_options)
        self.chomp2 = Chomp1d(padding)
        self.relu2 = nn.ReLU()
        self.dropout2 = nn.Dropout(dropout)

        if input_channels != output_channels:
            self.downsample = nn.Conv1d(input_channels, output_channels, kernel_size=1)
        else:
            self.downsample = None
        self.final_relu = nn.ReLU()

    def forward(self, input_tensor):
        """Apply both conv stages, add the shortcut and run the final ReLU."""
        main_path = (
            self.conv1, self.chomp1, self.relu1, self.dropout1,
            self.conv2, self.chomp2, self.relu2, self.dropout2,
        )
        output_tensor = input_tensor
        for layer in main_path:
            output_tensor = layer(output_tensor)

        if self.downsample is None:
            shortcut = input_tensor
        else:
            shortcut = self.downsample(input_tensor)
        return self.final_relu(output_tensor + shortcut)
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
class TcnClassifier(nn.Module):
    """Temporal convolutional classifier: 1x1 projection plus three blocks.

    The blocks use dilations 1, 2 and 4; the classifier reads the last time
    step of the final block.
    """

    def __init__(self, feature_count, class_count, channel_width, kernel_size, dropout):
        super().__init__()
        self.input_projection = nn.Conv1d(feature_count, channel_width, kernel_size=1)
        # Kept as separate attributes (not a ModuleList) so state-dict keys
        # remain block1.*, block2.*, block3.*.
        self.block1 = TemporalBlock(channel_width, channel_width, kernel_size, dilation=1, dropout=dropout)
        self.block2 = TemporalBlock(channel_width, channel_width, kernel_size, dilation=2, dropout=dropout)
        self.block3 = TemporalBlock(channel_width, channel_width, kernel_size, dilation=4, dropout=dropout)
        self.classifier = nn.Linear(channel_width, class_count)

    def forward(self, input_sequence):
        """Return class logits for ``input_sequence``."""
        # Conv1d wants channels on axis 1, so swap the last two axes.
        temporal_tensor = self.input_projection(input_sequence.transpose(1, 2))
        for block in (self.block1, self.block2, self.block3):
            temporal_tensor = block(temporal_tensor)
        return self.classifier(temporal_tensor[:, :, -1])
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
class CnnBiLstmClassifier(nn.Module):
    """1-D convolution front end feeding a two-layer bidirectional LSTM."""

    def __init__(self, feature_count, class_count, cnn_filters, cnn_kernel_size, lstm_units, dropout_probability):
        super().__init__()
        self.conv1d = nn.Conv1d(
            in_channels=feature_count,
            out_channels=cnn_filters,
            kernel_size=cnn_kernel_size,
            padding=cnn_kernel_size // 2,
        )
        self.relu = nn.ReLU()
        self.dropout1 = nn.Dropout(dropout_probability)
        self.bilstm = nn.LSTM(
            input_size=cnn_filters,
            hidden_size=lstm_units,
            num_layers=2,
            batch_first=True,
            dropout=dropout_probability,
            bidirectional=True,
        )
        self.dropout2 = nn.Dropout(dropout_probability)
        # Both LSTM directions are concatenated, hence 2 * lstm_units.
        self.classifier = nn.Linear(lstm_units * 2, class_count)

    def forward(self, input_sequence):
        """Return class logits for ``input_sequence``."""
        # The convolution runs with channels on axis 1; swap in, then swap back.
        convolved = self.conv1d(input_sequence.transpose(1, 2))
        convolved = self.dropout1(self.relu(convolved))
        recurrent_input = convolved.transpose(1, 2)

        per_step_output = self.bilstm(recurrent_input)[0]
        last_step_features = self.dropout2(per_step_output[:, -1, :])
        return self.classifier(last_step_features)
|
| 157 |
+
|
| 158 |
+
|
| 159 |
+
class GraphConvolution(nn.Module):
    """1x1 channel projection followed by mixing along the last axis."""

    def __init__(self, input_channels, output_channels):
        super().__init__()
        self.projection = nn.Conv2d(input_channels, output_channels, kernel_size=1)

    def forward(self, input_tensor, adjacency_matrix):
        """Project the channels, then contract the last axis with the adjacency.

        The einsum sums over ``v``: every output position ``w`` is a weighted
        combination of the projected values, weighted by ``adjacency[v, w]``.
        """
        return torch.einsum(
            "nctv,vw->nctw",
            self.projection(input_tensor),
            adjacency_matrix,
        )
|
| 167 |
+
|
| 168 |
+
|
| 169 |
+
class StGcnBlock(nn.Module):
    """Graph convolution plus a (9, 1) temporal convolution with a residual."""

    def __init__(self, input_channels, output_channels, dropout, stride=1):
        super().__init__()
        self.graph_convolution = GraphConvolution(input_channels, output_channels)

        temporal_layers = [
            nn.BatchNorm2d(output_channels),
            nn.ReLU(inplace=True),
            # Kernel 9 with padding 4 keeps the length of axis 2 when stride is 1.
            nn.Conv2d(output_channels, output_channels, kernel_size=(9, 1), stride=(stride, 1), padding=(4, 0)),
            nn.BatchNorm2d(output_channels),
            nn.Dropout(dropout),
        ]
        self.temporal_convolution = nn.Sequential(*temporal_layers)

        # The shortcut only needs a projection when the main path changes the
        # channel count or strides along axis 2.
        shape_is_preserved = stride == 1 and input_channels == output_channels
        if shape_is_preserved:
            self.residual = nn.Identity()
        else:
            self.residual = nn.Sequential(
                nn.Conv2d(input_channels, output_channels, kernel_size=1, stride=(stride, 1)),
                nn.BatchNorm2d(output_channels),
            )
        self.activation = nn.ReLU(inplace=True)

    def forward(self, input_tensor, adjacency_matrix):
        """Return ``ReLU(temporal(graph(x, A)) + residual(x))``."""
        shortcut = self.residual(input_tensor)
        mixed = self.graph_convolution(input_tensor, adjacency_matrix)
        main = self.temporal_convolution(mixed)
        return self.activation(main + shortcut)
|
| 191 |
+
|
| 192 |
+
|
| 193 |
+
class StGcnClassifier(nn.Module):
    """ST-GCN style classifier that treats every input feature as a graph node.

    The adjacency is learned: ``adjacency_logits`` starts as the identity and
    is row-softmaxed before it is used.
    """

    def __init__(self, feature_count, class_count, dropout):
        super().__init__()
        self.input_batch_norm = nn.BatchNorm1d(feature_count)
        # Assigning an nn.Parameter registers it under this attribute name.
        self.adjacency_logits = nn.Parameter(torch.eye(feature_count))
        self.block1 = StGcnBlock(1, 64, dropout=dropout, stride=1)
        self.block2 = StGcnBlock(64, 64, dropout=dropout, stride=1)
        self.block3 = StGcnBlock(64, 128, dropout=dropout, stride=1)
        self.classifier = nn.Linear(128, class_count)

    def get_normalized_adjacency(self):
        """Return the adjacency with each row softmax-normalised."""
        return torch.softmax(self.adjacency_logits, dim=1)

    def forward(self, input_sequence):
        """Return class logits for a ``(batch, step, feature)`` input."""
        batch_size, sequence_length, feature_count = input_sequence.shape

        # BatchNorm1d works on 2-D input: flatten batch and time, normalise
        # each feature, then restore the original layout.
        flattened = input_sequence.reshape(batch_size * sequence_length, feature_count)
        flattened = self.input_batch_norm(flattened)
        normalized_input = flattened.reshape(batch_size, sequence_length, feature_count)

        # A singleton channel axis gives the (N, C, T, V) layout the blocks use.
        graph_tensor = normalized_input.unsqueeze(1)
        adjacency_matrix = self.get_normalized_adjacency()
        for block in (self.block1, self.block2, self.block3):
            graph_tensor = block(graph_tensor, adjacency_matrix)

        # Average over axis 2, then over the remaining trailing axis.
        pooled_tensor = graph_tensor.mean(dim=2).mean(dim=2)
        return self.classifier(pooled_tensor)
|
| 218 |
+
|
| 219 |
+
|
| 220 |
+
# Registry of evaluated models. Each entry names the artifact files found in
# the model's "weights" folder and a builder that recreates the architecture
# from (feature_count, class_count). The numeric literals inside the builders
# are the hyperparameters baked into this script for each model.
MODEL_SPECS = {
    "bilstm": dict(
        weight="bidirectionallstm_model.pt",
        scaler="bidirectionallstm_scaler.pkl",
        encoder="bidirectionallstm_label_encoder.pkl",
        builder=lambda feature_count, class_count: BidirectionalLstmClassifier(feature_count, 73, class_count, 0.2174),
    ),
    "lstm": dict(
        weight="lstm_model.pt",
        scaler="lstm_scaler.pkl",
        encoder="lstm_label_encoder.pkl",
        builder=lambda feature_count, class_count: LstmClassifier(feature_count, 117, class_count, 0.3829),
    ),
    "gru": dict(
        weight="gru_model.pt",
        scaler="gru_scaler.pkl",
        encoder="gru_label_encoder.pkl",
        builder=lambda feature_count, class_count: GruClassifier(feature_count, 96, class_count, 0.2),
    ),
    "tcn": dict(
        weight="tcn_model.pt",
        scaler="tcn_scaler.pkl",
        encoder="tcn_label_encoder.pkl",
        builder=lambda feature_count, class_count: TcnClassifier(feature_count, class_count, 128, 3, 0.2),
    ),
    "cnn_bilstm": dict(
        weight="cnn_bilstm_model.pt",
        scaler="cnn_bilstm_scaler.pkl",
        encoder="cnn_bilstm_label_encoder.pkl",
        builder=lambda feature_count, class_count: CnnBiLstmClassifier(feature_count, class_count, 128, 3, 73, 0.2),
    ),
    "st_gcn": dict(
        weight="st_gcn_model.pt",
        scaler="st_gcn_scaler.pkl",
        encoder="st_gcn_label_encoder.pkl",
        builder=lambda feature_count, class_count: StGcnClassifier(feature_count, class_count, 0.2),
    ),
}
|
| 258 |
+
|
| 259 |
+
|
| 260 |
+
def load_test_table(test_file_path):
    """Read the test CSV and split it into a feature matrix and raw labels.

    Every column other than the four metadata columns is treated as a feature.

    Returns:
        ``(features, labels)``: a float32 array of the feature columns and the
        values of the ``exercise_label`` column.
    """
    table = pd.read_csv(test_file_path)
    metadata_columns = ["video_id", "exercise_label", "start_frame_index", "end_frame_index"]
    is_feature = ~table.columns.isin(metadata_columns)
    features = table.loc[:, is_feature].to_numpy(dtype=np.float32)
    return features, table["exercise_label"].to_numpy()
|
| 267 |
+
|
| 268 |
+
|
| 269 |
+
def build_loader(features, labels, batch_size):
    """Wrap features and integer labels in a fixed-order ``DataLoader``.

    Features become float32 tensors and labels int64 tensors. Shuffling is off,
    so batches arrive in input order.
    """
    dataset = SequenceDataset(
        torch.tensor(features, dtype=torch.float32),
        torch.tensor(labels, dtype=torch.long),
    )
    loader_options = dict(batch_size=batch_size, shuffle=False, num_workers=0, pin_memory=True)
    return DataLoader(dataset, **loader_options)
|
| 274 |
+
|
| 275 |
+
|
| 276 |
+
def predict_labels(model, data_loader, device):
    """Run ``model`` over every batch and collect true and predicted labels.

    The model is put in eval mode and executed under ``torch.inference_mode``.
    Only the features are moved to ``device``; labels are read as yielded.

    Returns:
        ``(true_labels, predicted_labels)`` as concatenated NumPy arrays, where
        each prediction is the argmax over axis 1 of the logits.
    """
    model.eval()
    collected_truth = []
    collected_predictions = []
    with torch.inference_mode():
        for feature_batch, label_batch in data_loader:
            logits = model(feature_batch.to(device, non_blocking=True))
            collected_truth.append(label_batch.numpy())
            collected_predictions.append(torch.argmax(logits, dim=1).cpu().numpy())
    true_labels = np.concatenate(collected_truth)
    predicted_labels = np.concatenate(collected_predictions)
    return true_labels, predicted_labels
|
| 288 |
+
|
| 289 |
+
|
| 290 |
+
def evaluate_one_model(model_name, spec, models_root_path, test_features, test_raw_labels, sequence_length, feature_count, batch_size, device, output_directory_path):
    """Evaluate one trained model on the home test set and persist its metrics.

    Artifacts are read from ``<models_root_path>/<model_name>/weights``. Summary
    metrics are written to ``<output_directory_path>/<model_name>_home_metrics.csv``
    and the confusion matrix to
    ``<models_root_path>/<model_name>/results/home_confusion_matrix.csv``.

    Args:
        model_name: Key of the model; also its folder name under the models root.
        spec: Entry with ``weight``, ``scaler``, ``encoder`` file names and a
            ``builder(feature_count, class_count)`` callable.
        models_root_path: ``Path`` to the folder holding one sub-folder per model.
        test_features: 2-D array with one flattened sequence per row.
        test_raw_labels: Raw labels, encoded here with the stored encoder.
        sequence_length: Steps per sequence, used to un-flatten each row.
        feature_count: Features per step.
        batch_size: Inference batch size.
        device: Torch device used for the model and the feature batches.
        output_directory_path: ``Path`` receiving the per-model metrics CSV.

    Returns:
        Dict with the model name, accuracy, weighted precision/recall/F1,
        macro F1, class names, confusion matrix and the text report.
    """
    model_root = models_root_path / model_name
    weights_root = model_root / "weights"
    results_root = model_root / "results"
    # The confusion matrix is written here at the end; create the folder up
    # front so a missing "results" directory cannot fail the run after
    # inference has already been paid for.
    results_root.mkdir(parents=True, exist_ok=True)

    # NOTE(review): joblib.load and torch.load both unpickle their input.
    # Only point this script at artifacts from a trusted source.
    scaler = joblib.load(weights_root / spec["scaler"])
    label_encoder = joblib.load(weights_root / spec["encoder"])

    scaled_test = scaler.transform(test_features).reshape(-1, sequence_length, feature_count)
    encoded_labels = label_encoder.transform(test_raw_labels)
    data_loader = build_loader(scaled_test, encoded_labels, batch_size)

    class_count = len(label_encoder.classes_)
    # Assumes the encoder maps labels to 0..class_count-1 in classes_ order
    # (scikit-learn LabelEncoder behaviour) -- confirm if another encoder is used.
    class_indices = np.arange(class_count)

    model = spec["builder"](feature_count, class_count).to(device)
    state_dict = torch.load(weights_root / spec["weight"], map_location=device)
    model.load_state_dict(state_dict)

    true_labels, predicted_labels = predict_labels(model, data_loader, device)

    accuracy = accuracy_score(true_labels, predicted_labels)
    precision = precision_score(true_labels, predicted_labels, average="weighted", zero_division=0)
    recall = recall_score(true_labels, predicted_labels, average="weighted", zero_division=0)
    f1_weighted = f1_score(true_labels, predicted_labels, average="weighted", zero_division=0)
    f1_macro = f1_score(true_labels, predicted_labels, average="macro", zero_division=0)
    # Pin the label set explicitly. Without it, a test set that lacks some
    # class yields a smaller matrix that no longer lines up with "classes",
    # and classification_report rejects the mismatched target_names.
    matrix = confusion_matrix(true_labels, predicted_labels, labels=class_indices)
    report_text = classification_report(
        true_labels,
        predicted_labels,
        labels=class_indices,
        target_names=label_encoder.classes_,
        zero_division=0,
    )

    metrics = {
        "model": model_name,
        "accuracy": float(accuracy),
        "precision_weighted": float(precision),
        "recall_weighted": float(recall),
        "f1_weighted": float(f1_weighted),
        "f1_macro": float(f1_macro),
        "classes": list(label_encoder.classes_),
        "confusion_matrix": matrix.tolist(),
        "classification_report_text": report_text,
    }

    # The CSV row carries only the scalar metrics, in this column order.
    scalar_columns = ("model", "accuracy", "precision_weighted", "recall_weighted", "f1_weighted", "f1_macro")
    metrics_row = {column: metrics[column] for column in scalar_columns}
    output_metrics_path = output_directory_path / f"{model_name}_home_metrics.csv"
    pd.DataFrame([metrics_row]).to_csv(output_metrics_path, index=False)
    pd.DataFrame(matrix).to_csv(results_root / "home_confusion_matrix.csv", index=False)

    return metrics
|
| 342 |
+
|
| 343 |
+
|
| 344 |
+
def main():
    """Evaluate every registered model on the home test set and rank them.

    Writes one metrics CSV per model (via ``evaluate_one_model``) plus
    ``home_leaderboard.csv`` sorted by descending accuracy, and prints the
    leaderboard.
    """
    args = parse_args()

    test_file_path = Path(args.test_file)
    models_root_path = Path(args.models_root)
    output_directory_path = Path(args.output_dir)
    output_directory_path.mkdir(parents=True, exist_ok=True)

    # Settings shared by every model evaluation.
    shared_settings = dict(
        models_root_path=models_root_path,
        sequence_length=args.sequence_length,
        feature_count=args.feature_count,
        batch_size=args.batch_size,
        device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
        output_directory_path=output_directory_path,
    )

    test_features, test_raw_labels = load_test_table(test_file_path)
    shared_settings.update(test_features=test_features, test_raw_labels=test_raw_labels)

    all_metrics = []
    for model_name, spec in MODEL_SPECS.items():
        print(f"Evaluating: {model_name}")
        all_metrics.append(evaluate_one_model(model_name=model_name, spec=spec, **shared_settings))

    # The leaderboard keeps only the scalar metrics, in this column order.
    leaderboard_columns = ("model", "accuracy", "f1_macro", "f1_weighted", "precision_weighted", "recall_weighted")
    leaderboard_rows = [
        {column: metric[column] for column in leaderboard_columns}
        for metric in all_metrics
    ]
    leaderboard_table = pd.DataFrame(leaderboard_rows).sort_values("accuracy", ascending=False)

    leaderboard_csv_path = output_directory_path / "home_leaderboard.csv"
    leaderboard_table.to_csv(leaderboard_csv_path, index=False)

    print("\nHome leaderboard")
    print(leaderboard_table.to_string(index=False))
    print(f"\nSaved: {leaderboard_csv_path}")
|
| 396 |
+
|
| 397 |
+
|
| 398 |
+
# Run the evaluation only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|
scripts/evaluate/rep_counting_methods.py
ADDED
|
@@ -0,0 +1,148 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from dataclasses import dataclass
|
| 2 |
+
from typing import Dict, List
|
| 3 |
+
|
| 4 |
+
import numpy as np
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
@dataclass
class ExerciseAngleConfig:
    """Per-exercise settings for the joint-angle based repetition counter."""

    # Landmark names for the left-side angle, read as (a, b, c) with the angle
    # measured at b.
    primary_triplet_left: tuple[str, str, str]
    # Landmark names for the right-side angle, same layout.
    primary_triplet_right: tuple[str, str, str]
    # Presumably the "flexed" / "extended" thresholds in degrees handed to the
    # counter -- confirm at the call site.
    fixed_low: float
    fixed_high: float
    # Presumably forwarded to the counter's min_state_frames -- confirm.
    min_state_frames: int = 2
    # Presumably the window size of the angle smoothing buffer -- confirm.
    smoothing_window: int = 5
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
# Joint triplets shared by several exercises (angle measured at the middle joint).
_LEFT_ARM_TRIPLET = ("LEFT_SHOULDER", "LEFT_ELBOW", "LEFT_WRIST")
_RIGHT_ARM_TRIPLET = ("RIGHT_SHOULDER", "RIGHT_ELBOW", "RIGHT_WRIST")
_LEFT_LEG_TRIPLET = ("LEFT_HIP", "LEFT_KNEE", "LEFT_ANKLE")
_RIGHT_LEG_TRIPLET = ("RIGHT_HIP", "RIGHT_KNEE", "RIGHT_ANKLE")

# Canonical exercise name -> angle configuration. Squats track the knee angle;
# the three upper-body exercises track the elbow angle with their own thresholds.
EXERCISE_CONFIGS: Dict[str, ExerciseAngleConfig] = {
    "squat": ExerciseAngleConfig(
        primary_triplet_left=_LEFT_LEG_TRIPLET,
        primary_triplet_right=_RIGHT_LEG_TRIPLET,
        fixed_low=95.0,
        fixed_high=160.0,
    ),
    "push up": ExerciseAngleConfig(
        primary_triplet_left=_LEFT_ARM_TRIPLET,
        primary_triplet_right=_RIGHT_ARM_TRIPLET,
        fixed_low=95.0,
        fixed_high=155.0,
    ),
    "barbell biceps curl": ExerciseAngleConfig(
        primary_triplet_left=_LEFT_ARM_TRIPLET,
        primary_triplet_right=_RIGHT_ARM_TRIPLET,
        fixed_low=55.0,
        fixed_high=145.0,
    ),
    "shoulder press": ExerciseAngleConfig(
        primary_triplet_left=_LEFT_ARM_TRIPLET,
        primary_triplet_right=_RIGHT_ARM_TRIPLET,
        fixed_low=70.0,
        fixed_high=155.0,
    ),
}
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
# Alternative spellings grouped by the canonical exercise name they map to.
_ALIASES_BY_CANONICAL_NAME = (
    ("push up", ("push-up", "pushups", "pushup")),
    ("barbell biceps curl", ("curls", "bicep curl", "biceps curl")),
    ("shoulder press", ("shoulder_press",)),
)

# Flat alias -> canonical-name lookup used when normalising exercise names.
EXERCISE_ALIASES = {
    alias: canonical_name
    for canonical_name, aliases in _ALIASES_BY_CANONICAL_NAME
    for alias in aliases
}
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
def normalize_exercise_name(exercise_name: str) -> str:
    """Return the canonical form of ``exercise_name``.

    The name is stripped and lower-cased; a known alias is replaced by its
    canonical name, anything else is returned in the cleaned form.
    """
    cleaned_name = exercise_name.strip().lower()
    if cleaned_name in EXERCISE_ALIASES:
        return EXERCISE_ALIASES[cleaned_name]
    return cleaned_name
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
def calculate_angle_degrees(point_a: np.ndarray, point_b: np.ndarray, point_c: np.ndarray) -> float:
    """Return the angle at ``point_b`` between ``point_a`` and ``point_c``, in degrees.

    Only the first two coordinates of each point are used. NaN is returned when
    any point is (close to) all zeros, or when either arm of the angle has zero
    length.
    """
    # An all-zero point is treated as a missing landmark.
    for point in (point_a, point_b, point_c):
        if np.allclose(point, 0.0):
            return np.nan

    vertex = point_b[:2]
    arm_to_a = point_a[:2] - vertex
    arm_to_c = point_c[:2] - vertex

    length_product = np.linalg.norm(arm_to_a) * np.linalg.norm(arm_to_c)
    if length_product == 0.0:
        return np.nan

    # Clip to guard arccos against rounding slightly outside [-1, 1].
    cosine_value = np.clip(np.dot(arm_to_a, arm_to_c) / length_product, -1.0, 1.0)
    angle_radians = np.arccos(cosine_value)
    return float(np.degrees(angle_radians))
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
def extract_primary_angle(landmarks: Dict[str, np.ndarray], config: ExerciseAngleConfig) -> float:
    """Return the exercise's primary joint angle for one frame.

    The left and right angles are computed from the configured landmark
    triplets. The result is their mean when both are available, the available
    one when the other is NaN, and NaN when both are NaN.
    """
    side_angles = [
        calculate_angle_degrees(*(landmarks[name] for name in triplet))
        for triplet in (config.primary_triplet_left, config.primary_triplet_right)
    ]
    usable_angles = [angle for angle in side_angles if not np.isnan(angle)]

    if not usable_angles:
        return np.nan
    if len(usable_angles) == 1:
        # One side is missing: fall back to the side that could be measured.
        return usable_angles[0]
    left_angle, right_angle = usable_angles
    return float((left_angle + right_angle) / 2.0)
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
class SmoothingBuffer:
    """Moving average over the most recent ``window_size`` non-NaN values."""

    def __init__(self, window_size: int):
        self.window_size = window_size
        self.values: List[float] = []

    def update(self, value: float) -> float:
        """Add ``value`` and return the mean of the current window.

        A NaN input is not stored and NaN is returned; the window is unchanged.
        """
        if np.isnan(value):
            return np.nan

        self.values.append(value)
        # Evict the oldest sample once the window is over capacity.
        if len(self.values) > self.window_size:
            del self.values[0]
        return float(np.mean(self.values))
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
class FixedThresholdFSMCounter:
    """Count repetitions with a two-threshold state machine over a joint angle.

    An angle at or below ``low_threshold`` is "flexed", at or above
    ``high_threshold`` is "extended", anything between is "mid". A state is
    confirmed after ``min_state_frames`` consecutive frames in it, and a
    repetition is counted on every confirmed flexed -> extended transition.
    """

    def __init__(self, low_threshold: float, high_threshold: float, min_state_frames: int = 2):
        self.low_threshold = low_threshold
        self.high_threshold = high_threshold
        self.min_state_frames = min_state_frames
        self.reps = 0
        self.current_state = "unknown"
        self.pending_state = "unknown"
        self.pending_state_frames = 0

    def _angle_state(self, angle: float) -> str:
        """Classify ``angle`` as "flexed", "extended" or "mid"."""
        if angle <= self.low_threshold:
            return "flexed"
        return "extended" if angle >= self.high_threshold else "mid"

    def update(self, angle: float) -> int:
        """Feed one angle sample and return the running repetition count.

        NaN samples are ignored. A "mid" sample discards the pending candidate
        without changing the confirmed state.
        """
        if np.isnan(angle):
            return self.reps

        observed_state = self._angle_state(angle)
        if observed_state == "mid":
            self.pending_state = "unknown"
            self.pending_state_frames = 0
            return self.reps

        # Start a new streak when the candidate changes, then count this frame.
        if observed_state != self.pending_state:
            self.pending_state = observed_state
            self.pending_state_frames = 0
        self.pending_state_frames += 1

        streak_confirmed = self.pending_state_frames >= self.min_state_frames
        if streak_confirmed and self.current_state != self.pending_state:
            finished_rep = self.current_state == "flexed" and self.pending_state == "extended"
            self.current_state = self.pending_state
            if finished_rep:
                self.reps += 1
        return self.reps
|
scripts/preprocess/build_similarity_assets.py
ADDED
|
@@ -0,0 +1,116 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
|
| 4 |
+
import joblib
|
| 5 |
+
import numpy as np
|
| 6 |
+
import pandas as pd
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def parse_args():
    """Parse the command-line options of the similarity-asset builder."""
    option_table = (
        ("--train-file", {"default": "data/train_sequences.csv"}),
        ("--models-root", {"default": "models"}),
        (
            "--supported-models",
            {"nargs": "+", "default": ["bilstm", "lstm", "gru", "tcn", "cnn_bilstm", "st_gcn"]},
        ),
        ("--output-filename", {"default": "similarity_centroids.pkl"}),
    )
    parser = argparse.ArgumentParser()
    for flag, settings in option_table:
        parser.add_argument(flag, **settings)
    return parser.parse_args()
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
# Model key -> file names of the fitted scaler and label encoder stored in that
# model's "weights" folder.
MODEL_FILES = {
    "bilstm": dict(scaler="bidirectionallstm_scaler.pkl", encoder="bidirectionallstm_label_encoder.pkl"),
    "lstm": dict(scaler="lstm_scaler.pkl", encoder="lstm_label_encoder.pkl"),
    "tcn": dict(scaler="tcn_scaler.pkl", encoder="tcn_label_encoder.pkl"),
    "gru": dict(scaler="gru_scaler.pkl", encoder="gru_label_encoder.pkl"),
    "cnn_bilstm": dict(scaler="cnn_bilstm_scaler.pkl", encoder="cnn_bilstm_label_encoder.pkl"),
    "st_gcn": dict(scaler="st_gcn_scaler.pkl", encoder="st_gcn_label_encoder.pkl"),
}
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def load_train_table(train_file_path):
    """Read the training CSV and split it into a feature matrix and labels.

    Every column other than the four metadata columns is treated as a feature.

    Returns:
        ``(feature_matrix, label_array)``: a float32 array of the feature
        columns and the values of the ``exercise_label`` column.
    """
    train_table = pd.read_csv(train_file_path)
    metadata_columns = ["video_id", "exercise_label", "start_frame_index", "end_frame_index"]
    # errors="ignore": metadata columns absent from the file are simply skipped.
    feature_table = train_table.drop(columns=metadata_columns, errors="ignore")
    feature_matrix = feature_table.to_numpy(dtype=np.float32)
    label_array = train_table["exercise_label"].to_numpy()
    return feature_matrix, label_array
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def compute_normalized_centroid(vectors):
    """Return the mean of ``vectors`` along axis 0, scaled to unit L2 norm.

    A mean whose norm is exactly zero is returned unscaled, which avoids a
    division by zero.
    """
    mean_vector = np.mean(vectors, axis=0)
    mean_length = np.linalg.norm(mean_vector)
    return mean_vector if mean_length == 0.0 else mean_vector / mean_length
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
def build_model_centroids(model_name, model_files, models_root_path, train_features, train_labels):
    """Compute one unit-norm centroid per class in a model's scaled feature space.

    The model's scaler and label encoder are loaded from
    ``<models_root_path>/<model_name>/weights``. Each training row is scaled,
    normalised to unit length, and averaged per class.

    Returns:
        Dict with the model name, the similarity method ("cosine"), the class
        order, per-class sample counts and per-class centroids.
    """
    weights_dir = models_root_path / model_name / "weights"
    scaler = joblib.load(weights_dir / model_files["scaler"])
    label_encoder = joblib.load(weights_dir / model_files["encoder"])

    scaled_features = scaler.transform(train_features)
    class_names = list(label_encoder.classes_)

    centroids, sample_counts = {}, {}
    for class_name in class_names:
        class_rows = scaled_features[train_labels == class_name]
        # Clip the row norms so an all-zero row cannot divide by zero.
        row_norms = np.linalg.norm(class_rows, axis=1, keepdims=True)
        unit_rows = class_rows / np.clip(row_norms, 1e-8, None)
        # NOTE: a class with no training rows yields an all-NaN centroid here.
        centroids[class_name] = compute_normalized_centroid(unit_rows)
        sample_counts[class_name] = int(unit_rows.shape[0])

    return {
        "model": model_name,
        "similarity_method": "cosine",
        "class_order": class_names,
        "sample_counts": sample_counts,
        "centroids": centroids,
    }
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
def save_model_asset(weights_dir, output_filename, similarity_asset):
    """Dump ``similarity_asset`` with joblib into ``weights_dir`` and return its path."""
    destination = weights_dir / output_filename
    joblib.dump(similarity_asset, destination)
    return destination
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
def save_supported_models_manifest(models_root_path, supported_models):
    """Write ``similarity_supported_models.csv`` and return its path.

    The file gets one row per entry of ``supported_models``, each tagged with
    method "cosine" and version "v1_centroid".
    """
    manifest_rows = []
    for model_name in supported_models:
        manifest_rows.append({"model": model_name, "method": "cosine", "version": "v1_centroid"})

    manifest_path = models_root_path / "similarity_supported_models.csv"
    pd.DataFrame(manifest_rows).to_csv(manifest_path, index=False)
    return manifest_path
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
def main():
    """Build and save similarity centroids for every requested, known model.

    For each requested key present in ``MODEL_FILES`` the per-class centroids
    are computed from the training table and saved into that model's weights
    folder. Unknown keys are reported and skipped. Finally a manifest CSV is
    written listing the models whose assets were actually produced.
    """
    args = parse_args()

    train_file_path = Path(args.train_file)
    models_root_path = Path(args.models_root)
    supported_models = args.supported_models
    output_filename = args.output_filename

    train_features, train_labels = load_train_table(train_file_path)

    # Only models that really received an asset go into the manifest; listing
    # a skipped key would advertise a centroid file that does not exist.
    processed_models = []
    for model_name in supported_models:
        if model_name not in MODEL_FILES:
            print(f"Skipping unsupported model key: {model_name}")
            continue

        model_files = MODEL_FILES[model_name]
        similarity_asset = build_model_centroids(
            model_name=model_name,
            model_files=model_files,
            models_root_path=models_root_path,
            train_features=train_features,
            train_labels=train_labels,
        )

        weights_dir = models_root_path / model_name / "weights"
        output_path = save_model_asset(weights_dir, output_filename, similarity_asset)
        print(f"Saved: {output_path}")
        processed_models.append(model_name)

    manifest_path = save_supported_models_manifest(models_root_path, processed_models)
    print(f"Saved: {manifest_path}")
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
# Build the similarity assets only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|
scripts/preprocess/create_fixed_splits.py
ADDED
|
@@ -0,0 +1,127 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
|
| 4 |
+
import pandas as pd
|
| 5 |
+
from sklearn.model_selection import train_test_split
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def parse_args():
    """Parse the command-line options of the fixed-split generator."""
    parser = argparse.ArgumentParser()

    # Input and output locations.
    parser.add_argument("--input-file", default="data/train_sequences_full.csv")
    parser.add_argument("--output-dir", default="data")

    # Split proportions, all parsed as floats.
    ratio_defaults = (("--train-ratio", 0.70), ("--val-ratio", 0.15), ("--test-ratio", 0.15))
    for flag, default_ratio in ratio_defaults:
        parser.add_argument(flag, type=float, default=default_ratio)

    parser.add_argument("--seed", type=int, default=42)
    return parser.parse_args()
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def validate_split_ratios(train_ratio, val_ratio, test_ratio):
    """Check that the three split ratios describe a usable partition.

    Each ratio must lie strictly between 0 and 1 and together they must sum to
    1.0 (within 1e-8). The per-ratio check rejects NaN, negative values and
    0/1 up front; such values would otherwise pass the sum check and only fail
    later while splitting (for example a zero division when
    ``val_ratio + test_ratio`` is 0).

    Raises:
        ValueError: If a ratio is outside (0, 1) or the sum is not 1.0.
    """
    named_ratios = (
        ("train_ratio", train_ratio),
        ("val_ratio", val_ratio),
        ("test_ratio", test_ratio),
    )
    for ratio_name, ratio_value in named_ratios:
        # Written as a negated chain so NaN (all comparisons False) is rejected.
        if not 0.0 < ratio_value < 1.0:
            raise ValueError(f"{ratio_name} must be strictly between 0 and 1. Got {ratio_value}")

    ratio_sum = train_ratio + val_ratio + test_ratio
    if abs(ratio_sum - 1.0) > 1e-8:
        raise ValueError(f"Split ratios must sum to 1.0. Got {ratio_sum:.6f}")
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def build_split_tables(sequence_table, train_ratio, val_ratio, test_ratio, seed):
    """Split the sequence table into train, validation and test tables.

    The split is done in two stratified passes on ``exercise_label``: first
    train versus hold-out, then the hold-out into validation versus test.
    Both passes use the same ``seed``.

    Returns:
        Tuple ``(train_table, val_table, test_table)``.
    """
    # NOTE(review): rows are split individually, so sequences that share a
    # video_id can land in different splits — confirm this is intended.
    label_column = "exercise_label"

    holdout_share = 1.0 - train_ratio
    train_table, holdout_table = train_test_split(
        sequence_table,
        test_size=holdout_share,
        random_state=seed,
        stratify=sequence_table[label_column],
    )

    # Express the test share relative to the hold-out, not the full table.
    test_share_of_holdout = 1.0 - val_ratio / (val_ratio + test_ratio)
    val_table, test_table = train_test_split(
        holdout_table,
        test_size=test_share_of_holdout,
        random_state=seed,
        stratify=holdout_table[label_column],
    )

    return train_table, val_table, test_table
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def save_split_table(split_table, output_file_path):
    """Write one split table to CSV without the index column.

    The parent directory is created when missing, matching the other CSV
    writers in this project, and ``output_file_path`` may be a ``str`` or a
    ``Path``.
    """
    output_file_path = Path(output_file_path)
    output_file_path.parent.mkdir(parents=True, exist_ok=True)
    split_table.to_csv(output_file_path, index=False)
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
def build_split_info(train_table, val_table, test_table, args):
    """Summarise a split as a dictionary.

    The summary holds the input file, seed and ratios read from ``args``, the
    row count of each split, and per-split ``exercise_label`` counts under
    ``class_counts``.
    """

    def count_labels(table):
        # Occurrences of every exercise label in one split table.
        return table["exercise_label"].value_counts().to_dict()

    split_info = {"input_file": str(Path(args.input_file)), "seed": args.seed}
    for ratio_key in ("train_ratio", "val_ratio", "test_ratio"):
        split_info[ratio_key] = getattr(args, ratio_key)

    split_info["train_samples"] = int(len(train_table))
    split_info["val_samples"] = int(len(val_table))
    split_info["test_internal_samples"] = int(len(test_table))

    split_info["class_counts"] = {
        "train": count_labels(train_table),
        "val": count_labels(val_table),
        "test_internal": count_labels(test_table),
    }
    return split_info
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
def save_split_info(split_info, output_file_path):
    """Flatten the split summary into ``key,value`` rows and write them to CSV.

    Scalar entries are written first in a fixed order. Every class count then
    becomes one row keyed ``class_counts.<split>.<class>``.
    """
    scalar_keys = (
        "input_file",
        "seed",
        "train_ratio",
        "val_ratio",
        "test_ratio",
        "train_samples",
        "val_samples",
        "test_internal_samples",
    )
    info_rows = [{"key": scalar_key, "value": split_info[scalar_key]} for scalar_key in scalar_keys]
    info_rows.extend(
        {"key": f"class_counts.{split_name}.{class_name}", "value": class_count}
        for split_name, class_counts in split_info["class_counts"].items()
        for class_name, class_count in class_counts.items()
    )
    pd.DataFrame(info_rows).to_csv(output_file_path, index=False)
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
def main():
    """Create the fixed train/val/test split files and a summary CSV.

    Reads the sequence table named on the command line, splits it, writes the
    three split tables plus ``split_info.csv`` into the output directory, and
    prints what was saved.
    """
    args = parse_args()
    validate_split_ratios(args.train_ratio, args.val_ratio, args.test_ratio)

    output_directory_path = Path(args.output_dir)
    output_directory_path.mkdir(parents=True, exist_ok=True)

    sequence_table = pd.read_csv(Path(args.input_file))
    if "exercise_label" not in sequence_table.columns:
        raise ValueError("Input file must include an 'exercise_label' column.")

    train_table, val_table, test_table = build_split_tables(
        sequence_table=sequence_table,
        train_ratio=args.train_ratio,
        val_ratio=args.val_ratio,
        test_ratio=args.test_ratio,
        seed=args.seed,
    )

    # Pair each split with its destination so saving and reporting stay in sync.
    saved_splits = (
        (train_table, output_directory_path / "train_sequences.csv"),
        (val_table, output_directory_path / "val_sequences.csv"),
        (test_table, output_directory_path / "test_internal_sequences.csv"),
    )
    for split_table, split_path in saved_splits:
        save_split_table(split_table, split_path)

    info_output_path = output_directory_path / "split_info.csv"
    save_split_info(build_split_info(train_table, val_table, test_table, args), info_output_path)

    for split_table, split_path in saved_splits:
        print(f"Saved: {split_path} ({len(split_table)} rows)")
    print(f"Saved: {info_output_path}")
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
if __name__ == "__main__":
|
| 127 |
+
main()
|
scripts/preprocess/create_sequence_of_features.py
ADDED
|
@@ -0,0 +1,132 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
|
| 4 |
+
import numpy as np
|
| 5 |
+
import pandas as pd
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def parse_args():
    """Parse the command-line options of the sequence-building script.

    Returns:
        argparse.Namespace with ``input_dir``, ``input_pattern``,
        ``output_file``, ``sequence_length`` and ``stride``.
    """
    option_specs = (
        ("--input-dir", {"default": "data/interim"}),
        ("--input-pattern", {"default": "*_frame_features.csv"}),
        ("--output-file", {"default": "data/train_sequences_full.csv"}),
        ("--sequence-length", {"type": int, "default": 30}),
        ("--stride", {"type": int, "default": 30}),
    )
    cli = argparse.ArgumentParser()
    for option_flag, option_settings in option_specs:
        cli.add_argument(option_flag, **option_settings)
    return cli.parse_args()
|
| 16 |
+
|
| 17 |
+
# Load all per-frame feature tables from the specified directory and pattern
|
| 18 |
+
def load_frame_feature_tables(input_directory_path, input_pattern):
    """Load every per-frame feature CSV matching a glob pattern.

    Files are read in sorted path order. A file with no content or no data
    rows is skipped with a message instead of aborting the run or adding a
    column-only table to the later concatenation.

    Args:
        input_directory_path: ``Path`` of the directory to search.
        input_pattern: glob pattern relative to that directory.

    Returns:
        List of non-empty DataFrames, one per usable file.

    Raises:
        FileNotFoundError: if no file matches the pattern.
        ValueError: if files match but none contains any data rows.
    """
    frame_feature_paths = sorted(input_directory_path.glob(input_pattern))
    if not frame_feature_paths:
        raise FileNotFoundError(f"No files found in {input_directory_path} matching {input_pattern}")

    dataframes = []
    for frame_feature_path in frame_feature_paths:
        try:
            dataframe = pd.read_csv(frame_feature_path)
        except pd.errors.EmptyDataError:
            # A zero-byte file has no header for pandas to parse.
            print(f"Skipping empty feature file: {frame_feature_path}")
            continue
        if dataframe.empty:
            print(f"Skipping feature file without rows: {frame_feature_path}")
            continue
        dataframes.append(dataframe)

    if not dataframes:
        raise ValueError(f"No feature rows found in {input_directory_path} matching {input_pattern}")
    return dataframes
|
| 28 |
+
|
| 29 |
+
# Extract the names of the feature columns, excluding metadata columns
|
| 30 |
+
def get_frame_feature_column_names(frame_feature_table):
    """Return the table's column names in order, minus the metadata columns.

    The metadata columns are ``video_id``, ``exercise_label`` and
    ``frame_index``; every other column counts as a feature.
    """
    metadata_columns = frozenset(("video_id", "exercise_label", "frame_index"))
    feature_columns = []
    for candidate_name in frame_feature_table.columns:
        if candidate_name in metadata_columns:
            continue
        feature_columns.append(candidate_name)
    return feature_columns
|
| 36 |
+
|
| 37 |
+
# Build a list of flattened feature names for the sequence table based on the frame feature column names and sequence length
|
| 38 |
+
def build_flattened_sequence_feature_names(sequence_length, frame_feature_column_names):
    """Name every cell of a flattened sequence.

    Names are produced timestep by timestep as ``t<NN>_<feature>``, where
    ``NN`` is the zero-padded (at least two digits) timestep index. Within a
    timestep the features keep the order of ``frame_feature_column_names``.
    """
    return [
        f"t{timestep_index:02d}_{feature_name}"
        for timestep_index in range(sequence_length)
        for feature_name in frame_feature_column_names
    ]
|
| 44 |
+
|
| 45 |
+
# Create fixed-length sequences of frame features for a single video, returning a list of sequence rows with metadata and flattened features
|
| 46 |
+
def create_sequences_from_video_table(video_table, frame_feature_column_names, sequence_length, stride):
    """Cut one video's frame rows into fixed-length windows.

    Rows are ordered by ``frame_index`` and windows of ``sequence_length``
    consecutive rows are taken every ``stride`` rows. A video with fewer rows
    than ``sequence_length`` yields no windows.

    Args:
        video_table: DataFrame with ``video_id``, ``exercise_label``,
            ``frame_index`` and the feature columns.
        frame_feature_column_names: feature columns to include, in order.
        sequence_length: number of rows per window; must be positive.
        stride: row offset between window starts; must be positive.

    Returns:
        List of dicts with ``video_id``, ``exercise_label``,
        ``start_frame_index``, ``end_frame_index`` and ``flattened_features``
        (a 1-D float32 array, timestep-major).

    Raises:
        ValueError: if ``sequence_length`` or ``stride`` is not positive.
    """
    # Fail with a clear message instead of an IndexError on an empty slice
    # (sequence_length <= 0) or a silently empty result (negative stride).
    if sequence_length <= 0:
        raise ValueError(f"sequence_length must be a positive integer. Got {sequence_length}")
    if stride <= 0:
        raise ValueError(f"stride must be a positive integer. Got {stride}")

    sequence_rows = []
    sorted_video_table = video_table.sort_values("frame_index")
    max_start_index = len(sorted_video_table) - sequence_length
    if max_start_index < 0:
        return sequence_rows

    # NOTE(review): windows are taken over row positions, so they can span
    # gaps in frame_index when frames are missing from the table — confirm
    # this is acceptable for training.
    for start_index in range(0, max_start_index + 1, stride):
        sequence_slice = sorted_video_table.iloc[start_index:start_index + sequence_length]
        first_row = sequence_slice.iloc[0]
        sequence_feature_matrix = sequence_slice[frame_feature_column_names].to_numpy(dtype=np.float32)

        sequence_rows.append(
            {
                "video_id": first_row["video_id"],
                "exercise_label": first_row["exercise_label"],
                "start_frame_index": int(first_row["frame_index"]),
                "end_frame_index": int(sequence_slice.iloc[-1]["frame_index"]),
                "flattened_features": sequence_feature_matrix.reshape(-1),
            }
        )

    return sequence_rows
|
| 73 |
+
|
| 74 |
+
# Convert the list of per-frame feature tables into a single sequence table with flattened features for each sequence, including metadata columns for video ID, exercise label, and frame indices
|
| 75 |
+
def convert_frame_tables_to_sequence_table(frame_feature_tables, sequence_length, stride):
    """Turn per-frame feature tables into one table of flattened sequences.

    All frame tables are concatenated and grouped by ``video_id`` in order of
    first appearance. Each video is cut into windows, and every window becomes
    one output row: four metadata columns followed by one column per
    (timestep, feature) pair.

    The output columns are passed explicitly so that a run producing zero
    sequences still returns (and later saves) a table with a proper header
    rather than a column-less DataFrame.
    """
    merged_frame_feature_table = pd.concat(frame_feature_tables, ignore_index=True)
    frame_feature_column_names = get_frame_feature_column_names(merged_frame_feature_table)
    flattened_feature_names = build_flattened_sequence_feature_names(sequence_length, frame_feature_column_names)
    metadata_column_names = ["video_id", "exercise_label", "start_frame_index", "end_frame_index"]

    sequence_table_rows = []
    for _, video_table in merged_frame_feature_table.groupby("video_id", sort=False):
        video_sequences = create_sequences_from_video_table(
            video_table=video_table,
            frame_feature_column_names=frame_feature_column_names,
            sequence_length=sequence_length,
            stride=stride,
        )
        for sequence_row in video_sequences:
            output_row = {column_name: sequence_row[column_name] for column_name in metadata_column_names}
            output_row.update(zip(flattened_feature_names, sequence_row["flattened_features"]))
            sequence_table_rows.append(output_row)

    return pd.DataFrame(sequence_table_rows, columns=metadata_column_names + flattened_feature_names)
|
| 107 |
+
|
| 108 |
+
# Save the final sequence table to a CSV file
|
| 109 |
+
def save_sequence_table(sequence_table, output_file_path):
    """Write the sequence table to CSV without the index column.

    ``output_file_path`` must be a ``Path``; its parent directory is created
    when missing.
    """
    destination_directory = output_file_path.parent
    destination_directory.mkdir(parents=True, exist_ok=True)
    sequence_table.to_csv(output_file_path, index=False)
|
| 112 |
+
|
| 113 |
+
# Loads per-frame feature tables, converts them into fixed-length sequences with flattened features, and saves the resulting sequence table to a CSV file
|
| 114 |
+
def main():
    """Build the flattened sequence CSV from the per-frame feature CSVs.

    Loads the frame tables matching the configured pattern, converts them into
    fixed-length sequences, saves the result, and prints the output path and
    the number of sequences.
    """
    args = parse_args()
    output_file_path = Path(args.output_file)

    frame_feature_tables = load_frame_feature_tables(Path(args.input_dir), args.input_pattern)
    sequence_table = convert_frame_tables_to_sequence_table(
        frame_feature_tables,
        args.sequence_length,
        args.stride,
    )
    save_sequence_table(sequence_table, output_file_path)

    print(f"Saved: {output_file_path}")
    print(f"Sequences: {len(sequence_table)}")
|
| 129 |
+
|
| 130 |
+
|
| 131 |
+
if __name__ == "__main__":
|
| 132 |
+
main()
|
scripts/preprocess/extract_features.py
ADDED
|
@@ -0,0 +1,301 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
|
| 4 |
+
import cv2
|
| 5 |
+
import mediapipe as mp
|
| 6 |
+
import numpy as np
|
| 7 |
+
import pandas as pd
|
| 8 |
+
from tqdm import tqdm
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def parse_args():
    """Parse the command-line options of the feature-extraction script.

    Returns:
        argparse.Namespace with ``dataset_root``, ``input_datasets`` (one or
        more folder names relative to the root), ``output_dir`` and
        ``min_visibility``.
    """
    default_dataset_folders = [
        "final_kaggle_with_additional_video",
        "synthetic_dataset/synthetic_dataset",
        "similar_dataset",
    ]
    cli = argparse.ArgumentParser()
    cli.add_argument("--dataset-root", default="data/raw/real-time-exercise-recognition-dataset")
    cli.add_argument("--input-datasets", nargs="+", default=default_dataset_folders)
    cli.add_argument("--output-dir", default="data/interim")
    cli.add_argument("--min-visibility", type=float, default=0.5)
    return cli.parse_args()
|
| 18 |
+
|
| 19 |
+
# Build a mapping of landmark names to their corresponding MediaPipe indices
|
| 20 |
+
def build_landmark_indices(mp_pose):
    """Map each tracked landmark name to its ``PoseLandmark`` index.

    Args:
        mp_pose: object exposing a ``PoseLandmark`` attribute whose members
            carry a ``.value`` (in this script, ``mp.solutions.pose``).

    Returns:
        Dict from landmark name to the ``.value`` of the same-named member, in
        the order listed below.
    """
    tracked_landmark_names = (
        "LEFT_SHOULDER",
        "RIGHT_SHOULDER",
        "LEFT_HIP",
        "RIGHT_HIP",
        "LEFT_KNEE",
        "RIGHT_KNEE",
        "LEFT_ELBOW",
        "RIGHT_ELBOW",
        "LEFT_WRIST",
        "RIGHT_WRIST",
        "LEFT_ANKLE",
        "RIGHT_ANKLE",
        "LEFT_HEEL",
        "RIGHT_HEEL",
        "LEFT_FOOT_INDEX",
        "RIGHT_FOOT_INDEX",
        "LEFT_PINKY",
        "RIGHT_PINKY",
        "LEFT_INDEX",
        "RIGHT_INDEX",
        "LEFT_THUMB",
        "RIGHT_THUMB",
    )
    pose_landmark_enum = mp_pose.PoseLandmark
    return {
        landmark_name: getattr(pose_landmark_enum, landmark_name).value
        for landmark_name in tracked_landmark_names
    }
|
| 45 |
+
|
| 46 |
+
# Define the list of landmark names we want to extract coordinates for
|
| 47 |
+
def get_coordinate_landmark_names():
    """Return the landmark names whose coordinates are extracted.

    The list has 22 entries: for each joint, the LEFT name followed by the
    RIGHT name.
    """
    joint_names = (
        "SHOULDER",
        "HIP",
        "KNEE",
        "ELBOW",
        "WRIST",
        "ANKLE",
        "HEEL",
        "FOOT_INDEX",
        "PINKY",
        "INDEX",
        "THUMB",
    )
    return [f"{side}_{joint_name}" for joint_name in joint_names for side in ("LEFT", "RIGHT")]
|
| 72 |
+
|
| 73 |
+
# Define the triplets of landmarks for which we want to calculate joint angles
|
| 74 |
+
def get_angle_triplets():
    """Return the landmark triplets for which joint angles are computed.

    Each triplet is ``(point_a, vertex, point_c)``. Every side-agnostic chain
    below is emitted for the LEFT side and then for the RIGHT side, giving 12
    triplets.
    """
    joint_chains = (
        ("HIP", "SHOULDER", "ELBOW"),
        ("SHOULDER", "ELBOW", "WRIST"),
        ("HIP", "KNEE", "ANKLE"),
        ("SHOULDER", "HIP", "KNEE"),
        ("KNEE", "ANKLE", "HEEL"),
        ("ANKLE", "HEEL", "FOOT_INDEX"),
    )
    return [
        tuple(f"{side}_{joint_name}" for joint_name in joint_chain)
        for joint_chain in joint_chains
        for side in ("LEFT", "RIGHT")
    ]
|
| 89 |
+
|
| 90 |
+
# function to recursively list video files in a dataset folder, filtering by video file extensions
|
| 91 |
+
def list_video_files(dataset_path):
    """Recursively list the video files below ``dataset_path``.

    A file counts as a video when its suffix, compared case-insensitively, is
    one of ``.mp4``, ``.avi``, ``.mov``, ``.m4v`` or ``.asf``. The comparison
    ignores case so that names such as ``clip.MP4`` or ``clip.Mov`` are not
    silently left out of the dataset.

    Returns:
        Sorted list of matching ``Path`` objects.
    """
    allowed_suffixes = {".mp4", ".avi", ".mov", ".m4v", ".asf"}
    return sorted(
        file_path
        for file_path in dataset_path.rglob("*")
        if file_path.is_file() and file_path.suffix.lower() in allowed_suffixes
    )
|
| 98 |
+
|
| 99 |
+
# normalise exercise labels by mapping known variations to a standard label, and lowercasing/stripping whitespace for consistency
|
| 100 |
+
def normalize_exercise_label(raw_label):
    """Return the canonical exercise label for a raw folder name.

    The label is stripped and lower-cased. The known aliases "hammer curl" and
    "bicept curl" are then mapped to "barbell biceps curl". Any other label is
    returned in its cleaned form.
    """
    label_aliases = {
        "hammer curl": "barbell biceps curl",
        "bicept curl": "barbell biceps curl",
    }
    cleaned_label = raw_label.strip().lower()
    if cleaned_label in label_aliases:
        return label_aliases[cleaned_label]
    return cleaned_label
|
| 107 |
+
|
| 108 |
+
# Check if a MediaPipe landmark is valid based on its visibility score compared to a minimum threshold
|
| 109 |
+
def is_landmark_valid(landmark, min_visibility):
    """Tell whether ``landmark.visibility`` reaches the ``min_visibility`` threshold."""
    return min_visibility <= landmark.visibility
|
| 111 |
+
|
| 112 |
+
# Return a placeholder point (0, 0, 0) for missing or invalid landmarks to maintain consistent feature dimensions
|
| 113 |
+
def get_placeholder_point():
    """Return a fresh float32 ``(0, 0, 0)`` point used for missing or invalid landmarks."""
    return np.zeros(3, dtype=np.float32)
|
| 115 |
+
|
| 116 |
+
# Calculate the angle in degrees between three points (A, B, C) where B is the vertex point. If any point is invalid (all zeros), return 0 degrees.
|
| 117 |
+
def calculate_angle_degrees(point_a, point_b, point_c):
    """Return the angle at ``point_b`` between ``point_a`` and ``point_c``, in degrees.

    Only the first two components (x, y) of each point are used. The result is
    0.0 when any point is numerically all zeros (the placeholder for a missing
    landmark) or when either arm of the angle has zero length.
    """
    if any(np.allclose(point, 0.0) for point in (point_a, point_b, point_c)):
        return 0.0

    vertex_xy = point_b[:2]
    arm_to_a = point_a[:2] - vertex_xy
    arm_to_c = point_c[:2] - vertex_xy
    length_product = np.linalg.norm(arm_to_a) * np.linalg.norm(arm_to_c)
    if length_product == 0.0:
        return 0.0

    # Clip before arccos so rounding error cannot push the cosine out of [-1, 1].
    cosine_value = np.clip(np.dot(arm_to_a, arm_to_c) / length_product, -1.0, 1.0)
    return float(np.degrees(np.arccos(cosine_value)))
|
| 132 |
+
|
| 133 |
+
# Extract the specified landmarks from a MediaPipe pose estimation result for a single frame, checking visibility and using placeholders for missing landmarks. Returns a dictionary of landmark names to their (x, y, z) coordinates.
|
| 134 |
+
def extract_frame_landmarks(media_pipe_results, landmark_indices, coordinate_landmark_names, min_visibility):
    """Collect the requested landmarks of one frame as float32 ``(x, y, z)`` arrays.

    Args:
        media_pipe_results: pose result whose ``pose_landmarks.landmark`` is
            indexable by landmark index.
        landmark_indices: dict from landmark name to index.
        coordinate_landmark_names: names to extract, in order.
        min_visibility: threshold passed to ``is_landmark_valid``.

    Returns:
        An empty dict when the result has no pose landmarks. Otherwise a dict
        from each requested name to its coordinates, or to the placeholder
        point when the landmark fails the visibility check.
    """
    pose_landmarks = media_pipe_results.pose_landmarks
    if not pose_landmarks:
        return {}

    landmarks_by_name = {}
    for landmark_name in coordinate_landmark_names:
        detected_landmark = pose_landmarks.landmark[landmark_indices[landmark_name]]
        if not is_landmark_valid(detected_landmark, min_visibility):
            landmarks_by_name[landmark_name] = get_placeholder_point()
            continue
        landmarks_by_name[landmark_name] = np.array(
            [detected_landmark.x, detected_landmark.y, detected_landmark.z],
            dtype=np.float32,
        )
    return landmarks_by_name
|
| 152 |
+
|
| 153 |
+
# Check if all essential landmarks for a given body side (LEFT or RIGHT) are valid (not all zeros) to determine if we can trust the pose estimation for that side. This helps filter out frames where the pose estimation failed for one side of the body.
|
| 154 |
+
def has_valid_body_side(extracted_landmarks, side_prefix):
    """Tell whether every essential landmark of one body side is present.

    The essential landmarks are the shoulder, elbow, wrist, hip, knee and
    ankle of the side named by ``side_prefix`` ("LEFT" or "RIGHT"). A landmark
    counts as missing when its point is numerically all zeros.
    """
    essential_joints = ("SHOULDER", "ELBOW", "WRIST", "HIP", "KNEE", "ANKLE")
    essential_landmark_names = (f"{side_prefix}_{joint_name}" for joint_name in essential_joints)
    return not any(
        np.allclose(extracted_landmarks[landmark_name], 0.0)
        for landmark_name in essential_landmark_names
    )
|
| 161 |
+
|
| 162 |
+
# Build a feature row dictionary for a single frame, including the video identifier, exercise label, frame index, landmark coordinates, and calculated angles based on the specified triplets. This function combines all the extracted information into a structured format for later saving to CSV.
|
| 163 |
+
def build_feature_row(
    extracted_landmarks,
    coordinate_landmark_names,
    angle_triplets,
    frame_index,
    video_identifier,
    exercise_label,
):
    """Assemble the feature dictionary of one frame.

    The row starts with ``video_id``, ``exercise_label`` and ``frame_index``.
    It is followed by ``<landmark>_x/_y/_z`` for every requested landmark
    (names lower-cased), then one ``angle_<a>_<b>_<c>`` entry per triplet
    computed with ``calculate_angle_degrees``.
    """
    feature_row = {
        "video_id": video_identifier,
        "exercise_label": exercise_label,
        "frame_index": frame_index,
    }

    for landmark_name in coordinate_landmark_names:
        landmark_value = extracted_landmarks[landmark_name]
        column_prefix = landmark_name.lower()
        for axis_position, axis_name in enumerate(("x", "y", "z")):
            feature_row[f"{column_prefix}_{axis_name}"] = landmark_value[axis_position]

    for triplet in angle_triplets:
        point_a, point_b, point_c = triplet
        angle_name = "angle_" + "_".join(point_name.lower() for point_name in triplet)
        feature_row[angle_name] = calculate_angle_degrees(
            extracted_landmarks[point_a],
            extracted_landmarks[point_b],
            extracted_landmarks[point_c],
        )

    return feature_row
|
| 192 |
+
|
| 193 |
+
# Process a single video file to extract per-frame features. For each frame, it runs pose estimation, extracts landmarks, checks validity, and builds feature rows for valid frames. It returns a list of feature rows for the entire video.
|
| 194 |
+
def extract_features_from_video(
    video_path,
    exercise_label,
    dataset_name,
    pose_estimator,
    landmark_indices,
    coordinate_landmark_names,
    angle_triplets,
    min_visibility,
):
    """Extract one feature row per usable frame of a video.

    Every frame is converted from BGR to RGB and passed to
    ``pose_estimator.process``. A frame contributes a row only when landmarks
    were detected and at least one body side (LEFT or RIGHT) has all its
    essential landmarks. ``frame_index`` counts every frame read, so skipped
    frames leave gaps in the indices of the returned rows.

    Returns:
        List of feature-row dicts. The list is empty when the capture cannot
        be opened or no frame qualifies.
    """
    # NOTE(review): the identifier uses the normalised label and the file
    # stem only, so two files with the same stem whose folders normalise to
    # the same label would share a video_id — verify that cannot occur.
    video_identifier = f"{dataset_name}/{exercise_label}/{video_path.stem}"
    extracted_rows = []
    frame_index = 0

    video_capture = cv2.VideoCapture(str(video_path))
    try:
        while video_capture.isOpened():
            frame_read_success, frame_bgr = video_capture.read()
            if not frame_read_success:
                break

            frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
            pose_results = pose_estimator.process(frame_rgb)
            extracted_landmarks = extract_frame_landmarks(
                pose_results,
                landmark_indices,
                coordinate_landmark_names,
                min_visibility,
            )

            if extracted_landmarks:
                left_side_is_valid = has_valid_body_side(extracted_landmarks, "LEFT")
                right_side_is_valid = has_valid_body_side(extracted_landmarks, "RIGHT")

                if left_side_is_valid or right_side_is_valid:
                    extracted_rows.append(
                        build_feature_row(
                            extracted_landmarks=extracted_landmarks,
                            coordinate_landmark_names=coordinate_landmark_names,
                            angle_triplets=angle_triplets,
                            frame_index=frame_index,
                            video_identifier=video_identifier,
                            exercise_label=exercise_label,
                        )
                    )

            frame_index += 1
    finally:
        # Release the capture even when conversion or pose estimation raises.
        video_capture.release()

    return extracted_rows
|
| 243 |
+
|
| 244 |
+
# Save the list of feature row dictionaries to a CSV file using pandas, ensuring the output directory exists. Each row in the CSV corresponds to a single frame's extracted features.
|
| 245 |
+
def save_rows_to_csv(rows, output_file_path):
    """Write a list of feature-row dicts to CSV, one line per row, without an index column.

    ``output_file_path`` must be a ``Path``; its parent directory is created
    when missing.
    """
    destination_directory = output_file_path.parent
    destination_directory.mkdir(parents=True, exist_ok=True)
    pd.DataFrame(rows).to_csv(output_file_path, index=False)
|
| 249 |
+
|
| 250 |
+
# Sets up the main execution flow: parses arguments, initializes MediaPipe pose estimator, iterates through specified datasets and videos, extracts features for each video, and saves the results to CSV files in the output directory.
|
| 251 |
+
def main():
    """Extract per-frame pose features for every configured dataset folder.

    For each dataset folder that exists under the dataset root, all videos are
    processed with one shared pose estimator. The collected rows are written
    to ``<output-dir>/<dataset>_frame_features.csv``, with ``/`` in the
    dataset name replaced by ``_``. Missing folders are reported and skipped.
    """
    args = parse_args()
    dataset_root_path = Path(args.dataset_root)
    output_directory_path = Path(args.output_dir)

    media_pipe_pose = mp.solutions.pose
    landmark_indices = build_landmark_indices(media_pipe_pose)
    coordinate_landmark_names = get_coordinate_landmark_names()
    angle_triplets = get_angle_triplets()

    pose_options = {
        "static_image_mode": False,
        "model_complexity": 1,
        "enable_segmentation": False,
        "min_detection_confidence": 0.5,
        "min_tracking_confidence": 0.5,
    }
    with media_pipe_pose.Pose(**pose_options) as pose_estimator:
        for dataset_name in args.input_datasets:
            dataset_path = dataset_root_path / dataset_name
            if not dataset_path.exists():
                print(f"Skipping missing dataset folder: {dataset_path}")
                continue

            print(f"Processing dataset: {dataset_name}")
            dataset_rows = []
            for video_path in tqdm(list_video_files(dataset_path), desc=f"Videos in {dataset_name}"):
                # The exercise label is the name of the folder holding the video.
                dataset_rows.extend(
                    extract_features_from_video(
                        video_path=video_path,
                        exercise_label=normalize_exercise_label(video_path.parent.name),
                        dataset_name=dataset_name,
                        pose_estimator=pose_estimator,
                        landmark_indices=landmark_indices,
                        coordinate_landmark_names=coordinate_landmark_names,
                        angle_triplets=angle_triplets,
                        min_visibility=args.min_visibility,
                    )
                )

            output_file_name = f"{dataset_name.replace('/', '_')}_frame_features.csv"
            output_file_path = output_directory_path / output_file_name
            save_rows_to_csv(dataset_rows, output_file_path)
            print(f"Saved: {output_file_path} ({len(dataset_rows)} rows)")
|
| 298 |
+
|
| 299 |
+
|
| 300 |
+
if __name__ == "__main__":
|
| 301 |
+
main()
|
scripts/realtime_eval/evaluate_realtime_webcam.py
ADDED
|
@@ -0,0 +1,412 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
import sys
|
| 3 |
+
import time
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
|
| 6 |
+
import cv2
|
| 7 |
+
import joblib
|
| 8 |
+
import mediapipe as mp
|
| 9 |
+
import numpy as np
|
| 10 |
+
import pandas as pd
|
| 11 |
+
import torch
|
| 12 |
+
from torch import nn
|
| 13 |
+
|
| 14 |
+
PROJECT_ROOT = Path(__file__).resolve().parents[2]
|
| 15 |
+
if str(PROJECT_ROOT) not in sys.path:
|
| 16 |
+
sys.path.insert(0, str(PROJECT_ROOT))
|
| 17 |
+
|
| 18 |
+
from scripts.evaluate.rep_counting_methods import EXERCISE_CONFIGS, FixedThresholdFSMCounter, SmoothingBuffer, extract_primary_angle, normalize_exercise_name
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def parse_args(argv=None):
    """Parse the command-line options of the realtime webcam evaluation.

    Args:
        argv: Optional list of argument strings. ``None`` (the default) makes
            argparse read ``sys.argv[1:]``, which is the original behaviour;
            passing a list allows the parser to be driven programmatically.

    Returns:
        argparse.Namespace with the attributes ``model_name``, ``models_root``,
        ``output_dir``, ``sequence_length``, ``feature_count``,
        ``camera_index``, ``run_seconds`` and ``prediction_interval``.

    Raises:
        SystemExit: On unknown/missing options or when ``--sequence-length`` or
            ``--feature-count`` is not a positive integer.
    """
    parser = argparse.ArgumentParser(
        description="Evaluate a trained sequence classifier on a live webcam feed.",
    )
    parser.add_argument(
        "--model-name",
        choices=["bilstm", "lstm", "gru", "tcn", "cnn_bilstm", "st_gcn"],
        required=True,
        help="Architecture to evaluate.",
    )
    parser.add_argument("--models-root", default="models", help="Directory holding one sub-folder per model.")
    parser.add_argument("--output-dir", default="results/eval_realtime", help="Directory for the CSV outputs.")
    parser.add_argument("--sequence-length", type=int, default=30, help="Frames per classification window.")
    parser.add_argument("--feature-count", type=int, default=78, help="Features extracted per frame.")
    parser.add_argument("--camera-index", type=int, default=0, help="OpenCV capture device index.")
    parser.add_argument("--run-seconds", type=int, default=75, help="Session length in seconds.")
    parser.add_argument("--prediction-interval", type=float, default=1.0, help="Minimum seconds between predictions.")
    args = parser.parse_args(argv)

    # Reject sizes that would otherwise only fail later, inside the capture
    # loop, when the window is reshaped for the scaler.
    for option_name in ("sequence_length", "feature_count"):
        if getattr(args, option_name) < 1:
            parser.error(f"--{option_name.replace('_', '-')} must be a positive integer")
    return args
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
class BidirectionalLstmClassifier(nn.Module):
    """Two-layer bidirectional LSTM followed by dropout and a linear head."""

    def __init__(self, feature_count, hidden_size, class_count, dropout_probability):
        super().__init__()
        self.bilstm = nn.LSTM(
            input_size=feature_count,
            hidden_size=hidden_size,
            num_layers=2,
            batch_first=True,
            dropout=dropout_probability,
            bidirectional=True,
        )
        self.dropout = nn.Dropout(dropout_probability)
        # Both directions are concatenated in the LSTM output, hence 2x width.
        self.classifier = nn.Linear(2 * hidden_size, class_count)

    def forward(self, input_sequence):
        """Return class logits computed from the output at the last timestep.

        ``input_sequence`` is batch-first: (batch, time, feature).
        """
        sequence_output = self.bilstm(input_sequence)[0]
        last_step = sequence_output[:, -1, :]
        return self.classifier(self.dropout(last_step))
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
class LstmClassifier(nn.Module):
    """Two-layer unidirectional LSTM followed by dropout and a linear head."""

    def __init__(self, feature_count, hidden_size, class_count, dropout_probability):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=feature_count,
            hidden_size=hidden_size,
            num_layers=2,
            batch_first=True,
            dropout=dropout_probability,
            bidirectional=False,
        )
        self.dropout = nn.Dropout(dropout_probability)
        self.classifier = nn.Linear(hidden_size, class_count)

    def forward(self, input_sequence):
        """Return class logits computed from the output at the last timestep.

        ``input_sequence`` is batch-first: (batch, time, feature).
        """
        sequence_output = self.lstm(input_sequence)[0]
        last_step = sequence_output[:, -1, :]
        return self.classifier(self.dropout(last_step))
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
class GruClassifier(nn.Module):
    """Two-layer unidirectional GRU followed by dropout and a linear head."""

    def __init__(self, feature_count, hidden_size, class_count, dropout_probability):
        super().__init__()
        self.gru = nn.GRU(
            input_size=feature_count,
            hidden_size=hidden_size,
            num_layers=2,
            batch_first=True,
            dropout=dropout_probability,
            bidirectional=False,
        )
        self.dropout = nn.Dropout(dropout_probability)
        self.classifier = nn.Linear(hidden_size, class_count)

    def forward(self, input_sequence):
        """Return class logits computed from the output at the last timestep.

        ``input_sequence`` is batch-first: (batch, time, feature).
        """
        sequence_output = self.gru(input_sequence)[0]
        last_step = sequence_output[:, -1, :]
        return self.classifier(self.dropout(last_step))
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
class Chomp1d(nn.Module):
    """Drop a fixed number of trailing entries from the last axis of a 3-D tensor."""

    def __init__(self, chomp_size):
        super().__init__()
        self.chomp_size = chomp_size

    def forward(self, input_tensor):
        """Return ``input_tensor`` without its final ``chomp_size`` entries."""
        trim = self.chomp_size
        if trim != 0:
            return input_tensor[:, :, :-trim].contiguous()
        # A ``:-0`` slice would be empty, so hand the tensor back untouched.
        return input_tensor
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
class TemporalBlock(nn.Module):
    """Residual block of two dilated 1-D convolutions with trimmed padding.

    Each convolution pads by ``(kernel_size - 1) * dilation`` and the matching
    ``Chomp1d`` removes that many trailing steps, so the time axis keeps its
    length through the block.
    """

    def __init__(self, input_channels, output_channels, kernel_size, dilation, dropout):
        super().__init__()
        trimmed_padding = dilation * (kernel_size - 1)
        conv_options = {"padding": trimmed_padding, "dilation": dilation}

        self.conv1 = nn.Conv1d(input_channels, output_channels, kernel_size, **conv_options)
        self.chomp1 = Chomp1d(trimmed_padding)
        self.relu1 = nn.ReLU()
        self.dropout1 = nn.Dropout(dropout)

        self.conv2 = nn.Conv1d(output_channels, output_channels, kernel_size, **conv_options)
        self.chomp2 = Chomp1d(trimmed_padding)
        self.relu2 = nn.ReLU()
        self.dropout2 = nn.Dropout(dropout)

        # A 1x1 convolution aligns the channel count of the skip path when the
        # block changes width; otherwise the input is added back unchanged.
        if input_channels != output_channels:
            self.downsample = nn.Conv1d(input_channels, output_channels, kernel_size=1)
        else:
            self.downsample = None
        self.final_relu = nn.ReLU()

    def forward(self, input_tensor):
        """Apply both convolution stages and add the (projected) input back."""
        hidden = self.conv1(input_tensor)
        hidden = self.dropout1(self.relu1(self.chomp1(hidden)))
        hidden = self.conv2(hidden)
        hidden = self.dropout2(self.relu2(self.chomp2(hidden)))

        if self.downsample is None:
            skip = input_tensor
        else:
            skip = self.downsample(input_tensor)
        return self.final_relu(hidden + skip)
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
class TcnClassifier(nn.Module):
    """Temporal convolutional classifier: 1x1 projection, three dilated blocks, linear head."""

    def __init__(self, feature_count, class_count, channel_width, kernel_size, dropout):
        super().__init__()
        self.input_projection = nn.Conv1d(feature_count, channel_width, kernel_size=1)
        # Dilations 1, 2 and 4 for the three stacked blocks.
        self.block1 = TemporalBlock(channel_width, channel_width, kernel_size, dilation=1, dropout=dropout)
        self.block2 = TemporalBlock(channel_width, channel_width, kernel_size, dilation=2, dropout=dropout)
        self.block3 = TemporalBlock(channel_width, channel_width, kernel_size, dilation=4, dropout=dropout)
        self.classifier = nn.Linear(channel_width, class_count)

    def forward(self, input_sequence):
        """Return class logits taken from the last time step of the block stack.

        The input is transposed from (batch, time, feature) to the
        channels-first layout expected by ``nn.Conv1d``.
        """
        hidden = self.input_projection(input_sequence.transpose(1, 2))
        for block in (self.block1, self.block2, self.block3):
            hidden = block(hidden)
        return self.classifier(hidden[:, :, -1])
|
| 119 |
+
|
| 120 |
+
|
| 121 |
+
class CnnBiLstmClassifier(nn.Module):
    """1-D convolution front end feeding a two-layer bidirectional LSTM and a linear head."""

    def __init__(self, feature_count, class_count, cnn_filters, cnn_kernel_size, lstm_units, dropout_probability):
        super().__init__()
        self.conv1d = nn.Conv1d(
            feature_count,
            cnn_filters,
            kernel_size=cnn_kernel_size,
            padding=cnn_kernel_size // 2,
        )
        self.relu = nn.ReLU()
        self.dropout1 = nn.Dropout(dropout_probability)
        self.bilstm = nn.LSTM(
            input_size=cnn_filters,
            hidden_size=lstm_units,
            num_layers=2,
            batch_first=True,
            dropout=dropout_probability,
            bidirectional=True,
        )
        self.dropout2 = nn.Dropout(dropout_probability)
        # Both LSTM directions are concatenated, hence 2x width.
        self.classifier = nn.Linear(2 * lstm_units, class_count)

    def forward(self, input_sequence):
        """Return class logits for a batch-first (batch, time, feature) input."""
        # Conv1d works channels-first; the LSTM is batch-first, so transpose
        # into the convolution and back out of it.
        channels_first = input_sequence.transpose(1, 2)
        convolved = self.dropout1(self.relu(self.conv1d(channels_first)))
        recurrent_output, _ = self.bilstm(convolved.transpose(1, 2))
        last_step = recurrent_output[:, -1, :]
        return self.classifier(self.dropout2(last_step))
|
| 135 |
+
|
| 136 |
+
|
| 137 |
+
class GraphConvolution(nn.Module):
    """1x1 channel projection followed by mixing along the last axis with an adjacency matrix."""

    def __init__(self, input_channels, output_channels):
        super().__init__()
        self.projection = nn.Conv2d(input_channels, output_channels, kernel_size=1)

    def forward(self, input_tensor, adjacency_matrix):
        """Project the channels, then contract the last axis with ``adjacency_matrix``."""
        projected = self.projection(input_tensor)
        # Indices: n=batch, c=channel, t=time, v=source node, w=target node.
        return torch.einsum("nctv,vw->nctw", projected, adjacency_matrix)
|
| 144 |
+
|
| 145 |
+
|
| 146 |
+
class StGcnBlock(nn.Module):
    """Graph convolution plus a 9-tap temporal convolution with a residual connection."""

    def __init__(self, input_channels, output_channels, dropout):
        super().__init__()
        self.graph_convolution = GraphConvolution(input_channels, output_channels)

        temporal_layers = [
            nn.BatchNorm2d(output_channels),
            nn.ReLU(inplace=True),
            # Kernel (9, 1) with padding (4, 0) keeps both spatial axes the same size.
            nn.Conv2d(output_channels, output_channels, kernel_size=(9, 1), padding=(4, 0)),
            nn.BatchNorm2d(output_channels),
            nn.Dropout(dropout),
        ]
        self.temporal_convolution = nn.Sequential(*temporal_layers)

        # The skip path needs a projection only when the channel count changes.
        if input_channels != output_channels:
            self.residual = nn.Sequential(
                nn.Conv2d(input_channels, output_channels, kernel_size=1),
                nn.BatchNorm2d(output_channels),
            )
        else:
            self.residual = nn.Identity()
        self.activation = nn.ReLU(inplace=True)

    def forward(self, input_tensor, adjacency_matrix):
        """Return ``ReLU(temporal(graph(x, A)) + residual(x))``."""
        skip = self.residual(input_tensor)
        mixed = self.graph_convolution(input_tensor, adjacency_matrix)
        transformed = self.temporal_convolution(mixed)
        return self.activation(transformed + skip)
|
| 164 |
+
|
| 165 |
+
|
| 166 |
+
class StGcnClassifier(nn.Module):
    """ST-GCN style classifier that treats every input feature as a graph node.

    The adjacency between features is learned: ``adjacency_logits`` starts as
    the identity matrix and is row-normalised with softmax on every forward
    pass.
    """

    def __init__(self, feature_count, class_count, dropout):
        super().__init__()
        self.input_batch_norm = nn.BatchNorm1d(feature_count)
        # Assigning an nn.Parameter registers it under the attribute name.
        self.adjacency_logits = nn.Parameter(torch.eye(feature_count))
        self.block1 = StGcnBlock(1, 64, dropout)
        self.block2 = StGcnBlock(64, 64, dropout)
        self.block3 = StGcnBlock(64, 128, dropout)
        self.classifier = nn.Linear(128, class_count)

    def forward(self, input_sequence):
        """Return class logits for a (batch, time, feature) input."""
        batch_size, sequence_length, feature_count = input_sequence.shape

        # BatchNorm1d expects (N, features): fold time into the batch axis,
        # normalise, then restore the original layout.
        flattened = input_sequence.reshape(batch_size * sequence_length, feature_count)
        normalized = self.input_batch_norm(flattened).reshape(batch_size, sequence_length, feature_count)

        adjacency = torch.softmax(self.adjacency_logits, dim=1)
        # Add a singleton channel axis: (batch, 1, time, feature).
        graph_tensor = normalized.unsqueeze(1)
        for block in (self.block1, self.block2, self.block3):
            graph_tensor = block(graph_tensor, adjacency)

        # Average over time, then over nodes, leaving (batch, channels).
        pooled = graph_tensor.mean(dim=2).mean(dim=2)
        return self.classifier(pooled)
|
| 184 |
+
|
| 185 |
+
|
| 186 |
+
# Registry of the supported architectures, keyed by the --model-name CLI value.
# Every entry lists the artefact file names looked up under
# <models-root>/<model-name>/weights ("weight", "scaler", "encoder") and a
# "builder" callable taking (feature_count, class_count) that returns a freshly
# constructed module with hard-coded hyperparameters.
# NOTE(review): the hidden sizes and dropout values presumably mirror the
# configuration the saved checkpoints were trained with - confirm before
# changing any of them, since load_state_dict needs matching shapes.
MODEL_SPECS = {
    "bilstm": {"weight": "bidirectionallstm_model.pt", "scaler": "bidirectionallstm_scaler.pkl", "encoder": "bidirectionallstm_label_encoder.pkl", "builder": lambda f, c: BidirectionalLstmClassifier(f, 73, c, 0.2174)},
    "lstm": {"weight": "lstm_model.pt", "scaler": "lstm_scaler.pkl", "encoder": "lstm_label_encoder.pkl", "builder": lambda f, c: LstmClassifier(f, 117, c, 0.3829)},
    "gru": {"weight": "gru_model.pt", "scaler": "gru_scaler.pkl", "encoder": "gru_label_encoder.pkl", "builder": lambda f, c: GruClassifier(f, 96, c, 0.2)},
    "tcn": {"weight": "tcn_model.pt", "scaler": "tcn_scaler.pkl", "encoder": "tcn_label_encoder.pkl", "builder": lambda f, c: TcnClassifier(f, c, 128, 3, 0.2)},
    "cnn_bilstm": {"weight": "cnn_bilstm_model.pt", "scaler": "cnn_bilstm_scaler.pkl", "encoder": "cnn_bilstm_label_encoder.pkl", "builder": lambda f, c: CnnBiLstmClassifier(f, c, 128, 3, 73, 0.2)},
    "st_gcn": {"weight": "st_gcn_model.pt", "scaler": "st_gcn_scaler.pkl", "encoder": "st_gcn_label_encoder.pkl", "builder": lambda f, c: StGcnClassifier(f, c, 0.2)},
}
|
| 194 |
+
|
| 195 |
+
|
| 196 |
+
def load_pose_module():
    """Return MediaPipe's ``solutions.pose`` module."""
    pose_module = mp.solutions.pose
    return pose_module
|
| 198 |
+
|
| 199 |
+
|
| 200 |
+
def build_landmark_indices(mp_pose):
    """Map each tracked landmark name to its ``mp_pose.PoseLandmark`` value.

    The returned dict preserves the order of the names below; that order
    determines the layout of the per-frame feature vector built from it.
    """
    tracked_names = (
        "LEFT_SHOULDER", "RIGHT_SHOULDER",
        "LEFT_HIP", "RIGHT_HIP",
        "LEFT_KNEE", "RIGHT_KNEE",
        "LEFT_ELBOW", "RIGHT_ELBOW",
        "LEFT_WRIST", "RIGHT_WRIST",
        "LEFT_ANKLE", "RIGHT_ANKLE",
        "LEFT_HEEL", "RIGHT_HEEL",
        "LEFT_FOOT_INDEX", "RIGHT_FOOT_INDEX",
        "LEFT_PINKY", "RIGHT_PINKY",
        "LEFT_INDEX", "RIGHT_INDEX",
        "LEFT_THUMB", "RIGHT_THUMB",
    )
    landmark_enum = mp_pose.PoseLandmark
    indices = {}
    for landmark_name in tracked_names:
        indices[landmark_name] = landmark_enum[landmark_name].value
    return indices
|
| 208 |
+
|
| 209 |
+
|
| 210 |
+
def get_angle_triplets():
    """Return the twelve (first, vertex, last) landmark-name triplets used for joint angles.

    Six joint chains are listed once and expanded for the left and right side;
    the result alternates LEFT/RIGHT for each chain in turn.
    """
    joint_chains = (
        ("HIP", "SHOULDER", "ELBOW"),
        ("SHOULDER", "ELBOW", "WRIST"),
        ("HIP", "KNEE", "ANKLE"),
        ("SHOULDER", "HIP", "KNEE"),
        ("KNEE", "ANKLE", "HEEL"),
        ("ANKLE", "HEEL", "FOOT_INDEX"),
    )
    triplets = []
    for chain in joint_chains:
        for side in ("LEFT", "RIGHT"):
            triplets.append(tuple(f"{side}_{joint}" for joint in chain))
    return triplets
|
| 219 |
+
|
| 220 |
+
|
| 221 |
+
def calculate_angle_degrees(point_a, point_b, point_c):
    """Return the angle at ``point_b`` in degrees, measured in the x/y plane.

    Only the first two components of each point are used. ``0.0`` is returned
    when any point is (numerically) all zeros - the placeholder used for
    landmarks that were not visible - or when either arm has zero length.
    """
    for point in (point_a, point_b, point_c):
        if np.allclose(point, 0.0):
            return 0.0

    vertex_xy = point_b[:2]
    arm_a = point_a[:2] - vertex_xy
    arm_c = point_c[:2] - vertex_xy

    norm_product = np.linalg.norm(arm_a) * np.linalg.norm(arm_c)
    if norm_product == 0.0:
        return 0.0

    # Clip guards arccos against rounding slightly outside [-1, 1].
    cosine = np.clip(np.dot(arm_a, arm_c) / norm_product, -1.0, 1.0)
    return float(np.degrees(np.arccos(cosine)))
|
| 231 |
+
|
| 232 |
+
|
| 233 |
+
def extract_frame_features(results, landmark_indices, angle_triplets, min_visibility=0.5):
    """Build the per-frame feature vector from a pose-estimation result.

    The vector holds x, y, z for every landmark in ``landmark_indices`` (in
    dict order) followed by one joint angle per entry of ``angle_triplets``.
    Landmarks whose visibility is below ``min_visibility`` are replaced by
    zeros.

    Returns:
        A 1-D ``float32`` array, or ``None`` when no pose was detected.
    """
    pose = results.pose_landmarks
    if not pose:
        return None

    coordinates = {}
    for name, idx in landmark_indices.items():
        landmark = pose.landmark[idx]
        if landmark.visibility < min_visibility:
            xyz = [0.0, 0.0, 0.0]
        else:
            xyz = [landmark.x, landmark.y, landmark.z]
        coordinates[name] = np.array(xyz, dtype=np.float32)

    values = [component for name in landmark_indices for component in coordinates[name][:3]]
    values.extend(
        calculate_angle_degrees(coordinates[first], coordinates[vertex], coordinates[last])
        for first, vertex, last in angle_triplets
    )
    return np.array(values, dtype=np.float32)
|
| 251 |
+
|
| 252 |
+
|
| 253 |
+
def build_model_and_tools(args, device):
    """Load the scaler, label encoder and trained network selected by ``args``.

    Artefacts are read from ``<models_root>/<model_name>/weights`` using the
    file names registered in ``MODEL_SPECS``.

    Returns:
        ``(model, scaler, label_encoder)`` with the model moved to ``device``
        and switched to evaluation mode.
    """
    spec = MODEL_SPECS[args.model_name]
    artifact_dir = Path(args.models_root) / args.model_name / "weights"

    scaler = joblib.load(artifact_dir / spec["scaler"])
    label_encoder = joblib.load(artifact_dir / spec["encoder"])

    # The output layer is sized from the classes known to the label encoder.
    network = spec["builder"](args.feature_count, len(label_encoder.classes_))
    network = network.to(device)
    state = torch.load(artifact_dir / spec["weight"], map_location=device)
    network.load_state_dict(state)
    network.eval()
    return network, scaler, label_encoder
|
| 266 |
+
|
| 267 |
+
|
| 268 |
+
def _count_label_flips(labels):
    """Return how many consecutive prediction pairs in ``labels`` disagree."""
    return sum(1 for previous, current in zip(labels, labels[1:]) if previous != current)


def _update_rep_counter(pose_landmarks, landmark_indices, exercise_name, rep_counters, rep_smoothers):
    """Feed the current frame's smoothed primary angle to the exercise's rep counter.

    A counter and a smoothing buffer are created lazily the first time an
    exercise is seen and are kept in ``rep_counters`` / ``rep_smoothers``.
    """
    config = EXERCISE_CONFIGS[exercise_name]
    if exercise_name not in rep_counters:
        rep_counters[exercise_name] = FixedThresholdFSMCounter(config.fixed_low, config.fixed_high, config.min_state_frames)
        rep_smoothers[exercise_name] = SmoothingBuffer(config.smoothing_window)

    landmarks = {}
    for name, index in landmark_indices.items():
        lm = pose_landmarks.landmark[index]
        # Low-visibility landmarks are replaced by zeros, as in the feature vector.
        xyz = [lm.x, lm.y, lm.z] if lm.visibility >= 0.5 else [0.0, 0.0, 0.0]
        landmarks[name] = np.array(xyz, dtype=np.float32)

    raw_angle = extract_primary_angle(landmarks, config)
    smoothed_angle = rep_smoothers[exercise_name].update(raw_angle)
    rep_counters[exercise_name].update(smoothed_angle)


def main():
    """Run a timed webcam session and save realtime evaluation metrics.

    Frames are read from the selected camera, pose features are collected in a
    sliding window and the chosen model classifies the window at most once per
    ``--prediction-interval`` seconds. The session ends after
    ``--run-seconds``, when the camera stops delivering frames, or when ``q``
    is pressed. A one-row summary CSV and a per-prediction events CSV are
    written to ``--output-dir``.

    Raises:
        RuntimeError: If the webcam cannot be opened.
    """
    # Function-scope import keeps this block self-contained.
    from collections import deque

    args = parse_args()
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model, scaler, label_encoder = build_model_and_tools(args, device)

    pose_module = load_pose_module()
    landmark_indices = build_landmark_indices(pose_module)
    angle_triplets = get_angle_triplets()

    capture = cv2.VideoCapture(args.camera_index)
    if not capture.isOpened():
        capture.release()
        raise RuntimeError("Could not open webcam.")

    print("Realtime evaluation started.")
    print("Protocol: 0-20s exercise A, 20-40s exercise B, 40-60s exercise C, 60-75s free.")

    # A bounded deque discards the oldest frame automatically once it is full.
    window = deque(maxlen=args.sequence_length)
    events = []
    frame_times = []

    # -inf lets the first prediction fire as soon as the window is full.
    last_prediction_time = float("-inf")
    current_label = "none"
    rep_counters = {}
    rep_smoothers = {}

    drawing_utils = mp.solutions.drawing_utils
    drawing_spec_points = drawing_utils.DrawingSpec(color=(0, 0, 255), thickness=2, circle_radius=3)
    drawing_spec_lines = drawing_utils.DrawingSpec(color=(0, 255, 0), thickness=2, circle_radius=1)

    # perf_counter is monotonic, so durations are immune to wall-clock jumps.
    start_time = time.perf_counter()
    try:
        with pose_module.Pose(static_image_mode=False, model_complexity=1, min_detection_confidence=0.5, min_tracking_confidence=0.5) as pose_estimator:
            while True:
                loop_start = time.perf_counter()
                ok, frame_bgr = capture.read()
                if not ok:
                    break

                elapsed = time.perf_counter() - start_time
                if elapsed >= args.run_seconds:
                    break

                # MediaPipe expects RGB; OpenCV delivers BGR.
                frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
                results = pose_estimator.process(frame_rgb)

                if results.pose_landmarks:
                    drawing_utils.draw_landmarks(
                        frame_bgr,
                        results.pose_landmarks,
                        pose_module.POSE_CONNECTIONS,
                        landmark_drawing_spec=drawing_spec_points,
                        connection_drawing_spec=drawing_spec_lines,
                    )

                frame_features = extract_frame_features(results, landmark_indices, angle_triplets)
                if frame_features is not None:
                    window.append(frame_features)

                # Rep counting follows the label predicted before this frame.
                normalized_label = normalize_exercise_name(current_label)
                if results.pose_landmarks and normalized_label in EXERCISE_CONFIGS:
                    _update_rep_counter(results.pose_landmarks, landmark_indices, normalized_label, rep_counters, rep_smoothers)

                window_full = len(window) == args.sequence_length
                if window_full and (time.perf_counter() - last_prediction_time) >= args.prediction_interval:
                    infer_start = time.perf_counter()
                    # The scaler is applied to the flattened window, which is
                    # then restored to (1, time, feature) for the model.
                    sequence_array = np.array(list(window), dtype=np.float32).reshape(1, -1)
                    scaled = scaler.transform(sequence_array).reshape(1, args.sequence_length, args.feature_count)
                    input_tensor = torch.tensor(scaled, dtype=torch.float32, device=device)

                    with torch.inference_mode():
                        logits = model(input_tensor)
                        prediction_index = int(torch.argmax(logits, dim=1).item())
                        current_label = label_encoder.classes_[prediction_index]

                    infer_ms = (time.perf_counter() - infer_start) * 1000.0
                    events.append({"timestamp_sec": elapsed, "predicted_label": current_label, "latency_ms": infer_ms})
                    last_prediction_time = time.perf_counter()

                frame_times.append(time.perf_counter() - loop_start)

                current_reps = rep_counters[normalized_label].reps if normalized_label in rep_counters else 0
                overlay_lines = (
                    f"Model: {args.model_name}",
                    f"Pred: {current_label}",
                    f"Reps: {current_reps}",
                    f"Time: {elapsed:5.1f}s",
                )
                for row, text in enumerate(overlay_lines, start=1):
                    cv2.putText(frame_bgr, text, (20, 30 * row), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 0, 255), 2)
                cv2.imshow("Realtime Evaluation", frame_bgr)

                if cv2.waitKey(1) & 0xFF == ord("q"):
                    break
    finally:
        # Always free the camera and close the preview, even after an error.
        capture.release()
        cv2.destroyAllWindows()

    predicted_labels = [event["predicted_label"] for event in events]
    prediction_latencies_ms = [event["latency_ms"] for event in events]

    mean_frame_time = float(np.mean(frame_times)) if frame_times else 0.0
    fps = 1.0 / mean_frame_time if mean_frame_time > 0.0 else 0.0
    mean_latency = float(np.mean(prediction_latencies_ms)) if prediction_latencies_ms else None
    p95_latency = float(np.percentile(prediction_latencies_ms, 95)) if prediction_latencies_ms else None

    # Share of consecutive predictions whose label changed.
    flip_rate = float(_count_label_flips(predicted_labels) / max(1, len(predicted_labels) - 1))

    summary = {
        "model": args.model_name,
        "device": str(device),
        "run_seconds": args.run_seconds,
        "prediction_count": len(predicted_labels),
        "mean_latency_ms": mean_latency,
        "p95_latency_ms": p95_latency,
        "pipeline_fps": float(fps),
        "prediction_flip_rate": flip_rate,
    }

    summary_path = output_dir / f"{args.model_name}_realtime_metrics.csv"
    events_path = output_dir / f"{args.model_name}_realtime_events.csv"
    pd.DataFrame([summary]).to_csv(summary_path, index=False)
    # Explicit columns keep the header present even when no prediction was made.
    pd.DataFrame(events, columns=["timestamp_sec", "predicted_label", "latency_ms"]).to_csv(events_path, index=False)

    print("Realtime evaluation completed.")
    print(summary)
    print(f"Saved: {summary_path}")
    print(f"Saved: {events_path}")
|
| 409 |
+
|
| 410 |
+
|
| 411 |
+
# Start the evaluation only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|