SunilS committed on
Commit ·
d0afa2c
1
Parent(s): 492e9c4
UI: Improve About App page with Streamlit tabs and training details
Browse files
app.py
CHANGED
|
@@ -284,16 +284,17 @@ if app_mode =='About App':
|
|
| 284 |
)
|
| 285 |
# st.video('https://www.youtube.com/watch?v=FMaNNXgB_5c&ab_channel=AugmentedStartups')
|
| 286 |
|
| 287 |
-
st.
|
|
|
|
|
|
|
|
|
|
| 288 |
# Dataset Used \n
|
| 289 |
This model is trained using [INCLUDE](https://zenodo.org/records/4010759) dataset. \n
|
| 290 |
|
| 291 |
### Key Statistics for the dataset is as follows-
|
| 292 |
|
| 293 |
-
|
| 294 |
-
|
| 295 |
+-----------------------+-----------------+
|
| 296 |
-
|
|
| 297 |
+-----------------------+-----------------+
|
| 298 |
| Categories | 15 |
|
| 299 |
| Words | 263 |
|
|
@@ -305,8 +306,8 @@ if app_mode =='About App':
|
|
| 305 |
| Frame Rate | 25fps |
|
| 306 |
| Resolution | 1920x1080 |
|
| 307 |
+-----------------------+-----------------+
|
| 308 |
-
#### Size of each category
|
| 309 |
|
|
|
|
| 310 |
|
| 311 |
+--------------------+-------------------+------------------+
|
| 312 |
| Category | Number of Classes | Number of Videos |
|
|
@@ -328,76 +329,70 @@ if app_mode =='About App':
|
|
| 328 |
| Society | 23 | 324 |
|
| 329 |
| | Categories# 263 | Total Videos-4287|
|
| 330 |
+--------------------+-------------------+------------------+
|
|
|
|
|
|
|
| 331 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 332 |
|
| 333 |
-
|
| 334 |
-
|
| 335 |
-
|
| 336 |
-
|
| 337 |
-
|
| 338 |
-
|
| 339 |
-
|
| 340 |
-
|
| 341 |
-
|
| 342 |
-
|
| 343 |
-
|
| 344 |
-
|
| 345 |
-
|
| 346 |
-
|
| 347 |
-
|
| 348 |
-
|
| 349 |
-
|
| 350 |
-
|
| 351 |
-
|
| 352 |
-
|
| 353 |
-
|
| 354 |
-
''
|
| 355 |
-
|
| 356 |
-
|
| 357 |
-
|
| 358 |
-
|
| 359 |
-
|
| 360 |
-
|
| 361 |
-
|
| 362 |
-
|
| 363 |
-
|
| 364 |
-
|
| 365 |
-
|
| 366 |
-
|
| 367 |
-
translation_model.add(BatchNormalization())
|
| 368 |
-
translation_model.add(Bidirectional(LSTM(32, recurrent_dropout=0.2, return_sequences=True)))
|
| 369 |
-
|
| 370 |
-
translation_model.add(Dropout(0.2))
|
| 371 |
-
translation_model.add(Bidirectional(LSTM(32, recurrent_dropout=0.2)))
|
| 372 |
-
|
| 373 |
-
translation_model.add(keras.layers.Activation('elu'))
|
| 374 |
-
translation_model.add(Dense(32, use_bias=False, kernel_initializer='he_normal'))
|
| 375 |
-
|
| 376 |
-
translation_model.add(BatchNormalization())
|
| 377 |
-
translation_model.add(Dropout(0.2))
|
| 378 |
-
translation_model.add(keras.layers.Activation('elu'))
|
| 379 |
-
translation_model.add(Dense(32, kernel_initializer='he_normal',use_bias=False))
|
| 380 |
-
|
| 381 |
-
translation_model.add(BatchNormalization())
|
| 382 |
-
translation_model.add(keras.layers.Activation('elu'))
|
| 383 |
-
translation_model.add(Dropout(0.2))
|
| 384 |
-
translation_model.add(Dense(len(list(expression_mapping.keys())), activation='softmax'))
|
| 385 |
-
isl_translator=ISLSignPosTranslator(bodypose_25_model(),handpose_model(), translation_model)
|
| 386 |
-
```
|
| 387 |
-
|
| 388 |
-
Total params: 82,679 (322.96 KB)
|
| 389 |
-
Trainable params: 82,239 (321.25 KB)
|
| 390 |
-
Non-trainable params: 440 (1.72 KB)
|
| 391 |
''')
|
| 392 |
-
|
| 393 |
-
|
| 394 |
-
st.
|
| 395 |
-
|
| 396 |
-
|
| 397 |
-
|
| 398 |
-
|
| 399 |
-
|
|
|
|
|
|
|
|
|
|
| 400 |
|
|
|
|
|
|
|
| 401 |
''')
|
| 402 |
|
| 403 |
elif app_mode =='Run on Test Videos':
|
|
|
|
| 284 |
)
|
| 285 |
# st.video('https://www.youtube.com/watch?v=FMaNNXgB_5c&ab_channel=AugmentedStartups')
|
| 286 |
|
| 287 |
+
tab1, tab2, tab3, tab4 = st.tabs(["Dataset Overview", "Data Pipeline", "Model Architecture", "Training Details"])
|
| 288 |
+
|
| 289 |
+
with tab1:
|
| 290 |
+
st.markdown('''
|
| 291 |
# Dataset Used \n
|
| 292 |
This model is trained using [INCLUDE](https://zenodo.org/records/4010759) dataset. \n
|
| 293 |
|
| 294 |
### Key Statistics for the dataset is as follows-
|
| 295 |
|
|
|
|
|
|
|
| 296 |
+-----------------------+-----------------+
|
| 297 |
+
| Characteristics | INCLUDE-DATASET |
|
| 298 |
+-----------------------+-----------------+
|
| 299 |
| Categories | 15 |
|
| 300 |
| Words | 263 |
|
|
|
|
| 306 |
| Frame Rate | 25fps |
|
| 307 |
| Resolution | 1920x1080 |
|
| 308 |
+-----------------------+-----------------+
|
|
|
|
| 309 |
|
| 310 |
+
#### Size of each category
|
| 311 |
|
| 312 |
+--------------------+-------------------+------------------+
|
| 313 |
| Category | Number of Classes | Number of Videos |
|
|
|
|
| 329 |
| Society | 23 | 324 |
|
| 330 |
| | Categories# 263 | Total Videos-4287|
|
| 331 |
+--------------------+-------------------+------------------+
|
| 332 |
+
''')
|
| 333 |
+
st.info("💡 **Note:** The dataset used for training contains **1986 processed videos out of 4287**. We processed a limited set of records due to time and compute constraints.")
|
| 334 |
|
| 335 |
+
with tab2:
|
| 336 |
+
st.markdown('''
|
| 337 |
+
### Data Pipeline
|
| 338 |
+
The pipeline processes video frames to extract pose and hand landmarks using an OpenPose-like approach.
|
| 339 |
+
|
| 340 |
+
For each frame, the feature extraction process produces a vector of **156 features**, consisting of:
|
| 341 |
+
- **Body Pose**: X and Y coordinates for 15 body keypoints, along with edge lengths and angles.
|
| 342 |
+
- **Hand Pose**: X and Y coordinates for 21 keypoints on each hand (left and right).
|
| 343 |
+
|
| 344 |
+
This structured tabular data is then grouped sequentially into sliding windows to capture the temporal motion of the signs.
|
| 345 |
+
''')
|
| 346 |
|
| 347 |
+
with tab3:
|
| 348 |
+
st.markdown('''
|
| 349 |
+
### Model Structure
|
| 350 |
+
The translation model utilizes a sequence-to-sequence architecture based on Bidirectional LSTMs.
|
| 351 |
+
|
| 352 |
+
```python
|
| 353 |
+
translation_model = Sequential()
|
| 354 |
+
translation_model.add(Input(shape=((20, 156))))
|
| 355 |
+
translation_model.add(keras.layers.Masking(mask_value=0.))
|
| 356 |
+
translation_model.add(BatchNormalization())
|
| 357 |
+
translation_model.add(Bidirectional(LSTM(32, recurrent_dropout=0.2, return_sequences=True)))
|
| 358 |
+
|
| 359 |
+
translation_model.add(Dropout(0.2))
|
| 360 |
+
translation_model.add(Bidirectional(LSTM(32, recurrent_dropout=0.2)))
|
| 361 |
+
|
| 362 |
+
translation_model.add(keras.layers.Activation('elu'))
|
| 363 |
+
translation_model.add(Dense(32, use_bias=False, kernel_initializer='he_normal'))
|
| 364 |
+
|
| 365 |
+
translation_model.add(BatchNormalization())
|
| 366 |
+
translation_model.add(Dropout(0.2))
|
| 367 |
+
translation_model.add(keras.layers.Activation('elu'))
|
| 368 |
+
translation_model.add(Dense(32, kernel_initializer='he_normal',use_bias=False))
|
| 369 |
+
|
| 370 |
+
translation_model.add(BatchNormalization())
|
| 371 |
+
translation_model.add(keras.layers.Activation('elu'))
|
| 372 |
+
translation_model.add(Dropout(0.2))
|
| 373 |
+
translation_model.add(Dense(len(list(expression_mapping.keys())), activation='softmax'))
|
| 374 |
+
isl_translator=ISLSignPosTranslator(bodypose_25_model(),handpose_model(), translation_model)
|
| 375 |
+
```
|
| 376 |
+
|
| 377 |
+
**Parameters:**
|
| 378 |
+
- Total params: 82,679 (322.96 KB)
|
| 379 |
+
- Trainable params: 82,239 (321.25 KB)
|
| 380 |
+
- Non-trainable params: 440 (1.72 KB)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 381 |
''')
|
| 382 |
+
|
| 383 |
+
with tab4:
|
| 384 |
+
st.markdown('''
|
| 385 |
+
### Training Details
|
| 386 |
+
The model was trained using the **Keras 3 API with a PyTorch backend**.
|
| 387 |
+
|
| 388 |
+
**Dataset Size:**
|
| 389 |
+
- **123,743 individual frames** were extracted from the processed videos and used as the training set.
|
| 390 |
+
|
| 391 |
+
**Sequence Windowing:**
|
| 392 |
+
- The temporal data is structured into windows of **20 frames** per sequence (`shape=(20, 156)`), allowing the Bidirectional LSTMs to learn the motion context of the signs.
|
| 393 |
|
| 394 |
+
**Metrics & Logs:**
|
| 395 |
+
- You can view the detailed training progression on [Tensorboard](https://huggingface.co/cdsteameight/ISL-SignLanguageTranslation/tensorboard).
|
| 396 |
''')
|
| 397 |
|
| 398 |
elif app_mode =='Run on Test Videos':
|