Spaces:
Sleeping
Sleeping
Commit
·
982555b
0
Parent(s):
add files
Browse files- .gitattributes +3 -0
- .python-version +1 -0
- Dockerfile +0 -0
- README.md +0 -0
- data/bbal6n.align +8 -0
- data/bbal6n.mpg +3 -0
- data/bbwgzn.align +8 -0
- data/bbwgzn.mpg +3 -0
- data/bgan7a.align +8 -0
- data/bgan7a.mpg +3 -0
- data/brwg6n.align +8 -0
- data/brwg6n.mpg +3 -0
- data/pgad9s.align +8 -0
- data/pgad9s.mpg +3 -0
- data/swih7s.align +8 -0
- data/swih7s.mpg +3 -0
- data/swwv8p.align +8 -0
- data/swwv8p.mpg +3 -0
- frontend.py +257 -0
- main.py +6 -0
- model_utils/__init__.py +0 -0
- model_utils/inference.py +21 -0
- model_utils/lipnet_model_full_scripted.pt +3 -0
- model_utils/tokenizer.py +15 -0
- model_utils/utils.py +108 -0
- model_utils/vocabulary.py +4 -0
- pyproject.toml +14 -0
.gitattributes
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.mpg filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.mgp filter=lfs diff=lfs merge=lfs -text
|
.python-version
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
3.13
|
Dockerfile
ADDED
|
File without changes
|
README.md
ADDED
|
File without changes
|
data/bbal6n.align
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
0 16000 sil
|
| 2 |
+
16000 21250 bin
|
| 3 |
+
21250 26750 blue
|
| 4 |
+
26750 27750 at
|
| 5 |
+
27750 31500 l
|
| 6 |
+
31500 39250 six
|
| 7 |
+
39250 45250 now
|
| 8 |
+
45250 74500 sil
|
data/bbal6n.mpg
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0c7f36eaceda1bb64adc2877cfe3370104d9451059addc9035f7da55390e9510
|
| 3 |
+
size 428032
|
data/bbwgzn.align
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
0 17250 sil
|
| 2 |
+
17250 22750 bin
|
| 3 |
+
22750 28000 blue
|
| 4 |
+
28000 30250 with
|
| 5 |
+
30250 34250 g
|
| 6 |
+
34250 43500 zero
|
| 7 |
+
43500 49250 now
|
| 8 |
+
49250 74500 sil
|
data/bbwgzn.mpg
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:73bc132c76e16da1592fad16eb270532857b2100bf5cd30eff579c95bad38f8a
|
| 3 |
+
size 419840
|
data/bgan7a.align
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
0 14250 sil
|
| 2 |
+
14250 20000 bin
|
| 3 |
+
20000 24250 green
|
| 4 |
+
24250 27500 at
|
| 5 |
+
27500 32000 n
|
| 6 |
+
32000 40000 seven
|
| 7 |
+
40000 49250 again
|
| 8 |
+
49250 74500 sil
|
data/bgan7a.mpg
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:403233bcb9018dca8b47436da55793d3ced10f336d2dcfbcacef14825f5f38f4
|
| 3 |
+
size 409600
|
data/brwg6n.align
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
0 16500 sil
|
| 2 |
+
16500 22500 bin
|
| 3 |
+
22500 26500 red
|
| 4 |
+
26500 28500 with
|
| 5 |
+
28500 32250 g
|
| 6 |
+
32250 40000 six
|
| 7 |
+
40000 46000 now
|
| 8 |
+
46000 74500 sil
|
data/brwg6n.mpg
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6bc095f1b186a28cd81fd50ad1477ec071ee5f431dd429c4e046d02c6f7c6ada
|
| 3 |
+
size 413696
|
data/pgad9s.align
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
0 8250 sil
|
| 2 |
+
8250 14500 place
|
| 3 |
+
14500 19750 green
|
| 4 |
+
19750 22500 at
|
| 5 |
+
22500 25500 d
|
| 6 |
+
25500 32250 nine
|
| 7 |
+
32250 41750 soon
|
| 8 |
+
41750 74500 sil
|
data/pgad9s.mpg
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d20fbf469814d7cda9f14515dc631bbfb05041648f4d1cdcc038729f4db0cfe8
|
| 3 |
+
size 401408
|
data/swih7s.align
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
0 7750 sil
|
| 2 |
+
7750 16000 set
|
| 3 |
+
16000 23000 white
|
| 4 |
+
23000 25500 in
|
| 5 |
+
25500 29750 h
|
| 6 |
+
29750 37250 seven
|
| 7 |
+
37250 46500 soon
|
| 8 |
+
46500 74500 sil
|
data/swih7s.mpg
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2b42de3a3d28a8f3f99adb4c9307441d8619d872f54a008fbef003432c864a51
|
| 3 |
+
size 475136
|
data/swwv8p.align
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
0 10250 sil
|
| 2 |
+
10250 22000 set
|
| 3 |
+
22000 29750 white
|
| 4 |
+
29750 32500 with
|
| 5 |
+
32500 36000 v
|
| 6 |
+
36000 40250 eight
|
| 7 |
+
40250 51250 please
|
| 8 |
+
51250 74500 sil
|
data/swwv8p.mpg
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:06102ed813a4b6897dafda2ee49a39377edf9b2a70e8ba2d10600b6bed1a4f22
|
| 3 |
+
size 413696
|
frontend.py
ADDED
|
@@ -0,0 +1,257 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Streamlit front-end for MedLipReader: page configuration, styling, and static copy."""
import streamlit as st
import os
import imageio
from model_utils.utils import load_video_for_gif, load_data, load_alignments
from model_utils.inference import inference
import time
import uuid

# Page configuration
st.set_page_config(
    page_title="MedLipReader",
    page_icon="🏥",
    layout='wide',
    initial_sidebar_state="collapsed"
)

# Custom CSS for enhanced styling (injected once; class names are referenced
# by the HTML snippets rendered further down the page)
st.markdown("""
<style>
.main-header {
    font-size: 3rem;
    font-weight: bold;
    color: #2E86AB;
    text-align: center;
    margin-bottom: 0.5rem;
}

.subtitle {
    font-size: 1.2rem;
    color: #6C757D;
    text-align: center;
    margin-bottom: 2rem;
    font-style: italic;
}

.section-header {
    font-size: 1.5rem;
    font-weight: bold;
    color: #495057;
    margin-bottom: 1rem;
    padding: 0.5rem;
    border-left: 4px solid #2E86AB;
    background-color: #F8F9FA;
}

.info-card {
    background: linear-gradient(135deg, #1e3c72 0%, #2a5298 100%);
    padding: 1.5rem;
    border-radius: 10px;
    color: white;
    margin: 1rem 0;
    text-shadow: 1px 1px 2px rgba(0,0,0,0.5);
}

.success-card {
    background: linear-gradient(135deg, #2d5016 0%, #3e7b27 100%);
    padding: 1.5rem;
    border-radius: 10px;
    color: white;
    margin: 1rem 0;
    text-shadow: 1px 1px 2px rgba(0,0,0,0.5);
}

.feature-box {
    background: #F8F9FA;
    padding: 1rem;
    border-radius: 8px;
    border: 2px solid #DEE2E6;
    margin: 0.5rem 0;
}

.feature-box h4 {
    color: #2E86AB;
    margin-bottom: 0.5rem;
}

.feature-box p {
    color: #495057;
    margin: 0;
}

.metric-container {
    background: white;
    padding: 1rem;
    border-radius: 8px;
    box-shadow: 0 2px 4px rgba(0,0,0,0.1);
    text-align: center;
    margin: 0.5rem 0;
}

.stSelectbox > div > div > select {
    background-color: #F8F9FA;
    border: 2px solid #2E86AB;
    border-radius: 5px;
}
</style>
""", unsafe_allow_html=True)

# Header Section
st.markdown('<h1 class="main-header">🏥 MedLipReader</h1>', unsafe_allow_html=True)
st.markdown('<p class="subtitle">AI-Powered Lip Reading Technology for Healthcare Accessibility</p>', unsafe_allow_html=True)

# Information banner
st.markdown("""
<div class="info-card">
<h3>🎯 Empowering Communication for the Deaf and Hard-of-Hearing Community</h3>
<p>MedLipReader uses advanced AI to translate lip movements into text, enabling seamless communication in healthcare settings without requiring audio input.</p>
</div>
""", unsafe_allow_html=True)

# Key features section: three static feature cards rendered side by side
st.markdown('<div class="section-header">✨ Key Features</div>', unsafe_allow_html=True)

feature_col1, feature_col2, feature_col3 = st.columns(3)
with feature_col1:
    st.markdown("""
    <div class="feature-box">
    <h4>🔇 Silent Operation</h4>
    <p>Works entirely without audio input</p>
    </div>
    """, unsafe_allow_html=True)

with feature_col2:
    st.markdown("""
    <div class="feature-box">
    <h4>⚡ Real-time Processing</h4>
    <p>Instant lip-to-text conversion</p>
    </div>
    """, unsafe_allow_html=True)

with feature_col3:
    st.markdown("""
    <div class="feature-box">
    <h4>🏥 Healthcare Optimized</h4>
    <p>Designed for medical environments</p>
    </div>
    """, unsafe_allow_html=True)

st.markdown("---")
|
| 140 |
+
|
| 141 |
+
# Main application section: pick a sample video, render the mouth-crop GIF on
# the left, and show the model's decoded output on the right.
st.markdown('<div class="section-header">🎬 Video Analysis Demo</div>', unsafe_allow_html=True)

# Video selection: all .mpg clips bundled under data/
options = [x for x in os.listdir(os.path.join('data')) if x.endswith('.mpg')]
selected_video = st.selectbox('🎥 Choose a video for lip reading analysis:', options, help="Select a video file to analyze with MedLipReader")

# Main content columns
col1, col2 = st.columns([1, 1])

if options and selected_video:
    with col1:
        st.markdown("""
        <div style="background: #E3F2FD; padding: 1rem; border-radius: 8px; margin-bottom: 1rem;">
        <h4 style="color: #1565C0; margin: 0;">📹 Input Video Analysis</h4>
        <p style="margin: 0.5rem 0 0 0; color: #424242;">Visual lip movement data processed by AI model</p>
        </div>
        """, unsafe_allow_html=True)

        # Process video: build the preview GIF and load the reference transcript
        file_path = os.path.join('data', selected_video)

        with st.spinner('🔄 Loading and processing video...'):
            gif_frames = load_video_for_gif(file_path)
            alignment = load_alignments(file_path.replace('.mpg', '.align'))
            os.makedirs("tmp", exist_ok=True)
            # Fix: Streamlit reruns this script on every interaction and each
            # rerun wrote a fresh uuid-named GIF that was never deleted, so
            # tmp/ grew without bound. Purge stale animations first.
            for stale in os.listdir("tmp"):
                if stale.startswith("animation_") and stale.endswith(".gif"):
                    try:
                        os.remove(os.path.join("tmp", stale))
                    except OSError:
                        pass  # best effort; another session may still hold the file
            gif_filename = f"tmp/animation_{uuid.uuid4().hex}.gif"
            imageio.mimsave(gif_filename, gif_frames, fps=15, loop=0)

        # Display the animated mouth-crop sequence
        st.image(gif_filename, width=400, caption="Lip movement sequence being analyzed")

        st.markdown("""
        <div style="background: #E8F4FD; padding: 0.8rem; border-radius: 6px; margin: 1rem 0; border-left: 3px solid #2196F3;">
        <p style="margin: 0; color: #1565C0; font-size: 0.9rem; font-style: italic;">
        💡 <strong>Innovation:</strong> Unlike image-based systems, this model learns from temporal dependencies across frames, capturing the natural flow of speech articulation.
        </p>
        </div>
        """, unsafe_allow_html=True)

        # Ground truth section: the .align transcript for the selected clip
        st.markdown("**📝 Ground Truth (Reference Text):**")
        st.markdown(f"""
        <div style="background: #F5F5F5; padding: 1rem; border-radius: 5px; font-family: monospace; border-left: 3px solid #4CAF50; color: #333;">
        {alignment}
        </div>
        """, unsafe_allow_html=True)

    with col2:
        st.markdown("""
        <div style="background: #E8F5E8; padding: 1rem; border-radius: 8px; margin-bottom: 1rem;">
        <h4 style="color: #2E7D32; margin: 0;">🤖 AI Model Output</h4>
        <p style="margin: 0.5rem 0 0 0; color: #424242;">Real-time lip reading results</p>
        </div>
        """, unsafe_allow_html=True)

        # Run inference and time it for the metrics card below
        with st.spinner('🧠 AI model processing lip movements...'):
            start_time = time.time()
            result = inference(file_path)
            processing_time = time.time() - start_time

        # Processing metrics
        metric_col1, metric_col2 = st.columns(2)
        with metric_col1:
            st.markdown(f"""
            <div class="metric-container">
            <h3 style="color: #2E86AB; margin: 0;">{processing_time:.2f}s</h3>
            <p style="margin: 0; color: #666;">Processing Time</p>
            </div>
            """, unsafe_allow_html=True)

        with metric_col2:
            st.markdown(f"""
            <div class="metric-container">
            <h3 style="color: #2E86AB; margin: 0;">{len(result["tokens"][0])}</h3>
            <p style="margin: 0; color: #666;">Tokens Generated</p>
            </div>
            """, unsafe_allow_html=True)

        # Raw tokens output (CTC-collapsed id sequence for the first batch item)
        st.markdown("**🔤 Raw Model Tokens:**")
        st.markdown(f"""
        <div style="background: #FFF3E0; padding: 1rem; border-radius: 5px; font-family: monospace; border-left: 3px solid #FF9800; font-size: 0.9rem; color: #E65100;">
        {result["tokens"][0]}
        </div>
        """, unsafe_allow_html=True)

        # Final prediction (decoded text)
        st.markdown("**💬 Decoded Text Prediction:**")
        st.markdown(f"""
        <div style="background: #E8F5E8; padding: 1.5rem; border-radius: 8px; border-left: 4px solid #4CAF50; font-size: 1.1rem; font-weight: bold; color: #2E7D32;">
        "{result["prediction"][0]}"
        </div>
        """, unsafe_allow_html=True)
|
| 236 |
+
|
| 237 |
+
# Success message: re-check the selection so the banner only appears when an
# analysis actually ran above.
if options and selected_video:
    st.markdown("""
    <div class="success-card">
    <h3>✅ Analysis Complete!</h3>
    <p>MedLipReader has successfully converted lip movements to text. This technology can help bridge communication gaps in healthcare settings, ensuring better patient care for the deaf and hard-of-hearing community.</p>
    </div>
    """, unsafe_allow_html=True)

# Footer section
st.markdown("---")
st.markdown("""
<div style="text-align: center; padding: 2rem; background: #F8F9FA; border-radius: 10px; margin-top: 2rem;">
<h4 style="color: #495057; margin-bottom: 1rem;">🌟 Impact on Healthcare Accessibility</h4>
<p style="color: #6C757D; margin: 0;">
MedLipReader represents a step forward in making healthcare more inclusive and accessible.
By enabling silent communication, we're breaking down barriers and ensuring everyone receives the care they deserve.
</p>
</div>
""", unsafe_allow_html=True)
st.markdown('<br><br><p class="subtitle">Developed by Farhan Ali Khan | Empowering Accessible Healthcare Through AI</p>', unsafe_allow_html=True)
|
main.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
def main():
    """Entry point placeholder; prints a greeting and returns None."""
    greeting = "Hello from lip-reader-hf!"
    print(greeting)


if __name__ == "__main__":
    main()
|
model_utils/__init__.py
ADDED
|
File without changes
|
model_utils/inference.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Load the TorchScript LipNet model once at import time and expose `inference`."""
import torch

from .utils import ctc_greedy_decoder, load_data

# Fix: the original also did `from .model import MyModel`, but no model.py is
# part of this package, so importing this module raised ImportError before the
# (commented-out) state-dict loading path was ever reached. The TorchScript
# archive restores the model without needing the Python class definition.
model = torch.jit.load("model_utils/lipnet_model_full_scripted.pt", map_location=torch.device('cpu'))
model.eval()


def inference(path: str):
    """Run lip-reading inference on the video clip named by `path`.

    Returns a dict with:
      - "prediction": list of decoded strings (one per batch item)
      - "tokens": list of CTC-collapsed token-id sequences (one per batch item)
    """
    frames, _ = load_data(path)
    # Add a leading batch dimension; renamed from `input` to avoid shadowing
    # the builtin.
    batch = frames.unsqueeze(0).to('cpu')
    with torch.no_grad():
        output = model(batch)
    result, tokens = ctc_greedy_decoder(output)
    return {"prediction": result, "tokens": tokens}
|
model_utils/lipnet_model_full_scripted.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:52e4097a11847f710937c7e3f00fa773096b1394045c4076ae990399dde53271
|
| 3 |
+
size 33926246
|
model_utils/tokenizer.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .vocabulary import vocabulary

class Tokenizer:
    """Bidirectional mapping between characters and integer ids.

    Built on the project `vocabulary` dict by default; any mapping of
    token -> id may be supplied instead.
    """
    def __init__(self, vocab=vocabulary):
        self.str_to_int = vocab
        # Inverse mapping for decoding (renamed loop vars so `id` no longer
        # shadows the builtin).
        self.int_to_str = {idx: token for token, idx in vocab.items()}

    def encode(self, text):
        """Encode `text` into a list of ids; unknown characters map to <unk>.

        Fix: the original substituted "<|unk|>", which is not a key of the
        vocabulary ("<unk>" is — see vocabulary.py), so any out-of-vocabulary
        character raised KeyError instead of being encoded as unknown.
        """
        text_tokens = [token if token in self.str_to_int else "<unk>" for token in text]
        ids = [self.str_to_int[token] for token in text_tokens]
        return ids

    def decode(self, ids):
        """Decode a list of ids back into a string."""
        text = "".join([self.int_to_str[idx] for idx in ids])
        return text
|
model_utils/utils.py
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import cv2
import torch
import torchvision.transforms.functional as F
import os
from .tokenizer import Tokenizer
from .vocabulary import vocabulary
import numpy as np

# Shared tokenizer instance; ctc_greedy_decoder below uses its id->char map
# as the default decoding vocabulary.
tokenizer = Tokenizer()
|
| 10 |
+
|
| 11 |
+
def load_video_for_gif(path):
    """Read the video at `path` and return RGB mouth-crop frames for a GIF.

    Returns a list of uint8 numpy arrays, each of shape [H, W, 3].
    """
    capture = cv2.VideoCapture(path)
    total_frames = int(capture.get(cv2.CAP_PROP_FRAME_COUNT))
    cropped = []
    for _ in range(total_frames):
        ok, image = capture.read()
        if not ok:
            break
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        # Fixed mouth region crop, [H, W, C]
        cropped.append(image[190:236, 80:220, :])
    capture.release()

    # Coerce to uint8 numpy arrays so imageio renders them correctly
    return [np.array(frame).astype(np.uint8) for frame in cropped]
|
| 29 |
+
|
| 30 |
+
def load_video(path):
    """Read `path`, crop the mouth region, convert to grayscale, and normalize.

    Returns a float tensor of shape [T, 1, 46, 140], normalized per video.
    """
    cap = cv2.VideoCapture(path)
    frames = []
    for _ in range(int(cap.get(cv2.CAP_PROP_FRAME_COUNT))):
        ret, frame = cap.read()
        if not ret:
            # Fix: a failed read returns (False, None); without this guard
            # cv2.cvtColor crashed on the None frame (load_video_for_gif
            # already had the same check).
            break
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        # Convert to torch tensor [H, W, C] → [C, H, W]
        frame = torch.from_numpy(frame).permute(2, 0, 1).float()

        # Grayscale, then crop the mouth region (channel dim kept)
        frame = F.rgb_to_grayscale(frame)
        frame = frame[:, 190:236, 80:220]
        frames.append(frame)
    cap.release()

    frames = torch.stack(frames)  # Shape: [T, 1, H, W]

    # Normalize (per video)
    mean = frames.mean()
    std = frames.std()
    frames = (frames - mean) / (std + 1e-8)

    return frames  # Shape: [T, 1, 46, 140]
|
| 55 |
+
|
| 56 |
+
def load_alignments(path: str):
    """Parse a GRID-style .align file and return the spoken words as a string.

    Each line has the form "start end word"; silence markers ("sil") are
    dropped and the remaining words are joined with single spaces.
    """
    with open(path, 'r') as handle:
        entries = [line.split() for line in handle]

    spoken = [fields[2] for fields in entries if fields[2] != "sil"]
    return " ".join(spoken)
|
| 75 |
+
|
| 76 |
+
def load_data(path: str):
    """Load (frames, alignment_text) for the clip named by `path`.

    Only the basename of `path` is used; the matching .mpg and .align files
    are looked up under the local "data" directory.
    """
    # Fix: the original split on a literal backslash ('\\'), which only works
    # for Windows paths. On POSIX (e.g. the deployment host) the basename was
    # never extracted, producing paths like data/data/x.mpg.mpg.
    file_name = os.path.splitext(os.path.basename(path))[0]
    video_path = os.path.join('data', f'{file_name}.mpg')
    alignment_path = os.path.join('data', f'{file_name}.align')
    frames = load_video(video_path)
    alignments = load_alignments(alignment_path)

    return frames, alignments
|
| 87 |
+
|
| 88 |
+
def ctc_greedy_decoder(logits_batch, vocab = tokenizer.int_to_str, blank_id=0):
    """Greedy CTC decoding: per-step argmax, collapse repeats, drop blanks.

    logits_batch: tensor of shape (batch, time, vocab_size).
    Returns (texts, token_lists): decoded strings and the kept token ids,
    one entry per batch item.
    """
    # Argmax gives the best class index per time step (renamed from the
    # misleading `probabilities`)
    best_ids = torch.argmax(logits_batch, dim=-1)  # (batch, time)

    texts = []
    token_lists = []

    for sequence in best_ids:  # iterate over batch
        kept_ids = []
        kept_chars = []
        previous = None
        for current in sequence.tolist():
            # CTC rule: ignore blanks + collapse consecutive repeats
            if current != blank_id and current != previous:
                kept_ids.append(current)
                kept_chars.append(vocab[current])
            previous = current
        texts.append("".join(kept_chars))
        token_lists.append(kept_ids)

    return texts, token_lists
|
model_utils/vocabulary.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Character inventory for GRID-corpus transcripts (letters, a few symbols,
# digits 1-9, and space).
all_chars = "abcdefghijklmnopqrstuvwxyz'?!123456789 "

# Characters get ids 1..len(all_chars); id 0 is reserved for the CTC blank,
# and the unknown token takes the next free id.
vocabulary = {char: position for position, char in enumerate(all_chars, start=1)}
vocabulary["<blank>"] = 0
vocabulary["<unk>"] = len(vocabulary)
|
pyproject.toml
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[project]
|
| 2 |
+
name = "lip-reader-hf"
|
| 3 |
+
version = "0.1.0"
|
| 4 |
+
description = "Streamlit demo for LipNet-style lip reading on GRID corpus clips"
|
| 5 |
+
readme = "README.md"
|
| 6 |
+
requires-python = ">=3.13"
|
| 7 |
+
dependencies = [
|
| 8 |
+
"imageio[ffmpeg]>=2.37.0",
|
| 9 |
+
"moviepy>=2.2.1",
|
| 10 |
+
"opencv-python>=4.12.0.88",
|
| 11 |
+
"streamlit>=1.48.1",
|
| 12 |
+
"torch>=2.8.0",
|
| 13 |
+
"torchvision>=0.23.0",
|
| 14 |
+
]
|