FarhanAK128 commited on
Commit
982555b
·
0 Parent(s):
.gitattributes ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ *.pt filter=lfs diff=lfs merge=lfs -text
2
+ *.mpg filter=lfs diff=lfs merge=lfs -text
3
+ *.mgp filter=lfs diff=lfs merge=lfs -text
.python-version ADDED
@@ -0,0 +1 @@
 
 
1
+ 3.13
Dockerfile ADDED
File without changes
README.md ADDED
File without changes
data/bbal6n.align ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ 0 16000 sil
2
+ 16000 21250 bin
3
+ 21250 26750 blue
4
+ 26750 27750 at
5
+ 27750 31500 l
6
+ 31500 39250 six
7
+ 39250 45250 now
8
+ 45250 74500 sil
data/bbal6n.mpg ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0c7f36eaceda1bb64adc2877cfe3370104d9451059addc9035f7da55390e9510
3
+ size 428032
data/bbwgzn.align ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ 0 17250 sil
2
+ 17250 22750 bin
3
+ 22750 28000 blue
4
+ 28000 30250 with
5
+ 30250 34250 g
6
+ 34250 43500 zero
7
+ 43500 49250 now
8
+ 49250 74500 sil
data/bbwgzn.mpg ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:73bc132c76e16da1592fad16eb270532857b2100bf5cd30eff579c95bad38f8a
3
+ size 419840
data/bgan7a.align ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ 0 14250 sil
2
+ 14250 20000 bin
3
+ 20000 24250 green
4
+ 24250 27500 at
5
+ 27500 32000 n
6
+ 32000 40000 seven
7
+ 40000 49250 again
8
+ 49250 74500 sil
data/bgan7a.mpg ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:403233bcb9018dca8b47436da55793d3ced10f336d2dcfbcacef14825f5f38f4
3
+ size 409600
data/brwg6n.align ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ 0 16500 sil
2
+ 16500 22500 bin
3
+ 22500 26500 red
4
+ 26500 28500 with
5
+ 28500 32250 g
6
+ 32250 40000 six
7
+ 40000 46000 now
8
+ 46000 74500 sil
data/brwg6n.mpg ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6bc095f1b186a28cd81fd50ad1477ec071ee5f431dd429c4e046d02c6f7c6ada
3
+ size 413696
data/pgad9s.align ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ 0 8250 sil
2
+ 8250 14500 place
3
+ 14500 19750 green
4
+ 19750 22500 at
5
+ 22500 25500 d
6
+ 25500 32250 nine
7
+ 32250 41750 soon
8
+ 41750 74500 sil
data/pgad9s.mpg ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d20fbf469814d7cda9f14515dc631bbfb05041648f4d1cdcc038729f4db0cfe8
3
+ size 401408
data/swih7s.align ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ 0 7750 sil
2
+ 7750 16000 set
3
+ 16000 23000 white
4
+ 23000 25500 in
5
+ 25500 29750 h
6
+ 29750 37250 seven
7
+ 37250 46500 soon
8
+ 46500 74500 sil
data/swih7s.mpg ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2b42de3a3d28a8f3f99adb4c9307441d8619d872f54a008fbef003432c864a51
3
+ size 475136
data/swwv8p.align ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ 0 10250 sil
2
+ 10250 22000 set
3
+ 22000 29750 white
4
+ 29750 32500 with
5
+ 32500 36000 v
6
+ 36000 40250 eight
7
+ 40250 51250 please
8
+ 51250 74500 sil
data/swwv8p.mpg ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:06102ed813a4b6897dafda2ee49a39377edf9b2a70e8ba2d10600b6bed1a4f22
3
+ size 413696
frontend.py ADDED
@@ -0,0 +1,257 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Streamlit front-end for MedLipReader: the user picks a sample video from
# data/, the app renders the cropped mouth region as a looping GIF, and the
# model's CTC-decoded transcription is shown next to the ground-truth
# alignment text.
import streamlit as st
import os
import imageio
# NOTE(review): load_data is imported but never used in this file.
from model_utils.utils import load_video_for_gif, load_data, load_alignments
from model_utils.inference import inference
import time
import uuid

# Page configuration
st.set_page_config(
    page_title="MedLipReader",
    page_icon="🏥",
    layout='wide',
    initial_sidebar_state="collapsed"
)

# Custom CSS for enhanced styling (class names are referenced by the
# unsafe_allow_html markdown snippets below).
st.markdown("""
<style>
.main-header {
    font-size: 3rem;
    font-weight: bold;
    color: #2E86AB;
    text-align: center;
    margin-bottom: 0.5rem;
}

.subtitle {
    font-size: 1.2rem;
    color: #6C757D;
    text-align: center;
    margin-bottom: 2rem;
    font-style: italic;
}

.section-header {
    font-size: 1.5rem;
    font-weight: bold;
    color: #495057;
    margin-bottom: 1rem;
    padding: 0.5rem;
    border-left: 4px solid #2E86AB;
    background-color: #F8F9FA;
}

.info-card {
    background: linear-gradient(135deg, #1e3c72 0%, #2a5298 100%);
    padding: 1.5rem;
    border-radius: 10px;
    color: white;
    margin: 1rem 0;
    text-shadow: 1px 1px 2px rgba(0,0,0,0.5);
}

.success-card {
    background: linear-gradient(135deg, #2d5016 0%, #3e7b27 100%);
    padding: 1.5rem;
    border-radius: 10px;
    color: white;
    margin: 1rem 0;
    text-shadow: 1px 1px 2px rgba(0,0,0,0.5);
}

.feature-box {
    background: #F8F9FA;
    padding: 1rem;
    border-radius: 8px;
    border: 2px solid #DEE2E6;
    margin: 0.5rem 0;
}

.feature-box h4 {
    color: #2E86AB;
    margin-bottom: 0.5rem;
}

.feature-box p {
    color: #495057;
    margin: 0;
}

.metric-container {
    background: white;
    padding: 1rem;
    border-radius: 8px;
    box-shadow: 0 2px 4px rgba(0,0,0,0.1);
    text-align: center;
    margin: 0.5rem 0;
}

.stSelectbox > div > div > select {
    background-color: #F8F9FA;
    border: 2px solid #2E86AB;
    border-radius: 5px;
}
</style>
""", unsafe_allow_html=True)

# Header Section
st.markdown('<h1 class="main-header">🏥 MedLipReader</h1>', unsafe_allow_html=True)
st.markdown('<p class="subtitle">AI-Powered Lip Reading Technology for Healthcare Accessibility</p>', unsafe_allow_html=True)

# Information banner
st.markdown("""
<div class="info-card">
    <h3>🎯 Empowering Communication for the Deaf and Hard-of-Hearing Community</h3>
    <p>MedLipReader uses advanced AI to translate lip movements into text, enabling seamless communication in healthcare settings without requiring audio input.</p>
</div>
""", unsafe_allow_html=True)

# Key features section
st.markdown('<div class="section-header">✨ Key Features</div>', unsafe_allow_html=True)

feature_col1, feature_col2, feature_col3 = st.columns(3)
with feature_col1:
    st.markdown("""
    <div class="feature-box">
        <h4>🔇 Silent Operation</h4>
        <p>Works entirely without audio input</p>
    </div>
    """, unsafe_allow_html=True)

with feature_col2:
    st.markdown("""
    <div class="feature-box">
        <h4>⚡ Real-time Processing</h4>
        <p>Instant lip-to-text conversion</p>
    </div>
    """, unsafe_allow_html=True)

with feature_col3:
    st.markdown("""
    <div class="feature-box">
        <h4>🏥 Healthcare Optimized</h4>
        <p>Designed for medical environments</p>
    </div>
    """, unsafe_allow_html=True)

st.markdown("---")

# Main application section
st.markdown('<div class="section-header">🎬 Video Analysis Demo</div>', unsafe_allow_html=True)

# Video selection: only the .mpg samples shipped in data/ are offered.
options = [x for x in os.listdir(os.path.join('data')) if x.endswith('.mpg')]
selected_video = st.selectbox('🎥 Choose a video for lip reading analysis:', options, help="Select a video file to analyze with MedLipReader")

# Main content columns: left = input video + ground truth, right = model output.
col1, col2 = st.columns([1, 1])

if options and selected_video:
    with col1:
        st.markdown("""
        <div style="background: #E3F2FD; padding: 1rem; border-radius: 8px; margin-bottom: 1rem;">
            <h4 style="color: #1565C0; margin: 0;">📹 Input Video Analysis</h4>
            <p style="margin: 0.5rem 0 0 0; color: #424242;">Visual lip movement data processed by AI model</p>
        </div>
        """, unsafe_allow_html=True)

        # Process video
        file_path = os.path.join('data', selected_video)

        with st.spinner('🔄 Loading and processing video...'):
            # Mouth-cropped RGB frames for display; the .align file with the
            # same stem holds the ground-truth transcript.
            gif_frames = load_video_for_gif(file_path)
            alignment = load_alignments(file_path.replace('.mpg','.align'))
            os.makedirs("tmp", exist_ok=True)
            # uuid in the name avoids collisions between concurrent sessions.
            # NOTE(review): tmp/ GIFs are never cleaned up — they accumulate.
            gif_filename = f"tmp/animation_{uuid.uuid4().hex}.gif"
            imageio.mimsave(gif_filename, gif_frames, fps=15, loop=0)

        # Display video (as a looping GIF of the cropped mouth region)
        st.image(gif_filename, width=400, caption="Lip movement sequence being analyzed")

        st.markdown("""
        <div style="background: #E8F4FD; padding: 0.8rem; border-radius: 6px; margin: 1rem 0; border-left: 3px solid #2196F3;">
            <p style="margin: 0; color: #1565C0; font-size: 0.9rem; font-style: italic;">
            💡 <strong>Innovation:</strong> Unlike image-based systems, this model learns from temporal dependencies across frames, capturing the natural flow of speech articulation.
            </p>
        </div>
        """, unsafe_allow_html=True)

        # Ground truth section
        st.markdown("**📝 Ground Truth (Reference Text):**")
        st.markdown(f"""
        <div style="background: #F5F5F5; padding: 1rem; border-radius: 5px; font-family: monospace; border-left: 3px solid #4CAF50; color: #333;">
        {alignment}
        </div>
        """, unsafe_allow_html=True)

    with col2:
        st.markdown("""
        <div style="background: #E8F5E8; padding: 1rem; border-radius: 8px; margin-bottom: 1rem;">
            <h4 style="color: #2E7D32; margin: 0;">🤖 AI Model Output</h4>
            <p style="margin: 0.5rem 0 0 0; color: #424242;">Real-time lip reading results</p>
        </div>
        """, unsafe_allow_html=True)

        # Run inference; wall-clock time is reported as a metric below.
        with st.spinner('🧠 AI model processing lip movements...'):
            start_time = time.time()
            result = inference(file_path)
            processing_time = time.time() - start_time

        # Processing metrics
        metric_col1, metric_col2 = st.columns(2)
        with metric_col1:
            st.markdown(f"""
            <div class="metric-container">
                <h3 style="color: #2E86AB; margin: 0;">{processing_time:.2f}s</h3>
                <p style="margin: 0; color: #666;">Processing Time</p>
            </div>
            """, unsafe_allow_html=True)

        with metric_col2:
            st.markdown(f"""
            <div class="metric-container">
                <h3 style="color: #2E86AB; margin: 0;">{len(result["tokens"][0])}</h3>
                <p style="margin: 0; color: #666;">Tokens Generated</p>
            </div>
            """, unsafe_allow_html=True)

        # Raw tokens output (token ids for the first — and only — batch item)
        st.markdown("**🔤 Raw Model Tokens:**")
        st.markdown(f"""
        <div style="background: #FFF3E0; padding: 1rem; border-radius: 5px; font-family: monospace; border-left: 3px solid #FF9800; font-size: 0.9rem; color: #E65100;">
        {result["tokens"][0]}
        </div>
        """, unsafe_allow_html=True)

        # Final prediction (decoded text for the first batch item)
        st.markdown("**💬 Decoded Text Prediction:**")
        st.markdown(f"""
        <div style="background: #E8F5E8; padding: 1.5rem; border-radius: 8px; border-left: 4px solid #4CAF50; font-size: 1.1rem; font-weight: bold; color: #2E7D32;">
        "{result["prediction"][0]}"
        </div>
        """, unsafe_allow_html=True)

# Success message (same condition as the analysis block above)
if options and selected_video:
    st.markdown("""
    <div class="success-card">
        <h3>✅ Analysis Complete!</h3>
        <p>MedLipReader has successfully converted lip movements to text. This technology can help bridge communication gaps in healthcare settings, ensuring better patient care for the deaf and hard-of-hearing community.</p>
    </div>
    """, unsafe_allow_html=True)

# Footer section
st.markdown("---")
st.markdown("""
<div style="text-align: center; padding: 2rem; background: #F8F9FA; border-radius: 10px; margin-top: 2rem;">
    <h4 style="color: #495057; margin-bottom: 1rem;">🌟 Impact on Healthcare Accessibility</h4>
    <p style="color: #6C757D; margin: 0;">
    MedLipReader represents a step forward in making healthcare more inclusive and accessible.
    By enabling silent communication, we're breaking down barriers and ensuring everyone receives the care they deserve.
    </p>
</div>
""", unsafe_allow_html=True)
st.markdown('<br><br><p class="subtitle">Developed by Farhan Ali Khan | Empowering Accessible Healthcare Through AI</p>', unsafe_allow_html=True)
main.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
def main():
    """Entry point for the lip-reader-hf package; prints a greeting."""
    greeting = "Hello from lip-reader-hf!"
    print(greeting)


if __name__ == "__main__":
    main()
model_utils/__init__.py ADDED
File without changes
model_utils/inference.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from .model import MyModel
3
+ from .utils import load_data
4
+ from .vocabulary import vocabulary
5
+ from .utils import ctc_greedy_decoder
6
+
7
# Legacy eager-mode loading path (requires the architecture class plus a
# state-dict checkpoint); kept commented out for reference.
# model = MyModel(vocabulary).to('cpu')
# cp = torch.load("model_utils/lipnet_model.pt", map_location=torch.device('cpu'))
# model.load_state_dict(cp["model_state_dict"])
# model.eval()

# Load the TorchScript-compiled model once at import time, forced onto CPU
# so the app runs on machines without a GPU.
model = torch.jit.load("model_utils/lipnet_model_full_scripted.pt", map_location=torch.device('cpu'))
model.eval()  # inference mode: disables dropout/batch-norm updates
14
+
15
def inference(path: str):
    """Run lip-reading inference on a single sample video.

    Args:
        path: Path to the sample's .mpg file; ``load_data`` resolves both
            the video and its matching .align file from the file stem.

    Returns:
        dict with:
            "prediction": list of decoded strings (one per batch item),
            "tokens":     list of token-id lists (one per batch item).
    """
    frames, _ = load_data(path)
    # Add a batch dimension: [T, 1, H, W] -> [1, T, 1, H, W].
    # (Renamed from `input`, which shadowed the builtin.)
    batch = frames.unsqueeze(0).to('cpu')
    with torch.no_grad():
        output = model(batch)
    result, tokens = ctc_greedy_decoder(output)
    return {"prediction": result, "tokens": tokens}
model_utils/lipnet_model_full_scripted.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:52e4097a11847f710937c7e3f00fa773096b1394045c4076ae990399dde53271
3
+ size 33926246
model_utils/tokenizer.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .vocabulary import vocabulary
2
+
3
class Tokenizer:
    """Character-level tokenizer mapping characters to integer ids and back."""

    def __init__(self, vocab=None):
        """Build the forward and reverse lookup tables.

        Args:
            vocab: Mapping of token -> id. Defaults to the module-level
                ``vocabulary`` (resolved at call time rather than at class
                definition time).
        """
        if vocab is None:
            vocab = vocabulary
        self.str_to_int = vocab
        self.int_to_str = {id: token for token, id in vocab.items()}

    def encode(self, text):
        """Encode a string into a list of token ids.

        Unknown characters map to the "<unk>" token. (Fixed: the previous
        placeholder "<|unk|>" is not in the vocabulary — which defines
        "<unk>" — so any out-of-vocabulary character raised KeyError.)
        """
        text_tokens = [token if token in self.str_to_int else "<unk>" for token in text]
        ids = [self.str_to_int[token] for token in text_tokens]
        return ids

    def decode(self, ids):
        """Decode a list of token ids back into a string."""
        text = "".join([self.int_to_str[id] for id in ids])
        return text
model_utils/utils.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import cv2
2
+ import torch
3
+ import torchvision.transforms.functional as F
4
+ import os
5
+ from .tokenizer import Tokenizer
6
+ from .vocabulary import vocabulary
7
+ import numpy as np
8
+
9
# Shared module-level tokenizer, built from the default vocabulary; used by
# ctc_greedy_decoder below for its id -> character table.
tokenizer = Tokenizer()
10
+
11
def load_video_for_gif(path):
    """Read a video and return its frames as RGB uint8 arrays, cropped to
    the mouth region, ready to be written out as an animated GIF.

    Returns:
        List of numpy arrays of shape [H, W, 3] (H=46, W=140).
    """
    capture = cv2.VideoCapture(path)
    total_frames = int(capture.get(cv2.CAP_PROP_FRAME_COUNT))

    cropped = []
    for _ in range(total_frames):
        ok, image = capture.read()
        if not ok:
            break
        rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        # Keep only the mouth region: rows 190:236, cols 80:220.
        cropped.append(rgb[190:236, 80:220, :])
    capture.release()

    # imageio expects uint8 frames.
    return [np.array(frame).astype(np.uint8) for frame in cropped]
29
+
30
def load_video(path):
    """Load a video as a per-video-normalized grayscale tensor of
    mouth-region crops.

    Args:
        path: Path to the video file.

    Returns:
        Tensor of shape [T, 1, 46, 140], normalized to zero mean and unit
        variance over the whole clip.
    """
    cap = cv2.VideoCapture(path)
    frames = []
    for _ in range(int(cap.get(cv2.CAP_PROP_FRAME_COUNT))):
        ret, frame = cap.read()
        if not ret:
            # Fixed: stop on a failed read instead of passing None to
            # cvtColor (matches the guard in load_video_for_gif).
            break
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        # Convert to torch tensor [H, W, C] -> [C, H, W]
        frame = torch.from_numpy(frame).permute(2, 0, 1).float()

        # Grayscale, then crop the mouth region (keep channel dimension)
        frame = F.rgb_to_grayscale(frame)
        frame = frame[:, 190:236, 80:220]
        frames.append(frame)
    cap.release()

    frames = torch.stack(frames)  # Shape: [T, 1, H, W]

    # Normalize (per video); epsilon guards against a zero std.
    mean = frames.mean()
    std = frames.std()
    frames = (frames - mean) / (std + 1e-8)

    return frames  # Shape: [T, 1, 46, 140]
55
+
56
def load_alignments(path: str):
    """Parse a GRID-style .align file into the spoken sentence.

    Each line has the form "<start> <end> <word>". "sil" (silence) entries
    are dropped and the remaining words are joined with single spaces.

    Returns:
        The sentence as a plain string, e.g. "bin blue at l six now".
    """
    with open(path, 'r') as f:
        fields = [line.split() for line in f.readlines()]

    # Third column is the word; skip silence markers.
    words = [entry[2] for entry in fields if entry[2] != "sil"]
    return " ".join(words)
75
+
76
def load_data(path: str):
    """Load the (video frames, alignment text) pair for one sample.

    Args:
        path: Path to the sample's .mpg (or .align) file. Only the base
            file name is used; both files are then resolved under data/.

    Returns:
        Tuple of (frames tensor [T, 1, 46, 140], alignment sentence string).
    """
    # Fixed: extract the file stem in an OS-independent way. The previous
    # path.split('\\') only worked with Windows separators; on POSIX the
    # directory prefix survived and the join below produced "data/data/...".
    file_name = os.path.splitext(os.path.basename(path.replace('\\', '/')))[0]
    video_path = os.path.join('data', f'{file_name}.mpg')
    alignment_path = os.path.join('data', f'{file_name}.align')
    frames = load_video(video_path)
    alignments = load_alignments(alignment_path)

    return frames, alignments
87
+
88
def ctc_greedy_decoder(logits_batch, vocab=None, blank_id=0):
    """Greedy (best-path) CTC decoding.

    Args:
        logits_batch: Tensor of shape (batch, time, vocab_size).
        vocab: Mapping from token id to character. Defaults to the module
            tokenizer's id -> char table (resolved at call time, so the
            function can also be used with an explicit vocabulary).
        blank_id: Id of the CTC blank token (0 per the vocabulary module).

    Returns:
        (decoded_strings, token_id_lists) — one entry per batch item.
    """
    if vocab is None:
        vocab = tokenizer.int_to_str

    # Best-path decoding: take the argmax class at every timestep.
    best_path = torch.argmax(logits_batch, dim=-1)  # (batch, time)
    output = []
    tokens = []

    for seq in best_path:  # iterate over batch
        decoded = []
        indices = []
        prev = None
        for idx in seq.tolist():
            # CTC rule: drop blanks and collapse consecutive repeats.
            if idx != blank_id and idx != prev:
                indices.append(idx)
                decoded.append(vocab[idx])
            # Fixed: track the previous frame unconditionally so that a
            # blank separates genuine repeats ("a <blank> a" -> "aa").
            # The old code updated prev only on emits, merging such repeats.
            prev = idx
        output.append("".join(decoded))
        tokens.append(indices)

    return output, tokens
model_utils/vocabulary.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
# Character inventory: lowercase letters, apostrophe, '?', '!', the digits
# 1-9 and the space character.
all_chars = "abcdefghijklmnopqrstuvwxyz'?!123456789 "

# Characters get ids 1..len(all_chars); id 0 is reserved for the CTC blank
# and "<unk>" takes the next free id after that.
vocabulary = {ch: pos for pos, ch in enumerate(all_chars, start=1)}
vocabulary["<blank>"] = 0
vocabulary["<unk>"] = len(vocabulary)
pyproject.toml ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ name = "lip-reader-hf"
3
+ version = "0.1.0"
4
+ description = "Streamlit demo app that performs AI lip reading (MedLipReader) on sample videos"
5
+ readme = "README.md"
6
+ requires-python = ">=3.13"
7
+ dependencies = [
8
+ "imageio[ffmpeg]>=2.37.0",
9
+ "moviepy>=2.2.1",
10
+ "opencv-python>=4.12.0.88",
11
+ "streamlit>=1.48.1",
12
+ "torch>=2.8.0",
13
+ "torchvision>=0.23.0",
14
+ ]