omm7 commited on
Commit
09b0ff7
Β·
verified Β·
1 Parent(s): 5b08af8

First Commit

Browse files

Hopefully everything works

Files changed (4) hide show
  1. README.md +30 -8
  2. app.py +235 -0
  3. checkpoint.weights.h5 +3 -0
  4. requirements.txt +6 -0
README.md CHANGED
@@ -1,14 +1,36 @@
1
  ---
2
- title: Lip Reader
3
- emoji: πŸš€
4
- colorFrom: pink
5
- colorTo: red
6
  sdk: gradio
7
- sdk_version: 6.9.0
8
  app_file: app.py
9
  pinned: false
10
- license: mit
11
- short_description: Reads Lips - Predicts sentences said in video without audio
12
  ---
13
 
14
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: LipNet Silent Speech Recognition
3
+ emoji: πŸ‘„
4
+ colorFrom: purple
5
+ colorTo: blue
6
  sdk: gradio
7
+ sdk_version: 4.44.0
8
  app_file: app.py
9
  pinned: false
 
 
10
  ---
11
 
12
+ # LipNet β€” Silent Speech Recognition
13
+
14
+ A deep learning model that reads lips from video and predicts spoken text β€” no audio required.
15
+
16
+ ## Model Architecture
17
+ - **3Γ— Conv3D** layers for spatiotemporal feature extraction
18
+ - **2Γ— Bidirectional LSTM** layers for sequence modelling
19
+ - **CTC Loss** for sequence-to-sequence alignment
20
+ - Input: 75 frames of mouth region (46Γ—140 px, grayscale)
21
+
22
+ ## How to Use
23
+ 1. Upload a short `.mpg` or `.mp4` video showing a frontal face
24
+ 2. Click **READ LIPS**
25
+ 3. The predicted sentence appears on the right
26
+
27
+ ## Dataset
28
+ Trained on the [GRID Corpus](https://spandh.dcs.shef.ac.uk/gridcorpus/) β€” Speaker S1.
29
+ Vocabulary: `a-z`, digits `1-9`, punctuation `'?!` and space (39 characters; 40 including the OOV token).
30
+
31
+ ## Files
32
+ ```
33
+ app.py ← Gradio app + inference
34
+ requirements.txt ← Dependencies
35
+ checkpoint.weights.h5 ← Model weights (upload manually, repo root)
36
+ ```
app.py ADDED
@@ -0,0 +1,235 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import cv2
3
+ import tempfile
4
+ import subprocess
5
+ import numpy as np
6
+ import imageio
7
+ import tensorflow as tf
8
+ from tensorflow.keras.models import Sequential
9
+ from tensorflow.keras.layers import (Conv3D, LSTM, Dense, Dropout,
10
+ Bidirectional, MaxPool3D, Activation, Reshape)
11
+ import gradio as gr
12
+
13
# ── Vocabulary ────────────────────────────────────────────────────────────────
# 39 base characters (a-z, '?!, digits 1-9, space). StringLookup reserves an
# extra OOV slot, so vocabulary_size() reports 40; the CTC blank adds one more
# output class in the model's Dense head (41 total).
vocab = list("abcdefghijklmnopqrstuvwxyz'?!123456789 ")
char_to_num = tf.keras.layers.StringLookup(vocabulary=vocab, oov_token="")
num_to_char = tf.keras.layers.StringLookup(
    vocabulary=char_to_num.get_vocabulary(), oov_token="", invert=True
)
20
# ── Build & Load Model ────────────────────────────────────────────────────────
def build_model():
    """Construct the LipNet network: 3× Conv3D → 2× BiLSTM → softmax head.

    Input: a clip of 75 grayscale mouth crops, shape (75, 46, 140, 1).
    Output: per-frame distribution over the vocabulary plus the CTC blank.
    """
    net = Sequential([
        # Spatiotemporal feature extraction; each MaxPool3D halves the two
        # spatial dims only: 46×140 → 23×70 → 11×35 → 5×17 (time stays 75).
        Conv3D(128, 3, input_shape=(75, 46, 140, 1), padding='same'),
        Activation('relu'),
        MaxPool3D((1, 2, 2)),
        Conv3D(256, 3, padding='same'),
        Activation('relu'),
        MaxPool3D((1, 2, 2)),
        Conv3D(75, 3, padding='same'),
        Activation('relu'),
        MaxPool3D((1, 2, 2)),
        # Flatten each time step: 5 * 17 spatial cells × 75 channels.
        Reshape((75, 5 * 17 * 75)),
        Bidirectional(LSTM(128, kernel_initializer='Orthogonal', return_sequences=True)),
        Dropout(0.5),
        Bidirectional(LSTM(128, kernel_initializer='Orthogonal', return_sequences=True)),
        Dropout(0.5),
        # +1 output class for the CTC blank token.
        Dense(char_to_num.vocabulary_size() + 1,
              kernel_initializer='he_normal', activation='softmax'),
    ])
    return net
40
+
41
# Instantiate the network and load pretrained weights from the repo root.
# NOTE(review): load_weights requires the architecture above to match the
# checkpoint exactly — presumably trained on GRID Speaker S1; confirm before
# changing any layer sizes.
model = build_model()
model.load_weights('checkpoint.weights.h5')
43
+
44
# ── Video Processing ──────────────────────────────────────────────────────────
def load_video_frames(path: str):
    """Read a video, crop the mouth region, and return a normalized clip.

    Args:
        path: Path to a video file readable by OpenCV.

    Returns:
        A float32 tensor of shape (75, 46, 140, 1): grayscale mouth crops,
        zero-padded or truncated to exactly 75 frames, z-score normalized.

    Raises:
        ValueError: If no frames could be decoded from the video.
    """
    cap = cv2.VideoCapture(path)
    processed_frames = []
    try:
        for _ in range(int(cap.get(cv2.CAP_PROP_FRAME_COUNT))):
            ret, frame = cap.read()
            if not ret:
                break
            # NOTE(review): cv2 decodes frames as BGR while rgb_to_grayscale
            # assumes RGB luma weights. Kept as-is to match the training
            # pipeline — confirm training used the same conversion.
            gray = tf.image.rgb_to_grayscale(tf.cast(frame, tf.float32))
            # Fixed crop of the mouth region (rows 190:236, cols 80:220),
            # i.e. 46×140 px — the model's expected input resolution.
            processed_frames.append(gray[190:236, 80:220, :])
    finally:
        # Release the capture even if decoding raises mid-loop.
        cap.release()

    if not processed_frames:
        # Previously this path crashed with an opaque IndexError when the
        # video was unreadable; fail with an explicit message instead.
        raise ValueError(f"Could not decode any frames from video: {path}")

    # Pad with zero-frames or truncate so the clip is exactly 75 frames long.
    target = 75
    if len(processed_frames) < target:
        pad = [tf.zeros_like(processed_frames[0])] * (target - len(processed_frames))
        processed_frames = processed_frames + pad
    else:
        processed_frames = processed_frames[:target]

    # Z-score normalize over the whole clip; std clamped to avoid divide-by-0
    # on constant (e.g. all-black) input.
    frames_tensor = tf.stack(processed_frames)
    mean = tf.math.reduce_mean(frames_tensor)
    std = tf.maximum(tf.math.reduce_std(tf.cast(frames_tensor, tf.float32)), 1e-8)
    return tf.cast((frames_tensor - mean), tf.float32) / std
67
+
68
+
69
def convert_to_mp4(input_path: str) -> str:
    """Re-encode *input_path* to a browser-friendly H.264/AAC MP4 via ffmpeg.

    Args:
        input_path: Path to the uploaded video (e.g. .mpg from GRID).

    Returns:
        The path of the converted MP4 on success; the original path unchanged
        if ffmpeg is missing or the conversion fails (best-effort fallback).
    """
    out = tempfile.NamedTemporaryFile(suffix='.mp4', delete=False)
    out.close()
    try:
        subprocess.run(
            ['ffmpeg', '-y', '-i', input_path, '-vcodec', 'libx264', '-acodec', 'aac', out.name],
            check=True, capture_output=True
        )
        return out.name
    except Exception:
        # Best-effort fallback — but don't leak the temp file that the
        # failed conversion left behind (the original code did).
        try:
            os.unlink(out.name)
        except OSError:
            pass
        return input_path
80
+
81
+
82
def make_mouth_gif(frames_tensor) -> str:
    """Render the normalized mouth-crop frames as an animated GIF.

    Each frame is independently min-max rescaled to [0, 1], replicated into
    three channels, and written at 10 fps with infinite looping.

    Returns:
        Filesystem path of the generated GIF.
    """
    rgb_frames = []
    for frame in frames_tensor.numpy():
        channel = frame[:, :, 0]
        # Per-frame min-max rescale; skip division when the frame is constant.
        channel = channel - channel.min()
        span = channel.max()
        if span > 0:
            channel = channel / span
        rgb = np.stack([channel] * 3, axis=-1)
        rgb_frames.append((rgb * 255).astype(np.uint8))

    gif_file = tempfile.NamedTemporaryFile(suffix='.gif', delete=False)
    gif_file.close()
    imageio.mimsave(gif_file.name, rgb_frames, fps=10, loop=0)
    return gif_file.name
97
+
98
+
99
# ── Inference ─────────────────────────────────────────────────────────────────
def predict(video_path: str):
    """Run the full lip-reading pipeline on one uploaded video.

    Returns a 5-tuple matching the Gradio outputs:
    (mp4 preview path, mouth-crop GIF path, raw CTC token indices as text,
     decoded sentence, average confidence as a percentage string).
    On any failure the media slots are None and the error message fills the
    text slots.
    """
    if video_path is None:
        return None, None, "Upload a video first.", "(no prediction)", "—"
    try:
        frames = load_video_frames(video_path)
        preview_path = convert_to_mp4(video_path)
        gif_path = make_mouth_gif(frames)

        # Batch of one: shape (1, 75, 46, 140, 1).
        probs = model.predict(tf.expand_dims(frames, axis=0), verbose=0)

        # Greedy CTC decode over the full 75-step sequence.
        decoded = tf.keras.backend.ctc_decode(
            probs, input_length=[75], greedy=True
        )[0][0].numpy()

        token_text = str(decoded[0].tolist())
        sentence = tf.strings.reduce_join(
            num_to_char(decoded[0])
        ).numpy().decode('utf-8').strip() or "(no prediction)"

        # Mean of the per-step max class probability, shown as a percentage.
        avg_conf = float(np.mean(np.max(probs[0], axis=-1)) * 100)

        return preview_path, gif_path, token_text, sentence, f"{avg_conf:.1f}%"

    except Exception as e:
        err = f"Error: {str(e)}"
        return None, None, err, err, "—"
127
+
128
+
129
# ── CSS ───────────────────────────────────────────────────────────────────────
# Custom theme injected into gr.Blocks: dark background, Syne display font for
# headings / the prediction box, Space Mono for labels and raw token output.
# Class names below (.hero, .mono-out, .prediction-out, .confidence-out,
# .section-label, .info-panel) are referenced via elem_classes / gr.HTML in the
# UI section.
css = """
@import url('https://fonts.googleapis.com/css2?family=Syne:wght@400;600;700;800&family=Space+Mono:ital@0;1&display=swap');

body, .gradio-container { background: #07070f !important; font-family: 'Syne', sans-serif !important; color: #e2e2f0 !important; }

.hero { text-align: center; padding: 2.5rem 1rem 0.5rem; }
.hero h1 { font-size: 3.5rem; font-weight: 800; letter-spacing: -0.04em; background: linear-gradient(135deg, #f0f0ff 0%, #c084fc 40%, #818cf8 100%); -webkit-background-clip: text; -webkit-text-fill-color: transparent; margin: 0 0 0.3rem; line-height: 1; }
.hero .sub { font-family: 'Space Mono', monospace; font-size: 0.72rem; color: #4b5563; letter-spacing: 0.18em; text-transform: uppercase; }
.hero .badge { display: inline-block; margin-top: 0.7rem; padding: 0.25rem 0.75rem; border: 1px solid #2d2d4e; border-radius: 999px; font-family: 'Space Mono', monospace; font-size: 0.68rem; color: #7c7c9e; background: #0f0f1e; }

.section-label { font-family: 'Space Mono', monospace; font-size: 0.68rem; letter-spacing: 0.15em; text-transform: uppercase; color: #4b5563; margin-bottom: 0.4rem; padding-left: 2px; }

.divider { border: none; border-top: 1px solid #1a1a2e; margin: 1.2rem 0; }

.mono-out textarea { font-family: 'Space Mono', monospace !important; font-size: 0.82rem !important; background: #0a0a16 !important; color: #a5b4fc !important; border: 1px solid #1e1e38 !important; border-radius: 10px !important; }

.prediction-out textarea { font-family: 'Syne', sans-serif !important; font-size: 1.6rem !important; font-weight: 700 !important; background: #0a0a16 !important; color: #c084fc !important; border: 1px solid #2d1f4e !important; border-radius: 10px !important; text-align: center !important; }

.confidence-out textarea { font-family: 'Space Mono', monospace !important; font-size: 1.1rem !important; background: #0a0a16 !important; color: #34d399 !important; border: 1px solid #1a3330 !important; border-radius: 10px !important; text-align: center !important; }

button.lg { background: linear-gradient(135deg, #7c3aed 0%, #4f46e5 100%) !important; border: none !important; border-radius: 10px !important; font-family: 'Syne', sans-serif !important; font-weight: 700 !important; font-size: 1rem !important; letter-spacing: 0.06em !important; color: white !important; }

.info-panel { background: #0c0c1a; border: 1px solid #1a1a2e; border-radius: 12px; padding: 1rem 1.2rem; }
.info-panel p { font-family: 'Space Mono', monospace; font-size: 0.72rem; color: #374151; margin: 0; line-height: 2; }
.info-panel span { color: #6366f1; }
"""
156
+
157
# ── UI ────────────────────────────────────────────────────────────────────────
# Layout: hero header, then three rows —
#   ① upload + run button  |  ② converted mp4 preview
#   ③ mouth-crop GIF       |  ④ raw CTC token indices
#   ⑤ predicted sentence   |  ⑥ average confidence
# followed by a static architecture/info panel. The button wires predict() to
# all five output components in order.
with gr.Blocks(css=css, title="LipNet — Silent Speech Recognition") as demo:

    # Hero header (styled by the .hero CSS rules above).
    gr.HTML("""
    <div class="hero">
        <h1>LipNet</h1>
        <p class="sub">Silent Speech Recognition · No Audio Required</p>
        <span class="badge">Conv3D → BiLSTM × 2 → CTC Decode · GRID Corpus S1</span>
    </div>
    <div style="height:1.5rem"></div>
    """)

    # ── Row 1: Upload + Preview ───────────────────────────────────────────────
    with gr.Row(equal_height=True):
        with gr.Column(scale=1):
            gr.HTML("<div class='section-label'>① Upload Video (.mpg / .mp4)</div>")
            video_input = gr.Video(label="", height=260, sources=["upload"])
            submit_btn = gr.Button("▶ READ LIPS", variant="primary", size="lg")

        with gr.Column(scale=1):
            gr.HTML("<div class='section-label'>② Converted Preview (mp4)</div>")
            # Filled by predict() with the H.264 re-encode from convert_to_mp4.
            video_preview = gr.Video(label="", height=260, interactive=False)

    gr.HTML("<hr class='divider'>")

    # ── Row 2: Mouth GIF + Tokens ─────────────────────────────────────────────
    with gr.Row(equal_height=True):
        with gr.Column(scale=1):
            gr.HTML("<div class='section-label'>③ What the Model Sees — mouth crop · grayscale · normalized</div>")
            # Filled by predict() with the GIF path from make_mouth_gif.
            gif_preview = gr.Image(label="", height=200, type="filepath")

        with gr.Column(scale=1):
            gr.HTML("<div class='section-label'>④ Raw CTC Token Indices</div>")
            tokens_out = gr.Textbox(
                label="", lines=5, interactive=False,
                placeholder="Token indices will appear here...",
                elem_classes=["mono-out"]
            )

    gr.HTML("<hr class='divider'>")

    # ── Row 3: Prediction + Confidence ────────────────────────────────────────
    with gr.Row():
        with gr.Column(scale=3):
            gr.HTML("<div class='section-label'>⑤ Predicted Text</div>")
            prediction_out = gr.Textbox(
                label="", lines=2, interactive=False,
                placeholder="Prediction will appear here...",
                elem_classes=["prediction-out"]
            )
        with gr.Column(scale=1):
            gr.HTML("<div class='section-label'>⑥ Avg Confidence</div>")
            confidence_out = gr.Textbox(
                label="", lines=2, interactive=False,
                placeholder="—", elem_classes=["confidence-out"]
            )

    gr.HTML("<hr class='divider'>")

    # Static model/info panel (styled by .info-panel).
    gr.HTML("""
    <div class="info-panel">
        <p>
        <span>ARCHITECTURE</span> · Conv3D(128) → Conv3D(256) → Conv3D(75) → Reshape → BiLSTM(128)×2 → Dense(41) → CTC<br>
        <span>INPUT</span> · 75 frames · mouth crop 46×140 px · grayscale · z-score normalized<br>
        <span>VOCAB</span> · 40 chars — a–z, 1–9, ' ? ! (space) · output dim = 41 (+ CTC blank token)<br>
        <span>DATASET</span> · GRID Corpus Speaker S1 · 500 videos · 450 train / 50 test<br>
        <span>NOTE</span> · Upload frontal-face .mpg or .mp4 videos for best results
        </p>
    </div>
    <div style="height:1.5rem"></div>
    """)

    # Wire the button: predict() returns exactly these five values in order.
    submit_btn.click(
        fn=predict,
        inputs=[video_input],
        outputs=[video_preview, gif_preview, tokens_out, prediction_out, confidence_out]
    )

# Launched at module level — standard for Hugging Face Spaces, which execute
# app.py directly (importing this module will also start the server).
demo.launch()
checkpoint.weights.h5 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2c83685a701a669da61e49860463943d0a5fd0a52cbe813c3b2b3ddf075fd3c0
3
+ size 101741136
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ tensorflow-cpu==2.15.0
2
+ opencv-python-headless==4.9.0.80
3
+ gradio==4.44.0
4
+ numpy==1.26.4
5
+ imageio==2.34.0
6
+ ffmpeg-python==0.2.0