Spaces:
Build error
Build error
First Commit
Browse filesHopefully everything works
- README.md +30 -8
- app.py +235 -0
- checkpoint.weights.h5 +3 -0
- requirements.txt +6 -0
README.md
CHANGED
|
@@ -1,14 +1,36 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: gradio
|
| 7 |
-
sdk_version:
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
-
license: mit
|
| 11 |
-
short_description: Reads Lips - Predicts sentences said in video without audio
|
| 12 |
---
|
| 13 |
|
| 14 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: LipNet Silent Speech Recognition
|
| 3 |
+
emoji: π
|
| 4 |
+
colorFrom: purple
|
| 5 |
+
colorTo: blue
|
| 6 |
sdk: gradio
|
| 7 |
+
sdk_version: 4.44.0
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
|
|
|
|
|
|
| 10 |
---
|
| 11 |
|
| 12 |
+
# LipNet — Silent Speech Recognition
|
| 13 |
+
|
| 14 |
+
A deep learning model that reads lips from video and predicts spoken text — no audio required.
|
| 15 |
+
|
| 16 |
+
## Model Architecture
|
| 17 |
+
- **3× Conv3D** layers for spatiotemporal feature extraction
|
| 18 |
+
- **2× Bidirectional LSTM** layers for sequence modelling
|
| 19 |
+
- **CTC Loss** for sequence-to-sequence alignment
|
| 20 |
+
- Input: 75 frames of mouth region (46×140 px, grayscale)
|
| 21 |
+
|
| 22 |
+
## How to Use
|
| 23 |
+
1. Upload a short `.mpg` or `.mp4` video showing a frontal face
|
| 24 |
+
2. Click **READ LIPS**
|
| 25 |
+
3. The predicted sentence appears on the right
|
| 26 |
+
|
| 27 |
+
## Dataset
|
| 28 |
+
Trained on the [GRID Corpus](https://spandh.dcs.shef.ac.uk/gridcorpus/) — Speaker S1.
|
| 29 |
+
Vocabulary: `a-z`, digits `1-9`, punctuation `'?!` and space (39 characters; 40 including the lookup layer's OOV token).
|
| 30 |
+
|
| 31 |
+
## Files
|
| 32 |
+
```
|
| 33 |
+
app.py → Gradio app + inference
|
| 34 |
+
requirements.txt → Dependencies
|
| 35 |
+
models/checkpoint.weights.h5 → Model weights (upload manually)
|
| 36 |
+
```
|
app.py
ADDED
|
@@ -0,0 +1,235 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import cv2
|
| 3 |
+
import tempfile
|
| 4 |
+
import subprocess
|
| 5 |
+
import numpy as np
|
| 6 |
+
import imageio
|
| 7 |
+
import tensorflow as tf
|
| 8 |
+
from tensorflow.keras.models import Sequential
|
| 9 |
+
from tensorflow.keras.layers import (Conv3D, LSTM, Dense, Dropout,
|
| 10 |
+
Bidirectional, MaxPool3D, Activation, Reshape)
|
| 11 |
+
import gradio as gr
|
| 12 |
+
|
| 13 |
+
# ── Vocabulary ────────────────────────────────────────────────────────────────
# 39 raw characters: a-z, apostrophe, '?', '!', digits 1-9, and space.
vocab = [x for x in "abcdefghijklmnopqrstuvwxyz'?!123456789 "]
# Character -> integer-id lookup; the empty-string OOV token adds one extra
# slot, so vocabulary_size() is 40 (the model's Dense layer adds +1 for the
# CTC blank on top of that).
char_to_num = tf.keras.layers.StringLookup(vocabulary=vocab, oov_token="")
# Inverse lookup used to turn decoded CTC indices back into characters.
num_to_char = tf.keras.layers.StringLookup(
    vocabulary=char_to_num.get_vocabulary(), oov_token="", invert=True
)
|
| 19 |
+
|
| 20 |
+
# ββ Build & Load Model ββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 21 |
+
def build_model():
    """Construct the LipNet network: 3x Conv3D -> 2x BiLSTM -> softmax.

    The input is a clip of 75 grayscale mouth-crop frames of 46x140 px
    (shape (75, 46, 140, 1)); the output is one softmax distribution per
    frame over the character vocabulary plus one extra class reserved for
    the CTC blank token.
    """
    # Three (1, 2, 2) poolings halve the spatial grid each time:
    # 46x140 -> 23x70 -> 11x35 -> 5x17, with 75 channels at the end,
    # so each time step flattens to 5 * 17 * 75 features.
    per_step_features = 5 * 17 * 75

    layer_stack = [
        Conv3D(128, 3, input_shape=(75, 46, 140, 1), padding='same'),
        Activation('relu'),
        MaxPool3D((1, 2, 2)),
        Conv3D(256, 3, padding='same'),
        Activation('relu'),
        MaxPool3D((1, 2, 2)),
        Conv3D(75, 3, padding='same'),
        Activation('relu'),
        MaxPool3D((1, 2, 2)),
        # Collapse each frame's spatial grid into a flat feature vector,
        # keeping the 75-step time axis for the recurrent layers.
        Reshape((75, per_step_features)),
        Bidirectional(LSTM(128, kernel_initializer='Orthogonal', return_sequences=True)),
        Dropout(0.5),
        Bidirectional(LSTM(128, kernel_initializer='Orthogonal', return_sequences=True)),
        Dropout(0.5),
        # +1 output class for the CTC blank token.
        Dense(char_to_num.vocabulary_size() + 1,
              kernel_initializer='he_normal', activation='softmax'),
    ]

    net = Sequential()
    for layer in layer_stack:
        net.add(layer)
    return net
|
| 40 |
+
|
| 41 |
+
# Instantiate the network once at import time and load pretrained weights.
# NOTE(review): the README's file listing says models/checkpoint.weights.h5,
# but the checkpoint is committed at the repo root — confirm the path used
# here matches where the weights actually live.
model = build_model()
model.load_weights('checkpoint.weights.h5')
|
| 43 |
+
|
| 44 |
+
# ── Video Processing ──────────────────────────────────────────────────────────
def load_video_frames(path: str):
    """Read a video, crop the mouth region, and return a normalized clip.

    Each frame is converted to grayscale and cropped to rows 190:236 and
    columns 80:220 — a fixed 46x140 px mouth window (presumably tuned for
    GRID-corpus framing; off-center faces will crop the wrong region).
    The clip is padded with black frames or truncated to exactly 75 frames,
    then z-score normalized over the whole clip.

    Args:
        path: Filesystem path to the input video.

    Returns:
        A float32 tensor of shape (75, 46, 140, 1).

    Raises:
        ValueError: If no frames could be read from the video.
    """
    cap = cv2.VideoCapture(path)
    processed_frames = []
    try:
        for _ in range(int(cap.get(cv2.CAP_PROP_FRAME_COUNT))):
            ret, frame = cap.read()
            if not ret:
                break
            gray = tf.image.rgb_to_grayscale(tf.cast(frame, tf.float32))
            processed_frames.append(gray[190:236, 80:220, :])
    finally:
        # Release the capture even if decoding raises mid-loop.
        cap.release()

    # Fix: the original indexed processed_frames[0] unconditionally in the
    # padding branch, raising a bare IndexError for empty/unreadable videos.
    # Raise a descriptive error instead (predict() surfaces it in the UI).
    if not processed_frames:
        raise ValueError(f"Could not read any frames from video: {path}")

    target = 75
    if len(processed_frames) < target:
        # Pad short clips with black frames so the model always sees 75.
        pad = [tf.zeros_like(processed_frames[0])] * (target - len(processed_frames))
        processed_frames = processed_frames + pad
    else:
        processed_frames = processed_frames[:target]

    frames_tensor = tf.stack(processed_frames)
    mean = tf.math.reduce_mean(frames_tensor)
    # Clamp std away from zero (e.g. an all-black clip) to avoid div-by-zero.
    std = tf.maximum(tf.math.reduce_std(tf.cast(frames_tensor, tf.float32)), 1e-8)
    return tf.cast((frames_tensor - mean), tf.float32) / std
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def convert_to_mp4(input_path: str) -> str:
    """Re-encode a video to a browser-playable H.264/AAC mp4 via ffmpeg.

    Best-effort: if ffmpeg is missing or the conversion fails, the original
    path is returned unchanged so the caller can still try to display it.

    Args:
        input_path: Path to the uploaded video (e.g. .mpg).

    Returns:
        Path to the converted .mp4 on success, otherwise ``input_path``.
    """
    out = tempfile.NamedTemporaryFile(suffix='.mp4', delete=False)
    out.close()
    try:
        subprocess.run(
            ['ffmpeg', '-y', '-i', input_path, '-vcodec', 'libx264', '-acodec', 'aac', out.name],
            check=True, capture_output=True
        )
        return out.name
    except (subprocess.CalledProcessError, OSError):
        # Fix: the original caught bare Exception and leaked the delete=False
        # temp file on every failed conversion. Narrow the handler (ffmpeg
        # missing -> FileNotFoundError/OSError; nonzero exit ->
        # CalledProcessError) and remove the orphaned output file.
        try:
            os.remove(out.name)
        except OSError:
            pass
        return input_path
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
def make_mouth_gif(frames_tensor) -> str:
    """Render the normalized mouth-crop frames as an animated GIF.

    Each grayscale frame is independently min-max rescaled to [0, 1],
    replicated across three channels to make an RGB image, and written to
    a temporary .gif that loops forever at 10 fps.

    Args:
        frames_tensor: Tensor of shape (75, 46, 140, 1) from
            load_video_frames (z-score normalized, so values can be
            negative).

    Returns:
        Path to the generated GIF file.
    """
    rendered = []
    for frame in frames_tensor.numpy():
        plane = frame[:, :, 0]
        # Shift to zero minimum, then scale by the range (guarding the
        # degenerate all-constant frame, whose range is zero).
        plane = plane - plane.min()
        span = plane.max()
        if span > 0:
            plane = plane / span
        rgb = np.stack([plane, plane, plane], axis=-1)
        rendered.append((rgb * 255).astype(np.uint8))

    gif_file = tempfile.NamedTemporaryFile(suffix='.gif', delete=False)
    gif_file.close()
    imageio.mimsave(gif_file.name, rendered, fps=10, loop=0)
    return gif_file.name
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
# ── Inference ─────────────────────────────────────────────────────────────────
def predict(video_path: str):
    """Run the full lip-reading pipeline on one uploaded video.

    Pipeline: load and normalize 75 mouth-crop frames -> convert the upload
    to a browser-playable mp4 -> render a GIF of what the model sees ->
    run the model -> greedy CTC decode -> map indices back to characters.

    Returns a 5-tuple matching the Gradio outputs:
    (mp4 preview path, gif path, raw CTC token indices as a string,
    decoded sentence, average per-frame confidence as a percent string).
    Failures are surfaced through the text outputs rather than raised.
    """
    if video_path is None:
        return None, None, "Upload a video first.", "(no prediction)", "β"
    try:
        frames_tensor = load_video_frames(video_path)
        mp4_path = convert_to_mp4(video_path)
        gif_path = make_mouth_gif(frames_tensor)

        # Add the batch dimension: (1, 75, 46, 140, 1).
        inp = tf.expand_dims(frames_tensor, axis=0)
        yhat = model.predict(inp, verbose=0)

        # Greedy CTC decode over all 75 time steps (batch of one).
        decoded_indices = tf.keras.backend.ctc_decode(
            yhat, input_length=[75], greedy=True
        )[0][0].numpy()

        # Raw index sequence for the debug panel.
        tokens_str = str(decoded_indices[0].tolist())
        prediction = tf.strings.reduce_join(
            num_to_char(decoded_indices[0])
        ).numpy().decode('utf-8').strip() or "(no prediction)"

        # Mean of the per-frame max softmax probability — a rough confidence
        # proxy, not a calibrated probability.
        confidence = float(np.mean(np.max(yhat[0], axis=-1)) * 100)

        return mp4_path, gif_path, tokens_str, prediction, f"{confidence:.1f}%"

    except Exception as e:
        # Broad catch is deliberate: the UI should display the error message
        # in the output boxes instead of crashing the Space.
        err = f"Error: {str(e)}"
        return None, None, err, err, "β"
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
# ── CSS ───────────────────────────────────────────────────────────────────────
# Custom dark theme injected into gr.Blocks: Syne for display text, Space Mono
# for the monospace debug panels, purple/indigo accent gradient. The class
# names (.mono-out, .prediction-out, .confidence-out) are attached to widgets
# below via elem_classes.
css = """
@import url('https://fonts.googleapis.com/css2?family=Syne:wght@400;600;700;800&family=Space+Mono:ital@0;1&display=swap');

body, .gradio-container { background: #07070f !important; font-family: 'Syne', sans-serif !important; color: #e2e2f0 !important; }

.hero { text-align: center; padding: 2.5rem 1rem 0.5rem; }
.hero h1 { font-size: 3.5rem; font-weight: 800; letter-spacing: -0.04em; background: linear-gradient(135deg, #f0f0ff 0%, #c084fc 40%, #818cf8 100%); -webkit-background-clip: text; -webkit-text-fill-color: transparent; margin: 0 0 0.3rem; line-height: 1; }
.hero .sub { font-family: 'Space Mono', monospace; font-size: 0.72rem; color: #4b5563; letter-spacing: 0.18em; text-transform: uppercase; }
.hero .badge { display: inline-block; margin-top: 0.7rem; padding: 0.25rem 0.75rem; border: 1px solid #2d2d4e; border-radius: 999px; font-family: 'Space Mono', monospace; font-size: 0.68rem; color: #7c7c9e; background: #0f0f1e; }

.section-label { font-family: 'Space Mono', monospace; font-size: 0.68rem; letter-spacing: 0.15em; text-transform: uppercase; color: #4b5563; margin-bottom: 0.4rem; padding-left: 2px; }

.divider { border: none; border-top: 1px solid #1a1a2e; margin: 1.2rem 0; }

.mono-out textarea { font-family: 'Space Mono', monospace !important; font-size: 0.82rem !important; background: #0a0a16 !important; color: #a5b4fc !important; border: 1px solid #1e1e38 !important; border-radius: 10px !important; }

.prediction-out textarea { font-family: 'Syne', sans-serif !important; font-size: 1.6rem !important; font-weight: 700 !important; background: #0a0a16 !important; color: #c084fc !important; border: 1px solid #2d1f4e !important; border-radius: 10px !important; text-align: center !important; }

.confidence-out textarea { font-family: 'Space Mono', monospace !important; font-size: 1.1rem !important; background: #0a0a16 !important; color: #34d399 !important; border: 1px solid #1a3330 !important; border-radius: 10px !important; text-align: center !important; }

button.lg { background: linear-gradient(135deg, #7c3aed 0%, #4f46e5 100%) !important; border: none !important; border-radius: 10px !important; font-family: 'Syne', sans-serif !important; font-weight: 700 !important; font-size: 1rem !important; letter-spacing: 0.06em !important; color: white !important; }

.info-panel { background: #0c0c1a; border: 1px solid #1a1a2e; border-radius: 12px; padding: 1rem 1.2rem; }
.info-panel p { font-family: 'Space Mono', monospace; font-size: 0.72rem; color: #374151; margin: 0; line-height: 2; }
.info-panel span { color: #6366f1; }
"""
|
| 156 |
+
|
| 157 |
+
# ── UI ────────────────────────────────────────────────────────────────────────
# Layout: hero banner, then three rows — (1) upload + converted preview,
# (2) mouth-crop GIF + raw CTC tokens, (3) decoded text + confidence —
# followed by a static architecture/info panel. The READ LIPS button drives
# predict() and fans its 5-tuple out to the five output widgets.
with gr.Blocks(css=css, title="LipNet β Silent Speech Recognition") as demo:

    # Hero banner (static HTML styled by the css string above).
    gr.HTML("""
    <div class="hero">
        <h1>LipNet</h1>
        <p class="sub">Silent Speech Recognition Β· No Audio Required</p>
        <span class="badge">Conv3D β BiLSTM Γ 2 β CTC Decode Β· GRID Corpus S1</span>
    </div>
    <div style="height:1.5rem"></div>
    """)

    # ── Row 1: Upload + Preview ───────────────────────────────────────────────
    with gr.Row(equal_height=True):
        with gr.Column(scale=1):
            gr.HTML("<div class='section-label'>β Upload Video (.mpg / .mp4)</div>")
            video_input = gr.Video(label="", height=260, sources=["upload"])
            submit_btn = gr.Button("βΆ READ LIPS", variant="primary", size="lg")

        with gr.Column(scale=1):
            gr.HTML("<div class='section-label'>β‘ Converted Preview (mp4)</div>")
            # Read-only: filled by predict() with the ffmpeg-converted mp4.
            video_preview = gr.Video(label="", height=260, interactive=False)

    gr.HTML("<hr class='divider'>")

    # ── Row 2: Mouth GIF + Tokens ─────────────────────────────────────────────
    with gr.Row(equal_height=True):
        with gr.Column(scale=1):
            gr.HTML("<div class='section-label'>β’ What the Model Sees β mouth crop Β· grayscale Β· normalized</div>")
            # type="filepath" so predict() can return the GIF's temp path.
            gif_preview = gr.Image(label="", height=200, type="filepath")

        with gr.Column(scale=1):
            gr.HTML("<div class='section-label'>β£ Raw CTC Token Indices</div>")
            tokens_out = gr.Textbox(
                label="", lines=5, interactive=False,
                placeholder="Token indices will appear here...",
                elem_classes=["mono-out"]
            )

    gr.HTML("<hr class='divider'>")

    # ── Row 3: Prediction + Confidence ────────────────────────────────────────
    with gr.Row():
        with gr.Column(scale=3):
            gr.HTML("<div class='section-label'>β€ Predicted Text</div>")
            prediction_out = gr.Textbox(
                label="", lines=2, interactive=False,
                placeholder="Prediction will appear here...",
                elem_classes=["prediction-out"]
            )
        with gr.Column(scale=1):
            gr.HTML("<div class='section-label'>β₯ Avg Confidence</div>")
            confidence_out = gr.Textbox(
                label="", lines=2, interactive=False,
                placeholder="β", elem_classes=["confidence-out"]
            )

    gr.HTML("<hr class='divider'>")

    # Static info panel summarizing architecture, input format, and dataset.
    gr.HTML("""
    <div class="info-panel">
        <p>
        <span>ARCHITECTURE</span> Β· Conv3D(128) β Conv3D(256) β Conv3D(75) β Reshape β BiLSTM(128)Γ2 β Dense(41) β CTC<br>
        <span>INPUT</span> Β· 75 frames Β· mouth crop 46Γ140 px Β· grayscale Β· z-score normalized<br>
        <span>VOCAB</span> Β· 40 chars β aβz, 1β9, ' ? ! (space) Β· output dim = 41 (+ CTC blank token)<br>
        <span>DATASET</span> Β· GRID Corpus Speaker S1 Β· 500 videos Β· 450 train / 50 test<br>
        <span>NOTE</span> Β· Upload frontal-face .mpg or .mp4 videos for best results
        </p>
    </div>
    <div style="height:1.5rem"></div>
    """)

    # Wire the button to the inference pipeline; outputs map 1:1 onto
    # predict()'s returned 5-tuple.
    submit_btn.click(
        fn=predict,
        inputs=[video_input],
        outputs=[video_preview, gif_preview, tokens_out, prediction_out, confidence_out]
    )

demo.launch()
|
checkpoint.weights.h5
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2c83685a701a669da61e49860463943d0a5fd0a52cbe813c3b2b3ddf075fd3c0
|
| 3 |
+
size 101741136
|
requirements.txt
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
tensorflow-cpu==2.15.0
|
| 2 |
+
opencv-python-headless==4.9.0.80
|
| 3 |
+
gradio==4.44.0
|
| 4 |
+
numpy==1.26.4
|
| 5 |
+
imageio==2.34.0
|
| 6 |
+
ffmpeg-python==0.2.0
|