omm7 committed on
Commit
06b5e5b
·
verified ·
1 Parent(s): cdae4af

Upload app/app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app/app.py +227 -0
app/app.py ADDED
@@ -0,0 +1,227 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
from __future__ import annotations

import subprocess
import tempfile
from pathlib import Path

import imageio
import numpy as np
import streamlit as st
import tensorflow as tf

from modelutil import load_model
from utils import load_data, num_to_char

# ── Page config ───────────────────────────────────────────────────────────────
st.set_page_config(
    page_title="LipNet — Silent Speech Recognition",
    page_icon="👄",
    layout="wide",
)

# ── Custom CSS ────────────────────────────────────────────────────────────────
# Dark theme: Syne for headings, Space Mono for data/labels.
st.markdown("""
<style>
@import url('https://fonts.googleapis.com/css2?family=Syne:wght@400;700;800&family=Space+Mono&display=swap');

html, body, [class*="css"] {
    font-family: 'Syne', sans-serif;
    background-color: #07070f;
    color: #e2e2f0;
}
.stApp { background-color: #07070f; }

/* Sidebar */
[data-testid="stSidebar"] {
    background-color: #0f0f1c !important;
    border-right: 1px solid #1e1e32;
}
[data-testid="stSidebar"] * { color: #9ca3af !important; }

/* Headers */
h1 {
    font-weight: 800 !important;
    background: linear-gradient(135deg, #f0f0ff, #c084fc, #818cf8);
    -webkit-background-clip: text;
    -webkit-text-fill-color: transparent;
    letter-spacing: -0.03em;
}
h2, h3 { color: #c084fc !important; font-weight: 700 !important; }

/* Info / success boxes */
.stAlert { border-radius: 10px !important; }
[data-testid="stInfo"] {
    background: #0f0f1c !important;
    border: 1px solid #2d2d4e !important;
    color: #a5b4fc !important;
    font-family: 'Space Mono', monospace;
    font-size: 0.82rem;
}
[data-testid="stSuccess"] {
    background: #0a1a14 !important;
    border: 1px solid #1a3330 !important;
    color: #34d399 !important;
    font-family: 'Space Mono', monospace;
    font-size: 1.1rem;
}

/* Code / preformatted */
code, pre {
    font-family: 'Space Mono', monospace !important;
    background: #0a0a16 !important;
    color: #a5b4fc !important;
    border-radius: 8px !important;
    font-size: 0.8rem !important;
}

/* Selectbox */
[data-testid="stSelectbox"] label { color: #6b7280 !important; font-size: 0.8rem; letter-spacing: 0.1em; text-transform: uppercase; }

/* Divider */
hr { border-color: #1a1a2e !important; }
</style>
""", unsafe_allow_html=True)

# ── Sidebar ───────────────────────────────────────────────────────────────────
# Static project overview: architecture summary and dataset facts.
with st.sidebar:
    st.markdown("## 👄 LipNet")
    st.markdown(
        "<p style='font-family:Space Mono,monospace;font-size:0.72rem;color:#4b5563;"
        "letter-spacing:0.1em;'>SILENT SPEECH RECOGNITION</p>",
        unsafe_allow_html=True,
    )
    st.divider()
    st.markdown("**Architecture**")
    st.markdown("""
<p style='font-family:Space Mono,monospace;font-size:0.72rem;line-height:2;color:#4b5563;'>
Conv3D(128) ↓<br>
Conv3D(256) ↓<br>
Conv3D(75) ↓<br>
Reshape ↓<br>
BiLSTM(128) ↓<br>
BiLSTM(128) ↓<br>
Dense(41) + CTC
</p>
""", unsafe_allow_html=True)
    st.divider()
    st.markdown("**Dataset**")
    st.markdown(
        "<p style='font-family:Space Mono,monospace;font-size:0.72rem;color:#4b5563;"
        "line-height:2;'>GRID Corpus · Speaker S1<br>500 videos<br>"
        "450 train / 50 test<br>Vocab: a–z 1–9 ' ? ! (space)</p>",
        unsafe_allow_html=True,
    )
    st.divider()
    st.caption("No audio. Lips only.")

# ── Title ─────────────────────────────────────────────────────────────────────
st.title("LipNet — Silent Speech Recognition")
st.markdown(
    "<p style='font-family:Space Mono,monospace;font-size:0.78rem;color:#4b5563;"
    "letter-spacing:0.15em;margin-top:-1rem;'>CONV3D + BILSTM + CTC · NO AUDIO REQUIRED</p>",
    unsafe_allow_html=True,
)
st.divider()

# ── Data paths ────────────────────────────────────────────────────────────────
BASE_DIR = Path(__file__).resolve().parent
DATA_DIR = BASE_DIR / 'data' / 's1'

# Sorted list of available sample clips (GRID corpus .mpg files).
options = sorted(item.name for item in DATA_DIR.glob('*.mpg'))
if not options:
    st.error(f"No `.mpg` videos found in `{DATA_DIR}`. Make sure `data/s1/` is populated.")
    st.stop()  # halt the script run — nothing below can work without data

selected_video = st.selectbox("**Choose a video**", options)
file_path = DATA_DIR / selected_video

st.divider()

# ── Load model (cached) ───────────────────────────────────────────────────────
@st.cache_resource(show_spinner="Loading LipNet model...")
def get_model():
    """Load the LipNet model once per process; Streamlit caches the instance."""
    return load_model()

model = get_model()

# ── Two-column layout ─────────────────────────────────────────────────────────
col1, col2 = st.columns(2, gap="large")

# ── Column 1: Video preview ───────────────────────────────────────────────────
with col1:
    st.markdown("### 📹 Original Video")
    st.info("Video converted to mp4 for browser playback")

    output_path = None
    try:
        # Reserve a temp file name only; ffmpeg (-y) overwrites it with the mp4.
        # Keeping the handle closed before ffmpeg writes avoids sharing issues.
        with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as f:
            output_path = Path(f.name)
        subprocess.run(
            ["ffmpeg", "-y", "-i", str(file_path), "-vcodec", "libx264",
             "-crf", "23", str(output_path)],
            check=True, capture_output=True, text=True,
        )
        st.video(output_path.read_bytes())
    except FileNotFoundError:
        # ffmpeg binary is not on PATH — report instead of crashing the app.
        st.error("`ffmpeg` not found on PATH. Install ffmpeg to enable video preview.")
    except subprocess.CalledProcessError as exc:
        st.error("ffmpeg conversion failed.")
        st.code(exc.stderr or "No error output.")
    finally:
        # Always remove the temp mp4, even on conversion failure.
        if output_path and output_path.exists():
            output_path.unlink()

# ── Column 2: Model inference ─────────────────────────────────────────────────
with col2:
    st.markdown("### 🧠 Model Inference")

    # Load preprocessed mouth-crop frames + ground-truth alignment tokens.
    video_tensor, annotations = load_data(tf.convert_to_tensor(str(file_path)))

    # ── Mouth crop GIF ────────────────────────────────────────────────────────
    st.info("Mouth crop — what the model actually sees (grayscale · normalized)")
    gif_path = None
    try:
        # Reserve a temp name; close the handle before imageio writes the GIF.
        with tempfile.NamedTemporaryFile(suffix=".gif", delete=False) as gf:
            gif_path = Path(gf.name)
        frames_np = video_tensor.numpy()
        gif_frames = []
        for frame in frames_np:
            gray = frame[:, :, 0]
            # Min-max normalize each frame to [0, 1]; epsilon guards flat frames.
            gray = (gray - gray.min()) / max(gray.max() - gray.min(), 1e-8)
            # Replicate the gray channel to RGB uint8 for GIF encoding.
            gif_frames.append((255 * np.stack([gray, gray, gray], axis=-1)).astype("uint8"))
        imageio.mimsave(str(gif_path), gif_frames, fps=10, loop=0)
        st.image(str(gif_path), width=400)
    finally:
        if gif_path and gif_path.exists():
            gif_path.unlink()

    st.divider()

    # ── Ground truth ──────────────────────────────────────────────────────────
    st.info("Ground truth label (from `.align` file)")
    ground_truth = tf.strings.reduce_join(
        num_to_char(annotations)
    ).numpy().decode('utf-8')
    st.code(ground_truth, language=None)

    st.divider()

    # ── Raw tokens ────────────────────────────────────────────────────────────
    st.info("Raw CTC token indices from model output")
    yhat = model.predict(tf.expand_dims(video_tensor, axis=0), verbose=0)
    # input_length=[75]: fixed frame count per clip; greedy = best-path decode.
    decoded = tf.keras.backend.ctc_decode(yhat, input_length=[75], greedy=True)[0][0].numpy()
    st.code(str(decoded[0].tolist()), language=None)

    st.divider()

    # ── Final prediction ──────────────────────────────────────────────────────
    # NOTE(review): ctc_decode pads its output with -1; this assumes
    # num_to_char maps those to blanks that .strip() removes — confirm
    # against utils.num_to_char.
    prediction = tf.strings.reduce_join(
        num_to_char(decoded[0])
    ).numpy().decode('utf-8').strip()

    st.success(f"**Prediction:** {prediction}")

    # ── Confidence ────────────────────────────────────────────────────────────
    # Mean over timesteps of the max softmax probability, as a rough %.
    confidence = float(np.mean(np.max(yhat[0], axis=-1)) * 100)
    st.markdown(
        f"<p style='font-family:Space Mono,monospace;font-size:0.78rem;color:#4b5563;'>"
        f"AVG CONFIDENCE · <span style='color:#34d399'>{confidence:.1f}%</span></p>",
        unsafe_allow_html=True,
    )