from __future__ import annotations

from pathlib import Path
import subprocess
import tempfile

import imageio
import numpy as np
import streamlit as st
import tensorflow as tf

from modelutil import load_model
from utils import load_data, num_to_char

# ── Page config ───────────────────────────────────────────────────────────────
st.set_page_config(
    page_title="LipNet - Silent Speech Recognition",
    page_icon="👄",
    layout="wide",
)

# ── Custom CSS ────────────────────────────────────────────────────────────────
# NOTE(review): this and the other `unsafe_allow_html=True` strings below look
# like they once carried HTML/CSS markup that is no longer present in the
# source; only the visible text is kept here — confirm against the original.
st.markdown(""" """, unsafe_allow_html=True)

# ── Sidebar ───────────────────────────────────────────────────────────────────
with st.sidebar:
    st.markdown("## 👄 LipNet")
    st.markdown(
        "\n\nSILENT SPEECH RECOGNITION\n\n",
        unsafe_allow_html=True,
    )
    st.divider()
    st.markdown("**Architecture**")
    st.markdown("""

Conv3D(128) ↓
Conv3D(256) ↓
Conv3D(75) ↓
Reshape ↓
BiLSTM(128) ↓
BiLSTM(128) ↓
Dense(41) + CTC

""", unsafe_allow_html=True)
    st.divider()
    st.markdown("**Dataset**")
    st.markdown(
        "\n\nGRID Corpus · Speaker S1\n500 videos\n"
        "450 train / 50 test\nVocab: a–z 1–9 ' ? ! (space)\n\n",
        unsafe_allow_html=True,
    )
    st.divider()
    st.caption("No audio. Lips only.")

# ── Title ─────────────────────────────────────────────────────────────────────
st.title("LipNet - Silent Speech Recognition")
st.markdown(
    "\n\nCONV3D + BILSTM + CTC · NO AUDIO REQUIRED\n\n",
    unsafe_allow_html=True,
)
st.divider()

# ── Data paths ────────────────────────────────────────────────────────────────
BASE_DIR = Path(__file__).resolve().parent
DATA_DIR = BASE_DIR / 'data' / 's1'

options = sorted(item.name for item in DATA_DIR.glob('*.mpg'))
if not options:
    st.error(f"No `.mpg` videos found in `{DATA_DIR}`. Make sure `data/s1/` is populated.")
    st.stop()

selected_video = st.selectbox("**Choose a video**", options)
file_path = DATA_DIR / selected_video
st.divider()


# ── Load model (cached) ───────────────────────────────────────────────────────
@st.cache_resource(show_spinner="Loading LipNet model...")
def get_model():
    """Return the object produced by ``load_model()``.

    Wrapped in ``st.cache_resource`` so the model is built once and reused
    across Streamlit reruns instead of being reloaded on every interaction.
    """
    return load_model()


model = get_model()


# ── Load frames + alignment (cached per video) ────────────────────────────────
@st.cache_data(show_spinner="Processing video...")
def get_video_data(path: str):
    """Load one video and its label via ``load_data``.

    Args:
        path: Path of the video, as a string (used as the cache key).

    Returns:
        A ``(video_tensor, annotations, ground_truth)`` tuple: the two values
        returned by ``load_data`` plus the annotations mapped through
        ``num_to_char``, joined and decoded as a UTF-8 string.
    """
    video_tensor, annotations = load_data(tf.convert_to_tensor(path))
    label_chars = num_to_char(annotations)
    ground_truth = tf.strings.reduce_join(label_chars).numpy().decode('utf-8')
    return video_tensor, annotations, ground_truth


video_tensor, annotations, ground_truth = get_video_data(str(file_path))


def _transcode_to_mp4(source: Path) -> bytes:
    """Re-encode ``source`` as H.264 mp4 with ffmpeg and return the file bytes.

    Raises:
        FileNotFoundError: if the ``ffmpeg`` executable cannot be started.
        subprocess.CalledProcessError: if ffmpeg exits with a non-zero status.
    """
    target = None
    try:
        # Only reserve a unique name here; the handle is closed before ffmpeg
        # writes to the path.
        with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as handle:
            target = Path(handle.name)
        subprocess.run(
            [
                "ffmpeg",
                "-y",  # global option: placed first so it is not a trailing option
                "-i", str(source),
                "-vcodec", "libx264",
                "-crf", "23",
                str(target),
            ],
            check=True,
            capture_output=True,
            text=True,
        )
        return target.read_bytes()
    finally:
        # Always remove the temp file, whether or not the conversion succeeded.
        if target is not None and target.exists():
            target.unlink()


# ── Two-column layout ─────────────────────────────────────────────────────────
col1, col2 = st.columns(2, gap="large")

# ── Column 1: Video preview + Ground truth ────────────────────────────────────
with col1:
    st.markdown("### 📹 Original Video")
    st.info("Video converted to mp4 for browser playback")
    try:
        mp4_bytes = _transcode_to_mp4(file_path)
    except FileNotFoundError:
        # Raised by subprocess when the ffmpeg binary is not installed / on PATH.
        st.error("ffmpeg executable not found. Install ffmpeg and make sure it is on PATH.")
    except subprocess.CalledProcessError as exc:
        st.error("ffmpeg conversion failed.")
        st.code(exc.stderr or "No error output.")
    else:
        st.video(mp4_bytes)

# ── Column 2: Model inference
# ─────────────────────────────────────────────────
with col2:
    st.markdown("### 🧠 Model Inference")

    # ── Mouth crop GIF ────────────────────────────────────────────────────────
    st.info("Mouth crop - what the model actually sees (grayscale · normalized)")
    gif_path = None
    try:
        # Reserve a temp file name; imageio writes to the path after the
        # handle is closed.
        with tempfile.NamedTemporaryFile(suffix=".gif", delete=False) as gf:
            gif_path = Path(gf.name)

        gif_frames = []
        for frame in video_tensor.numpy():
            gray = frame[:, :, 0]
            low = gray.min()
            # Per-frame min-max rescale to [0, 1]; the floor avoids dividing
            # by zero on a constant frame.
            scaled = (gray - low) / max(gray.max() - low, 1e-8)
            # Replicate the single channel three times to get an RGB frame.
            rgb = (255 * np.stack([scaled, scaled, scaled], axis=-1)).astype("uint8")
            gif_frames.append(rgb)

        imageio.mimsave(str(gif_path), gif_frames, fps=10, loop=0)
        st.image(str(gif_path), width=400)
    finally:
        if gif_path and gif_path.exists():
            gif_path.unlink()

    st.divider()

    # ── Raw tokens ────────────────────────────────────────────────────────────
    st.info("Raw CTC token indices from model output")
    yhat = model.predict(tf.expand_dims(video_tensor, axis=0), verbose=0)
    # One sequence length per batch item, taken from the prediction itself
    # rather than a hard-coded frame count.
    sequence_lengths = np.full(yhat.shape[0], yhat.shape[1])
    decoded = tf.keras.backend.ctc_decode(
        yhat, input_length=sequence_lengths, greedy=True
    )[0][0].numpy()
    st.code(str(decoded[0].tolist()), language=None)

    # ── Ground truth (moved here) ─────────────────────────────────────────────
    st.divider()
    st.info("Ground truth label (from `.align` file)")
    st.code(ground_truth, language=None)
    st.divider()

    # ── Final prediction ──────────────────────────────────────────────────────
    prediction = tf.strings.reduce_join(
        num_to_char(decoded[0])
    ).numpy().decode('utf-8').strip()
    st.success(f"**Prediction:** {prediction}")

    # ── Confidence ────────────────────────────────────────────────────────────
    # Mean over timesteps of the largest value along the last axis of yhat,
    # shown as a percentage (presumably per-step class probabilities — confirm
    # the model's final activation).
    confidence = float(np.mean(np.max(yhat[0], axis=-1)) * 100)
    # NOTE(review): this string looks like it once carried HTML markup that is
    # no longer present in the source; only the visible text is kept.
    st.markdown(
        f"\n\n"
        f"AVG CONFIDENCE · {confidence:.1f}%\n\n",
        unsafe_allow_html=True,
    )