import os
import subprocess
import tempfile

import cv2
import imageio
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (Conv3D, LSTM, Dense, Dropout, Bidirectional,
                                     MaxPool3D, Activation, Reshape)
import gradio as gr

# ── Vocabulary ────────────────────────────────────────────────────────────────
# a-z, apostrophe, ?, !, digits 1-9 and space. StringLookup reserves an OOV
# slot, and the Dense head below adds one extra unit for the CTC blank token.
vocab = list("abcdefghijklmnopqrstuvwxyz'?!123456789 ")
char_to_num = tf.keras.layers.StringLookup(vocabulary=vocab, oov_token="")
num_to_char = tf.keras.layers.StringLookup(
    vocabulary=char_to_num.get_vocabulary(), oov_token="", invert=True
)


# ── Build & Load Model ────────────────────────────────────────────────────────
def build_model():
    """Return the LipNet-style Conv3D + BiLSTM model (uncompiled).

    Input shape: (75 frames, 46 px, 140 px, 1 grayscale channel).
    Output: per-timestep softmax over vocabulary_size() + 1 classes
    (the extra class is the CTC blank).
    """
    m = Sequential()
    m.add(Conv3D(128, 3, input_shape=(75, 46, 140, 1), padding='same'))
    m.add(Activation('relu'))
    m.add(MaxPool3D((1, 2, 2)))
    m.add(Conv3D(256, 3, padding='same'))
    m.add(Activation('relu'))
    m.add(MaxPool3D((1, 2, 2)))
    m.add(Conv3D(75, 3, padding='same'))
    m.add(Activation('relu'))
    m.add(MaxPool3D((1, 2, 2)))
    # Three (1, 2, 2) pools shrink the 46x140 crop to 5x17; flatten the
    # spatial dims per frame so the BiLSTMs see a (75, 6375) sequence.
    m.add(Reshape((75, 5 * 17 * 75)))
    m.add(Bidirectional(LSTM(128, kernel_initializer='Orthogonal',
                             return_sequences=True)))
    m.add(Dropout(0.5))
    m.add(Bidirectional(LSTM(128, kernel_initializer='Orthogonal',
                             return_sequences=True)))
    m.add(Dropout(0.5))
    m.add(Dense(char_to_num.vocabulary_size() + 1,
                kernel_initializer='he_normal', activation='softmax'))
    return m


model = build_model()
model.load_weights('checkpoint.weights.h5')


# ── Video Processing ──────────────────────────────────────────────────────────
def load_video_frames(path: str):
    """Read a video, crop the mouth region, and return a normalized tensor.

    Returns a float32 tensor of shape (75, 46, 140, 1), z-score normalized.
    Clips shorter than 75 frames are zero-padded; longer ones are truncated.

    Raises:
        ValueError: if no frames can be decoded from ``path``.
    """
    cap = cv2.VideoCapture(path)
    processed_frames = []
    try:
        for _ in range(int(cap.get(cv2.CAP_PROP_FRAME_COUNT))):
            ret, frame = cap.read()
            if not ret:
                break
            # NOTE(review): OpenCV decodes frames as BGR while
            # rgb_to_grayscale assumes RGB channel order. Kept as-is on the
            # assumption the training pipeline preprocessed data the same
            # way — confirm before "fixing" the channel order.
            gray = tf.image.rgb_to_grayscale(tf.cast(frame, tf.float32))
            # Fixed mouth crop (rows 190:236, cols 80:220) → 46x140.
            processed_frames.append(gray[190:236, 80:220, :])
    finally:
        # Release the capture even if decoding raises mid-loop.
        cap.release()

    if not processed_frames:
        # Previously crashed with IndexError on processed_frames[0];
        # fail with an explicit message instead (surfaced by predict()).
        raise ValueError(f"Could not decode any frames from video: {path}")

    target = 75
    if len(processed_frames) < target:
        pad = [tf.zeros_like(processed_frames[0])] * (target - len(processed_frames))
        processed_frames = processed_frames + pad
    else:
        processed_frames = processed_frames[:target]

    frames_tensor = tf.stack(processed_frames)
    mean = tf.math.reduce_mean(frames_tensor)
    # Floor the std at 1e-8 so an all-constant clip cannot divide by zero.
    std = tf.maximum(tf.math.reduce_std(tf.cast(frames_tensor, tf.float32)), 1e-8)
    return tf.cast((frames_tensor - mean), tf.float32) / std


def convert_to_mp4(input_path: str) -> str:
    """Transcode ``input_path`` to a browser-friendly H.264/AAC MP4.

    Returns the path of the converted file, or the original path when
    ffmpeg is missing or fails (best-effort fallback, not an error).
    """
    out = tempfile.NamedTemporaryFile(suffix='.mp4', delete=False)
    out.close()
    try:
        subprocess.run(
            ['ffmpeg', '-y', '-i', input_path,
             '-vcodec', 'libx264', '-acodec', 'aac', out.name],
            check=True, capture_output=True
        )
        return out.name
    except (subprocess.CalledProcessError, FileNotFoundError, OSError):
        # ffmpeg absent or conversion failed: remove the now-unused temp
        # file (previously leaked) and fall back to the original video.
        try:
            os.remove(out.name)
        except OSError:
            pass
        return input_path


def make_mouth_gif(frames_tensor) -> str:
    """Render the normalized mouth-crop frames as an animated GIF.

    Each frame is min-max rescaled to [0, 255] independently so the
    z-scored values become visible grayscale. Returns the GIF file path.
    """
    frames_np = frames_tensor.numpy()
    gif_frames = []
    for f in frames_np:
        g = f[:, :, 0]
        g = g - g.min()
        rng = g.max()
        if rng > 0:
            g = g / rng
        rgb = np.stack([g, g, g], axis=-1)
        gif_frames.append((rgb * 255).astype(np.uint8))
    tmp = tempfile.NamedTemporaryFile(suffix='.gif', delete=False)
    tmp.close()
    imageio.mimsave(tmp.name, gif_frames, fps=10, loop=0)
    return tmp.name


# ── Inference ─────────────────────────────────────────────────────────────────
def predict(video_path: str):
    """Run lip-reading on an uploaded video.

    Returns a 5-tuple for the Gradio outputs:
    (playable mp4 path, mouth-crop gif path, raw token indices string,
    decoded text, confidence string). On failure the paths are None and
    the error message fills the text slots.
    """
    if video_path is None:
        return None, None, "Upload a video first.", "(no prediction)", "—"
    try:
        frames_tensor = load_video_frames(video_path)
        mp4_path = convert_to_mp4(video_path)
        gif_path = make_mouth_gif(frames_tensor)
        inp = tf.expand_dims(frames_tensor, axis=0)
        yhat = model.predict(inp, verbose=0)
        # Greedy CTC decode over the full 75-step sequence.
        decoded_indices = tf.keras.backend.ctc_decode(
            yhat, input_length=[75], greedy=True
        )[0][0].numpy()
        tokens_str = str(decoded_indices[0].tolist())
        prediction = tf.strings.reduce_join(
            num_to_char(decoded_indices[0])
        ).numpy().decode('utf-8').strip() or "(no prediction)"
        # Mean of the per-timestep max softmax — a rough confidence proxy,
        # not a calibrated probability.
        confidence = float(np.mean(np.max(yhat[0], axis=-1)) * 100)
        return mp4_path, gif_path, tokens_str, prediction, f"{confidence:.1f}%"
    except Exception as e:
        # Top-level UI boundary: surface the error in the output widgets.
        err = f"Error: {str(e)}"
        return None, None, err, err, "—"


# ── CSS
# ──────────────────────────────────────────────────────────────────────
# Stylesheet injected into gr.Blocks below; dark theme with Syne /
# Space Mono webfonts. Runtime string — edit with care.
css = """
@import url('https://fonts.googleapis.com/css2?family=Syne:wght@400;600;700;800&family=Space+Mono:ital@0;1&display=swap');
body, .gradio-container { background: #07070f !important; font-family: 'Syne', sans-serif !important; color: #e2e2f0 !important; }
.hero { text-align: center; padding: 2.5rem 1rem 0.5rem; }
.hero h1 { font-size: 3.5rem; font-weight: 800; letter-spacing: -0.04em; background: linear-gradient(135deg, #f0f0ff 0%, #c084fc 40%, #818cf8 100%); -webkit-background-clip: text; -webkit-text-fill-color: transparent; margin: 0 0 0.3rem; line-height: 1; }
.hero .sub { font-family: 'Space Mono', monospace; font-size: 0.72rem; color: #4b5563; letter-spacing: 0.18em; text-transform: uppercase; }
.hero .badge { display: inline-block; margin-top: 0.7rem; padding: 0.25rem 0.75rem; border: 1px solid #2d2d4e; border-radius: 999px; font-family: 'Space Mono', monospace; font-size: 0.68rem; color: #7c7c9e; background: #0f0f1e; }
.section-label { font-family: 'Space Mono', monospace; font-size: 0.68rem; letter-spacing: 0.15em; text-transform: uppercase; color: #4b5563; margin-bottom: 0.4rem; padding-left: 2px; }
.divider { border: none; border-top: 1px solid #1a1a2e; margin: 1.2rem 0; }
.mono-out textarea { font-family: 'Space Mono', monospace !important; font-size: 0.82rem !important; background: #0a0a16 !important; color: #a5b4fc !important; border: 1px solid #1e1e38 !important; border-radius: 10px !important; }
.prediction-out textarea { font-family: 'Syne', sans-serif !important; font-size: 1.6rem !important; font-weight: 700 !important; background: #0a0a16 !important; color: #c084fc !important; border: 1px solid #2d1f4e !important; border-radius: 10px !important; text-align: center !important; }
.confidence-out textarea { font-family: 'Space Mono', monospace !important; font-size: 1.1rem !important; background: #0a0a16 !important; color: #34d399 !important; border: 1px solid #1a3330 !important; border-radius: 10px !important; text-align: center !important; }
button.lg { background: linear-gradient(135deg, #7c3aed 0%, #4f46e5 100%) !important; border: none !important; border-radius: 10px !important; font-family: 'Syne', sans-serif !important; font-weight: 700 !important; font-size: 1rem !important; letter-spacing: 0.06em !important; color: white !important; }
.info-panel { background: #0c0c1a; border: 1px solid #1a1a2e; border-radius: 12px; padding: 1rem 1.2rem; }
.info-panel p { font-family: 'Space Mono', monospace; font-size: 0.72rem; color: #374151; margin: 0; line-height: 2; }
.info-panel span { color: #6366f1; }
"""

# ── UI ────────────────────────────────────────────────────────────────────────
with gr.Blocks(css=css, title="LipNet — Silent Speech Recognition") as demo:
    # Hero header markup; the HTML string continues beyond this block.
    gr.HTML("""
Silent Speech Recognition · No Audio Required
Conv3D → BiLSTM × 2 → CTC Decode · GRID Corpus S1
ARCHITECTURE · Conv3D(128) → Conv3D(256) → Conv3D(75) → Reshape → BiLSTM(128)×2 → Dense(41) → CTC
INPUT · 75 frames · mouth crop 46×140 px · grayscale · z-score normalized
VOCAB · 40 chars — a–z, 1–9, ' ? ! (space) · output dim = 41 (+ CTC blank token)
DATASET · GRID Corpus Speaker S1 · 500 videos · 450 train / 50 test
NOTE · Upload frontal-face .mpg or .mp4 videos for best results