import logging
import os
import subprocess
import tempfile

import cv2
import gradio as gr
import imageio
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import (
    Activation,
    Bidirectional,
    Conv3D,
    Dense,
    Dropout,
    LSTM,
    MaxPool3D,
    Reshape,
)
from tensorflow.keras.models import Sequential

logger = logging.getLogger(__name__)

# Number of frames the network consumes per clip.
NUM_FRAMES = 75
# Mouth crop applied to every decoded frame: rows 190..235, columns 80..219,
# which yields the 46 x 140 input the first Conv3D layer is declared with.
CROP_TOP, CROP_BOTTOM = 190, 236
CROP_LEFT, CROP_RIGHT = 80, 220
CROP_HEIGHT = CROP_BOTTOM - CROP_TOP
CROP_WIDTH = CROP_RIGHT - CROP_LEFT

# ── Vocabulary ────────────────────────────────────────────────────────────────
vocab = list("abcdefghijklmnopqrstuvwxyz'?!123456789 ")
char_to_num = tf.keras.layers.StringLookup(vocabulary=vocab, oov_token="")
num_to_char = tf.keras.layers.StringLookup(
    vocabulary=char_to_num.get_vocabulary(), oov_token="", invert=True
)


# ── Build & Load Model ────────────────────────────────────────────────────────
def build_model():
    """Build the Conv3D -> BiLSTM x 2 -> softmax network.

    Returns:
        An uncompiled ``Sequential`` model taking input of shape
        ``(NUM_FRAMES, CROP_HEIGHT, CROP_WIDTH, 1)`` and emitting, per frame,
        a softmax over ``char_to_num.vocabulary_size() + 1`` classes.
    """
    m = Sequential()
    m.add(Conv3D(128, 3,
                 input_shape=(NUM_FRAMES, CROP_HEIGHT, CROP_WIDTH, 1),
                 padding='same'))
    m.add(Activation('relu'))
    m.add(MaxPool3D((1, 2, 2)))

    m.add(Conv3D(256, 3, padding='same'))
    m.add(Activation('relu'))
    m.add(MaxPool3D((1, 2, 2)))

    m.add(Conv3D(75, 3, padding='same'))
    m.add(Activation('relu'))
    m.add(MaxPool3D((1, 2, 2)))

    # Three (1, 2, 2) poolings shrink 46 x 140 to 5 x 17; the last conv has
    # 75 filters, so each time step flattens to 5 * 17 * 75 features.
    m.add(Reshape((NUM_FRAMES, 5 * 17 * 75)))

    m.add(Bidirectional(LSTM(128, kernel_initializer='Orthogonal',
                             return_sequences=True)))
    m.add(Dropout(0.5))
    m.add(Bidirectional(LSTM(128, kernel_initializer='Orthogonal',
                             return_sequences=True)))
    m.add(Dropout(0.5))

    m.add(Dense(char_to_num.vocabulary_size() + 1,
                kernel_initializer='he_normal', activation='softmax'))
    return m


model = build_model()
model.load_weights('checkpoint.weights.h5')


# ── Video Processing ──────────────────────────────────────────────────────────
def load_video_frames(path: str):
    """Decode a video into a normalized stack of grayscale mouth crops.

    Reads at most ``NUM_FRAMES`` frames, converts each to grayscale, crops
    the fixed mouth region, zero-pads short clips up to ``NUM_FRAMES`` and
    standardizes the whole stack with its global mean and standard deviation.

    Args:
        path: Path of the video file to read.

    Returns:
        A float32 tensor of shape
        ``(NUM_FRAMES, CROP_HEIGHT, CROP_WIDTH, 1)``.

    Raises:
        ValueError: If the video cannot be opened, yields no frames, or its
            frames are too small to contain the crop region.
    """
    cap = cv2.VideoCapture(path)
    crops = []
    try:
        if not cap.isOpened():
            raise ValueError(f"Could not open video: {path}")

        # Read until enough frames are collected or the stream ends, rather
        # than trusting the container's reported frame count.
        while len(crops) < NUM_FRAMES:
            ret, frame = cap.read()
            if not ret:
                break
            height, width = frame.shape[:2]
            if height < CROP_BOTTOM or width < CROP_RIGHT:
                raise ValueError(
                    f"Frame size {width}x{height} is too small for the mouth "
                    f"crop; need at least {CROP_RIGHT}x{CROP_BOTTOM}."
                )
            # NOTE(review): frames from OpenCV are presumably BGR while
            # rgb_to_grayscale assumes RGB. Left as-is because the checkpoint
            # was presumably trained with this same preprocessing — confirm.
            gray = tf.image.rgb_to_grayscale(tf.cast(frame, tf.float32))
            crops.append(gray[CROP_TOP:CROP_BOTTOM, CROP_LEFT:CROP_RIGHT, :])
    finally:
        cap.release()

    if not crops:
        raise ValueError(f"No frames could be read from video: {path}")

    if len(crops) < NUM_FRAMES:
        crops = crops + [tf.zeros_like(crops[0])] * (NUM_FRAMES - len(crops))

    frames_tensor = tf.stack(crops)
    mean = tf.math.reduce_mean(frames_tensor)
    # Floor the deviation so a constant clip does not divide by zero.
    std = tf.maximum(tf.math.reduce_std(frames_tensor), 1e-8)
    return (frames_tensor - mean) / std


def convert_to_mp4(input_path: str) -> str:
    """Transcode a video to H.264/AAC MP4 with ffmpeg for preview.

    Best effort: when ffmpeg is missing or fails, the temporary output file
    is removed and ``input_path`` is returned unchanged.

    Args:
        input_path: Path of the source video.

    Returns:
        Path of the converted temporary ``.mp4`` file, or ``input_path`` if
        the conversion did not succeed.
    """
    out = tempfile.NamedTemporaryFile(suffix='.mp4', delete=False)
    out.close()
    try:
        subprocess.run(
            ['ffmpeg', '-y', '-i', input_path,
             '-vcodec', 'libx264', '-acodec', 'aac', out.name],
            check=True, capture_output=True
        )
    except (subprocess.SubprocessError, OSError, ValueError) as exc:
        logger.warning("ffmpeg conversion failed for %s: %s", input_path, exc)
        # Do not leave the empty or partial output file behind.
        try:
            os.remove(out.name)
        except OSError:
            pass
        return input_path
    return out.name


def make_mouth_gif(frames_tensor) -> str:
    """Render the preprocessed frames as a looping grayscale GIF.

    Each frame is min-max rescaled independently to 0..255 and replicated
    across three channels before being written.

    Args:
        frames_tensor: Tensor of frames whose last axis holds a single
            channel, as returned by ``load_video_frames``.

    Returns:
        Path of the temporary ``.gif`` file.
    """
    gif_frames = []
    for frame in frames_tensor.numpy():
        plane = frame[:, :, 0]
        plane = plane - plane.min()
        peak = plane.max()
        if peak > 0:
            plane = plane / peak
        rgb = np.stack([plane, plane, plane], axis=-1)
        gif_frames.append((rgb * 255).astype(np.uint8))

    tmp = tempfile.NamedTemporaryFile(suffix='.gif', delete=False)
    tmp.close()
    try:
        imageio.mimsave(tmp.name, gif_frames, fps=10, loop=0)
    except Exception:
        # Remove the unusable file, then let the caller see the failure.
        try:
            os.remove(tmp.name)
        except OSError:
            pass
        raise
    return tmp.name


# ── Inference ─────────────────────────────────────────────────────────────────
def predict(video_path: str):
    """Run lip reading on an uploaded video and build the UI outputs.

    Args:
        video_path: Path of the uploaded video, or ``None`` if nothing was
            uploaded.

    Returns:
        A 5-tuple ``(mp4_path, gif_path, tokens_str, prediction,
        confidence)``. On failure the two paths are ``None``, both text
        fields carry the error message and the confidence is ``"—"``.
    """
    if video_path is None:
        return None, None, "Upload a video first.", "(no prediction)", "—"

    try:
        frames_tensor = load_video_frames(video_path)
        mp4_path = convert_to_mp4(video_path)
        gif_path = make_mouth_gif(frames_tensor)

        inp = tf.expand_dims(frames_tensor, axis=0)
        yhat = model.predict(inp, verbose=0)

        # Take the sequence length from the model output instead of
        # repeating the frame count as a literal.
        decoded_indices = tf.keras.backend.ctc_decode(
            yhat, input_length=[yhat.shape[1]], greedy=True
        )[0][0].numpy()

        tokens_str = str(decoded_indices[0].tolist())
        prediction = tf.strings.reduce_join(
            num_to_char(decoded_indices[0])
        ).numpy().decode('utf-8').strip() or "(no prediction)"

        # Mean of the per-frame top softmax probability, as a percentage.
        confidence = float(np.mean(np.max(yhat[0], axis=-1)) * 100)
        return mp4_path, gif_path, tokens_str, prediction, f"{confidence:.1f}%"

    except Exception as e:
        # UI boundary: report the failure in the interface, but keep the
        # traceback in the log.
        logger.exception("Prediction failed for %s", video_path)
        err = f"Error: {e}"
        return None, None, err, err, "—"
# ── CSS ───────────────────────────────────────────────────────────────────────
# Stylesheet handed to gr.Blocks; the class names below are the ones used by
# the gr.HTML fragments and the elem_classes of the output textboxes.
css = """
@import url('https://fonts.googleapis.com/css2?family=Syne:wght@400;600;700;800&family=Space+Mono:ital@0;1&display=swap');

body, .gradio-container {
    background: #07070f !important;
    font-family: 'Syne', sans-serif !important;
    color: #e2e2f0 !important;
}
.hero {
    text-align: center;
    padding: 2.5rem 1rem 0.5rem;
}
.hero h1 {
    font-size: 3.5rem;
    font-weight: 800;
    letter-spacing: -0.04em;
    background: linear-gradient(135deg, #f0f0ff 0%, #c084fc 40%, #818cf8 100%);
    -webkit-background-clip: text;
    -webkit-text-fill-color: transparent;
    margin: 0 0 0.3rem;
    line-height: 1;
}
.hero .sub {
    font-family: 'Space Mono', monospace;
    font-size: 0.72rem;
    color: #4b5563;
    letter-spacing: 0.18em;
    text-transform: uppercase;
}
.hero .badge {
    display: inline-block;
    margin-top: 0.7rem;
    padding: 0.25rem 0.75rem;
    border: 1px solid #2d2d4e;
    border-radius: 999px;
    font-family: 'Space Mono', monospace;
    font-size: 0.68rem;
    color: #7c7c9e;
    background: #0f0f1e;
}
.section-label {
    font-family: 'Space Mono', monospace;
    font-size: 0.68rem;
    letter-spacing: 0.15em;
    text-transform: uppercase;
    color: #4b5563;
    margin-bottom: 0.4rem;
    padding-left: 2px;
}
.divider {
    border: none;
    border-top: 1px solid #1a1a2e;
    margin: 1.2rem 0;
}
.mono-out textarea {
    font-family: 'Space Mono', monospace !important;
    font-size: 0.82rem !important;
    background: #0a0a16 !important;
    color: #a5b4fc !important;
    border: 1px solid #1e1e38 !important;
    border-radius: 10px !important;
}
.prediction-out textarea {
    font-family: 'Syne', sans-serif !important;
    font-size: 1.6rem !important;
    font-weight: 700 !important;
    background: #0a0a16 !important;
    color: #c084fc !important;
    border: 1px solid #2d1f4e !important;
    border-radius: 10px !important;
    text-align: center !important;
}
.confidence-out textarea {
    font-family: 'Space Mono', monospace !important;
    font-size: 1.1rem !important;
    background: #0a0a16 !important;
    color: #34d399 !important;
    border: 1px solid #1a3330 !important;
    border-radius: 10px !important;
    text-align: center !important;
}
button.lg {
    background: linear-gradient(135deg, #7c3aed 0%, #4f46e5 100%) !important;
    border: none !important;
    border-radius: 10px !important;
    font-family: 'Syne', sans-serif !important;
    font-weight: 700 !important;
    font-size: 1rem !important;
    letter-spacing: 0.06em !important;
    color: white !important;
}
.info-panel {
    background: #0c0c1a;
    border: 1px solid #1a1a2e;
    border-radius: 12px;
    padding: 1rem 1.2rem;
}
.info-panel p {
    font-family: 'Space Mono', monospace;
    font-size: 0.72rem;
    color: #374151;
    margin: 0;
    line-height: 2;
}
.info-panel span {
    color: #6366f1;
}
"""

# ── UI ────────────────────────────────────────────────────────────────────────
# NOTE(review): the markup inside the gr.HTML fragments below was rebuilt from
# the selectors in `css` (.hero, .sub, .badge, .section-label, .divider,
# .info-panel); the visible text is unchanged. Confirm the element choices
# against the intended design.
with gr.Blocks(css=css, title="LipNet — Silent Speech Recognition") as demo:

    gr.HTML("""
    <div class="hero">
        <h1>LipNet</h1>
        <div class="sub">Silent Speech Recognition · No Audio Required</div>
        <div class="badge">Conv3D → BiLSTM × 2 → CTC Decode · GRID Corpus S1</div>
    </div>
    """)

    # ── Row 1: Upload + Preview ───────────────────────────────────────────────
    with gr.Row(equal_height=True):
        with gr.Column(scale=1):
            gr.HTML('<div class="section-label">'
                    '① Upload Video (.mpg / .mp4)</div>')
            video_input = gr.Video(label="", height=260, sources=["upload"])
            submit_btn = gr.Button("▶ READ LIPS", variant="primary", size="lg")
        with gr.Column(scale=1):
            gr.HTML('<div class="section-label">'
                    '② Converted Preview (mp4)</div>')
            video_preview = gr.Video(label="", height=260, interactive=False)

    gr.HTML('<hr class="divider">')

    # ── Row 2: Mouth GIF + Tokens ─────────────────────────────────────────────
    with gr.Row(equal_height=True):
        with gr.Column(scale=1):
            gr.HTML('<div class="section-label">③ What the Model Sees — '
                    'mouth crop · grayscale · normalized</div>')
            gif_preview = gr.Image(label="", height=200, type="filepath")
        with gr.Column(scale=1):
            gr.HTML('<div class="section-label">④ Raw CTC Token Indices</div>')
            tokens_out = gr.Textbox(
                label="", lines=5, interactive=False,
                placeholder="Token indices will appear here...",
                elem_classes=["mono-out"]
            )

    gr.HTML('<hr class="divider">')

    # ── Row 3: Prediction + Confidence ────────────────────────────────────────
    with gr.Row():
        with gr.Column(scale=3):
            gr.HTML('<div class="section-label">⑤ Predicted Text</div>')
            prediction_out = gr.Textbox(
                label="", lines=2, interactive=False,
                placeholder="Prediction will appear here...",
                elem_classes=["prediction-out"]
            )
        with gr.Column(scale=1):
            gr.HTML('<div class="section-label">⑥ Avg Confidence</div>')
            confidence_out = gr.Textbox(
                label="", lines=2, interactive=False,
                placeholder="—",
                elem_classes=["confidence-out"]
            )

    gr.HTML('<hr class="divider">')

    gr.HTML("""
    <div class="info-panel">
        <p>
            <span>ARCHITECTURE</span> · Conv3D(128) → Conv3D(256) → Conv3D(75) → Reshape → BiLSTM(128)×2 → Dense(41) → CTC<br>
            <span>INPUT</span> · 75 frames · mouth crop 46×140 px · grayscale · z-score normalized<br>
            <span>VOCAB</span> · 40 chars — a–z, 1–9, ' ? ! (space) · output dim = 41 (+ CTC blank token)<br>
            <span>DATASET</span> · GRID Corpus Speaker S1 · 500 videos · 450 train / 50 test<br>
            <span>NOTE</span> · Upload frontal-face .mpg or .mp4 videos for best results
        </p>
    </div>
    """)

    # One click runs the whole pipeline and fills all five output widgets.
    submit_btn.click(
        fn=predict,
        inputs=[video_input],
        outputs=[video_preview, gif_preview, tokens_out,
                 prediction_out, confidence_out]
    )

demo.launch()