Spaces:
Build error
Build error
| import os | |
| import cv2 | |
| import tempfile | |
| import subprocess | |
| import numpy as np | |
| import imageio | |
| import tensorflow as tf | |
| from tensorflow.keras.models import Sequential | |
| from tensorflow.keras.layers import (Conv3D, LSTM, Dense, Dropout, | |
| Bidirectional, MaxPool3D, Activation, Reshape) | |
| import gradio as gr | |
# ── Vocabulary ────────────────────────────────────────────────────────────────
# Characters the recognizer can emit: lowercase letters, apostrophe, '?', '!',
# the digits 1-9 and the space character.
vocab = list("abcdefghijklmnopqrstuvwxyz'?!123456789 ")

# Forward table: character -> integer id, with the empty string as OOV token.
char_to_num = tf.keras.layers.StringLookup(vocabulary=vocab, oov_token="")

# Reverse table: integer id -> character. It is built from the forward table's
# own vocabulary so that both directions agree on the id assignment.
num_to_char = tf.keras.layers.StringLookup(
    vocabulary=char_to_num.get_vocabulary(),
    oov_token="",
    invert=True,
)
# ── Build & Load Model ────────────────────────────────────────────────────────
def build_model():
    """Assemble the (untrained) lip-reading network.

    The stack is three Conv3D + ReLU + spatial max-pool stages, a reshape that
    flattens each time step's feature map, two bidirectional LSTMs with
    dropout, and a per-time-step softmax classifier.

    Returns:
        A Keras ``Sequential`` model taking input of shape (75, 46, 140, 1)
        and producing, per time step, ``char_to_num.vocabulary_size() + 1``
        class probabilities.
    """
    layers = [
        # Spatio-temporal feature extractor. Pooling is (1, 2, 2), so only the
        # two spatial axes are halved and all 75 time steps are kept.
        Conv3D(128, 3, input_shape=(75, 46, 140, 1), padding='same'),
        Activation('relu'),
        MaxPool3D((1, 2, 2)),
        Conv3D(256, 3, padding='same'),
        Activation('relu'),
        MaxPool3D((1, 2, 2)),
        Conv3D(75, 3, padding='same'),
        Activation('relu'),
        MaxPool3D((1, 2, 2)),
        # After three halvings 46x140 becomes 5x17; flatten 5*17*75 features
        # per time step for the recurrent layers.
        Reshape((75, 5 * 17 * 75)),
        Bidirectional(LSTM(128, kernel_initializer='Orthogonal', return_sequences=True)),
        Dropout(0.5),
        Bidirectional(LSTM(128, kernel_initializer='Orthogonal', return_sequences=True)),
        Dropout(0.5),
        # One extra output class on top of the lookup vocabulary.
        Dense(char_to_num.vocabulary_size() + 1,
              kernel_initializer='he_normal', activation='softmax'),
    ]

    network = Sequential()
    for layer in layers:
        network.add(layer)
    return network
# Build the network once at import time and restore its trained weights from
# the checkpoint file expected in the current working directory.
model = build_model()
model.load_weights('checkpoint.weights.h5')
# ── Video Processing ──────────────────────────────────────────────────────────
def load_video_frames(path: str):
    """Decode a video into a normalized stack of 75 grayscale mouth crops.

    Frames are read until 75 have been collected or the stream ends. Each
    frame is converted to grayscale and cropped to rows 190:236 and columns
    80:220. Short clips are padded with all-zero frames. The stacked result
    is standardized with the global mean and standard deviation.

    Args:
        path: Filesystem path of the video to decode.

    Returns:
        A float32 tensor holding the 75 standardized crops.

    Raises:
        ValueError: If no frame could be decoded from ``path``.
    """
    target = 75
    cap = cv2.VideoCapture(path)
    processed_frames = []
    try:
        # Read until we have enough frames instead of trusting
        # CAP_PROP_FRAME_COUNT: container metadata can be missing or wrong,
        # and frames beyond ``target`` would be thrown away anyway.
        while len(processed_frames) < target:
            ret, frame = cap.read()
            if not ret:
                break
            # NOTE(review): OpenCV yields BGR frames while rgb_to_grayscale
            # weights channels as RGB. Left unchanged because the checkpoint
            # was presumably trained with this exact preprocessing - confirm.
            gray = tf.image.rgb_to_grayscale(tf.cast(frame, tf.float32))
            processed_frames.append(gray[190:236, 80:220, :])
    finally:
        # Always free the decoder, even if a frame fails to process.
        cap.release()

    if not processed_frames:
        raise ValueError(f"No frames could be decoded from video: {path}")

    missing = target - len(processed_frames)
    if missing > 0:
        # Pad short clips with blank frames shaped like the real crops.
        processed_frames.extend([tf.zeros_like(processed_frames[0])] * missing)

    frames_tensor = tf.stack(processed_frames)
    mean = tf.math.reduce_mean(frames_tensor)
    # Floor the deviation so a constant clip does not divide by zero.
    std = tf.maximum(tf.math.reduce_std(frames_tensor), 1e-8)
    return (frames_tensor - mean) / std
def convert_to_mp4(input_path: str) -> str:
    """Transcode a video to H.264/AAC MP4 with ffmpeg, best effort.

    Args:
        input_path: Path of the video to convert.

    Returns:
        Path of a newly created temporary ``.mp4`` file on success. If the
        conversion fails for any reason (ffmpeg missing, unreadable input,
        non-zero exit status, ...) ``input_path`` is returned unchanged.
    """
    # Reserve a unique output name; close the handle so ffmpeg can write to it.
    out = tempfile.NamedTemporaryFile(suffix='.mp4', delete=False)
    out.close()
    try:
        subprocess.run(
            ['ffmpeg', '-y', '-i', input_path, '-vcodec', 'libx264', '-acodec', 'aac', out.name],
            check=True, capture_output=True
        )
    except Exception:
        # Deliberate best-effort fallback: the caller still gets a usable
        # path. Remove the placeholder file so failed conversions do not pile
        # up in the temp directory.
        try:
            os.remove(out.name)
        except OSError:
            pass
        return input_path
    return out.name
def make_mouth_gif(frames_tensor) -> str:
    """Render the model's input frames as a looping GIF.

    Args:
        frames_tensor: Tensor exposing ``.numpy()``; iterating the resulting
            array yields frames indexed as ``frame[:, :, 0]``.

    Returns:
        Path of a temporary ``.gif`` file (not deleted automatically).
    """
    def to_rgb_uint8(frame):
        # Stretch each frame independently to [0, 1]; a flat frame (zero
        # range) is left at zero instead of being divided.
        plane = frame[:, :, 0]
        plane = plane - plane.min()
        peak = plane.max()
        if peak > 0:
            plane = plane / peak
        # Replicate the single channel three times to get an RGB image.
        stacked = np.stack([plane, plane, plane], axis=-1)
        return (stacked * 255).astype(np.uint8)

    gif_frames = [to_rgb_uint8(frame) for frame in frames_tensor.numpy()]

    # Only the unique name is needed; the handle is closed on leaving the
    # ``with`` block and the file survives because delete=False.
    with tempfile.NamedTemporaryFile(suffix='.gif', delete=False) as handle:
        gif_path = handle.name
    imageio.mimsave(gif_path, gif_frames, fps=10, loop=0)
    return gif_path
# ── Inference ─────────────────────────────────────────────────────────────────
def predict(video_path: str):
    """Run lip-reading inference on an uploaded video for the UI.

    Args:
        video_path: Path of the uploaded video, or ``None``/empty when nothing
            was uploaded.

    Returns:
        A 5-tuple ``(mp4_path, gif_path, tokens, prediction, confidence)``:
        the converted preview video path, the mouth-crop GIF path, the decoded
        token indices as a string, the decoded text, and the mean per-step
        maximum probability formatted as a percentage. When there is no input
        or processing fails, both paths are ``None`` and the text fields carry
        a hint or the error message.
    """
    # Treat an empty path like a missing upload, not only ``None``.
    if not video_path:
        return None, None, "Upload a video first.", "(no prediction)", "—"
    try:
        frames_tensor = load_video_frames(video_path)
        mp4_path = convert_to_mp4(video_path)
        gif_path = make_mouth_gif(frames_tensor)

        # Add the batch dimension expected by the model.
        inp = tf.expand_dims(frames_tensor, axis=0)
        yhat = model.predict(inp, verbose=0)

        # Take the sequence length from the prediction itself so it cannot
        # drift from the model's time dimension.
        decoded_indices = tf.keras.backend.ctc_decode(
            yhat, input_length=[yhat.shape[1]], greedy=True
        )[0][0].numpy()

        tokens_str = str(decoded_indices[0].tolist())
        prediction = tf.strings.reduce_join(
            num_to_char(decoded_indices[0])
        ).numpy().decode('utf-8').strip() or "(no prediction)"

        # Mean over time of the highest class probability at each step.
        confidence = float(np.mean(np.max(yhat[0], axis=-1)) * 100)
        return mp4_path, gif_path, tokens_str, prediction, f"{confidence:.1f}%"
    except Exception as e:
        # UI boundary: surface the failure in the text outputs instead of
        # letting the callback raise.
        err = f"Error: {str(e)}"
        return None, None, err, err, "—"
# ── CSS ───────────────────────────────────────────────────────────────────────
# Custom stylesheet handed to gr.Blocks: dark theme, hero header, section
# labels, and the styled output boxes referenced through ``elem_classes``
# (mono-out, prediction-out, confidence-out) plus the info panel.
css = """
@import url('https://fonts.googleapis.com/css2?family=Syne:wght@400;600;700;800&family=Space+Mono:ital@0;1&display=swap');
body, .gradio-container { background: #07070f !important; font-family: 'Syne', sans-serif !important; color: #e2e2f0 !important; }
.hero { text-align: center; padding: 2.5rem 1rem 0.5rem; }
.hero h1 { font-size: 3.5rem; font-weight: 800; letter-spacing: -0.04em; background: linear-gradient(135deg, #f0f0ff 0%, #c084fc 40%, #818cf8 100%); -webkit-background-clip: text; -webkit-text-fill-color: transparent; margin: 0 0 0.3rem; line-height: 1; }
.hero .sub { font-family: 'Space Mono', monospace; font-size: 0.72rem; color: #4b5563; letter-spacing: 0.18em; text-transform: uppercase; }
.hero .badge { display: inline-block; margin-top: 0.7rem; padding: 0.25rem 0.75rem; border: 1px solid #2d2d4e; border-radius: 999px; font-family: 'Space Mono', monospace; font-size: 0.68rem; color: #7c7c9e; background: #0f0f1e; }
.section-label { font-family: 'Space Mono', monospace; font-size: 0.68rem; letter-spacing: 0.15em; text-transform: uppercase; color: #4b5563; margin-bottom: 0.4rem; padding-left: 2px; }
.divider { border: none; border-top: 1px solid #1a1a2e; margin: 1.2rem 0; }
.mono-out textarea { font-family: 'Space Mono', monospace !important; font-size: 0.82rem !important; background: #0a0a16 !important; color: #a5b4fc !important; border: 1px solid #1e1e38 !important; border-radius: 10px !important; }
.prediction-out textarea { font-family: 'Syne', sans-serif !important; font-size: 1.6rem !important; font-weight: 700 !important; background: #0a0a16 !important; color: #c084fc !important; border: 1px solid #2d1f4e !important; border-radius: 10px !important; text-align: center !important; }
.confidence-out textarea { font-family: 'Space Mono', monospace !important; font-size: 1.1rem !important; background: #0a0a16 !important; color: #34d399 !important; border: 1px solid #1a3330 !important; border-radius: 10px !important; text-align: center !important; }
button.lg { background: linear-gradient(135deg, #7c3aed 0%, #4f46e5 100%) !important; border: none !important; border-radius: 10px !important; font-family: 'Syne', sans-serif !important; font-weight: 700 !important; font-size: 1rem !important; letter-spacing: 0.06em !important; color: white !important; }
.info-panel { background: #0c0c1a; border: 1px solid #1a1a2e; border-radius: 12px; padding: 1rem 1.2rem; }
.info-panel p { font-family: 'Space Mono', monospace; font-size: 0.72rem; color: #374151; margin: 0; line-height: 2; }
.info-panel span { color: #6366f1; }
"""
# ── UI ────────────────────────────────────────────────────────────────────────
# Gradio interface: upload a clip, then show the converted preview, the mouth
# crops fed to the model, the raw decoded token indices, the predicted text
# and an average confidence figure.
# NOTE(review): the symbols in the user-facing strings below (dashes, arrows,
# middle dots, the multiplication sign, circled numbers, the play triangle)
# were restored from mis-decoded UTF-8 - confirm against the intended design.
with gr.Blocks(css=css, title="LipNet — Silent Speech Recognition") as demo:
    gr.HTML("""
    <div class="hero">
        <h1>LipNet</h1>
        <p class="sub">Silent Speech Recognition · No Audio Required</p>
        <span class="badge">Conv3D → BiLSTM × 2 → CTC Decode · GRID Corpus S1</span>
    </div>
    <div style="height:1.5rem"></div>
    """)

    # ── Row 1: Upload + Preview ───────────────────────────────────────────────
    with gr.Row(equal_height=True):
        with gr.Column(scale=1):
            gr.HTML("<div class='section-label'>① Upload Video (.mpg / .mp4)</div>")
            video_input = gr.Video(label="", height=260, sources=["upload"])
            submit_btn = gr.Button("▶ READ LIPS", variant="primary", size="lg")
        with gr.Column(scale=1):
            gr.HTML("<div class='section-label'>② Converted Preview (mp4)</div>")
            video_preview = gr.Video(label="", height=260, interactive=False)

    gr.HTML("<hr class='divider'>")

    # ── Row 2: Mouth GIF + Tokens ─────────────────────────────────────────────
    with gr.Row(equal_height=True):
        with gr.Column(scale=1):
            gr.HTML("<div class='section-label'>③ What the Model Sees — mouth crop · grayscale · normalized</div>")
            gif_preview = gr.Image(label="", height=200, type="filepath")
        with gr.Column(scale=1):
            gr.HTML("<div class='section-label'>④ Raw CTC Token Indices</div>")
            tokens_out = gr.Textbox(
                label="", lines=5, interactive=False,
                placeholder="Token indices will appear here...",
                elem_classes=["mono-out"]
            )

    gr.HTML("<hr class='divider'>")

    # ── Row 3: Prediction + Confidence ────────────────────────────────────────
    with gr.Row():
        with gr.Column(scale=3):
            gr.HTML("<div class='section-label'>⑤ Predicted Text</div>")
            prediction_out = gr.Textbox(
                label="", lines=2, interactive=False,
                placeholder="Prediction will appear here...",
                elem_classes=["prediction-out"]
            )
        with gr.Column(scale=1):
            gr.HTML("<div class='section-label'>⑥ Avg Confidence</div>")
            confidence_out = gr.Textbox(
                label="", lines=2, interactive=False,
                placeholder="—", elem_classes=["confidence-out"]
            )

    gr.HTML("<hr class='divider'>")
    gr.HTML("""
    <div class="info-panel">
        <p>
            <span>ARCHITECTURE</span> · Conv3D(128) → Conv3D(256) → Conv3D(75) → Reshape → BiLSTM(128)×2 → Dense(41) → CTC<br>
            <span>INPUT</span> · 75 frames · mouth crop 46×140 px · grayscale · z-score normalized<br>
            <span>VOCAB</span> · 40 chars — a–z, 1–9, ' ? ! (space) · output dim = 41 (+ CTC blank token)<br>
            <span>DATASET</span> · GRID Corpus Speaker S1 · 500 videos · 450 train / 50 test<br>
            <span>NOTE</span> · Upload frontal-face .mpg or .mp4 videos for best results
        </p>
    </div>
    <div style="height:1.5rem"></div>
    """)

    # The button feeds the uploaded video to predict(); its five return values
    # map onto the five output components in order.
    submit_btn.click(
        fn=predict,
        inputs=[video_input],
        outputs=[video_preview, gif_preview, tokens_out, prediction_out, confidence_out]
    )

demo.launch()