Spaces:
Build error
Build error
First Commit
Browse filesHopefully everything works
- README.md +30 -8
- app.py +235 -0
- checkpoint.weights.h5 +3 -0
- requirements.txt +6 -0
README.md
CHANGED
|
@@ -1,14 +1,36 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: gradio
|
| 7 |
-
sdk_version:
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
-
license: mit
|
| 11 |
-
short_description: Reads Lips - Predicts sentences said in video without audio
|
| 12 |
---
|
| 13 |
|
| 14 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: LipNet Silent Speech Recognition
|
| 3 |
+
emoji: π
|
| 4 |
+
colorFrom: purple
|
| 5 |
+
colorTo: blue
|
| 6 |
sdk: gradio
|
| 7 |
+
sdk_version: 4.44.0
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
|
|
|
|
|
|
| 10 |
---
|
| 11 |
|
| 12 |
+
# LipNet — Silent Speech Recognition
|
| 13 |
+
|
| 14 |
+
A deep learning model that reads lips from video and predicts spoken text — no audio required.
|
| 15 |
+
|
| 16 |
+
## Model Architecture
|
| 17 |
+
- **3× Conv3D** layers for spatiotemporal feature extraction
|
| 18 |
+
- **2× Bidirectional LSTM** layers for sequence modelling
|
| 19 |
+
- **CTC Loss** for sequence-to-sequence alignment
|
| 20 |
+
- Input: 75 frames of mouth region (46×140 px, grayscale)
|
| 21 |
+
|
| 22 |
+
## How to Use
|
| 23 |
+
1. Upload a short `.mpg` or `.mp4` video showing a frontal face
|
| 24 |
+
2. Click **READ LIPS**
|
| 25 |
+
3. The predicted sentence appears on the right
|
| 26 |
+
|
| 27 |
+
## Dataset
|
| 28 |
+
Trained on the [GRID Corpus](https://spandh.dcs.shef.ac.uk/gridcorpus/) — Speaker S1.
|
| 29 |
+
Vocabulary: `a-z`, digits `1-9`, punctuation `'?!` and space (39 characters; 40 including the lookup layer's OOV token).
|
| 30 |
+
|
| 31 |
+
## Files
|
| 32 |
+
```
|
| 33 |
+
app.py → Gradio app + inference
|
| 34 |
+
requirements.txt → Dependencies
|
| 35 |
+
models/checkpoint.weights.h5 → Model weights (upload manually)
|
| 36 |
+
```
|
app.py
ADDED
|
@@ -0,0 +1,235 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import cv2
|
| 3 |
+
import tempfile
|
| 4 |
+
import subprocess
|
| 5 |
+
import numpy as np
|
| 6 |
+
import imageio
|
| 7 |
+
import tensorflow as tf
|
| 8 |
+
from tensorflow.keras.models import Sequential
|
| 9 |
+
from tensorflow.keras.layers import (Conv3D, LSTM, Dense, Dropout,
|
| 10 |
+
Bidirectional, MaxPool3D, Activation, Reshape)
|
| 11 |
+
import gradio as gr
|
| 12 |
+
|
| 13 |
+
# ── Vocabulary ────────────────────────────────────────────────────────────────
# 39 raw characters: a-z, apostrophe, '?', '!', digits 1-9, and space.
vocab = [x for x in "abcdefghijklmnopqrstuvwxyz'?!123456789 "]
# Character -> integer-id lookup; the empty-string OOV token adds one extra
# slot, so vocabulary_size() is 40 (the model's Dense layer adds +1 for the
# CTC blank on top of that).
char_to_num = tf.keras.layers.StringLookup(vocabulary=vocab, oov_token="")
# Inverse lookup used to turn decoded CTC indices back into characters.
num_to_char = tf.keras.layers.StringLookup(
    vocabulary=char_to_num.get_vocabulary(), oov_token="", invert=True
)
|
| 19 |
+
|
| 20 |
+
# ββ Build & Load Model ββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 21 |
+
def build_model():
    """Construct the LipNet network: 3x Conv3D -> 2x BiLSTM -> softmax.

    The input is a clip of 75 grayscale mouth-crop frames of 46x140 px
    (shape (75, 46, 140, 1)); the output is one softmax distribution per
    frame over the character vocabulary plus one extra class reserved for
    the CTC blank token.
    """
    # Three (1, 2, 2) poolings halve the spatial grid each time:
    # 46x140 -> 23x70 -> 11x35 -> 5x17, with 75 channels at the end,
    # so each time step flattens to 5 * 17 * 75 features.
    per_step_features = 5 * 17 * 75

    layer_stack = [
        Conv3D(128, 3, input_shape=(75, 46, 140, 1), padding='same'),
        Activation('relu'),
        MaxPool3D((1, 2, 2)),
        Conv3D(256, 3, padding='same'),
        Activation('relu'),
        MaxPool3D((1, 2, 2)),
        Conv3D(75, 3, padding='same'),
        Activation('relu'),
        MaxPool3D((1, 2, 2)),
        # Collapse each frame's spatial grid into a flat feature vector,
        # keeping the 75-step time axis for the recurrent layers.
        Reshape((75, per_step_features)),
        Bidirectional(LSTM(128, kernel_initializer='Orthogonal', return_sequences=True)),
        Dropout(0.5),
        Bidirectional(LSTM(128, kernel_initializer='Orthogonal', return_sequences=True)),
        Dropout(0.5),
        # +1 output class for the CTC blank token.
        Dense(char_to_num.vocabulary_size() + 1,
              kernel_initializer='he_normal', activation='softmax'),
    ]

    net = Sequential()
    for layer in layer_stack:
        net.add(layer)
    return net
|
| 40 |
+
|
| 41 |
+
# Instantiate the network once at import time and load pretrained weights.
# NOTE(review): the README's file listing says models/checkpoint.weights.h5,
# but the checkpoint is committed at the repo root — confirm the path used
# here matches where the weights actually live.
model = build_model()
model.load_weights('checkpoint.weights.h5')
|
| 43 |
+
|
| 44 |
+
# ── Video Processing ──────────────────────────────────────────────────────────
def load_video_frames(path: str):
    """Read a video, crop the mouth region, and return a normalized clip.

    Each frame is converted to grayscale and cropped to rows 190:236 and
    columns 80:220 — a fixed 46x140 px mouth window (presumably tuned for
    GRID-corpus framing; off-center faces will crop the wrong region).
    The clip is padded with black frames or truncated to exactly 75 frames,
    then z-score normalized over the whole clip.

    Args:
        path: Filesystem path to the input video.

    Returns:
        A float32 tensor of shape (75, 46, 140, 1).

    Raises:
        ValueError: If no frames could be read from the video.
    """
    cap = cv2.VideoCapture(path)
    processed_frames = []
    try:
        for _ in range(int(cap.get(cv2.CAP_PROP_FRAME_COUNT))):
            ret, frame = cap.read()
            if not ret:
                break
            gray = tf.image.rgb_to_grayscale(tf.cast(frame, tf.float32))
            processed_frames.append(gray[190:236, 80:220, :])
    finally:
        # Release the capture even if decoding raises mid-loop.
        cap.release()

    # Fix: the original indexed processed_frames[0] unconditionally in the
    # padding branch, raising a bare IndexError for empty/unreadable videos.
    # Raise a descriptive error instead (predict() surfaces it in the UI).
    if not processed_frames:
        raise ValueError(f"Could not read any frames from video: {path}")

    target = 75
    if len(processed_frames) < target:
        # Pad short clips with black frames so the model always sees 75.
        pad = [tf.zeros_like(processed_frames[0])] * (target - len(processed_frames))
        processed_frames = processed_frames + pad
    else:
        processed_frames = processed_frames[:target]

    frames_tensor = tf.stack(processed_frames)
    mean = tf.math.reduce_mean(frames_tensor)
    # Clamp std away from zero (e.g. an all-black clip) to avoid div-by-zero.
    std = tf.maximum(tf.math.reduce_std(tf.cast(frames_tensor, tf.float32)), 1e-8)
    return tf.cast((frames_tensor - mean), tf.float32) / std
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def convert_to_mp4(input_path: str) -> str:
    """Re-encode a video to a browser-playable H.264/AAC mp4 via ffmpeg.

    Best-effort: if ffmpeg is missing or the conversion fails, the original
    path is returned unchanged so the caller can still try to display it.

    Args:
        input_path: Path to the uploaded video (e.g. .mpg).

    Returns:
        Path to the converted .mp4 on success, otherwise ``input_path``.
    """
    out = tempfile.NamedTemporaryFile(suffix='.mp4', delete=False)
    out.close()
    try:
        subprocess.run(
            ['ffmpeg', '-y', '-i', input_path, '-vcodec', 'libx264', '-acodec', 'aac', out.name],
            check=True, capture_output=True
        )
        return out.name
    except (subprocess.CalledProcessError, OSError):
        # Fix: the original caught bare Exception and leaked the delete=False
        # temp file on every failed conversion. Narrow the handler (ffmpeg
        # missing -> FileNotFoundError/OSError; nonzero exit ->
        # CalledProcessError) and remove the orphaned output file.
        try:
            os.remove(out.name)
        except OSError:
            pass
        return input_path
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
def make_mouth_gif(frames_tensor) -> str:
    """Render the normalized mouth-crop frames as an animated GIF.

    Each grayscale frame is independently min-max rescaled to [0, 1],
    replicated across three channels to make an RGB image, and written to
    a temporary .gif that loops forever at 10 fps.

    Args:
        frames_tensor: Tensor of shape (75, 46, 140, 1) from
            load_video_frames (z-score normalized, so values can be
            negative).

    Returns:
        Path to the generated GIF file.
    """
    rendered = []
    for frame in frames_tensor.numpy():
        plane = frame[:, :, 0]
        # Shift to zero minimum, then scale by the range (guarding the
        # degenerate all-constant frame, whose range is zero).
        plane = plane - plane.min()
        span = plane.max()
        if span > 0:
            plane = plane / span
        rgb = np.stack([plane, plane, plane], axis=-1)
        rendered.append((rgb * 255).astype(np.uint8))

    gif_file = tempfile.NamedTemporaryFile(suffix='.gif', delete=False)
    gif_file.close()
    imageio.mimsave(gif_file.name, rendered, fps=10, loop=0)
    return gif_file.name
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
# ── Inference ─────────────────────────────────────────────────────────────────
def predict(video_path: str):
    """Run the full lip-reading pipeline on one uploaded video.

    Pipeline: load and normalize 75 mouth-crop frames -> convert the upload
    to a browser-playable mp4 -> render a GIF of what the model sees ->
    run the model -> greedy CTC decode -> map indices back to characters.

    Returns a 5-tuple matching the Gradio outputs:
    (mp4 preview path, gif path, raw CTC token indices as a string,
    decoded sentence, average per-frame confidence as a percent string).
    Failures are surfaced through the text outputs rather than raised.
    """
    if video_path is None:
        return None, None, "Upload a video first.", "(no prediction)", "β"
    try:
        frames_tensor = load_video_frames(video_path)
        mp4_path = convert_to_mp4(video_path)
        gif_path = make_mouth_gif(frames_tensor)

        # Add the batch dimension: (1, 75, 46, 140, 1).
        inp = tf.expand_dims(frames_tensor, axis=0)
        yhat = model.predict(inp, verbose=0)

        # Greedy CTC decode over all 75 time steps (batch of one).
        decoded_indices = tf.keras.backend.ctc_decode(
            yhat, input_length=[75], greedy=True
        )[0][0].numpy()

        # Raw index sequence for the debug panel.
        tokens_str = str(decoded_indices[0].tolist())
        prediction = tf.strings.reduce_join(
            num_to_char(decoded_indices[0])
        ).numpy().decode('utf-8').strip() or "(no prediction)"

        # Mean of the per-frame max softmax probability — a rough confidence
        # proxy, not a calibrated probability.
        confidence = float(np.mean(np.max(yhat[0], axis=-1)) * 100)

        return mp4_path, gif_path, tokens_str, prediction, f"{confidence:.1f}%"

    except Exception as e:
        # Broad catch is deliberate: the UI should display the error message
        # in the output boxes instead of crashing the Space.
        err = f"Error: {str(e)}"
        return None, None, err, err, "β"
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
# ── CSS ───────────────────────────────────────────────────────────────────────
# Custom dark theme injected into gr.Blocks: Syne for display text, Space Mono
# for the monospace debug panels, purple/indigo accent gradient. The class
# names (.mono-out, .prediction-out, .confidence-out) are attached to widgets
# below via elem_classes.
css = """
@import url('https://fonts.googleapis.com/css2?family=Syne:wght@400;600;700;800&family=Space+Mono:ital@0;1&display=swap');

body, .gradio-container { background: #07070f !important; font-family: 'Syne', sans-serif !important; color: #e2e2f0 !important; }

.hero { text-align: center; padding: 2.5rem 1rem 0.5rem; }
.hero h1 { font-size: 3.5rem; font-weight: 800; letter-spacing: -0.04em; background: linear-gradient(135deg, #f0f0ff 0%, #c084fc 40%, #818cf8 100%); -webkit-background-clip: text; -webkit-text-fill-color: transparent; margin: 0 0 0.3rem; line-height: 1; }
.hero .sub { font-family: 'Space Mono', monospace; font-size: 0.72rem; color: #4b5563; letter-spacing: 0.18em; text-transform: uppercase; }
.hero .badge { display: inline-block; margin-top: 0.7rem; padding: 0.25rem 0.75rem; border: 1px solid #2d2d4e; border-radius: 999px; font-family: 'Space Mono', monospace; font-size: 0.68rem; color: #7c7c9e; background: #0f0f1e; }

.section-label { font-family: 'Space Mono', monospace; font-size: 0.68rem; letter-spacing: 0.15em; text-transform: uppercase; color: #4b5563; margin-bottom: 0.4rem; padding-left: 2px; }

.divider { border: none; border-top: 1px solid #1a1a2e; margin: 1.2rem 0; }

.mono-out textarea { font-family: 'Space Mono', monospace !important; font-size: 0.82rem !important; background: #0a0a16 !important; color: #a5b4fc !important; border: 1px solid #1e1e38 !important; border-radius: 10px !important; }

.prediction-out textarea { font-family: 'Syne', sans-serif !important; font-size: 1.6rem !important; font-weight: 700 !important; background: #0a0a16 !important; color: #c084fc !important; border: 1px solid #2d1f4e !important; border-radius: 10px !important; text-align: center !important; }

.confidence-out textarea { font-family: 'Space Mono', monospace !important; font-size: 1.1rem !important; background: #0a0a16 !important; color: #34d399 !important; border: 1px solid #1a3330 !important; border-radius: 10px !important; text-align: center !important; }

button.lg { background: linear-gradient(135deg, #7c3aed 0%, #4f46e5 100%) !important; border: none !important; border-radius: 10px !important; font-family: 'Syne', sans-serif !important; font-weight: 700 !important; font-size: 1rem !important; letter-spacing: 0.06em !important; color: white !important; }

.info-panel { background: #0c0c1a; border: 1px solid #1a1a2e; border-radius: 12px; padding: 1rem 1.2rem; }
.info-panel p { font-family: 'Space Mono', monospace; font-size: 0.72rem; color: #374151; margin: 0; line-height: 2; }
.info-panel span { color: #6366f1; }
"""
|
| 156 |
+
|
| 157 |
+
# ── UI ────────────────────────────────────────────────────────────────────────
# Layout: hero banner, then three rows — (1) upload + converted preview,
# (2) mouth-crop GIF + raw CTC tokens, (3) decoded text + confidence —
# followed by a static architecture/info panel. The READ LIPS button drives
# predict() and fans its 5-tuple out to the five output widgets.
with gr.Blocks(css=css, title="LipNet β Silent Speech Recognition") as demo:

    # Hero banner (static HTML styled by the css string above).
    gr.HTML("""
    <div class="hero">
        <h1>LipNet</h1>
        <p class="sub">Silent Speech Recognition Β· No Audio Required</p>
        <span class="badge">Conv3D β BiLSTM Γ 2 β CTC Decode Β· GRID Corpus S1</span>
    </div>
    <div style="height:1.5rem"></div>
    """)

    # ── Row 1: Upload + Preview ───────────────────────────────────────────────
    with gr.Row(equal_height=True):
        with gr.Column(scale=1):
            gr.HTML("<div class='section-label'>β Upload Video (.mpg / .mp4)</div>")
            video_input = gr.Video(label="", height=260, sources=["upload"])
            submit_btn = gr.Button("βΆ READ LIPS", variant="primary", size="lg")

        with gr.Column(scale=1):
            gr.HTML("<div class='section-label'>β‘ Converted Preview (mp4)</div>")
            # Read-only: filled by predict() with the ffmpeg-converted mp4.
            video_preview = gr.Video(label="", height=260, interactive=False)

    gr.HTML("<hr class='divider'>")

    # ── Row 2: Mouth GIF + Tokens ─────────────────────────────────────────────
    with gr.Row(equal_height=True):
        with gr.Column(scale=1):
            gr.HTML("<div class='section-label'>β’ What the Model Sees β mouth crop Β· grayscale Β· normalized</div>")
            # type="filepath" so predict() can return the GIF's temp path.
            gif_preview = gr.Image(label="", height=200, type="filepath")

        with gr.Column(scale=1):
            gr.HTML("<div class='section-label'>β£ Raw CTC Token Indices</div>")
            tokens_out = gr.Textbox(
                label="", lines=5, interactive=False,
                placeholder="Token indices will appear here...",
                elem_classes=["mono-out"]
            )

    gr.HTML("<hr class='divider'>")

    # ── Row 3: Prediction + Confidence ────────────────────────────────────────
    with gr.Row():
        with gr.Column(scale=3):
            gr.HTML("<div class='section-label'>β€ Predicted Text</div>")
            prediction_out = gr.Textbox(
                label="", lines=2, interactive=False,
                placeholder="Prediction will appear here...",
                elem_classes=["prediction-out"]
            )
        with gr.Column(scale=1):
            gr.HTML("<div class='section-label'>β₯ Avg Confidence</div>")
            confidence_out = gr.Textbox(
                label="", lines=2, interactive=False,
                placeholder="β", elem_classes=["confidence-out"]
            )

    gr.HTML("<hr class='divider'>")

    # Static info panel summarizing architecture, input format, and dataset.
    gr.HTML("""
    <div class="info-panel">
        <p>
        <span>ARCHITECTURE</span> Β· Conv3D(128) β Conv3D(256) β Conv3D(75) β Reshape β BiLSTM(128)Γ2 β Dense(41) β CTC<br>
        <span>INPUT</span> Β· 75 frames Β· mouth crop 46Γ140 px Β· grayscale Β· z-score normalized<br>
        <span>VOCAB</span> Β· 40 chars β aβz, 1β9, ' ? ! (space) Β· output dim = 41 (+ CTC blank token)<br>
        <span>DATASET</span> Β· GRID Corpus Speaker S1 Β· 500 videos Β· 450 train / 50 test<br>
        <span>NOTE</span> Β· Upload frontal-face .mpg or .mp4 videos for best results
        </p>
    </div>
    <div style="height:1.5rem"></div>
    """)

    # Wire the button to the inference pipeline; outputs map 1:1 onto
    # predict()'s returned 5-tuple.
    submit_btn.click(
        fn=predict,
        inputs=[video_input],
        outputs=[video_preview, gif_preview, tokens_out, prediction_out, confidence_out]
    )

demo.launch()
|
checkpoint.weights.h5
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2c83685a701a669da61e49860463943d0a5fd0a52cbe813c3b2b3ddf075fd3c0
|
| 3 |
+
size 101741136
|
requirements.txt
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
tensorflow-cpu==2.15.0
|
| 2 |
+
opencv-python-headless==4.9.0.80
|
| 3 |
+
gradio==4.44.0
|
| 4 |
+
numpy==1.26.4
|
| 5 |
+
imageio==2.34.0
|
| 6 |
+
ffmpeg-python==0.2.0
|