omm7 committed on
Commit
06b5e5b
·
verified ·
1 Parent(s): cdae4af

Upload app/app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app/app.py +227 -0
app/app.py ADDED
@@ -0,0 +1,227 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
from __future__ import annotations

import subprocess
import tempfile
from pathlib import Path

import imageio
import numpy as np
import streamlit as st
import tensorflow as tf

from modelutil import load_model
from utils import load_data, num_to_char

# ── Page config ───────────────────────────────────────────────────────────────
st.set_page_config(
    page_title="LipNet — Silent Speech Recognition",
    page_icon="👄",
    layout="wide",
)

# ── Custom CSS ────────────────────────────────────────────────────────────────
# Dark theme: Syne for headings, Space Mono for data/labels.
st.markdown("""
<style>
@import url('https://fonts.googleapis.com/css2?family=Syne:wght@400;700;800&family=Space+Mono&display=swap');

html, body, [class*="css"] {
    font-family: 'Syne', sans-serif;
    background-color: #07070f;
    color: #e2e2f0;
}
.stApp { background-color: #07070f; }

/* Sidebar */
[data-testid="stSidebar"] {
    background-color: #0f0f1c !important;
    border-right: 1px solid #1e1e32;
}
[data-testid="stSidebar"] * { color: #9ca3af !important; }

/* Headers */
h1 {
    font-weight: 800 !important;
    background: linear-gradient(135deg, #f0f0ff, #c084fc, #818cf8);
    -webkit-background-clip: text;
    -webkit-text-fill-color: transparent;
    letter-spacing: -0.03em;
}
h2, h3 { color: #c084fc !important; font-weight: 700 !important; }

/* Info / success boxes */
.stAlert { border-radius: 10px !important; }
[data-testid="stInfo"] {
    background: #0f0f1c !important;
    border: 1px solid #2d2d4e !important;
    color: #a5b4fc !important;
    font-family: 'Space Mono', monospace;
    font-size: 0.82rem;
}
[data-testid="stSuccess"] {
    background: #0a1a14 !important;
    border: 1px solid #1a3330 !important;
    color: #34d399 !important;
    font-family: 'Space Mono', monospace;
    font-size: 1.1rem;
}

/* Code / preformatted */
code, pre {
    font-family: 'Space Mono', monospace !important;
    background: #0a0a16 !important;
    color: #a5b4fc !important;
    border-radius: 8px !important;
    font-size: 0.8rem !important;
}

/* Selectbox */
[data-testid="stSelectbox"] label { color: #6b7280 !important; font-size: 0.8rem; letter-spacing: 0.1em; text-transform: uppercase; }

/* Divider */
hr { border-color: #1a1a2e !important; }
</style>
""", unsafe_allow_html=True)

# ── Sidebar ───────────────────────────────────────────────────────────────────
# Static project overview: architecture summary and dataset facts.
with st.sidebar:
    st.markdown("## 👄 LipNet")
    st.markdown(
        "<p style='font-family:Space Mono,monospace;font-size:0.72rem;color:#4b5563;"
        "letter-spacing:0.1em;'>SILENT SPEECH RECOGNITION</p>",
        unsafe_allow_html=True,
    )
    st.divider()
    st.markdown("**Architecture**")
    st.markdown("""
<p style='font-family:Space Mono,monospace;font-size:0.72rem;line-height:2;color:#4b5563;'>
Conv3D(128) ↓<br>
Conv3D(256) ↓<br>
Conv3D(75) ↓<br>
Reshape ↓<br>
BiLSTM(128) ↓<br>
BiLSTM(128) ↓<br>
Dense(41) + CTC
</p>
""", unsafe_allow_html=True)
    st.divider()
    st.markdown("**Dataset**")
    st.markdown(
        "<p style='font-family:Space Mono,monospace;font-size:0.72rem;color:#4b5563;"
        "line-height:2;'>GRID Corpus · Speaker S1<br>500 videos<br>"
        "450 train / 50 test<br>Vocab: a–z 1–9 ' ? ! (space)</p>",
        unsafe_allow_html=True,
    )
    st.divider()
    st.caption("No audio. Lips only.")

# ── Title ─────────────────────────────────────────────────────────────────────
st.title("LipNet — Silent Speech Recognition")
st.markdown(
    "<p style='font-family:Space Mono,monospace;font-size:0.78rem;color:#4b5563;"
    "letter-spacing:0.15em;margin-top:-1rem;'>CONV3D + BILSTM + CTC · NO AUDIO REQUIRED</p>",
    unsafe_allow_html=True,
)
st.divider()

# ── Data paths ────────────────────────────────────────────────────────────────
BASE_DIR = Path(__file__).resolve().parent
DATA_DIR = BASE_DIR / 'data' / 's1'

# Sorted list of available sample clips (GRID corpus .mpg files).
options = sorted(item.name for item in DATA_DIR.glob('*.mpg'))
if not options:
    st.error(f"No `.mpg` videos found in `{DATA_DIR}`. Make sure `data/s1/` is populated.")
    st.stop()  # halt the script run — nothing below can work without data

selected_video = st.selectbox("**Choose a video**", options)
file_path = DATA_DIR / selected_video

st.divider()

# ── Load model (cached) ───────────────────────────────────────────────────────
@st.cache_resource(show_spinner="Loading LipNet model...")
def get_model():
    """Load the LipNet model once per process; Streamlit caches the instance."""
    return load_model()

model = get_model()

# ── Two-column layout ─────────────────────────────────────────────────────────
col1, col2 = st.columns(2, gap="large")

# ── Column 1: Video preview ───────────────────────────────────────────────────
with col1:
    st.markdown("### 📹 Original Video")
    st.info("Video converted to mp4 for browser playback")

    output_path = None
    try:
        # Reserve a temp file name only; ffmpeg (-y) overwrites it with the mp4.
        # Keeping the handle closed before ffmpeg writes avoids sharing issues.
        with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as f:
            output_path = Path(f.name)
        subprocess.run(
            ["ffmpeg", "-y", "-i", str(file_path), "-vcodec", "libx264",
             "-crf", "23", str(output_path)],
            check=True, capture_output=True, text=True,
        )
        st.video(output_path.read_bytes())
    except FileNotFoundError:
        # ffmpeg binary is not on PATH — report instead of crashing the app.
        st.error("`ffmpeg` not found on PATH. Install ffmpeg to enable video preview.")
    except subprocess.CalledProcessError as exc:
        st.error("ffmpeg conversion failed.")
        st.code(exc.stderr or "No error output.")
    finally:
        # Always remove the temp mp4, even on conversion failure.
        if output_path and output_path.exists():
            output_path.unlink()

# ── Column 2: Model inference ─────────────────────────────────────────────────
with col2:
    st.markdown("### 🧠 Model Inference")

    # Load preprocessed mouth-crop frames + ground-truth alignment tokens.
    video_tensor, annotations = load_data(tf.convert_to_tensor(str(file_path)))

    # ── Mouth crop GIF ────────────────────────────────────────────────────────
    st.info("Mouth crop — what the model actually sees (grayscale · normalized)")
    gif_path = None
    try:
        # Reserve a temp name; close the handle before imageio writes the GIF.
        with tempfile.NamedTemporaryFile(suffix=".gif", delete=False) as gf:
            gif_path = Path(gf.name)
        frames_np = video_tensor.numpy()
        gif_frames = []
        for frame in frames_np:
            gray = frame[:, :, 0]
            # Min-max normalize each frame to [0, 1]; epsilon guards flat frames.
            gray = (gray - gray.min()) / max(gray.max() - gray.min(), 1e-8)
            # Replicate the gray channel to RGB uint8 for GIF encoding.
            gif_frames.append((255 * np.stack([gray, gray, gray], axis=-1)).astype("uint8"))
        imageio.mimsave(str(gif_path), gif_frames, fps=10, loop=0)
        st.image(str(gif_path), width=400)
    finally:
        if gif_path and gif_path.exists():
            gif_path.unlink()

    st.divider()

    # ── Ground truth ──────────────────────────────────────────────────────────
    st.info("Ground truth label (from `.align` file)")
    ground_truth = tf.strings.reduce_join(
        num_to_char(annotations)
    ).numpy().decode('utf-8')
    st.code(ground_truth, language=None)

    st.divider()

    # ── Raw tokens ────────────────────────────────────────────────────────────
    st.info("Raw CTC token indices from model output")
    yhat = model.predict(tf.expand_dims(video_tensor, axis=0), verbose=0)
    # input_length=[75]: fixed frame count per clip; greedy = best-path decode.
    decoded = tf.keras.backend.ctc_decode(yhat, input_length=[75], greedy=True)[0][0].numpy()
    st.code(str(decoded[0].tolist()), language=None)

    st.divider()

    # ── Final prediction ──────────────────────────────────────────────────────
    # NOTE(review): ctc_decode pads its output with -1; this assumes
    # num_to_char maps those to blanks that .strip() removes — confirm
    # against utils.num_to_char.
    prediction = tf.strings.reduce_join(
        num_to_char(decoded[0])
    ).numpy().decode('utf-8').strip()

    st.success(f"**Prediction:** {prediction}")

    # ── Confidence ────────────────────────────────────────────────────────────
    # Mean over timesteps of the max softmax probability, as a rough %.
    confidence = float(np.mean(np.max(yhat[0], axis=-1)) * 100)
    st.markdown(
        f"<p style='font-family:Space Mono,monospace;font-size:0.78rem;color:#4b5563;'>"
        f"AVG CONFIDENCE · <span style='color:#34d399'>{confidence:.1f}%</span></p>",
        unsafe_allow_html=True,
    )