# LipNet Streamlit demo — silent speech recognition (Hugging Face Spaces deployment).
| from __future__ import annotations | |
| from pathlib import Path | |
| import subprocess | |
| import tempfile | |
| import imageio | |
| import numpy as np | |
| import streamlit as st | |
| import tensorflow as tf | |
| from modelutil import load_model | |
| from utils import load_data, num_to_char | |
# ── Page config ───────────────────────────────────────────────────────────────
# Streamlit requires this to be the first st.* call in the script.
_PAGE_SETTINGS = dict(
    page_title="LipNet - Silent Speech Recognition",
    page_icon="π",
    layout="wide",
)
st.set_page_config(**_PAGE_SETTINGS)
# ── Custom CSS ────────────────────────────────────────────────────────────────
# App-wide theme injected as raw HTML: Google fonts (Syne / Space Mono),
# dark palette, gradient H1, and restyled info/success/code/select widgets.
# NOTE(review): the [data-testid=...] selectors target Streamlit's internal
# DOM and may break across Streamlit versions — verify after upgrades.
st.markdown("""
<style>
@import url('https://fonts.googleapis.com/css2?family=Syne:wght@400;700;800&family=Space+Mono&display=swap');
html, body, [class*="css"] {
font-family: 'Syne', sans-serif;
background-color: #07070f;
color: #e2e2f0;
}
.stApp { background-color: #07070f; }
[data-testid="stSidebar"] {
background-color: #0f0f1c !important;
border-right: 1px solid #1e1e32;
}
[data-testid="stSidebar"] * { color: #9ca3af !important; }
h1 {
font-weight: 800 !important;
background: linear-gradient(135deg, #f0f0ff, #c084fc, #818cf8);
-webkit-background-clip: text;
-webkit-text-fill-color: transparent;
letter-spacing: -0.03em;
}
h2, h3 { color: #c084fc !important; font-weight: 700 !important; }
.stAlert { border-radius: 10px !important; }
[data-testid="stInfo"] {
background: #0f0f1c !important;
border: 1px solid #2d2d4e !important;
color: #a5b4fc !important;
font-family: 'Space Mono', monospace;
font-size: 0.82rem;
}
[data-testid="stSuccess"] {
background: #0a1a14 !important;
border: 1px solid #1a3330 !important;
color: #34d399 !important;
font-family: 'Space Mono', monospace;
font-size: 1.1rem;
}
code, pre {
font-family: 'Space Mono', monospace !important;
background: #0a0a16 !important;
color: #a5b4fc !important;
border-radius: 8px !important;
font-size: 0.8rem !important;
}
[data-testid="stSelectbox"] label { color: #6b7280 !important; font-size: 0.8rem; letter-spacing: 0.1em; text-transform: uppercase; }
hr { border-color: #1a1a2e !important; }
</style>
""", unsafe_allow_html=True)
# ── Sidebar ───────────────────────────────────────────────────────────────────
# Static project card: branding, layer-by-layer architecture summary, and
# dataset facts. HTML snippets are hoisted to constants for readability.
_TAGLINE_HTML = (
    "<p style='font-family:Space Mono,monospace;font-size:0.72rem;color:#4b5563;"
    "letter-spacing:0.1em;'>SILENT SPEECH RECOGNITION</p>"
)
_ARCH_HTML = """
<p style='font-family:Space Mono,monospace;font-size:0.72rem;line-height:2;color:#4b5563;'>
Conv3D(128) β<br>
Conv3D(256) β<br>
Conv3D(75) β<br>
Reshape β<br>
BiLSTM(128) β<br>
BiLSTM(128) β<br>
Dense(41) + CTC
</p>
"""
_DATASET_HTML = (
    "<p style='font-family:Space Mono,monospace;font-size:0.72rem;color:#4b5563;"
    "line-height:2;'>GRID Corpus Β· Speaker S1<br>500 videos<br>"
    "450 train / 50 test<br>Vocab: aβz 1β9 ' ? ! (space)</p>"
)
with st.sidebar:
    st.markdown("## π LipNet")
    st.markdown(_TAGLINE_HTML, unsafe_allow_html=True)
    st.divider()
    st.markdown("**Architecture**")
    st.markdown(_ARCH_HTML, unsafe_allow_html=True)
    st.divider()
    st.markdown("**Dataset**")
    st.markdown(_DATASET_HTML, unsafe_allow_html=True)
    st.divider()
    st.caption("No audio. Lips only.")
# ── Title ─────────────────────────────────────────────────────────────────────
# Page heading plus a monospace subtitle pulled up under the H1.
st.title("LipNet - Silent Speech Recognition")
_SUBTITLE_HTML = (
    "<p style='font-family:Space Mono,monospace;font-size:0.78rem;color:#4b5563;"
    "letter-spacing:0.15em;margin-top:-1rem;'>CONV3D + BILSTM + CTC Β· NO AUDIO REQUIRED</p>"
)
st.markdown(_SUBTITLE_HTML, unsafe_allow_html=True)
st.divider()
# ── Data paths ────────────────────────────────────────────────────────────────
# Locate the GRID speaker-1 clips relative to this file and let the user
# pick one; bail out early with a helpful message if the folder is empty.
BASE_DIR = Path(__file__).resolve().parent
DATA_DIR = BASE_DIR / 'data' / 's1'
options = sorted(clip.name for clip in DATA_DIR.glob('*.mpg'))
if not options:
    st.error(f"No `.mpg` videos found in `{DATA_DIR}`. Make sure `data/s1/` is populated.")
    st.stop()
selected_video = st.selectbox("**Choose a video**", options)
file_path = DATA_DIR / selected_video
st.divider()
# ── Load model (cached) ───────────────────────────────────────────────────────
@st.cache_resource
def get_model():
    """Load the LipNet Keras model once per server process.

    The comment said "cached" but no cache decorator was present: Streamlit
    re-executes the whole script on every widget interaction, so the model
    weights were reloaded on each rerun. `st.cache_resource` memoises the
    returned model object across reruns (and sessions) without pickling it.
    """
    return load_model()

model = get_model()
# ── Load frames + alignment (cached per video) ────────────────────────────────
@st.cache_resource
def get_video_data(path: str):
    """Load preprocessed frames and alignment for one video, memoised per path.

    Returns ``(video_tensor, annotations, ground_truth)`` where
    ``ground_truth`` is the alignment decoded to a UTF-8 string. As with
    ``get_model``, the original code claimed caching but had no decorator.
    ``st.cache_resource`` (rather than ``st.cache_data``) is used because the
    return value contains TF tensors, which should be reused by reference
    instead of being serialised — it still keys the cache on ``path``.
    """
    video_tensor, annotations = load_data(tf.convert_to_tensor(path))
    ground_truth = tf.strings.reduce_join(
        num_to_char(annotations)
    ).numpy().decode('utf-8')
    return video_tensor, annotations, ground_truth

video_tensor, annotations, ground_truth = get_video_data(str(file_path))
# ── Two-column layout ─────────────────────────────────────────────────────────
col1, col2 = st.columns(2, gap="large")
# ── Column 1: Video preview + Ground truth ────────────────────────────────────
with col1:
    st.markdown("### πΉ Original Video")
    st.info("Video converted to mp4 for browser playback")
    # Transcode the .mpg to browser-playable H.264 mp4 in a temp file,
    # stream the bytes to st.video, then always clean the temp file up.
    output_path = None
    try:
        # Create (and close) the temp file first so ffmpeg writes to a
        # closed handle — required on Windows, harmless elsewhere.
        with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as f:
            output_path = Path(f.name)
        subprocess.run(
            ["ffmpeg", "-i", str(file_path), "-vcodec", "libx264",
             "-crf", "23", str(output_path), "-y"],
            check=True, capture_output=True, text=True,
        )
        st.video(output_path.read_bytes())
    except FileNotFoundError:
        # subprocess.run raises FileNotFoundError when the ffmpeg binary
        # itself is missing — show a hint instead of an uncaught traceback.
        st.error("`ffmpeg` executable not found. Install ffmpeg and make sure it is on your PATH.")
    except subprocess.CalledProcessError as exc:
        st.error("ffmpeg conversion failed.")
        st.code(exc.stderr or "No error output.")
    finally:
        if output_path and output_path.exists():
            output_path.unlink()
# ── Column 2: Model inference ─────────────────────────────────────────────────
with col2:
    st.markdown("### π§ Model Inference")
    # ── Mouth crop GIF: render the exact model input as an animation ──────────
    st.info("Mouth crop - what the model actually sees (grayscale Β· normalized)")
    gif_path = None
    try:
        with tempfile.NamedTemporaryFile(suffix=".gif", delete=False) as gf:
            gif_path = Path(gf.name)
        frames_np = video_tensor.numpy()
        gif_frames = []
        for frame in frames_np:
            gray = frame[:, :, 0]
            # Min-max normalise each frame to [0, 1]; the epsilon guards
            # against division by zero on a constant frame.
            gray = (gray - gray.min()) / max(gray.max() - gray.min(), 1e-8)
            # Replicate the single channel to RGB directly in NumPy — no need
            # for a TF round-trip just to stack arrays.
            rgb = (255 * np.stack([gray, gray, gray], axis=-1)).astype("uint8")
            gif_frames.append(rgb)
        imageio.mimsave(str(gif_path), gif_frames, fps=10, loop=0)
        st.image(str(gif_path), width=400)
    finally:
        if gif_path and gif_path.exists():
            gif_path.unlink()
    st.divider()
    # ── Raw tokens: greedy CTC decode of the model output ─────────────────────
    st.info("Raw CTC token indices from model output")
    yhat = model.predict(tf.expand_dims(video_tensor, axis=0), verbose=0)
    # Use the model output's actual time dimension instead of a hard-coded 75
    # so decoding stays correct if the clip/frame count ever changes.
    decoded = tf.keras.backend.ctc_decode(
        yhat, input_length=[yhat.shape[1]], greedy=True
    )[0][0].numpy()
    st.code(str(decoded[0].tolist()), language=None)
    # ── Ground truth (moved here) ─────────────────────────────────────────────
    st.divider()
    st.info("Ground truth label (from `.align` file)")
    st.code(ground_truth, language=None)
    st.divider()
    # ── Final prediction: token indices → characters → string ─────────────────
    prediction = tf.strings.reduce_join(
        num_to_char(decoded[0])
    ).numpy().decode('utf-8').strip()
    st.success(f"**Prediction:** {prediction}")
    # ── Confidence: mean over time of the max per-step softmax probability ────
    # (a rough proxy for certainty, not a calibrated confidence score).
    confidence = float(np.mean(np.max(yhat[0], axis=-1)) * 100)
    st.markdown(
        f"<p style='font-family:Space Mono,monospace;font-size:0.78rem;color:#4b5563;'>"
        f"AVG CONFIDENCE Β· <span style='color:#34d399'>{confidence:.1f}%</span></p>",
        unsafe_allow_html=True,
    )