# Hugging Face Space: omm7/lip_reader · Running · File size: 9,553 Bytes

from __future__ import annotations
from pathlib import Path
import subprocess
import tempfile
import imageio
import numpy as np
import streamlit as st
import tensorflow as tf
from modelutil import load_model
from utils import load_data, num_to_char

# ── Page config ───────────────────────────────────────────────────────────────
# Tab title, emoji icon and wide layout are collected in one mapping and
# handed to Streamlit in a single call.
_PAGE_SETTINGS = {
    "page_title": "LipNet - Silent Speech Recognition",
    "page_icon": "👄",
    "layout": "wide",
}
st.set_page_config(**_PAGE_SETTINGS)

# ── Custom CSS ────────────────────────────────────────────────────────────────
# Dark-theme overrides (fonts, sidebar, headings, alert boxes, code blocks).
# The stylesheet lives in a named constant and is injected as raw HTML below.
_THEME_CSS = """
<style>
@import url('https://fonts.googleapis.com/css2?family=Syne:wght@400;700;800&family=Space+Mono&display=swap');
html, body, [class*="css"] {
    font-family: 'Syne', sans-serif;
    background-color: #07070f;
    color: #e2e2f0;
}
.stApp { background-color: #07070f; }
[data-testid="stSidebar"] {
    background-color: #0f0f1c !important;
    border-right: 1px solid #1e1e32;
}
[data-testid="stSidebar"] * { color: #9ca3af !important; }
h1 { 
    font-weight: 800 !important; 
    background: linear-gradient(135deg, #f0f0ff, #c084fc, #818cf8);
    -webkit-background-clip: text;
    -webkit-text-fill-color: transparent;
    letter-spacing: -0.03em;
}
h2, h3 { color: #c084fc !important; font-weight: 700 !important; }
.stAlert { border-radius: 10px !important; }
[data-testid="stInfo"] {
    background: #0f0f1c !important;
    border: 1px solid #2d2d4e !important;
    color: #a5b4fc !important;
    font-family: 'Space Mono', monospace;
    font-size: 0.82rem;
}
[data-testid="stSuccess"] {
    background: #0a1a14 !important;
    border: 1px solid #1a3330 !important;
    color: #34d399 !important;
    font-family: 'Space Mono', monospace;
    font-size: 1.1rem;
}
code, pre {
    font-family: 'Space Mono', monospace !important;
    background: #0a0a16 !important;
    color: #a5b4fc !important;
    border-radius: 8px !important;
    font-size: 0.8rem !important;
}
[data-testid="stSelectbox"] label { color: #6b7280 !important; font-size: 0.8rem; letter-spacing: 0.1em; text-transform: uppercase; }
hr { border-color: #1a1a2e !important; }
</style>
"""
st.markdown(_THEME_CSS, unsafe_allow_html=True)

# ── Sidebar ───────────────────────────────────────────────────────────────────
# Static project summary: tagline, layer stack and dataset facts. Each HTML
# snippet is bound to a name first, then rendered in order.
with st.sidebar:
    tagline_html = (
        "<p style='font-family:Space Mono,monospace;font-size:0.72rem;color:#4b5563;"
        "letter-spacing:0.1em;'>SILENT SPEECH RECOGNITION</p>"
    )
    architecture_html = """
    <p style='font-family:Space Mono,monospace;font-size:0.72rem;line-height:2;color:#4b5563;'>
    Conv3D(128) ↓<br>
    Conv3D(256) ↓<br>
    Conv3D(75)  ↓<br>
    Reshape     ↓<br>
    BiLSTM(128) ↓<br>
    BiLSTM(128) ↓<br>
    Dense(41) + CTC
    </p>
    """
    dataset_html = (
        "<p style='font-family:Space Mono,monospace;font-size:0.72rem;color:#4b5563;"
        "line-height:2;'>GRID Corpus · Speaker S1<br>500 videos<br>"
        "450 train / 50 test<br>Vocab: a–z 1–9 ' ? ! (space)</p>"
    )

    # Heading + tagline
    st.markdown("## 👄 LipNet")
    st.markdown(tagline_html, unsafe_allow_html=True)
    st.divider()

    # Layer-by-layer architecture listing
    st.markdown("**Architecture**")
    st.markdown(architecture_html, unsafe_allow_html=True)
    st.divider()

    # Dataset summary
    st.markdown("**Dataset**")
    st.markdown(dataset_html, unsafe_allow_html=True)
    st.divider()

    st.caption("No audio. Lips only.")

# ── Title ─────────────────────────────────────────────────────────────────────
# Main heading followed by a small monospace subtitle and a separator.
_SUBTITLE_HTML = (
    "<p style='font-family:Space Mono,monospace;font-size:0.78rem;color:#4b5563;"
    "letter-spacing:0.15em;margin-top:-1rem;'>CONV3D + BILSTM + CTC · NO AUDIO REQUIRED</p>"
)
st.title("LipNet - Silent Speech Recognition")
st.markdown(_SUBTITLE_HTML, unsafe_allow_html=True)
st.divider()

# ── Data paths ────────────────────────────────────────────────────────────────
# Videos are looked up in <directory of this file>/data/s1.
BASE_DIR = Path(__file__).resolve().parent
DATA_DIR = BASE_DIR.joinpath('data', 's1')

# Alphabetically sorted file names of every .mpg clip in the data directory.
options = sorted(clip.name for clip in DATA_DIR.glob('*.mpg'))
if not options:
    # Nothing to show: report the problem and halt the script run here.
    missing_msg = f"No `.mpg` videos found in `{DATA_DIR}`. Make sure `data/s1/` is populated."
    st.error(missing_msg)
    st.stop()

selected_video = st.selectbox("**Choose a video**", options)
file_path = DATA_DIR / selected_video
st.divider()

# ── Load model (cached) ───────────────────────────────────────────────────────
@st.cache_resource(show_spinner="Loading LipNet model...")
def get_model():
    """Return the model produced by ``load_model()``.

    Wrapped in ``st.cache_resource`` so the result is reused instead of being
    rebuilt on each call; a spinner message is shown while it loads.
    """
    lipnet = load_model()
    return lipnet


model = get_model()

# ── Load frames + alignment (cached per video) ────────────────────────────────
@st.cache_data(show_spinner="Processing video...")
def get_video_data(path: str):
    """Load one video through ``load_data`` and decode its label to text.

    Args:
        path: Location of the video file, as a string.

    Returns:
        A 3-tuple ``(video_tensor, annotations, ground_truth)``: the two values
        returned by ``load_data`` plus the annotations mapped through
        ``num_to_char``, joined and decoded as a UTF-8 ``str``.
    """
    path_tensor = tf.convert_to_tensor(path)
    frames, alignment = load_data(path_tensor)

    # Map label indices to characters, then join them into a single string.
    label_chars = num_to_char(alignment)
    label_bytes = tf.strings.reduce_join(label_chars).numpy()
    label_text = label_bytes.decode('utf-8')
    return frames, alignment, label_text


video_tensor, annotations, ground_truth = get_video_data(str(file_path))

# ── Two-column layout ─────────────────────────────────────────────────────────
col1, col2 = st.columns(2, gap="large")

# ── Column 1: Video preview ───────────────────────────────────────────────────
# The selected .mpg is re-encoded to H.264/mp4 with ffmpeg into a temporary
# file, handed to st.video as bytes, and the temporary file is then removed.
with col1:
    st.markdown("### 📹 Original Video")
    st.info("Video converted to mp4 for browser playback")

    # Reserve a temp file name; the handle is closed right away so ffmpeg can
    # write to the path itself.
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp_video:
        output_path = Path(tmp_video.name)

    try:
        subprocess.run(
            [
                "ffmpeg",
                "-y",  # the temp file already exists; overwrite without prompting
                "-i", str(file_path),
                "-vcodec", "libx264",
                "-crf", "23",
                str(output_path),
            ],
            check=True, capture_output=True, text=True,
        )
    except FileNotFoundError:
        # subprocess raises this when the ffmpeg binary itself cannot be found;
        # report it in the UI instead of crashing the whole app.
        st.error("ffmpeg executable not found. Install ffmpeg and make sure it is on PATH.")
    except subprocess.CalledProcessError as exc:
        st.error("ffmpeg conversion failed.")
        st.code(exc.stderr or "No error output.")
    else:
        # Only read the output when the conversion succeeded.
        st.video(output_path.read_bytes())
    finally:
        # Always clean up the temporary mp4, whether or not conversion worked.
        if output_path.exists():
            output_path.unlink()



# ── Column 2: Model inference ─────────────────────────────────────────────────
# Shows the preprocessed frames as a GIF, runs the model on the selected video,
# and displays raw decoded indices, the ground-truth label, the predicted text
# and an average confidence figure.
with col2:
    st.markdown("### 🧠 Model Inference")

    # ── Mouth crop GIF ────────────────────────────────────────────────────────
    st.info("Mouth crop - what the model actually sees (grayscale · normalized)")
    gif_path = None
    try:
        with tempfile.NamedTemporaryFile(suffix=".gif", delete=False) as gif_file:
            gif_path = Path(gif_file.name)

        gif_frames = []
        for frame in video_tensor.numpy():
            gray = frame[:, :, 0]  # first channel only
            low = gray.min()
            # Min-max rescale to [0, 1]; the 1e-8 floor avoids dividing by zero
            # when a frame is completely flat.
            gray = (gray - low) / max(gray.max() - low, 1e-8)
            # Replicate the channel three times with NumPy (no TensorFlow round
            # trip per frame) and convert to 8-bit pixels.
            rgb = (255 * np.stack([gray, gray, gray], axis=-1)).astype("uint8")
            gif_frames.append(rgb)

        imageio.mimsave(str(gif_path), gif_frames, fps=10, loop=0)
        st.image(str(gif_path), width=400)
    finally:
        # Remove the temporary GIF once it has been handed to Streamlit.
        if gif_path and gif_path.exists():
            gif_path.unlink()

    st.divider()

    # ── Raw tokens ────────────────────────────────────────────────────────────
    st.info("Raw CTC token indices from model output")
    # Add a leading batch axis of size 1 before calling the model.
    yhat = model.predict(tf.expand_dims(video_tensor, axis=0), verbose=0)
    # Take the sequence length from the prediction itself (axis 1) rather than
    # hard-coding the frame count, so decoding stays correct if it changes.
    decoded = tf.keras.backend.ctc_decode(
        yhat, input_length=[yhat.shape[1]], greedy=True
    )[0][0].numpy()
    st.code(str(decoded[0].tolist()), language=None)

    # ── Ground truth ──────────────────────────────────────────────────────────
    st.divider()
    st.info("Ground truth label (from `.align` file)")
    st.code(ground_truth, language=None)

    st.divider()
    # ── Final prediction ──────────────────────────────────────────────────────
    # Map decoded indices back to characters and join them into one string.
    prediction = tf.strings.reduce_join(
        num_to_char(decoded[0])
    ).numpy().decode('utf-8').strip()
    st.success(f"**Prediction:** {prediction}")

    # ── Confidence ────────────────────────────────────────────────────────────
    # Mean over time steps of the largest per-step output value, as a percent.
    confidence = float(np.mean(np.max(yhat[0], axis=-1)) * 100)
    st.markdown(
        f"<p style='font-family:Space Mono,monospace;font-size:0.78rem;color:#4b5563;'>"
        f"AVG CONFIDENCE · <span style='color:#34d399'>{confidence:.1f}%</span></p>",
        unsafe_allow_html=True,
    )