# LipNet Streamlit demo — silent speech recognition (Hugging Face Spaces deployment).
| from __future__ import annotations | |
| from pathlib import Path | |
| import subprocess | |
| import tempfile | |
| import imageio | |
| import numpy as np | |
| import streamlit as st | |
| import tensorflow as tf | |
| from modelutil import load_model | |
| from utils import load_data, num_to_char | |
# ── Page config ───────────────────────────────────────────────────────────────
# Streamlit requires this to be the first st.* call in the script.
_PAGE_SETTINGS = dict(
    page_title="LipNet - Silent Speech Recognition",
    page_icon="π",
    layout="wide",
)
st.set_page_config(**_PAGE_SETTINGS)
# ── Custom CSS ────────────────────────────────────────────────────────────────
# App-wide theme injected as raw HTML: Google fonts (Syne / Space Mono),
# dark palette, gradient H1, and restyled info/success/code/select widgets.
# NOTE(review): the [data-testid=...] selectors target Streamlit's internal
# DOM and may break across Streamlit versions — verify after upgrades.
st.markdown("""
<style>
@import url('https://fonts.googleapis.com/css2?family=Syne:wght@400;700;800&family=Space+Mono&display=swap');
html, body, [class*="css"] {
font-family: 'Syne', sans-serif;
background-color: #07070f;
color: #e2e2f0;
}
.stApp { background-color: #07070f; }
[data-testid="stSidebar"] {
background-color: #0f0f1c !important;
border-right: 1px solid #1e1e32;
}
[data-testid="stSidebar"] * { color: #9ca3af !important; }
h1 {
font-weight: 800 !important;
background: linear-gradient(135deg, #f0f0ff, #c084fc, #818cf8);
-webkit-background-clip: text;
-webkit-text-fill-color: transparent;
letter-spacing: -0.03em;
}
h2, h3 { color: #c084fc !important; font-weight: 700 !important; }
.stAlert { border-radius: 10px !important; }
[data-testid="stInfo"] {
background: #0f0f1c !important;
border: 1px solid #2d2d4e !important;
color: #a5b4fc !important;
font-family: 'Space Mono', monospace;
font-size: 0.82rem;
}
[data-testid="stSuccess"] {
background: #0a1a14 !important;
border: 1px solid #1a3330 !important;
color: #34d399 !important;
font-family: 'Space Mono', monospace;
font-size: 1.1rem;
}
code, pre {
font-family: 'Space Mono', monospace !important;
background: #0a0a16 !important;
color: #a5b4fc !important;
border-radius: 8px !important;
font-size: 0.8rem !important;
}
[data-testid="stSelectbox"] label { color: #6b7280 !important; font-size: 0.8rem; letter-spacing: 0.1em; text-transform: uppercase; }
hr { border-color: #1a1a2e !important; }
</style>
""", unsafe_allow_html=True)
# ── Sidebar ───────────────────────────────────────────────────────────────────
# Static project card: branding, layer-by-layer architecture summary, and
# dataset facts. HTML snippets are hoisted to constants for readability.
_TAGLINE_HTML = (
    "<p style='font-family:Space Mono,monospace;font-size:0.72rem;color:#4b5563;"
    "letter-spacing:0.1em;'>SILENT SPEECH RECOGNITION</p>"
)
_ARCH_HTML = """
<p style='font-family:Space Mono,monospace;font-size:0.72rem;line-height:2;color:#4b5563;'>
Conv3D(128) β<br>
Conv3D(256) β<br>
Conv3D(75) β<br>
Reshape β<br>
BiLSTM(128) β<br>
BiLSTM(128) β<br>
Dense(41) + CTC
</p>
"""
_DATASET_HTML = (
    "<p style='font-family:Space Mono,monospace;font-size:0.72rem;color:#4b5563;"
    "line-height:2;'>GRID Corpus Β· Speaker S1<br>500 videos<br>"
    "450 train / 50 test<br>Vocab: aβz 1β9 ' ? ! (space)</p>"
)
with st.sidebar:
    st.markdown("## π LipNet")
    st.markdown(_TAGLINE_HTML, unsafe_allow_html=True)
    st.divider()
    st.markdown("**Architecture**")
    st.markdown(_ARCH_HTML, unsafe_allow_html=True)
    st.divider()
    st.markdown("**Dataset**")
    st.markdown(_DATASET_HTML, unsafe_allow_html=True)
    st.divider()
    st.caption("No audio. Lips only.")
# ── Title ─────────────────────────────────────────────────────────────────────
# Page heading plus a monospace subtitle pulled up under the H1.
st.title("LipNet - Silent Speech Recognition")
_SUBTITLE_HTML = (
    "<p style='font-family:Space Mono,monospace;font-size:0.78rem;color:#4b5563;"
    "letter-spacing:0.15em;margin-top:-1rem;'>CONV3D + BILSTM + CTC Β· NO AUDIO REQUIRED</p>"
)
st.markdown(_SUBTITLE_HTML, unsafe_allow_html=True)
st.divider()
# ── Data paths ────────────────────────────────────────────────────────────────
# Locate the GRID speaker-1 clips relative to this file and let the user
# pick one; bail out early with a helpful message if the folder is empty.
BASE_DIR = Path(__file__).resolve().parent
DATA_DIR = BASE_DIR / 'data' / 's1'
options = sorted(clip.name for clip in DATA_DIR.glob('*.mpg'))
if not options:
    st.error(f"No `.mpg` videos found in `{DATA_DIR}`. Make sure `data/s1/` is populated.")
    st.stop()
selected_video = st.selectbox("**Choose a video**", options)
file_path = DATA_DIR / selected_video
st.divider()
# ── Load model (cached) ───────────────────────────────────────────────────────
@st.cache_resource
def get_model():
    """Load the LipNet Keras model once per server process.

    The comment said "cached" but no cache decorator was present: Streamlit
    re-executes the whole script on every widget interaction, so the model
    weights were reloaded on each rerun. `st.cache_resource` memoises the
    returned model object across reruns (and sessions) without pickling it.
    """
    return load_model()

model = get_model()
# ── Load frames + alignment (cached per video) ────────────────────────────────
@st.cache_resource
def get_video_data(path: str):
    """Load preprocessed frames and alignment for one video, memoised per path.

    Returns ``(video_tensor, annotations, ground_truth)`` where
    ``ground_truth`` is the alignment decoded to a UTF-8 string. As with
    ``get_model``, the original code claimed caching but had no decorator.
    ``st.cache_resource`` (rather than ``st.cache_data``) is used because the
    return value contains TF tensors, which should be reused by reference
    instead of being serialised — it still keys the cache on ``path``.
    """
    video_tensor, annotations = load_data(tf.convert_to_tensor(path))
    ground_truth = tf.strings.reduce_join(
        num_to_char(annotations)
    ).numpy().decode('utf-8')
    return video_tensor, annotations, ground_truth

video_tensor, annotations, ground_truth = get_video_data(str(file_path))
# ── Two-column layout ─────────────────────────────────────────────────────────
col1, col2 = st.columns(2, gap="large")
# ── Column 1: Video preview + Ground truth ────────────────────────────────────
with col1:
    st.markdown("### πΉ Original Video")
    st.info("Video converted to mp4 for browser playback")
    # Transcode the .mpg to browser-playable H.264 mp4 in a temp file,
    # stream the bytes to st.video, then always clean the temp file up.
    output_path = None
    try:
        # Create (and close) the temp file first so ffmpeg writes to a
        # closed handle — required on Windows, harmless elsewhere.
        with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as f:
            output_path = Path(f.name)
        subprocess.run(
            ["ffmpeg", "-i", str(file_path), "-vcodec", "libx264",
             "-crf", "23", str(output_path), "-y"],
            check=True, capture_output=True, text=True,
        )
        st.video(output_path.read_bytes())
    except FileNotFoundError:
        # subprocess.run raises FileNotFoundError when the ffmpeg binary
        # itself is missing — show a hint instead of an uncaught traceback.
        st.error("`ffmpeg` executable not found. Install ffmpeg and make sure it is on your PATH.")
    except subprocess.CalledProcessError as exc:
        st.error("ffmpeg conversion failed.")
        st.code(exc.stderr or "No error output.")
    finally:
        if output_path and output_path.exists():
            output_path.unlink()
# ── Column 2: Model inference ─────────────────────────────────────────────────
with col2:
    st.markdown("### π§ Model Inference")
    # ── Mouth crop GIF: render the exact model input as an animation ──────────
    st.info("Mouth crop - what the model actually sees (grayscale Β· normalized)")
    gif_path = None
    try:
        with tempfile.NamedTemporaryFile(suffix=".gif", delete=False) as gf:
            gif_path = Path(gf.name)
        frames_np = video_tensor.numpy()
        gif_frames = []
        for frame in frames_np:
            gray = frame[:, :, 0]
            # Min-max normalise each frame to [0, 1]; the epsilon guards
            # against division by zero on a constant frame.
            gray = (gray - gray.min()) / max(gray.max() - gray.min(), 1e-8)
            # Replicate the single channel to RGB directly in NumPy — no need
            # for a TF round-trip just to stack arrays.
            rgb = (255 * np.stack([gray, gray, gray], axis=-1)).astype("uint8")
            gif_frames.append(rgb)
        imageio.mimsave(str(gif_path), gif_frames, fps=10, loop=0)
        st.image(str(gif_path), width=400)
    finally:
        if gif_path and gif_path.exists():
            gif_path.unlink()
    st.divider()
    # ── Raw tokens: greedy CTC decode of the model output ─────────────────────
    st.info("Raw CTC token indices from model output")
    yhat = model.predict(tf.expand_dims(video_tensor, axis=0), verbose=0)
    # Use the model output's actual time dimension instead of a hard-coded 75
    # so decoding stays correct if the clip/frame count ever changes.
    decoded = tf.keras.backend.ctc_decode(
        yhat, input_length=[yhat.shape[1]], greedy=True
    )[0][0].numpy()
    st.code(str(decoded[0].tolist()), language=None)
    # ── Ground truth (moved here) ─────────────────────────────────────────────
    st.divider()
    st.info("Ground truth label (from `.align` file)")
    st.code(ground_truth, language=None)
    st.divider()
    # ── Final prediction: token indices → characters → string ─────────────────
    prediction = tf.strings.reduce_join(
        num_to_char(decoded[0])
    ).numpy().decode('utf-8').strip()
    st.success(f"**Prediction:** {prediction}")
    # ── Confidence: mean over time of the max per-step softmax probability ────
    # (a rough proxy for certainty, not a calibrated confidence score).
    confidence = float(np.mean(np.max(yhat[0], axis=-1)) * 100)
    st.markdown(
        f"<p style='font-family:Space Mono,monospace;font-size:0.78rem;color:#4b5563;'>"
        f"AVG CONFIDENCE Β· <span style='color:#34d399'>{confidence:.1f}%</span></p>",
        unsafe_allow_html=True,
    )