Spaces:

omm7
/

lip_reader

Sleeping

App Files Files Community

omm7 committed on Mar 20

Commit

ab5dbbd

verified ·

1 Parent(s): 5428408

Upload app/app.py with huggingface_hub

Browse files

Files changed (1) hide show

app/app.py +19 -30

app/app.py CHANGED Viewed

@@ -3,6 +3,7 @@ from pathlib import Path
 import subprocess
 import tempfile
 import imageio
 import streamlit as st
 import tensorflow as tf
 from modelutil import load_model
@@ -19,22 +20,17 @@ st.set_page_config(
 st.markdown("""
 <style>
 @import url('https://fonts.googleapis.com/css2?family=Syne:wght@400;700;800&family=Space+Mono&display=swap');
 html, body, [class*="css"] {
     font-family: 'Syne', sans-serif;
     background-color: #07070f;
     color: #e2e2f0;
 }
 .stApp { background-color: #07070f; }
-/* Sidebar */
 [data-testid="stSidebar"] {
     background-color: #0f0f1c !important;
     border-right: 1px solid #1e1e32;
 }
 [data-testid="stSidebar"] * { color: #9ca3af !important; }
-/* Headers */
 h1 {
     font-weight: 800 !important;
     background: linear-gradient(135deg, #f0f0ff, #c084fc, #818cf8);
@@ -43,8 +39,6 @@ h1 {
     letter-spacing: -0.03em;
 }
 h2, h3 { color: #c084fc !important; font-weight: 700 !important; }
-/* Info / success boxes */
 .stAlert { border-radius: 10px !important; }
 [data-testid="stInfo"] {
     background: #0f0f1c !important;
@@ -60,8 +54,6 @@ h2, h3 { color: #c084fc !important; font-weight: 700 !important; }
     font-family: 'Space Mono', monospace;
     font-size: 1.1rem;
 }
-/* Code / preformatted */
 code, pre {
     font-family: 'Space Mono', monospace !important;
     background: #0a0a16 !important;
@@ -69,11 +61,7 @@ code, pre {
     border-radius: 8px !important;
     font-size: 0.8rem !important;
 }
-/* Selectbox */
 [data-testid="stSelectbox"] label { color: #6b7280 !important; font-size: 0.8rem; letter-spacing: 0.1em; text-transform: uppercase; }
-/* Divider */
 hr { border-color: #1a1a2e !important; }
 </style>
 """, unsafe_allow_html=True)
@@ -130,7 +118,6 @@ if not options:
 selected_video = st.selectbox("**Choose a video**", options)
 file_path = DATA_DIR / selected_video
 st.divider()
 # ── Load model (cached) ───────────────────────────────────────────────────────
@@ -140,10 +127,21 @@ def get_model():
 model = get_model()
 # ── Two-column layout ─────────────────────────────────────────────────────────
 col1, col2 = st.columns(2, gap="large")
-# ── Column 1: Video preview ───────────────────────────────────────────────────
 with col1:
     st.markdown("### 📹 Original Video")
     st.info("Video converted to mp4 for browser playback")
@@ -165,13 +163,15 @@ with col1:
         if output_path and output_path.exists():
             output_path.unlink()
 # ── Column 2: Model inference ─────────────────────────────────────────────────
 with col2:
     st.markdown("### 🧠 Model Inference")
-    # Load frames + alignment
-    video_tensor, annotations = load_data(tf.convert_to_tensor(str(file_path)))
     # ── Mouth crop GIF ────────────────────────────────────────────────────────
     st.info("Mouth crop - what the model actually sees (grayscale · normalized)")
     gif_path = None
@@ -193,15 +193,6 @@ with col2:
     st.divider()
-    # ── Ground truth ──────────────────────────────────────────────────────────
-    st.info("Ground truth label (from `.align` file)")
-    ground_truth = tf.strings.reduce_join(
-        num_to_char(annotations)
-    ).numpy().decode('utf-8')
-    st.code(ground_truth, language=None)
-    st.divider()
     # ── Raw tokens ────────────────────────────────────────────────────────────
     st.info("Raw CTC token indices from model output")
     yhat = model.predict(tf.expand_dims(video_tensor, axis=0), verbose=0)
@@ -214,14 +205,12 @@ with col2:
     prediction = tf.strings.reduce_join(
         num_to_char(decoded[0])
     ).numpy().decode('utf-8').strip()
     st.success(f"**Prediction:** {prediction}")
     # ── Confidence ────────────────────────────────────────────────────────────
-    import numpy as np
     confidence = float(np.mean(np.max(yhat[0], axis=-1)) * 100)
     st.markdown(
         f"<p style='font-family:Space Mono,monospace;font-size:0.78rem;color:#4b5563;'>"
         f"AVG CONFIDENCE · <span style='color:#34d399'>{confidence:.1f}%</span></p>",
         unsafe_allow_html=True,
-    )

 import subprocess
 import tempfile
 import imageio
+import numpy as np
 import streamlit as st
 import tensorflow as tf
 from modelutil import load_model
 st.markdown("""
 <style>
 @import url('https://fonts.googleapis.com/css2?family=Syne:wght@400;700;800&family=Space+Mono&display=swap');
 html, body, [class*="css"] {
     font-family: 'Syne', sans-serif;
     background-color: #07070f;
     color: #e2e2f0;
 }
 .stApp { background-color: #07070f; }
 [data-testid="stSidebar"] {
     background-color: #0f0f1c !important;
     border-right: 1px solid #1e1e32;
 }
 [data-testid="stSidebar"] * { color: #9ca3af !important; }
 h1 {
     font-weight: 800 !important;
     background: linear-gradient(135deg, #f0f0ff, #c084fc, #818cf8);
     letter-spacing: -0.03em;
 }
 h2, h3 { color: #c084fc !important; font-weight: 700 !important; }
 .stAlert { border-radius: 10px !important; }
 [data-testid="stInfo"] {
     background: #0f0f1c !important;
     font-family: 'Space Mono', monospace;
     font-size: 1.1rem;
 }
 code, pre {
     font-family: 'Space Mono', monospace !important;
     background: #0a0a16 !important;
     border-radius: 8px !important;
     font-size: 0.8rem !important;
 }
 [data-testid="stSelectbox"] label { color: #6b7280 !important; font-size: 0.8rem; letter-spacing: 0.1em; text-transform: uppercase; }
 hr { border-color: #1a1a2e !important; }
 </style>
 """, unsafe_allow_html=True)
 selected_video = st.selectbox("**Choose a video**", options)
 file_path = DATA_DIR / selected_video
 st.divider()
 # ── Load model (cached) ───────────────────────────────────────────────────────
 model = get_model()
+# ── Load frames + alignment (cached per video) ────────────────────────────────
+@st.cache_data(show_spinner="Processing video...")
+def get_video_data(path: str):
+    video_tensor, annotations = load_data(tf.convert_to_tensor(path))
+    ground_truth = tf.strings.reduce_join(
+        num_to_char(annotations)
+    ).numpy().decode('utf-8')
+    return video_tensor, annotations, ground_truth
+video_tensor, annotations, ground_truth = get_video_data(str(file_path))
 # ── Two-column layout ─────────────────────────────────────────────────────────
 col1, col2 = st.columns(2, gap="large")
+# ── Column 1: Video preview + Ground truth ────────────────────────────────────
 with col1:
     st.markdown("### 📹 Original Video")
     st.info("Video converted to mp4 for browser playback")
         if output_path and output_path.exists():
             output_path.unlink()
+    # ── Ground truth (moved here) ─────────────────────────────────────────────
+    st.divider()
+    st.info("Ground truth label (from `.align` file)")
+    st.code(ground_truth, language=None)
 # ── Column 2: Model inference ─────────────────────────────────────────────────
 with col2:
     st.markdown("### 🧠 Model Inference")
     # ── Mouth crop GIF ────────────────────────────────────────────────────────
     st.info("Mouth crop - what the model actually sees (grayscale · normalized)")
     gif_path = None
     st.divider()
     # ── Raw tokens ────────────────────────────────────────────────────────────
     st.info("Raw CTC token indices from model output")
     yhat = model.predict(tf.expand_dims(video_tensor, axis=0), verbose=0)
     prediction = tf.strings.reduce_join(
         num_to_char(decoded[0])
     ).numpy().decode('utf-8').strip()
     st.success(f"**Prediction:** {prediction}")
     # ── Confidence ────────────────────────────────────────────────────────────
     confidence = float(np.mean(np.max(yhat[0], axis=-1)) * 100)
     st.markdown(
         f"<p style='font-family:Space Mono,monospace;font-size:0.78rem;color:#4b5563;'>"
         f"AVG CONFIDENCE · <span style='color:#34d399'>{confidence:.1f}%</span></p>",
         unsafe_allow_html=True,
+    )