Spaces:
Sleeping
Sleeping
Commit
·
982555b
0
Parent(s):
add files
Browse files- .gitattributes +3 -0
- .python-version +1 -0
- Dockerfile +0 -0
- README.md +0 -0
- data/bbal6n.align +8 -0
- data/bbal6n.mpg +3 -0
- data/bbwgzn.align +8 -0
- data/bbwgzn.mpg +3 -0
- data/bgan7a.align +8 -0
- data/bgan7a.mpg +3 -0
- data/brwg6n.align +8 -0
- data/brwg6n.mpg +3 -0
- data/pgad9s.align +8 -0
- data/pgad9s.mpg +3 -0
- data/swih7s.align +8 -0
- data/swih7s.mpg +3 -0
- data/swwv8p.align +8 -0
- data/swwv8p.mpg +3 -0
- frontend.py +257 -0
- main.py +6 -0
- model_utils/__init__.py +0 -0
- model_utils/inference.py +21 -0
- model_utils/lipnet_model_full_scripted.pt +3 -0
- model_utils/tokenizer.py +15 -0
- model_utils/utils.py +108 -0
- model_utils/vocabulary.py +4 -0
- pyproject.toml +14 -0
.gitattributes
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.mpg filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.mgp filter=lfs diff=lfs merge=lfs -text
|
.python-version
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
3.13
|
Dockerfile
ADDED
|
File without changes
|
README.md
ADDED
|
File without changes
|
data/bbal6n.align
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
0 16000 sil
|
| 2 |
+
16000 21250 bin
|
| 3 |
+
21250 26750 blue
|
| 4 |
+
26750 27750 at
|
| 5 |
+
27750 31500 l
|
| 6 |
+
31500 39250 six
|
| 7 |
+
39250 45250 now
|
| 8 |
+
45250 74500 sil
|
data/bbal6n.mpg
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0c7f36eaceda1bb64adc2877cfe3370104d9451059addc9035f7da55390e9510
|
| 3 |
+
size 428032
|
data/bbwgzn.align
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
0 17250 sil
|
| 2 |
+
17250 22750 bin
|
| 3 |
+
22750 28000 blue
|
| 4 |
+
28000 30250 with
|
| 5 |
+
30250 34250 g
|
| 6 |
+
34250 43500 zero
|
| 7 |
+
43500 49250 now
|
| 8 |
+
49250 74500 sil
|
data/bbwgzn.mpg
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:73bc132c76e16da1592fad16eb270532857b2100bf5cd30eff579c95bad38f8a
|
| 3 |
+
size 419840
|
data/bgan7a.align
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
0 14250 sil
|
| 2 |
+
14250 20000 bin
|
| 3 |
+
20000 24250 green
|
| 4 |
+
24250 27500 at
|
| 5 |
+
27500 32000 n
|
| 6 |
+
32000 40000 seven
|
| 7 |
+
40000 49250 again
|
| 8 |
+
49250 74500 sil
|
data/bgan7a.mpg
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:403233bcb9018dca8b47436da55793d3ced10f336d2dcfbcacef14825f5f38f4
|
| 3 |
+
size 409600
|
data/brwg6n.align
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
0 16500 sil
|
| 2 |
+
16500 22500 bin
|
| 3 |
+
22500 26500 red
|
| 4 |
+
26500 28500 with
|
| 5 |
+
28500 32250 g
|
| 6 |
+
32250 40000 six
|
| 7 |
+
40000 46000 now
|
| 8 |
+
46000 74500 sil
|
data/brwg6n.mpg
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6bc095f1b186a28cd81fd50ad1477ec071ee5f431dd429c4e046d02c6f7c6ada
|
| 3 |
+
size 413696
|
data/pgad9s.align
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
0 8250 sil
|
| 2 |
+
8250 14500 place
|
| 3 |
+
14500 19750 green
|
| 4 |
+
19750 22500 at
|
| 5 |
+
22500 25500 d
|
| 6 |
+
25500 32250 nine
|
| 7 |
+
32250 41750 soon
|
| 8 |
+
41750 74500 sil
|
data/pgad9s.mpg
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d20fbf469814d7cda9f14515dc631bbfb05041648f4d1cdcc038729f4db0cfe8
|
| 3 |
+
size 401408
|
data/swih7s.align
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
0 7750 sil
|
| 2 |
+
7750 16000 set
|
| 3 |
+
16000 23000 white
|
| 4 |
+
23000 25500 in
|
| 5 |
+
25500 29750 h
|
| 6 |
+
29750 37250 seven
|
| 7 |
+
37250 46500 soon
|
| 8 |
+
46500 74500 sil
|
data/swih7s.mpg
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2b42de3a3d28a8f3f99adb4c9307441d8619d872f54a008fbef003432c864a51
|
| 3 |
+
size 475136
|
data/swwv8p.align
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
0 10250 sil
|
| 2 |
+
10250 22000 set
|
| 3 |
+
22000 29750 white
|
| 4 |
+
29750 32500 with
|
| 5 |
+
32500 36000 v
|
| 6 |
+
36000 40250 eight
|
| 7 |
+
40250 51250 please
|
| 8 |
+
51250 74500 sil
|
data/swwv8p.mpg
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:06102ed813a4b6897dafda2ee49a39377edf9b2a70e8ba2d10600b6bed1a4f22
|
| 3 |
+
size 413696
|
frontend.py
ADDED
|
@@ -0,0 +1,257 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Streamlit front-end for MedLipReader: page configuration, styling, and static copy."""
import streamlit as st
import os
import imageio
from model_utils.utils import load_video_for_gif, load_data, load_alignments
from model_utils.inference import inference
import time
import uuid

# Page configuration
st.set_page_config(
    page_title="MedLipReader",
    page_icon="🏥",
    layout='wide',
    initial_sidebar_state="collapsed"
)

# Custom CSS for enhanced styling (injected once; class names are referenced
# by the HTML snippets rendered further down the page)
st.markdown("""
<style>
.main-header {
    font-size: 3rem;
    font-weight: bold;
    color: #2E86AB;
    text-align: center;
    margin-bottom: 0.5rem;
}

.subtitle {
    font-size: 1.2rem;
    color: #6C757D;
    text-align: center;
    margin-bottom: 2rem;
    font-style: italic;
}

.section-header {
    font-size: 1.5rem;
    font-weight: bold;
    color: #495057;
    margin-bottom: 1rem;
    padding: 0.5rem;
    border-left: 4px solid #2E86AB;
    background-color: #F8F9FA;
}

.info-card {
    background: linear-gradient(135deg, #1e3c72 0%, #2a5298 100%);
    padding: 1.5rem;
    border-radius: 10px;
    color: white;
    margin: 1rem 0;
    text-shadow: 1px 1px 2px rgba(0,0,0,0.5);
}

.success-card {
    background: linear-gradient(135deg, #2d5016 0%, #3e7b27 100%);
    padding: 1.5rem;
    border-radius: 10px;
    color: white;
    margin: 1rem 0;
    text-shadow: 1px 1px 2px rgba(0,0,0,0.5);
}

.feature-box {
    background: #F8F9FA;
    padding: 1rem;
    border-radius: 8px;
    border: 2px solid #DEE2E6;
    margin: 0.5rem 0;
}

.feature-box h4 {
    color: #2E86AB;
    margin-bottom: 0.5rem;
}

.feature-box p {
    color: #495057;
    margin: 0;
}

.metric-container {
    background: white;
    padding: 1rem;
    border-radius: 8px;
    box-shadow: 0 2px 4px rgba(0,0,0,0.1);
    text-align: center;
    margin: 0.5rem 0;
}

.stSelectbox > div > div > select {
    background-color: #F8F9FA;
    border: 2px solid #2E86AB;
    border-radius: 5px;
}
</style>
""", unsafe_allow_html=True)

# Header Section
st.markdown('<h1 class="main-header">🏥 MedLipReader</h1>', unsafe_allow_html=True)
st.markdown('<p class="subtitle">AI-Powered Lip Reading Technology for Healthcare Accessibility</p>', unsafe_allow_html=True)

# Information banner
st.markdown("""
<div class="info-card">
<h3>🎯 Empowering Communication for the Deaf and Hard-of-Hearing Community</h3>
<p>MedLipReader uses advanced AI to translate lip movements into text, enabling seamless communication in healthcare settings without requiring audio input.</p>
</div>
""", unsafe_allow_html=True)

# Key features section: three static feature cards rendered side by side
st.markdown('<div class="section-header">✨ Key Features</div>', unsafe_allow_html=True)

feature_col1, feature_col2, feature_col3 = st.columns(3)
with feature_col1:
    st.markdown("""
    <div class="feature-box">
    <h4>🔇 Silent Operation</h4>
    <p>Works entirely without audio input</p>
    </div>
    """, unsafe_allow_html=True)

with feature_col2:
    st.markdown("""
    <div class="feature-box">
    <h4>⚡ Real-time Processing</h4>
    <p>Instant lip-to-text conversion</p>
    </div>
    """, unsafe_allow_html=True)

with feature_col3:
    st.markdown("""
    <div class="feature-box">
    <h4>🏥 Healthcare Optimized</h4>
    <p>Designed for medical environments</p>
    </div>
    """, unsafe_allow_html=True)

st.markdown("---")
|
| 140 |
+
|
| 141 |
+
# Main application section: pick a sample video, render the mouth-crop GIF on
# the left, and show the model's decoded output on the right.
st.markdown('<div class="section-header">🎬 Video Analysis Demo</div>', unsafe_allow_html=True)

# Video selection: all .mpg clips bundled under data/
options = [x for x in os.listdir(os.path.join('data')) if x.endswith('.mpg')]
selected_video = st.selectbox('🎥 Choose a video for lip reading analysis:', options, help="Select a video file to analyze with MedLipReader")

# Main content columns
col1, col2 = st.columns([1, 1])

if options and selected_video:
    with col1:
        st.markdown("""
        <div style="background: #E3F2FD; padding: 1rem; border-radius: 8px; margin-bottom: 1rem;">
        <h4 style="color: #1565C0; margin: 0;">📹 Input Video Analysis</h4>
        <p style="margin: 0.5rem 0 0 0; color: #424242;">Visual lip movement data processed by AI model</p>
        </div>
        """, unsafe_allow_html=True)

        # Process video: build the preview GIF and load the reference transcript
        file_path = os.path.join('data', selected_video)

        with st.spinner('🔄 Loading and processing video...'):
            gif_frames = load_video_for_gif(file_path)
            alignment = load_alignments(file_path.replace('.mpg', '.align'))
            os.makedirs("tmp", exist_ok=True)
            # Fix: Streamlit reruns this script on every interaction and each
            # rerun wrote a fresh uuid-named GIF that was never deleted, so
            # tmp/ grew without bound. Purge stale animations first.
            for stale in os.listdir("tmp"):
                if stale.startswith("animation_") and stale.endswith(".gif"):
                    try:
                        os.remove(os.path.join("tmp", stale))
                    except OSError:
                        pass  # best effort; another session may still hold the file
            gif_filename = f"tmp/animation_{uuid.uuid4().hex}.gif"
            imageio.mimsave(gif_filename, gif_frames, fps=15, loop=0)

        # Display the animated mouth-crop sequence
        st.image(gif_filename, width=400, caption="Lip movement sequence being analyzed")

        st.markdown("""
        <div style="background: #E8F4FD; padding: 0.8rem; border-radius: 6px; margin: 1rem 0; border-left: 3px solid #2196F3;">
        <p style="margin: 0; color: #1565C0; font-size: 0.9rem; font-style: italic;">
        💡 <strong>Innovation:</strong> Unlike image-based systems, this model learns from temporal dependencies across frames, capturing the natural flow of speech articulation.
        </p>
        </div>
        """, unsafe_allow_html=True)

        # Ground truth section: the .align transcript for the selected clip
        st.markdown("**📝 Ground Truth (Reference Text):**")
        st.markdown(f"""
        <div style="background: #F5F5F5; padding: 1rem; border-radius: 5px; font-family: monospace; border-left: 3px solid #4CAF50; color: #333;">
        {alignment}
        </div>
        """, unsafe_allow_html=True)

    with col2:
        st.markdown("""
        <div style="background: #E8F5E8; padding: 1rem; border-radius: 8px; margin-bottom: 1rem;">
        <h4 style="color: #2E7D32; margin: 0;">🤖 AI Model Output</h4>
        <p style="margin: 0.5rem 0 0 0; color: #424242;">Real-time lip reading results</p>
        </div>
        """, unsafe_allow_html=True)

        # Run inference and time it for the metrics card below
        with st.spinner('🧠 AI model processing lip movements...'):
            start_time = time.time()
            result = inference(file_path)
            processing_time = time.time() - start_time

        # Processing metrics
        metric_col1, metric_col2 = st.columns(2)
        with metric_col1:
            st.markdown(f"""
            <div class="metric-container">
            <h3 style="color: #2E86AB; margin: 0;">{processing_time:.2f}s</h3>
            <p style="margin: 0; color: #666;">Processing Time</p>
            </div>
            """, unsafe_allow_html=True)

        with metric_col2:
            st.markdown(f"""
            <div class="metric-container">
            <h3 style="color: #2E86AB; margin: 0;">{len(result["tokens"][0])}</h3>
            <p style="margin: 0; color: #666;">Tokens Generated</p>
            </div>
            """, unsafe_allow_html=True)

        # Raw tokens output (CTC-collapsed id sequence for the first batch item)
        st.markdown("**🔤 Raw Model Tokens:**")
        st.markdown(f"""
        <div style="background: #FFF3E0; padding: 1rem; border-radius: 5px; font-family: monospace; border-left: 3px solid #FF9800; font-size: 0.9rem; color: #E65100;">
        {result["tokens"][0]}
        </div>
        """, unsafe_allow_html=True)

        # Final prediction (decoded text)
        st.markdown("**💬 Decoded Text Prediction:**")
        st.markdown(f"""
        <div style="background: #E8F5E8; padding: 1.5rem; border-radius: 8px; border-left: 4px solid #4CAF50; font-size: 1.1rem; font-weight: bold; color: #2E7D32;">
        "{result["prediction"][0]}"
        </div>
        """, unsafe_allow_html=True)
|
| 236 |
+
|
| 237 |
+
# Success message: re-check the selection so the banner only appears when an
# analysis actually ran above.
if options and selected_video:
    st.markdown("""
    <div class="success-card">
    <h3>✅ Analysis Complete!</h3>
    <p>MedLipReader has successfully converted lip movements to text. This technology can help bridge communication gaps in healthcare settings, ensuring better patient care for the deaf and hard-of-hearing community.</p>
    </div>
    """, unsafe_allow_html=True)

# Footer section
st.markdown("---")
st.markdown("""
<div style="text-align: center; padding: 2rem; background: #F8F9FA; border-radius: 10px; margin-top: 2rem;">
<h4 style="color: #495057; margin-bottom: 1rem;">🌟 Impact on Healthcare Accessibility</h4>
<p style="color: #6C757D; margin: 0;">
MedLipReader represents a step forward in making healthcare more inclusive and accessible.
By enabling silent communication, we're breaking down barriers and ensuring everyone receives the care they deserve.
</p>
</div>
""", unsafe_allow_html=True)
st.markdown('<br><br><p class="subtitle">Developed by Farhan Ali Khan | Empowering Accessible Healthcare Through AI</p>', unsafe_allow_html=True)
|
main.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
def main():
    """Entry point placeholder; prints a greeting and returns None."""
    greeting = "Hello from lip-reader-hf!"
    print(greeting)


if __name__ == "__main__":
    main()
|
model_utils/__init__.py
ADDED
|
File without changes
|
model_utils/inference.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Load the TorchScript LipNet model once at import time and expose `inference`."""
import torch

from .utils import ctc_greedy_decoder, load_data

# Fix: the original also did `from .model import MyModel`, but no model.py is
# part of this package, so importing this module raised ImportError before the
# (commented-out) state-dict loading path was ever reached. The TorchScript
# archive restores the model without needing the Python class definition.
model = torch.jit.load("model_utils/lipnet_model_full_scripted.pt", map_location=torch.device('cpu'))
model.eval()


def inference(path: str):
    """Run lip-reading inference on the video clip named by `path`.

    Returns a dict with:
      - "prediction": list of decoded strings (one per batch item)
      - "tokens": list of CTC-collapsed token-id sequences (one per batch item)
    """
    frames, _ = load_data(path)
    # Add a leading batch dimension; renamed from `input` to avoid shadowing
    # the builtin.
    batch = frames.unsqueeze(0).to('cpu')
    with torch.no_grad():
        output = model(batch)
    result, tokens = ctc_greedy_decoder(output)
    return {"prediction": result, "tokens": tokens}
|
model_utils/lipnet_model_full_scripted.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:52e4097a11847f710937c7e3f00fa773096b1394045c4076ae990399dde53271
|
| 3 |
+
size 33926246
|
model_utils/tokenizer.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .vocabulary import vocabulary

class Tokenizer:
    """Bidirectional mapping between characters and integer ids.

    Built on the project `vocabulary` dict by default; any mapping of
    token -> id may be supplied instead.
    """
    def __init__(self, vocab=vocabulary):
        self.str_to_int = vocab
        # Inverse mapping for decoding (renamed loop vars so `id` no longer
        # shadows the builtin).
        self.int_to_str = {idx: token for token, idx in vocab.items()}

    def encode(self, text):
        """Encode `text` into a list of ids; unknown characters map to <unk>.

        Fix: the original substituted "<|unk|>", which is not a key of the
        vocabulary ("<unk>" is — see vocabulary.py), so any out-of-vocabulary
        character raised KeyError instead of being encoded as unknown.
        """
        text_tokens = [token if token in self.str_to_int else "<unk>" for token in text]
        ids = [self.str_to_int[token] for token in text_tokens]
        return ids

    def decode(self, ids):
        """Decode a list of ids back into a string."""
        text = "".join([self.int_to_str[idx] for idx in ids])
        return text
|
model_utils/utils.py
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import cv2
import torch
import torchvision.transforms.functional as F
import os
from .tokenizer import Tokenizer
from .vocabulary import vocabulary
import numpy as np

# Shared tokenizer instance; ctc_greedy_decoder below uses its id->char map
# as the default decoding vocabulary.
tokenizer = Tokenizer()
|
| 10 |
+
|
| 11 |
+
def load_video_for_gif(path):
    """Read the video at `path` and return RGB mouth-crop frames for a GIF.

    Returns a list of uint8 numpy arrays, each of shape [H, W, 3].
    """
    capture = cv2.VideoCapture(path)
    total_frames = int(capture.get(cv2.CAP_PROP_FRAME_COUNT))
    cropped = []
    for _ in range(total_frames):
        ok, image = capture.read()
        if not ok:
            break
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        # Fixed mouth region crop, [H, W, C]
        cropped.append(image[190:236, 80:220, :])
    capture.release()

    # Coerce to uint8 numpy arrays so imageio renders them correctly
    return [np.array(frame).astype(np.uint8) for frame in cropped]
|
| 29 |
+
|
| 30 |
+
def load_video(path):
    """Read `path`, crop the mouth region, convert to grayscale, and normalize.

    Returns a float tensor of shape [T, 1, 46, 140], normalized per video.
    """
    cap = cv2.VideoCapture(path)
    frames = []
    for _ in range(int(cap.get(cv2.CAP_PROP_FRAME_COUNT))):
        ret, frame = cap.read()
        if not ret:
            # Fix: a failed read returns (False, None); without this guard
            # cv2.cvtColor crashed on the None frame (load_video_for_gif
            # already had the same check).
            break
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        # Convert to torch tensor [H, W, C] → [C, H, W]
        frame = torch.from_numpy(frame).permute(2, 0, 1).float()

        # Grayscale, then crop the mouth region (channel dim kept)
        frame = F.rgb_to_grayscale(frame)
        frame = frame[:, 190:236, 80:220]
        frames.append(frame)
    cap.release()

    frames = torch.stack(frames)  # Shape: [T, 1, H, W]

    # Normalize (per video)
    mean = frames.mean()
    std = frames.std()
    frames = (frames - mean) / (std + 1e-8)

    return frames  # Shape: [T, 1, 46, 140]
|
| 55 |
+
|
| 56 |
+
def load_alignments(path: str):
    """Parse a GRID-style .align file and return the spoken words as a string.

    Each line has the form "start end word"; silence markers ("sil") are
    dropped and the remaining words are joined with single spaces.
    """
    with open(path, 'r') as handle:
        entries = [line.split() for line in handle]

    spoken = [fields[2] for fields in entries if fields[2] != "sil"]
    return " ".join(spoken)
|
| 75 |
+
|
| 76 |
+
def load_data(path: str):
    """Load (frames, alignment_text) for the clip named by `path`.

    Only the basename of `path` is used; the matching .mpg and .align files
    are looked up under the local "data" directory.
    """
    # Fix: the original split on a literal backslash ('\\'), which only works
    # for Windows paths. On POSIX (e.g. the deployment host) the basename was
    # never extracted, producing paths like data/data/x.mpg.mpg.
    file_name = os.path.splitext(os.path.basename(path))[0]
    video_path = os.path.join('data', f'{file_name}.mpg')
    alignment_path = os.path.join('data', f'{file_name}.align')
    frames = load_video(video_path)
    alignments = load_alignments(alignment_path)

    return frames, alignments
|
| 87 |
+
|
| 88 |
+
def ctc_greedy_decoder(logits_batch, vocab = tokenizer.int_to_str, blank_id=0):
    """Greedy CTC decoding: per-step argmax, collapse repeats, drop blanks.

    logits_batch: tensor of shape (batch, time, vocab_size).
    Returns (texts, token_lists): decoded strings and the kept token ids,
    one entry per batch item.
    """
    # Argmax gives the best class index per time step (renamed from the
    # misleading `probabilities`)
    best_ids = torch.argmax(logits_batch, dim=-1)  # (batch, time)

    texts = []
    token_lists = []

    for sequence in best_ids:  # iterate over batch
        kept_ids = []
        kept_chars = []
        previous = None
        for current in sequence.tolist():
            # CTC rule: ignore blanks + collapse consecutive repeats
            if current != blank_id and current != previous:
                kept_ids.append(current)
                kept_chars.append(vocab[current])
            previous = current
        texts.append("".join(kept_chars))
        token_lists.append(kept_ids)

    return texts, token_lists
|
model_utils/vocabulary.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Character inventory for GRID-corpus transcripts (letters, a few symbols,
# digits 1-9, and space).
all_chars = "abcdefghijklmnopqrstuvwxyz'?!123456789 "

# Characters get ids 1..len(all_chars); id 0 is reserved for the CTC blank,
# and the unknown token takes the next free id.
vocabulary = {char: position for position, char in enumerate(all_chars, start=1)}
vocabulary["<blank>"] = 0
vocabulary["<unk>"] = len(vocabulary)
|
pyproject.toml
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[project]
|
| 2 |
+
name = "lip-reader-hf"
|
| 3 |
+
version = "0.1.0"
|
| 4 |
+
description = "Streamlit demo for LipNet-style lip reading on GRID corpus clips"
|
| 5 |
+
readme = "README.md"
|
| 6 |
+
requires-python = ">=3.13"
|
| 7 |
+
dependencies = [
|
| 8 |
+
"imageio[ffmpeg]>=2.37.0",
|
| 9 |
+
"moviepy>=2.2.1",
|
| 10 |
+
"opencv-python>=4.12.0.88",
|
| 11 |
+
"streamlit>=1.48.1",
|
| 12 |
+
"torch>=2.8.0",
|
| 13 |
+
"torchvision>=0.23.0",
|
| 14 |
+
]
|