File size: 4,567 Bytes
6d5d850
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
import logging
import time
from pathlib import Path
from typing import Any, Optional

import gradio as gr

from . import config
from .model import LipReadingModel, predict_from_video
from .preprocessing import VideoPreprocessor

# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)


def _resolve_video_path(video_input: Any) -> Optional[Path]:
    """
    Normalize a Gradio Video component value to a path of an existing file.

    Gradio's Video component can return a filepath or a dict depending on
    version. For a dict, the "name", "path" and "data" keys are probed in
    that order and the first one pointing at an existing file wins.

    Args:
        video_input: The raw value handed over by the Video component
            (a ``str``/``Path``, a ``dict``, or anything falsy).

    Returns:
        A ``Path`` to an existing regular file, or ``None`` when the input is
        empty, of an unsupported type, or does not reference an existing file.
    """
    if not video_input:
        return None

    if isinstance(video_input, (str, Path)):
        candidates = [video_input]
    elif isinstance(video_input, dict):
        candidates = [video_input.get(key) for key in ("name", "path", "data")]
    else:
        return None

    for value in candidates:
        if not value:
            continue
        try:
            candidate = Path(value)
            if candidate.is_file():
                return candidate
        except (TypeError, ValueError, OSError):
            # The value is not usable as a filesystem path: either it is not
            # str/PathLike (TypeError) or the OS rejects it outright, e.g. an
            # over-long inline payload stored under "data". Skip it and keep
            # probing instead of crashing the request handler.
            continue
    return None


def run_prediction(
    video_input: Any,
    model: LipReadingModel,
    preprocessor: VideoPreprocessor,
) -> str:
    """
    Run lip-reading inference on a video handed over by Gradio.

    The input is resolved to a file on disk, checked against
    ``config.MAX_VIDEO_SIZE_MB`` and then passed to ``predict_from_video``.
    Failures are never raised to the caller; they are logged and reported
    as a human-readable message so the UI always has text to display.

    Args:
        video_input: Raw value from the Gradio Video component.
        model: Model instance forwarded to ``predict_from_video``.
        preprocessor: Preprocessor instance forwarded to ``predict_from_video``.

    Returns:
        The prediction as a string, or a message describing why no
        prediction could be produced.
    """
    video_path = _resolve_video_path(video_input)
    if not video_path:
        return "No video provided. Please upload or record a video."

    try:
        video_size_mb = video_path.stat().st_size / (1024 * 1024)
    except OSError as exc:
        # The file may have vanished or become unreadable since it was resolved.
        logger.exception("Error accessing video file: %s", exc)
        return f"Error accessing video file: {exc}"
    logger.info("Uploaded video size: %.2f MB", video_size_mb)

    if video_size_mb > config.MAX_VIDEO_SIZE_MB:
        return f"Video size exceeds {config.MAX_VIDEO_SIZE_MB} MB limit. Please upload a smaller video."

    logger.info("Running prediction for %s", video_path)
    # perf_counter is monotonic, so the elapsed time cannot be skewed by
    # system clock adjustments the way time.time() differences can.
    start_time = time.perf_counter()
    try:
        prediction = predict_from_video(
            video_path=str(video_path),
            model=model,
            preprocessor=preprocessor,
        )
    except Exception as exc:
        # UI boundary: report the failure as text, but keep the traceback
        # in the log so the root cause stays diagnosable.
        logger.exception("Prediction error: %s", exc)
        return f"An error occurred during prediction: {exc}"

    total_time = time.perf_counter() - start_time
    logger.info("Prediction completed in %.2f seconds.", total_time)
    logger.info("Prediction result: %s", prediction)

    return prediction if isinstance(prediction, str) else str(prediction)


def create_app(model: LipReadingModel, preprocessor: VideoPreprocessor) -> gr.Blocks:
    """
    Build the Gradio Blocks interface for the lip-reading app.

    The layout has an "Upload Video" tab and a "Record Video" tab, each with
    its own video component, button and read-only result textbox, followed by
    a collapsed "How to use" accordion and a footer. Each button forwards its
    tab's video to ``run_prediction`` together with the given ``model`` and
    ``preprocessor``. Queuing is enabled with ``max_size=4`` before the app
    is returned.
    """
    # Both result boxes are configured identically.
    result_box_options = {
        "label": "Predicted text",
        "interactive": False,
        "lines": 4,
        "placeholder": "Prediction will appear here.",
    }
    usage_notes = """
**Upload video**
- Select an MP4/AVI/MOV/MPG video that clearly shows the speaker's lips.
- Click "Run prediction" to get the transcription.

**Record video**
- Allow webcam access if prompted.
- Record, wait for the preview to appear, then click "Run prediction on recording".
"""

    with gr.Blocks(title="Lip Reading App") as app:
        gr.Markdown("# Lip Reading App")
        gr.Markdown(
            "Upload a short video or record with your webcam to generate a lip-reading transcription."
        )

        with gr.Tab("Upload Video"):
            upload_video = gr.Video(
                label="Upload your video", sources=["upload"], format="mp4"
            )
            upload_button = gr.Button("Run prediction")
            upload_result = gr.Textbox(**result_box_options)

        with gr.Tab("Record Video"):
            webcam_video = gr.Video(
                label="Record with webcam", sources=["webcam"], format="mp4"
            )
            webcam_button = gr.Button("Run prediction on recording")
            webcam_result = gr.Textbox(**result_box_options)

        with gr.Accordion("How to use", open=False):
            gr.Markdown(usage_notes)

        # Wire each button to its own tab's video and result box.
        wiring = (
            (upload_button, upload_video, upload_result),
            (webcam_button, webcam_video, webcam_result),
        )
        for button, source, sink in wiring:
            button.click(
                fn=lambda video: run_prediction(video, model, preprocessor),
                inputs=source,
                outputs=sink,
            )

        gr.Markdown("---\n(c) 2024 Lip Reading App.")

    app.queue(max_size=4)
    return app