LipNet / lipnet /ui.py
thienphuc12339's picture
Upload 10 files
6d5d850 verified
import logging
import time
from pathlib import Path
from typing import Any, Optional
import gradio as gr
from . import config
from .model import LipReadingModel, predict_from_video
from .preprocessing import VideoPreprocessor
logger = logging.getLogger(__name__)
def _existing_file(value: Any) -> Optional[Path]:
    """Return *value* as a ``Path`` if it names an existing regular file, else ``None``."""
    if not value:
        return None
    try:
        candidate = Path(value)
        return candidate if candidate.is_file() else None
    except (TypeError, ValueError, OSError):
        # TypeError: value is not path-like (e.g. an int or a nested dict).
        # OSError/ValueError: the string cannot be a real path (for example an
        # inline data payload that is far too long to be a file name).
        return None


def _resolve_video_path(video_input: Any) -> Optional[Path]:
    """
    Normalize the value produced by Gradio's Video component to a ``Path``.

    Gradio's Video component can return a filepath or a dict depending on
    version. A str/Path input is returned as a ``Path`` when it points to an
    existing file. For a dict, the "name", "path" and "data" keys are tried in
    that order and the first one that points to an existing file wins.

    Returns ``None`` for empty input, unsupported input types, or when no
    candidate refers to an existing file. Candidates that cannot be interpreted
    as a filesystem path are skipped rather than raising.
    """
    if not video_input:
        return None

    if isinstance(video_input, (str, Path)):
        return _existing_file(video_input)

    if isinstance(video_input, dict):
        for key in ("name", "path", "data"):
            candidate = _existing_file(video_input.get(key))
            if candidate is not None:
                return candidate

    return None
def run_prediction(
    video_input: Any,
    model: LipReadingModel,
    preprocessor: VideoPreprocessor,
) -> str:
    """
    Take a video input from Gradio, run the model on it, and return text for the UI.

    The input is resolved to a file path, its size is checked against
    ``config.MAX_VIDEO_SIZE_MB``, and the path is handed to
    ``predict_from_video`` together with ``model`` and ``preprocessor``.

    Failures are never raised to the caller: a missing video, an unreadable
    file, an oversized file, or an exception during prediction each produce a
    human-readable message instead. Exceptions are logged with their traceback.

    Returns:
        The prediction (converted to ``str`` if it is not one already) or an
        error/validation message.
    """
    video_path = _resolve_video_path(video_input)
    if video_path is None:
        return "No video provided. Please upload or record a video."

    # Only the stat() call can fail here, so keep the try body that small.
    try:
        video_size_mb = video_path.stat().st_size / (1024 * 1024)
    except OSError as exc:
        logger.exception("Error accessing video file: %s", exc)
        return f"Error accessing video file: {exc}"
    logger.info("Uploaded video size: %.2f MB", video_size_mb)

    if video_size_mb > config.MAX_VIDEO_SIZE_MB:
        return f"Video size exceeds {config.MAX_VIDEO_SIZE_MB} MB limit. Please upload a smaller video."

    # UI boundary: any failure in the model pipeline is reported as text
    # instead of propagating into Gradio.
    try:
        logger.info("Running prediction for %s", video_path)
        # perf_counter is monotonic, so the elapsed time cannot be skewed by
        # system clock adjustments the way time.time() can.
        start_time = time.perf_counter()
        prediction = predict_from_video(
            video_path=str(video_path),
            model=model,
            preprocessor=preprocessor,
        )
        total_time = time.perf_counter() - start_time
        logger.info("Prediction completed in %.2f seconds.", total_time)
        logger.info("Prediction result: %s", prediction)
    except Exception as exc:
        # logger.exception records the traceback along with the message.
        logger.exception("Prediction error: %s", exc)
        prediction = f"An error occurred during prediction: {exc}"

    return prediction if isinstance(prediction, str) else str(prediction)
def create_app(model: LipReadingModel, preprocessor: VideoPreprocessor) -> gr.Blocks:
    """
    Build the Gradio Blocks interface for the lip-reading app.

    The interface has an "Upload Video" tab and a "Record Video" tab, each
    holding a video component, a button and a read-only textbox, plus a
    collapsed "How to use" accordion. Clicking either button passes that tab's
    video to ``run_prediction`` along with ``model`` and ``preprocessor`` and
    writes the result into that tab's textbox.

    Returns:
        The Blocks app with its queue configured (``max_size=4``). The app is
        not launched here.
    """
    # Kept flush-left so the Markdown text carries no leading indentation.
    usage_notes = """
**Upload video**
- Select an MP4/AVI/MOV/MPG video that clearly shows the speaker's lips.
- Click "Run prediction" to get the transcription.
**Record video**
- Allow webcam access if prompted.
- Record, wait for the preview to appear, then click "Run prediction on recording".
"""

    # Both tabs show their result in an identically configured textbox.
    result_box_options = {
        "label": "Predicted text",
        "interactive": False,
        "lines": 4,
        "placeholder": "Prediction will appear here.",
    }

    with gr.Blocks(title="Lip Reading App") as demo:
        gr.Markdown("# Lip Reading App")
        gr.Markdown(
            "Upload a short video or record with your webcam to generate a lip-reading transcription."
        )

        with gr.Tab("Upload Video"):
            uploaded_clip = gr.Video(
                label="Upload your video",
                sources=["upload"],
                format="mp4",
            )
            upload_button = gr.Button("Run prediction")
            upload_result = gr.Textbox(**result_box_options)

        with gr.Tab("Record Video"):
            recorded_clip = gr.Video(
                label="Record with webcam",
                sources=["webcam"],
                format="mp4",
            )
            record_button = gr.Button("Run prediction on recording")
            record_result = gr.Textbox(**result_box_options)

        # NOTE(review): placed at the Blocks level, below the tabs — confirm
        # against the intended layout.
        with gr.Accordion("How to use", open=False):
            gr.Markdown(usage_notes)

        # Wire each button to its own video source and result box. The handler
        # only closes over model/preprocessor, not over the loop variables.
        wiring = (
            (upload_button, uploaded_clip, upload_result),
            (record_button, recorded_clip, record_result),
        )
        for trigger, clip, result_box in wiring:
            trigger.click(
                fn=lambda video: run_prediction(video, model, preprocessor),
                inputs=clip,
                outputs=result_box,
            )

        gr.Markdown("---\n(c) 2024 Lip Reading App.")

    demo.queue(max_size=4)
    return demo