import logging
import time
from pathlib import Path
from typing import Any, Optional

import gradio as gr

from . import config
from .model import LipReadingModel, predict_from_video
from .preprocessing import VideoPreprocessor

logger = logging.getLogger(__name__)


def _existing_file(value: Any) -> Optional[Path]:
    """Return ``Path(value)`` if it names an existing regular file, else None.

    Values that cannot be interpreted as a filesystem path (wrong type,
    embedded NUL, a name too long for the OS, ...) are treated as "not a
    file" instead of raising.
    """
    try:
        candidate = Path(value)
        return candidate if candidate.is_file() else None
    except (OSError, TypeError, ValueError):
        # E.g. an inline data payload stored under the "data" key: it is not
        # a path at all, and probing it may fail at the OS level.
        return None


def _resolve_video_path(video_input: Any) -> Optional[Path]:
    """
    Gradio's Video component can return a filepath or a dict depending on
    version. Normalize to a Path if possible.

    Returns the path of an existing file, or None when the input is empty,
    of an unsupported type, or does not point at an existing file.
    """
    if not video_input:
        return None

    if isinstance(video_input, (str, Path)):
        return _existing_file(video_input)

    if isinstance(video_input, dict):
        # Probe the known keys in order; the first one that names an
        # existing file wins.
        for key in ("name", "path", "data"):
            value = video_input.get(key)
            if not value:
                continue
            candidate = _existing_file(value)
            if candidate is not None:
                return candidate

    return None


def run_prediction(
    video_input: Any,
    model: LipReadingModel,
    preprocessor: VideoPreprocessor,
) -> str:
    """
    Takes a video input from Gradio, processes it, and returns the predicted
    text. Includes validation and error handling.

    Args:
        video_input: Value delivered by a ``gr.Video`` component (a filepath
            or a dict, see ``_resolve_video_path``).
        model: Model forwarded to ``predict_from_video``.
        preprocessor: Preprocessor forwarded to ``predict_from_video``.

    Returns:
        The prediction as a string, or a human-readable message when no
        video was provided, the file cannot be accessed, the file exceeds
        ``config.MAX_VIDEO_SIZE_MB``, or prediction raised an exception.
        This function does not raise for those cases; errors are reported
        through the returned text so the UI can display them.
    """
    video_path = _resolve_video_path(video_input)
    if not video_path:
        return "No video provided. Please upload or record a video."

    try:
        video_size_mb = video_path.stat().st_size / (1024 * 1024)
    except OSError as exc:
        # The file may have disappeared between resolution and stat().
        logger.exception("Error accessing video file: %s", exc)
        return f"Error accessing video file: {exc}"
    logger.info("Uploaded video size: %.2f MB", video_size_mb)

    if video_size_mb > config.MAX_VIDEO_SIZE_MB:
        return (
            f"Video size exceeds {config.MAX_VIDEO_SIZE_MB} MB limit. "
            "Please upload a smaller video."
        )

    logger.info("Running prediction for %s", video_path)
    # perf_counter is monotonic, so the measured duration cannot be skewed
    # by system clock adjustments.
    start_time = time.perf_counter()
    try:
        prediction = predict_from_video(
            video_path=str(video_path),
            model=model,
            preprocessor=preprocessor,
        )
    except Exception as exc:  # UI boundary: report the failure, don't crash.
        # logger.exception keeps the traceback for diagnosis.
        logger.exception("Prediction error: %s", exc)
        return f"An error occurred during prediction: {exc}"

    total_time = time.perf_counter() - start_time
    logger.info("Prediction completed in %.2f seconds.", total_time)
    logger.info("Prediction result: %s", prediction)

    return prediction if isinstance(prediction, str) else str(prediction)


def create_app(model: LipReadingModel, preprocessor: VideoPreprocessor) -> gr.Blocks:
    """Build the Gradio Blocks UI for the lip-reading app.

    The UI has an upload tab and a webcam-recording tab. Each tab has its own
    button and output textbox, and both buttons run ``run_prediction`` with
    the given ``model`` and ``preprocessor``.

    Args:
        model: Model used for every prediction request.
        preprocessor: Preprocessor used for every prediction request.

    Returns:
        The configured ``gr.Blocks`` instance with its queue enabled
        (``max_size=4``). The caller is responsible for launching it.
    """

    def _predict(video: Any) -> str:
        # Single handler shared by both tabs; binds model and preprocessor.
        return run_prediction(video, model, preprocessor)

    with gr.Blocks(title="Lip Reading App") as demo:
        gr.Markdown("# Lip Reading App")
        gr.Markdown(
            "Upload a short video or record with your webcam to generate a lip-reading transcription."
        )

        with gr.Tab("Upload Video"):
            video_input = gr.Video(
                label="Upload your video",
                sources=["upload"],
                format="mp4",
            )
            predict_button = gr.Button("Run prediction")
            prediction_output = gr.Textbox(
                label="Predicted text",
                interactive=False,
                lines=4,
                placeholder="Prediction will appear here.",
            )

        with gr.Tab("Record Video"):
            video_recorder = gr.Video(
                label="Record with webcam",
                sources=["webcam"],
                format="mp4",
            )
            predict_button_rec = gr.Button("Run prediction on recording")
            prediction_output_rec = gr.Textbox(
                label="Predicted text",
                interactive=False,
                lines=4,
                placeholder="Prediction will appear here.",
            )

        with gr.Accordion("How to use", open=False):
            gr.Markdown(
                """
                **Upload video**
                - Select an MP4/AVI/MOV/MPG video that clearly shows the speaker's lips.
                - Click "Run prediction" to get the transcription.

                **Record video**
                - Allow webcam access if prompted.
                - Record, wait for the preview to appear, then click "Run prediction on recording".
                """
            )

        predict_button.click(
            fn=_predict,
            inputs=video_input,
            outputs=prediction_output,
        )
        predict_button_rec.click(
            fn=_predict,
            inputs=video_recorder,
            outputs=prediction_output_rec,
        )

        gr.Markdown("---\n(c) 2024 Lip Reading App.")

    demo.queue(max_size=4)
    return demo