import gradio as gr import os import subprocess import torch import torchaudio from speechbrain.pretrained import EncoderClassifier import yt_dlp import tempfile def download_video(url, out_path): """Download a video from YouTube or direct MP4 link.""" try: if "youtube.com" in url or "youtu.be" in url: ydl_opts = {'outtmpl': out_path} with yt_dlp.YoutubeDL(ydl_opts) as ydl: ydl.download([url]) else: os.system(f"wget -O {out_path} {url}") return out_path except Exception as e: return f"ERROR: Video download failed: {str(e)}" def extract_audio(video_path, audio_path): """Extract audio from video file using ffmpeg.""" try: cmd = [ "ffmpeg", "-y", "-i", video_path, "-vn", "-acodec", "pcm_s16le", "-ar", "16000", "-ac", "1", audio_path ] subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) return audio_path except Exception as e: return f"ERROR: Audio extraction failed: {str(e)}" def analyze_accent(audio_path): """Analyze accent using SpeechBrain pre-trained model.""" try: classifier = EncoderClassifier.from_hparams( source="speechbrain/lang-id-voxlingua107-ecapa", savedir="pretrained_models/lang-id-voxlingua107-ecapa" ) signal, fs = torchaudio.load(audio_path) prediction = classifier.classify_batch(signal) predicted_lang = prediction[3][0] confidence = float(torch.max(prediction[1]).item()) return predicted_lang, confidence except Exception as e: return f"ERROR: Accent analysis failed: {str(e)}", None def process_input(video_link, uploaded_video): temp_dir = tempfile.mkdtemp() # Prioritize file upload if both provided if uploaded_video is not None: video_path = os.path.join(temp_dir, uploaded_video.name) with open(video_path, "wb") as f: f.write(uploaded_video.read()) elif video_link: video_path = os.path.join(temp_dir, "input_video.mp4") result = download_video(video_link, video_path) if isinstance(result, str) and result.startswith("ERROR"): return result, None, None, None else: return "Please provide a YouTube/MP4 link or upload a video file.", None, None, None # Extract audio audio_path = os.path.join(temp_dir, "audio.wav") result = extract_audio(video_path, audio_path) if isinstance(result, str) and result.startswith("ERROR"): return result, None, None, None # Analyze accent accent, confidence = analyze_accent(audio_path) if isinstance(accent, str) and accent.startswith("ERROR"): return accent, None, None, None # For playback in Gradio return ( f"**Detected Language/Accent:** {accent}\n\n**Confidence:** {confidence*100:.2f}%", video_path, audio_path, accent ) with gr.Blocks() as demo: gr.Markdown("# 🎙️ Accent/Language Detection from Video") gr.Markdown( "Upload a video or provide a YouTube/direct MP4 link. This app will extract the audio, " "detect the spoken language/accent, and estimate confidence using a SpeechBrain pre-trained model." ) with gr.Row(): video_link = gr.Textbox(label="YouTube or MP4 Link (optional)") uploaded_video = gr.File(label="Upload Video File (optional)", file_types=[".mp4", ".mov", ".avi", ".mkv"]) btn = gr.Button("Analyze") output_text = gr.Markdown() video_output = gr.Video(label="Video Preview") audio_output = gr.Audio(label="Extracted Audio", type="filepath") btn.click( fn=process_input, inputs=[video_link, uploaded_video], outputs=[output_text, video_output, audio_output, gr.Textbox(visible=False)] ) demo.launch()