Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import os | |
| import subprocess | |
| import torch | |
| import torchaudio | |
| from speechbrain.pretrained import EncoderClassifier | |
| import yt_dlp | |
| import tempfile | |
def download_video(url, out_path):
    """Download a video from YouTube or a direct link and save it to ``out_path``.

    Links containing ``youtube.com`` or ``youtu.be`` are fetched with
    ``yt_dlp``; every other URL is fetched with the external ``wget`` program.

    Args:
        url: Address of the video to download (user-supplied, untrusted).
        out_path: Destination file path for the downloaded video.

    Returns:
        ``out_path`` on success, otherwise a string starting with
        ``"ERROR: Video download failed"`` describing the failure. Callers
        detect failure by the ``"ERROR"`` prefix; no exception escapes.
    """
    try:
        if not url or not url.strip():
            raise ValueError("no URL provided")
        url = url.strip()

        if "youtube.com" in url or "youtu.be" in url:
            ydl_opts = {'outtmpl': out_path}
            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                ydl.download([url])
        else:
            # The URL comes straight from the web form, so it must never pass
            # through a shell. An argument list avoids command injection, "--"
            # stops a URL that begins with "-" from being parsed as a wget
            # option, and check=True turns a failed download into an error
            # instead of silently returning a path to a missing/empty file.
            subprocess.run(["wget", "-O", out_path, "--", url], check=True)
        return out_path
    except Exception as e:
        return f"ERROR: Video download failed: {str(e)}"
def extract_audio(video_path, audio_path):
    """Write the audio track of ``video_path`` to ``audio_path`` via ffmpeg.

    ffmpeg is asked to drop the video stream (``-vn``) and encode the audio as
    ``pcm_s16le`` at 16000 Hz with one channel, overwriting any existing
    output (``-y``). ffmpeg's own console output is discarded.

    Args:
        video_path: Path of the input video file.
        audio_path: Path where the extracted audio is written.

    Returns:
        ``audio_path`` on success, otherwise a string starting with
        ``"ERROR: Audio extraction failed"`` (ffmpeg missing, non-zero exit
        status, or any other exception).
    """
    ffmpeg_command = [
        "ffmpeg", "-y",
        "-i", video_path,
        "-vn",
        "-acodec", "pcm_s16le",
        "-ar", "16000",
        "-ac", "1",
        audio_path,
    ]
    try:
        subprocess.run(
            ffmpeg_command,
            check=True,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
    except Exception as exc:
        return f"ERROR: Audio extraction failed: {str(exc)}"
    return audio_path
# Holds the loaded SpeechBrain classifier so the model is built once per
# process instead of on every request.
_CLASSIFIER_CACHE = {}


def _get_classifier():
    """Return the VoxLingua107 language-ID classifier, loading it on first use."""
    if "model" not in _CLASSIFIER_CACHE:
        _CLASSIFIER_CACHE["model"] = EncoderClassifier.from_hparams(
            source="speechbrain/lang-id-voxlingua107-ecapa",
            savedir="pretrained_models/lang-id-voxlingua107-ecapa"
        )
    return _CLASSIFIER_CACHE["model"]


def analyze_accent(audio_path):
    """Classify the spoken language of an audio file with SpeechBrain.

    The model used is a language-identification model
    (``lang-id-voxlingua107-ecapa``), so the "accent" reported here is the
    language label it predicts.

    Args:
        audio_path: Path of the audio file to load with ``torchaudio``.

    Returns:
        ``(label, confidence)`` on success, where ``label`` is the predicted
        language string and ``confidence`` is a float likelihood in linear
        scale. On any failure returns ``("ERROR: Accent analysis failed: ...",
        None)``; no exception escapes.
    """
    try:
        classifier = _get_classifier()
        signal, fs = torchaudio.load(audio_path)
        prediction = classifier.classify_batch(signal)
        predicted_lang = prediction[3][0]
        # Per the model card, prediction[1] is the best class's score as a
        # log-likelihood; exp() converts it to a linear-scale likelihood so it
        # can be shown as a percentage. Index 0 keeps the score aligned with
        # the label, which is also taken from batch item 0.
        confidence = float(prediction[1][0].exp().item())
        return predicted_lang, confidence
    except Exception as e:
        return f"ERROR: Accent analysis failed: {str(e)}", None
def process_input(video_link, uploaded_video):
    """Run the download/upload -> audio extraction -> language analysis pipeline.

    An uploaded file takes priority over a link when both are provided.

    Args:
        video_link: URL string from the text box (may be empty or ``None``).
        uploaded_video: Value of the file-upload component: ``None``, a path
            string, or a file-like object exposing the path as ``.name``.

    Returns:
        A 4-tuple ``(message, video_path, audio_path, accent)``. On failure or
        missing input, ``message`` describes the problem and the other three
        items are ``None``. On success the two paths point into a temporary
        directory that is intentionally kept so the UI can play the files.
    """
    import shutil  # local import: only this function needs it

    if uploaded_video is None and not video_link:
        return "Please provide a YouTube/MP4 link or upload a video file.", None, None, None

    # Created only after input validation so a no-input click leaves nothing behind.
    temp_dir = tempfile.mkdtemp()

    def _fail(message):
        # Nothing in temp_dir will be served on failure, so remove it.
        shutil.rmtree(temp_dir, ignore_errors=True)
        return message, None, None, None

    # Prioritize file upload if both provided
    if uploaded_video is not None:
        if isinstance(uploaded_video, (str, os.PathLike)):
            source_path = os.fspath(uploaded_video)
        else:
            source_path = getattr(uploaded_video, "name", None)
        if not source_path:
            return _fail("ERROR: Could not determine the path of the uploaded video.")
        # Use only the base name: joining temp_dir with an absolute path would
        # discard temp_dir, and opening that path for writing would truncate
        # the upload itself before it could be read.
        video_path = os.path.join(temp_dir, os.path.basename(source_path))
        try:
            shutil.copyfile(source_path, video_path)
        except OSError as e:
            return _fail(f"ERROR: Could not read uploaded video: {e}")
    else:
        video_path = os.path.join(temp_dir, "input_video.mp4")
        result = download_video(video_link, video_path)
        if isinstance(result, str) and result.startswith("ERROR"):
            return _fail(result)

    # Extract audio
    audio_path = os.path.join(temp_dir, "audio.wav")
    result = extract_audio(video_path, audio_path)
    if isinstance(result, str) and result.startswith("ERROR"):
        return _fail(result)

    # Analyze accent
    accent, confidence = analyze_accent(audio_path)
    if isinstance(accent, str) and accent.startswith("ERROR"):
        return _fail(accent)

    # For playback in Gradio
    return (
        f"**Detected Language/Accent:** {accent}\n\n**Confidence:** {confidence*100:.2f}%",
        video_path,
        audio_path,
        accent
    )
# Gradio UI: two optional inputs (link or upload), one button, and outputs for
# the result text plus video/audio playback.
with gr.Blocks() as demo:
    gr.Markdown("# 🎙️ Accent/Language Detection from Video")
    gr.Markdown(
        "Upload a video or provide a YouTube/direct MP4 link. This app will extract the audio, "
        "detect the spoken language/accent, and estimate confidence using a SpeechBrain pre-trained model."
    )

    with gr.Row():
        video_link = gr.Textbox(label="YouTube or MP4 Link (optional)")
        uploaded_video = gr.File(
            label="Upload Video File (optional)",
            file_types=[".mp4", ".mov", ".avi", ".mkv"],
        )

    btn = gr.Button("Analyze")
    output_text = gr.Markdown()
    video_output = gr.Video(label="Video Preview")
    audio_output = gr.Audio(label="Extracted Audio", type="filepath")
    # Invisible component that receives the fourth value returned by
    # process_input (the raw label).
    hidden_accent = gr.Textbox(visible=False)

    btn.click(
        fn=process_input,
        inputs=[video_link, uploaded_video],
        outputs=[output_text, video_output, audio_output, hidden_accent],
    )

demo.launch()