Spaces:
Sleeping
Sleeping
| import torch | |
| from transformers import pipeline | |
| import os | |
| import gradio as gr | |
| from pydub import AudioSegment | |
| from pytube import YouTube | |
| import timeit | |
| import math | |
# Run inference on the GPU when torch reports one, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Speech-recognition pipeline, built once at import time and reused by
# every transcription request.
pipe = pipeline(
    "automatic-speech-recognition",
    model="distil-whisper/distil-medium.en",
    device=device,
)
def transcribe_speech_local(filepath):
    """Transcribe an audio file in fixed-length chunks with time alignment.

    The audio is cut into 15-second slices; each slice is exported to a
    temporary WAV file and passed to the module-level ``pipe``.

    Args:
        filepath: Path to the audio file, or ``None`` when nothing was
            supplied.

    Returns:
        A list with one dict per chunk holding ``chunk_id``,
        ``chunk_length`` (seconds), ``text``, ``start_time`` and
        ``end_time`` (seconds from the start of the audio) and
        ``transcription_time`` (seconds spent exporting and transcribing
        the chunk). When ``filepath`` is ``None`` the list contains a
        single ``{"error": ...}`` entry instead.
    """
    import tempfile

    if filepath is None:
        return [{"error": "No audio found, please retry."}]

    chunk_length_ms = 15000  # 15 seconds in milliseconds

    audio = AudioSegment.from_file(filepath)
    aligned_chunks = []

    # A private temporary directory keeps concurrent requests from
    # overwriting each other's chunk file and guarantees it is removed,
    # even if transcription raises.
    with tempfile.TemporaryDirectory() as tmp_dir:
        chunk_path = os.path.join(tmp_dir, "temp_chunk.wav")

        for chunk_id, offset_ms in enumerate(
            range(0, len(audio), chunk_length_ms)
        ):
            chunk = audio[offset_ms:offset_ms + chunk_length_ms]

            # The measured time covers both the WAV export and inference.
            start_time = timeit.default_timer()
            chunk.export(chunk_path, format="wav")
            output = pipe(chunk_path)
            transcription_time = timeit.default_timer() - start_time

            # Derived from the chunk length rather than a second literal;
            # exact because the chunk length is a whole number of seconds.
            start_time_sec = chunk_id * chunk_length_ms // 1000
            chunk_length_sec = len(chunk) / 1000.0

            aligned_chunks.append({
                "chunk_id": chunk_id,
                "chunk_length": chunk_length_sec,
                "text": output["text"],
                "start_time": start_time_sec,
                "end_time": start_time_sec + chunk_length_sec,
                "transcription_time": transcription_time,
            })

    return aligned_chunks
def download_audio_from_youtube(youtube_url):
    """Download the first audio-only stream of a YouTube video.

    The stream is downloaded with pytube's default output location and
    the resulting file is renamed to carry a ``.mp3`` extension. The
    rename changes only the file name; the audio data is not re-encoded.

    Args:
        youtube_url: URL of the YouTube video.

    Returns:
        Path of the renamed audio file.

    Raises:
        ValueError: If the video exposes no audio-only stream.
    """
    yt = YouTube(youtube_url)
    stream = yt.streams.filter(only_audio=True).first()
    if stream is None:
        # first() yields None for an empty query; fail with a clear
        # message instead of an AttributeError on ``None.download()``.
        raise ValueError(
            f"No audio-only stream available for {youtube_url!r}"
        )

    output_path = stream.download()
    base, _ = os.path.splitext(output_path)
    audio_file = base + '.mp3'
    # os.replace overwrites a leftover file from an earlier run on every
    # platform, whereas os.rename fails on Windows if the target exists.
    os.replace(output_path, audio_file)
    return audio_file
def transcribe_speech_from_youtube(youtube_url):
    """Download a YouTube video's audio and transcribe it in chunks.

    The audio is fetched with ``download_audio_from_youtube``, converted
    to 16 kHz mono, cut into 15-second slices, and each slice is exported
    to a temporary WAV file and passed to the module-level ``pipe``.

    Args:
        youtube_url: URL of the YouTube video.

    Returns:
        A list with one dict per chunk holding ``chunk_id``,
        ``chunk_length`` (seconds), ``text``, ``start_time`` and
        ``end_time`` (seconds from the start of the audio) and
        ``transcription_time`` (seconds spent exporting and transcribing
        the chunk). When the URL is missing or blank the list contains a
        single ``{"error": ...}`` entry instead.
    """
    import tempfile

    # Mirror the local-file handler: report missing input as an error
    # entry rather than letting the downloader raise.
    if not youtube_url or not youtube_url.strip():
        return [{"error": "No YouTube URL provided, please retry."}]

    chunk_length_ms = 15000  # 15 seconds in milliseconds

    audio_filepath = download_audio_from_youtube(youtube_url)
    try:
        # Convert in memory to 16 kHz mono. Writing the result to a WAV
        # file and reading it back would yield the same samples, so that
        # round trip is skipped.
        audio = AudioSegment.from_file(audio_filepath)
        audio = audio.set_frame_rate(16000).set_channels(1)

        aligned_chunks = []

        # A private temporary directory keeps concurrent requests from
        # overwriting each other's chunk file and guarantees it is
        # removed, even if transcription raises.
        with tempfile.TemporaryDirectory() as tmp_dir:
            chunk_path = os.path.join(tmp_dir, "temp_chunk.wav")

            for chunk_id, offset_ms in enumerate(
                range(0, len(audio), chunk_length_ms)
            ):
                chunk = audio[offset_ms:offset_ms + chunk_length_ms]

                # The measured time covers the WAV export and inference.
                start_time = timeit.default_timer()
                chunk.export(chunk_path, format="wav")
                output = pipe(chunk_path)
                transcription_time = timeit.default_timer() - start_time

                # Derived from the chunk length rather than a second
                # literal; exact because the chunk length is a whole
                # number of seconds.
                start_time_sec = chunk_id * chunk_length_ms // 1000
                chunk_length_sec = len(chunk) / 1000.0

                aligned_chunks.append({
                    "chunk_id": chunk_id,
                    "chunk_length": chunk_length_sec,
                    "text": output["text"],
                    "start_time": start_time_sec,
                    "end_time": start_time_sec + chunk_length_sec,
                    "transcription_time": transcription_time,
                })

        return aligned_chunks
    finally:
        # Always remove the downloaded file, including on failure.
        try:
            os.remove(audio_filepath)
        except FileNotFoundError:
            # Already gone; nothing left to clean up.
            pass
# Tab 1: transcribe an uploaded audio file.
file_transcribe = gr.Interface(
    fn=transcribe_speech_local,
    inputs=gr.Audio(sources="upload", type="filepath"),
    outputs=gr.JSON(label="Transcription with Time Alignment"),
    allow_flagging="never",
)

# Tab 2: transcribe the audio track of a YouTube video given its URL.
link_transcribe = gr.Interface(
    fn=transcribe_speech_from_youtube,
    inputs=gr.Textbox(
        lines=1,
        placeholder="Enter YouTube URL here...",
        label="YouTube URL",
    ),
    outputs=gr.JSON(label="Transcription with Time Alignment"),
    allow_flagging="never",
)

# Combine both interfaces into one tabbed app; tab titles are listed in
# the same order as the interfaces.
demo = gr.TabbedInterface(
    [file_transcribe, link_transcribe],
    ["Local files(mp3/mp4/wav)", "Links"],
)

# Start the app and request a public share link.
demo.launch(share=True)