Spaces:
Runtime error
Runtime error
| # install packages | |
| # !pip install --upgrade -q ipython-autotime | |
| # %load_ext autotime | |
| # Download youtube video | |
| # !pip install -q pytube | |
| # youtube video download function | |
| import os | |
| from pytube import YouTube | |
def progress_function(stream, chunk, bytes_remaining):
    """Print download progress as a percentage of the stream's total size.

    The three-argument shape matches how this function is passed as
    ``on_progress_callback`` when the ``YouTube`` object is created.

    Args:
        stream: Object exposing a ``filesize`` attribute (total size).
        chunk: Unused; accepted only because the callback receives it.
        bytes_remaining: Amount of data still left to download.
    """
    total_size = stream.filesize
    if not total_size:
        # A zero or missing size cannot be turned into a percentage; skip the
        # report instead of raising from inside the progress callback.
        return
    bytes_downloaded = total_size - bytes_remaining
    percentage_of_completion = bytes_downloaded / total_size * 100
    print(f"Downloaded {percentage_of_completion}%")
def youtube_download(video_url):
    """Download the highest-resolution stream of a YouTube video.

    The file is saved by calling ``stream.download()`` with no arguments, so
    the target directory and file name are whatever pytube uses by default.

    Args:
        video_url: URL of the YouTube video to fetch.

    Returns:
        The stream's ``default_filename``.

    Raises:
        ValueError: If no stream is returned for the video.
    """
    yt = YouTube(video_url, on_progress_callback=progress_function)
    print(f"Downloading video: {yt.title}")

    stream = yt.streams.get_highest_resolution()
    if stream is None:
        # Fail with an explicit message rather than an AttributeError on None.
        raise ValueError(f"No downloadable stream found for: {video_url}")

    # Read the name before downloading so the caller can locate the file.
    default_filename = stream.default_filename
    stream.download()
    return default_filename
| # use insanely-fast-whisper | |
| # !pip install --upgrade -q transformers optimum accelerate pyannote.audio | |
| import re | |
| import json | |
| import torch | |
| from transformers import pipeline | |
| from pyannote.audio import Pipeline | |
| # transfer srt to plain text | |
| import json | |
def seconds_to_hms(seconds):
    """Format a duration given in seconds as ``HH:MM:SS``.

    Fractional parts are dropped by ``int()``; hours are not wrapped at 24.
    """
    whole_hours = seconds // 3600
    left_over = seconds % 3600
    whole_minutes = left_over // 60
    whole_seconds = left_over % 60
    parts = (int(whole_hours), int(whole_minutes), int(whole_seconds))
    return ":".join(f"{part:02}" for part in parts)
def transcript_json2txt(segmented_transcript, file_path):
    """Render transcript entries as plain text, save the text, and return it.

    Each entry becomes ``"HH:MM:SS, <speaker>: <text>"`` followed by a blank
    line. A preview of the first 500 characters is printed.

    Args:
        segmented_transcript: Iterable of dicts. Each needs a ``'timestamp'``
            sequence whose first item is the start time in seconds, and may
            carry ``'speaker'`` and ``'text'`` strings.
        file_path: Path of the transcript file; the text is written to the
            same path with its extension replaced by ``.txt``.

    Returns:
        The complete dialogue text that was written to disk.
    """
    entries = []
    for dialogue in segmented_transcript:
        start_time = seconds_to_hms(dialogue['timestamp'][0])
        # "SPEAKER_00" -> "speaker00"; entries without a speaker yield "".
        speaker = dialogue.get('speaker', "").replace("SPEAKER_", "speaker")
        text = dialogue.get('text', "").strip()
        entries.append(f"{start_time}, {speaker}: {text}\n\n")
    dialogue_text = "".join(entries)

    print("preview txt...")
    print('---------------------------------\n')
    print(dialogue_text[:500])

    # Swap only the final extension. A substring replace would also touch
    # ".json" inside directory names and would return the input path unchanged
    # (overwriting it) when the path does not contain ".json" at all.
    output_txt_file_path = os.path.splitext(file_path)[0] + '.txt'
    with open(output_txt_file_path, 'w', encoding="utf8") as file:
        file.write(dialogue_text)
    print(
        f"Voila!✨ Your file has been transcribed go check it out over here 👉 {output_txt_file_path}"
    )
    return dialogue_text
| # transcript function | |
model_name = "openai/whisper-large-v3"
flash = False  # Set to True to use Flash Attention 2

print('---------------------------------')
print('load pipe...')
print('---------------------------------')

# Use the first CUDA device when one is visible; otherwise fall back to the
# CPU with float32 so that importing this module does not fail on hosts
# without a GPU. Behaviour on CUDA hosts is unchanged.
_use_cuda = torch.cuda.is_available()

# Initialize the speech-recognition pipeline once at import time; it is the
# default `pipe` argument of `transcript`.
pipe = pipeline(
    "automatic-speech-recognition",
    model=model_name,
    torch_dtype=torch.float16 if _use_cuda else torch.float32,
    device='cuda:0' if _use_cuda else 'cpu',
    model_kwargs={"use_flash_attention_2": flash},
)
def _convert_to_srt_time(timestamp):
    """Format a time in seconds as an SRT timestamp ``HH:MM:SS,mmm``."""
    # Work in whole milliseconds so float noise (e.g. 2.3 % 1 == 0.2999...)
    # cannot truncate the millisecond field to one less than intended.
    total_ms = round(timestamp * 1000)
    hours, rest = divmod(total_ms, 3600000)
    minutes, rest = divmod(rest, 60000)
    seconds, milliseconds = divmod(rest, 1000)
    return f"{hours:02}:{minutes:02}:{seconds:02},{milliseconds:03}"


def _chunks_to_srt(chunks):
    """Build SRT text from transcript chunks, skipping malformed entries."""
    cues = []
    for entry in chunks:
        try:
            start, end = entry['timestamp'][0], entry['timestamp'][1]
            if end is None:
                # No end time was produced for this chunk: show it for 1 s.
                end = start + 1
            start_time = _convert_to_srt_time(start)
            end_time = _convert_to_srt_time(end)
            text = entry['text']
        except (KeyError, IndexError, TypeError) as e:
            # Best effort: report the bad entry and keep going.
            print(e)
            print(entry)
            continue
        # Number cues by what was actually emitted so the sequence has no gaps.
        cues.append(f"{len(cues) + 1}\n{start_time} --> {end_time}\n{text}\n\n")
    return "".join(cues)


def transcript(file_path, pipe=pipe):
    """Transcribe a media file and write JSON, text and SRT transcripts.

    The three output files share the input's path with the extension replaced
    by ``.json``, ``.txt`` (written by ``transcript_json2txt``) and ``.srt``.

    Args:
        file_path: Path of the audio/video file handed to ``pipe``.
        pipe: Callable speech-recognition pipeline; defaults to the
            module-level ``pipe`` as bound when this function was defined.

    Returns:
        A ``(transcript_txt, srt_file_path)`` tuple: the plain-text transcript
        and the path of the SRT file.
    """
    # Derive output paths from the final extension only. A pattern match on a
    # few known extensions leaves other inputs (e.g. ".webm") unchanged, and
    # the JSON dump would then overwrite the media file itself.
    base_path = os.path.splitext(file_path)[0]
    transcript_path = base_path + '.json'
    srt_file_path = base_path + '.srt'

    task = "transcribe"  # or "translate"
    batch_size = 24

    # Transcribe the audio. No language is passed, so detection is left to
    # the model.
    print('Transcribing...')
    print('---------------------------------\n')
    outputs = pipe(
        file_path,
        chunk_length_s=30,
        batch_size=batch_size,
        generate_kwargs={"task": task},
        return_timestamps=True,
    )

    # Save the raw pipeline output.
    print('Saving transcript...')
    print('---------------------------------\n')
    with open(transcript_path, "w", encoding="utf8") as fp:
        json.dump(outputs, fp, ensure_ascii=False)
    print(
        f"Voila!✨ Your file has been transcribed go check it out over here 👉 {transcript_path}"
    )

    # Save the plain-text rendering.
    transcript_txt = transcript_json2txt(outputs['chunks'], transcript_path)

    # Save the SRT subtitles.
    srt_content = _chunks_to_srt(outputs['chunks'])
    with open(srt_file_path, 'w', encoding="utf8") as file:
        file.write(srt_content)
    print(
        f"Voila!✨ Your file has been transcribed go check it out over here 👉 {srt_file_path}"
    )
    return transcript_txt, srt_file_path
| # youtube transcript function | |
def transcript_youtube(url):
    """Download a YouTube video and transcribe it.

    Returns:
        A 3-tuple of the first 500 characters of the transcript text, the
        video path (the downloaded file name joined onto the current working
        directory) and the SRT file path.
    """
    downloaded_name = youtube_download(url)
    video_path = os.path.join(os.getcwd(), downloaded_name)
    full_text, srt_path = transcript(video_path)
    preview = full_text[:500]
    return preview, video_path, srt_path
| # test youtube transcript | |
| # url = "https://www.youtube.com/watch?v=2UP7pfGVm0Y&t=252s&ab_channel=TheTEFLOrg" | |
| # transcript_youtube(url) | |
| # gradio interface | |
| # !pip install --upgrade -q gradio | |
| import gradio as gr | |
# Heading and blurb shared by the audio and file interfaces.
title, description = (
    "Fastly audio transcript",
    "Input your audio or record your audio",
)
def audio_func(audio_file):
    """Return a message that echoes the received audio file path."""
    message = f"This is the audio file path: {audio_file}"
    return message
def file_func(file_path):
    """Return a message that echoes the received file path."""
    message = f"This is the file path: {file_path}"
    return message
# Input components for the audio and file interfaces.
audio_input = gr.Audio(type='filepath')
file_input = gr.File(type="filepath")

# YouTube tab: a URL goes in; a transcript preview, the downloaded video and
# the SRT file come out (the three values returned by transcript_youtube).
youtube_url_box = gr.Textbox(label="youtube video", info="Input a youtube video url")
youtube_outputs = [
    gr.Textbox(label="Transcript preview", lines=3),
    gr.File(label="Download Video"),
    gr.File(label="Srt file"),
]
youtube_interface = gr.Interface(
    fn=transcript_youtube,
    inputs=youtube_url_box,
    outputs=youtube_outputs,
    title="Fastly Youtube Video Transcrip",
    description="Transcript Any Youtube video in Seconds!!!",
)

# The audio and file interfaces only echo the received path. They are built
# here but are not added to the tabbed demo below.
audio_interface = gr.Interface(
    fn=audio_func,
    inputs=audio_input,
    outputs=[gr.Textbox(label="Greeting", lines=3)],
    title=title,
    description=description,
)
file_interface = gr.Interface(
    fn=file_func,
    inputs=file_input,
    outputs=[gr.Textbox(label="Greeting", lines=3)],
    title=title,
    description=description,
)

# Single-tab app with a request queue capped at 20, launched with a share link.
tab_interfaces = [youtube_interface]
tab_names = ["Transcript youtube video"]
demo = gr.TabbedInterface(tab_interfaces, tab_names)
demo.queue(max_size=20)
demo.launch(share=True)