Spaces:
Runtime error
Runtime error
| """ | |
| Imports | |
| """ | |
| from transformers import pipeline | |
| from pytube import YouTube | |
| import gradio as gr | |
| import requests | |
| from transformers import AutoTokenizer, AutoModelForSeq2SeqLM | |
| """ | |
| Pipeline and models | |
| """ | |
# Speech-to-text pipeline used by transcribe().
# To use another checkpoint, change to "your-username/the-name-you-picked".
transcribe_pipe = pipeline(model="Silemo/whisper-it")

# The tag generator and its tokenizer are loaded from the same checkpoint.
_TAGS_CHECKPOINT = "efederici/text2tags"
tags_tokenizer = AutoTokenizer.from_pretrained(_TAGS_CHECKPOINT)
tags_model = AutoModelForSeq2SeqLM.from_pretrained(_TAGS_CHECKPOINT)
| """ | |
| Methods | |
| """ | |
def transcribe(audio):
    """
    Runs the speech-recognition pipeline on ``audio`` and returns the
    value stored under the "text" key of its result.
    """
    result = transcribe_pipe(audio)
    return result["text"]
def transcribe_video(url):
    """
    Downloads the audio track of a YouTube video, then transcribes and tags it.

    The audio is saved in a temporary directory that is removed as soon as
    the transcription is done, so repeated requests do not pile up files in
    the working directory.

    Args:
        url: address of the YouTube video.

    Returns:
        The ``(transcription, tags)`` pair produced by ``transcribe_and_tag``.

    Raises:
        ValueError: if no audio-only stream is available for the video.
    """
    import tempfile  # local import: only this function needs it

    yt = YouTube(url)
    stream = yt.streams.get_audio_only()
    if stream is None:
        # Fail with a readable message instead of an AttributeError on None.
        raise ValueError(f"No audio-only stream available for {url!r}")

    with tempfile.TemporaryDirectory() as workdir:
        # download() returns the path of the file it wrote.
        audio = stream.download(output_path=workdir)
        # Transcribe while the file still exists; it is deleted on exit.
        return transcribe_and_tag(audio)
def transcribe_and_tag(audio):
    """
    Transcribes ``audio`` and generates tags from the transcription.

    Returns a ``(transcription, tags)`` pair.
    """
    transcription = transcribe(audio)
    return transcription, tag(text=transcription)
def download_audio(audio_url, filename, timeout=30):
    """
    Downloads the audio file at ``audio_url`` and saves it as ``filename``.

    Args:
        audio_url: URL of the audio file to fetch.
        filename: path of the local file to write (opened in binary mode).
        timeout: seconds to wait for the server before giving up, so a
            stalled connection cannot block the caller forever.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    # Send the HTTP request; the whole response body is held in memory.
    r = requests.get(audio_url, timeout=timeout)

    # Fail here rather than saving an error page under an audio filename.
    r.raise_for_status()

    # Write the raw response bytes to the target file.
    with open(filename, 'wb') as f:
        f.write(r.content)
def tag(text: str):
    """
    Produces a list of tags for the given text.

    The text is stripped, its newline characters are removed, and it is
    prefixed with 'summarize: ' before being passed to the tags model.
    The first generated sequence is decoded and split on ', '.
    """
    prompt = 'summarize: ' + text.strip().replace('\n', '')
    input_ids = tags_tokenizer.encode(prompt, return_tensors="pt")

    # Generation settings: beam search with a short output budget.
    generation_options = dict(
        num_beams=4,
        no_repeat_ngram_size=2,
        max_length=20,
        early_stopping=True,
    )
    generated = tags_model.generate(input_ids, **generation_options)

    decoded = tags_tokenizer.decode(generated[0], skip_special_tokens=True)
    return decoded.split(', ')
| """ | |
| Downloading audio files | |
| """ | |
# Local file names of the two sample recordings used as interface examples.
audio1_filename = "offer.mp3"
audio2_filename = "fantozzi.mp3"

# Remote locations the sample recordings are fetched from.
audio1_url = "https://github.com/Silemo/sml-lab2-2023-manfredi-meneghin/raw/main/task1/audio/offer.mp3"
audio2_url = "https://github.com/Silemo/sml-lab2-2023-manfredi-meneghin/raw/main/task1/audio/fantozzi.mp3"

# Fetch both samples, in order, before the interfaces are built.
for sample_url, sample_filename in (
    (audio1_url, audio1_filename),
    (audio2_url, audio2_filename),
):
    download_audio(sample_url, sample_filename)
| """ | |
| Interfaces | |
| """ | |
# Output components: each interface gets its own pair of text boxes.
audio_transcription = gr.Textbox(label="Transcription")
audio_tags = gr.Textbox(label="Tags")
yt_transcription = gr.Textbox(label="Transcription")
yt_tags = gr.Textbox(label="Tags")

# Multiple interfaces using tabs -> https://github.com/gradio-app/gradio/issues/450

# Tab 1: transcribe and tag an uploaded or recorded audio file.
io1 = gr.Interface(
    fn = transcribe_and_tag,
    inputs = gr.Audio(sources=["upload", "microphone"], type="filepath"),
    outputs = [audio_transcription, audio_tags],
    examples = [
        [audio1_filename],
        [audio2_filename],
    ],
    title = "Whisper Small - Italian - Microphone or Audio file",
    description = "Realtime demo for Italian speech recognition using a fine-tuned Whisper small model. It uses the computer microphone as audio input. It outputs a transcription and the tags of the text.",
)

# Tab 2: transcribe and tag the audio track of a YouTube video.
io2 = gr.Interface(
    fn = transcribe_video,
    inputs = gr.Textbox(label = "YouTube URL", placeholder = "https://youtu.be/9DImRZERJNs?si=1Lme7o_KH2oCxU7y"),
    outputs=[yt_transcription, yt_tags],
    examples=[
        # Meloni - Confindustria
        ["https://www.youtube.com/watch?v=qMslwA7RCcc"],
        # Montemagno - Ripartire da zero
        ["https://www.youtube.com/watch?v=WlT3dCAGjRo"],
    ],
    title = "Whisper Small - Italian - YouTube link",
    description = "Realtime demo for Italian speech recognition using a fine-tuned Whisper small model. It uses a YouTube link as audio input. It outputs a transcription and the tags of the text.",
)

# Tab names must be an ordered list matching the interface list position by
# position; a set literal has no defined order, so labels could be swapped.
gr.TabbedInterface(
    [io1, io2], ["Microphone or audio file", "YouTube"]
).launch()