import functools
import gc
import os
import os.path as osp

from pytube import YouTube
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import whisperx
from datasets import load_dataset
from mlxtend.file_io import find_files
from mlxtend.utils import Counter
import accelerate
import gradio as gr


@functools.lru_cache(maxsize=1)
def _load_pipeline():
    """Build the Whisper speech-recognition pipeline once and reuse it.

    The model, processor and pipeline do not depend on the request, so they
    are cached instead of being reloaded on every transcription.
    Uses ``cuda:0`` with float16 when CUDA is available, otherwise CPU with
    float32.
    """
    cuda_available = torch.cuda.is_available()
    device = "cuda:0" if cuda_available else "cpu"
    torch_dtype = torch.float16 if cuda_available else torch.float32

    model_id = "openai/whisper-medium"

    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        model_id,
        torch_dtype=torch_dtype,
        low_cpu_mem_usage=True,
        use_safetensors=True,
    )
    model.to(device)

    processor = AutoProcessor.from_pretrained(model_id)

    return pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        max_new_tokens=128,
        chunk_length_s=30,
        batch_size=16,
        return_timestamps=True,
        torch_dtype=torch_dtype,
        device=device,
    )


def _download_audio(url, destination="."):
    """Download the first audio-only stream of *url* and return its path.

    The downloaded file is renamed so that its file name has no spaces and
    ends in ``.mp3``. The file is only renamed, not transcoded.

    Raises:
        ValueError: if the video exposes no audio-only stream.
    """
    yt = YouTube(url)

    # Extract only the audio track.
    stream = yt.streams.filter(only_audio=True).first()
    if stream is None:
        raise ValueError(f"No audio-only stream available for: {url}")

    out_file = stream.download(output_path=destination)

    # Strip spaces from the file name only; touching the directory part
    # would point the rename at a directory that does not exist.
    directory, filename = os.path.split(out_file)
    stem, _ext = os.path.splitext(filename)
    stem = stem.replace(" ", "") or "audio"
    new_file = os.path.join(directory, stem + ".mp3")

    # os.replace overwrites an existing target on every platform, so
    # transcribing the same video twice does not fail on Windows.
    os.replace(out_file, new_file)
    return new_file


# Function that carries out the transcription.
def URLToText(URL: str) -> str:
    """Transcribe the audio of a YouTube video to text.

    Args:
        URL: address of the YouTube video supplied by the user.

    Returns:
        The ``"text"`` field of the speech-recognition pipeline result.

    Raises:
        ValueError: if *URL* is empty or the video has no audio-only stream.
    """
    if not URL or not URL.strip():
        raise ValueError("A video URL is required.")

    audio_path = _download_audio(URL.strip())

    # Convert the audio to text.
    result = _load_pipeline()(audio_path)
    return result["text"]


# Build the interface and launch it.
gr.Interface(
    fn=URLToText,
    inputs=gr.Textbox(label="Video URL"),
    outputs=gr.Textbox(label="Transcripción"),
).launch(share=False)