from pytube import YouTube
import os
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import whisperx
from datasets import load_dataset
import os.path as osp
from mlxtend.file_io import find_files
from mlxtend.utils import Counter
import accelerate
import gc
import gradio as gr

# Transcription of a YouTube video's audio track to text.

# Checkpoint used for speech recognition.
_WHISPER_MODEL_ID = "openai/whisper-medium"

# Speech-recognition pipeline, built on first use and then reused so the
# model weights are not reloaded on every request.
_asr_pipeline = None


def _download_audio(url, destination="."):
    """Download the first audio-only stream of a YouTube video.

    The downloaded file is renamed so that its file name contains no spaces
    and ends in ``.mp3``. Only the name changes; the audio data is not
    re-encoded.

    Args:
        url: Address of the YouTube video.
        destination: Directory passed to the downloader as output path.

    Returns:
        Path of the renamed audio file.

    Raises:
        ValueError: If the video offers no audio-only stream.
    """
    yt = YouTube(url)

    # Extract only the audio.
    stream = yt.streams.filter(only_audio=True).first()
    if stream is None:
        raise ValueError(f"No audio-only stream available for {url!r}")

    out_file = stream.download(output_path=destination)

    # Strip spaces from the file name only. Removing them from the whole
    # path would point the rename at a directory that does not exist
    # whenever a parent directory name contains a space.
    directory, filename = os.path.split(out_file)
    stem, _ext = os.path.splitext(filename)
    new_file = os.path.join(directory, stem.replace(" ", "") + ".mp3")

    # os.replace overwrites an existing target on every platform, so
    # downloading the same video twice does not fail on Windows.
    os.replace(out_file, new_file)
    return new_file


def _get_asr_pipeline():
    """Return the shared speech-recognition pipeline, building it once."""
    global _asr_pipeline
    if _asr_pipeline is None:
        # Use the GPU with half precision when available, else CPU/float32.
        use_cuda = torch.cuda.is_available()
        device = "cuda:0" if use_cuda else "cpu"
        torch_dtype = torch.float16 if use_cuda else torch.float32

        model = AutoModelForSpeechSeq2Seq.from_pretrained(
            _WHISPER_MODEL_ID,
            torch_dtype=torch_dtype,
            low_cpu_mem_usage=True,
            use_safetensors=True,
        )
        model.to(device)

        processor = AutoProcessor.from_pretrained(_WHISPER_MODEL_ID)

        _asr_pipeline = pipeline(
            "automatic-speech-recognition",
            model=model,
            tokenizer=processor.tokenizer,
            feature_extractor=processor.feature_extractor,
            max_new_tokens=128,
            chunk_length_s=30,
            batch_size=16,
            return_timestamps=True,
            torch_dtype=torch_dtype,
            device=device,
        )
    return _asr_pipeline


def URLToText(URL):
    """Transcribe the audio of a YouTube video to text.

    The audio track is downloaded into the current directory (and left
    there afterwards), then passed through the speech-recognition pipeline.

    Args:
        URL: Address of the YouTube video, as a non-empty string.

    Returns:
        The ``"text"`` entry of the pipeline result.

    Raises:
        ValueError: If ``URL`` is not a non-empty string, or the video has
            no audio-only stream.
    """
    # Reject unusable input before doing any network or model work.
    if not isinstance(URL, str) or not URL.strip():
        raise ValueError("A non-empty video URL is required")

    audio_path = _download_audio(URL.strip())

    # Convert the audio to text.
    result = _get_asr_pipeline()(audio_path)
    return result["text"]

# Build the interface and launch it.
# gr.Textbox is used for both the input and the output: the legacy
# gr.inputs / gr.outputs namespaces were deprecated in Gradio 3 and removed
# in Gradio 4.
gr.Interface(
    fn=URLToText,
    inputs=gr.Textbox(label="Video URL"),
    outputs=gr.Textbox(label="Transcripción"),
).launch(share=False)