# import os # import gradio as gr # import torch # import pytube # import tempfile # from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline # from pytube import YouTube # # Set environment variables for Hugging Face Space # os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128" # # Model configuration # MODEL_NAME = "openai/whisper-large-v3-turbo" # device = "cuda:0" if torch.cuda.is_available() else "cpu" # torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32 # # Setup model and processor # def load_model(): # model = AutoModelForSpeechSeq2Seq.from_pretrained( # MODEL_NAME, # torch_dtype=torch_dtype, # low_cpu_mem_usage=True, # use_safetensors=True # ) # model.to(device) # processor = AutoProcessor.from_pretrained(MODEL_NAME) # pipe = pipeline( # "automatic-speech-recognition", # model=model, # tokenizer=processor.tokenizer, # feature_extractor=processor.feature_extractor, # torch_dtype=torch_dtype, # device=device, # ) # return pipe # # Load model globally - will be cached # pipe = load_model() # # Transcription function for audio files and microphone # def transcribe(audio_path, task="transcribe"): # if audio_path is None: # return "Please provide an audio input." # # Set task-specific generation parameters # generate_kwargs = {} # if task == "translate": # generate_kwargs["task"] = "translate" # # Process the audio # try: # result = pipe(audio_path, generate_kwargs=generate_kwargs) # return result["text"] # except Exception as e: # return f"Error during transcription: {str(e)}" # # YouTube video transcription function # def yt_transcribe(youtube_url, task="transcribe"): # if not youtube_url or not youtube_url.strip(): # return "Please enter a YouTube URL", "No transcription available." 
# try: # # Download audio from YouTube # yt = YouTube(youtube_url) # video = yt.streams.filter(only_audio=True).first() # # Create a temporary file to store the audio # with tempfile.NamedTemporaryFile(suffix='.mp4', delete=False) as temp_file: # temp_path = temp_file.name # # Download the audio to the temporary file # video.download(filename=temp_path) # # Set task-specific generation parameters # generate_kwargs = {} # if task == "translate": # generate_kwargs["task"] = "translate" # # Process the audio # result = pipe(temp_path, generate_kwargs=generate_kwargs) # # Create an HTML element to display video thumbnail # video_id = youtube_url.split("v=")[-1].split("&")[0] # thumbnail_url = f"https://img.youtube.com/vi/{video_id}/0.jpg" # html_output = f'

# <h3>{yt.title}</h3> <p>Channel: {yt.author}</p>'
# # Clean up the temporary file # os.unlink(temp_path) # return html_output, result["text"] # except Exception as e: # return f"Error processing YouTube video: {str(e)}", "Transcription failed." # # Create Gradio interfaces # mic_transcribe = gr.Interface( # fn=transcribe, # inputs=[ # gr.Audio(source="microphone", type="filepath", optional=True), # gr.Radio(["transcribe", "translate"], label="Task", default="transcribe"), # ], # outputs="text", # layout="horizontal", # theme="huggingface", # title="Whisper Large V3 Turbo: Transcribe Audio", # description=( # "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the OpenAI Whisper" # f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files" # " of arbitrary length." # ), # allow_flagging="never", # ) # file_transcribe = gr.Interface( # fn=transcribe, # inputs=[ # gr.Audio(source="upload", type="filepath", optional=True, label="Audio file"), # gr.Radio(["transcribe", "translate"], label="Task", default="transcribe"), # ], # outputs="text", # layout="horizontal", # theme="huggingface", # title="Whisper Large V3 Turbo: Transcribe Audio", # description=( # "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the OpenAI Whisper" # f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files" # " of arbitrary length." # ), # allow_flagging="never", # ) # yt_interface = gr.Interface( # fn=yt_transcribe, # inputs=[ # gr.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL"), # gr.Radio(["transcribe", "translate"], label="Task", default="transcribe") # ], # outputs=["html", "text"], # layout="horizontal", # theme="huggingface", # title="Whisper Large V3 Turbo: Transcribe YouTube", # description=( # "Transcribe long-form YouTube videos with the click of a button!
# Demo uses the OpenAI Whisper checkpoint" # f" [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe video files of" # " arbitrary length." # ), # allow_flagging="never", # ) # # Create the tabbed interface # demo = gr.Blocks() # with demo: # gr.TabbedInterface([mic_transcribe, file_transcribe, yt_interface], ["Microphone", "Audio file", "YouTube"]) # # Launch the app # if __name__ == "__main__": # demo.launch(enable_queue=True)

"""Gradio demo: transcribe or translate speech with Whisper large-v3-turbo.

Three tabs are exposed via a ``gr.TabbedInterface``:

* Microphone input
* Uploaded audio files
* YouTube URLs (audio fetched with ``pytube``)

Intended to run inside a Hugging Face Space, hence the cache redirection
to ``/tmp`` below (the Space's home directory may be read-only).
"""

import os
import tempfile

import gradio as gr
import pytube  # noqa: F401 -- kept from original; only the YouTube class below is used here
import shutil  # noqa: F401 -- kept from original; unused in the code shown in this file
import torch
from pytube import YouTube
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

# Redirect every framework cache into /tmp so nothing tries to write to a
# read-only home directory inside the Space container.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"
os.environ["TRANSFORMERS_CACHE"] = "/tmp/transformers_cache"
os.environ["HF_HOME"] = "/tmp/hf_home"
os.environ["HUGGINGFACE_HUB_CACHE"] = "/tmp/hf_hub_cache"
os.environ["MPLCONFIGDIR"] = "/tmp/matplotlib_config"

# Create the cache directories up front (idempotent).
for cache_dir in (
    "/tmp/transformers_cache",
    "/tmp/hf_home",
    "/tmp/hf_hub_cache",
    "/tmp/matplotlib_config",
):
    os.makedirs(cache_dir, exist_ok=True)

# Model configuration: fp16 on GPU, fp32 on CPU.
MODEL_NAME = "openai/whisper-large-v3-turbo"
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# Print environment info for debugging in the Space logs.
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA device name: {torch.cuda.get_device_name(0)}")
print(f"Using device: {device}")
print(f"Using dtype: {torch_dtype}")


def load_model():
    """Load the Whisper model + processor and return an ASR pipeline.

    Progress is printed to stdout for Space-log debugging. Any failure is
    logged and re-raised so the app fails fast at startup instead of
    serving a broken pipeline.
    """
    try:
        print("Loading model...")
        model = AutoModelForSpeechSeq2Seq.from_pretrained(
            MODEL_NAME,
            torch_dtype=torch_dtype,
            low_cpu_mem_usage=True,
            use_safetensors=True,
            cache_dir="/tmp/transformers_cache",
        )
        model.to(device)
        print("Model loaded successfully!")

        print("Loading processor...")
        processor = AutoProcessor.from_pretrained(
            MODEL_NAME,
            cache_dir="/tmp/transformers_cache",
        )
        print("Processor loaded successfully!")

        print("Creating pipeline...")
        asr_pipe = pipeline(
            "automatic-speech-recognition",
            model=model,
            tokenizer=processor.tokenizer,
            feature_extractor=processor.feature_extractor,
            torch_dtype=torch_dtype,
            device=device,
        )
        print("Pipeline created successfully!")
        return asr_pipe
    except Exception as e:
        print(f"Error loading model: {e}")
        raise


# Load model globally once at import time - reused by every request.
pipe = load_model()


def transcribe(audio_path, task="transcribe"):
    """Transcribe (or translate to English) a single audio file.

    Args:
        audio_path: Filesystem path to the audio clip, or ``None`` when the
            user submitted nothing (Gradio passes ``None`` for empty inputs).
        task: Either ``"transcribe"`` or ``"translate"``.

    Returns:
        The recognized text, or a human-readable error message string
        (errors are returned, not raised, so the UI always shows something).
    """
    if audio_path is None:
        return "Please provide an audio input."

    # Only pass the task through when translating; plain transcription is
    # the pipeline default.
    generate_kwargs = {}
    if task == "translate":
        generate_kwargs["task"] = "translate"

    # NOTE(review): audio longer than 30 s may need chunk_length_s/
    # return_timestamps for long-form decoding -- confirm against the
    # pinned transformers version.
    try:
        result = pipe(audio_path, generate_kwargs=generate_kwargs)
        return result["text"]
    except Exception as e:
        return f"Error during transcription: {str(e)}"


def yt_transcribe(youtube_url, task="transcribe"):
    """Download a YouTube video's audio track and transcribe/translate it.

    Args:
        youtube_url: Full YouTube watch URL.
        task: Either ``"transcribe"`` or ``"translate"``.

    Returns:
        A 2-tuple ``(html, text)``: an HTML snippet showing the video
        thumbnail/title/channel, and the transcription. On failure both
        elements carry error messages instead.
    """
    if not youtube_url or not youtube_url.strip():
        return "Please enter a YouTube URL", "No transcription available."

    temp_path = None
    try:
        # Grab the first audio-only stream for the video.
        yt = YouTube(youtube_url)
        stream = yt.streams.filter(only_audio=True).first()
        if stream is None:
            # Guard: some videos expose no audio-only stream at all.
            return "No audio stream found for this video.", "Transcription failed."

        # Reserve a temporary file name, then let pytube write into it.
        with tempfile.NamedTemporaryFile(suffix='.mp4', delete=False) as temp_file:
            temp_path = temp_file.name
        stream.download(filename=temp_path)

        generate_kwargs = {}
        if task == "translate":
            generate_kwargs["task"] = "translate"

        result = pipe(temp_path, generate_kwargs=generate_kwargs)

        # Build the thumbnail card. NOTE(review): this URL parsing assumes a
        # "watch?v=..." style link; youtu.be short links will produce a bad
        # video id -- confirm whether those need support.
        video_id = youtube_url.split("v=")[-1].split("&")[0]
        thumbnail_url = f"https://img.youtube.com/vi/{video_id}/0.jpg"
        # HTML reconstructed from garbled source markup: thumbnail image,
        # video title, channel name.
        html_output = (
            '<div style="text-align: center;">'
            f'<img src="{thumbnail_url}" style="max-width: 500px;">'
            f"<h3>{yt.title}</h3>"
            f"<p>Channel: {yt.author}</p>"
            "</div>"
        )
        return html_output, result["text"]
    except Exception as e:
        return f"Error processing YouTube video: {str(e)}", "Transcription failed."
    finally:
        # FIX: the temp file was previously deleted only on the success path
        # and leaked whenever download/transcription raised.
        if temp_path is not None and os.path.exists(temp_path):
            os.unlink(temp_path)


# Create Gradio interfaces.
# NOTE(review): theme="huggingface" is a legacy theme name; recent Gradio
# releases fall back to the default theme with a warning -- confirm against
# the pinned gradio version.
mic_transcribe = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Microphone(sources=["microphone"], type="filepath"),
        gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
    ],
    outputs="text",
    theme="huggingface",
    title="Whisper Large V3 Turbo: Transcribe Audio",
    description=(
        "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the OpenAI Whisper"
        f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files"
        " of arbitrary length."
    ),
    flagging_mode="never",
)

file_transcribe = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(sources=["upload"], type="filepath", label="Audio file"),
        gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
    ],
    outputs="text",
    theme="huggingface",
    title="Whisper Large V3 Turbo: Transcribe Audio",
    description=(
        "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the OpenAI Whisper"
        f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files"
        " of arbitrary length."
    ),
    flagging_mode="never",
)

yt_interface = gr.Interface(
    fn=yt_transcribe,
    inputs=[
        gr.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL"),
        gr.Radio(["transcribe", "translate"], label="Task", value="transcribe")
    ],
    outputs=["html", "text"],
    theme="huggingface",
    title="Whisper Large V3 Turbo: Transcribe YouTube",
    description=(
        "Transcribe long-form YouTube videos with the click of a button! Demo uses the OpenAI Whisper checkpoint"
        f" [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe video files of"
        " arbitrary length."
    ),
    flagging_mode="never",
)

# Create the tabbed interface wrapping the three single-task interfaces.
demo = gr.Blocks()
with demo:
    gr.TabbedInterface(
        [mic_transcribe, file_transcribe, yt_interface],
        ["Microphone", "Audio file", "YouTube"],
    )

# Launch the app; 0.0.0.0 so the server is reachable from outside the container.
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0")