# Hugging Face Space app: Whisper Large V3 Turbo transcription demo.
# (Lines below that are commented out are the previous revision of this app,
# kept for reference; the active implementation follows after it.)
| # import os | |
| # import gradio as gr | |
| # import torch | |
| # import pytube | |
| # import tempfile | |
| # from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline | |
| # from pytube import YouTube | |
| # # Set environment variables for Hugging Face Space | |
| # os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128" | |
| # # Model configuration | |
| # MODEL_NAME = "openai/whisper-large-v3-turbo" | |
| # device = "cuda:0" if torch.cuda.is_available() else "cpu" | |
| # torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32 | |
| # # Setup model and processor | |
| # def load_model(): | |
| # model = AutoModelForSpeechSeq2Seq.from_pretrained( | |
| # MODEL_NAME, | |
| # torch_dtype=torch_dtype, | |
| # low_cpu_mem_usage=True, | |
| # use_safetensors=True | |
| # ) | |
| # model.to(device) | |
| # processor = AutoProcessor.from_pretrained(MODEL_NAME) | |
| # pipe = pipeline( | |
| # "automatic-speech-recognition", | |
| # model=model, | |
| # tokenizer=processor.tokenizer, | |
| # feature_extractor=processor.feature_extractor, | |
| # torch_dtype=torch_dtype, | |
| # device=device, | |
| # ) | |
| # return pipe | |
| # # Load model globally - will be cached | |
| # pipe = load_model() | |
| # # Transcription function for audio files and microphone | |
| # def transcribe(audio_path, task="transcribe"): | |
| # if audio_path is None: | |
| # return "Please provide an audio input." | |
| # # Set task-specific generation parameters | |
| # generate_kwargs = {} | |
| # if task == "translate": | |
| # generate_kwargs["task"] = "translate" | |
| # # Process the audio | |
| # try: | |
| # result = pipe(audio_path, generate_kwargs=generate_kwargs) | |
| # return result["text"] | |
| # except Exception as e: | |
| # return f"Error during transcription: {str(e)}" | |
| # # YouTube video transcription function | |
| # def yt_transcribe(youtube_url, task="transcribe"): | |
| # if not youtube_url or not youtube_url.strip(): | |
| # return "Please enter a YouTube URL", "No transcription available." | |
| # try: | |
| # # Download audio from YouTube | |
| # yt = YouTube(youtube_url) | |
| # video = yt.streams.filter(only_audio=True).first() | |
| # # Create a temporary file to store the audio | |
| # with tempfile.NamedTemporaryFile(suffix='.mp4', delete=False) as temp_file: | |
| # temp_path = temp_file.name | |
| # # Download the audio to the temporary file | |
| # video.download(filename=temp_path) | |
| # # Set task-specific generation parameters | |
| # generate_kwargs = {} | |
| # if task == "translate": | |
| # generate_kwargs["task"] = "translate" | |
| # # Process the audio | |
| # result = pipe(temp_path, generate_kwargs=generate_kwargs) | |
| # # Create an HTML element to display video thumbnail | |
| # video_id = youtube_url.split("v=")[-1].split("&")[0] | |
| # thumbnail_url = f"https://img.youtube.com/vi/{video_id}/0.jpg" | |
| # html_output = f'<div style="display: flex; align-items: center;"><img src="{thumbnail_url}" style="max-width: 200px; margin-right: 20px;"><div><h3>{yt.title}</h3><p>Channel: {yt.author}</p></div></div>' | |
| # # Clean up the temporary file | |
| # os.unlink(temp_path) | |
| # return html_output, result["text"] | |
| # except Exception as e: | |
| # return f"Error processing YouTube video: {str(e)}", "Transcription failed." | |
| # # Create Gradio interfaces | |
| # mic_transcribe = gr.Interface( | |
| # fn=transcribe, | |
| # inputs=[ | |
| # gr.Audio(source="microphone", type="filepath", optional=True), | |
| # gr.Radio(["transcribe", "translate"], label="Task", default="transcribe"), | |
| # ], | |
| # outputs="text", | |
| # layout="horizontal", | |
| # theme="huggingface", | |
| # title="Whisper Large V3 Turbo: Transcribe Audio", | |
| # description=( | |
| # "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the OpenAI Whisper" | |
| # f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files" | |
| # " of arbitrary length." | |
| # ), | |
| # allow_flagging="never", | |
| # ) | |
| # file_transcribe = gr.Interface( | |
| # fn=transcribe, | |
| # inputs=[ | |
| # gr.Audio(source="upload", type="filepath", optional=True, label="Audio file"), | |
| # gr.Radio(["transcribe", "translate"], label="Task", default="transcribe"), | |
| # ], | |
| # outputs="text", | |
| # layout="horizontal", | |
| # theme="huggingface", | |
| # title="Whisper Large V3 Turbo: Transcribe Audio", | |
| # description=( | |
| # "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the OpenAI Whisper" | |
| # f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files" | |
| # " of arbitrary length." | |
| # ), | |
| # allow_flagging="never", | |
| # ) | |
| # yt_interface = gr.Interface( | |
| # fn=yt_transcribe, | |
| # inputs=[ | |
| # gr.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL"), | |
| # gr.Radio(["transcribe", "translate"], label="Task", default="transcribe") | |
| # ], | |
| # outputs=["html", "text"], | |
| # layout="horizontal", | |
| # theme="huggingface", | |
| # title="Whisper Large V3 Turbo: Transcribe YouTube", | |
| # description=( | |
| # "Transcribe long-form YouTube videos with the click of a button! Demo uses the OpenAI Whisper checkpoint" | |
| # f" [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe video files of" | |
| # " arbitrary length." | |
| # ), | |
| # allow_flagging="never", | |
| # ) | |
| # # Create the tabbed interface | |
| # demo = gr.Blocks() | |
| # with demo: | |
| # gr.TabbedInterface([mic_transcribe, file_transcribe, yt_interface], ["Microphone", "Audio file", "YouTube"]) | |
| # # Launch the app | |
| # if __name__ == "__main__": | |
| # demo.launch(enable_queue=True) | |
| import os | |
| import gradio as gr | |
| import torch | |
| import pytube | |
| import tempfile | |
| import shutil | |
| from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline | |
| from pytube import YouTube | |
# Redirect Hugging Face / matplotlib caches to /tmp, which is writable in a
# Hugging Face Space container, and cap CUDA allocator split size.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"

# env var -> directory the library should use for its cache
_CACHE_DIRS = {
    "TRANSFORMERS_CACHE": "/tmp/transformers_cache",
    "HF_HOME": "/tmp/hf_home",
    "HUGGINGFACE_HUB_CACHE": "/tmp/hf_hub_cache",
    "MPLCONFIGDIR": "/tmp/matplotlib_config",
}
for _env_var, _cache_path in _CACHE_DIRS.items():
    os.environ[_env_var] = _cache_path
    # Create each cache directory up front so the libraries never fail on a
    # missing/read-only default location.
    os.makedirs(_cache_path, exist_ok=True)
# Model configuration
MODEL_NAME = "openai/whisper-large-v3-turbo"

# Pick GPU + half precision when CUDA is present, otherwise CPU + float32.
_HAS_CUDA = torch.cuda.is_available()
device = "cuda:0" if _HAS_CUDA else "cpu"
torch_dtype = torch.float16 if _HAS_CUDA else torch.float32

# Print environment info for debugging
print(f"CUDA available: {_HAS_CUDA}")
if _HAS_CUDA:
    print(f"CUDA device name: {torch.cuda.get_device_name(0)}")
print(f"Using device: {device}")
print(f"Using dtype: {torch_dtype}")
# Setup model and processor
def load_model():
    """Load the Whisper checkpoint + processor and wrap them in an ASR pipeline.

    Returns the ready-to-use ``transformers`` automatic-speech-recognition
    pipeline. Any failure is logged and re-raised so the Space startup fails
    loudly rather than serving a broken app.
    """
    cache = "/tmp/transformers_cache"
    try:
        print("Loading model...")
        # .to(device) returns the module itself, so the chained form is
        # equivalent to assigning then moving.
        asr_model = AutoModelForSpeechSeq2Seq.from_pretrained(
            MODEL_NAME,
            torch_dtype=torch_dtype,
            low_cpu_mem_usage=True,
            use_safetensors=True,
            cache_dir=cache,
        ).to(device)
        print("Model loaded successfully!")

        print("Loading processor...")
        asr_processor = AutoProcessor.from_pretrained(MODEL_NAME, cache_dir=cache)
        print("Processor loaded successfully!")

        print("Creating pipeline...")
        asr_pipe = pipeline(
            "automatic-speech-recognition",
            model=asr_model,
            tokenizer=asr_processor.tokenizer,
            feature_extractor=asr_processor.feature_extractor,
            torch_dtype=torch_dtype,
            device=device,
        )
        print("Pipeline created successfully!")
        return asr_pipe
    except Exception as exc:
        print(f"Error loading model: {exc}")
        raise

# Load model globally - will be cached
pipe = load_model()
# Transcription function for audio files and microphone
def transcribe(audio_path, task="transcribe"):
    """Transcribe (or translate to English) the audio file at *audio_path*.

    Args:
        audio_path: Filesystem path to the audio clip, as supplied by the
            gradio Audio/Microphone component (``type="filepath"``).
        task: ``"transcribe"`` (default) or ``"translate"``.

    Returns:
        The recognized text, or a human-readable error message string
        (this function never raises — errors are shown in the UI).
    """
    # Gradio may hand us None OR an empty string when no audio was provided;
    # the original `is None` check missed the empty-string case.
    if not audio_path:
        return "Please provide an audio input."

    # Whisper only needs an explicit task hint for translation.
    generate_kwargs = {"task": "translate"} if task == "translate" else {}

    try:
        result = pipe(audio_path, generate_kwargs=generate_kwargs)
        return result["text"]
    except Exception as e:
        return f"Error during transcription: {str(e)}"
# YouTube video transcription function
def yt_transcribe(youtube_url, task="transcribe"):
    """Download a YouTube video's audio track and transcribe/translate it.

    Args:
        youtube_url: URL of the video (``watch?v=`` or ``youtu.be`` form).
        task: ``"transcribe"`` (default) or ``"translate"``.

    Returns:
        A ``(html, text)`` pair: an HTML snippet with the video thumbnail and
        title, and the transcription. On failure returns an error message in
        place of the HTML and ``"Transcription failed."`` as the text.
    """
    if not youtube_url or not youtube_url.strip():
        return "Please enter a YouTube URL", "No transcription available."

    temp_path = None
    try:
        # Download audio from YouTube
        yt = YouTube(youtube_url)
        stream = yt.streams.filter(only_audio=True).first()
        if stream is None:
            return "No audio stream found for this video.", "Transcription failed."

        # Create a temporary file to hold the downloaded audio.
        with tempfile.NamedTemporaryFile(suffix='.mp4', delete=False) as temp_file:
            temp_path = temp_file.name
        stream.download(filename=temp_path)

        # Whisper only needs an explicit task hint for translation.
        generate_kwargs = {"task": "translate"} if task == "translate" else {}
        result = pipe(temp_path, generate_kwargs=generate_kwargs)

        # Extract the video id for the thumbnail; the original split-on-"v="
        # broke for youtu.be short links.
        if "youtu.be/" in youtube_url:
            video_id = youtube_url.split("youtu.be/")[-1].split("?")[0]
        else:
            video_id = youtube_url.split("v=")[-1].split("&")[0]
        thumbnail_url = f"https://img.youtube.com/vi/{video_id}/0.jpg"
        html_output = f'<div style="display: flex; align-items: center;"><img src="{thumbnail_url}" style="max-width: 200px; margin-right: 20px;"><div><h3>{yt.title}</h3><p>Channel: {yt.author}</p></div></div>'

        return html_output, result["text"]
    except Exception as e:
        return f"Error processing YouTube video: {str(e)}", "Transcription failed."
    finally:
        # The original only unlinked on success, leaking the temp file on any
        # failure after download; clean up on every path.
        if temp_path and os.path.exists(temp_path):
            os.unlink(temp_path)
# Create Gradio interfaces
# Microphone tab: record audio in the browser and transcribe/translate it.
_mic_description = (
    "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the OpenAI Whisper"
    f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files"
    " of arbitrary length."
)
mic_transcribe = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Microphone(sources=["microphone"], type="filepath"),
        gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
    ],
    outputs="text",
    title="Whisper Large V3 Turbo: Transcribe Audio",
    description=_mic_description,
    theme="huggingface",
    flagging_mode="never",
)
# Upload tab: transcribe/translate an uploaded audio file.
_file_description = (
    "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the OpenAI Whisper"
    f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files"
    " of arbitrary length."
)
file_transcribe = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(sources=["upload"], type="filepath", label="Audio file"),
        gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
    ],
    outputs="text",
    title="Whisper Large V3 Turbo: Transcribe Audio",
    description=_file_description,
    theme="huggingface",
    flagging_mode="never",
)
# YouTube tab: paste a URL, get thumbnail HTML plus the transcription.
_yt_description = (
    "Transcribe long-form YouTube videos with the click of a button! Demo uses the OpenAI Whisper checkpoint"
    f" [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe video files of"
    " arbitrary length."
)
yt_interface = gr.Interface(
    fn=yt_transcribe,
    inputs=[
        gr.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL"),
        gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
    ],
    outputs=["html", "text"],
    title="Whisper Large V3 Turbo: Transcribe YouTube",
    description=_yt_description,
    theme="huggingface",
    flagging_mode="never",
)
# Create the tabbed interface. gr.TabbedInterface is itself a Blocks app, so
# it can serve as the demo directly — wrapping it inside an extra empty
# gr.Blocks() context (as before) is redundant.
demo = gr.TabbedInterface(
    [mic_transcribe, file_transcribe, yt_interface],
    ["Microphone", "Audio file", "YouTube"],
)

# Launch the app (bind to all interfaces so the Space proxy can reach it).
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0")