# whis / app.py — Hugging Face Space by Pavithiran (commit ce06f8a)
# import os
# import gradio as gr
# import torch
# import pytube
# import tempfile
# from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
# from pytube import YouTube
# # Set environment variables for Hugging Face Space
# os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"
# # Model configuration
# MODEL_NAME = "openai/whisper-large-v3-turbo"
# device = "cuda:0" if torch.cuda.is_available() else "cpu"
# torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
# # Setup model and processor
# def load_model():
# model = AutoModelForSpeechSeq2Seq.from_pretrained(
# MODEL_NAME,
# torch_dtype=torch_dtype,
# low_cpu_mem_usage=True,
# use_safetensors=True
# )
# model.to(device)
# processor = AutoProcessor.from_pretrained(MODEL_NAME)
# pipe = pipeline(
# "automatic-speech-recognition",
# model=model,
# tokenizer=processor.tokenizer,
# feature_extractor=processor.feature_extractor,
# torch_dtype=torch_dtype,
# device=device,
# )
# return pipe
# # Load model globally - will be cached
# pipe = load_model()
# # Transcription function for audio files and microphone
# def transcribe(audio_path, task="transcribe"):
# if audio_path is None:
# return "Please provide an audio input."
# # Set task-specific generation parameters
# generate_kwargs = {}
# if task == "translate":
# generate_kwargs["task"] = "translate"
# # Process the audio
# try:
# result = pipe(audio_path, generate_kwargs=generate_kwargs)
# return result["text"]
# except Exception as e:
# return f"Error during transcription: {str(e)}"
# # YouTube video transcription function
# def yt_transcribe(youtube_url, task="transcribe"):
# if not youtube_url or not youtube_url.strip():
# return "Please enter a YouTube URL", "No transcription available."
# try:
# # Download audio from YouTube
# yt = YouTube(youtube_url)
# video = yt.streams.filter(only_audio=True).first()
# # Create a temporary file to store the audio
# with tempfile.NamedTemporaryFile(suffix='.mp4', delete=False) as temp_file:
# temp_path = temp_file.name
# # Download the audio to the temporary file
# video.download(filename=temp_path)
# # Set task-specific generation parameters
# generate_kwargs = {}
# if task == "translate":
# generate_kwargs["task"] = "translate"
# # Process the audio
# result = pipe(temp_path, generate_kwargs=generate_kwargs)
# # Create an HTML element to display video thumbnail
# video_id = youtube_url.split("v=")[-1].split("&")[0]
# thumbnail_url = f"https://img.youtube.com/vi/{video_id}/0.jpg"
# html_output = f'<div style="display: flex; align-items: center;"><img src="{thumbnail_url}" style="max-width: 200px; margin-right: 20px;"><div><h3>{yt.title}</h3><p>Channel: {yt.author}</p></div></div>'
# # Clean up the temporary file
# os.unlink(temp_path)
# return html_output, result["text"]
# except Exception as e:
# return f"Error processing YouTube video: {str(e)}", "Transcription failed."
# # Create Gradio interfaces
# mic_transcribe = gr.Interface(
# fn=transcribe,
# inputs=[
# gr.Audio(source="microphone", type="filepath", optional=True),
# gr.Radio(["transcribe", "translate"], label="Task", default="transcribe"),
# ],
# outputs="text",
# layout="horizontal",
# theme="huggingface",
# title="Whisper Large V3 Turbo: Transcribe Audio",
# description=(
# "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the OpenAI Whisper"
# f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files"
# " of arbitrary length."
# ),
# allow_flagging="never",
# )
# file_transcribe = gr.Interface(
# fn=transcribe,
# inputs=[
# gr.Audio(source="upload", type="filepath", optional=True, label="Audio file"),
# gr.Radio(["transcribe", "translate"], label="Task", default="transcribe"),
# ],
# outputs="text",
# layout="horizontal",
# theme="huggingface",
# title="Whisper Large V3 Turbo: Transcribe Audio",
# description=(
# "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the OpenAI Whisper"
# f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files"
# " of arbitrary length."
# ),
# allow_flagging="never",
# )
# yt_interface = gr.Interface(
# fn=yt_transcribe,
# inputs=[
# gr.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL"),
# gr.Radio(["transcribe", "translate"], label="Task", default="transcribe")
# ],
# outputs=["html", "text"],
# layout="horizontal",
# theme="huggingface",
# title="Whisper Large V3 Turbo: Transcribe YouTube",
# description=(
# "Transcribe long-form YouTube videos with the click of a button! Demo uses the OpenAI Whisper checkpoint"
# f" [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe video files of"
# " arbitrary length."
# ),
# allow_flagging="never",
# )
# # Create the tabbed interface
# demo = gr.Blocks()
# with demo:
# gr.TabbedInterface([mic_transcribe, file_transcribe, yt_interface], ["Microphone", "Audio file", "YouTube"])
# # Launch the app
# if __name__ == "__main__":
# demo.launch(enable_queue=True)
import html
import os
import shutil
import tempfile

import gradio as gr
import pytube
import torch
from pytube import YouTube
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
# Set environment variables for Hugging Face Space
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"

# Redirect every HF/matplotlib cache into /tmp (writable inside a Space
# container) and make sure each directory exists.
_CACHE_DIRS = {
    "TRANSFORMERS_CACHE": "/tmp/transformers_cache",
    "HF_HOME": "/tmp/hf_home",
    "HUGGINGFACE_HUB_CACHE": "/tmp/hf_hub_cache",
    "MPLCONFIGDIR": "/tmp/matplotlib_config",
}
for _env_name, _cache_path in _CACHE_DIRS.items():
    os.environ[_env_name] = _cache_path
    os.makedirs(_cache_path, exist_ok=True)

# Model configuration: fp16 on GPU, fp32 on CPU.
MODEL_NAME = "openai/whisper-large-v3-turbo"
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# Print environment info for debugging
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA device name: {torch.cuda.get_device_name(0)}")
print(f"Using device: {device}")
print(f"Using dtype: {torch_dtype}")
# Setup model and processor
def load_model():
    """Build and return the Whisper ASR pipeline on the configured device.

    Loads the seq2seq model and its processor from MODEL_NAME (cached under
    /tmp/transformers_cache), then wires both into a transformers
    automatic-speech-recognition pipeline. Raises on any load failure after
    logging it, so startup fails loudly in the Space logs.
    """
    try:
        print("Loading model...")
        asr_model = AutoModelForSpeechSeq2Seq.from_pretrained(
            MODEL_NAME,
            torch_dtype=torch_dtype,
            low_cpu_mem_usage=True,
            use_safetensors=True,
            cache_dir="/tmp/transformers_cache",
        )
        asr_model.to(device)
        print("Model loaded successfully!")

        print("Loading processor...")
        asr_processor = AutoProcessor.from_pretrained(
            MODEL_NAME,
            cache_dir="/tmp/transformers_cache",
        )
        print("Processor loaded successfully!")

        print("Creating pipeline...")
        asr_pipe = pipeline(
            "automatic-speech-recognition",
            model=asr_model,
            tokenizer=asr_processor.tokenizer,
            feature_extractor=asr_processor.feature_extractor,
            torch_dtype=torch_dtype,
            device=device,
        )
        print("Pipeline created successfully!")
        return asr_pipe
    except Exception as e:
        # Surface the failure in the Space logs before aborting startup.
        print(f"Error loading model: {e}")
        raise
# Load the ASR pipeline once at import time — the Space process stays alive,
# so the download/initialization cost is paid only on startup.
pipe = load_model()
# Transcription function for audio files and microphone
def transcribe(audio_path, task="transcribe"):
    """Transcribe (or translate to English) a local audio file with Whisper.

    Returns the transcript text, or a human-readable error string for the UI
    when the input is missing or the pipeline fails.
    """
    if audio_path is None:
        return "Please provide an audio input."

    # Whisper switches to English translation only when task == "translate".
    generate_kwargs = {"task": "translate"} if task == "translate" else {}

    try:
        result = pipe(audio_path, generate_kwargs=generate_kwargs)
        return result["text"]
    except Exception as e:
        # Report the failure to the UI instead of crashing the demo.
        return f"Error during transcription: {str(e)}"
# YouTube video transcription function
def yt_transcribe(youtube_url, task="transcribe"):
    """Download a YouTube video's audio track and transcribe/translate it.

    Returns a (html, text) pair: an HTML snippet showing the video thumbnail,
    title and channel, plus the transcript. On any failure both elements are
    human-readable error strings, matching the Gradio outputs=["html", "text"].
    """
    if not youtube_url or not youtube_url.strip():
        return "Please enter a YouTube URL", "No transcription available."
    temp_path = None
    try:
        # Download audio from YouTube
        yt = YouTube(youtube_url)
        stream = yt.streams.filter(only_audio=True).first()
        if stream is None:
            # .first() returns None when no audio-only stream exists; fail
            # with a clear message instead of an AttributeError.
            return "No audio stream found for this video.", "Transcription failed."
        # Create a temporary file to store the audio
        with tempfile.NamedTemporaryFile(suffix='.mp4', delete=False) as temp_file:
            temp_path = temp_file.name
        # Download the audio to the temporary file
        stream.download(filename=temp_path)
        # Set task-specific generation parameters
        generate_kwargs = {}
        if task == "translate":
            generate_kwargs["task"] = "translate"
        # Process the audio
        result = pipe(temp_path, generate_kwargs=generate_kwargs)
        # Create an HTML element to display the video thumbnail.
        video_id = youtube_url.split("v=")[-1].split("&")[0]
        thumbnail_url = f"https://img.youtube.com/vi/{video_id}/0.jpg"
        # Escape title/author — they are external data rendered into HTML.
        title = html.escape(yt.title or "")
        author = html.escape(yt.author or "")
        html_output = f'<div style="display: flex; align-items: center;"><img src="{thumbnail_url}" style="max-width: 200px; margin-right: 20px;"><div><h3>{title}</h3><p>Channel: {author}</p></div></div>'
        return html_output, result["text"]
    except Exception as e:
        return f"Error processing YouTube video: {str(e)}", "Transcription failed."
    finally:
        # Always remove the downloaded temp file, even when download or
        # transcription raised (the original leaked it on every failure).
        if temp_path is not None and os.path.exists(temp_path):
            os.unlink(temp_path)
# Gradio tab: transcribe/translate audio recorded from the microphone.
_mic_input = gr.Microphone(sources=["microphone"], type="filepath")
_mic_task = gr.Radio(["transcribe", "translate"], label="Task", value="transcribe")
mic_transcribe = gr.Interface(
    fn=transcribe,
    inputs=[_mic_input, _mic_task],
    outputs="text",
    theme="huggingface",
    title="Whisper Large V3 Turbo: Transcribe Audio",
    description=(
        "Transcribe long-form microphone or audio inputs with the click of a button! "
        "Demo uses the OpenAI Whisper checkpoint "
        f"[{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) "
        "and 🤗 Transformers to transcribe audio files of arbitrary length."
    ),
    flagging_mode="never",
)
# Gradio tab: transcribe/translate an uploaded audio file.
_file_input = gr.Audio(sources=["upload"], type="filepath", label="Audio file")
_file_task = gr.Radio(["transcribe", "translate"], label="Task", value="transcribe")
file_transcribe = gr.Interface(
    fn=transcribe,
    inputs=[_file_input, _file_task],
    outputs="text",
    theme="huggingface",
    title="Whisper Large V3 Turbo: Transcribe Audio",
    description=(
        "Transcribe long-form microphone or audio inputs with the click of a button! "
        "Demo uses the OpenAI Whisper checkpoint "
        f"[{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) "
        "and 🤗 Transformers to transcribe audio files of arbitrary length."
    ),
    flagging_mode="never",
)
# Gradio tab: transcribe/translate the audio track of a YouTube video.
_yt_url = gr.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL")
_yt_task = gr.Radio(["transcribe", "translate"], label="Task", value="transcribe")
yt_interface = gr.Interface(
    fn=yt_transcribe,
    inputs=[_yt_url, _yt_task],
    outputs=["html", "text"],
    theme="huggingface",
    title="Whisper Large V3 Turbo: Transcribe YouTube",
    description=(
        "Transcribe long-form YouTube videos with the click of a button! "
        "Demo uses the OpenAI Whisper checkpoint "
        f"[{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) "
        "and 🤗 Transformers to transcribe video files of arbitrary length."
    ),
    flagging_mode="never",
)
# Assemble the tabbed interface: microphone, file upload, and YouTube tabs.
with gr.Blocks() as demo:
    gr.TabbedInterface(
        [mic_transcribe, file_transcribe, yt_interface],
        ["Microphone", "Audio file", "YouTube"],
    )

# Launch the app
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0")