import modal
import os
import tempfile
# Define the Modal image
whisper_image = (
    modal.Image.debian_slim(python_version="3.10")
    .apt_install("ffmpeg")
    .pip_install(
        "transformers[torch]",
        "accelerate",
        "soundfile",
        "moviepy<2",  # Audio extraction from video; pinned below 2.0, which removed moviepy.editor
        "huggingface_hub",
        "ffmpeg-python",
    )
)
app = modal.App(name="whisper-transcriber") # Changed from modal.Stub to modal.App
# Environment variable for model name, configurable in Modal UI or via .env
MODEL_NAME = os.environ.get("HF_MODEL_NAME", "openai/whisper-base")
# Hugging Face token, supplied via a Modal Secret.
# IMPORTANT: Create a Modal Secret named 'my-huggingface-secret' containing your actual HF_TOKEN.
# Example: modal secret create my-huggingface-secret HF_TOKEN=your_hf_token_here
HF_TOKEN_SECRET = modal.Secret.from_name("my-huggingface-secret")
# Note: no gpu= argument is requested here, so the function runs on CPU;
# add e.g. gpu="T4" to @app.function for GPU inference.
@app.function(
    image=whisper_image,
    secrets=[HF_TOKEN_SECRET],
    timeout=1200,
)
def transcribe_video_audio(video_bytes: bytes) -> str:
    # Imports moved inside the function to avoid a local ModuleNotFoundError during `modal deploy`
    from moviepy.editor import VideoFileClip
    import soundfile as sf
    import torch
    from transformers import pipeline
    from huggingface_hub import login

    if not video_bytes:
        return "Error: No video data received."
    # Log in to the Hugging Face Hub using the token from the Modal secret
    hf_token = os.environ.get("HF_TOKEN")
    if hf_token:
        try:
            login(token=hf_token)
            print("Successfully logged into Hugging Face Hub.")
        except Exception as e:
            print(f"Hugging Face Hub login failed: {e}. Proceeding, but private models may not be accessible.")
    else:
        print("HF_TOKEN secret not found. Proceeding without login (works for public models).")
print(f"Processing video for transcription using model: {MODEL_NAME}")
# Initialize pipeline inside the function.
# For production/frequent use, consider @stub.cls to load the model once per container lifecycle.
print("Loading Whisper model...")
device_map = "cuda:0" if torch.cuda.is_available() else "cpu"
# Use float16 for GPU for faster inference and less memory, float32 for CPU
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
transcriber = pipeline(
"automatic-speech-recognition",
model=MODEL_NAME,
torch_dtype=torch_dtype,
device=device_map,
)
print(f"Whisper model loaded on device: {device_map} with dtype: {torch_dtype}")
    video_path = None
    audio_path = None
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as tmp_video_file:
            tmp_video_file.write(video_bytes)
            video_path = tmp_video_file.name
        print(f"Temporary video file saved: {video_path}")

        print("Extracting audio from video...")
        video_clip = VideoFileClip(video_path)
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_audio_file:
            audio_path = tmp_audio_file.name
            # Write 16 kHz PCM audio; Whisper's feature extractor expects a 16 kHz sampling rate
            video_clip.audio.write_audiofile(audio_path, fps=16000, codec='pcm_s16le', logger=None)
        video_clip.close()
        print(f"Audio extracted to: {audio_path}")

        audio_input, samplerate = sf.read(audio_path)
        if audio_input.ndim > 1:
            audio_input = audio_input.mean(axis=1)  # Average channels down to mono
        print(f"Audio data shape: {audio_input.shape}, Samplerate: {samplerate}")
print("Starting transcription...")
# Pass audio as a dictionary for more control, or directly as numpy array
# Adding chunk_length_s for handling long audio files better.
result = transcriber(audio_input.copy(), chunk_length_s=30, batch_size=8, return_timestamps=False)
transcribed_text = result["text"]
print(f"Transcription successful. Length: {len(transcribed_text)}")
if len(transcribed_text) > 100:
print(f"Transcription preview: {transcribed_text[:100]}...")
else:
print(f"Transcription: {transcribed_text}")
return transcribed_text
    except Exception as e:
        print(f"Error during transcription process: {e}")
        import traceback
        traceback.print_exc()
        return f"Error: Transcription failed. Details: {str(e)}"
    finally:
        for p in [video_path, audio_path]:
            if p and os.path.exists(p):
                try:
                    os.remove(p)
                    print(f"Removed temporary file: {p}")
                except Exception as e_rm:
                    print(f"Error removing temporary file {p}: {e_rm}")
# Local entrypoint for testing the Modal function via `modal run modal_whisper_app.py`
@app.local_entrypoint()
def main():
    # Example of a local test run. It needs a sample video file (e.g., "sample.mp4")
    # in the same directory. For actual deployment this entrypoint isn't strictly
    # necessary, since Gradio will call the deployed function instead.
    sample_video_path = "sample.mp4"
    if not os.path.exists(sample_video_path):
        print(f"Sample video {sample_video_path} not found. Skipping local test run.")
        return

    with open(sample_video_path, "rb") as f:
        video_bytes_content = f.read()

    print(f"Testing transcription with {sample_video_path}...")
    transcription = transcribe_video_audio.remote(video_bytes_content)
    print("----")
    print(f"Transcription Result: {transcription}")
    print("----")
# To call this function from another Python script after deployment
# (`modal deploy modal_whisper_app.py`):
# - The app name must match the one passed to modal.App(name=...) above.
# - Look the function up by app name and function name; the exact lookup helper
#   varies with the Modal version (modal.Function.lookup in older releases,
#   modal.Function.from_name in newer ones), so consult the Modal documentation
#   for your version. A sketch follows below.
# - .remote() is a blocking call that returns the result; use .spawn() to
#   invoke without waiting for it.
# Note: `modal serve` runs the app in development mode with live reload, while
# `modal deploy` publishes it. The Gradio app calls the deployed Modal function
# via this lookup mechanism or via an exposed HTTP endpoint.
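#
# A sketch of the invocation, assuming a deployed "whisper-transcriber" app and the
# modal.Function.lookup helper ("some_video.mp4" is a placeholder path):
#
#   import modal
#
#   transcribe = modal.Function.lookup("whisper-transcriber", "transcribe_video_audio")
#   with open("some_video.mp4", "rb") as f:
#       text = transcribe.remote(f.read())  # blocks until transcription completes
#   print(text)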