Whisper-FastAPI / main.py
dahyedahye's picture
Add application file
6bac6fb
raw
history blame
4.18 kB
import logging
import os
import shutil
import tempfile
from typing import Optional

from fastapi import FastAPI, File, UploadFile, Form
from fastapi.responses import FileResponse, JSONResponse

from modules.whisper.whisper_factory import WhisperFactory
from modules.whisper.whisper_parameter import WhisperParameters
# ASGI application object; route handlers below register themselves on it.
app = FastAPI()

# Build the Whisper inference engine once, at import time, so that every
# request handled by this module reuses the same instance.
# `whisper_type` selects the backend; the accepted values are "whisper",
# "faster-whisper" and "insanely-fast-whisper".
whisper_inf = WhisperFactory.create_whisper_inference(
    whisper_type="faster-whisper",
    whisper_model_dir=os.path.join("models", "Whisper"),
    faster_whisper_model_dir=os.path.join(
        "models", "Whisper", "faster-whisper"
    ),
    insanely_fast_whisper_model_dir=os.path.join(
        "models", "Whisper", "insanely-fast-whisper"
    ),
    output_dir="outputs",
)
def _safe_upload_name(filename: Optional[str]) -> str:
    """Reduce a client-supplied upload name to a bare file name.

    The name sent by the client is untrusted: it may be missing, absolute,
    or contain ``..`` components. Only the final path component is kept so
    the result can be joined onto a server-side directory without escaping
    it.

    Args:
        filename: The raw file name from the upload, or ``None``.

    Returns:
        The last path component of ``filename``, or ``"upload"`` when that
        component is empty, ``"."`` or ``".."``.
    """
    # Treat both separator styles as separators so that a name such as
    # "..\\..\\evil" is reduced the same way on every platform.
    candidate = os.path.basename((filename or "").replace("\\", "/"))
    if candidate in ("", ".", ".."):
        return "upload"
    return candidate


@app.post("/transcribe/")
async def transcribe_video(
    file: UploadFile = File(...),
    model_size: str = Form("large-v2"),
    language: str = Form("en"),
    translate: bool = Form(False),
    file_format: str = Form("SRT"),  # Options: "SRT", "WebVTT", "txt"
    add_timestamp: bool = Form(True)
):
    """
    Upload a video/audio file and get the generated subtitle file as a response.

    The upload is written to a per-request directory under ``temp/``, passed
    to ``whisper_inf.transcribe_file`` and the first file it reports is sent
    back as a download. The per-request directory is always removed before
    the handler returns.

    Args:
        file: The uploaded media file.
        model_size: Forwarded to ``WhisperParameters`` as ``model_size``.
        language: Forwarded to ``WhisperParameters`` as ``lang``.
        translate: Forwarded to ``WhisperParameters`` as ``is_translate``.
        file_format: Output format passed to ``transcribe_file``.
        add_timestamp: Passed through to ``transcribe_file`` unchanged.

    Returns:
        A ``FileResponse`` for the first result file on success, otherwise a
        ``JSONResponse`` with status 500 and a ``message`` field.
    """
    # Initialised before the try block so the cleanup in `finally` never
    # touches an unbound name when an early step fails.
    work_dir = None
    try:
        # Create temporary directories
        temp_dir = "temp"
        os.makedirs(temp_dir, exist_ok=True)

        # Each request gets its own directory: concurrent uploads that share
        # a file name can no longer overwrite or delete each other's input.
        work_dir = tempfile.mkdtemp(dir=temp_dir)

        # Save the uploaded file temporarily. The client-supplied name is
        # reduced to its base name so it cannot point outside work_dir.
        input_file_path = os.path.join(work_dir, _safe_upload_name(file.filename))
        with open(input_file_path, "wb") as buffer:
            shutil.copyfileobj(file.file, buffer)

        # Prepare whisper parameters
        whisper_params = WhisperParameters(
            model_size=model_size,
            lang=language,
            is_translate=translate,
            beam_size=5,
            log_prob_threshold=-1.0,
            no_speech_threshold=0.6,
            compute_type="float16",  # or "int8_float16", etc.
            best_of=5,
            patience=1.0,
            condition_on_previous_text=True,
            initial_prompt=None,
            temperature=0.0,
            compression_ratio_threshold=2.4,
            vad_filter=False,
            threshold=0.5,
            min_speech_duration_ms=250,
            max_speech_duration_s=9999,
            min_silence_duration_ms=2000,
            speech_pad_ms=400,
            chunk_length_s=None,
            batch_size=None,
            is_diarize=False,
            hf_token=None,
            diarization_device=None,
            length_penalty=1.0,
            repetition_penalty=1.0,
            no_repeat_ngram_size=0,
            prefix=None,
            suppress_blank=True,
            suppress_tokens="[-1]",
            max_initial_timestamp=1.0,
            word_timestamps=False,
            prepend_punctuations="\"'“¿([{-",
            append_punctuations="\"'.。,,!!??::”)]}、",
            max_new_tokens=None,
            chunk_length=None,
            hallucination_silence_threshold=None,
            hotwords=None,
            language_detection_threshold=None,
            language_detection_segments=1,
            prompt_reset_on_temperature=0.5
        )

        # Transcribe the file. The four leading positional arguments are
        # followed by the whisper parameters expanded as individual arguments.
        # NOTE(review): this is a synchronous call inside an async handler,
        # so the event loop is occupied until it returns — consider
        # offloading it to a worker thread.
        result_str, result_files = whisper_inf.transcribe_file(
            input_file_path,
            "",
            file_format,
            add_timestamp,
            *whisper_params.as_list()
        )

        # Check if transcription was successful
        if not result_files:
            return JSONResponse(status_code=500, content={"message": "Transcription failed."})

        # Return the first result file
        output_file_path = result_files[0]
        return FileResponse(
            path=output_file_path,
            filename=os.path.basename(output_file_path),
            media_type='application/octet-stream'
        )
    except Exception as e:
        # Endpoint boundary: keep the JSON error contract for the client,
        # but record the traceback server-side instead of discarding it.
        logging.getLogger(__name__).exception("Transcription request failed")
        return JSONResponse(status_code=500, content={"message": str(e)})
    finally:
        # Clean up temporary files. Best-effort: a failed removal must not
        # replace the response that is already being returned.
        if work_dir is not None:
            shutil.rmtree(work_dir, ignore_errors=True)