# Whisper-FastAPI / main.py
# Commit d6817dc by dahyedahye: "Add application file"
import os
import shutil
from fastapi import FastAPI, File, UploadFile, Form
from fastapi.responses import FileResponse, JSONResponse
from typing import Optional
from modules.whisper.whisper_factory import WhisperFactory
from modules.whisper.whisper_parameter import WhisperParameters
# ASGI application object; the route handler below registers against it.
app = FastAPI()

# All model directories share the models/Whisper prefix; the alternative
# backends each get a sub-folder beneath it.
_WHISPER_MODEL_ROOT = os.path.join("models", "Whisper")

# Build the inference engine once, at import time.
whisper_inf = WhisperFactory.create_whisper_inference(
    # Backend selector: "whisper", "faster-whisper" or "insanely-fast-whisper".
    whisper_type="faster-whisper",
    whisper_model_dir=_WHISPER_MODEL_ROOT,
    faster_whisper_model_dir=os.path.join(_WHISPER_MODEL_ROOT, "faster-whisper"),
    insanely_fast_whisper_model_dir=os.path.join(
        _WHISPER_MODEL_ROOT, "insanely-fast-whisper"
    ),
    output_dir="outputs",
)
@app.post("/transcribe/")
async def transcribe_video(
    file: UploadFile = File(...),
    model_size: str = Form("large-v2"),
    language: str = Form("en"),
    translate: bool = Form(False),
    file_format: str = Form("SRT"),  # Options: "SRT", "WebVTT", "txt"
    add_timestamp: bool = Form(True),
):
    """
    Upload a video/audio file and get the generated subtitle file as a response.

    Args:
        file: Uploaded media file. It is written to the local ``temp``
            directory for the duration of the request and removed afterwards.
        model_size: Forwarded to ``WhisperParameters`` as ``model_size``.
        language: Forwarded to ``WhisperParameters`` as ``lang``.
        translate: Forwarded to ``WhisperParameters`` as ``is_translate``.
        file_format: Subtitle format passed to ``transcribe_file``
            ("SRT", "WebVTT" or "txt").
        add_timestamp: Passed through to ``transcribe_file`` unchanged.

    Returns:
        A ``FileResponse`` serving the first file reported by
        ``transcribe_file`` as ``application/octet-stream``. A ``JSONResponse``
        is returned instead with status 400 when the upload has no usable file
        name, or status 500 when no result file is produced or any exception
        is raised while handling the request.
    """
    # The file name is client-controlled. Keep only its final path component so
    # values such as "../../x" or "/etc/x" cannot escape the temp directory.
    # Backslashes are normalised first so Windows-style paths are reduced too.
    upload_name = os.path.basename((file.filename or "").replace("\\", "/"))
    if upload_name in ("", ".", ".."):
        return JSONResponse(
            status_code=400,
            content={"message": "A valid file name is required."},
        )

    # Bound before the try block so the cleanup in `finally` never touches an
    # unassigned name when an early step (e.g. makedirs) fails.
    input_file_path = None
    try:
        # Create temporary directory
        temp_dir = "temp"
        os.makedirs(temp_dir, exist_ok=True)

        # Save the uploaded file temporarily
        input_file_path = os.path.join(temp_dir, upload_name)
        with open(input_file_path, "wb") as buffer:
            shutil.copyfileobj(file.file, buffer)

        # Prepare whisper parameters
        whisper_params = WhisperParameters(
            model_size=model_size,
            lang=language,
            is_translate=translate,
            beam_size=5,
            log_prob_threshold=-1.0,
            no_speech_threshold=0.6,
            compute_type="float16",  # or "int8_float16", etc.
            best_of=5,
            patience=1.0,
            condition_on_previous_text=True,
            initial_prompt=None,
            temperature=0.0,
            compression_ratio_threshold=2.4,
            vad_filter=False,
            threshold=0.5,
            min_speech_duration_ms=250,
            max_speech_duration_s=9999,
            min_silence_duration_ms=2000,
            speech_pad_ms=400,
            chunk_length_s=None,
            batch_size=None,
            is_diarize=False,
            hf_token=None,
            diarization_device=None,
            length_penalty=1.0,
            repetition_penalty=1.0,
            no_repeat_ngram_size=0,
            prefix=None,
            suppress_blank=True,
            suppress_tokens="[-1]",
            max_initial_timestamp=1.0,
            word_timestamps=False,
            prepend_punctuations="\"'“¿([{-",
            append_punctuations="\"'.。,,!!??::”)]}、",
            max_new_tokens=None,
            chunk_length=None,
            hallucination_silence_threshold=None,
            hotwords=None,
            language_detection_threshold=None,
            language_detection_segments=1,
            prompt_reset_on_temperature=0.5,
        )

        # Transcribe the file.
        # NOTE(review): the values unpacked from as_list() are bound positionally
        # *before* the keyword arguments are applied. If transcribe_file declares
        # files / input_folder_path / ... as its leading positional parameters,
        # this call raises TypeError ("multiple values for argument") — confirm
        # against the transcribe_file signature.
        _, result_files = whisper_inf.transcribe_file(
            files=[input_file_path],
            input_folder_path="",
            file_format=file_format,
            add_timestamp=add_timestamp,
            *whisper_params.as_list()  # Expand whisper_params as individual arguments
        )

        # Check if transcription was successful
        if not result_files:
            return JSONResponse(status_code=500, content={"message": "Transcription failed."})

        # Return the first result file
        output_file_path = result_files[0]
        return FileResponse(
            path=output_file_path,
            filename=os.path.basename(output_file_path),
            media_type='application/octet-stream'
        )
    except Exception as e:
        # Request boundary: report the failure to the client instead of
        # letting it propagate.
        return JSONResponse(status_code=500, content={"message": str(e)})
    finally:
        # Clean up the temporary upload (only if it was actually created).
        if input_file_path is not None and os.path.exists(input_file_path):
            os.remove(input_file_path)