Spaces:
Sleeping
Sleeping
File size: 4,145 Bytes
6bac6fb |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 |
import os
import shutil
import tempfile
from typing import Optional

from fastapi import FastAPI, File, UploadFile, Form
from fastapi.responses import FileResponse, JSONResponse

from modules.whisper.whisper_factory import WhisperFactory
from modules.whisper.whisper_parameter import WhisperParameters
# FastAPI application object that the route decorators below attach to.
app = FastAPI()

# Build the shared Whisper inference engine once, at import time.
# whisper_type may be "whisper", "faster-whisper" or "insanely-fast-whisper".
_whisper_models_root = os.path.join("models", "Whisper")
whisper_inf = WhisperFactory.create_whisper_inference(
    whisper_type="faster-whisper",
    whisper_model_dir=_whisper_models_root,
    faster_whisper_model_dir=os.path.join(_whisper_models_root, "faster-whisper"),
    insanely_fast_whisper_model_dir=os.path.join(
        _whisper_models_root, "insanely-fast-whisper"
    ),
    output_dir="outputs",
)
def _sanitize_upload_name(filename: Optional[str]) -> str:
    """Return a bare file name that is safe to join onto a directory.

    Any directory components supplied by the client are discarded so the
    upload cannot be written outside the directory chosen by the server.
    A missing or degenerate name falls back to ``"upload"``.
    """
    # Treat backslashes as separators too, so Windows-style client paths are
    # stripped even when the server runs on a POSIX system.
    name = os.path.basename((filename or "").replace("\\", "/"))
    if name in ("", ".", ".."):
        return "upload"
    return name


@app.post("/transcribe/")
async def transcribe_video(
    file: UploadFile = File(...),
    model_size: str = Form("large-v2"),
    language: str = Form("en"),
    translate: bool = Form(False),
    file_format: str = Form("SRT"),  # Options: "SRT", "WebVTT", "txt"
    add_timestamp: bool = Form(True),
):
    """
    Upload a video/audio file and get the generated subtitle file as a response.

    Args:
        file: The uploaded media file.
        model_size: Forwarded to ``WhisperParameters`` as ``model_size``.
        language: Forwarded to ``WhisperParameters`` as ``lang``.
        translate: Forwarded to ``WhisperParameters`` as ``is_translate``.
        file_format: Output format forwarded to ``transcribe_file``
            ("SRT", "WebVTT" or "txt").
        add_timestamp: Forwarded unchanged to ``transcribe_file``.

    Returns:
        A ``FileResponse`` streaming the first file reported by
        ``transcribe_file``, or a ``JSONResponse`` with status 500 and a
        ``message`` field when no file was produced or an exception occurred.
    """
    temp_dir = "temp"
    # Stays None until the per-request directory exists, so the cleanup in
    # ``finally`` never touches an unbound name when an early step fails.
    request_dir = None
    try:
        os.makedirs(temp_dir, exist_ok=True)
        # A private directory per request keeps concurrent uploads that share
        # a file name from overwriting or deleting each other's input.
        request_dir = tempfile.mkdtemp(dir=temp_dir)

        # Save the uploaded file temporarily. The client-supplied name is
        # untrusted, so only its final path component is used.
        input_file_path = os.path.join(
            request_dir, _sanitize_upload_name(file.filename)
        )
        with open(input_file_path, "wb") as buffer:
            shutil.copyfileobj(file.file, buffer)

        # Prepare whisper parameters. Only the first three come from the
        # request; the rest are fixed server-side values.
        whisper_params = WhisperParameters(
            model_size=model_size,
            lang=language,
            is_translate=translate,
            beam_size=5,
            log_prob_threshold=-1.0,
            no_speech_threshold=0.6,
            compute_type="float16",  # or "int8_float16", etc.
            best_of=5,
            patience=1.0,
            condition_on_previous_text=True,
            initial_prompt=None,
            temperature=0.0,
            compression_ratio_threshold=2.4,
            vad_filter=False,
            threshold=0.5,
            min_speech_duration_ms=250,
            max_speech_duration_s=9999,
            min_silence_duration_ms=2000,
            speech_pad_ms=400,
            chunk_length_s=None,
            batch_size=None,
            is_diarize=False,
            hf_token=None,
            diarization_device=None,
            length_penalty=1.0,
            repetition_penalty=1.0,
            no_repeat_ngram_size=0,
            prefix=None,
            suppress_blank=True,
            suppress_tokens="[-1]",
            max_initial_timestamp=1.0,
            word_timestamps=False,
            prepend_punctuations="\"'“¿([{-",
            append_punctuations="\"'.。,,!!??::”)]}、",
            max_new_tokens=None,
            chunk_length=None,
            hallucination_silence_threshold=None,
            hotwords=None,
            language_detection_threshold=None,
            language_detection_segments=1,
            prompt_reset_on_temperature=0.5,
        )

        # Transcribe the file.
        # NOTE(review): unpacked values are bound positionally *before* the
        # keyword arguments regardless of where the star expression is
        # written, so it is placed first to make that explicit. If
        # transcribe_file declares `files` etc. as leading positional
        # parameters this call will collide with them - confirm against its
        # signature.
        result_str, result_files = whisper_inf.transcribe_file(
            *whisper_params.as_list(),
            files=[input_file_path],
            input_folder_path="",
            file_format=file_format,
            add_timestamp=add_timestamp,
        )

        # Check if transcription was successful
        if not result_files:
            return JSONResponse(
                status_code=500, content={"message": "Transcription failed."}
            )

        # Return the first result file
        output_file_path = result_files[0]
        return FileResponse(
            path=output_file_path,
            filename=os.path.basename(output_file_path),
            media_type='application/octet-stream',
        )
    except Exception as e:
        # Endpoint boundary: report any failure as a JSON 500 instead of
        # letting it propagate.
        return JSONResponse(status_code=500, content={"message": str(e)})
    finally:
        # Clean up the uploaded input; best-effort so a cleanup problem never
        # replaces the response that is already being returned.
        if request_dir is not None:
            shutil.rmtree(request_dir, ignore_errors=True)