import os
import shutil
import tempfile
from typing import Optional

from fastapi import FastAPI, File, UploadFile, Form
from fastapi.responses import FileResponse, JSONResponse

from modules.whisper.whisper_factory import WhisperFactory
from modules.whisper.whisper_parameter import WhisperParameters

app = FastAPI()

# Initialize Whisper inference engine
whisper_inf = WhisperFactory.create_whisper_inference(
    whisper_type="faster-whisper",  # Choose between "whisper", "faster-whisper", "insanely-fast-whisper"
    whisper_model_dir=os.path.join("models", "Whisper"),
    faster_whisper_model_dir=os.path.join("models", "Whisper", "faster-whisper"),
    insanely_fast_whisper_model_dir=os.path.join("models", "Whisper", "insanely-fast-whisper"),
    output_dir=os.path.join("outputs"),
)


@app.post("/transcribe/")
async def transcribe_video(
    file: UploadFile = File(...),
    model_size: str = Form("large-v2"),
    language: str = Form("en"),
    translate: bool = Form(False),
    file_format: str = Form("SRT"),  # Options: "SRT", "WebVTT", "txt"
    add_timestamp: bool = Form(True)
):
    """
    Upload a video/audio file and get the generated subtitle file as a response.

    The upload is stored in a private per-request directory under ``temp``,
    handed to ``whisper_inf.transcribe_file`` and removed again once the
    handler finishes, whether it succeeded or not.

    Args:
        file: Uploaded media file. Only the final path component of its
            client-supplied name is used when storing it on disk.
        model_size: Forwarded to ``WhisperParameters`` as ``model_size``.
        language: Forwarded to ``WhisperParameters`` as ``lang``.
        translate: Forwarded to ``WhisperParameters`` as ``is_translate``.
        file_format: Subtitle format passed to ``transcribe_file``.
        add_timestamp: Passed through to ``transcribe_file`` unchanged.

    Returns:
        A ``FileResponse`` for the first file reported by ``transcribe_file``;
        a ``JSONResponse`` with status 400 when the upload has no usable file
        name; or a ``JSONResponse`` with status 500 when no result file was
        produced or any exception was raised.
    """
    # Bound before the try so the finally clause can always reference it,
    # even when a failure happens before the directory is created.
    request_dir = None
    try:
        # The client controls the file name: drop every directory component
        # (either separator style) so it cannot escape the temp directory.
        safe_name = os.path.basename((file.filename or "").replace("\\", "/"))
        if safe_name in ("", ".", ".."):
            return JSONResponse(status_code=400, content={"message": "A valid file name is required."})

        # Create temporary directories
        temp_dir = "temp"
        os.makedirs(temp_dir, exist_ok=True)
        # A unique directory per request keeps concurrent uploads that share
        # a file name from overwriting or deleting each other's input.
        request_dir = tempfile.mkdtemp(dir=temp_dir)

        # Save the uploaded file temporarily
        input_file_path = os.path.join(request_dir, safe_name)
        with open(input_file_path, "wb") as buffer:
            shutil.copyfileobj(file.file, buffer)

        # Prepare whisper parameters
        whisper_params = WhisperParameters(
            model_size=model_size,
            lang=language,
            is_translate=translate,
            beam_size=5,
            log_prob_threshold=-1.0,
            no_speech_threshold=0.6,
            compute_type="float16",  # or "int8_float16", etc.
            best_of=5,
            patience=1.0,
            condition_on_previous_text=True,
            initial_prompt=None,
            temperature=0.0,
            compression_ratio_threshold=2.4,
            vad_filter=False,
            threshold=0.5,
            min_speech_duration_ms=250,
            max_speech_duration_s=9999,
            min_silence_duration_ms=2000,
            speech_pad_ms=400,
            chunk_length_s=None,
            batch_size=None,
            is_diarize=False,
            hf_token=None,
            diarization_device=None,
            length_penalty=1.0,
            repetition_penalty=1.0,
            no_repeat_ngram_size=0,
            prefix=None,
            suppress_blank=True,
            suppress_tokens="[-1]",
            max_initial_timestamp=1.0,
            word_timestamps=False,
            prepend_punctuations="\"'“¿([{-",
            append_punctuations="\"'.。,,!!??::”)]}、",
            max_new_tokens=None,
            chunk_length=None,
            hallucination_silence_threshold=None,
            hotwords=None,
            language_detection_threshold=None,
            language_detection_segments=1,
            prompt_reset_on_temperature=0.5
        )

        # Transcribe the file
        # NOTE(review): values unpacked with * are bound to the callee's
        # leading positional parameters before the keyword arguments are
        # applied. If transcribe_file declares files/input_folder_path/...
        # as its first positional parameters, this call raises TypeError
        # ("got multiple values for argument") - confirm against the
        # transcribe_file signature.
        result_str, result_files = whisper_inf.transcribe_file(
            files=[input_file_path],
            input_folder_path="",
            file_format=file_format,
            add_timestamp=add_timestamp,
            *whisper_params.as_list()  # Expand whisper_params as individual arguments
        )

        # Check if transcription was successful
        if not result_files:
            return JSONResponse(status_code=500, content={"message": "Transcription failed."})

        # Return the first result file
        output_file_path = result_files[0]
        return FileResponse(
            path=output_file_path,
            filename=os.path.basename(output_file_path),
            media_type='application/octet-stream'
        )

    except Exception as e:
        return JSONResponse(status_code=500, content={"message": str(e)})

    finally:
        # Clean up temporary files
        if request_dir is not None:
            shutil.rmtree(request_dir, ignore_errors=True)