# Hosting-page status residue captured with this file: "Spaces: Sleeping"
import os
import shutil
import tempfile
from typing import Optional

from fastapi import FastAPI, File, UploadFile, Form
from fastapi.responses import FileResponse, JSONResponse

from modules.whisper.whisper_factory import WhisperFactory
from modules.whisper.whisper_parameter import WhisperParameters
# ASGI application object that serves the transcription API.
app = FastAPI()

# Every backend keeps its weights under this common folder.
_WHISPER_MODEL_ROOT = os.path.join("models", "Whisper")

# Build the inference engine once at import time so requests can reuse it.
# Supported whisper_type values: "whisper", "faster-whisper",
# "insanely-fast-whisper".
whisper_inf = WhisperFactory.create_whisper_inference(
    whisper_type="faster-whisper",
    whisper_model_dir=_WHISPER_MODEL_ROOT,
    faster_whisper_model_dir=os.path.join(_WHISPER_MODEL_ROOT, "faster-whisper"),
    insanely_fast_whisper_model_dir=os.path.join(
        _WHISPER_MODEL_ROOT, "insanely-fast-whisper"
    ),
    output_dir="outputs",
)
def _sanitize_upload_name(filename: Optional[str]) -> Optional[str]:
    """Return the bare file-name component of *filename*, or None if unusable."""
    if not filename:
        return None
    # Clients may send Windows-style paths; normalise separators so that
    # basename() strips directory parts on every platform.
    name = os.path.basename(filename.replace("\\", "/"))
    if name in ("", ".", ".."):
        return None
    return name


# NOTE(review): no route decorator (e.g. ``@app.post(...)``) is visible on this
# function in the reviewed text, so it is not registered with ``app`` as shown.
# Confirm the intended path and restore the decorator if it was lost.
async def transcribe_video(
    file: UploadFile = File(...),
    model_size: str = Form("large-v2"),
    language: str = Form("en"),
    translate: bool = Form(False),
    file_format: str = Form("SRT"),  # Options: "SRT", "WebVTT", "txt"
    add_timestamp: bool = Form(True),
):
    """
    Upload a video/audio file and get the generated subtitle file as a response.

    The upload is stored in a private scratch directory under ``temp`` for the
    duration of the request, transcribed with the module-level ``whisper_inf``
    engine, and the first produced file is sent back as a download.

    Args:
        file: The uploaded media file.
        model_size: Passed to ``WhisperParameters`` as ``model_size``.
        language: Passed to ``WhisperParameters`` as ``lang``.
        translate: Passed to ``WhisperParameters`` as ``is_translate``.
        file_format: Subtitle format forwarded to ``transcribe_file``.
        add_timestamp: Forwarded to ``transcribe_file`` unchanged.

    Returns:
        A ``FileResponse`` with the first result file on success; a
        ``JSONResponse`` with status 400 when the upload has no usable file
        name, or status 500 when transcription fails or raises.
    """
    # The file name comes from the client and is untrusted: keep only its last
    # path component so it cannot escape the scratch directory.
    safe_name = _sanitize_upload_name(file.filename)
    if safe_name is None:
        return JSONResponse(
            status_code=400,
            content={"message": "Invalid or missing file name."},
        )

    # Stays None until the scratch directory exists, so cleanup in ``finally``
    # never touches an unbound name when an early step fails.
    work_dir = None
    try:
        temp_dir = "temp"
        os.makedirs(temp_dir, exist_ok=True)
        # A per-request directory keeps the original file name intact while
        # preventing concurrent uploads with the same name from colliding.
        work_dir = tempfile.mkdtemp(dir=temp_dir)

        # Save the uploaded file temporarily
        input_file_path = os.path.join(work_dir, safe_name)
        with open(input_file_path, "wb") as buffer:
            shutil.copyfileobj(file.file, buffer)

        # Prepare whisper parameters
        whisper_params = WhisperParameters(
            model_size=model_size,
            lang=language,
            is_translate=translate,
            beam_size=5,
            log_prob_threshold=-1.0,
            no_speech_threshold=0.6,
            compute_type="float16",  # or "int8_float16", etc.
            best_of=5,
            patience=1.0,
            condition_on_previous_text=True,
            initial_prompt=None,
            temperature=0.0,
            compression_ratio_threshold=2.4,
            vad_filter=False,
            threshold=0.5,
            min_speech_duration_ms=250,
            max_speech_duration_s=9999,
            min_silence_duration_ms=2000,
            speech_pad_ms=400,
            chunk_length_s=None,
            batch_size=None,
            is_diarize=False,
            hf_token=None,
            diarization_device=None,
            length_penalty=1.0,
            repetition_penalty=1.0,
            no_repeat_ngram_size=0,
            prefix=None,
            suppress_blank=True,
            suppress_tokens="[-1]",
            max_initial_timestamp=1.0,
            word_timestamps=False,
            prepend_punctuations="\"'“¿([{-",
            append_punctuations="\"'.。,,!!??::”)]}、",
            max_new_tokens=None,
            chunk_length=None,
            hallucination_silence_threshold=None,
            hotwords=None,
            language_detection_threshold=None,
            language_detection_segments=1,
            prompt_reset_on_temperature=0.5,
        )

        # Transcribe the file.
        # NOTE(review): values unpacked with ``*`` are bound to the callee's
        # leading positional parameters before any keyword argument is
        # applied. If ``transcribe_file`` declares ``files``,
        # ``input_folder_path``, ... as its first positional parameters, this
        # call raises TypeError ("multiple values for argument") -- confirm
        # against the real signature and pass everything positionally if so.
        _result_str, result_files = whisper_inf.transcribe_file(
            *whisper_params.as_list(),
            files=[input_file_path],
            input_folder_path="",
            file_format=file_format,
            add_timestamp=add_timestamp,
        )

        # Check if transcription was successful
        if not result_files:
            return JSONResponse(
                status_code=500,
                content={"message": "Transcription failed."},
            )

        # Return the first result file
        output_file_path = result_files[0]
        return FileResponse(
            path=output_file_path,
            filename=os.path.basename(output_file_path),
            media_type='application/octet-stream',
        )
    except Exception as e:
        # Request boundary: report the failure as JSON rather than letting the
        # exception propagate out of the handler.
        return JSONResponse(status_code=500, content={"message": str(e)})
    finally:
        # Clean up the scratch directory and the uploaded copy inside it.
        if work_dir is not None:
            shutil.rmtree(work_dir, ignore_errors=True)