Spaces:

leekwoon
/

Whisper-FastAPI

Sleeping

File size: 4,294 Bytes

f8e3c2f
 
c2d3acc
 
 
f8e3c2f
c2d3acc
0faa266
 
 
f8e3c2f
 
c2d3acc
f8e3c2f
 
 
 
 
 
c2d3acc
 
 
 
 
 
 
 
 
0faa266
c2d3acc
0faa266
c2d3acc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6bac6fb
 
 
c2d3acc
 
d6817dc
 
 
 
13fcab8
c2d3acc
d6817dc
c2d3acc

import os
import shutil
import tempfile
from typing import Optional

from fastapi import FastAPI, File, UploadFile, Form
from fastapi.responses import FileResponse, JSONResponse

from modules.whisper.whisper_factory import WhisperFactory
from modules.whisper.whisper_parameter import WhisperParameters

# ASGI application object; route handlers below register themselves on it.
app = FastAPI()

# All model directories live under this common root.
_whisper_model_root = os.path.join("models", "Whisper")

# Build the shared inference engine once at import time so that every request
# reuses the same instance.
# whisper_type may be "whisper", "faster-whisper" or "insanely-fast-whisper".
whisper_inf = WhisperFactory.create_whisper_inference(
    whisper_type="faster-whisper",
    whisper_model_dir=_whisper_model_root,
    faster_whisper_model_dir=os.path.join(_whisper_model_root, "faster-whisper"),
    insanely_fast_whisper_model_dir=os.path.join(
        _whisper_model_root, "insanely-fast-whisper"
    ),
    output_dir="outputs",
)

@app.post("/transcribe/")
async def transcribe_video(
    file: UploadFile = File(...),
    model_size: str = Form("large-v2"),
    language: str = Form("en"),
    translate: bool = Form(False),
    file_format: str = Form("SRT"),  # Options: "SRT", "WebVTT", "txt"
    add_timestamp: bool = Form(True)
):
    """
    Upload a video/audio file and get the generated subtitle file as a response.

    The upload is written to a private per-request directory under ``temp/``,
    handed to the module-level ``whisper_inf`` engine, and the first file the
    engine reports is streamed back to the client.

    Args:
        file: The uploaded media file.
        model_size: Forwarded to ``WhisperParameters`` as ``model_size``.
        language: Forwarded to ``WhisperParameters`` as ``lang``.
        translate: Forwarded to ``WhisperParameters`` as ``is_translate``.
        file_format: Forwarded to ``transcribe_file`` as ``file_format``.
        add_timestamp: Forwarded to ``transcribe_file`` as ``add_timestamp``.

    Returns:
        A ``FileResponse`` for the first result file on success, otherwise a
        ``JSONResponse`` with status 500 and a ``message`` field.
    """
    temp_dir = "temp"
    # Bound before the try block so the finally clause can always inspect it,
    # even when directory creation itself fails.
    request_dir = None
    try:
        # Each request gets its own directory so that concurrent uploads with
        # the same filename cannot overwrite or delete one another's input.
        os.makedirs(temp_dir, exist_ok=True)
        request_dir = tempfile.mkdtemp(dir=temp_dir)

        # The filename is client-controlled: keep only its final component so
        # that "../" segments or absolute paths cannot escape request_dir.
        # Backslashes are normalised first to also strip Windows-style paths.
        safe_name = os.path.basename((file.filename or "").replace("\\", "/"))
        if safe_name in ("", ".", ".."):
            safe_name = "upload"

        # Save the uploaded file temporarily
        input_file_path = os.path.join(request_dir, safe_name)
        with open(input_file_path, "wb") as buffer:
            shutil.copyfileobj(file.file, buffer)

        # Prepare whisper parameters
        whisper_params = WhisperParameters(
            model_size=model_size,
            lang=language,
            is_translate=translate,
            beam_size=5,
            log_prob_threshold=-1.0,
            no_speech_threshold=0.6,
            compute_type="float16",  # or "int8_float16", etc.
            best_of=5,
            patience=1.0,
            condition_on_previous_text=True,
            initial_prompt=None,
            temperature=0.0,
            compression_ratio_threshold=2.4,
            vad_filter=False,
            threshold=0.5,
            min_speech_duration_ms=250,
            max_speech_duration_s=9999,
            min_silence_duration_ms=2000,
            speech_pad_ms=400,
            chunk_length_s=None,
            batch_size=None,
            is_diarize=False,
            hf_token=None,
            diarization_device=None,
            length_penalty=1.0,
            repetition_penalty=1.0,
            no_repeat_ngram_size=0,
            prefix=None,
            suppress_blank=True,
            suppress_tokens="[-1]",
            max_initial_timestamp=1.0,
            word_timestamps=False,
            prepend_punctuations="\"'“¿([{-",
            append_punctuations="\"'.。,，!！?？:：”)]}、",
            max_new_tokens=None,
            chunk_length=None,
            hallucination_silence_threshold=None,
            hotwords=None,
            language_detection_threshold=None,
            language_detection_segments=1,
            prompt_reset_on_temperature=0.5
        )

        # Transcribe the file.
        # NOTE(review): the unpacked list is passed positionally even though
        # it is written after the keyword arguments. If transcribe_file's
        # leading positional parameters are the ones also given by keyword
        # here (e.g. `files`), this raises TypeError (multiple values) --
        # confirm against its signature.
        # NOTE(review): the call is not awaited, so it runs on the event loop
        # thread for as long as transcription takes.
        result_str, result_files = whisper_inf.transcribe_file(
            files=[input_file_path],
            input_folder_path="",
            file_format=file_format,
            add_timestamp=add_timestamp,
            *whisper_params.as_list()  # Expand whisper_params as individual arguments
        )

        # Check if transcription was successful
        if not result_files:
            return JSONResponse(status_code=500, content={"message": "Transcription failed."})

        # Return the first result file
        output_file_path = result_files[0]
        return FileResponse(
            path=output_file_path,
            filename=os.path.basename(output_file_path),
            media_type='application/octet-stream'
        )
    except Exception as e:
        return JSONResponse(status_code=500, content={"message": str(e)})
    finally:
        # Clean up the per-request directory and the upload inside it. Failures
        # here are ignored so they cannot mask the response chosen above.
        if request_dir is not None:
            shutil.rmtree(request_dir, ignore_errors=True)