File size: 4,145 Bytes
6bac6fb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import os
import shutil
import tempfile
from typing import Optional

from fastapi import FastAPI, File, UploadFile, Form
from fastapi.responses import FileResponse, JSONResponse

from modules.whisper.whisper_factory import WhisperFactory
from modules.whisper.whisper_parameter import WhisperParameters

# ASGI application object; the route decorators in this module register on it.
app = FastAPI()

# Single Whisper inference engine, constructed once when the module is imported.
# All model directories share the models/Whisper prefix.
_whisper_root = os.path.join("models", "Whisper")
whisper_inf = WhisperFactory.create_whisper_inference(
    # Backend selector: "whisper", "faster-whisper" or "insanely-fast-whisper".
    whisper_type="faster-whisper",
    whisper_model_dir=_whisper_root,
    faster_whisper_model_dir=os.path.join(_whisper_root, "faster-whisper"),
    insanely_fast_whisper_model_dir=os.path.join(_whisper_root, "insanely-fast-whisper"),
    output_dir=os.path.join("outputs"),
)

def _safe_upload_name(filename: Optional[str]) -> str:
    """Return the last path component of a client-supplied name, or "upload" if none is usable."""
    # Treat backslashes as separators too, so Windows-style paths are stripped
    # even when the server runs on a POSIX host.
    candidate = os.path.basename((filename or "").replace("\\", "/"))
    # "." and ".." name directories, not files; an empty string means the
    # client sent no name or one that ended in a separator.
    if candidate in ("", ".", ".."):
        return "upload"
    return candidate


@app.post("/transcribe/")
async def transcribe_video(
    file: UploadFile = File(...),
    model_size: str = Form("large-v2"),
    language: str = Form("en"),
    translate: bool = Form(False),
    file_format: str = Form("SRT"),  # Options: "SRT", "WebVTT", "txt"
    add_timestamp: bool = Form(True)
):
    """
    Upload a video/audio file and get the generated subtitle file as a response.

    The upload is staged in a private, per-request directory under ``temp``
    and handed to ``whisper_inf.transcribe_file``. The staging directory is
    removed before the function returns, whether or not transcription worked.

    Args:
        file: The uploaded media file. Only the final path component of its
            client-supplied name is used when staging it on disk.
        model_size: Forwarded as ``WhisperParameters.model_size``.
        language: Forwarded as ``WhisperParameters.lang``.
        translate: Forwarded as ``WhisperParameters.is_translate``.
        file_format: Forwarded to ``transcribe_file``; "SRT", "WebVTT" or "txt".
        add_timestamp: Forwarded to ``transcribe_file``.

    Returns:
        A ``FileResponse`` serving the first file reported by
        ``transcribe_file``, or a ``JSONResponse`` with status 500 and a
        ``message`` field when no file was produced or an exception occurred.
    """
    # Bound before the try block so the finally clause can always inspect it,
    # even when staging fails before a directory has been created.
    work_dir = None
    try:
        # Create temporary directories
        temp_dir = "temp"
        os.makedirs(temp_dir, exist_ok=True)
        # A unique sub-directory per request keeps simultaneous uploads that
        # share a filename from overwriting or deleting each other's file,
        # while leaving the file's own name unchanged.
        work_dir = tempfile.mkdtemp(dir=temp_dir)

        # Save the uploaded file temporarily. The name comes from the client,
        # so it is reduced to a bare file name to keep the write inside work_dir.
        input_file_path = os.path.join(work_dir, _safe_upload_name(file.filename))
        with open(input_file_path, "wb") as buffer:
            shutil.copyfileobj(file.file, buffer)

        # Prepare whisper parameters
        whisper_params = WhisperParameters(
            model_size=model_size,
            lang=language,
            is_translate=translate,
            beam_size=5,
            log_prob_threshold=-1.0,
            no_speech_threshold=0.6,
            compute_type="float16",  # or "int8_float16", etc.
            best_of=5,
            patience=1.0,
            condition_on_previous_text=True,
            initial_prompt=None,
            temperature=0.0,
            compression_ratio_threshold=2.4,
            vad_filter=False,
            threshold=0.5,
            min_speech_duration_ms=250,
            max_speech_duration_s=9999,
            min_silence_duration_ms=2000,
            speech_pad_ms=400,
            chunk_length_s=None,
            batch_size=None,
            is_diarize=False,
            hf_token=None,
            diarization_device=None,
            length_penalty=1.0,
            repetition_penalty=1.0,
            no_repeat_ngram_size=0,
            prefix=None,
            suppress_blank=True,
            suppress_tokens="[-1]",
            max_initial_timestamp=1.0,
            word_timestamps=False,
            prepend_punctuations="\"'“¿([{-",
            append_punctuations="\"'.。,,!!??::”)]}、",
            max_new_tokens=None,
            chunk_length=None,
            hallucination_silence_threshold=None,
            hotwords=None,
            language_detection_threshold=None,
            language_detection_segments=1,
            prompt_reset_on_temperature=0.5
        )

        # Transcribe the file
        # NOTE(review): the unpacked values are passed positionally, so they
        # bind to transcribe_file's leading parameters. If `files`,
        # `input_folder_path`, ... are those leading parameters, this call
        # raises TypeError (multiple values for an argument). Confirm against
        # the transcribe_file signature, which is not visible from this file.
        result_str, result_files = whisper_inf.transcribe_file(
            files=[input_file_path],
            input_folder_path="",
            file_format=file_format,
            add_timestamp=add_timestamp,
            *whisper_params.as_list()  # Expand whisper_params as individual arguments
        )

        # Check if transcription was successful
        if not result_files:
            return JSONResponse(status_code=500, content={"message": "Transcription failed."})

        # Return the first result file
        output_file_path = result_files[0]
        return FileResponse(
            path=output_file_path,
            filename=os.path.basename(output_file_path),
            media_type='application/octet-stream'
        )
    except Exception as e:
        return JSONResponse(status_code=500, content={"message": str(e)})
    finally:
        # Clean up temporary files. Best effort: a failed delete must not
        # replace the response that is already being returned.
        if work_dir is not None:
            shutil.rmtree(work_dir, ignore_errors=True)