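"""FastAPI service that transcribes uploaded audio with a LoRA-fine-tuned
Whisper model (loaded via PEFT), exposed at POST /api/transcription."""
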
import os
import tempfile
import torch
from contextlib import asynccontextmanager
from fastapi import FastAPI, UploadFile, File, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from transformers import WhisperProcessor, WhisperForConditionalGeneration, pipeline
from peft import PeftModel
import librosa
from pydub import AudioSegment
from dotenv import load_dotenv

# Loaded once at startup (see lifespan below) and reused across requests.
transcriber = None

@asynccontextmanager
async def lifespan(app: FastAPI):
    global transcriber

    # Redirect all Hugging Face caches to a writable location (e.g., for
    # containers where the default home directory is read-only).
    cache_dir = "/tmp/hf_cache"
    os.makedirs(cache_dir, exist_ok=True)
    os.environ["HF_HOME"] = cache_dir
    os.environ["TRANSFORMERS_CACHE"] = cache_dir
    os.environ["HF_HUB_CACHE"] = cache_dir

    # Add numba cache directory for librosa
    numba_cache_dir = os.path.join(cache_dir, "numba_cache")
    os.makedirs(numba_cache_dir, exist_ok=True)
    os.environ["NUMBA_CACHE_DIR"] = numba_cache_dir

    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    torch_dtype = torch.float16 if device == "cuda:0" else torch.float32
    load_dotenv(override=True)  # Load environment variables from a .env file
    # Report only whether the token was found; never print the secret itself.
    print("After load_dotenv, HF_TOKEN set:", os.getenv("HF_TOKEN") is not None)

    hf_token = os.getenv("HF_TOKEN")

    # Fail fast if no token is available for the authenticated Hub downloads below.
    if hf_token is None:
        raise ValueError("Hugging Face token not found. Please set the HF_TOKEN environment variable.")

    # The base model is public; the LoRA adapter and its processor are fetched
    # from a separate repository using the HF token.
    BASE_MODEL_PATH = "openai/whisper-large-v3-turbo"
    ADAPTER_AND_PROCESSOR_PATH = "Tiberiw/whisper-large-turbo-lora-finetuned-v3"
    processor = WhisperProcessor.from_pretrained(
        ADAPTER_AND_PROCESSOR_PATH,
        token=hf_token,
        cache_dir=cache_dir
    )
    base_model = WhisperForConditionalGeneration.from_pretrained(
        BASE_MODEL_PATH, 
        torch_dtype=torch_dtype,
        cache_dir=cache_dir
    )
    final_model = PeftModel.from_pretrained(
        base_model, 
        ADAPTER_AND_PROCESSOR_PATH, 
        token=hf_token,
        cache_dir=cache_dir
    )
    transcriber = pipeline(
        "automatic-speech-recognition",
        model=final_model,
        torch_dtype=torch_dtype,
        device=device,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
    )
    print("Model loaded successfully!")
    yield
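    # Anything placed after `yield` would run at application shutdown;
    # nothing needs explicit cleanup here.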

app = FastAPI(lifespan=lifespan)

# CORS is wide open here (any origin, method, and header, with credentials);
# restrict allow_origins for production deployments.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

def load_audio(path: str):
    """Load an audio file as a mono 16 kHz float array, as Whisper expects."""
    try:
        audio_array, _ = librosa.load(path, sr=16000, mono=True)
        return audio_array
    except Exception as e:
        import traceback
        msg = f"Failed to load audio: {str(e)}\n{traceback.format_exc()}"
        if any(err in str(e) for err in ["NoBackendError", "SoundFileNotOpen", "Unsupported format", "AudioreadError"]):
            raise HTTPException(status_code=415, detail=msg + "\nSupported formats: WEBM, WAV, MP3, FLAC")
        raise HTTPException(status_code=500, detail=msg)


@app.post("/api/transcription")
async def transcribe_pipeline(file: UploadFile = File(...)):
    if not file.content_type or not file.content_type.startswith("audio/"):
        raise HTTPException(status_code=400, detail="Invalid file content type.")

    print(f"Received file: {file.filename}, Content-Type: {file.content_type}")

    original_temp_path = None      # Path to the originally uploaded file
    input_for_librosa_path = None  # Path to the file librosa will load (original or converted)
    try:
        # 1. Save the uploaded file to a temporary location first.
        # This gives us a file path to work with, which is often easier for external tools like FFmpeg via pydub.
        file_suffix = ".unknown"
        if file.filename:
            _, ext = os.path.splitext(file.filename)
            if ext:
                file_suffix = ext
        print(f"Saving uploaded file to temporary location with suffix '{file_suffix}'")
        with tempfile.NamedTemporaryFile(delete=False, suffix=file_suffix) as temp_orig_file:
            content = await file.read()
            temp_orig_file.write(content)
            original_temp_path = temp_orig_file.name
        
        # It's good practice to close the UploadFile object after reading its content
        await file.close()

        if file.content_type.startswith("audio/webm"):
            print(f"Conversion needed for '{original_temp_path}' (ContentType: {file.content_type}) to MP3.")
            # Define a path for the converted MP3 file
            with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_conv_file:
                input_for_librosa_path = temp_conv_file.name
            
            try:
                # Load the WebM audio from the original temporary file using pydub
                # pydub's from_file can often infer the format, or you can specify format="webm" or format="opus"
                audio = AudioSegment.from_file(original_temp_path) # pydub will use FFmpeg here
                
                # Export as MP3 to the new temporary file path
                audio.export(input_for_librosa_path, format="mp3")
                print(f"Successfully converted '{original_temp_path}' to MP3: '{input_for_librosa_path}'")
            except Exception as e:
                import traceback
                err_msg = f"Audio conversion failed: {str(e)}\n{traceback.format_exc()}"
                if "ffmpeg" in str(e).lower():
                    err_msg += "\nEnsure FFmpeg is installed and in PATH."
                raise HTTPException(status_code=500, detail=err_msg)
        else:
            input_for_librosa_path = original_temp_path
            # The original file is now tracked by input_for_librosa_path, so
            # drop the extra reference to avoid a redundant cleanup attempt.
            original_temp_path = None


        # Decode to 16 kHz mono and run the ASR pipeline on the array.
        audio_array = load_audio(input_for_librosa_path)
        result = transcriber(audio_array.copy(), return_timestamps=True)
        return {"transcription": result["text"]}

    except HTTPException:
        raise
    except Exception as e:
        import traceback
        raise HTTPException(status_code=500, detail=f"Unexpected error: {str(e)}\n{traceback.format_exc()}")
    finally:
        # Remove any temporary files created while handling this request.
        for f in (original_temp_path, input_for_librosa_path):
            if f and os.path.exists(f):
                os.unlink(f)
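

# A minimal way to run this service locally, assuming the file is saved as
# main.py and uvicorn is installed (both assumptions, not part of the source):
#
#   uvicorn main:app --host 0.0.0.0 --port 8000
#
# Hypothetical example request:
#
#   curl -X POST -F "file=@sample.wav;type=audio/wav" \
#        http://localhost:8000/api/transcription
if __name__ == "__main__":
    import uvicorn  # assumed available in the deployment environment

    uvicorn.run(app, host="0.0.0.0", port=8000)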