import os import tempfile import torch from contextlib import asynccontextmanager from fastapi import FastAPI, UploadFile, File, HTTPException from fastapi.middleware.cors import CORSMiddleware from transformers import WhisperProcessor, WhisperForConditionalGeneration, pipeline from peft import PeftModel import librosa from pydub import AudioSegment from dotenv import load_dotenv transcriber = None @asynccontextmanager async def lifespan(app: FastAPI): global transcriber cache_dir = "/tmp/hf_cache" os.makedirs(cache_dir, exist_ok=True) os.environ["HF_HOME"] = cache_dir os.environ["TRANSFORMERS_CACHE"] = cache_dir os.environ["HF_HUB_CACHE"] = cache_dir # Add numba cache directory for librosa numba_cache_dir = os.path.join(cache_dir, "numba_cache") os.makedirs(numba_cache_dir, exist_ok=True) os.environ["NUMBA_CACHE_DIR"] = numba_cache_dir device = "cuda:0" if torch.cuda.is_available() else "cpu" torch_dtype = torch.float16 if device == "cuda:0" else torch.float32 load_dotenv(override=True) # Load environment variables from .env file print("After load_dotenv, HF_TOKEN:", os.getenv("HF_TOKEN")) hf_token = os.getenv("HF_TOKEN") # Add a check to ensure the token is provided if hf_token is None: raise ValueError("Hugging Face token not found. Please set the HUGGING_FACE_TOKEN environment variable.") BASE_MODEL_PATH = "openai/whisper-large-v3-turbo" # BASE_MODEL_PATH = "openai/whisper-large-v3-turbo" ADAPTER_AND_PROCESSOR_PATH = "Tiberiw/whisper-large-turbo-lora-finetuned-v3" # ADAPTER_AND_PROCESSOR_PATH = "Tiberiw/whisper-large-turbo-lora-finetuned-v3" processor = WhisperProcessor.from_pretrained( ADAPTER_AND_PROCESSOR_PATH, token=hf_token, cache_dir=cache_dir ) base_model = WhisperForConditionalGeneration.from_pretrained( BASE_MODEL_PATH, torch_dtype=torch_dtype, cache_dir=cache_dir ) final_model = PeftModel.from_pretrained( base_model, ADAPTER_AND_PROCESSOR_PATH, token=hf_token, cache_dir=cache_dir ) transcriber = pipeline( "automatic-speech-recognition", model=final_model, torch_dtype=torch_dtype, device=device, tokenizer=processor.tokenizer, feature_extractor=processor.feature_extractor, ) print("Model loaded successfully!") yield app = FastAPI(lifespan=lifespan) app.add_middleware( CORSMiddleware, allow_origins=["*"], allow_credentials=True, allow_methods=["*"], allow_headers=["*"], ) def load_audio(path: str): try: audio_array, _ = librosa.load(path, sr=16000, mono=True) return audio_array except Exception as e: import traceback msg = f"Error processing audio failed to load audio: {str(e)}\n{traceback.format_exc()}" if any(err in str(e) for err in ["NoBackendError", "SoundFileNotOpen", "Unsupported format", "AudioreadError"]): raise HTTPException(status_code=415, detail=msg + "\nSupported formats: ( WEBM, WAV, MP3, FLAC)") raise HTTPException(status_code=500, detail=msg) @app.post("/api/transcription") async def transcribe_pipeline(file: UploadFile = File(...)): if not file.content_type or not file.content_type.startswith("audio/"): raise HTTPException(status_code=400, detail="Invalid file content type.") print(f"Received file: {file.filename}, Content-Type: {file.content_type}") original_temp_path = None # Path to the originally uploaded file input_for_librosa_path = None # Path to the file librosa will load (either original or converted) try: # 1. Save the uploaded file to a temporary location first. # This gives us a file path to work with, which is often easier for external tools like FFmpeg via pydub. file_suffix = ".unknown" if file.filename: _, ext = os.path.splitext(file.filename) if ext: file_suffix = ext print(f"Saving uploaded file to temporary location with suffix '{file_suffix}'") with tempfile.NamedTemporaryFile(delete=False, suffix=file_suffix) as temp_orig_file: content = await file.read() temp_orig_file.write(content) original_temp_path = temp_orig_file.name # It's good practice to close the UploadFile object after reading its content await file.close() if file.content_type.startswith("audio/webm"): print(f"Conversion needed for '{original_temp_path}' (ContentType: {file.content_type}) to MP3.") # Define a path for the converted MP3 file with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_conv_file: input_for_librosa_path = temp_conv_file.name try: # Load the WebM audio from the original temporary file using pydub # pydub's from_file can often infer the format, or you can specify format="webm" or format="opus" audio = AudioSegment.from_file(original_temp_path) # pydub will use FFmpeg here # Export as MP3 to the new temporary file path audio.export(input_for_librosa_path, format="mp3") print(f"Successfully converted '{original_temp_path}' to MP3: '{input_for_librosa_path}'") except Exception as e: import traceback err_msg = f"Audio conversion failed: {str(e)}\n{traceback.format_exc()}" if "ffmpeg" in str(e).lower(): err_msg += "\nEnsure FFmpeg is installed and in PATH." raise HTTPException(status_code=500, detail=err_msg) else: input_for_librosa_path = original_temp_path original_temp_path = None audio_array = load_audio(input_for_librosa_path) result = transcriber(audio_array.copy(), return_timestamps=True) return {"transcription": result["text"]} except HTTPException: raise except Exception as e: import traceback raise HTTPException(status_code=500, detail=f"Unexpected error : {str(e)}\n{traceback.format_exc()}") finally: for f in (original_temp_path, input_for_librosa_path): if f and os.path.exists(f): os.unlink(f)