from fastapi import FastAPI, UploadFile, File
from transformers import pipeline
import torchaudio
import torch
import subprocess
import os
import tempfile

app = FastAPI()

# Multilingual Whisper model — better Hindi/English coverage than "tiny".
# Swap to "openai/whisper-small" for higher accuracy if the container allows.
asr = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-base",
    device="cpu",
)


@app.post("/predict")
async def predict(file: UploadFile = File(...)):
    """Transcribe an uploaded audio file (any ffmpeg-readable format).

    The upload is converted to 16 kHz mono WAV via ffmpeg, then run through
    Whisper with automatic language detection (task="transcribe" — no
    translation). Returns the transcript text plus detected language.

    Raises:
        subprocess.CalledProcessError: if ffmpeg cannot convert the upload.
    """
    input_path = None
    wav_path = None
    try:
        # Unique temp file per request: the original fixed /tmp paths raced
        # when two uploads hit this async endpoint concurrently.
        with tempfile.NamedTemporaryFile(suffix=".webm", delete=False) as tmp_in:
            tmp_in.write(await file.read())
            input_path = tmp_in.name
        wav_path = input_path + ".wav"

        # Convert to 16 kHz mono WAV for consistent model input.
        # check=True surfaces ffmpeg failures immediately instead of letting
        # torchaudio fail later with a confusing "file not found" error.
        subprocess.run(
            ["ffmpeg", "-y", "-i", input_path, "-ac", "1", "-ar", "16000", wav_path],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            check=True,
        )

        waveform, sr = torchaudio.load(wav_path)
        waveform = waveform.to(torch.float32)

        # task="transcribe" keeps the spoken language (disables translation);
        # language=None lets Whisper auto-detect Hindi vs. English.
        result = asr(
            {"array": waveform[0].numpy(), "sampling_rate": sr},
            generate_kwargs={
                "task": "transcribe",
                "language": None,
            },
        )

        return {
            "text": result["text"].strip(),
            # NOTE(review): the ASR pipeline result may not include a
            # "language" key for all transformers versions — "auto" fallback
            # keeps the response shape stable either way.
            "language": result.get("language", "auto"),
            "note": "Auto language detection enabled. Optimized for Hindi + English speech.",
        }
    finally:
        # Always clean up temp files, even on failure — the original leaked
        # them whenever conversion or transcription raised.
        for path in (input_path, wav_path):
            if path and os.path.exists(path):
                os.remove(path)