from fastapi import FastAPI, UploadFile, File
import os
import subprocess
import tempfile

import torch
import torchaudio
from transformers import pipeline
# FastAPI application instance; the route handlers below attach to it.
app = FastAPI()

# Speech-recognition pipeline, built once at import time so every request
# reuses the same loaded model.  whisper-base is multilingual (better
# Hindi-English support than tiny); switch to "openai/whisper-small" for
# even better accuracy if your container allows.
asr = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-base",
    device="cpu"  # NOTE(review): pinned to CPU — presumably no GPU in the deploy target; confirm
)
@app.post("/predict")
async def predict(file: UploadFile = File(...)):
    """Transcribe an uploaded audio clip (any ffmpeg-readable format).

    The upload is saved to a per-request temporary directory, converted to
    16 kHz mono WAV with ffmpeg, and transcribed by Whisper with automatic
    language detection.

    Returns:
        dict with "text" (stripped transcript), "language" (detected
        language, or "auto" when the pipeline does not report one), and a
        human-readable "note".

    Raises:
        RuntimeError: if ffmpeg fails to convert the uploaded audio.
    """
    # Per-request temp directory: unique paths mean concurrent requests
    # cannot clobber each other's files (the original used fixed /tmp
    # names), and the context manager guarantees cleanup even when
    # conversion or transcription raises (the original leaked its temp
    # files on any exception).
    with tempfile.TemporaryDirectory(prefix="asr_") as tmp_dir:
        input_path = os.path.join(tmp_dir, "input_audio.webm")
        wav_path = os.path.join(tmp_dir, "input_audio.wav")

        # Save uploaded file.
        with open(input_path, "wb") as f:
            f.write(await file.read())

        # Convert to 16 kHz mono WAV — the rate/channel layout Whisper expects.
        proc = subprocess.run(
            [
                "ffmpeg", "-y", "-i", input_path,
                "-ac", "1", "-ar", "16000", wav_path,
            ],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
        if proc.returncode != 0:
            # Fail loudly here instead of letting torchaudio raise a
            # cryptic error on a missing or empty WAV file.
            raise RuntimeError("ffmpeg failed to convert the uploaded audio")

        # Load waveform as float32, as the pipeline expects.
        waveform, sr = torchaudio.load(wav_path)
        waveform = waveform.to(torch.float32)

        # 'task': 'transcribe' keeps Whisper from translating; language=None
        # enables automatic language detection.
        result = asr(
            {"array": waveform[0].numpy(), "sampling_rate": sr},
            generate_kwargs={
                "task": "transcribe",  # disables translation
                "language": None       # auto-detect language
            }
        )

    return {
        "text": result["text"].strip(),
        "language": result.get("language", "auto"),
        "note": "Auto language detection enabled. Optimized for Hindi + English speech."
    }