# Auralyn — app.py (Whisper-based speech-to-text API for a Hugging Face Space)
import os
import subprocess
import tempfile

import torch
import torchaudio
from fastapi import FastAPI, UploadFile, File
from transformers import pipeline
app = FastAPI()

# ✅ Multilingual model (better Hindi-English support than tiny)
# You can switch to "openai/whisper-small" for even better accuracy if your
# container allows. Built once at import time; every request reuses it.
asr = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-base",
    device="cpu",
)
@app.post("/predict")
async def predict(file: UploadFile = File(...)):
    """Transcribe an uploaded audio file via Whisper with auto language detection.

    The upload (any ffmpeg-readable container; the browser typically sends
    WebM) is converted to 16 kHz mono WAV, loaded with torchaudio, and fed to
    the shared ASR pipeline.

    Returns:
        dict with "text" (stripped transcript), "language" (detected language,
        or "auto" when the pipeline does not report one), and a fixed "note".

    Raises:
        subprocess.CalledProcessError: if ffmpeg fails to convert the upload.
    """
    # Unique temp paths so concurrent requests cannot clobber each other —
    # fixed /tmp filenames raced when two uploads arrived at once.
    in_fd, input_path = tempfile.mkstemp(suffix=".webm")
    os.close(in_fd)
    wav_fd, wav_path = tempfile.mkstemp(suffix=".wav")
    os.close(wav_fd)

    try:
        # Save uploaded bytes to disk so ffmpeg can read them.
        with open(input_path, "wb") as f:
            f.write(await file.read())

        # Convert to 16 kHz mono WAV — ensures consistency.
        # check=True surfaces conversion failures here instead of a confusing
        # downstream error from torchaudio on a missing/empty WAV.
        subprocess.run(
            [
                "ffmpeg", "-y", "-i", input_path,
                "-ac", "1", "-ar", "16000", wav_path,
            ],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            check=True,
        )

        # Load waveform; channel 0 only — the conversion above forced mono.
        waveform, sr = torchaudio.load(wav_path)
        waveform = waveform.to(torch.float32)

        # ✅ Transcribe with automatic language detection.
        # task="transcribe" ensures Whisper writes what it hears (no
        # translation); language=None lets the model auto-detect.
        result = asr(
            {"array": waveform[0].numpy(), "sampling_rate": sr},
            generate_kwargs={
                "task": "transcribe",  # disables translation
                "language": None,      # auto-detect language
            },
        )
    finally:
        # Always clean up temp files, even when conversion or transcription
        # raises — the originals leaked on any failure path.
        for path in (input_path, wav_path):
            if os.path.exists(path):
                os.remove(path)

    return {
        "text": result["text"].strip(),
        "language": result.get("language", "auto"),
        "note": "Auto language detection enabled. Optimized for Hindi + English speech.",
    }