File size: 1,661 Bytes
e79fe95
43ce088
3069428
2affccb
e79fe95
02a9c21
2affccb
2ac0c53
61dbc44
9dcdc97
 
02a9c21
 
 
 
 
7a1fd19
9f428bc
 
9dcdc97
 
e79fe95
 
 
2affccb
 
9dcdc97
e79fe95
 
 
 
2affccb
02a9c21
e79fe95
 
2affccb
9dcdc97
 
02a9c21
 
9dcdc97
 
 
 
02a9c21
 
9dcdc97
02a9c21
 
 
 
9dcdc97
 
 
02a9c21
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import os
import subprocess
import tempfile

import torch
import torchaudio
from fastapi import FastAPI, File, HTTPException, UploadFile
from transformers import pipeline

# ASGI application object that the route decorators in this module attach to.
app = FastAPI()

# Speech-recognition pipeline, constructed once at import time and pinned to
# the CPU. The original author chose the multilingual "openai/whisper-base"
# checkpoint for better Hindi-English support than the tiny one, and notes
# that "openai/whisper-small" is more accurate still if the container allows.
asr = pipeline(task="automatic-speech-recognition", model="openai/whisper-base", device="cpu")

@app.post("/predict")
async def predict(file: UploadFile = File(...)):
    """Transcribe an uploaded audio file with the module-level ``asr`` pipeline.

    The upload is written to a per-request temporary directory, converted by
    ffmpeg to a 16 kHz mono WAV, loaded with torchaudio, and the first channel
    is handed to ``asr`` with ``task="transcribe"`` and no fixed language.

    Args:
        file: The uploaded audio file. It is stored with a ``.webm`` name
            before being passed to ffmpeg.

    Returns:
        A dict with the stripped transcription under ``"text"``, the
        ``"language"`` entry of the pipeline result (``"auto"`` when the
        result has no such entry), and a static ``"note"`` string.

    Raises:
        HTTPException: With status 400 when ffmpeg exits with a non-zero
            status, i.e. the upload could not be converted to WAV.
    """
    # Read the whole upload before touching the filesystem so no file is left
    # half-written while this coroutine is suspended.
    payload = await file.read()

    # A private directory per request: concurrent uploads cannot overwrite
    # each other's files, and the context manager removes everything even
    # when a later step raises.
    with tempfile.TemporaryDirectory(prefix="asr_") as workdir:
        input_path = os.path.join(workdir, "input_audio.webm")
        wav_path = os.path.join(workdir, "input_audio.wav")

        with open(input_path, "wb") as f:
            f.write(payload)

        # Convert to 16 kHz mono WAV so the model always sees the same format.
        # check=True turns a failed conversion into an explicit error instead
        # of letting torchaudio fail later on a missing output file.
        try:
            subprocess.run(
                [
                    "ffmpeg", "-y", "-i", input_path,
                    "-ac", "1", "-ar", "16000", wav_path,
                ],
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
                check=True,
            )
        except subprocess.CalledProcessError as exc:
            raise HTTPException(
                status_code=400,
                detail="Could not decode the uploaded audio file.",
            ) from exc

        # Load the waveform while the temporary WAV still exists.
        waveform, sr = torchaudio.load(wav_path)

    waveform = waveform.to(torch.float32)

    # NOTE(review): subprocess.run above and the asr call below are
    # synchronous calls inside an async handler, so the event loop is blocked
    # while they run. Consider offloading them if throughput matters.
    result = asr(
        {"array": waveform[0].numpy(), "sampling_rate": sr},
        generate_kwargs={
            "task": "transcribe",  # transcribe rather than translate
            "language": None,      # no fixed language; left to the model
        },
    )

    return {
        "text": result["text"].strip(),
        # NOTE(review): confirm the pipeline result ever carries a "language"
        # key; if it does not, this field is always "auto".
        "language": result.get("language", "auto"),
        "note": "Auto language detection enabled. Optimized for Hindi + English speech."
    }