Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -34,25 +34,31 @@ processor.tokenizer.set_target_lang("bam")
|
|
| 34 |
model.load_adapter("bam")
|
| 35 |
print("Bambara adapter loaded. System Ready.")
|
| 36 |
|
|
|
|
| 37 |
@app.post("/transcribe")
|
| 38 |
async def transcribe(audio_file: UploadFile = File(...)):
|
| 39 |
try:
|
| 40 |
-
# Read file
|
| 41 |
content = await audio_file.read()
|
| 42 |
if not content:
|
| 43 |
return {"text": "Error: Empty audio file"}
|
| 44 |
|
| 45 |
-
#
|
| 46 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
|
| 48 |
-
# Prepare inputs
|
| 49 |
inputs = processor(audio_data, sampling_rate=16000, return_tensors="pt").to(device)
|
| 50 |
|
| 51 |
-
#
|
| 52 |
with torch.inference_mode():
|
| 53 |
logits = model(**inputs).logits
|
| 54 |
|
| 55 |
-
# Decode output
|
| 56 |
predicted_ids = torch.argmax(logits, dim=-1)
|
| 57 |
transcription = processor.batch_decode(predicted_ids)[0]
|
| 58 |
|
|
|
|
| 34 |
model.load_adapter("bam")
|
| 35 |
print("Bambara adapter loaded. System Ready.")
|
| 36 |
|
| 37 |
+
|
| 38 |
@app.post("/transcribe")
|
| 39 |
async def transcribe(audio_file: UploadFile = File(...)):
|
| 40 |
try:
|
| 41 |
+
# 1. Read the file into memory
|
| 42 |
content = await audio_file.read()
|
| 43 |
if not content:
|
| 44 |
return {"text": "Error: Empty audio file"}
|
| 45 |
|
| 46 |
+
# 2. Convert to a file-like object
|
| 47 |
+
audio_fp = io.BytesIO(content)
|
| 48 |
+
|
| 49 |
+
# 3. Load & Resample
|
| 50 |
+
# No explicit format is passed: for a file-like object, librosa decodes through soundfile (libsndfile), which detects the container from the bytes.
|
| 51 |
+
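# NOTE: with a BytesIO input librosa can only decode what soundfile supports (WAV, FLAC, Ogg, etc.); WebM needs the audioread/ffmpeg fallback, which requires a real file path (and ffmpeg in packages.txt).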
|
| 52 |
+
audio_data, _ = librosa.load(audio_fp, sr=16000)
|
| 53 |
|
| 54 |
+
# 4. Prepare inputs for the model
|
| 55 |
inputs = processor(audio_data, sampling_rate=16000, return_tensors="pt").to(device)
|
| 56 |
|
| 57 |
+
# 5. Run the model
|
| 58 |
with torch.inference_mode():
|
| 59 |
logits = model(**inputs).logits
|
| 60 |
|
| 61 |
+
# 6. Decode output
|
| 62 |
predicted_ids = torch.argmax(logits, dim=-1)
|
| 63 |
transcription = processor.batch_decode(predicted_ids)[0]
|
| 64 |
|