Gaoussin committed on
Commit
2955b20
·
verified ·
1 Parent(s): 1945a83

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +12 -6
app.py CHANGED
@@ -34,25 +34,31 @@ processor.tokenizer.set_target_lang("bam")
34
  model.load_adapter("bam")
35
  print("Bambara adapter loaded. System Ready.")
36
 
 
37
  @app.post("/transcribe")
38
  async def transcribe(audio_file: UploadFile = File(...)):
39
  try:
40
- # Read file stream
41
  content = await audio_file.read()
42
  if not content:
43
  return {"text": "Error: Empty audio file"}
44
 
45
- # Load & Resample (Critical: Model expects 16,000Hz)
46
- audio_data, _ = librosa.load(io.BytesIO(content), sr=16000)
 
 
 
 
 
47
 
48
- # Prepare inputs
49
  inputs = processor(audio_data, sampling_rate=16000, return_tensors="pt").to(device)
50
 
51
- # Inference (inference_mode is more memory efficient than no_grad)
52
  with torch.inference_mode():
53
  logits = model(**inputs).logits
54
 
55
- # Decode output
56
  predicted_ids = torch.argmax(logits, dim=-1)
57
  transcription = processor.batch_decode(predicted_ids)[0]
58
 
 
34
  model.load_adapter("bam")
35
  print("Bambara adapter loaded. System Ready.")
36
 
37
+
38
  @app.post("/transcribe")
39
  async def transcribe(audio_file: UploadFile = File(...)):
40
  try:
41
+ # 1. Read the file into memory
42
  content = await audio_file.read()
43
  if not content:
44
  return {"text": "Error: Empty audio file"}
45
 
46
+ # 2. Convert to a file-like object
47
+ audio_fp = io.BytesIO(content)
48
+
49
+ # 3. Load & Resample
50
+ # By not specifying 'format', librosa uses ffmpeg to 'sniff' the file.
51
+ # This works for WebM, Ogg, WAV, etc., IF ffmpeg is in packages.txt
52
+ audio_data, _ = librosa.load(audio_fp, sr=16000)
53
 
54
+ # 4. Prepare inputs for the model
55
  inputs = processor(audio_data, sampling_rate=16000, return_tensors="pt").to(device)
56
 
57
+ # 5. Run the model
58
  with torch.inference_mode():
59
  logits = model(**inputs).logits
60
 
61
+ # 6. Decode output
62
  predicted_ids = torch.argmax(logits, dim=-1)
63
  transcription = processor.batch_decode(predicted_ids)[0]
64