arshenoy committed on
Commit
b9aa307
·
verified ·
1 Parent(s): 14c3647

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +17 -7
app.py CHANGED
@@ -8,6 +8,7 @@ import io
8
  import base64
9
  import torchaudio
10
  import numpy as np
 
11
 
12
  print(">>> INITIALIZING SOMAI MEDIA NODE...")
13
 
@@ -16,12 +17,9 @@ MOONDREAM_REPO = "vikhyatk/moondream2"
16
  WHISPER_REPO = "distil-whisper/distil-small.en"
17
 
18
  app = FastAPI()
19
-
20
  app.add_middleware(
21
  CORSMiddleware,
22
- allow_origins=["*"],
23
- allow_credentials=True,
24
- allow_methods=["*"],
25
  allow_headers=["*"],
26
  )
27
 
@@ -71,15 +69,27 @@ def vision(req: VisionRequest):
71
  @app.post("/transcribe")
72
  def transcribe(req: AudioRequest):
73
  if not whisper_model: raise HTTPException(503, "Audio Model Unavailable")
 
 
74
  try:
 
75
  audio_bytes = base64.b64decode(req.audio)
76
- with open("temp.wav", "wb") as f: f.write(audio_bytes)
 
 
77
  import librosa
78
- audio, _ = librosa.load("temp.wav", sr=16000)
 
 
79
  inputs = whisper_processor(audio, sampling_rate=16000, return_tensors="pt")
80
  generated_ids = whisper_model.generate(inputs["input_features"], max_new_tokens=128)
81
  text = whisper_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
 
82
  return {"text": text}
83
  except Exception as e:
84
  print(e)
85
- return {"text": "Transcription failed."}
 
 
 
 
 
8
  import base64
9
  import torchaudio
10
  import numpy as np
11
+ import os
12
 
13
  print(">>> INITIALIZING SOMAI MEDIA NODE...")
14
 
 
17
  WHISPER_REPO = "distil-whisper/distil-small.en"
18
 
19
  app = FastAPI()
 
20
  app.add_middleware(
21
  CORSMiddleware,
22
+ allow_origins=["*"],
 
 
23
  allow_headers=["*"],
24
  )
25
 
 
69
  @app.post("/transcribe")
70
  def transcribe(req: AudioRequest):
71
  if not whisper_model: raise HTTPException(503, "Audio Model Unavailable")
72
+
73
+ temp_wav_path = "temp.wav"
74
  try:
75
+ # Decode base64 and save to temp file
76
  audio_bytes = base64.b64decode(req.audio)
77
+ with open(temp_wav_path, "wb") as f: f.write(audio_bytes)
78
+
79
+ # Use librosa to load and resample (handles various audio formats via ffmpeg)
80
  import librosa
81
+ audio, _ = librosa.load(temp_wav_path, sr=16000)
82
+
83
+ # Process and transcribe
84
  inputs = whisper_processor(audio, sampling_rate=16000, return_tensors="pt")
85
  generated_ids = whisper_model.generate(inputs["input_features"], max_new_tokens=128)
86
  text = whisper_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
87
+
88
  return {"text": text}
89
  except Exception as e:
90
  print(e)
91
+ return {"text": "Transcription failed."}
92
+ finally:
93
+ # Cleanup temp file
94
+ if os.path.exists(temp_wav_path):
95
+ os.remove(temp_wav_path)