Spaces:

arshenoy
/

somAI-media

Sleeping

arshenoy committed on Dec 3, 2025

Commit

b9aa307

verified ·

1 Parent(s): 14c3647

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -8,6 +8,7 @@ import io
 import base64
 import torchaudio
 import numpy as np
 print(">>> INITIALIZING SOMAI MEDIA NODE...")
@@ -16,12 +17,9 @@ MOONDREAM_REPO = "vikhyatk/moondream2"
 WHISPER_REPO = "distil-whisper/distil-small.en"
 app = FastAPI()
 app.add_middleware(
     CORSMiddleware,
-    allow_origins=["*"],
-    allow_credentials=True,
-    allow_methods=["*"],
     allow_headers=["*"],
 )
@@ -71,15 +69,27 @@ def vision(req: VisionRequest):
 @app.post("/transcribe")
 def transcribe(req: AudioRequest):
     if not whisper_model: raise HTTPException(503, "Audio Model Unavailable")
     try:
         audio_bytes = base64.b64decode(req.audio)
-        with open("temp.wav", "wb") as f: f.write(audio_bytes)
         import librosa
-        audio, _ = librosa.load("temp.wav", sr=16000)
         inputs = whisper_processor(audio, sampling_rate=16000, return_tensors="pt")
         generated_ids = whisper_model.generate(inputs["input_features"], max_new_tokens=128)
         text = whisper_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
         return {"text": text}
     except Exception as e:
         print(e)
-        return {"text": "Transcription failed."}

 import base64
 import torchaudio
 import numpy as np
+import os
 print(">>> INITIALIZING SOMAI MEDIA NODE...")
 WHISPER_REPO = "distil-whisper/distil-small.en"
 app = FastAPI()
 app.add_middleware(
     CORSMiddleware,
+    allow_origins=["*"],
     allow_headers=["*"],
 )
 @app.post("/transcribe")
 def transcribe(req: AudioRequest):
     if not whisper_model: raise HTTPException(503, "Audio Model Unavailable")
+    temp_wav_path = "temp.wav"
     try:
+        # Decode base64 and save to temp file
         audio_bytes = base64.b64decode(req.audio)
+        with open(temp_wav_path, "wb") as f: f.write(audio_bytes)
+        # Use librosa to load and resample (handles various audio formats via ffmpeg)
         import librosa
+        audio, _ = librosa.load(temp_wav_path, sr=16000)
+        # Process and transcribe
         inputs = whisper_processor(audio, sampling_rate=16000, return_tensors="pt")
         generated_ids = whisper_model.generate(inputs["input_features"], max_new_tokens=128)
         text = whisper_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
         return {"text": text}
     except Exception as e:
         print(e)
+        return {"text": "Transcription failed."}
+    finally:
+        # Cleanup temp file
+        if os.path.exists(temp_wav_path):
+            os.remove(temp_wav_path)