Spaces:

Somalitts
/

STT_Api

Runtime error

App Files Files Community

Somalitts committed on Aug 9, 2025

Commit

4acc86d

verified ·

1 Parent(s): a81dacb

Update app.py

Browse files

Files changed (1) hide show

app.py +66 -25

app.py CHANGED Viewed

@@ -1,16 +1,18 @@
 import os
-os.environ["TRANSFORMERS_CACHE"] = "/tmp/hf-cache"  # Important for Docker
 from fastapi import FastAPI, UploadFile, File
 from fastapi.middleware.cors import CORSMiddleware
-import torchaudio
 import torch
 from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
-import io
 app = FastAPI()
-# Allow all origins (for Flutter)
 app.add_middleware(
     CORSMiddleware,
     allow_origins=["*"],
@@ -18,30 +20,69 @@ app.add_middleware(
     allow_headers=["*"],
 )
-# Load model
-processor = Wav2Vec2Processor.from_pretrained("Mustafaa4a/ASR-Somali")
-model = Wav2Vec2ForCTC.from_pretrained("Mustafaa4a/ASR-Somali")
 @app.get("/")
 async def root():
     return {"message": "Somali Speech-to-Text API is running."}
 @app.post("/transcribe")
 async def transcribe(file: UploadFile = File(...)):
-    audio_bytes = await file.read()
-    audio_stream = io.BytesIO(audio_bytes)
-    waveform, sample_rate = torchaudio.load(audio_stream)
-    if sample_rate != 16000:
-        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
-        waveform = resampler(waveform)
-    inputs = processor(waveform.squeeze(), sampling_rate=16000, return_tensors="pt")
-    with torch.no_grad():
-        logits = model(**inputs).logits
-    predicted_ids = torch.argmax(logits, dim=-1)
-    transcription = processor.decode(predicted_ids[0])
-    return {"transcription": transcription}

 import os
+import io
+import uuid
 from fastapi import FastAPI, UploadFile, File
 from fastapi.middleware.cors import CORSMiddleware
 import torch
+import torchaudio
 from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
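+# Points the Transformers cache at /tmp/hf-cache. Good practice, but less critical now
+# that the updated Dockerfile sets HF_HOME.
+# NOTE(review): this assignment runs after `transformers` has been imported above; the
+# library may already have resolved its cache path by then — confirm, or move it above
+# that import.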
+os.environ["TRANSFORMERS_CACHE"] = "/tmp/hf-cache"
 app = FastAPI()
+# CORS middleware for allowing requests from your mobile app
 app.add_middleware(
     CORSMiddleware,
     allow_origins=["*"],
     allow_headers=["*"],
 )
+# Load the ASR model and processor once at startup to save time on each request
+try:
+    processor = Wav2Vec2Processor.from_pretrained("Mustafaa4a/ASR-Somali")
+    model = Wav2Vec2ForCTC.from_pretrained("Mustafaa4a/ASR-Somali")
+except Exception as e:
+    # If the model fails to load, the app can't work.
+    # Log this for debugging. In a real app, you might exit or return an error state.
+    print(f"FATAL: Could not load model. Error: {e}")
+    model = None
+    processor = None
 @app.get("/")
 async def root():
+    """A simple endpoint to check if the API is running."""
     return {"message": "Somali Speech-to-Text API is running."}
 @app.post("/transcribe")
 async def transcribe(file: UploadFile = File(...)):
+    """
+    Receives an audio file, transcribes it, and returns the text.
+    """
+    if not model or not processor:
+        return {"error": "ASR model is not available."}
+    # Use a temporary file to reliably load the audio data.
+    # This helps torchaudio correctly identify the audio format.
+    temp_dir = "/tmp"
+    os.makedirs(temp_dir, exist_ok=True) # Ensure the directory exists
+    # Use the original filename from the upload to preserve the extension (e.g., .m4a, .wav)
+    temp_file_path = os.path.join(temp_dir, f"{uuid.uuid4()}_{file.filename}")
+    try:
+        # Save the uploaded file's content to the temporary file
+        with open(temp_file_path, "wb") as buffer:
+            buffer.write(await file.read())
+        # Load the audio using the file path, which is more reliable
+        waveform, sample_rate = torchaudio.load(temp_file_path)
+        # Resample the audio to the 16kHz required by the model
+        if sample_rate != 16000:
+            resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
+            waveform = resampler(waveform)
+        # Process the audio waveform
+        # .squeeze() removes any redundant channels/dimensions
+        inputs = processor(waveform.squeeze(), sampling_rate=16000, return_tensors="pt", padding=True)
+        # Perform inference
+        with torch.no_grad():
+            logits = model(inputs.input_values).logits
+        # Decode the model's output to text
+        predicted_ids = torch.argmax(logits, dim=-1)
+        transcription = processor.batch_decode(predicted_ids)[0]
+        return {"transcription": transcription.upper()} # Returning in uppercase is common for ASR
+    except Exception as e:
+        # If anything goes wrong during processing, return a specific error
+        # This helps in debugging on the mobile client side
+        return {"error": f"Failed to process audio file. Reason: {str(e)}"}
+    finally:
+        # Clean up the temporary file after processing is complete
+        if os.path.exists(temp_file_path):
+            os.remove(temp_file_path)