Somalitts committed on
Commit
c8ea8ab
·
verified ·
1 Parent(s): 900e386

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +25 -66
app.py CHANGED
@@ -1,18 +1,16 @@
1
  import os
2
- import io
3
- import uuid
4
  from fastapi import FastAPI, UploadFile, File
5
  from fastapi.middleware.cors import CORSMiddleware
6
- import torch
7
  import torchaudio
 
8
  from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
9
-
10
- # This line is good practice but less critical now with the updated Dockerfile using HF_HOME
11
- os.environ["TRANSFORMERS_CACHE"] = "/tmp/hf-cache"
12
 
13
  app = FastAPI()
14
 
15
- # CORS middleware for allowing requests from your mobile app
16
  app.add_middleware(
17
  CORSMiddleware,
18
  allow_origins=["*"],
@@ -20,69 +18,30 @@ app.add_middleware(
20
  allow_headers=["*"],
21
  )
22
 
23
- # Load the ASR model and processor once at startup to save time on each request
24
- try:
25
- processor = Wav2Vec2Processor.from_pretrained("Mustafaa4a/ASR-Somali")
26
- model = Wav2Vec2ForCTC.from_pretrained("Mustafaa4a/ASR-Somali")
27
- except Exception as e:
28
- # If the model fails to load, the app can't work.
29
- # Log this for debugging. In a real app, you might exit or return an error state.
30
- print(f"FATAL: Could not load model. Error: {e}")
31
- model = None
32
- processor = None
33
 
34
  @app.get("/")
35
  async def root():
36
- """A simple endpoint to check if the API is running."""
37
  return {"message": "Somali Speech-to-Text API is running."}
38
 
39
  @app.post("/transcribe")
40
  async def transcribe(file: UploadFile = File(...)):
41
- """
42
- Receives an audio file, transcribes it, and returns the text.
43
- """
44
- if not model or not processor:
45
- return {"error": "ASR model is not available."}
46
-
47
- # Use a temporary file to reliably load the audio data.
48
- # This helps torchaudio correctly identify the audio format.
49
- temp_dir = "/tmp"
50
- os.makedirs(temp_dir, exist_ok=True) # Ensure the directory exists
51
- # Use the original filename from the upload to preserve the extension (e.g., .m4a, .wav)
52
- temp_file_path = os.path.join(temp_dir, f"{uuid.uuid4()}_{file.filename}")
53
-
54
- try:
55
- # Save the uploaded file's content to the temporary file
56
- with open(temp_file_path, "wb") as buffer:
57
- buffer.write(await file.read())
58
-
59
- # Load the audio using the file path, which is more reliable
60
- waveform, sample_rate = torchaudio.load(temp_file_path)
61
-
62
- # Resample the audio to the 16kHz required by the model
63
- if sample_rate != 16000:
64
- resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
65
- waveform = resampler(waveform)
66
-
67
- # Process the audio waveform
68
- # .squeeze() removes any redundant channels/dimensions
69
- inputs = processor(waveform.squeeze(), sampling_rate=16000, return_tensors="pt", padding=True)
70
-
71
- # Perform inference
72
- with torch.no_grad():
73
- logits = model(inputs.input_values).logits
74
-
75
- # Decode the model's output to text
76
- predicted_ids = torch.argmax(logits, dim=-1)
77
- transcription = processor.batch_decode(predicted_ids)[0]
78
-
79
- return {"transcription": transcription.upper()} # Returning in uppercase is common for ASR
80
-
81
- except Exception as e:
82
- # If anything goes wrong during processing, return a specific error
83
- # This helps in debugging on the mobile client side
84
- return {"error": f"Failed to process audio file. Reason: {str(e)}"}
85
- finally:
86
- # Clean up the temporary file after processing is complete
87
- if os.path.exists(temp_file_path):
88
- os.remove(temp_file_path)
 
1
  import os
2
+ os.environ["TRANSFORMERS_CACHE"] = "/tmp/hf-cache" # Important for Docker
3
+
4
  from fastapi import FastAPI, UploadFile, File
5
  from fastapi.middleware.cors import CORSMiddleware
 
6
  import torchaudio
7
+ import torch
8
  from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
9
+ import io
 
 
10
 
11
# Create the FastAPI application instance served by uvicorn.
app = FastAPI()

# Allow all cross-origin requests so the Flutter mobile client can call
# this API from any origin. Wide-open CORS is acceptable for a public
# demo endpoint; tighten `allow_origins` for production.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    # NOTE(review): the diff collapses one unchanged line here —
    # presumably allow_methods=["*"]; confirm against the full file.
    allow_headers=["*"],
)
20
 
21
# Load the Somali ASR checkpoint and its feature processor ONCE at module
# import, so every request reuses the same in-memory weights instead of
# reloading per call. First run downloads to the cache directory set via
# TRANSFORMERS_CACHE above; `from_pretrained` returns the model already in
# eval mode, so no explicit model.eval() is needed here.
processor = Wav2Vec2Processor.from_pretrained("Mustafaa4a/ASR-Somali")
model = Wav2Vec2ForCTC.from_pretrained("Mustafaa4a/ASR-Somali")
 
 
 
 
 
 
 
24
 
25
@app.get("/")
async def root():
    """Liveness probe: confirms the API process is up and serving."""
    status_payload = {"message": "Somali Speech-to-Text API is running."}
    return status_payload
28
 
29
  @app.post("/transcribe")
30
  async def transcribe(file: UploadFile = File(...)):
31
+ audio_bytes = await file.read()
32
+ audio_stream = io.BytesIO(audio_bytes)
33
+
34
+ waveform, sample_rate = torchaudio.load(audio_stream)
35
+
36
+ if sample_rate != 16000:
37
+ resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
38
+ waveform = resampler(waveform)
39
+
40
+ inputs = processor(waveform.squeeze(), sampling_rate=16000, return_tensors="pt")
41
+
42
+ with torch.no_grad():
43
+ logits = model(**inputs).logits
44
+
45
+ predicted_ids = torch.argmax(logits, dim=-1)
46
+ transcription = processor.decode(predicted_ids[0])
47
+ return {"transcription": transcription}