Gaoussin committed on
Commit
8bb18e8
·
verified ·
1 Parent(s): 7642f85

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +31 -59
app.py CHANGED
@@ -1,71 +1,43 @@
1
- import os
2
-
3
- os.environ["HF_HOME"] = "/tmp/hf"
4
- os.environ["TRANSFORMERS_CACHE"] = "/tmp/hf"
5
- os.environ["HF_DATASETS_CACHE"] = "/tmp/hf"
6
- os.makedirs("/tmp/hf", exist_ok=True)
7
-
8
- from fastapi import FastAPI, Query
9
- from fastapi.responses import StreamingResponse
10
- from transformers import VitsModel, AutoTokenizer
11
- import torch, scipy.io.wavfile as wavfile
12
  import io
13
- import edge_tts
14
-
15
-
16
- app = FastAPI(title="Bambara TTS API")
17
-
18
- # Load model once at startup
19
- model = VitsModel.from_pretrained("facebook/mms-tts-bam")
20
- tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-bam")
21
- sampling_rate = model.config.sampling_rate
22
-
23
-
24
@app.get("/tts/")
async def tts(text: str = Query(..., description="Bambara text to synthesize")):
    """Synthesize Bambara speech for *text* and stream it back as WAV audio."""
    # Tokenize the input and keep all tensors on the CPU for inference.
    encoded = tokenizer(text, return_tensors="pt")
    encoded = {key: tensor.to("cpu") for key, tensor in encoded.items()}

    # Inference only — gradients are unnecessary.
    with torch.no_grad():
        speech = model(**encoded).waveform[0]

    # Serialize the waveform into an in-memory WAV file instead of touching disk.
    wav_io = io.BytesIO()
    wavfile.write(wav_io, rate=sampling_rate, data=speech.numpy())
    wav_io.seek(0)

    return StreamingResponse(wav_io, media_type="audio/wav")
 
 
 
40
 
 
 
 
 
 
41
 
42
@app.get("/noneBmTts/")
async def noneBmTts(
    text: str = Query(..., description="Text to synthesize"),
    voice: str = Query(
        "fr-FR-DeniseNeural", description="Voice ID (e.g., en-US-GuyNeural)"
    ),
):
    """Synthesize *text* with Microsoft Edge TTS and stream the MP3 audio.

    Raises HTTPException(400) for invalid voices or when synthesis yields
    no audio data.
    """
    # BUG FIX: HTTPException is raised below but was never imported at module
    # level in this file, so every error path crashed with a NameError instead
    # of returning a clean 400 to the client.
    from fastapi import HTTPException

    try:
        # Build the Edge-TTS client for the requested text/voice pair.
        communicate = edge_tts.Communicate(text, voice)

        buffer = io.BytesIO()

        # Stream the audio chunks into the in-memory buffer.
        async for chunk in communicate.stream():
            if chunk["type"] == "audio":
                buffer.write(chunk["data"])

        # Edge TTS can complete without emitting audio (e.g. empty input).
        if buffer.tell() == 0:
            raise HTTPException(
                status_code=400, detail="Synthesis failed to produce audio."
            )

        buffer.seek(0)
        return StreamingResponse(buffer, media_type="audio/mpeg")

    except HTTPException:
        # BUG FIX: don't re-wrap the deliberate 400 above — the broad handler
        # below would otherwise stringify it into a new exception's detail.
        raise
    except Exception as e:
        # Catch errors like invalid voice names.
        raise HTTPException(status_code=400, detail=str(e))
 
1
+ from fastapi import FastAPI, File, UploadFile, HTTPException
2
+ from transformers import Wav2Vec2ForCTC, AutoProcessor
3
+ import torch
4
+ import librosa
 
 
 
 
 
 
 
5
  import io
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
 
7
app = FastAPI()

# Load the MMS speech-recognition checkpoint and its processor a single time
# at startup, so per-request latency only covers inference.
MODEL_ID = "facebook/mms-1b-all"
processor = AutoProcessor.from_pretrained(MODEL_ID)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)
13
 
14
@app.post("/transcribe")
async def transcribe(audio_file: UploadFile = File(...)):
    """Transcribe an uploaded audio file to Bambara text with MMS ASR.

    The upload is decoded and resampled to 16 kHz, run through the
    Wav2Vec2 CTC model with the Bambara ("bam") adapter, and the greedy
    decoding is returned as ``{"text": ...}``.

    Raises:
        HTTPException 400 if no file was uploaded (normally unreachable:
            ``File(...)`` already makes the field required).
        HTTPException 500 for any decoding/inference failure.
    """
    # Defensive check kept from the original implementation.
    if not audio_file:
        raise HTTPException(status_code=400, detail="No file uploaded")

    try:
        # Read the whole upload into memory; io.BytesIO lets librosa treat
        # the bytes like a file and resample to the 16 kHz the model expects.
        audio_bytes = await audio_file.read()
        audio_data, _ = librosa.load(io.BytesIO(audio_bytes), sr=16000)

        # PERF FIX: the original re-ran set_target_lang/load_adapter on every
        # request; load_adapter reloads adapter weights each call, which is
        # expensive. Configure the Bambara adapter exactly once, lazily.
        if not getattr(transcribe, "_bam_ready", False):
            processor.tokenizer.set_target_lang("bam")
            model.load_adapter("bam")
            transcribe._bam_ready = True

        # Greedy CTC decoding; no gradients needed for inference.
        inputs = processor(audio_data, sampling_rate=16_000, return_tensors="pt")
        with torch.no_grad():
            logits = model(**inputs).logits

        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = processor.batch_decode(predicted_ids)[0]

        return {"text": transcription}

    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error processing audio: {str(e)}")