Spaces:

Kora3
/

AI-API

Running

App Files Files Community

Anicet committed 21 days ago

Commit

7e3c986

1 Parent(s): d6e0e11

update: add translate, TTS and STT support

Browse files

Files changed (10) hide show

.gitignore +1 -0
Dockerfile +11 -0
README.md +1 -0
functions/speech_to_text.py +21 -0
functions/text_to_speech.py +17 -0
functions/translation.py +24 -0
language/mos_stt.py +33 -0
language/mos_tts.py +28 -0
main.py +81 -0
requirements.txt +65 -0

.gitignore ADDED Viewed

	@@ -0,0 +1 @@


1	+ venv

Dockerfile ADDED Viewed

	@@ -0,0 +1,11 @@

+FROM python:3.10-slim
+WORKDIR /app
+# Remove the apt package lists in the same layer so they are not baked into the image.
+RUN apt-get update && apt-get install -y git ffmpeg && rm -rf /var/lib/apt/lists/*
+# Copy only the dependency manifest first: the pip layer below is then reused
+# from cache whenever application code changes but requirements.txt does not.
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+# Application sources are copied last so editing them does not trigger a reinstall.
+COPY . .
+CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]

README.md CHANGED Viewed

@@ -6,6 +6,7 @@ colorTo: purple
 sdk: docker
 pinned: false
 short_description: Common AI API
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 sdk: docker
 pinned: false
 short_description: Common AI API
+license: apache-2.0
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

functions/speech_to_text.py ADDED Viewed

	@@ -0,0 +1,21 @@

+from faster_whisper import WhisperModel
+import base64, tempfile, os
+# Whisper "base" model, instantiated once at import time; speechToText below
+# reuses this module-level instance for every call.
+model = WhisperModel("base")
+def speechToText(audioBase64: str, sourceLang: str) -> dict:
+    tempAudioPath = None
+    try:
+        audioBytes = base64.b64decode(audioBase64)
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".m4a") as tempFile:
+            tempFile.write(audioBytes)
+            tempAudioPath = tempFile.name
+        segments, info = model.transcribe(tempAudioPath, language=sourceLang)
+        text = " ".join(segment.text for segment in segments)
+        return {'text': text, 'language': info.language, 'duration': info.duration}
+    finally:
+        if tempAudioPath and os.path.exists(tempAudioPath):
+            os.remove(tempAudioPath)

functions/text_to_speech.py ADDED Viewed

	@@ -0,0 +1,17 @@

+import time, edge_tts, base64, os
+async def textToSpeech(text: str, voice: str) -> str:
+    outputFile = f"temp_{int(time.time())}.mp3"
+    try:
+        tts = edge_tts.Communicate(text=text, voice=voice)
+        await tts.save(outputFile)
+        with open(outputFile, "rb") as file:
+            audioBytes = file.read()
+        audioBase64 = base64.b64encode(audioBytes).decode("utf-8")
+        return audioBase64
+    finally:
+        if os.path.exists(outputFile):
+            os.remove(outputFile)

functions/translation.py ADDED Viewed

	@@ -0,0 +1,24 @@

+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+import torch
+# Checkpoint identifier passed to both from_pretrained calls below.
+MODEL_NAME = "facebook/nllb-200-distilled-600M"
+# Tokenizer and seq2seq model are loaded once at import time; translateText
+# uses these module-level objects on every call.
+tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
+# Switch the model to evaluation mode; this module only runs generation.
+model.eval()
+def translateText(text: str, sourceLang: str, targetLang: str) -> str:
+    tokenizer.src_lang = sourceLang
+    inputs = tokenizer(text, return_tensors="pt")
+    with torch.no_grad():
+        tokens = model.generate(
+            **inputs,
+            forced_bos_token_id=tokenizer.convert_tokens_to_ids(targetLang),
+            num_beams=1,
+            max_length=128
+        )
+    translatedText = tokenizer.batch_decode(tokens, skip_special_tokens=True)[0]
+    return translatedText

language/mos_stt.py ADDED Viewed

	@@ -0,0 +1,33 @@

+import base64, tempfile, os
+from transformers import pipeline
+import soundfile as sf
+# Checkpoint identifier used for the speech-recognition pipeline below.
+MODEL_NAME = "facebook/mms-1b-all"
+# ASR pipeline built once at import time and reused by mooreSTT; the "mos"
+# target language is forwarded to the model through model_kwargs.
+pipe = pipeline("automatic-speech-recognition", model=MODEL_NAME, model_kwargs={"target_lang": "mos"})
+def mooreSTT(audioBase64: str) -> dict:
+    audioBytes = base64.b64decode(audioBase64)
+    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tempFile:
+        tempFile.write(audioBytes)
+        tempAudioPath = tempFile.name
+    try:
+        result = pipe(tempAudioPath)
+        text = result["text"]
+        duration = getAudioDuration(tempAudioPath)
+    finally:
+        os.remove(tempAudioPath)
+    return {'text': text, 'language': 'mos', 'duration': duration}
+def getAudioDuration(filePath: str) -> float:
+    try:
+        data, samplerate = sf.read(filePath)
+        duration = len(data) / samplerate
+        return duration
+    except Exception as e:
+        print(f"Error getting audio duration: {e}")
+        return 0.0

language/mos_tts.py ADDED Viewed

	@@ -0,0 +1,28 @@

+import torch, base64, tempfile, os
+import scipy.io.wavfile as wavfile
+from transformers import VitsModel, VitsTokenizer
+# Checkpoint identifier passed to both from_pretrained calls below.
+MODEL_NAME = "facebook/mms-tts-mos"
+# Tokenizer and VITS model are loaded once at import time; mooreTTS uses
+# these module-level objects on every call.
+tokenizer = VitsTokenizer.from_pretrained(MODEL_NAME)
+model = VitsModel.from_pretrained(MODEL_NAME)
+def mooreTTS(text: str) -> str:
+    inputs = tokenizer(text, return_tensors="pt")
+    with torch.no_grad():
+        output = model(**inputs)
+    waveform = output.waveform[0].cpu().numpy()
+    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tempFile:
+        wavfile.write(tempFile.name, rate=model.config.sampling_rate, data=waveform)
+        tempAudioPath = tempFile.name
+    try:
+        with open(tempAudioPath, "rb") as file:
+            audioBase64 = base64.b64encode(file.read()).decode("utf-8")
+    finally:
+        os.remove(tempAudioPath)
+    return audioBase64

main.py ADDED Viewed

	@@ -0,0 +1,81 @@

+from fastapi import FastAPI, Request, HTTPException
+from functions.translation import translateText
+from functions.speech_to_text import speechToText
+from functions.text_to_speech import textToSpeech
+from language.mos_stt import mooreSTT
+from language.mos_tts import mooreTTS
+# FastAPI application object; the route decorators below register on it.
+# It is configured with root_path '/api'.
+app = FastAPI(
+    version='1.0.0',
+    root_path='/api',
+)
+@app.post("/translateText")
+async def translate(request: Request):
+    body: dict = await request.json()
+    try:
+        text = body.get('text')
+        sourceLang = body.get('sourceLang')
+        targetLang = body.get('targetLang')
+        translatedText = translateText(text=text, sourceLang=sourceLang, targetLang=targetLang)
+        return { 'translatedText': translatedText }
+    except Exception as e:
+        print(f"Translate error: {e}")
+        raise HTTPException(status_code=400, detail=f"Translate error: {e}")
+@app.post("/speechToText")
+async def convertSpeechToText(request: Request):
+    body: dict = await request.json()
+    try:
+        audioBase64 = body.get('audioBase64')
+        sourceLang = body.get('sourceLang')
+        data = speechToText(audioBase64=audioBase64, sourceLang=sourceLang)
+        return data
+    except Exception as e:
+        print(f"STT error: {e}")
+        raise HTTPException(status_code=400, detail=f"STT error: {e}")
+@app.post("/textToSpeech")
+async def convertTextToSpeech(request: Request):
+    body: dict = await request.json()
+    try:
+        text = body.get('text')
+        voice = body.get('voice')
+        audioBase64 = await textToSpeech(text=text, voice=voice)
+        return { 'audioBase64': audioBase64 }
+    except Exception as e:
+        print(f"TTS error: {e}")
+        raise HTTPException(status_code=400, detail=f"TTS error: {e}")
+@app.post("/moore/speechToText")
+async def mooreSpeechToText(request: Request):
+    body: dict = await request.json()
+    try:
+        audioBase64 = body.get('audioBase64')
+        data = mooreSTT(audioBase64=audioBase64)
+        return data
+    except Exception as e:
+        print(f"STT error: {e}")
+        raise HTTPException(status_code=400, detail=f"STT error: {e}")
+@app.post("/moore/textToSpeech")
+async def mooreTextToSpeech(request: Request):
+    body: dict = await request.json()
+    try:
+        text = body.get('text')
+        audioBase64 = mooreTTS(text=text)
+        return { 'audioBase64': audioBase64 }
+    except Exception as e:
+        print(f"TTS error: {e}")
+        raise HTTPException(status_code=400, detail=f"TTS error: {e}")

requirements.txt ADDED Viewed

	@@ -0,0 +1,65 @@

+aiohappyeyeballs==2.6.2
+aiohttp==3.14.1
+aiosignal==1.4.0
+annotated-doc==0.0.4
+annotated-types==0.7.0
+anyio==4.13.0
+async-timeout==5.0.1
+attrs==26.1.0
+av==17.1.0
+certifi==2026.5.20
+cffi==2.0.0
+click==8.4.1
+coloredlogs==15.0.1
+ctranslate2==4.8.0
+edge-tts==7.2.8
+exceptiongroup==1.3.1
+fastapi==0.136.3
+faster-whisper==1.2.1
+filelock==3.29.1
+flatbuffers==25.12.19
+frozenlist==1.8.0
+fsspec==2026.4.0
+h11==0.16.0
+hf-xet==1.5.0
+httpcore==1.0.9
+httpx==0.28.1
+huggingface_hub==1.18.0
+humanfriendly==10.0
+idna==3.18
+Jinja2==3.1.6
+markdown-it-py==4.2.0
+MarkupSafe==3.0.3
+mdurl==0.1.2
+mpmath==1.3.0
+multidict==6.7.1
+networkx==3.4.2
+numpy==2.2.6
+onnxruntime==1.23.2
+packaging==26.2
+propcache==0.5.2
+protobuf==7.35.0
+pycparser==3.0
+pydantic==2.13.4
+pydantic_core==2.46.4
+Pygments==2.20.0
+PyYAML==6.0.3
+regex==2026.5.9
+rich==15.0.0
+safetensors==0.7.0
+scipy==1.15.3
+sentencepiece==0.2.1
+shellingham==1.5.4
+soundfile==0.14.0
+starlette==1.2.1
+sympy==1.14.0
+tabulate==0.10.0
+tokenizers==0.22.2
+torch==2.12.0
+tqdm==4.68.1
+transformers==5.10.2
+typer==0.25.1
+typing-inspection==0.4.2
+typing_extensions==4.15.0
+uvicorn==0.49.0
+yarl==1.24.2