Anicet committed on
Commit
7e3c986
·
1 Parent(s): d6e0e11

update: add translate, TTS and STT support

Browse files
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ venv
Dockerfile ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.10-slim

WORKDIR /app

# System packages: install only what is asked for and drop the apt package
# lists afterwards so they are not baked into the image layer.
RUN apt-get update \
    && apt-get install -y --no-install-recommends git ffmpeg \
    && rm -rf /var/lib/apt/lists/*

# Install Python dependencies before copying the application sources so this
# layer is reused from cache when only the code changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY . .

CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
README.md CHANGED
@@ -6,6 +6,7 @@ colorTo: purple
6
  sdk: docker
7
  pinned: false
8
  short_description: Common AI API
 
9
  ---
10
 
11
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
6
  sdk: docker
7
  pinned: false
8
  short_description: Common AI API
9
+ license: apache-2.0
10
  ---
11
 
12
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
functions/speech_to_text.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from faster_whisper import WhisperModel
2
+ import base64, tempfile, os
3
+
4
# Whisper "base" model, instantiated once at import time and reused by every call.
model = WhisperModel("base")


def speechToText(audioBase64: str, sourceLang: str) -> dict:
    """Transcribe base64-encoded audio with the module-level Whisper model.

    Args:
        audioBase64: Content of an audio file, base64-encoded.
        sourceLang: Language code forwarded to ``model.transcribe`` as
            ``language``.

    Returns:
        A dict with the keys ``'text'`` (the transcription), ``'language'``
        and ``'duration'`` (both taken from the transcription info object).

    Raises:
        Whatever ``base64.b64decode`` or ``model.transcribe`` raise on bad
        input; the temporary file is removed in every case.
    """
    tempAudioPath = None
    try:
        audioBytes = base64.b64decode(audioBase64)

        # delete=False: the file has to outlive the ``with`` block because the
        # model opens it by path; it is removed in ``finally`` below.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".m4a") as tempFile:
            tempFile.write(audioBytes)
            tempAudioPath = tempFile.name

        segments, info = model.transcribe(tempAudioPath, language=sourceLang)

        # Segment texts usually carry their own leading space, so joining them
        # unstripped with " " yields a leading blank and doubled spaces.
        pieces = (segment.text.strip() for segment in segments)
        text = " ".join(piece for piece in pieces if piece)
        return {'text': text, 'language': info.language, 'duration': info.duration}
    finally:
        if tempAudioPath and os.path.exists(tempAudioPath):
            os.remove(tempAudioPath)
functions/text_to_speech.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time, edge_tts, base64, os
2
+
3
+
4
async def textToSpeech(text: str, voice: str) -> str:
    """Synthesize ``text`` with edge-tts and return the MP3 as base64.

    Args:
        text: Text to speak.
        voice: Voice identifier passed to ``edge_tts.Communicate``.

    Returns:
        The generated MP3 file content, base64-encoded as a UTF-8 string.
    """
    # Local import: this module's header does not import tempfile.
    import tempfile

    # A second-resolution timestamp name collides when two requests arrive
    # within the same second (one request would read or delete the other's
    # file). mkstemp guarantees a unique path per call.
    fileDescriptor, outputFile = tempfile.mkstemp(prefix="temp_", suffix=".mp3")
    os.close(fileDescriptor)  # edge-tts writes to the file by path
    try:
        tts = edge_tts.Communicate(text=text, voice=voice)
        await tts.save(outputFile)

        with open(outputFile, "rb") as file:
            audioBytes = file.read()

        return base64.b64encode(audioBytes).decode("utf-8")
    finally:
        if os.path.exists(outputFile):
            os.remove(outputFile)
functions/translation.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
2
+ import torch
3
+
4
MODEL_NAME = "facebook/nllb-200-distilled-600M"
# Tokenizer and model are loaded once at import time and shared by all calls.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

model.eval()


def _languageTokenId(langCode: str, role: str) -> int:
    """Return the tokenizer id for a language code, rejecting unknown codes."""
    tokenId = tokenizer.convert_tokens_to_ids(langCode) if isinstance(langCode, str) else None
    # An unrecognised token resolves to the unknown-token id; using that as a
    # language tag would silently produce a meaningless translation.
    if tokenId is None or tokenId == tokenizer.unk_token_id:
        raise ValueError(f"Unsupported {role} language code: {langCode!r}")
    return tokenId


def translateText(text: str, sourceLang: str, targetLang: str) -> str:
    """Translate ``text`` with the module-level NLLB model.

    Args:
        text: Text to translate.
        sourceLang: Language code of ``text``; must be a token known to the
            tokenizer (presumably NLLB codes such as ``fra_Latn`` - confirm
            against the model card).
        targetLang: Language code to translate into; same format.

    Returns:
        The decoded translation of the single input.

    Raises:
        ValueError: If either language code is not known to the tokenizer.
    """
    _languageTokenId(sourceLang, "source")
    targetLangId = _languageTokenId(targetLang, "target")

    # NOTE: this mutates the shared tokenizer's source language.
    tokenizer.src_lang = sourceLang
    inputs = tokenizer(text, return_tensors="pt")

    with torch.no_grad():
        tokens = model.generate(
            **inputs,
            forced_bos_token_id=targetLangId,
            num_beams=1,
            max_length=128
        )

    translatedText = tokenizer.batch_decode(tokens, skip_special_tokens=True)[0]
    return translatedText
language/mos_stt.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64, tempfile, os
2
+ from transformers import pipeline
3
+ import soundfile as sf
4
+
5
MODEL_NAME = "facebook/mms-1b-all"
# Speech-recognition pipeline built once at import time; "mos" is forwarded to
# the model loader as target_lang.
pipe = pipeline("automatic-speech-recognition", model=MODEL_NAME, model_kwargs={"target_lang": "mos"})


def mooreSTT(audioBase64: str) -> dict:
    """Transcribe base64-encoded audio with the module-level ASR pipeline.

    Args:
        audioBase64: Content of an audio file, base64-encoded.

    Returns:
        A dict with the keys ``'text'`` (pipeline transcription),
        ``'language'`` (always ``'mos'``) and ``'duration'`` (from
        ``getAudioDuration``).
    """
    audioBytes = base64.b64decode(audioBase64)

    tempAudioPath = None
    try:
        # delete=False: the pipeline opens the file by path after the handle
        # is closed. The path is recorded before writing so that a failed
        # write is still cleaned up in ``finally``.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tempFile:
            tempAudioPath = tempFile.name
            tempFile.write(audioBytes)

        result = pipe(tempAudioPath)
        text = result["text"]
        duration = getAudioDuration(tempAudioPath)
    finally:
        if tempAudioPath is not None and os.path.exists(tempAudioPath):
            os.remove(tempAudioPath)

    return {'text': text, 'language': 'mos', 'duration': duration}
24
+
25
+
26
def getAudioDuration(filePath: str) -> float:
    """Return the number of samples read from ``filePath`` divided by its sample rate.

    Any exception raised while reading or computing is printed and the
    function falls back to ``0.0``.
    """
    try:
        samples, sampleRate = sf.read(filePath)
        return len(samples) / sampleRate
    except Exception as error:
        print(f"Error getting audio duration: {error}")
    return 0.0
language/mos_tts.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch, base64, tempfile, os
2
+ import scipy.io.wavfile as wavfile
3
+ from transformers import VitsModel, VitsTokenizer
4
+
5
MODEL_NAME = "facebook/mms-tts-mos"
# Tokenizer and model are loaded once at import time and shared by all calls.
tokenizer = VitsTokenizer.from_pretrained(MODEL_NAME)
model = VitsModel.from_pretrained(MODEL_NAME)


def mooreTTS(text: str) -> str:
    """Synthesize ``text`` with the module-level VITS model.

    Args:
        text: Text to speak.

    Returns:
        A WAV file (sample rate taken from ``model.config.sampling_rate``),
        base64-encoded as a UTF-8 string.
    """
    # Local import: this module's header does not import io.
    import io

    inputs = tokenizer(text, return_tensors="pt")

    with torch.no_grad():
        output = model(**inputs)

    waveform = output.waveform[0].cpu().numpy()

    # Encode the WAV in memory: wavfile.write accepts an open file-like
    # object, so no temporary file is needed (and none can be leaked if
    # writing fails).
    wavBuffer = io.BytesIO()
    wavfile.write(wavBuffer, rate=model.config.sampling_rate, data=waveform)

    return base64.b64encode(wavBuffer.getvalue()).decode("utf-8")
main.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, Request, HTTPException
2
+ from functions.translation import translateText
3
+ from functions.speech_to_text import speechToText
4
+ from functions.text_to_speech import textToSpeech
5
+ from language.mos_stt import mooreSTT
6
+ from language.mos_tts import mooreTTS
7
+
8
+
9
# ASGI application object; the route handlers below register themselves on it.
app = FastAPI(version='1.0.0', root_path='/api')
13
+
14
+
15
@app.post("/translateText")
async def translate(request: Request):
    """Translate the text given in the JSON request body.

    Reads the keys ``'text'``, ``'sourceLang'`` and ``'targetLang'`` from the
    body and responds with ``{'translatedText': ...}``.

    Raises:
        HTTPException: 400 for any failure, including a body that is not
            valid JSON.
    """
    try:
        # Parsed inside the try so that malformed JSON is reported as a 400
        # like every other failure instead of surfacing as an unhandled 500.
        body: dict = await request.json()

        translatedText = translateText(
            text=body.get('text'),
            sourceLang=body.get('sourceLang'),
            targetLang=body.get('targetLang'),
        )
        return {'translatedText': translatedText}
    except Exception as e:
        print(f"Translate error: {e}")
        raise HTTPException(status_code=400, detail=f"Translate error: {e}") from e
28
+
29
+
30
@app.post("/speechToText")
async def convertSpeechToText(request: Request):
    """Transcribe the audio given in the JSON request body.

    Reads the keys ``'audioBase64'`` and ``'sourceLang'`` from the body and
    responds with the dict returned by ``speechToText``.

    Raises:
        HTTPException: 400 for any failure, including a body that is not
            valid JSON.
    """
    try:
        # Parsed inside the try so that malformed JSON is reported as a 400
        # like every other failure instead of surfacing as an unhandled 500.
        body: dict = await request.json()

        return speechToText(
            audioBase64=body.get('audioBase64'),
            sourceLang=body.get('sourceLang'),
        )
    except Exception as e:
        print(f"STT error: {e}")
        raise HTTPException(status_code=400, detail=f"STT error: {e}") from e
42
+
43
+
44
@app.post("/textToSpeech")
async def convertTextToSpeech(request: Request):
    """Synthesize speech for the text given in the JSON request body.

    Reads the keys ``'text'`` and ``'voice'`` from the body and responds with
    ``{'audioBase64': ...}``.

    Raises:
        HTTPException: 400 for any failure, including a body that is not
            valid JSON.
    """
    try:
        # Parsed inside the try so that malformed JSON is reported as a 400
        # like every other failure instead of surfacing as an unhandled 500.
        body: dict = await request.json()

        audioBase64 = await textToSpeech(
            text=body.get('text'),
            voice=body.get('voice'),
        )
        return {'audioBase64': audioBase64}
    except Exception as e:
        print(f"TTS error: {e}")
        raise HTTPException(status_code=400, detail=f"TTS error: {e}") from e
56
+
57
+
58
@app.post("/moore/speechToText")
async def mooreSpeechToText(request: Request):
    """Transcribe the audio given in the JSON request body via ``mooreSTT``.

    Reads the key ``'audioBase64'`` from the body and responds with the dict
    returned by ``mooreSTT``.

    Raises:
        HTTPException: 400 for any failure, including a body that is not
            valid JSON.
    """
    try:
        # Parsed inside the try so that malformed JSON is reported as a 400
        # like every other failure instead of surfacing as an unhandled 500.
        body: dict = await request.json()

        return mooreSTT(audioBase64=body.get('audioBase64'))
    except Exception as e:
        print(f"STT error: {e}")
        raise HTTPException(status_code=400, detail=f"STT error: {e}") from e
69
+
70
+
71
@app.post("/moore/textToSpeech")
async def mooreTextToSpeech(request: Request):
    """Synthesize speech for the text given in the JSON request body via ``mooreTTS``.

    Reads the key ``'text'`` from the body and responds with
    ``{'audioBase64': ...}``.

    Raises:
        HTTPException: 400 for any failure, including a body that is not
            valid JSON.
    """
    try:
        # Parsed inside the try so that malformed JSON is reported as a 400
        # like every other failure instead of surfacing as an unhandled 500.
        body: dict = await request.json()

        audioBase64 = mooreTTS(text=body.get('text'))
        return {'audioBase64': audioBase64}
    except Exception as e:
        print(f"TTS error: {e}")
        raise HTTPException(status_code=400, detail=f"TTS error: {e}") from e
requirements.txt ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiohappyeyeballs==2.6.2
2
+ aiohttp==3.14.1
3
+ aiosignal==1.4.0
4
+ annotated-doc==0.0.4
5
+ annotated-types==0.7.0
6
+ anyio==4.13.0
7
+ async-timeout==5.0.1
8
+ attrs==26.1.0
9
+ av==17.1.0
10
+ certifi==2026.5.20
11
+ cffi==2.0.0
12
+ click==8.4.1
13
+ coloredlogs==15.0.1
14
+ ctranslate2==4.8.0
15
+ edge-tts==7.2.8
16
+ exceptiongroup==1.3.1
17
+ fastapi==0.136.3
18
+ faster-whisper==1.2.1
19
+ filelock==3.29.1
20
+ flatbuffers==25.12.19
21
+ frozenlist==1.8.0
22
+ fsspec==2026.4.0
23
+ h11==0.16.0
24
+ hf-xet==1.5.0
25
+ httpcore==1.0.9
26
+ httpx==0.28.1
27
+ huggingface_hub==1.18.0
28
+ humanfriendly==10.0
29
+ idna==3.18
30
+ Jinja2==3.1.6
31
+ markdown-it-py==4.2.0
32
+ MarkupSafe==3.0.3
33
+ mdurl==0.1.2
34
+ mpmath==1.3.0
35
+ multidict==6.7.1
36
+ networkx==3.4.2
37
+ numpy==2.2.6
38
+ onnxruntime==1.23.2
39
+ packaging==26.2
40
+ propcache==0.5.2
41
+ protobuf==7.35.0
42
+ pycparser==3.0
43
+ pydantic==2.13.4
44
+ pydantic_core==2.46.4
45
+ Pygments==2.20.0
46
+ PyYAML==6.0.3
47
+ regex==2026.5.9
48
+ rich==15.0.0
49
+ safetensors==0.7.0
50
+ scipy==1.15.3
51
+ sentencepiece==0.2.1
52
+ shellingham==1.5.4
53
+ soundfile==0.14.0
54
+ starlette==1.2.1
55
+ sympy==1.14.0
56
+ tabulate==0.10.0
57
+ tokenizers==0.22.2
58
+ torch==2.12.0
59
+ tqdm==4.68.1
60
+ transformers==5.10.2
61
+ typer==0.25.1
62
+ typing-inspection==0.4.2
63
+ typing_extensions==4.15.0
64
+ uvicorn==0.49.0
65
+ yarl==1.24.2