Spaces:

imseldrith
/

TTS-OPENAI-FREE

Paused

App Files Files Community

imseldrith committed on Dec 11, 2024

Commit

5fc1e76

verified ·

1 Parent(s): 9a2e8e8

Upload folder using huggingface_hub

Browse files

Files changed (10) hide show

.github/workflows/build-docker.yml +29 -3
Dockerfile.min +26 -0
pre_process_map.default.yaml +3 -4
requirements-min.txt +2 -1
requirements-rocm.txt +2 -1
requirements.txt +1 -0
sample.env +6 -0
speech.py +34 -4
startup.min.sh +1 -1
startup.sh +1 -1

.github/workflows/build-docker.yml CHANGED Viewed

@@ -26,6 +26,15 @@ jobs:
       - name: Check out code
         uses: actions/checkout@v4
       - name: Set up Docker Buildx
         uses: docker/setup-buildx-action@v2
         with:
@@ -69,7 +78,7 @@ jobs:
           labels: version=${{ github.run_id }}
           platforms: linux/amd64,linux/arm64
-  build-and-push-min-image:
     runs-on: ubuntu-latest
     permissions:
@@ -86,6 +95,15 @@ jobs:
       - name: Check out code
         uses: actions/checkout@v4
       - name: Set up Docker Buildx
         uses: docker/setup-buildx-action@v2
         with:
@@ -129,7 +147,7 @@ jobs:
           labels: version=${{ github.run_id }}
           platforms: linux/amd64,linux/arm64
-  build-and-push-rocm-image:
     runs-on: ubuntu-latest
     permissions:
@@ -147,6 +165,15 @@ jobs:
       - name: Check out code
         uses: actions/checkout@v4
       - name: Set up Docker Buildx
         uses: docker/setup-buildx-action@v2
         with:
@@ -193,4 +220,3 @@ jobs:
           platforms: linux/amd64,linux/arm64
           build-args: |
             USE_ROCM=1

       - name: Check out code
         uses: actions/checkout@v4
+      - name: Free Disk Space Before Build
+        run: |
+          sudo rm -rf /usr/local/.ghcup
+          sudo rm -rf /opt/hostedtoolcache/CodeQL
+          sudo rm -rf /usr/local/lib/android
+          sudo rm -rf /usr/share/dotnet
+          sudo rm -rf /opt/ghc
+          sudo rm -rf /usr/local/share/boost
       - name: Set up Docker Buildx
         uses: docker/setup-buildx-action@v2
         with:
           labels: version=${{ github.run_id }}
           platforms: linux/amd64,linux/arm64
+  build-and-push-image-min:
     runs-on: ubuntu-latest
     permissions:
       - name: Check out code
         uses: actions/checkout@v4
+      - name: Free Disk Space Before Build
+        run: |
+          sudo rm -rf /usr/local/.ghcup
+          sudo rm -rf /opt/hostedtoolcache/CodeQL
+          sudo rm -rf /usr/local/lib/android
+          sudo rm -rf /usr/share/dotnet
+          sudo rm -rf /opt/ghc
+          sudo rm -rf /usr/local/share/boost
       - name: Set up Docker Buildx
         uses: docker/setup-buildx-action@v2
         with:
           labels: version=${{ github.run_id }}
           platforms: linux/amd64,linux/arm64
+  build-and-push-image-rocm:
     runs-on: ubuntu-latest
     permissions:
       - name: Check out code
         uses: actions/checkout@v4
+      - name: Free Disk Space Before Build
+        run: |
+          sudo rm -rf /usr/local/.ghcup
+          sudo rm -rf /opt/hostedtoolcache/CodeQL
+          sudo rm -rf /usr/local/lib/android
+          sudo rm -rf /usr/share/dotnet
+          sudo rm -rf /opt/ghc
+          sudo rm -rf /usr/local/share/boost
       - name: Set up Docker Buildx
         uses: docker/setup-buildx-action@v2
         with:
           platforms: linux/amd64,linux/arm64
           build-args: |
             USE_ROCM=1

Dockerfile.min ADDED Viewed

	@@ -0,0 +1,26 @@

+# Minimal image: slim Python base, ffmpeg, and the packages from requirements-min.txt.
+FROM python:3.11-slim
+# Automatic BuildKit platform argument; used below to pick extra build tooling.
+ARG TARGETPLATFORM
+RUN <<EOF
+# Heredoc lines are not implicitly chained with &&, so abort on the first
+# failing command instead of letting the final rm mask it.
+set -e
+apt-get update
+apt-get install --no-install-recommends -y curl ffmpeg
+# Any platform other than linux/amd64 also gets a C toolchain and Rust (rustup),
+# presumably to build Python packages that ship no prebuilt wheel there.
+if [ "$TARGETPLATFORM" != "linux/amd64" ]; then
+	apt-get install --no-install-recommends -y build-essential
+	curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
+fi
+# Clean up in the same layer so the apt caches never reach the image.
+apt-get clean
+rm -rf /var/lib/apt/lists/*
+EOF
+# rustup installs into /root/.cargo/bin; harmless when Rust was not installed.
+ENV PATH="/root/.cargo/bin:${PATH}"
+WORKDIR /app
+RUN mkdir -p voices config
+# Copy only the requirement manifests first so the pip layer stays cached
+# until one of them changes.
+COPY requirements*.txt /app/
+RUN --mount=type=cache,target=/root/.cache/pip pip install -r requirements-min.txt
+COPY *.py *.sh *.default.yaml README.md LICENSE /app/
+ENV TTS_HOME=voices
+ENV HF_HOME=voices
+# Exec form: bash runs directly instead of under an intermediate /bin/sh -c.
+CMD ["bash", "startup.min.sh"]

pre_process_map.default.yaml CHANGED Viewed

@@ -31,7 +31,6 @@
   - ' F.Y. '
 - - ([0-9]+)-([0-9]+)
   - \1 to \2
-- - '\*\*\*'
-  - '*'
-- - '\*\*'
-  - '*'

   - ' F.Y. '
 - - ([0-9]+)-([0-9]+)
   - \1 to \2
+# xtts has a lot of trouble with these, but piper is fine.
+#- - '[\*=+-]+'
+#  - ' '

requirements-min.txt CHANGED Viewed

@@ -2,4 +2,5 @@ fastapi
 uvicorn
 loguru
 numpy<2
-piper-tts

 uvicorn
 loguru
 numpy<2
+piper-tts
+pyyaml

requirements-rocm.txt CHANGED Viewed

@@ -2,8 +2,9 @@ fastapi
 uvicorn
 loguru
 piper-tts
-coqui-tts
 langdetect
 # Creating an environment where deepspeed works is complex, for now it will be disabled by default.
 #deepspeed
 torch; --index-url https://download.pytorch.org/whl/rocm5.7; sys_platform == "linux"

 uvicorn
 loguru
 piper-tts
+coqui-tts[languages]
 langdetect
+pyyaml
 # Creating an environment where deepspeed works is complex, for now it will be disabled by default.
 #deepspeed
 torch; --index-url https://download.pytorch.org/whl/rocm5.7; sys_platform == "linux"

requirements.txt CHANGED Viewed

@@ -4,6 +4,7 @@ loguru
 piper-tts
 coqui-tts[languages]
 langdetect
 # Creating an environment where deepspeed works is complex, for now it will be disabled by default.
 #deepspeed

 piper-tts
 coqui-tts[languages]
 langdetect
+pyyaml
 # Creating an environment where deepspeed works is complex, for now it will be disabled by default.
 #deepspeed

sample.env ADDED Viewed

	@@ -0,0 +1,6 @@

+TTS_HOME=voices
+HF_HOME=voices
+#PRELOAD_MODEL=xtts
+#PRELOAD_MODEL=xtts_v2.0.2
+#EXTRA_ARGS=--log-level DEBUG --unload-timer 300
+#USE_ROCM=1

speech.py CHANGED Viewed

@@ -10,6 +10,7 @@ import sys
 import threading
 import time
 import yaml
 from fastapi.responses import StreamingResponse
 from loguru import logger
@@ -84,13 +85,15 @@ class xtts_wrapper():
                 self.timer.daemon = True
                 self.timer.start()
-    def tts(self, text, language, speaker_wav, **hf_generate_kwargs):
         with torch.no_grad():
             self.last_used = time.time()
             tokens = 0
             try:
                 with self.lock:
-                    gpt_cond_latent, speaker_embedding = self.xtts.get_conditioning_latents(audio_path=[speaker_wav]) # not worth caching calls, it's < 0.001s after model is loaded
                     pcm_stream = self.xtts.inference_stream(text, language, gpt_cond_latent, speaker_embedding, **hf_generate_kwargs)
                     self.last_used = time.time()
@@ -230,7 +233,15 @@ async def generate_speech(request: GenerateSpeechRequest):
         tts_proc.stdin.write(bytearray(input_text.encode('utf-8')))
         tts_proc.stdin.close()
-        ffmpeg_args = build_ffmpeg_args(response_format, input_format="s16le", sample_rate="22050")
         # Pipe the output from piper/xtts to the input of ffmpeg
         ffmpeg_args.extend(["-"])
@@ -308,6 +319,21 @@ async def generate_speech(request: GenerateSpeechRequest):
         in_q = queue.Queue() # speech pcm
         ex_q = queue.Queue() # exceptions
         def exception_check(exq: queue.Queue):
             try:
                 e = exq.get_nowait()
@@ -318,9 +344,13 @@ async def generate_speech(request: GenerateSpeechRequest):
         def generator():
             # text -> in_q
             try:
                 for text in all_text:
-                    for chunk in xtts.tts(text=text, language=language, speaker_wav=speaker, **hf_generate_kwargs):
                         exception_check(ex_q)
                         in_q.put(chunk)

 import threading
 import time
 import yaml
+import json
 from fastapi.responses import StreamingResponse
 from loguru import logger
                 self.timer.daemon = True
                 self.timer.start()
+    def tts(self, text, language, audio_path, **hf_generate_kwargs):
         with torch.no_grad():
             self.last_used = time.time()
             tokens = 0
             try:
                 with self.lock:
+                    logger.debug(f"generating [{language}]: {[text]}")
+                    gpt_cond_latent, speaker_embedding = self.xtts.get_conditioning_latents(audio_path=audio_path) # not worth caching calls, it's < 0.001s after model is loaded
                     pcm_stream = self.xtts.inference_stream(text, language, gpt_cond_latent, speaker_embedding, **hf_generate_kwargs)
                     self.last_used = time.time()
         tts_proc.stdin.write(bytearray(input_text.encode('utf-8')))
         tts_proc.stdin.close()
+        try:
+            with open(f"{piper_model}.json", 'r') as pvc_f:
+                conf = json.load(pvc_f)
+                sample_rate = str(conf['audio']['sample_rate'])
+        except:
+            sample_rate = '22050'
+        ffmpeg_args = build_ffmpeg_args(response_format, input_format="s16le", sample_rate=sample_rate)
         # Pipe the output from piper/xtts to the input of ffmpeg
         ffmpeg_args.extend(["-"])
         in_q = queue.Queue() # speech pcm
         ex_q = queue.Queue() # exceptions
+        def get_speaker_samples(samples: str) -> list[str]:
+            if os.path.isfile(samples):
+                audio_path = [samples]
+            elif os.path.isdir(samples):
+                audio_path = [os.path.join(samples, sample) for sample in os.listdir(samples) if os.path.isfile(os.path.join(samples, sample))]
+                if len(audio_path) < 1:
+                    logger.error(f"No files found: {samples}")
+                    raise ServiceUnavailableError(f"Invalid path: {samples}")
+            else:
+                logger.error(f"Invalid path: {samples}")
+                raise ServiceUnavailableError(f"Invalid path: {samples}")
+            return audio_path
         def exception_check(exq: queue.Queue):
             try:
                 e = exq.get_nowait()
         def generator():
             # text -> in_q
+            audio_path = get_speaker_samples(speaker)
+            logger.debug(f"{voice} wav samples: {audio_path}")
             try:
                 for text in all_text:
+                    for chunk in xtts.tts(text=text, language=language, audio_path=audio_path, **hf_generate_kwargs):
                         exception_check(ex_q)
                         in_q.put(chunk)

startup.min.sh CHANGED Viewed

@@ -4,4 +4,4 @@
 bash download_voices_tts-1.sh
-python speech.py --xtts_device none $EXTRA_ARGS $@ -P 7860


4
5	bash download_voices_tts-1.sh
6
7	+ python speech.py --xtts_device none $EXTRA_ARGS $@

startup.sh CHANGED Viewed

@@ -7,4 +7,4 @@ echo "First startup may download 2GB of speech models. Please wait."
 bash download_voices_tts-1.sh
 bash download_voices_tts-1-hd.sh $PRELOAD_MODEL
-python speech.py ${PRELOAD_MODEL:+--preload $PRELOAD_MODEL} $EXTRA_ARGS $@ -P 7860

 bash download_voices_tts-1.sh
 bash download_voices_tts-1-hd.sh $PRELOAD_MODEL
+python speech.py ${PRELOAD_MODEL:+--preload $PRELOAD_MODEL} $EXTRA_ARGS $@