Its-OMG committed on
Commit ·
a2eb473
1
Parent(s): a26a4aa
Bumped torch to 2.7+ and torchcodec to 0.3 to restore ASR auto-transcribe
Browse files- Dockerfile +10 -7
- main.py +10 -0
- requirements.txt +8 -6
Dockerfile
CHANGED
|
@@ -27,14 +27,17 @@ RUN pip install --user --upgrade pip setuptools wheel \
|
|
| 27 |
&& pip install --user "numpy>=1.26.0" Cython \
|
| 28 |
&& pip install --user --no-build-isolation pkuseg==0.0.25
|
| 29 |
|
| 30 |
-
# chatterbox-tts==0.1.7 hard-pins transformers==5.2.0
|
| 31 |
-
# >=5.3.0 (HiggsAudioV2
|
| 32 |
-
#
|
| 33 |
-
#
|
| 34 |
-
#
|
| 35 |
-
#
|
|
|
|
|
|
|
| 36 |
RUN pip install --user chatterbox-tts==0.1.7 \
|
| 37 |
-
&& pip install --user --no-deps --upgrade 'transformers>=5.3.0,<6'
|
|
|
|
| 38 |
|
| 39 |
# Install the rest of the Python deps (chatterbox is already satisfied,
|
| 40 |
# so it won't be re-resolved here).
|
|
|
|
| 27 |
&& pip install --user "numpy>=1.26.0" Cython \
|
| 28 |
&& pip install --user --no-build-isolation pkuseg==0.0.25
|
| 29 |
|
| 30 |
+
# chatterbox-tts==0.1.7 hard-pins transformers==5.2.0 and torch==2.6.0, but
|
| 31 |
+
# OmniVoice needs transformers>=5.3.0 (HiggsAudioV2) and the transformers ASR
|
| 32 |
+
# pipeline calls torchcodec.decoders.AudioDecoder which only exists in
|
| 33 |
+
# torchcodec>=0.3, which in turn needs torch>=2.7. Install chatterbox first
|
| 34 |
+
# so it pulls in vocos/encodec/librosa/etc., then force-upgrade transformers,
|
| 35 |
+
# torch, and torchaudio above their pins with --no-deps. chatterbox uses
|
| 36 |
+
# stable PyTorch APIs and runs fine on torch 2.7 in practice; pip will print
|
| 37 |
+
# a "broken requirement" warning that's safe to ignore here.
|
| 38 |
RUN pip install --user chatterbox-tts==0.1.7 \
|
| 39 |
+
&& pip install --user --no-deps --upgrade 'transformers>=5.3.0,<6' \
|
| 40 |
+
&& pip install --user --no-deps --upgrade 'torch>=2.7,<2.8' 'torchaudio>=2.7,<2.8'
|
| 41 |
|
| 42 |
# Install the rest of the Python deps (chatterbox is already satisfied,
|
| 43 |
# so it won't be re-resolved here).
|
main.py
CHANGED
|
@@ -312,6 +312,16 @@ def omnivoice_generate(
|
|
| 312 |
if mode == "clone":
|
| 313 |
if not ref_audio_path:
|
| 314 |
raise HTTPException(400, "Voice Clone requires a reference audio file.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 315 |
kw["voice_clone_prompt"] = models.omnivoice.create_voice_clone_prompt(
|
| 316 |
ref_audio=ref_audio_path,
|
| 317 |
ref_text=ref_text or None,
|
|
|
|
| 312 |
if mode == "clone":
|
| 313 |
if not ref_audio_path:
|
| 314 |
raise HTTPException(400, "Voice Clone requires a reference audio file.")
|
| 315 |
+
if not (ref_text and ref_text.strip()) and not LOAD_ASR:
|
| 316 |
+
# Auto-transcribe (Whisper) is unavailable when LOAD_ASR is off, e.g. on
|
| 317 |
+
# builds that hold torch at 2.6 for chatterbox: the transformers ASR
|
| 318 |
+
# pipeline needs torchcodec>=0.3, which in turn requires torch>=2.7.
|
| 319 |
+
raise HTTPException(
|
| 320 |
+
400,
|
| 321 |
+
"Reference text is required: auto-transcribe is disabled in "
|
| 322 |
+
"this deployment. Please paste the transcript of your "
|
| 323 |
+
"reference audio in the 'Reference text' field.",
|
| 324 |
+
)
|
| 325 |
kw["voice_clone_prompt"] = models.omnivoice.create_voice_clone_prompt(
|
| 326 |
ref_audio=ref_audio_path,
|
| 327 |
ref_text=ref_text or None,
|
requirements.txt
CHANGED
|
@@ -4,12 +4,14 @@ uvicorn[standard]>=0.32
|
|
| 4 |
python-multipart>=0.0.18
|
| 5 |
|
| 6 |
# --- Core ML stack ----------------------------------------------------------
|
| 7 |
-
# torch
|
| 8 |
-
#
|
| 9 |
-
# in
|
| 10 |
-
torch
|
| 11 |
-
|
| 12 |
-
|
|
|
|
|
|
|
| 13 |
transformers>=5.3.0
|
| 14 |
accelerate
|
| 15 |
numpy>=1.26
|
|
|
|
| 4 |
python-multipart>=0.0.18
|
| 5 |
|
| 6 |
# --- Core ML stack ----------------------------------------------------------
|
| 7 |
+
# torch 2.7 is required because transformers 5.3's ASR pipeline calls
|
| 8 |
+
# torchcodec.decoders.AudioDecoder, which only exists from torchcodec 0.3+,
|
| 9 |
+
# which in turn requires torch>=2.7 (uses the 2.7 AOTI ABI). chatterbox-tts
|
| 10 |
+
# 0.1.7 metadata pins torch==2.6.0 defensively, but it runs fine on 2.7 in
|
| 11 |
+
# practice; the Dockerfile bypasses that pin with --no-deps.
|
| 12 |
+
torch>=2.7,<2.8
|
| 13 |
+
torchaudio>=2.7,<2.8
|
| 14 |
+
torchcodec>=0.3,<0.4
|
| 15 |
transformers>=5.3.0
|
| 16 |
accelerate
|
| 17 |
numpy>=1.26
|