Its-OMG committed on
Commit
a2eb473
·
1 Parent(s): a26a4aa

Bumped torch to 2.7+ and torchcodec to 0.3 to restore ASR auto transcribe

Browse files
Files changed (3) hide show
  1. Dockerfile +10 -7
  2. main.py +10 -0
  3. requirements.txt +8 -6
Dockerfile CHANGED
@@ -27,14 +27,17 @@ RUN pip install --user --upgrade pip setuptools wheel \
27
  && pip install --user "numpy>=1.26.0" Cython \
28
  && pip install --user --no-build-isolation pkuseg==0.0.25
29
 
30
- # chatterbox-tts==0.1.7 hard-pins transformers==5.2.0, but OmniVoice needs
31
- # >=5.3.0 (HiggsAudioV2 was added in 5.3.0). Install chatterbox first so it
32
- # pulls in vocos / encodec / librosa / etc., then force-upgrade transformers
33
- # above its pin with --no-deps. chatterbox runs fine on transformers 5.3+
34
- # in practice; pip will print a "broken requirement" warning that's safe
35
- # to ignore here.
 
 
36
  RUN pip install --user chatterbox-tts==0.1.7 \
37
- && pip install --user --no-deps --upgrade 'transformers>=5.3.0,<6'
 
38
 
39
  # Install the rest of the Python deps (chatterbox is already satisfied,
40
  # so it won't be re-resolved here).
 
27
  && pip install --user "numpy>=1.26.0" Cython \
28
  && pip install --user --no-build-isolation pkuseg==0.0.25
29
 
30
+ # chatterbox-tts==0.1.7 hard-pins transformers==5.2.0 and torch==2.6.0, but
31
+ # OmniVoice needs transformers>=5.3.0 (HiggsAudioV2) and the transformers ASR
32
+ # pipeline calls torchcodec.decoders.AudioDecoder which only exists in
33
+ # torchcodec>=0.3, which in turn needs torch>=2.7. Install chatterbox first
34
+ # so it pulls in vocos/encodec/librosa/etc., then force-upgrade transformers,
35
+ # torch, and torchaudio above their pins with --no-deps. chatterbox uses
36
+ # stable PyTorch APIs and runs fine on torch 2.7 in practice; pip will print
37
+ # a "broken requirement" warning that's safe to ignore here.
38
  RUN pip install --user chatterbox-tts==0.1.7 \
39
+ && pip install --user --no-deps --upgrade 'transformers>=5.3.0,<6' \
40
+ && pip install --user --no-deps --upgrade 'torch>=2.7,<2.8' 'torchaudio>=2.7,<2.8'
41
 
42
  # Install the rest of the Python deps (chatterbox is already satisfied,
43
  # so it won't be re-resolved here).
main.py CHANGED
@@ -312,6 +312,16 @@ def omnivoice_generate(
312
  if mode == "clone":
313
  if not ref_audio_path:
314
  raise HTTPException(400, "Voice Clone requires a reference audio file.")
 
 
 
 
 
 
 
 
 
 
315
  kw["voice_clone_prompt"] = models.omnivoice.create_voice_clone_prompt(
316
  ref_audio=ref_audio_path,
317
  ref_text=ref_text or None,
 
312
  if mode == "clone":
313
  if not ref_audio_path:
314
  raise HTTPException(400, "Voice Clone requires a reference audio file.")
315
+ if not (ref_text and ref_text.strip()) and not LOAD_ASR:
316
+ # Auto-transcribe (Whisper) is unavailable whenever LOAD_ASR is off, so
317
+ # the caller must supply the transcript. The transformers ASR pipeline
318
+ # needs torchcodec>=0.3 / torch>=2.7, which requirements.txt now pins.
319
+ raise HTTPException(
320
+ 400,
321
+ "Reference text is required: auto-transcribe is disabled in "
322
+ "this deployment. Please paste the transcript of your "
323
+ "reference audio in the 'Reference text' field.",
324
+ )
325
  kw["voice_clone_prompt"] = models.omnivoice.create_voice_clone_prompt(
326
  ref_audio=ref_audio_path,
327
  ref_text=ref_text or None,
requirements.txt CHANGED
@@ -4,12 +4,14 @@ uvicorn[standard]>=0.32
4
  python-multipart>=0.0.18
5
 
6
  # --- Core ML stack ----------------------------------------------------------
7
- # torch is pinned to 2.6.x because chatterbox-tts==0.1.7 installs torch 2.6.0
8
- # and torchcodec must match torch's AOTI ABI (aoti_torch_abi_version landed
9
- # in 2.7). torchcodec 0.2.x is the line that targets torch 2.6.
10
- torch>=2.6,<2.7
11
- torchaudio>=2.6,<2.7
12
- torchcodec>=0.2,<0.3
 
 
13
  transformers>=5.3.0
14
  accelerate
15
  numpy>=1.26
 
4
  python-multipart>=0.0.18
5
 
6
  # --- Core ML stack ----------------------------------------------------------
7
+ # torch 2.7 is required because transformers 5.3's ASR pipeline calls
8
+ # torchcodec.decoders.AudioDecoder, which only exists from torchcodec 0.3+,
9
+ # which in turn requires torch>=2.7 (uses the 2.7 AOTI ABI). chatterbox-tts
10
+ # 0.1.7 metadata pins torch==2.6.0 defensively, but it runs fine on 2.7 in
11
+ # practice; the Dockerfile bypasses that pin with --no-deps.
12
+ torch>=2.7,<2.8
13
+ torchaudio>=2.7,<2.8
14
+ torchcodec>=0.3,<0.4
15
  transformers>=5.3.0
16
  accelerate
17
  numpy>=1.26