Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -53,8 +53,23 @@ except ImportError:
|
|
| 53 |
try:
|
| 54 |
from chatterbox.src.chatterbox.tts import ChatterboxTTS
|
| 55 |
CHATTERBOX_AVAILABLE = True
|
|
|
|
| 56 |
except ImportError:
|
| 57 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
|
| 59 |
# Import config and prompts
|
| 60 |
from config_prompts import (
|
|
@@ -573,10 +588,14 @@ class UnifiedAudioConverter:
|
|
| 573 |
Chatterbox TTS๋ฅผ ์ฌ์ฉํ์ฌ ๋ํ๋ฅผ ์์ฑ์ผ๋ก ๋ณํ
|
| 574 |
"""
|
| 575 |
if not CHATTERBOX_AVAILABLE:
|
| 576 |
-
raise RuntimeError("Chatterbox TTS not available")
|
| 577 |
|
| 578 |
-
|
| 579 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 580 |
|
| 581 |
if seed_num_input != 0:
|
| 582 |
set_seed(int(seed_num_input))
|
|
@@ -588,7 +607,7 @@ class UnifiedAudioConverter:
|
|
| 588 |
if not text.strip():
|
| 589 |
continue
|
| 590 |
|
| 591 |
-
print(f"์์ฑ ์ค: Speaker {i+1} - '{text[:50]}...'")
|
| 592 |
|
| 593 |
try:
|
| 594 |
# 텍스트가 짧으면 단일 생성
|
|
@@ -605,9 +624,11 @@ class UnifiedAudioConverter:
|
|
| 605 |
else:
|
| 606 |
# 긴 텍스트는 청크로 분할
|
| 607 |
chunks = split_text_into_chunks(text, max_chars=chunk_size_input)
|
|
|
|
| 608 |
|
| 609 |
chunk_audio_segments = []
|
| 610 |
-
for chunk in chunks:
|
|
|
|
| 611 |
wav = model.generate(
|
| 612 |
chunk,
|
| 613 |
audio_prompt_path=audio_prompt_path_input,
|
|
@@ -633,11 +654,15 @@ class UnifiedAudioConverter:
|
|
| 633 |
audio_segments.append(concatenated_turn)
|
| 634 |
|
| 635 |
except Exception as e:
|
| 636 |
-
print(f"Speaker {i+1} ์์ฑ ์ค ์ค๋ฅ ๋ฐ์: {e}")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 637 |
continue
|
| 638 |
|
| 639 |
if not audio_segments:
|
| 640 |
-
raise RuntimeError("์ค๋์ค ์์ฑ์ ์คํจํ์ต๋๋ค.")
|
| 641 |
|
| 642 |
# 모든 스피커의 오디오 세그먼트 연결
|
| 643 |
speaker_silence_duration = int(0.5 * model.sr) # 스피커 간 0.5초 무음
|
|
@@ -651,7 +676,7 @@ class UnifiedAudioConverter:
|
|
| 651 |
|
| 652 |
concatenated_audio = np.concatenate(final_audio)
|
| 653 |
|
| 654 |
-
print(f"์ค๋์ค ์์ฑ
|
| 655 |
return (model.sr, concatenated_audio)
|
| 656 |
|
| 657 |
def _create_output_directory(self) -> str:
|
|
@@ -739,6 +764,9 @@ async def regenerate_audio(
|
|
| 739 |
if not conversation_text.strip():
|
| 740 |
return "Please provide conversation text.", None
|
| 741 |
|
|
|
|
|
|
|
|
|
|
| 742 |
try:
|
| 743 |
conversation_json = converter.parse_conversation_text(conversation_text)
|
| 744 |
|
|
@@ -746,25 +774,34 @@ async def regenerate_audio(
|
|
| 746 |
return "No valid conversation found in the text.", None
|
| 747 |
|
| 748 |
# Generate audio using Chatterbox TTS
|
| 749 |
-
|
| 750 |
-
|
| 751 |
-
|
| 752 |
-
|
| 753 |
-
|
| 754 |
-
|
| 755 |
-
|
| 756 |
-
|
| 757 |
-
|
|
|
|
| 758 |
|
| 759 |
-
|
| 760 |
-
|
| 761 |
-
|
| 762 |
-
|
| 763 |
|
| 764 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 765 |
|
| 766 |
except Exception as e:
|
| 767 |
-
return f"Error
|
| 768 |
|
| 769 |
|
| 770 |
def synthesize_sync(article_input, input_type: str = "URL", mode: str = "Local"):
|
|
|
|
| 53 |
try:
|
| 54 |
from chatterbox.src.chatterbox.tts import ChatterboxTTS
|
| 55 |
CHATTERBOX_AVAILABLE = True
|
| 56 |
+
print("✅ Chatterbox TTS imported successfully from chatterbox.src.chatterbox.tts")
|
| 57 |
except ImportError:
|
| 58 |
+
try:
|
| 59 |
+
from chatterbox.tts import ChatterboxTTS
|
| 60 |
+
CHATTERBOX_AVAILABLE = True
|
| 61 |
+
print("✅ Chatterbox TTS imported successfully from chatterbox.tts")
|
| 62 |
+
except ImportError:
|
| 63 |
+
try:
|
| 64 |
+
# 다른 가능한 경로 시도
|
| 65 |
+
import sys
|
| 66 |
+
sys.path.append('/usr/local/lib/python3.10/site-packages')
|
| 67 |
+
from chatterbox import ChatterboxTTS
|
| 68 |
+
CHATTERBOX_AVAILABLE = True
|
| 69 |
+
print("✅ Chatterbox TTS imported successfully from chatterbox")
|
| 70 |
+
except ImportError:
|
| 71 |
+
CHATTERBOX_AVAILABLE = False
|
| 72 |
+
print("โ Chatterbox TTS not available - falling back to text-only mode")
|
| 73 |
|
| 74 |
# Import config and prompts
|
| 75 |
from config_prompts import (
|
|
|
|
| 588 |
Chatterbox TTS๋ฅผ ์ฌ์ฉํ์ฌ ๋ํ๋ฅผ ์์ฑ์ผ๋ก ๋ณํ
|
| 589 |
"""
|
| 590 |
if not CHATTERBOX_AVAILABLE:
|
| 591 |
+
raise RuntimeError("Chatterbox TTS not available. Please install chatterbox package.")
|
| 592 |
|
| 593 |
+
try:
|
| 594 |
+
# GPU 함수 내에서 모델 로드
|
| 595 |
+
model = ChatterboxTTS.from_pretrained(DEVICE)
|
| 596 |
+
print(f"✅ Chatterbox TTS model loaded on {DEVICE}")
|
| 597 |
+
except Exception as e:
|
| 598 |
+
raise RuntimeError(f"Failed to load Chatterbox TTS model: {e}")
|
| 599 |
|
| 600 |
if seed_num_input != 0:
|
| 601 |
set_seed(int(seed_num_input))
|
|
|
|
| 607 |
if not text.strip():
|
| 608 |
continue
|
| 609 |
|
| 610 |
+
print(f"๐๏ธ ์์ฑ ์ค: Speaker {i+1} - '{text[:50]}...'")
|
| 611 |
|
| 612 |
try:
|
| 613 |
# 텍스트가 짧으면 단일 생성
|
|
|
|
| 624 |
else:
|
| 625 |
# 긴 텍스트는 청크로 분할
|
| 626 |
chunks = split_text_into_chunks(text, max_chars=chunk_size_input)
|
| 627 |
+
print(f"๐ ํ
์คํธ๋ฅผ {len(chunks)}๊ฐ ์ฒญํฌ๋ก ๋ถํ ")
|
| 628 |
|
| 629 |
chunk_audio_segments = []
|
| 630 |
+
for j, chunk in enumerate(chunks):
|
| 631 |
+
print(f" ๐ ์ฒญํฌ {j+1}/{len(chunks)} ์์ฑ ์ค...")
|
| 632 |
wav = model.generate(
|
| 633 |
chunk,
|
| 634 |
audio_prompt_path=audio_prompt_path_input,
|
|
|
|
| 654 |
audio_segments.append(concatenated_turn)
|
| 655 |
|
| 656 |
except Exception as e:
|
| 657 |
+
print(f"โ Speaker {i+1} ์์ฑ ์ค ์ค๋ฅ ๋ฐ์: {e}")
|
| 658 |
+
# 오류 발생 시 무음으로 대체
|
| 659 |
+
silence_duration = int(2.0 * model.sr) # 2초 무음
|
| 660 |
+
silence = np.zeros(silence_duration)
|
| 661 |
+
audio_segments.append(silence)
|
| 662 |
continue
|
| 663 |
|
| 664 |
if not audio_segments:
|
| 665 |
+
raise RuntimeError("๋ชจ๋ ์ค๋์ค ์์ฑ์ ์คํจํ์ต๋๋ค.")
|
| 666 |
|
| 667 |
# 모든 스피커의 오디오 세그먼트 연결
|
| 668 |
speaker_silence_duration = int(0.5 * model.sr) # 스피커 간 0.5초 무음
|
|
|
|
| 676 |
|
| 677 |
concatenated_audio = np.concatenate(final_audio)
|
| 678 |
|
| 679 |
+
print(f"๐ ์ค๋์ค ์์ฑ ์๋ฃ! ์ด ๊ธธ์ด: {len(concatenated_audio) / model.sr:.2f}์ด")
|
| 680 |
return (model.sr, concatenated_audio)
|
| 681 |
|
| 682 |
def _create_output_directory(self) -> str:
|
|
|
|
| 764 |
if not conversation_text.strip():
|
| 765 |
return "Please provide conversation text.", None
|
| 766 |
|
| 767 |
+
if not CHATTERBOX_AVAILABLE:
|
| 768 |
+
return "Chatterbox TTS not available. Please check the installation.", None
|
| 769 |
+
|
| 770 |
try:
|
| 771 |
conversation_json = converter.parse_conversation_text(conversation_text)
|
| 772 |
|
|
|
|
| 774 |
return "No valid conversation found in the text.", None
|
| 775 |
|
| 776 |
# Generate audio using Chatterbox TTS
|
| 777 |
+
try:
|
| 778 |
+
sr, audio = converter.generate_tts_audio_gpu(
|
| 779 |
+
conversation_json,
|
| 780 |
+
ref_audio_path,
|
| 781 |
+
exaggeration,
|
| 782 |
+
temperature,
|
| 783 |
+
seed_num,
|
| 784 |
+
cfg_weight,
|
| 785 |
+
chunk_size
|
| 786 |
+
)
|
| 787 |
|
| 788 |
+
# Save audio to file
|
| 789 |
+
output_dir = converter._create_output_directory()
|
| 790 |
+
output_file = os.path.join(output_dir, "podcast_audio.wav")
|
| 791 |
+
sf.write(output_file, audio, sr)
|
| 792 |
|
| 793 |
+
return "๐ Audio generated successfully!", output_file
|
| 794 |
+
except Exception as e:
|
| 795 |
+
error_msg = str(e)
|
| 796 |
+
if "Chatterbox TTS not available" in error_msg:
|
| 797 |
+
return "โ Chatterbox TTS is not properly installed. Please check the requirements.", None
|
| 798 |
+
elif "CUDA" in error_msg or "GPU" in error_msg:
|
| 799 |
+
return f"โ GPU error: {error_msg}. Please try reducing chunk size or use CPU.", None
|
| 800 |
+
else:
|
| 801 |
+
return f"โ Audio generation error: {error_msg}", None
|
| 802 |
|
| 803 |
except Exception as e:
|
| 804 |
+
return f"โ Error processing conversation: {str(e)}", None
|
| 805 |
|
| 806 |
|
| 807 |
def synthesize_sync(article_input, input_type: str = "URL", mode: str = "Local"):
|