Spaces:

VIDraft
/

Voice-Clone-Podcast

Runtime error

App Files Files Community

seawolf2357 committed on May 30

Commit

75fc1e5

verified ·

1 Parent(s): 013bd45

Create app.py

Browse files

Files changed (1) hide show

app.py +279 -0

app.py ADDED Viewed

	@@ -0,0 +1,279 @@

+import random
+import numpy as np
+import torch
+from chatterbox.src.chatterbox.tts import ChatterboxTTS
+import gradio as gr
+import spaces
+import re
# Choose the compute device once at import time: CUDA when PyTorch reports it
# as available, otherwise the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print(f"🚀 Running on device: {DEVICE}")
def set_seed(seed: int):
    """Seed the torch, NumPy and Python ``random`` generators for reproducibility.

    Args:
        seed: Integer seed. Values outside NumPy's accepted range
            (0 to 2**32 - 1) are folded into that range for NumPy only;
            torch and ``random`` receive the value unchanged.
    """
    torch.manual_seed(seed)
    # Query torch directly so this function does not rely on module-level
    # state. manual_seed_all covers every visible GPU, including the current
    # one, so a separate per-device call is unnecessary.
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    random.seed(seed)
    # np.random.seed raises ValueError for negative seeds or seeds >= 2**32;
    # the modulo keeps large or negative user-supplied seeds usable and is a
    # no-op for seeds already in range.
    np.random.seed(seed % 2**32)
def split_text_into_chunks(text: str, max_chars: int = 250) -> list[str]:
    """Split text into chunks that never exceed ``max_chars`` characters.

    The stripped text is split into sentences at whitespace that follows
    ``.``, ``!`` or ``?``. Sentences are packed greedily into chunks, joined
    by a single space. A sentence that is itself longer than ``max_chars``
    always starts a new chunk and is packed word by word; a single word that
    is still too long is cut into ``max_chars``-sized slices so the limit
    holds even for text without spaces.

    Args:
        text: Input text to split.
        max_chars: Maximum length of each chunk; must be at least 1.

    Returns:
        The chunks in original order. Empty or whitespace-only text yields
        an empty list.

    Raises:
        ValueError: If ``max_chars`` is smaller than 1.
    """
    limit = int(max_chars)
    if limit < 1:
        raise ValueError(f"max_chars must be at least 1, got {max_chars!r}")

    def fits(current: str, piece: str) -> bool:
        # The joining space only counts when there is something to join to.
        separator = 1 if current else 0
        return len(current) + separator + len(piece) <= limit

    def pieces_of(sentence: str):
        # Yield the words of an over-long sentence, hard-slicing any word
        # that alone exceeds the limit.
        for word in sentence.split():
            if len(word) <= limit:
                yield word
            else:
                for start in range(0, len(word), limit):
                    yield word[start:start + limit]

    chunks: list[str] = []
    current = ""
    for sentence in re.split(r'(?<=[.!?])\s+', text.strip()):
        if fits(current, sentence):
            current = f"{current} {sentence}" if current else sentence
            continue

        # The sentence does not fit: close the running chunk and start over.
        if current:
            chunks.append(current)
            current = ""

        pieces = [sentence] if len(sentence) <= limit else pieces_of(sentence)
        for piece in pieces:
            if fits(current, piece):
                current = f"{current} {piece}" if current else piece
            else:
                chunks.append(current)
                current = piece

    # Whatever is left over becomes the final chunk (and may still be
    # extended by nothing else).
    if current:
        chunks.append(current)
    return chunks
@spaces.GPU(duration=120)  # request a generous GPU time allowance
def generate_tts_audio_gpu(
    text_input: str,
    audio_prompt_path_input: str,
    exaggeration_input: float,
    temperature_input: float,
    seed_num_input: int,
    cfgw_input: float,
    chunk_size_input: int
) -> tuple[int, np.ndarray]:
    """Generate TTS audio for ``text_input`` inside the GPU-decorated call.

    Text of up to 300 characters is synthesized in a single pass. Longer text
    is split with ``split_text_into_chunks``, each chunk is synthesized
    separately, and the results are joined with 0.2 s of silence between them.

    Args:
        text_input: Text to synthesize.
        audio_prompt_path_input: Path of the reference audio, forwarded to the
            model as ``audio_prompt_path``.
        exaggeration_input: Forwarded to the model as ``exaggeration``.
        temperature_input: Forwarded to the model as ``temperature``.
        seed_num_input: Random seed; ``0`` (or an empty field / a value that
            truncates to 0) leaves the generators unseeded.
        cfgw_input: Forwarded to the model as ``cfg_weight``.
        chunk_size_input: Maximum characters per chunk for long text.

    Returns:
        ``(model.sr, audio)`` where ``model.sr`` is used as the sample rate
        and ``audio`` is the generated waveform as a NumPy array.

    Raises:
        RuntimeError: If long text was chunked and no chunk could be
            synthesized.
    """
    # The model is loaded inside the GPU function rather than at import time.
    model = ChatterboxTTS.from_pretrained(DEVICE)

    # A cleared number field arrives as None; treat it like 0 ("random")
    # instead of crashing in int(None).
    seed = int(seed_num_input or 0)
    if seed != 0:
        set_seed(seed)

    def synthesize(segment_text: str) -> np.ndarray:
        # Single place for the model call shared by both code paths.
        wav = model.generate(
            segment_text,
            audio_prompt_path=audio_prompt_path_input,
            exaggeration=exaggeration_input,
            temperature=temperature_input,
            cfg_weight=cfgw_input,
        )
        return wav.squeeze(0).numpy()

    # Short text: one-shot generation.
    if len(text_input) <= 300:
        print(f"단일 텍스트 생성: '{text_input[:50]}...'")
        return (model.sr, synthesize(text_input))

    # Long text: split into chunks and synthesize each one.
    chunks = split_text_into_chunks(text_input, max_chars=chunk_size_input)
    total_chunks = len(chunks)
    print(f"텍스트를 {total_chunks}개의 청크로 분할했습니다.")

    audio_segments = []
    for index, chunk in enumerate(chunks, start=1):
        print(f"청크 {index}/{total_chunks} 생성 중: '{chunk[:50]}...'")
        try:
            audio_segments.append(synthesize(chunk))
        except Exception as e:
            # Best effort: a failed chunk is logged and skipped so the
            # remaining chunks can still be produced.
            print(f"청크 {index} 생성 중 오류 발생: {e}")
            continue

    if not audio_segments:
        raise RuntimeError("오디오 생성에 실패했습니다.")

    # 0.2 s of silence between segments. It uses the segments' dtype so that
    # np.concatenate does not promote the whole result to float64.
    silence = np.zeros(int(0.2 * model.sr), dtype=audio_segments[0].dtype)
    final_audio = [audio_segments[0]]
    for segment in audio_segments[1:]:
        final_audio.extend((silence, segment))

    concatenated_audio = np.concatenate(final_audio)
    print(f"오디오 생성 완료. 총 길이: {len(concatenated_audio) / model.sr:.2f}초")
    return (model.sr, concatenated_audio)
# Gradio interface: layout, callbacks, event wiring and launch.
with gr.Blocks() as demo:
    # Initial values of the text box and chunk-size slider, named so the
    # character-count box can start out consistent with them.
    _DEFAULT_TEXT = "Now let's make my mum's favourite. So three mars bars into the pan. Then we add the tuna and just stir for a bit, just let the chocolate and fish infuse. A sprinkle of olive oil and some tomato ketchup. Now smell that. Oh boy this is going to be incredible."
    _DEFAULT_CHUNK_SIZE = 250

    def update_char_count(text, chunk_size):
        """Describe the text length and, for long text, the chunk plan.

        Text of up to 300 characters is reported as a single generation;
        longer text is reported with its chunk count and a rough time
        estimate of 3 seconds per chunk.
        """
        # An empty/cleared text box is treated as zero characters.
        text = text or ""
        char_len = len(text)
        if char_len <= 300:
            return f"{char_len} 문자 (단일 생성)"
        chunks = split_text_into_chunks(text, max_chars=chunk_size)
        chunk_count = len(chunks)
        estimated_time = chunk_count * 3  # roughly 3 seconds per chunk
        return f"{char_len} 문자, {chunk_count}개 청크 (예상 시간: 약 {estimated_time}초)"

    def generate_with_status(text, ref_wav, exaggeration, temp, seed_num, cfg_weight, chunk_size):
        """Run generation while streaming status messages to the UI.

        Yields ``(status_update, audio)`` pairs: a "processing" status first,
        then either the finished audio or an error message.
        """
        try:
            yield gr.update(value="처리 중... GPU를 할당받는 중입니다."), None
            # Call the GPU function.
            sr, audio = generate_tts_audio_gpu(
                text, ref_wav, exaggeration, temp, seed_num, cfg_weight, chunk_size
            )
            yield gr.update(value="✅ 생성 완료!"), (sr, audio)
        except Exception as e:
            # Surface any failure in the status box instead of the audio.
            yield gr.update(value=f"❌ 오류 발생: {e}"), None

    gr.Markdown(
        """
        # Chatterbox TTS Demo - 무제한 길이 버전
        긴 텍스트도 청크로 나누어 처리하여 제한 없이 음성을 생성합니다.
        ⚠️ **주의**: 긴 텍스트 처리 시 시간이 오래 걸릴 수 있습니다.
        """
    )
    with gr.Row():
        with gr.Column():
            text = gr.Textbox(
                value=_DEFAULT_TEXT,
                label="텍스트 입력 (길이 제한 없음)",
                lines=10,
                max_lines=30
            )
            ref_wav = gr.Audio(
                sources=["upload", "microphone"],
                type="filepath",
                label="Reference Audio File (Optional)",
                value="https://storage.googleapis.com/chatterbox-demo-samples/prompts/female_shadowheart4.flac"
            )
            with gr.Row():
                exaggeration = gr.Slider(
                    0.25, 2, step=.05,
                    label="Exaggeration (Neutral = 0.5)",
                    value=.5
                )
                cfg_weight = gr.Slider(
                    0.2, 1, step=.05,
                    label="CFG/Pace",
                    value=0.5
                )
            chunk_size = gr.Slider(
                100, 300, step=50,
                label="청크 크기 (문자 수)",
                value=_DEFAULT_CHUNK_SIZE,
                info="텍스트를 나눌 청크의 최대 크기입니다."
            )
            with gr.Accordion("고급 옵션", open=False):
                seed_num = gr.Number(value=0, label="Random seed (0 for random)")
                temp = gr.Slider(0.05, 5, step=.05, label="Temperature", value=.8)
            run_btn = gr.Button("🎤 음성 생성", variant="primary")
        with gr.Column():
            audio_output = gr.Audio(label="생성된 음성")
            # Text-length display; starts out describing the pre-filled text
            # rather than claiming zero characters.
            char_count = gr.Textbox(
                label="텍스트 정보",
                value=update_char_count(_DEFAULT_TEXT, _DEFAULT_CHUNK_SIZE),
                interactive=False
            )
            status = gr.Textbox(
                label="상태",
                value="대기 중...",
                interactive=False
            )

    # Refresh the character count whenever the text or the chunk size changes.
    text.change(
        fn=update_char_count,
        inputs=[text, chunk_size],
        outputs=[char_count]
    )
    chunk_size.change(
        fn=update_char_count,
        inputs=[text, chunk_size],
        outputs=[char_count]
    )

    # Generate button: stream status text and the resulting audio.
    run_btn.click(
        fn=generate_with_status,
        inputs=[
            text,
            ref_wav,
            exaggeration,
            temp,
            seed_num,
            cfg_weight,
            chunk_size
        ],
        outputs=[status, audio_output],
    )
    gr.Markdown(
        """
        ### 💡 사용 팁:
        - **300자 이하**: 빠른 단일 생성
        - **300자 초과**: 자동으로 청크로 분할하여 처리
        - 청크 크기가 작을수록 자연스럽지만 처리 시간이 증가합니다
        - GPU 할당을 기다리는 시간이 있을 수 있습니다
        ### ⏱️ 예상 처리 시간:
        - 300자 이하: 약 5-10초
        - 1000자: 약 15-30초
        - 5000자: 약 1-2분
        """
    )
# No model is loaded at startup; loading happens only inside the GPU function.
print("앱이 시작되었습니다. 모델은 첫 생성 시 로드됩니다.")
demo.queue().launch()