# Chatterbox_tts_long_handling / chatterbox_processor.py
# codewithjarair's picture
# Upload 6 files
# 332cb9e verified
# Raw
# History Blame Contribute Delete
# 4.97 kB
import re
import numpy as np
import soundfile as sf
import os
import tempfile
from pydub import AudioSegment
import io
class ChatterboxScriptProcessor:
    """Chunk long scripts for a TTS engine and prepare reference audio.

    The wrapped ``engine`` must expose ``generate(text, voice=..., speed=...,
    lang=..., custom_voice_path=..., exaggeration=..., cfg_weight=...,
    seed=..., temperature=...)`` returning an ``(audio, sample_rate)`` pair,
    where ``audio`` may be ``None`` when generation produced nothing.
    """

    # Sample rate reported to callers when no chunk produced any audio.
    DEFAULT_SAMPLE_RATE = 22050

    def __init__(self, engine):
        self.engine = engine

    def split_text_into_chunks(self, text, max_chars=800):
        """Split ``text`` into chunks of at most ``max_chars`` characters.

        Chunks are built from whole sentences (split after ``.``, ``!`` or
        ``?``) whenever possible. A sentence that is itself longer than
        ``max_chars`` is broken at word boundaries, and a single word longer
        than ``max_chars`` is hard-sliced, so no returned chunk exceeds the
        limit.

        Args:
            text: The script to split. ``None`` or blank text yields ``[]``.
            max_chars: Maximum length of each chunk; must be at least 1.

        Returns:
            A list of non-empty chunk strings in reading order.

        Raises:
            ValueError: If ``max_chars`` is smaller than 1.
        """
        if max_chars < 1:
            raise ValueError("max_chars must be a positive integer")

        # Collapse every whitespace run (newlines, tabs, repeated spaces) so
        # the sentence regex and the length accounting see single spaces only.
        normalized = " ".join((text or "").split())
        if not normalized:
            # Never hand the engine an empty chunk.
            return []

        # Look-behind keeps the terminating punctuation attached to its sentence.
        sentences = re.split(r"(?<=[.!?]) +", normalized)

        chunks = []
        current = ""
        for sentence in sentences:
            for piece in self._split_oversized(sentence, max_chars):
                candidate = f"{current} {piece}" if current else piece
                if len(candidate) <= max_chars:
                    current = candidate
                else:
                    # Only reachable with a non-empty ``current``: every piece
                    # on its own already fits within ``max_chars``.
                    chunks.append(current)
                    current = piece
        if current:
            chunks.append(current)
        return chunks

    @staticmethod
    def _split_oversized(sentence, max_chars):
        """Return ``sentence`` as pieces that are each at most ``max_chars`` long."""
        if len(sentence) <= max_chars:
            return [sentence]

        pieces = []
        current = ""
        for word in sentence.split(" "):
            # A single token longer than the limit has no word boundary to
            # break on, so slice it into fixed-size pieces.
            while len(word) > max_chars:
                if current:
                    pieces.append(current)
                    current = ""
                pieces.append(word[:max_chars])
                word = word[max_chars:]
            if not word:
                continue
            candidate = f"{current} {word}" if current else word
            if len(candidate) <= max_chars:
                current = candidate
            else:
                pieces.append(current)
                current = word
        if current:
            pieces.append(current)
        return pieces

    def process_long_script(self, text, voice, speed=1.0, lang='en', custom_voice_path=None,
                            exaggeration=0.5, cfg_weight=0.5, seed=None, temperature=1.0,
                            max_chars=800):
        """Generate audio for a long script chunk by chunk and merge the result.

        The text is split with :meth:`split_text_into_chunks`; every chunk is
        passed to ``self.engine.generate`` with the same voice settings (the
        same ``seed`` is forwarded for every chunk). Chunks for which the
        engine returns ``None`` audio are skipped.

        Args:
            text: The full script.
            voice, speed, lang, custom_voice_path, exaggeration, cfg_weight,
            seed, temperature: Forwarded unchanged to ``engine.generate``.
            max_chars: Maximum characters per chunk.

        Returns:
            ``(audio, sample_rate)`` where ``audio`` is a 1-D numpy array of all
            chunk audio concatenated and ``sample_rate`` is the rate reported
            for the first successful chunk; ``(None, DEFAULT_SAMPLE_RATE)``
            when no audio was produced.
        """
        chunks = self.split_text_into_chunks(text, max_chars=max_chars)
        print(f"Split script into {len(chunks)} chunks.")

        segments = []
        sample_rate = None
        for index, chunk in enumerate(chunks, start=1):
            print(f"Processing chunk {index}/{len(chunks)}...")
            audio, sr = self.engine.generate(
                chunk,
                voice=voice,
                speed=speed,
                lang=lang,
                custom_voice_path=custom_voice_path,
                exaggeration=exaggeration,
                cfg_weight=cfg_weight,
                seed=seed,
                temperature=temperature,
            )
            if audio is None:
                continue
            if sample_rate is None:
                sample_rate = sr
            elif sr != sample_rate:
                # Samples are concatenated as-is, so a differing rate would
                # distort playback speed of that segment; make it visible.
                print(f"Warning: chunk {index} has sample rate {sr}, expected {sample_rate}.")
            # Flatten to 1-D so (1, n) and (n,) outputs concatenate cleanly.
            segments.append(np.asarray(audio).ravel())

        if not segments:
            return None, self.DEFAULT_SAMPLE_RATE
        return np.concatenate(segments), sample_rate

    def save_audio(self, audio_data, sample_rate, output_path):
        """Write ``audio_data`` to ``output_path`` using ``soundfile``.

        Returns:
            ``output_path`` on success, or ``None`` if writing failed (the
            error is printed rather than raised).
        """
        try:
            sf.write(output_path, audio_data, sample_rate)
            return output_path
        except Exception as e:
            # Best-effort by design: callers check for ``None``.
            print(f"Error saving audio: {e}")
            return None

    def process_audio_upload(self, audio_file):
        """Convert an uploaded reference clip to a temporary WAV file.

        Args:
            audio_file: Path or file object accepted by
                ``AudioSegment.from_file``; ``None`` means nothing was uploaded.

        Returns:
            Path of the newly created ``.wav`` file (the caller owns it and is
            responsible for deleting it), or ``None`` when ``audio_file`` is
            ``None`` or processing failed.
        """
        if audio_file is None:
            return None

        wav_path = None
        try:
            # Decode first so a bad upload never leaves a temp file behind.
            audio = AudioSegment.from_file(audio_file)

            fd, wav_path = tempfile.mkstemp(suffix=".wav")
            os.close(fd)
            # export() returns the open output file object; close it so the
            # handle is not leaked.
            audio.export(wav_path, format="wav").close()

            # len(AudioSegment) is the duration in milliseconds.
            duration_seconds = len(audio) / 1000.0
            print(f"Audio duration: {duration_seconds:.2f} seconds")
            # Short references still work but clone less faithfully, so warn
            # instead of rejecting.
            if duration_seconds < 5:
                print("Warning: Audio reference is shorter than 5 seconds. Voice cloning quality may be reduced.")
            return wav_path
        except Exception as e:
            print(f"Error processing audio upload: {e}")
            if wav_path is not None:
                try:
                    os.remove(wav_path)
                except OSError:
                    # Cleanup is best-effort; the original error was already reported.
                    pass
            return None

    def validate_audio_file(self, audio_path):
        """Check whether an audio file is usable as a voice-cloning reference.

        The file must exist, last between 3 and 60 seconds, and have a frame
        rate of at least 16 kHz.

        Returns:
            ``(is_valid, message)`` where ``message`` explains the verdict.
            Any decoding error is reported as ``(False, "Error validating
            audio: ...")`` instead of being raised.
        """
        try:
            if not audio_path or not os.path.isfile(audio_path):
                return False, "Audio file not found"

            audio = AudioSegment.from_file(audio_path)
            duration = len(audio) / 1000.0  # milliseconds -> seconds
            if duration < 3:
                return False, "Audio too short (minimum 3 seconds required)"
            if duration > 60:
                return False, "Audio too long (maximum 60 seconds recommended)"
            if audio.frame_rate < 16000:
                return False, "Audio quality too low (minimum 16kHz recommended)"
            return True, "Audio file is suitable for voice cloning"
        except Exception as e:
            return False, f"Error validating audio: {e}"