import io
import os
import re
import tempfile

import numpy as np
import soundfile as sf
from pydub import AudioSegment


class ChatterboxScriptProcessor:
    """Chunk long scripts for a TTS engine and manage related audio files.

    The wrapped ``engine`` must provide a ``generate(text, voice=..., ...)``
    method returning an ``(audio, sample_rate)`` pair; ``audio`` may be
    ``None`` when generation produced nothing for a chunk.
    """

    # Sample rate reported when no audio at all could be generated.
    DEFAULT_SAMPLE_RATE = 22050

    # A sentence ends at '.', '!' or '?' followed by at least one space; the
    # look-behind keeps the punctuation attached to the sentence.
    _SENTENCE_BOUNDARY = re.compile(r"(?<=[.!?]) +")

    def __init__(self, engine):
        """Store the TTS engine used by :meth:`process_long_script`."""
        self.engine = engine

    def split_text_into_chunks(self, text, max_chars=800):
        """Split ``text`` into chunks of at most ``max_chars`` characters.

        Chunks are built from whole sentences whenever possible. A sentence
        that is longer than ``max_chars`` on its own is wrapped at word
        boundaries (and, as a last resort, inside a word) so that no chunk
        ever exceeds the limit.

        Args:
            text: The script to split. ``None``, empty and whitespace-only
                input yield an empty list.
            max_chars: Maximum length of every returned chunk.

        Returns:
            A list of non-empty chunk strings, in reading order.

        Raises:
            ValueError: If ``max_chars`` is smaller than 1.
        """
        if max_chars < 1:
            raise ValueError("max_chars must be at least 1")
        if not text:
            return []

        # Collapse every whitespace run (newlines, tabs, '\r', repeated
        # spaces) into a single space so sentence splitting is predictable.
        normalized = " ".join(text.split())
        if not normalized:
            return []

        chunks = []
        current = ""
        for sentence in self._SENTENCE_BOUNDARY.split(normalized):
            for piece in self._split_oversized(sentence, max_chars):
                if not piece:
                    continue
                if not current:
                    current = piece
                elif len(current) + 1 + len(piece) <= max_chars:
                    # +1 accounts for the joining space.
                    current = f"{current} {piece}"
                else:
                    chunks.append(current)
                    current = piece
        if current:
            chunks.append(current)
        return chunks

    @staticmethod
    def _split_oversized(sentence, max_chars):
        """Return ``sentence`` as pieces that are each <= ``max_chars`` long."""
        if len(sentence) <= max_chars:
            return [sentence]

        pieces = []
        current = ""
        for word in sentence.split(" "):
            # A single word longer than the limit has to be cut mid-word.
            while len(word) > max_chars:
                if current:
                    pieces.append(current)
                    current = ""
                pieces.append(word[:max_chars])
                word = word[max_chars:]
            if not word:
                continue
            if not current:
                current = word
            elif len(current) + 1 + len(word) <= max_chars:
                current = f"{current} {word}"
            else:
                pieces.append(current)
                current = word
        if current:
            pieces.append(current)
        return pieces

    def process_long_script(self, text, voice, speed=1.0, lang='en',
                            custom_voice_path=None, exaggeration=0.5,
                            cfg_weight=0.5, seed=None, temperature=1.0,
                            max_chars=800):
        """Generate audio for a long script chunk by chunk and merge it.

        The text is split with :meth:`split_text_into_chunks`, every chunk is
        passed to ``self.engine.generate`` with the given voice settings, and
        the resulting audio arrays are flattened and concatenated.

        Args:
            text: Full script to synthesise.
            voice, speed, lang, custom_voice_path, exaggeration, cfg_weight,
                seed, temperature: Forwarded unchanged to the engine for
                every chunk.
            max_chars: Maximum chunk length handed to the engine.

        Returns:
            ``(audio, sample_rate)`` where ``audio`` is a 1-D numpy array, or
            ``(None, DEFAULT_SAMPLE_RATE)`` when no chunk produced audio.
        """
        chunks = self.split_text_into_chunks(text, max_chars=max_chars)
        print(f"Split script into {len(chunks)} chunks.")

        combined_audio = []
        sample_rate = None

        for i, chunk in enumerate(chunks):
            print(f"Processing chunk {i+1}/{len(chunks)}...")
            audio, sr = self.engine.generate(
                chunk,
                voice=voice,
                speed=speed,
                lang=lang,
                custom_voice_path=custom_voice_path,
                exaggeration=exaggeration,
                cfg_weight=cfg_weight,
                seed=seed,
                temperature=temperature,
            )
            if audio is None:
                continue
            combined_audio.append(audio)
            if sample_rate is None:
                sample_rate = sr
            elif sr != sample_rate:
                # Samples are concatenated as-is, so a differing rate would
                # play back at the wrong speed; make that visible.
                print(f"Warning: chunk {i+1} has sample rate {sr}, "
                      f"expected {sample_rate}.")

        if not combined_audio:
            return None, self.DEFAULT_SAMPLE_RATE

        # Flatten each piece so (1, n) / (n, 1) outputs concatenate cleanly.
        final_audio = np.concatenate(
            [np.asarray(a).ravel() for a in combined_audio]
        )
        return final_audio, sample_rate

    def save_audio(self, audio_data, sample_rate, output_path):
        """Write ``audio_data`` to ``output_path`` using ``soundfile``.

        Returns:
            ``output_path`` on success, ``None`` if writing failed (the
            error is printed rather than raised).
        """
        try:
            sf.write(output_path, audio_data, sample_rate)
            return output_path
        except Exception as e:
            print(f"Error saving audio: {str(e)}")
            return None

    def process_audio_upload(self, audio_file):
        """Convert an uploaded voice reference to a temporary WAV file.

        Args:
            audio_file: Path or file object accepted by
                ``AudioSegment.from_file``; ``None`` is passed through.

        Returns:
            Path of the newly created ``.wav`` file (the caller owns it and
            is responsible for deleting it), or ``None`` if ``audio_file``
            is ``None`` or the conversion failed.
        """
        if audio_file is None:
            return None

        tmp_path = None
        try:
            # Decode first so a broken upload never leaves a temp file behind.
            audio = AudioSegment.from_file(audio_file)

            # mkstemp + close: the path must not be held open while pydub
            # writes to it (re-opening an open temp file fails on Windows).
            fd, tmp_path = tempfile.mkstemp(suffix=".wav")
            os.close(fd)

            # export() hands back the open output handle; close it so the
            # file is flushed and the descriptor is not leaked.
            exported = audio.export(tmp_path, format="wav")
            exported.close()

            # len(AudioSegment) is the duration in milliseconds.
            duration_seconds = len(audio) / 1000.0
            print(f"Audio duration: {duration_seconds:.2f} seconds")

            # Short references still work, but cloning quality suffers.
            if duration_seconds < 5:
                print("Warning: Audio reference is shorter than 5 seconds. Voice cloning quality may be reduced.")

            return tmp_path
        except Exception as e:
            print(f"Error processing audio upload: {str(e)}")
            if tmp_path is not None:
                try:
                    os.remove(tmp_path)
                except OSError:
                    pass
            return None

    def validate_audio_file(self, audio_path):
        """Check whether an audio file is usable as a voice-cloning reference.

        The file must exist, be decodable, last between 3 and 60 seconds and
        have a sample rate of at least 16 kHz.

        Returns:
            ``(is_valid, message)`` where ``message`` explains the verdict.
        """
        try:
            if not audio_path or not os.path.exists(audio_path):
                return False, "Audio file not found"

            audio = AudioSegment.from_file(audio_path)
            duration = len(audio) / 1000.0  # milliseconds -> seconds

            if duration < 3:
                return False, "Audio too short (minimum 3 seconds required)"
            if duration > 60:
                return False, "Audio too long (maximum 60 seconds recommended)"
            if audio.frame_rate < 16000:
                return False, "Audio quality too low (minimum 16kHz recommended)"

            return True, "Audio file is suitable for voice cloning"
        except Exception as e:
            return False, f"Error validating audio: {str(e)}"