|
|
import os |
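
# Route model caches to local folders. These must be set before the ML
# libraries below are imported, since they read the environment at import time.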
|
|
if 'XDG_CACHE_HOME' not in os.environ: |
|
|
os.environ['XDG_CACHE_HOME'] = os.path.realpath(os.path.join(os.getcwd(), './models/')) |
|
|
|
|
|
if 'TORTOISE_MODELS_DIR' not in os.environ: |
|
|
os.environ['TORTOISE_MODELS_DIR'] = os.path.realpath(os.path.join(os.getcwd(), './models/tortoise/')) |
|
|
|
|
|
if 'TRANSFORMERS_CACHE' not in os.environ: |
|
|
os.environ['TRANSFORMERS_CACHE'] = os.path.realpath(os.path.join(os.getcwd(), './models/transformers/')) |
|
|
|
|
|
import argparse |
|
|
import time |
|
|
import math |
|
|
import json |
|
|
import base64 |
|
|
import re |
|
|
import urllib.request |
|
|
import signal |
|
|
import gc |
|
|
import subprocess |
|
|
import psutil |
|
|
import yaml |
|
|
import hashlib |
|
|
import string |
|
|
import random |
|
|
|
|
|
from tqdm import tqdm |
|
|
import torch |
|
|
import torchaudio |
|
|
import music_tag |
|
|
import gradio as gr |
|
|
import gradio.utils |
|
|
import pandas as pd |
|
|
import numpy as np |
|
|
|
|
|
from glob import glob |
|
|
from datetime import datetime |
|
|
from datetime import timedelta |
|
|
|
|
|
from tortoise.api import TextToSpeech as TorToise_TTS, MODELS, get_model_path, pad_or_truncate |
|
|
from tortoise.utils.audio import load_audio, load_voice, load_voices, get_voice_dir, get_voices |
|
|
from tortoise.utils.text import split_and_recombine_text |
|
|
from tortoise.utils.device import get_device_name, set_device_name, get_device_count, get_device_vram, get_device_batch_size, do_gc |
|
|
|
|
|
from whisper.normalizers.english import EnglishTextNormalizer |
|
|
from whisper.normalizers.basic import BasicTextNormalizer |
|
|
from whisper.tokenizer import LANGUAGES |
|
|
|
|
|
MODELS['dvae.pth'] = "https://huggingface.co/jbetker/tortoise-tts-v2/resolve/3704aea61678e7e468a06d8eea121dba368a798e/.models/dvae.pth" |
|
|
|
|
|
WHISPER_MODELS = ["tiny", "base", "small", "medium", "large"] |
|
|
WHISPER_SPECIALIZED_MODELS = ["tiny.en", "base.en", "small.en", "medium.en"] |
|
|
WHISPER_BACKENDS = ["openai/whisper", "lightmare/whispercpp", "m-bain/whisperx"] |
|
|
VOCODERS = ['univnet', 'bigvgan_base_24khz_100band', 'bigvgan_24khz_100band'] |
|
|
TTSES = ['tortoise'] |
|
|
|
|
|
INFERENCING = False |
|
|
GENERATE_SETTINGS_ARGS = None |
|
|
|
|
|
LEARNING_RATE_SCHEMES = {"Multistep": "MultiStepLR", "Cos. Annealing": "CosineAnnealingLR_Restart"} |
|
|
LEARNING_RATE_SCHEDULE = [ 2, 4, 9, 18, 25, 33, 50 ] |
|
|
|
|
|
RESAMPLERS = {} |
|
|
|
|
|
MIN_TRAINING_DURATION = 0.6 |
|
|
MAX_TRAINING_DURATION = 11.6097505669 |
|
|
|
|
|
VALLE_ENABLED = False |
|
|
BARK_ENABLED = False |
|
|
|
|
|
try: |
|
|
from vall_e.emb.qnt import encode as valle_quantize |
|
|
from vall_e.emb.g2p import encode as valle_phonemize |
|
|
|
|
|
from vall_e.inference import TTS as VALLE_TTS |
|
|
|
|
|
import soundfile |
|
|
|
|
|
VALLE_ENABLED = True |
|
|
except Exception:
	# VALL-E support is optional; continue without it (re-raise here to debug import failures).
	pass
|
|
|
|
|
if VALLE_ENABLED: |
|
|
TTSES.append('vall-e') |
|
|
|
|
|
try: |
|
|
from bark.generation import SAMPLE_RATE as BARK_SAMPLE_RATE, ALLOWED_PROMPTS, preload_models, codec_decode, generate_coarse, generate_fine, generate_text_semantic, load_codec_model |
|
|
from bark.api import generate_audio as bark_generate_audio |
|
|
from encodec.utils import convert_audio |
|
|
|
|
|
from scipy.io.wavfile import write as write_wav |
|
|
|
|
|
BARK_ENABLED = True |
|
|
except Exception:
	# Bark support is optional; continue without it (re-raise here to debug import failures).
	pass
|
|
|
|
|
if BARK_ENABLED: |
|
|
TTSES.append('bark') |
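
	# Thin Bark wrapper exposing the inference() surface generate_bark() expects:
	# text in, (waveform, sample_rate) out.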
|
|
class Bark_TTS(): |
|
|
def __init__(self, small=False): |
|
|
self.input_sample_rate = BARK_SAMPLE_RATE |
|
|
			self.output_sample_rate = BARK_SAMPLE_RATE # Bark renders at its native rate; generate_bark() resamples to args.output_sample_rate afterwards
|
|
|
|
|
preload_models( |
|
|
text_use_gpu=True, |
|
|
coarse_use_gpu=True, |
|
|
fine_use_gpu=True, |
|
|
codec_use_gpu=True, |
|
|
|
|
|
text_use_small=small, |
|
|
coarse_use_small=small, |
|
|
fine_use_small=small, |
|
|
|
|
|
force_reload=False |
|
|
) |
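
		# Builds a Bark voice prompt (.npz) from one transcribed segment of the
		# voice's training data: the reference audio is run through the codec
		# model for the fine/coarse prompts, and generate_text_semantic() derives
		# matching semantic tokens. The prompt is saved where Bark looks up presets.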
|
|
|
|
|
def create_voice( self, voice, device='cuda' ): |
|
|
transcription_json = f'./training/{voice}/whisper.json' |
|
|
if not os.path.exists(transcription_json): |
|
|
raise f"Transcription for voice not found: {voice}" |
|
|
|
|
|
			with open(transcription_json, 'r', encoding="utf-8") as f:
				transcriptions = json.load(f)
|
|
candidates = [] |
|
|
for file in transcriptions: |
|
|
result = transcriptions[file] |
|
|
for segment in result['segments']: |
|
|
entry = ( |
|
|
file.replace(".wav", f"_{pad(segment['id'], 4)}.wav"), |
|
|
segment['end'] - segment['start'], |
|
|
segment['text'] |
|
|
) |
|
|
candidates.append(entry) |
|
|
|
|
|
candidates.sort(key=lambda x: x[1]) |
|
|
candidate = random.choice(candidates) |
|
|
audio_filepath = f'./training/{voice}/audio/{candidate[0]}' |
|
|
text = candidate[-1] |
|
|
|
|
|
print("Using as reference:", audio_filepath, text) |
|
|
|
|
|
|
|
|
model = load_codec_model(use_gpu=True) |
|
|
wav, sr = torchaudio.load(audio_filepath) |
|
|
wav = convert_audio(wav, sr, model.sample_rate, model.channels) |
|
|
wav = wav.unsqueeze(0).to(device) |
|
|
|
|
|
|
|
|
with torch.no_grad(): |
|
|
encoded_frames = model.encode(wav) |
|
|
codes = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1).squeeze().cpu().numpy() |
|
|
|
|
|
|
|
|
seconds = wav.shape[-1] / model.sample_rate |
|
|
|
|
|
semantic_tokens = generate_text_semantic(text, max_gen_duration_s=seconds, top_k=50, top_p=.95, temp=0.7) |
|
|
|
|
|
output_path = './modules/bark/bark/assets/prompts/' + voice.replace("/", "_") + '.npz' |
|
|
np.savez(output_path, fine_prompt=codes, coarse_prompt=codes[:2, :], semantic_prompt=semantic_tokens) |
|
|
|
|
|
def inference( self, text, voice, text_temp=0.7, waveform_temp=0.7 ): |
|
|
			if not os.path.exists('./modules/bark/bark/assets/prompts/' + voice.replace("/", "_") + '.npz'):
				self.create_voice( voice )
			voice = voice.replace("/", "_")
|
|
if voice not in ALLOWED_PROMPTS: |
|
|
ALLOWED_PROMPTS.add( voice ) |
|
|
|
|
|
return (bark_generate_audio(text, history_prompt=voice, text_temp=text_temp, waveform_temp=waveform_temp), BARK_SAMPLE_RATE) |
|
|
|
|
|
args = None |
|
|
tts = None |
|
|
tts_loading = False |
|
|
webui = None |
|
|
voicefixer = None |
|
|
|
|
|
whisper_model = None |
|
|
whisper_vad = None |
|
|
whisper_diarize = None |
|
|
whisper_align_model = None |
|
|
|
|
|
training_state = None |
|
|
|
|
|
current_voice = None |
|
|
|
|
|
def cleanup_voice_name( name ): |
|
|
return name.split("/")[-1] |
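
# Downmixes to mono and resamples, memoizing one torchaudio Resample transform
# per (input_rate, output_rate) pair, since building the kernel isn't free.
# e.g. wav, sr = resample(wav, 24000, 44100)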
|
|
|
|
|
def resample( waveform, input_rate, output_rate=44100 ): |
|
|
|
|
|
waveform = torch.mean(waveform, dim=0, keepdim=True) |
|
|
|
|
|
if input_rate == output_rate: |
|
|
return waveform, output_rate |
|
|
|
|
|
key = f'{input_rate}:{output_rate}' |
|
|
	if key not in RESAMPLERS:
|
|
RESAMPLERS[key] = torchaudio.transforms.Resample( |
|
|
input_rate, |
|
|
output_rate, |
|
|
lowpass_filter_width=16, |
|
|
rolloff=0.85, |
|
|
resampling_method="kaiser_window", |
|
|
beta=8.555504641634386, |
|
|
) |
|
|
|
|
|
return RESAMPLERS[key]( waveform ), output_rate |
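
# Dispatches to the backend selected at launch. Each generate_* function shares
# the same scaffolding: split the prompt on the delimiter, scan the output folder
# for the next free index, synthesize line by line (honoring `{...}` JSON setting
# overrides prefixed to a line), then resample, optionally voicefix, and write
# metadata.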
|
|
|
|
|
def generate(**kwargs): |
|
|
if args.tts_backend == "tortoise": |
|
|
return generate_tortoise(**kwargs) |
|
|
if args.tts_backend == "vall-e": |
|
|
return generate_valle(**kwargs) |
|
|
if args.tts_backend == "bark": |
|
|
return generate_bark(**kwargs) |
|
|
|
|
|
def generate_bark(**kwargs): |
|
|
parameters = {} |
|
|
parameters.update(kwargs) |
|
|
|
|
|
voice = parameters['voice'] |
|
|
progress = parameters['progress'] if 'progress' in parameters else None |
|
|
if parameters['seed'] == 0: |
|
|
parameters['seed'] = None |
|
|
|
|
|
usedSeed = parameters['seed'] |
|
|
|
|
|
	global args
	global tts
	global INFERENCING # set/cleared around the synthesis loop so cancel_generate() can see it
|
|
|
|
|
unload_whisper() |
|
|
unload_voicefixer() |
|
|
|
|
|
if not tts: |
|
|
|
|
|
if tts_loading: |
|
|
raise Exception("TTS is still initializing...") |
|
|
if progress is not None: |
|
|
notify_progress("Initializing TTS...", progress=progress) |
|
|
load_tts() |
|
|
if hasattr(tts, "loading") and tts.loading: |
|
|
raise Exception("TTS is still initializing...") |
|
|
|
|
|
do_gc() |
|
|
|
|
|
voice_samples = None |
|
|
conditioning_latents = None |
|
|
sample_voice = None |
|
|
|
|
|
voice_cache = {} |
|
|
|
|
|
def get_settings( override=None ): |
|
|
settings = { |
|
|
'voice': parameters['voice'], |
|
|
'text_temp': float(parameters['temperature']), |
|
|
'waveform_temp': float(parameters['temperature']), |
|
|
} |
|
|
|
|
|
|
|
|
selected_voice = voice |
|
|
if override is not None: |
|
|
if 'voice' in override: |
|
|
selected_voice = override['voice'] |
|
|
|
|
|
for k in override: |
|
|
if k not in settings: |
|
|
continue |
|
|
settings[k] = override[k] |
|
|
|
|
|
return settings |
|
|
|
|
|
if not parameters['delimiter']: |
|
|
parameters['delimiter'] = "\n" |
|
|
elif parameters['delimiter'] == "\\n": |
|
|
parameters['delimiter'] = "\n" |
|
|
|
|
|
if parameters['delimiter'] and parameters['delimiter'] != "" and parameters['delimiter'] in parameters['text']: |
|
|
texts = parameters['text'].split(parameters['delimiter']) |
|
|
else: |
|
|
texts = split_and_recombine_text(parameters['text']) |
|
|
|
|
|
full_start_time = time.time() |
|
|
|
|
|
outdir = f"{args.results_folder}/{voice}/" |
|
|
os.makedirs(outdir, exist_ok=True) |
|
|
|
|
|
audio_cache = {} |
|
|
|
|
|
volume_adjust = torchaudio.transforms.Vol(gain=args.output_volume, gain_type="amplitude") if args.output_volume != 1 else None |
|
|
|
|
|
idx = 0 |
|
|
idx_cache = {} |
|
|
for i, file in enumerate(os.listdir(outdir)): |
|
|
filename = os.path.basename(file) |
|
|
extension = os.path.splitext(filename)[1] |
|
|
if extension != ".json" and extension != ".wav": |
|
|
continue |
|
|
match = re.findall(rf"^{cleanup_voice_name(voice)}_(\d+)(?:.+?)?{extension}$", filename) |
|
|
if match and len(match) > 0: |
|
|
key = int(match[0]) |
|
|
idx_cache[key] = True |
|
|
|
|
|
if len(idx_cache) > 0: |
|
|
keys = sorted(list(idx_cache.keys())) |
|
|
idx = keys[-1] + 1 |
|
|
|
|
|
idx = pad(idx, 4) |
|
|
|
|
|
def get_name(line=0, candidate=0, combined=False): |
|
|
name = f"{idx}" |
|
|
if combined: |
|
|
name = f"{name}_combined" |
|
|
elif len(texts) > 1: |
|
|
name = f"{name}_{line}" |
|
|
if parameters['candidates'] > 1: |
|
|
name = f"{name}_{candidate}" |
|
|
return name |
|
|
|
|
|
def get_info( voice, settings = None, latents = True ): |
|
|
info = {} |
|
|
info.update(parameters) |
|
|
|
|
|
info['time'] = time.time()-full_start_time |
|
|
info['datetime'] = datetime.now().isoformat() |
|
|
|
|
|
info['progress'] = None |
|
|
del info['progress'] |
|
|
|
|
|
if info['delimiter'] == "\n": |
|
|
info['delimiter'] = "\\n" |
|
|
|
|
|
if settings is not None: |
|
|
for k in settings: |
|
|
if k in info: |
|
|
info[k] = settings[k] |
|
|
return info |
|
|
|
|
|
INFERENCING = True |
|
|
for line, cut_text in enumerate(texts): |
|
|
tqdm_prefix = f'[{str(line+1)}/{str(len(texts))}]' |
|
|
print(f"{tqdm_prefix} Generating line: {cut_text}") |
|
|
start_time = time.time() |
|
|
|
|
|
|
|
|
match = re.findall(r'^(\{.+\}) (.+?)$', cut_text) |
|
|
override = None |
|
|
if match and len(match) > 0: |
|
|
match = match[0] |
|
|
try: |
|
|
override = json.loads(match[0]) |
|
|
cut_text = match[1].strip() |
|
|
except Exception as e: |
|
|
raise Exception("Prompt settings editing requested, but received invalid JSON") |
|
|
|
|
|
settings = get_settings( override=override ) |
|
|
|
|
|
gen = tts.inference(cut_text, **settings ) |
|
|
|
|
|
run_time = time.time()-start_time |
|
|
print(f"Generating line took {run_time} seconds") |
|
|
|
|
|
if not isinstance(gen, list): |
|
|
gen = [gen] |
|
|
|
|
|
for j, g in enumerate(gen): |
|
|
wav, sr = g |
|
|
name = get_name(line=line, candidate=j) |
|
|
|
|
|
settings['text'] = cut_text |
|
|
settings['time'] = run_time |
|
|
settings['datetime'] = datetime.now().isoformat() |
|
|
|
|
|
|
|
|
|
|
|
write_wav(f'{outdir}/{cleanup_voice_name(voice)}_{name}.wav', sr, wav) |
|
|
wav, sr = torchaudio.load(f'{outdir}/{cleanup_voice_name(voice)}_{name}.wav') |
|
|
|
|
|
audio_cache[name] = { |
|
|
'audio': wav, |
|
|
'settings': get_info(voice=override['voice'] if override and 'voice' in override else voice, settings=settings) |
|
|
} |
|
|
|
|
|
del gen |
|
|
do_gc() |
|
|
INFERENCING = False |
|
|
|
|
|
for k in audio_cache: |
|
|
audio = audio_cache[k]['audio'] |
|
|
|
|
|
audio, _ = resample(audio, tts.output_sample_rate, args.output_sample_rate) |
|
|
if volume_adjust is not None: |
|
|
audio = volume_adjust(audio) |
|
|
|
|
|
audio_cache[k]['audio'] = audio |
|
|
torchaudio.save(f'{outdir}/{cleanup_voice_name(voice)}_{k}.wav', audio, args.output_sample_rate) |
|
|
|
|
|
output_voices = [] |
|
|
for candidate in range(parameters['candidates']): |
|
|
if len(texts) > 1: |
|
|
audio_clips = [] |
|
|
for line in range(len(texts)): |
|
|
name = get_name(line=line, candidate=candidate) |
|
|
audio = audio_cache[name]['audio'] |
|
|
audio_clips.append(audio) |
|
|
|
|
|
name = get_name(candidate=candidate, combined=True) |
|
|
audio = torch.cat(audio_clips, dim=-1) |
|
|
torchaudio.save(f'{outdir}/{cleanup_voice_name(voice)}_{name}.wav', audio, args.output_sample_rate) |
|
|
|
|
|
audio = audio.squeeze(0).cpu() |
|
|
audio_cache[name] = { |
|
|
'audio': audio, |
|
|
'settings': get_info(voice=voice), |
|
|
'output': True |
|
|
} |
|
|
else: |
|
|
name = get_name(candidate=candidate) |
|
|
audio_cache[name]['output'] = True |
|
|
|
|
|
|
|
|
if args.voice_fixer: |
|
|
if not voicefixer: |
|
|
notify_progress("Loading voicefix...", progress=progress) |
|
|
load_voicefixer() |
|
|
|
|
|
try: |
|
|
fixed_cache = {} |
|
|
for name in tqdm(audio_cache, desc="Running voicefix..."): |
|
|
del audio_cache[name]['audio'] |
|
|
if 'output' not in audio_cache[name] or not audio_cache[name]['output']: |
|
|
continue |
|
|
|
|
|
path = f'{outdir}/{cleanup_voice_name(voice)}_{name}.wav' |
|
|
fixed = f'{outdir}/{cleanup_voice_name(voice)}_{name}_fixed.wav' |
|
|
voicefixer.restore( |
|
|
input=path, |
|
|
output=fixed, |
|
|
cuda=get_device_name() == "cuda" and args.voice_fixer_use_cuda, |
|
|
|
|
|
) |
|
|
|
|
|
fixed_cache[f'{name}_fixed'] = { |
|
|
'settings': audio_cache[name]['settings'], |
|
|
'output': True |
|
|
} |
|
|
audio_cache[name]['output'] = False |
|
|
|
|
|
for name in fixed_cache: |
|
|
audio_cache[name] = fixed_cache[name] |
|
|
except Exception as e: |
|
|
print(e) |
|
|
print("\nFailed to run Voicefixer") |
|
|
|
|
|
for name in audio_cache: |
|
|
if 'output' not in audio_cache[name] or not audio_cache[name]['output']: |
|
|
if args.prune_nonfinal_outputs: |
|
|
audio_cache[name]['pruned'] = True |
|
|
os.remove(f'{outdir}/{cleanup_voice_name(voice)}_{name}.wav') |
|
|
continue |
|
|
|
|
|
output_voices.append(f'{outdir}/{cleanup_voice_name(voice)}_{name}.wav') |
|
|
|
|
|
if not args.embed_output_metadata: |
|
|
with open(f'{outdir}/{cleanup_voice_name(voice)}_{name}.json', 'w', encoding="utf-8") as f: |
|
|
f.write(json.dumps(audio_cache[name]['settings'], indent='\t') ) |
|
|
|
|
|
if args.embed_output_metadata: |
|
|
for name in tqdm(audio_cache, desc="Embedding metadata..."): |
|
|
if 'pruned' in audio_cache[name] and audio_cache[name]['pruned']: |
|
|
continue |
|
|
|
|
|
metadata = music_tag.load_file(f"{outdir}/{cleanup_voice_name(voice)}_{name}.wav") |
|
|
metadata['lyrics'] = json.dumps(audio_cache[name]['settings']) |
|
|
metadata.save() |
|
|
|
|
|
if sample_voice is not None: |
|
|
sample_voice = (tts.input_sample_rate, sample_voice.numpy()) |
|
|
|
|
|
info = get_info(voice=voice, latents=False) |
|
|
print(f"Generation took {info['time']} seconds, saved to '{output_voices[0]}'\n") |
|
|
|
|
|
info['seed'] = usedSeed |
|
|
if 'latents' in info: |
|
|
del info['latents'] |
|
|
|
|
|
os.makedirs('./config/', exist_ok=True) |
|
|
with open(f'./config/generate.json', 'w', encoding="utf-8") as f: |
|
|
f.write(json.dumps(info, indent='\t') ) |
|
|
|
|
|
stats = [ |
|
|
[ parameters['seed'], "{:.3f}".format(info['time']) ] |
|
|
] |
|
|
|
|
|
return ( |
|
|
sample_voice, |
|
|
output_voices, |
|
|
stats, |
|
|
) |
|
|
|
|
|
def generate_valle(**kwargs): |
|
|
parameters = {} |
|
|
parameters.update(kwargs) |
|
|
|
|
|
voice = parameters['voice'] |
|
|
progress = parameters['progress'] if 'progress' in parameters else None |
|
|
if parameters['seed'] == 0: |
|
|
parameters['seed'] = None |
|
|
|
|
|
usedSeed = parameters['seed'] |
|
|
|
|
|
	global args
	global tts
	global INFERENCING # set/cleared around the synthesis loop so cancel_generate() can see it
|
|
|
|
|
unload_whisper() |
|
|
unload_voicefixer() |
|
|
|
|
|
if not tts: |
|
|
|
|
|
if tts_loading: |
|
|
raise Exception("TTS is still initializing...") |
|
|
if progress is not None: |
|
|
notify_progress("Initializing TTS...", progress=progress) |
|
|
load_tts() |
|
|
if hasattr(tts, "loading") and tts.loading: |
|
|
raise Exception("TTS is still initializing...") |
|
|
|
|
|
do_gc() |
|
|
|
|
|
voice_samples = None |
|
|
conditioning_latents = None |
|
|
sample_voice = None |
|
|
|
|
|
voice_cache = {} |
|
|
def fetch_voice( voice ): |
|
|
voice_dir = f'./training/{voice}/audio/' |
|
|
if not os.path.isdir(voice_dir): |
|
|
voice_dir = f'./voices/{voice}/' |
|
|
files = [ f'{voice_dir}/{d}' for d in os.listdir(voice_dir) if d[-4:] == ".wav" ] |
|
|
|
|
|
return random.choice(files) |
|
|
|
|
|
def get_settings( override=None ): |
|
|
settings = { |
|
|
'ar_temp': float(parameters['temperature']), |
|
|
'nar_temp': float(parameters['temperature']), |
|
|
'max_ar_samples': parameters['num_autoregressive_samples'], |
|
|
} |
|
|
|
|
|
|
|
|
selected_voice = voice |
|
|
if override is not None: |
|
|
if 'voice' in override: |
|
|
selected_voice = override['voice'] |
|
|
|
|
|
for k in override: |
|
|
if k not in settings: |
|
|
continue |
|
|
settings[k] = override[k] |
|
|
|
|
|
settings['reference'] = fetch_voice(voice=selected_voice) |
|
|
return settings |
|
|
|
|
|
if not parameters['delimiter']: |
|
|
parameters['delimiter'] = "\n" |
|
|
elif parameters['delimiter'] == "\\n": |
|
|
parameters['delimiter'] = "\n" |
|
|
|
|
|
if parameters['delimiter'] and parameters['delimiter'] != "" and parameters['delimiter'] in parameters['text']: |
|
|
texts = parameters['text'].split(parameters['delimiter']) |
|
|
else: |
|
|
texts = split_and_recombine_text(parameters['text']) |
|
|
|
|
|
full_start_time = time.time() |
|
|
|
|
|
outdir = f"{args.results_folder}/{voice}/" |
|
|
os.makedirs(outdir, exist_ok=True) |
|
|
|
|
|
audio_cache = {} |
|
|
|
|
|
volume_adjust = torchaudio.transforms.Vol(gain=args.output_volume, gain_type="amplitude") if args.output_volume != 1 else None |
|
|
|
|
|
idx = 0 |
|
|
idx_cache = {} |
|
|
for i, file in enumerate(os.listdir(outdir)): |
|
|
filename = os.path.basename(file) |
|
|
extension = os.path.splitext(filename)[1] |
|
|
if extension != ".json" and extension != ".wav": |
|
|
continue |
|
|
match = re.findall(rf"^{voice}_(\d+)(?:.+?)?{extension}$", filename) |
|
|
if match and len(match) > 0: |
|
|
key = int(match[0]) |
|
|
idx_cache[key] = True |
|
|
|
|
|
if len(idx_cache) > 0: |
|
|
keys = sorted(list(idx_cache.keys())) |
|
|
idx = keys[-1] + 1 |
|
|
|
|
|
idx = pad(idx, 4) |
|
|
|
|
|
def get_name(line=0, candidate=0, combined=False): |
|
|
name = f"{idx}" |
|
|
if combined: |
|
|
name = f"{name}_combined" |
|
|
elif len(texts) > 1: |
|
|
name = f"{name}_{line}" |
|
|
if parameters['candidates'] > 1: |
|
|
name = f"{name}_{candidate}" |
|
|
return name |
|
|
|
|
|
def get_info( voice, settings = None, latents = True ): |
|
|
info = {} |
|
|
info.update(parameters) |
|
|
|
|
|
info['time'] = time.time()-full_start_time |
|
|
info['datetime'] = datetime.now().isoformat() |
|
|
|
|
|
info['progress'] = None |
|
|
del info['progress'] |
|
|
|
|
|
if info['delimiter'] == "\n": |
|
|
info['delimiter'] = "\\n" |
|
|
|
|
|
if settings is not None: |
|
|
for k in settings: |
|
|
if k in info: |
|
|
info[k] = settings[k] |
|
|
return info |
|
|
|
|
|
INFERENCING = True |
|
|
for line, cut_text in enumerate(texts): |
|
|
tqdm_prefix = f'[{str(line+1)}/{str(len(texts))}]' |
|
|
print(f"{tqdm_prefix} Generating line: {cut_text}") |
|
|
start_time = time.time() |
|
|
|
|
|
|
|
|
match = re.findall(r'^(\{.+\}) (.+?)$', cut_text) |
|
|
override = None |
|
|
if match and len(match) > 0: |
|
|
match = match[0] |
|
|
try: |
|
|
override = json.loads(match[0]) |
|
|
cut_text = match[1].strip() |
|
|
except Exception as e: |
|
|
raise Exception("Prompt settings editing requested, but received invalid JSON") |
|
|
|
|
|
settings = get_settings( override=override ) |
|
|
reference = settings['reference'] |
|
|
settings.pop("reference") |
|
|
|
|
|
gen = tts.inference(cut_text, reference, **settings ) |
|
|
|
|
|
run_time = time.time()-start_time |
|
|
print(f"Generating line took {run_time} seconds") |
|
|
|
|
|
if not isinstance(gen, list): |
|
|
gen = [gen] |
|
|
|
|
|
for j, g in enumerate(gen): |
|
|
wav, sr = g |
|
|
name = get_name(line=line, candidate=j) |
|
|
|
|
|
settings['text'] = cut_text |
|
|
settings['time'] = run_time |
|
|
settings['datetime'] = datetime.now().isoformat() |
|
|
|
|
|
|
|
|
|
|
|
soundfile.write(f'{outdir}/{cleanup_voice_name(voice)}_{name}.wav', wav.cpu()[0,0], sr) |
|
|
wav, sr = torchaudio.load(f'{outdir}/{cleanup_voice_name(voice)}_{name}.wav') |
|
|
|
|
|
audio_cache[name] = { |
|
|
'audio': wav, |
|
|
'settings': get_info(voice=override['voice'] if override and 'voice' in override else voice, settings=settings) |
|
|
} |
|
|
|
|
|
del gen |
|
|
do_gc() |
|
|
INFERENCING = False |
|
|
|
|
|
for k in audio_cache: |
|
|
audio = audio_cache[k]['audio'] |
|
|
|
|
|
audio, _ = resample(audio, tts.output_sample_rate, args.output_sample_rate) |
|
|
if volume_adjust is not None: |
|
|
audio = volume_adjust(audio) |
|
|
|
|
|
audio_cache[k]['audio'] = audio |
|
|
torchaudio.save(f'{outdir}/{cleanup_voice_name(voice)}_{k}.wav', audio, args.output_sample_rate) |
|
|
|
|
|
output_voices = [] |
|
|
for candidate in range(parameters['candidates']): |
|
|
if len(texts) > 1: |
|
|
audio_clips = [] |
|
|
for line in range(len(texts)): |
|
|
name = get_name(line=line, candidate=candidate) |
|
|
audio = audio_cache[name]['audio'] |
|
|
audio_clips.append(audio) |
|
|
|
|
|
name = get_name(candidate=candidate, combined=True) |
|
|
audio = torch.cat(audio_clips, dim=-1) |
|
|
torchaudio.save(f'{outdir}/{cleanup_voice_name(voice)}_{name}.wav', audio, args.output_sample_rate) |
|
|
|
|
|
audio = audio.squeeze(0).cpu() |
|
|
audio_cache[name] = { |
|
|
'audio': audio, |
|
|
'settings': get_info(voice=voice), |
|
|
'output': True |
|
|
} |
|
|
else: |
|
|
name = get_name(candidate=candidate) |
|
|
audio_cache[name]['output'] = True |
|
|
|
|
|
|
|
|
if args.voice_fixer: |
|
|
if not voicefixer: |
|
|
notify_progress("Loading voicefix...", progress=progress) |
|
|
load_voicefixer() |
|
|
|
|
|
try: |
|
|
fixed_cache = {} |
|
|
for name in tqdm(audio_cache, desc="Running voicefix..."): |
|
|
del audio_cache[name]['audio'] |
|
|
if 'output' not in audio_cache[name] or not audio_cache[name]['output']: |
|
|
continue |
|
|
|
|
|
path = f'{outdir}/{cleanup_voice_name(voice)}_{name}.wav' |
|
|
fixed = f'{outdir}/{cleanup_voice_name(voice)}_{name}_fixed.wav' |
|
|
voicefixer.restore( |
|
|
input=path, |
|
|
output=fixed, |
|
|
cuda=get_device_name() == "cuda" and args.voice_fixer_use_cuda, |
|
|
|
|
|
) |
|
|
|
|
|
fixed_cache[f'{name}_fixed'] = { |
|
|
'settings': audio_cache[name]['settings'], |
|
|
'output': True |
|
|
} |
|
|
audio_cache[name]['output'] = False |
|
|
|
|
|
for name in fixed_cache: |
|
|
audio_cache[name] = fixed_cache[name] |
|
|
except Exception as e: |
|
|
print(e) |
|
|
print("\nFailed to run Voicefixer") |
|
|
|
|
|
for name in audio_cache: |
|
|
if 'output' not in audio_cache[name] or not audio_cache[name]['output']: |
|
|
if args.prune_nonfinal_outputs: |
|
|
audio_cache[name]['pruned'] = True |
|
|
os.remove(f'{outdir}/{cleanup_voice_name(voice)}_{name}.wav') |
|
|
continue |
|
|
|
|
|
output_voices.append(f'{outdir}/{cleanup_voice_name(voice)}_{name}.wav') |
|
|
|
|
|
if not args.embed_output_metadata: |
|
|
with open(f'{outdir}/{cleanup_voice_name(voice)}_{name}.json', 'w', encoding="utf-8") as f: |
|
|
f.write(json.dumps(audio_cache[name]['settings'], indent='\t') ) |
|
|
|
|
|
if args.embed_output_metadata: |
|
|
for name in tqdm(audio_cache, desc="Embedding metadata..."): |
|
|
if 'pruned' in audio_cache[name] and audio_cache[name]['pruned']: |
|
|
continue |
|
|
|
|
|
metadata = music_tag.load_file(f"{outdir}/{cleanup_voice_name(voice)}_{name}.wav") |
|
|
metadata['lyrics'] = json.dumps(audio_cache[name]['settings']) |
|
|
metadata.save() |
|
|
|
|
|
if sample_voice is not None: |
|
|
sample_voice = (tts.input_sample_rate, sample_voice.numpy()) |
|
|
|
|
|
info = get_info(voice=voice, latents=False) |
|
|
print(f"Generation took {info['time']} seconds, saved to '{output_voices[0]}'\n") |
|
|
|
|
|
info['seed'] = usedSeed |
|
|
if 'latents' in info: |
|
|
del info['latents'] |
|
|
|
|
|
os.makedirs('./config/', exist_ok=True) |
|
|
with open(f'./config/generate.json', 'w', encoding="utf-8") as f: |
|
|
f.write(json.dumps(info, indent='\t') ) |
|
|
|
|
|
stats = [ |
|
|
[ parameters['seed'], "{:.3f}".format(info['time']) ] |
|
|
] |
|
|
|
|
|
return ( |
|
|
sample_voice, |
|
|
output_voices, |
|
|
stats, |
|
|
) |
|
|
|
|
|
def generate_tortoise(**kwargs): |
|
|
parameters = {} |
|
|
parameters.update(kwargs) |
|
|
|
|
|
voice = parameters['voice'] |
|
|
progress = parameters['progress'] if 'progress' in parameters else None |
|
|
if parameters['seed'] == 0: |
|
|
parameters['seed'] = None |
|
|
|
|
|
usedSeed = parameters['seed'] |
|
|
|
|
|
	global args
	global tts
	global INFERENCING # set/cleared around the synthesis loop so cancel_generate() can see it
|
|
|
|
|
unload_whisper() |
|
|
unload_voicefixer() |
|
|
|
|
|
if not tts: |
|
|
|
|
|
if tts_loading: |
|
|
raise Exception("TTS is still initializing...") |
|
|
load_tts() |
|
|
if hasattr(tts, "loading") and tts.loading: |
|
|
raise Exception("TTS is still initializing...") |
|
|
|
|
|
do_gc() |
|
|
|
|
|
voice_samples = None |
|
|
conditioning_latents = None |
|
|
sample_voice = None |
|
|
|
|
|
voice_cache = {} |
|
|
def fetch_voice( voice ): |
|
|
cache_key = f'{voice}:{tts.autoregressive_model_hash[:8]}' |
|
|
if cache_key in voice_cache: |
|
|
return voice_cache[cache_key] |
|
|
|
|
|
print(f"Loading voice: {voice} with model {tts.autoregressive_model_hash[:8]}") |
|
|
sample_voice = None |
|
|
if voice == "microphone": |
|
|
if parameters['mic_audio'] is None: |
|
|
raise Exception("Please provide audio from mic when choosing `microphone` as a voice input") |
|
|
voice_samples, conditioning_latents = [load_audio(parameters['mic_audio'], tts.input_sample_rate)], None |
|
|
elif voice == "random": |
|
|
voice_samples, conditioning_latents = None, tts.get_random_conditioning_latents() |
|
|
else: |
|
|
if progress is not None: |
|
|
notify_progress(f"Loading voice: {voice}", progress=progress) |
|
|
|
|
|
voice_samples, conditioning_latents = load_voice(voice, model_hash=tts.autoregressive_model_hash) |
|
|
|
|
|
if voice_samples and len(voice_samples) > 0: |
|
|
if conditioning_latents is None: |
|
|
conditioning_latents = compute_latents(voice=voice, voice_samples=voice_samples, voice_latents_chunks=parameters['voice_latents_chunks']) |
|
|
|
|
|
sample_voice = torch.cat(voice_samples, dim=-1).squeeze().cpu() |
|
|
voice_samples = None |
|
|
|
|
|
voice_cache[cache_key] = (voice_samples, conditioning_latents, sample_voice) |
|
|
return voice_cache[cache_key] |
|
|
|
|
|
def get_settings( override=None ): |
|
|
settings = { |
|
|
'temperature': float(parameters['temperature']), |
|
|
|
|
|
'top_p': float(parameters['top_p']), |
|
|
'diffusion_temperature': float(parameters['diffusion_temperature']), |
|
|
'length_penalty': float(parameters['length_penalty']), |
|
|
'repetition_penalty': float(parameters['repetition_penalty']), |
|
|
'cond_free_k': float(parameters['cond_free_k']), |
|
|
|
|
|
'num_autoregressive_samples': parameters['num_autoregressive_samples'], |
|
|
'sample_batch_size': args.sample_batch_size, |
|
|
'diffusion_iterations': parameters['diffusion_iterations'], |
|
|
|
|
|
'voice_samples': None, |
|
|
'conditioning_latents': None, |
|
|
|
|
|
'use_deterministic_seed': parameters['seed'], |
|
|
'return_deterministic_state': True, |
|
|
'k': parameters['candidates'], |
|
|
'diffusion_sampler': parameters['diffusion_sampler'], |
|
|
'breathing_room': parameters['breathing_room'], |
|
|
'progress': parameters['progress'], |
|
|
'half_p': "Half Precision" in parameters['experimentals'], |
|
|
'cond_free': "Conditioning-Free" in parameters['experimentals'], |
|
|
'cvvp_amount': parameters['cvvp_weight'], |
|
|
|
|
|
'autoregressive_model': args.autoregressive_model, |
|
|
'diffusion_model': args.diffusion_model, |
|
|
'tokenizer_json': args.tokenizer_json, |
|
|
} |
|
|
|
|
|
|
|
|
selected_voice = voice |
|
|
if override is not None: |
|
|
if 'voice' in override: |
|
|
selected_voice = override['voice'] |
|
|
|
|
|
for k in override: |
|
|
if k not in settings: |
|
|
continue |
|
|
settings[k] = override[k] |
|
|
|
|
|
if settings['autoregressive_model'] is not None: |
|
|
if settings['autoregressive_model'] == "auto": |
|
|
settings['autoregressive_model'] = deduce_autoregressive_model(selected_voice) |
|
|
tts.load_autoregressive_model(settings['autoregressive_model']) |
|
|
|
|
|
if settings['diffusion_model'] is not None: |
|
|
if settings['diffusion_model'] == "auto": |
|
|
settings['diffusion_model'] = deduce_diffusion_model(selected_voice) |
|
|
tts.load_diffusion_model(settings['diffusion_model']) |
|
|
|
|
|
if settings['tokenizer_json'] is not None: |
|
|
tts.load_tokenizer_json(settings['tokenizer_json']) |
|
|
|
|
|
settings['voice_samples'], settings['conditioning_latents'], _ = fetch_voice(voice=selected_voice) |
|
|
|
|
|
|
|
|
|
|
|
settings['sample_batch_size'] = args.sample_batch_size |
|
|
if not settings['sample_batch_size']: |
|
|
settings['sample_batch_size'] = tts.autoregressive_batch_size |
|
|
if settings['num_autoregressive_samples'] < settings['sample_batch_size']: |
|
|
settings['sample_batch_size'] = settings['num_autoregressive_samples'] |
|
|
|
|
|
if settings['conditioning_latents'] is not None and len(settings['conditioning_latents']) == 2 and settings['cvvp_amount'] > 0: |
|
|
print("Requesting weighing against CVVP weight, but voice latents are missing some extra data. Please regenerate your voice latents with 'Slimmer voice latents' unchecked.") |
|
|
settings['cvvp_amount'] = 0 |
|
|
|
|
|
return settings |
|
|
|
|
|
if not parameters['delimiter']: |
|
|
parameters['delimiter'] = "\n" |
|
|
elif parameters['delimiter'] == "\\n": |
|
|
parameters['delimiter'] = "\n" |
|
|
|
|
|
if parameters['delimiter'] and parameters['delimiter'] != "" and parameters['delimiter'] in parameters['text']: |
|
|
texts = parameters['text'].split(parameters['delimiter']) |
|
|
else: |
|
|
texts = split_and_recombine_text(parameters['text']) |
|
|
|
|
|
full_start_time = time.time() |
|
|
|
|
|
outdir = f"{args.results_folder}/{voice}/" |
|
|
os.makedirs(outdir, exist_ok=True) |
|
|
|
|
|
audio_cache = {} |
|
|
|
|
|
volume_adjust = torchaudio.transforms.Vol(gain=args.output_volume, gain_type="amplitude") if args.output_volume != 1 else None |
|
|
|
|
|
idx = 0 |
|
|
idx_cache = {} |
|
|
for i, file in enumerate(os.listdir(outdir)): |
|
|
filename = os.path.basename(file) |
|
|
extension = os.path.splitext(filename)[1] |
|
|
if extension != ".json" and extension != ".wav": |
|
|
continue |
|
|
match = re.findall(rf"^{voice}_(\d+)(?:.+?)?{extension}$", filename) |
|
|
if match and len(match) > 0: |
|
|
key = int(match[0]) |
|
|
idx_cache[key] = True |
|
|
|
|
|
if len(idx_cache) > 0: |
|
|
keys = sorted(list(idx_cache.keys())) |
|
|
idx = keys[-1] + 1 |
|
|
|
|
|
idx = pad(idx, 4) |
|
|
|
|
|
def get_name(line=0, candidate=0, combined=False): |
|
|
name = f"{idx}" |
|
|
if combined: |
|
|
name = f"{name}_combined" |
|
|
elif len(texts) > 1: |
|
|
name = f"{name}_{line}" |
|
|
if parameters['candidates'] > 1: |
|
|
name = f"{name}_{candidate}" |
|
|
return name |
|
|
|
|
|
def get_info( voice, settings = None, latents = True ): |
|
|
info = {} |
|
|
info.update(parameters) |
|
|
|
|
|
info['time'] = time.time()-full_start_time |
|
|
info['datetime'] = datetime.now().isoformat() |
|
|
|
|
|
info['model'] = tts.autoregressive_model_path |
|
|
info['model_hash'] = tts.autoregressive_model_hash |
|
|
|
|
|
info['progress'] = None |
|
|
del info['progress'] |
|
|
|
|
|
if info['delimiter'] == "\n": |
|
|
info['delimiter'] = "\\n" |
|
|
|
|
|
if settings is not None: |
|
|
for k in settings: |
|
|
if k in info: |
|
|
info[k] = settings[k] |
|
|
|
|
|
if 'half_p' in settings and 'cond_free' in settings: |
|
|
info['experimentals'] = [] |
|
|
if settings['half_p']: |
|
|
info['experimentals'].append("Half Precision") |
|
|
if settings['cond_free']: |
|
|
info['experimentals'].append("Conditioning-Free") |
|
|
|
|
|
if latents and "latents" not in info: |
|
|
voice = info['voice'] |
|
|
model_hash = settings["model_hash"][:8] if settings is not None and "model_hash" in settings else tts.autoregressive_model_hash[:8] |
|
|
|
|
|
dir = f'{get_voice_dir()}/{voice}/' |
|
|
latents_path = f'{dir}/cond_latents_{model_hash}.pth' |
|
|
|
|
|
if voice == "random" or voice == "microphone": |
|
|
if latents and settings is not None and settings['conditioning_latents']: |
|
|
os.makedirs(dir, exist_ok=True) |
|
|
					torch.save(settings['conditioning_latents'], latents_path)
|
|
|
|
|
if latents_path and os.path.exists(latents_path): |
|
|
try: |
|
|
with open(latents_path, 'rb') as f: |
|
|
info['latents'] = base64.b64encode(f.read()).decode("ascii") |
|
|
except Exception as e: |
|
|
pass |
|
|
|
|
|
return info |
|
|
|
|
|
INFERENCING = True |
|
|
for line, cut_text in enumerate(texts): |
|
|
if should_phonemize(): |
|
|
cut_text = phonemizer( cut_text ) |
|
|
|
|
|
if parameters['emotion'] == "Custom": |
|
|
if parameters['prompt'] and parameters['prompt'].strip() != "": |
|
|
cut_text = f"[{parameters['prompt']},] {cut_text}" |
|
|
elif parameters['emotion'] != "None" and parameters['emotion']: |
|
|
cut_text = f"[I am really {parameters['emotion'].lower()},] {cut_text}" |
|
|
|
|
|
tqdm_prefix = f'[{str(line+1)}/{str(len(texts))}]' |
|
|
print(f"{tqdm_prefix} Generating line: {cut_text}") |
|
|
start_time = time.time() |
|
|
|
|
|
|
|
|
match = re.findall(r'^(\{.+\}) (.+?)$', cut_text) |
|
|
override = None |
|
|
if match and len(match) > 0: |
|
|
match = match[0] |
|
|
try: |
|
|
override = json.loads(match[0]) |
|
|
cut_text = match[1].strip() |
|
|
except Exception as e: |
|
|
raise Exception("Prompt settings editing requested, but received invalid JSON") |
|
|
|
|
|
settings = get_settings( override=override ) |
|
|
gen, additionals = tts.tts(cut_text, **settings ) |
|
|
|
|
|
parameters['seed'] = additionals[0] |
|
|
run_time = time.time()-start_time |
|
|
print(f"Generating line took {run_time} seconds") |
|
|
|
|
|
if not isinstance(gen, list): |
|
|
gen = [gen] |
|
|
|
|
|
for j, g in enumerate(gen): |
|
|
audio = g.squeeze(0).cpu() |
|
|
name = get_name(line=line, candidate=j) |
|
|
|
|
|
settings['text'] = cut_text |
|
|
settings['time'] = run_time |
|
|
settings['datetime'] = datetime.now().isoformat() |
|
|
if args.tts_backend == "tortoise": |
|
|
settings['model'] = tts.autoregressive_model_path |
|
|
settings['model_hash'] = tts.autoregressive_model_hash |
|
|
|
|
|
audio_cache[name] = { |
|
|
'audio': audio, |
|
|
'settings': get_info(voice=override['voice'] if override and 'voice' in override else voice, settings=settings) |
|
|
} |
|
|
|
|
|
torchaudio.save(f'{outdir}/{cleanup_voice_name(voice)}_{name}.wav', audio, tts.output_sample_rate) |
|
|
|
|
|
del gen |
|
|
do_gc() |
|
|
INFERENCING = False |
|
|
|
|
|
for k in audio_cache: |
|
|
audio = audio_cache[k]['audio'] |
|
|
|
|
|
audio, _ = resample(audio, tts.output_sample_rate, args.output_sample_rate) |
|
|
if volume_adjust is not None: |
|
|
audio = volume_adjust(audio) |
|
|
|
|
|
audio_cache[k]['audio'] = audio |
|
|
torchaudio.save(f'{outdir}/{cleanup_voice_name(voice)}_{k}.wav', audio, args.output_sample_rate) |
|
|
|
|
|
output_voices = [] |
|
|
for candidate in range(parameters['candidates']): |
|
|
if len(texts) > 1: |
|
|
audio_clips = [] |
|
|
for line in range(len(texts)): |
|
|
name = get_name(line=line, candidate=candidate) |
|
|
audio = audio_cache[name]['audio'] |
|
|
audio_clips.append(audio) |
|
|
|
|
|
name = get_name(candidate=candidate, combined=True) |
|
|
audio = torch.cat(audio_clips, dim=-1) |
|
|
torchaudio.save(f'{outdir}/{cleanup_voice_name(voice)}_{name}.wav', audio, args.output_sample_rate) |
|
|
|
|
|
audio = audio.squeeze(0).cpu() |
|
|
audio_cache[name] = { |
|
|
'audio': audio, |
|
|
'settings': get_info(voice=voice), |
|
|
'output': True |
|
|
} |
|
|
else: |
|
|
name = get_name(candidate=candidate) |
|
|
audio_cache[name]['output'] = True |
|
|
|
|
|
|
|
|
if args.voice_fixer: |
|
|
if not voicefixer: |
|
|
notify_progress("Loading voicefix...", progress=progress) |
|
|
load_voicefixer() |
|
|
|
|
|
try: |
|
|
fixed_cache = {} |
|
|
for name in tqdm(audio_cache, desc="Running voicefix..."): |
|
|
del audio_cache[name]['audio'] |
|
|
if 'output' not in audio_cache[name] or not audio_cache[name]['output']: |
|
|
continue |
|
|
|
|
|
path = f'{outdir}/{cleanup_voice_name(voice)}_{name}.wav' |
|
|
fixed = f'{outdir}/{cleanup_voice_name(voice)}_{name}_fixed.wav' |
|
|
voicefixer.restore( |
|
|
input=path, |
|
|
output=fixed, |
|
|
cuda=get_device_name() == "cuda" and args.voice_fixer_use_cuda, |
|
|
|
|
|
) |
|
|
|
|
|
fixed_cache[f'{name}_fixed'] = { |
|
|
'settings': audio_cache[name]['settings'], |
|
|
'output': True |
|
|
} |
|
|
audio_cache[name]['output'] = False |
|
|
|
|
|
for name in fixed_cache: |
|
|
audio_cache[name] = fixed_cache[name] |
|
|
except Exception as e: |
|
|
print(e) |
|
|
print("\nFailed to run Voicefixer") |
|
|
|
|
|
for name in audio_cache: |
|
|
if 'output' not in audio_cache[name] or not audio_cache[name]['output']: |
|
|
if args.prune_nonfinal_outputs: |
|
|
audio_cache[name]['pruned'] = True |
|
|
os.remove(f'{outdir}/{cleanup_voice_name(voice)}_{name}.wav') |
|
|
continue |
|
|
|
|
|
output_voices.append(f'{outdir}/{cleanup_voice_name(voice)}_{name}.wav') |
|
|
|
|
|
if not args.embed_output_metadata: |
|
|
with open(f'{outdir}/{cleanup_voice_name(voice)}_{name}.json', 'w', encoding="utf-8") as f: |
|
|
f.write(json.dumps(audio_cache[name]['settings'], indent='\t') ) |
|
|
|
|
|
if args.embed_output_metadata: |
|
|
for name in tqdm(audio_cache, desc="Embedding metadata..."): |
|
|
if 'pruned' in audio_cache[name] and audio_cache[name]['pruned']: |
|
|
continue |
|
|
|
|
|
metadata = music_tag.load_file(f"{outdir}/{cleanup_voice_name(voice)}_{name}.wav") |
|
|
metadata['lyrics'] = json.dumps(audio_cache[name]['settings']) |
|
|
metadata.save() |
|
|
|
|
|
if sample_voice is not None: |
|
|
sample_voice = (tts.input_sample_rate, sample_voice.numpy()) |
|
|
|
|
|
info = get_info(voice=voice, latents=False) |
|
|
print(f"Generation took {info['time']} seconds, saved to '{output_voices[0]}'\n") |
|
|
|
|
|
info['seed'] = usedSeed |
|
|
if 'latents' in info: |
|
|
del info['latents'] |
|
|
|
|
|
os.makedirs('./config/', exist_ok=True) |
|
|
with open(f'./config/generate.json', 'w', encoding="utf-8") as f: |
|
|
f.write(json.dumps(info, indent='\t') ) |
|
|
|
|
|
stats = [ |
|
|
[ parameters['seed'], "{:.3f}".format(info['time']) ] |
|
|
] |
|
|
|
|
|
return ( |
|
|
sample_voice, |
|
|
output_voices, |
|
|
stats, |
|
|
) |
|
|
|
|
|
def cancel_generate(): |
|
|
if not INFERENCING: |
|
|
return |
|
|
|
|
|
import tortoise.api |
|
|
|
|
|
tortoise.api.STOP_SIGNAL = True |
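
# Returns the hex digest of a file; pass buffer_size > 0 to hash large files in
# chunks instead of reading them wholesale.
# e.g. hash_file('./models/tortoise/dvae.pth', algo="sha1", buffer_size=1 << 20)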
|
|
|
|
|
def hash_file(path, algo="md5", buffer_size=0):
	# `hasher` avoids shadowing the builtin hash()
	if algo == "md5":
		hasher = hashlib.md5()
	elif algo == "sha1":
		hasher = hashlib.sha1()
	else:
		raise Exception(f'Unknown hash algorithm specified: {algo}')

	if not os.path.exists(path):
		raise Exception(f'Path not found: {path}')

	with open(path, 'rb') as f:
		if buffer_size > 0:
			# hash in chunks to keep memory usage flat on large files
			while True:
				data = f.read(buffer_size)
				if not data:
					break
				hasher.update(data)
		else:
			hasher.update(f.read())

	return hasher.hexdigest()
|
|
|
|
|
def update_baseline_for_latents_chunks( voice ): |
|
|
global current_voice |
|
|
current_voice = voice |
|
|
|
|
|
path = f'{get_voice_dir()}/{voice}/' |
|
|
if not os.path.isdir(path): |
|
|
return 1 |
|
|
|
|
|
dataset_file = f'./training/{voice}/train.txt' |
|
|
if os.path.exists(dataset_file): |
|
|
return 0 |
|
|
|
|
|
files = os.listdir(path) |
|
|
|
|
|
total = 0 |
|
|
total_duration = 0 |
|
|
|
|
|
for file in files: |
|
|
if file[-4:] != ".wav": |
|
|
continue |
|
|
|
|
|
metadata = torchaudio.info(f'{path}/{file}') |
|
|
duration = metadata.num_frames / metadata.sample_rate |
|
|
total_duration += duration |
|
|
total = total + 1 |
|
|
|
|
|
|
|
|
|
|
|
if args.autocalculate_voice_chunk_duration_size == 0: |
|
|
return int(total_duration / total) if total > 0 else 1 |
|
|
return int(total_duration / args.autocalculate_voice_chunk_duration_size) if total_duration > 0 else 1 |
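
# Computes conditioning latents for a voice and saves them keyed to the first 8
# characters of the autoregressive model's hash. Prefers the prepared training
# dataset (padding every clip to the longest) over raw voice clips.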
|
|
|
|
|
def compute_latents(voice=None, voice_samples=None, voice_latents_chunks=0, progress=None): |
|
|
global tts |
|
|
global args |
|
|
|
|
|
unload_whisper() |
|
|
unload_voicefixer() |
|
|
|
|
|
if not tts: |
|
|
if tts_loading: |
|
|
raise Exception("TTS is still initializing...") |
|
|
load_tts() |
|
|
|
|
|
if hasattr(tts, "loading") and tts.loading: |
|
|
raise Exception("TTS is still initializing...") |
|
|
|
|
|
if args.autoregressive_model == "auto": |
|
|
tts.load_autoregressive_model(deduce_autoregressive_model(voice)) |
|
|
|
|
|
if voice: |
|
|
load_from_dataset = voice_latents_chunks == 0 |
|
|
|
|
|
if load_from_dataset: |
|
|
dataset_path = f'./training/{voice}/train.txt' |
|
|
if not os.path.exists(dataset_path): |
|
|
load_from_dataset = False |
|
|
else: |
|
|
with open(dataset_path, 'r', encoding="utf-8") as f: |
|
|
lines = f.readlines() |
|
|
|
|
|
print("Leveraging dataset for computing latents") |
|
|
|
|
|
voice_samples = [] |
|
|
max_length = 0 |
|
|
for line in lines: |
|
|
filename = f'./training/{voice}/{line.split("|")[0]}' |
|
|
|
|
|
waveform = load_audio(filename, 22050) |
|
|
max_length = max(max_length, waveform.shape[-1]) |
|
|
voice_samples.append(waveform) |
|
|
|
|
|
for i in range(len(voice_samples)): |
|
|
voice_samples[i] = pad_or_truncate(voice_samples[i], max_length) |
|
|
|
|
|
voice_latents_chunks = len(voice_samples) |
|
|
				if voice_latents_chunks == 0:
					print("Dataset is empty!")
					load_from_dataset = False # fall back to the raw voice clips below
|
|
if not load_from_dataset: |
|
|
voice_samples, _ = load_voice(voice, load_latents=False) |
|
|
|
|
|
if voice_samples is None: |
|
|
return |
|
|
|
|
|
conditioning_latents = tts.get_conditioning_latents(voice_samples, return_mels=not args.latents_lean_and_mean, slices=voice_latents_chunks, force_cpu=args.force_cpu_for_conditioning_latents) |
|
|
|
|
|
if len(conditioning_latents) == 4: |
|
|
conditioning_latents = (conditioning_latents[0], conditioning_latents[1], conditioning_latents[2], None) |
|
|
|
|
|
outfile = f'{get_voice_dir()}/{voice}/cond_latents_{tts.autoregressive_model_hash[:8]}.pth' |
|
|
torch.save(conditioning_latents, outfile) |
|
|
print(f'Saved voice latents: {outfile}') |
|
|
|
|
|
return conditioning_latents |
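
# Wraps one training run: spawns the trainer as a subprocess, tails its stdout,
# parses metric lines into statistics, and surfaces progress/ETA to the UI.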
|
|
|
|
|
|
|
|
class TrainingState(): |
|
|
def __init__(self, config_path, keep_x_past_checkpoints=0, start=True): |
|
|
self.killed = False |
|
|
|
|
|
self.training_dir = os.path.dirname(config_path) |
|
|
with open(config_path, 'r') as file: |
|
|
self.yaml_config = yaml.safe_load(file) |
|
|
|
|
|
		with open(f"{self.training_dir}/train.json", 'r', encoding="utf-8") as f:
			self.json_config = json.load(f)
|
|
self.dataset_path = f"{self.training_dir}/train.txt" |
|
|
with open(self.dataset_path, 'r', encoding="utf-8") as f: |
|
|
self.dataset_size = len(f.readlines()) |
|
|
|
|
|
self.batch_size = self.json_config["batch_size"] |
|
|
self.save_rate = self.json_config["save_rate"] |
|
|
|
|
|
self.epoch = 0 |
|
|
self.epochs = self.json_config["epochs"] |
|
|
self.it = 0 |
|
|
self.its = calc_iterations( self.epochs, self.dataset_size, self.batch_size ) |
|
|
self.step = 0 |
|
|
self.steps = int(self.its / self.dataset_size) |
|
|
self.checkpoint = 0 |
|
|
self.checkpoints = int((self.its - self.it) / self.save_rate) |
|
|
|
|
|
self.gpus = self.json_config['gpus'] |
|
|
|
|
|
self.buffer = [] |
|
|
|
|
|
self.open_state = False |
|
|
self.training_started = False |
|
|
|
|
|
self.info = {} |
|
|
|
|
|
self.it_rate = "" |
|
|
self.it_rates = 0 |
|
|
|
|
|
self.epoch_rate = "" |
|
|
|
|
|
self.eta = "?" |
|
|
self.eta_hhmmss = "?" |
|
|
|
|
|
self.nan_detected = False |
|
|
|
|
|
self.last_info_check_at = 0 |
|
|
self.statistics = { |
|
|
'loss': [], |
|
|
'lr': [], |
|
|
'grad_norm': [], |
|
|
} |
|
|
self.losses = [] |
|
|
self.metrics = { |
|
|
'step': "", |
|
|
'rate': "", |
|
|
'loss': "", |
|
|
} |
|
|
|
|
|
self.loss_milestones = [ 1.0, 0.15, 0.05 ] |
|
|
|
|
|
if args.tts_backend=="vall-e": |
|
|
self.valle_last_it = 0 |
|
|
self.valle_steps = 0 |
|
|
|
|
|
if keep_x_past_checkpoints > 0: |
|
|
self.cleanup_old(keep=keep_x_past_checkpoints) |
|
|
if start: |
|
|
self.spawn_process(config_path=config_path, gpus=self.gpus) |
|
|
|
|
|
def spawn_process(self, config_path, gpus=1): |
|
|
if args.tts_backend == "vall-e": |
|
|
self.cmd = ['deepspeed', f'--num_gpus={gpus}', '--module', 'vall_e.train', f'yaml="{config_path}"'] |
|
|
else: |
|
|
self.cmd = ['train.bat', config_path] if os.name == "nt" else ['./train.sh', config_path] |
|
|
|
|
|
print("Spawning process: ", " ".join(self.cmd)) |
|
|
self.process = subprocess.Popen(self.cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, universal_newlines=True) |
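
	# Normalizes one metrics payload (a dict, or a raw log line containing
	# "Training Metrics:"/"Validation Metrics:" JSON) into self.statistics and
	# refreshes step/rate/ETA bookkeeping.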
|
|
|
|
|
	def parse_metrics(self, data):
		if isinstance(data, str):
			if data.find('Training Metrics:') >= 0:
				data = json.loads(data.split("Training Metrics:")[-1])
				data['mode'] = "training"
			elif data.find('Validation Metrics:') >= 0:
				data = json.loads(data.split("Validation Metrics:")[-1])
				data['mode'] = "validation"
			else:
				return
|
|
|
|
|
self.info = data |
|
|
if 'epoch' in self.info: |
|
|
self.epoch = int(self.info['epoch']) |
|
|
if 'it' in self.info: |
|
|
self.it = int(self.info['it']) |
|
|
if 'step' in self.info: |
|
|
self.step = int(self.info['step']) |
|
|
if 'steps' in self.info: |
|
|
self.steps = int(self.info['steps']) |
|
|
|
|
|
if 'elapsed_time' in self.info: |
|
|
self.info['iteration_rate'] = self.info['elapsed_time'] |
|
|
del self.info['elapsed_time'] |
|
|
|
|
|
if 'iteration_rate' in self.info: |
|
|
it_rate = self.info['iteration_rate'] |
|
|
self.it_rate = f'{"{:.3f}".format(1/it_rate)}it/s' if 0 < it_rate and it_rate < 1 else f'{"{:.3f}".format(it_rate)}s/it' |
|
|
self.it_rates += it_rate |
|
|
|
|
|
if self.it_rates > 0 and self.it * self.steps > 0: |
|
|
epoch_rate = self.it_rates / self.it * self.steps |
|
|
self.epoch_rate = f'{"{:.3f}".format(1/epoch_rate)}epoch/s' if 0 < epoch_rate and epoch_rate < 1 else f'{"{:.3f}".format(epoch_rate)}s/epoch' |
|
|
|
|
|
try: |
|
|
self.eta = (self.its - self.it) * (self.it_rates / self.it) |
|
|
eta = str(timedelta(seconds=int(self.eta))) |
|
|
self.eta_hhmmss = eta |
|
|
except Exception as e: |
|
|
self.eta_hhmmss = "?" |
|
|
pass |
|
|
|
|
|
self.metrics['step'] = [f"{self.epoch}/{self.epochs}"] |
|
|
if self.epochs != self.its: |
|
|
self.metrics['step'].append(f"{self.it}/{self.its}") |
|
|
if self.steps > 1: |
|
|
self.metrics['step'].append(f"{self.step}/{self.steps}") |
|
|
self.metrics['step'] = ", ".join(self.metrics['step']) |
|
|
|
|
|
if args.tts_backend == "tortoise": |
|
|
epoch = self.epoch + (self.step / self.steps) |
|
|
else: |
|
|
epoch = self.info['epoch'] if 'epoch' in self.info else self.it |
|
|
|
|
|
if self.it > 0: |
|
|
|
|
|
keys = { |
|
|
'lrs': ['lr'], |
|
|
'losses': ['loss_text_ce', 'loss_mel_ce'], |
|
|
'accuracies': [], |
|
|
'precisions': [], |
|
|
'grad_norms': [], |
|
|
} |
|
|
if args.tts_backend == "vall-e": |
|
|
keys['lrs'] = [ |
|
|
'ar.lr', 'nar.lr', |
|
|
'ar-half.lr', 'nar-half.lr', |
|
|
'ar-quarter.lr', 'nar-quarter.lr', |
|
|
] |
|
|
keys['losses'] = [ |
|
|
'ar.loss', 'nar.loss', 'ar+nar.loss', |
|
|
'ar-half.loss', 'nar-half.loss', 'ar-half+nar-half.loss', |
|
|
'ar-quarter.loss', 'nar-quarter.loss', 'ar-quarter+nar-quarter.loss', |
|
|
|
|
|
'ar.loss.nll', 'nar.loss.nll', |
|
|
'ar-half.loss.nll', 'nar-half.loss.nll', |
|
|
'ar-quarter.loss.nll', 'nar-quarter.loss.nll', |
|
|
] |
|
|
|
|
|
keys['accuracies'] = [ |
|
|
'ar.loss.acc', 'nar.loss.acc', |
|
|
'ar-half.loss.acc', 'nar-half.loss.acc', |
|
|
'ar-quarter.loss.acc', 'nar-quarter.loss.acc', |
|
|
] |
|
|
keys['precisions'] = [ |
|
|
'ar.loss.precision', 'nar.loss.precision', |
|
|
'ar-half.loss.precision', 'nar-half.loss.precision', |
|
|
'ar-quarter.loss.precision', 'nar-quarter.loss.precision', |
|
|
] |
|
|
keys['grad_norms'] = ['ar.grad_norm', 'nar.grad_norm', 'ar-half.grad_norm', 'nar-half.grad_norm', 'ar-quarter.grad_norm', 'nar-quarter.grad_norm'] |
|
|
|
|
|
for k in keys['lrs']: |
|
|
if k not in self.info: |
|
|
continue |
|
|
|
|
|
self.statistics['lr'].append({'epoch': epoch, 'it': self.it, 'value': self.info[k], 'type': k}) |
|
|
|
|
|
for k in keys['accuracies']: |
|
|
if k not in self.info: |
|
|
continue |
|
|
|
|
|
self.statistics['loss'].append({'epoch': epoch, 'it': self.it, 'value': self.info[k], 'type': k}) |
|
|
|
|
|
for k in keys['precisions']: |
|
|
if k not in self.info: |
|
|
continue |
|
|
|
|
|
self.statistics['loss'].append({'epoch': epoch, 'it': self.it, 'value': self.info[k], 'type': k}) |
|
|
|
|
|
for k in keys['losses']: |
|
|
if k not in self.info: |
|
|
continue |
|
|
|
|
|
prefix = "" |
|
|
|
|
|
if "mode" in self.info and self.info["mode"] == "validation": |
|
|
prefix = f'{self.info["name"] if "name" in self.info else "val"}_' |
|
|
|
|
|
self.statistics['loss'].append({'epoch': epoch, 'it': self.it, 'value': self.info[k], 'type': f'{prefix}{k}' }) |
|
|
|
|
|
self.losses.append( self.statistics['loss'][-1] ) |
|
|
|
|
|
for k in keys['grad_norms']: |
|
|
if k not in self.info: |
|
|
continue |
|
|
self.statistics['grad_norm'].append({'epoch': epoch, 'it': self.it, 'value': self.info[k], 'type': k}) |
|
|
|
|
|
return data |
|
|
|
|
|
def get_status(self): |
|
|
message = None |
|
|
|
|
|
self.metrics['rate'] = [] |
|
|
if self.epoch_rate: |
|
|
self.metrics['rate'].append(self.epoch_rate) |
|
|
if self.it_rate and self.epoch_rate[:-7] != self.it_rate[:-4]: |
|
|
self.metrics['rate'].append(self.it_rate) |
|
|
self.metrics['rate'] = ", ".join(self.metrics['rate']) |
|
|
|
|
|
eta_hhmmss = self.eta_hhmmss if self.eta_hhmmss else "?" |
|
|
|
|
|
self.metrics['loss'] = [] |
|
|
if 'lr' in self.info: |
|
|
self.metrics['loss'].append(f'LR: {"{:.3e}".format(self.info["lr"])}') |
|
|
|
|
|
if len(self.losses) > 0: |
|
|
self.metrics['loss'].append(f'Loss: {"{:.3f}".format(self.losses[-1]["value"])}') |
|
|
|
|
|
if False and len(self.losses) >= 2: |
|
|
deriv = 0 |
|
|
accum_length = len(self.losses)//2 |
|
|
loss_value = self.losses[-1]["value"] |
|
|
|
|
|
for i in range(accum_length): |
|
|
d1_loss = self.losses[accum_length-i-1]["value"] |
|
|
d2_loss = self.losses[accum_length-i-2]["value"] |
|
|
dloss = (d2_loss - d1_loss) |
|
|
|
|
|
d1_step = self.losses[accum_length-i-1]["it"] |
|
|
d2_step = self.losses[accum_length-i-2]["it"] |
|
|
dstep = (d2_step - d1_step) |
|
|
|
|
|
if dstep == 0: |
|
|
continue |
|
|
|
|
|
inst_deriv = dloss / dstep |
|
|
deriv += inst_deriv |
|
|
|
|
|
deriv = deriv / accum_length |
|
|
|
|
|
print("Deriv: ", deriv) |
|
|
|
|
|
if deriv != 0: |
|
|
next_milestone = None |
|
|
for milestone in self.loss_milestones: |
|
|
if loss_value > milestone: |
|
|
next_milestone = milestone |
|
|
break |
|
|
|
|
|
print(f"Loss value: {loss_value} | Next milestone: {next_milestone} | Distance: {loss_value - next_milestone}") |
|
|
|
|
|
if next_milestone: |
|
|
|
|
|
est_its = (next_milestone - loss_value) / deriv * 100 |
|
|
print(f"Estimated: {est_its}") |
|
|
if est_its >= 0: |
|
|
self.metrics['loss'].append(f'Est. milestone {next_milestone} in: {int(est_its)}its') |
|
|
else: |
|
|
est_loss = inst_deriv * (self.its - self.it) + loss_value |
|
|
if est_loss >= 0: |
|
|
self.metrics['loss'].append(f'Est. final loss: {"{:.3f}".format(est_loss)}') |
|
|
|
|
|
self.metrics['loss'] = ", ".join(self.metrics['loss']) |
|
|
|
|
|
message = f"[{self.metrics['step']}] [{self.metrics['rate']}] [ETA: {eta_hhmmss}] [{self.metrics['loss']}]" |
|
|
if self.nan_detected: |
|
|
message = f"[!NaN DETECTED! {self.nan_detected}] {message}" |
|
|
|
|
|
return message |
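
	# Rebuilds (or, with update=True, appends to) the statistics series by
	# re-reading the training logs; vall-e metrics are averaged per iteration.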
|
|
|
|
|
def load_statistics(self, update=False): |
|
|
if not os.path.isdir(self.training_dir): |
|
|
return |
|
|
|
|
|
if args.tts_backend == "tortoise": |
|
|
logs = sorted([f'{self.training_dir}/finetune/{d}' for d in os.listdir(f'{self.training_dir}/finetune/') if d[-4:] == ".log" ]) |
|
|
else: |
|
|
logs = sorted([f'{self.training_dir}/logs/{d}/log.txt' for d in os.listdir(f'{self.training_dir}/logs/') ]) |
|
|
|
|
|
if update: |
|
|
logs = [logs[-1]] |
|
|
|
|
|
infos = {} |
|
|
highest_step = self.last_info_check_at |
|
|
|
|
|
if not update: |
|
|
self.statistics['loss'] = [] |
|
|
self.statistics['lr'] = [] |
|
|
self.statistics['grad_norm'] = [] |
|
|
self.it_rates = 0 |
|
|
|
|
|
unq = {} |
|
|
averager = None |
|
|
prev_state = 0 |
|
|
|
|
|
for log in logs: |
|
|
with open(log, 'r', encoding="utf-8") as f: |
|
|
lines = f.readlines() |
|
|
|
|
|
for line in lines: |
|
|
line = line.strip() |
|
|
if not line: |
|
|
continue |
|
|
|
|
|
if line[-1] == ".": |
|
|
line = line[:-1] |
|
|
|
|
|
if line.find('Training Metrics:') >= 0: |
|
|
split = line.split("Training Metrics:")[-1] |
|
|
data = json.loads(split) |
|
|
|
|
|
name = "train" |
|
|
mode = "training" |
|
|
prev_state = 0 |
|
|
elif line.find('Validation Metrics:') >= 0: |
|
|
data = json.loads(line.split("Validation Metrics:")[-1]) |
|
|
if "it" not in data: |
|
|
data['it'] = it |
|
|
if "epoch" not in data: |
|
|
data['epoch'] = epoch |
|
|
|
|
|
|
|
|
mode = "validation" |
|
|
|
|
|
if prev_state == 0: |
|
|
name = "subtrain" |
|
|
else: |
|
|
name = "val" |
|
|
|
|
|
prev_state += 1 |
|
|
else: |
|
|
continue |
|
|
|
|
|
if "it" not in data: |
|
|
continue |
|
|
|
|
|
				it = data['it']
				epoch = data['epoch']
				highest_step = max(highest_step, it)
|
|
|
|
|
if args.tts_backend == "vall-e": |
|
|
if not averager or averager['key'] != f'{it}_{name}' or averager['mode'] != mode: |
|
|
averager = { |
|
|
'key': f'{it}_{name}', |
|
|
'name': name, |
|
|
'mode': mode, |
|
|
"metrics": {} |
|
|
} |
|
|
for k in data: |
|
|
if data[k] is None: |
|
|
continue |
|
|
averager['metrics'][k] = [ data[k] ] |
|
|
else: |
|
|
for k in data: |
|
|
if data[k] is None: |
|
|
continue |
|
|
if k not in averager['metrics']: |
|
|
averager['metrics'][k] = [ data[k] ] |
|
|
else: |
|
|
averager['metrics'][k].append( data[k] ) |
|
|
|
|
|
unq[f'{it}_{mode}_{name}'] = averager |
|
|
else: |
|
|
unq[f'{it}_{mode}_{name}'] = data |
|
|
|
|
|
if update and it <= self.last_info_check_at: |
|
|
continue |
|
|
|
|
|
blacklist = [ "batch", "eval" ] |
|
|
for it in unq: |
|
|
if args.tts_backend == "vall-e": |
|
|
stats = unq[it] |
|
|
data = {k: sum(v) / len(v) for k, v in stats['metrics'].items() if k not in blacklist } |
|
|
|
|
|
|
|
|
data['name'] = stats['name'] |
|
|
data['mode'] = stats['mode'] |
|
|
data['steps'] = len(stats['metrics']['it']) |
|
|
else: |
|
|
data = unq[it] |
|
|
self.parse_metrics(data) |
|
|
|
|
|
self.last_info_check_at = highest_step |
|
|
|
|
|
def cleanup_old(self, keep=2): |
|
|
if keep <= 0: |
|
|
return |
|
|
|
|
|
if args.tts_backend == "vall-e": |
|
|
return |
|
|
|
|
|
if not os.path.isdir(f'{self.training_dir}/finetune/'): |
|
|
return |
|
|
|
|
|
models = sorted([ int(d[:-8]) for d in os.listdir(f'{self.training_dir}/finetune/models/') if d[-8:] == "_gpt.pth" ]) |
|
|
states = sorted([ int(d[:-6]) for d in os.listdir(f'{self.training_dir}/finetune/training_state/') if d[-6:] == ".state" ]) |
|
|
remove_models = models[:-keep] |
|
|
remove_states = states[:-keep] |
|
|
|
|
|
for d in remove_models: |
|
|
path = f'{self.training_dir}/finetune/models/{d}_gpt.pth' |
|
|
print("Removing", path) |
|
|
os.remove(path) |
|
|
for d in remove_states: |
|
|
path = f'{self.training_dir}/finetune/training_state/{d}.state' |
|
|
print("Removing", path) |
|
|
os.remove(path) |
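
	# Consumes one line of trainer stdout: detects start/finish/checkpoint
	# messages, routes metric lines through parse_metrics(), and returns a
	# (output, percent, message) tuple for the UI.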
|
|
|
|
|
def parse(self, line, verbose=False, keep_x_past_checkpoints=0, buffer_size=8, progress=None ): |
|
|
self.buffer.append(f'{line}') |
|
|
|
|
|
data = None |
|
|
percent = 0 |
|
|
message = None |
|
|
should_return = False |
|
|
|
|
|
MESSAGE_START = 'Start training from epoch' |
|
|
MESSAGE_FINISHED = 'Finished training' |
|
|
MESSAGE_SAVING = 'Saving models and training states.' |
|
|
|
|
|
MESSAGE_METRICS_TRAINING = 'Training Metrics:' |
|
|
MESSAGE_METRICS_VALIDATION = 'Validation Metrics:' |
|
|
|
|
|
if line.find(MESSAGE_FINISHED) >= 0: |
|
|
self.killed = True |
|
|
|
|
|
elif not self.training_started: |
|
|
if line.find(MESSAGE_START) >= 0: |
|
|
self.training_started = True |
|
|
|
|
|
match = re.findall(r'epoch: ([\d,]+)', line) |
|
|
if match and len(match) > 0: |
|
|
self.epoch = int(match[0].replace(",", "")) |
|
|
match = re.findall(r'iter: ([\d,]+)', line) |
|
|
if match and len(match) > 0: |
|
|
self.it = int(match[0].replace(",", "")) |
|
|
|
|
|
self.checkpoints = int((self.its - self.it) / self.save_rate) |
|
|
|
|
|
self.load_statistics() |
|
|
|
|
|
should_return = True |
|
|
else: |
|
|
if line.find(MESSAGE_SAVING) >= 0: |
|
|
self.checkpoint += 1 |
|
|
message = f"[{self.checkpoint}/{self.checkpoints}] Saving checkpoint..." |
|
|
percent = self.checkpoint / self.checkpoints |
|
|
|
|
|
self.cleanup_old(keep=keep_x_past_checkpoints) |
|
|
elif line.find(MESSAGE_METRICS_TRAINING) >= 0: |
|
|
data = json.loads(line.split(MESSAGE_METRICS_TRAINING)[-1]) |
|
|
data['mode'] = "training" |
|
|
elif line.find(MESSAGE_METRICS_VALIDATION) >= 0: |
|
|
data = json.loads(line.split(MESSAGE_METRICS_VALIDATION)[-1]) |
|
|
data['mode'] = "validation" |
|
|
|
|
|
if data is not None: |
|
|
if ': nan' in line and not self.nan_detected: |
|
|
self.nan_detected = self.it |
|
|
|
|
|
self.parse_metrics( data ) |
|
|
message = self.get_status() |
|
|
|
|
|
if message: |
|
|
percent = self.it / float(self.its) |
|
|
if progress is not None: |
|
|
progress(percent, message) |
|
|
|
|
|
self.buffer.append(f'[{percent*100:.3f}%] {message}') |
|
|
should_return = True |
|
|
|
|
|
if verbose and not self.training_started: |
|
|
should_return = True |
|
|
|
|
|
self.buffer = self.buffer[-buffer_size:] |
|
|
|
|
|
result = None |
|
|
if should_return: |
|
|
result = "".join(self.buffer) if not self.training_started else message |
|
|
|
|
|
return ( |
|
|
result, |
|
|
percent, |
|
|
message, |
|
|
) |
|
|
|
|
|
try: |
|
|
import altair as alt |
|
|
alt.data_transformers.enable('default', max_rows=None) |
|
|
except Exception as e: |
|
|
print(e) |
|
|
pass |
|
|
|
|
|
def run_training(config_path, verbose=False, keep_x_past_checkpoints=0, progress=gr.Progress(track_tqdm=True)): |
|
|
global training_state |
|
|
if training_state and training_state.process: |
|
|
return "Training already in progress" |
|
|
|
|
|
|
|
|
|
|
|
if args.tts_backend == "tortoise": |
|
|
get_model_path('dvae.pth') |
|
|
|
|
|
|
|
|
torch.multiprocessing.freeze_support() |
|
|
|
|
|
unload_tts() |
|
|
unload_whisper() |
|
|
unload_voicefixer() |
|
|
|
|
|
training_state = TrainingState(config_path=config_path, keep_x_past_checkpoints=keep_x_past_checkpoints) |
|
|
|
|
|
for line in iter(training_state.process.stdout.readline, ""): |
|
|
if training_state.killed: |
|
|
return |
|
|
|
|
|
result, percent, message = training_state.parse( line=line, verbose=verbose, keep_x_past_checkpoints=keep_x_past_checkpoints, progress=progress ) |
|
|
print(f"[Training] [{datetime.now().isoformat()}] {line[:-1]}") |
|
|
if result: |
|
|
yield result |
|
|
|
|
|
if progress is not None and message: |
|
|
progress(percent, message) |
|
|
|
|
|
if training_state: |
|
|
training_state.process.stdout.close() |
|
|
return_code = training_state.process.wait() |
|
|
training_state = None |
|
|
|
|
|
def update_training_dataplot(x_min=None, x_max=None, y_min=None, y_max=None, config_path=None): |
|
|
global training_state |
|
|
losses = None |
|
|
lrs = None |
|
|
grad_norms = None |
|
|
|
|
|
x_lim = [ x_min, x_max ] |
|
|
y_lim = [ y_min, y_max ] |
|
|
|
|
|
if not training_state: |
|
|
if config_path: |
|
|
training_state = TrainingState(config_path=config_path, start=False) |
|
|
training_state.load_statistics() |
|
|
message = training_state.get_status() |
|
|
|
|
|
if training_state: |
|
|
if not x_lim[-1]: |
|
|
x_lim[-1] = training_state.epochs |
|
|
|
|
|
if not y_lim[-1]: |
|
|
y_lim = None |
|
|
|
|
|
if len(training_state.statistics['loss']) > 0: |
|
|
losses = gr.LinePlot.update( |
|
|
value = pd.DataFrame(training_state.statistics['loss']), |
|
|
x_lim=x_lim, y_lim=y_lim, |
|
|
x="it", y="value", |
|
|
title="Loss Metrics", color="type", tooltip=['epoch', 'it', 'value', 'type'], |
|
|
width=500, height=350 |
|
|
) |
|
|
if len(training_state.statistics['lr']) > 0: |
|
|
lrs = gr.LinePlot.update( |
|
|
value = pd.DataFrame(training_state.statistics['lr']), |
|
|
x_lim=x_lim, |
|
|
x="it", y="value", |
|
|
title="Learning Rate", color="type", tooltip=['epoch', 'it', 'value', 'type'], |
|
|
width=500, height=350 |
|
|
) |
|
|
if len(training_state.statistics['grad_norm']) > 0: |
|
|
grad_norms = gr.LinePlot.update( |
|
|
value = pd.DataFrame(training_state.statistics['grad_norm']), |
|
|
x_lim=x_lim, |
|
|
x="it", y="value", |
|
|
title="Gradient Normals", color="type", tooltip=['epoch', 'it', 'value', 'type'], |
|
|
width=500, height=350 |
|
|
) |
|
|
|
|
|
if config_path: |
|
|
del training_state |
|
|
training_state = None |
|
|
|
|
|
return (losses, lrs, grad_norms) |
|
|
|
|
|
def reconnect_training(verbose=False, progress=gr.Progress(track_tqdm=True)): |
|
|
global training_state |
|
|
if not training_state or not training_state.process: |
|
|
return "Training not in progress" |
|
|
|
|
|
for line in iter(training_state.process.stdout.readline, ""): |
|
|
result, percent, message = training_state.parse( line=line, verbose=verbose, progress=progress ) |
|
|
print(f"[Training] [{datetime.now().isoformat()}] {line[:-1]}") |
|
|
if result: |
|
|
yield result |
|
|
|
|
|
if progress is not None and message: |
|
|
progress(percent, message) |
|
|
|
|
|
def stop_training(): |
|
|
global training_state |
|
|
if training_state is None: |
|
|
return "No training in progress" |
|
|
print("Killing training process...") |
|
|
training_state.killed = True |
|
|
|
|
|
children = [] |
|
|
if args.tts_backend == "tortoise": |
|
|
|
|
|
try: |
|
|
children = [p.info for p in psutil.process_iter(attrs=['pid', 'name', 'cmdline']) if './src/train.py' in p.info['cmdline']] |
|
|
except Exception as e: |
|
|
pass |
|
|
|
|
|
training_state.process.stdout.close() |
|
|
training_state.process.terminate() |
|
|
training_state.process.kill() |
|
|
elif args.tts_backend == "vall-e": |
|
|
print(training_state.process.communicate(input='quit')[0]) |
|
|
|
|
|
return_code = training_state.process.wait() |
|
|
|
|
|
for p in children: |
|
|
os.kill( p['pid'], signal.SIGKILL ) |
|
|
|
|
|
training_state = None |
|
|
print("Killed training process.") |
|
|
return f"Training cancelled: {return_code}" |
|
|
|
|
|
def get_halfp_model_path(): |
|
|
autoregressive_model_path = get_model_path('autoregressive.pth') |
|
|
return autoregressive_model_path.replace(".pth", "_half.pth") |
|
|
|
|
|
def convert_to_halfp(): |
|
|
autoregressive_model_path = get_model_path('autoregressive.pth') |
|
|
print(f'Converting model to half precision: {autoregressive_model_path}') |
|
|
model = torch.load(autoregressive_model_path) |
|
|
for k in model: |
|
|
model[k] = model[k].half() |
|
|
|
|
|
outfile = get_halfp_model_path() |
|
|
torch.save(model, outfile) |
|
|
print(f'Converted model to half precision: {outfile}') |
|
|
|
|
|
|
|
|
|
|
|
def whisper_sanitize( results ): |
|
|
sanitized = json.loads(json.dumps(results)) |
|
|
sanitized['segments'] = [] |
|
|
|
|
|
for segment in results['segments']: |
|
|
length = segment['end'] - segment['start'] |
|
|
if length >= MIN_TRAINING_DURATION or len(sanitized['segments']) == 0: |
|
|
sanitized['segments'].append(segment) |
|
|
continue |
|
|
|
|
|
last_segment = sanitized['segments'][-1] |
|
|
|
|
|
if last_segment['end'] >= segment['end']: |
|
|
continue |
|
|
""" |
|
|
# segment was already assimilated, somehow |
|
|
if last_segment['text'].endswith(segment['text']): |
|
|
continue |
|
|
""" |
|
|
last_segment['text'] += segment['text'] |
|
|
last_segment['end'] = segment['end'] |
|
|
|
|
|
for i in range(len(sanitized['segments'])): |
|
|
sanitized['segments'][i]['id'] = i |
|
|
|
|
|
return sanitized |
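# Illustrative example of the merge above (values are hypothetical): a 0.3s |
# trailing segment (below MIN_TRAINING_DURATION) is folded into its predecessor: |
#   [{start: 0.0, end: 2.0, text: "Hello"}, {start: 2.0, end: 2.3, text: " there"}] |
#   => [{start: 0.0, end: 2.3, text: "Hello there", id: 0}] |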
|
|
|
|
|
def whisper_transcribe( file, language=None ): |
|
|
|
|
|
global whisper_model |
|
|
global whisper_vad |
|
|
global whisper_diarize |
|
|
global whisper_align_model |
|
|
|
|
|
if not whisper_model: |
|
|
load_whisper_model(language=language) |
|
|
|
|
|
if args.whisper_backend == "openai/whisper": |
|
|
if not language: |
|
|
language = None |
|
|
|
|
|
return whisper_model.transcribe(file, language=language) |
|
|
|
|
|
if args.whisper_backend == "lightmare/whispercpp": |
|
|
res = whisper_model.transcribe(file) |
|
|
segments = whisper_model.extract_text_and_timestamps( res ) |
|
|
|
|
|
result = { |
|
|
'text': [], |
|
|
'segments': [] |
|
|
} |
|
|
for segment in segments: |
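# Assumption: whispercpp's extract_text_and_timestamps yields (start, end, text) |
# tuples with timestamps in centiseconds, hence the division by 100.0 below. |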
|
|
reparsed = { |
|
|
'start': segment[0] / 100.0, |
|
|
'end': segment[1] / 100.0, |
|
|
'text': segment[2], |
|
|
'id': len(result['segments']) |
|
|
} |
|
|
result['text'].append( segment[2] ) |
|
|
result['segments'].append(reparsed) |
|
|
|
|
|
result['text'] = " ".join(result['text']) |
|
|
return result |
|
|
|
|
|
if args.whisper_backend == "m-bain/whisperx": |
|
|
import whisperx |
|
|
from whisperx.diarize import assign_word_speakers |
|
|
|
|
|
device = "cuda" if get_device_name() == "cuda" else "cpu" |
|
|
if whisper_vad: |
|
|
|
|
|
if args.whisper_batchsize > 1: |
|
|
result = whisperx.transcribe_with_vad_parallel(whisper_model, file, whisper_vad, batch_size=args.whisper_batchsize, language=language, task="transcribe") |
|
|
else: |
|
|
result = whisperx.transcribe_with_vad(whisper_model, file, whisper_vad) |
|
|
""" |
|
|
result = whisperx.transcribe_with_vad(whisper_model, file, whisper_vad) |
|
|
""" |
|
|
else: |
|
|
result = whisper_model.transcribe(file) |
|
|
|
|
|
align_model, metadata = whisper_align_model |
|
|
result_aligned = whisperx.align(result["segments"], align_model, metadata, file, device) |
|
|
|
|
|
if whisper_diarize: |
|
|
diarize_segments = whisper_diarize(file) |
|
|
diarize_df = pd.DataFrame(diarize_segments.itertracks(yield_label=True)) |
|
|
diarize_df['start'] = diarize_df[0].apply(lambda x: x.start) |
|
|
diarize_df['end'] = diarize_df[0].apply(lambda x: x.end) |
|
|
|
|
|
result_segments, word_segments = assign_word_speakers(diarize_df, result_aligned["segments"], fill_nearest=True) |
|
|
result_aligned["segments"] = result_segments |
|
|
result_aligned["word_segments"] = word_segments |
|
|
|
|
|
for i in range(len(result_aligned['segments'])): |
|
|
del result_aligned['segments'][i]['word-segments'] |
|
|
del result_aligned['segments'][i]['char-segments'] |
|
|
|
|
|
result['segments'] = result_aligned['segments'] |
|
|
result['text'] = [] |
|
|
for segment in result['segments']: |
|
|
segment['id'] = len(result['text']) |
|
|
result['text'].append(segment['text'].strip()) |
|
|
result['text'] = " ".join(result['text']) |
|
|
|
|
|
return result |
|
|
|
|
|
def validate_waveform( waveform, sample_rate, min_only=False ): |
|
|
# an all-zero waveform carries no signal; treat it as empty |
if not torch.any(waveform != 0): |
|
|
return "Waveform is empty" |
|
|
|
|
|
num_channels, num_frames = waveform.shape |
|
|
duration = num_frames / sample_rate |
|
|
|
|
|
if duration < MIN_TRAINING_DURATION: |
|
|
return "Duration too short ({:.3f}s < {:.3f}s)".format(duration, MIN_TRAINING_DURATION) |
|
|
|
|
|
if not min_only: |
|
|
if duration > MAX_TRAINING_DURATION: |
|
|
return "Duration too long ({:.3f}s < {:.3f}s)".format(MAX_TRAINING_DURATION, duration) |
|
|
|
|
|
return |
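# Minimal usage sketch (assumes ./voices/sample.wav exists locally): |
#   waveform, sample_rate = torchaudio.load("./voices/sample.wav") |
#   error = validate_waveform(waveform, sample_rate) |
#   if error: |
#       print(error)  # e.g. "Duration too short (0.412s < 0.600s)" |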
|
|
|
|
|
def transcribe_dataset( voice, language=None, skip_existings=False, progress=None ): |
|
|
unload_tts() |
|
|
|
|
|
global whisper_model |
|
|
if whisper_model is None: |
|
|
load_whisper_model(language=language) |
|
|
|
|
|
results = {} |
|
|
|
|
|
files = get_voice(voice, load_latents=False) |
|
|
indir = f'./training/{voice}/' |
|
|
infile = f'{indir}/whisper.json' |
|
|
|
|
|
os.makedirs(f'{indir}/audio/', exist_ok=True) |
|
|
|
|
|
TARGET_SAMPLE_RATE = 22050 |
|
|
if args.tts_backend != "tortoise": |
|
|
TARGET_SAMPLE_RATE = 24000 |
|
|
if tts: |
|
|
TARGET_SAMPLE_RATE = tts.input_sample_rate |
|
|
|
|
|
if os.path.exists(infile): |
|
|
results = json.load(open(infile, 'r', encoding="utf-8")) |
|
|
|
|
|
for file in tqdm(files, desc="Iterating through voice files"): |
|
|
basename = os.path.basename(file) |
|
|
|
|
|
if basename in results and skip_existings: |
|
|
print(f"Skipping already parsed file: {basename}") |
|
|
continue |
|
|
|
|
|
try: |
|
|
result = whisper_transcribe(file, language=language) |
|
|
except Exception as e: |
|
|
print("Failed to transcribe:", file, e) |
|
|
continue |
|
|
|
|
|
results[basename] = result |
|
|
waveform, sample_rate = torchaudio.load(file) |
|
|
|
|
|
|
|
|
waveform, sample_rate = resample(waveform, sample_rate, TARGET_SAMPLE_RATE) |
|
|
if waveform.shape[0] == 2: |
|
|
waveform = waveform[:1] |
|
|
torchaudio.save(f"{indir}/audio/{basename}", waveform, sample_rate, encoding="PCM_S", bits_per_sample=16) |
|
|
|
|
|
with open(infile, 'w', encoding="utf-8") as f: |
|
|
f.write(json.dumps(results, indent='\t')) |
|
|
|
|
|
do_gc() |
|
|
|
|
|
modified = False |
|
|
for basename in results: |
|
|
try: |
|
|
sanitized = whisper_sanitize(results[basename]) |
|
|
if len(sanitized['segments']) > 0 and len(sanitized['segments']) != len(results[basename]['segments']): |
|
|
results[basename] = sanitized |
|
|
modified = True |
|
|
print("Segments sanizited: ", basename) |
|
|
except Exception as e: |
|
|
print("Failed to sanitize:", basename, e) |
|
|
pass |
|
|
|
|
|
if modified: |
|
|
os.rename(infile, infile.replace(".json", ".unsanitized.json")) |
|
|
with open(infile, 'w', encoding="utf-8") as f: |
|
|
f.write(json.dumps(results, indent='\t')) |
|
|
|
|
|
return f"Processed dataset to: {indir}" |
|
|
|
|
|
def slice_waveform( waveform, sample_rate, start, end, trim ): |
|
|
start = int(start * sample_rate) |
|
|
end = int(end * sample_rate) |
|
|
|
|
|
if start < 0: |
|
|
start = 0 |
|
|
if end >= waveform.shape[-1]: |
|
|
end = waveform.shape[-1] - 1 |
|
|
|
|
|
sliced = waveform[:, start:end] |
|
|
|
|
|
error = validate_waveform( sliced, sample_rate, min_only=True ) |
|
|
if trim and not error: |
|
|
sliced = torchaudio.functional.vad( sliced, sample_rate ) |
|
|
|
|
|
return sliced, error |
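# Usage sketch: carve out the 1.5s..4.0s window and trim silence |
# (illustrative values, not from a real dataset): |
#   sliced, error = slice_waveform(waveform, sample_rate, 1.5, 4.0, True) |
#   if not error: |
#       torchaudio.save("./training/voice/audio/example_0000.wav", sliced, sample_rate) |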
|
|
|
|
|
def slice_dataset( voice, trim_silence=True, start_offset=0, end_offset=0, results=None, progress=gr.Progress() ): |
|
|
indir = f'./training/{voice}/' |
|
|
infile = f'{indir}/whisper.json' |
|
|
messages = [] |
|
|
|
|
|
if not os.path.exists(infile): |
|
|
message = f"Missing dataset: {infile}" |
|
|
print(message) |
|
|
return message |
|
|
|
|
|
if results is None: |
|
|
results = json.load(open(infile, 'r', encoding="utf-8")) |
|
|
|
|
|
TARGET_SAMPLE_RATE = 22050 |
|
|
if args.tts_backend != "tortoise": |
|
|
TARGET_SAMPLE_RATE = 24000 |
|
|
if tts: |
|
|
TARGET_SAMPLE_RATE = tts.input_sample_rate |
|
|
|
|
|
files = 0 |
|
|
segments = 0 |
|
|
for filename in results: |
|
|
path = f'./voices/{voice}/{filename}' |
|
|
if not os.path.exists(path): |
|
|
path = f'./training/{voice}/{filename}' |
|
|
|
|
|
if not os.path.exists(path): |
|
|
message = f"Missing source audio: {filename}" |
|
|
print(message) |
|
|
messages.append(message) |
|
|
continue |
|
|
|
|
|
files += 1 |
|
|
result = results[filename] |
|
|
waveform, sample_rate = torchaudio.load(path) |
|
|
num_channels, num_frames = waveform.shape |
|
|
duration = num_frames / sample_rate |
|
|
|
|
|
for segment in result['segments']: |
|
|
file = filename.replace(".wav", f"_{pad(segment['id'], 4)}.wav") |
|
|
|
|
|
sliced, error = slice_waveform( waveform, sample_rate, segment['start'] + start_offset, segment['end'] + end_offset, trim_silence ) |
|
|
if error: |
|
|
message = f"{error}, skipping... {file}" |
|
|
print(message) |
|
|
messages.append(message) |
|
|
continue |
|
|
sliced, _ = resample( sliced, sample_rate, TARGET_SAMPLE_RATE ) |
|
|
|
|
|
if sliced.shape[0] == 2: |
sliced = sliced[:1] |
|
|
|
|
|
torchaudio.save(f"{indir}/audio/{file}", sliced, TARGET_SAMPLE_RATE, encoding="PCM_S", bits_per_sample=16) |
|
|
|
|
|
segments += 1 |
|
|
|
|
|
messages.append(f"Sliced segments: {files} => {segments}.") |
|
|
return "\n".join(messages) |
|
|
|
|
|
|
|
|
def phonemize_txt_file( path ): |
|
|
with open(path, 'r', encoding='utf-8') as f: |
|
|
lines = f.readlines() |
|
|
|
|
|
reparsed = [] |
|
|
with open(path.replace(".txt", ".phn.txt"), 'a', encoding='utf-8') as f: |
|
|
for line in tqdm(lines, desc='Phonemizing...'): |
|
|
split = line.split("|") |
|
|
audio = split[0] |
|
|
text = split[2] |
|
|
|
|
|
phonemes = phonemizer( text ) |
|
|
reparsed.append(f'{audio}|{phonemes}') |
|
|
f.write(f'\n{audio}|{phonemes}') |
|
|
|
|
|
|
|
|
joined = "\n".join(reparsed) |
|
|
with open(path.replace(".txt", ".phn.txt"), 'w', encoding='utf-8') as f: |
|
|
f.write(joined) |
|
|
|
|
|
return joined |
|
|
|
|
|
|
|
|
def create_dataset_json( path ): |
|
|
with open(path, 'r', encoding='utf-8') as f: |
|
|
lines = f.readlines() |
|
|
|
|
|
phonemes = []  # default to an empty list so the loop below is a no-op when no .phn.txt exists |
|
|
phn_path = path.replace(".txt", ".phn.txt") |
|
|
if os.path.exists(phn_path): |
|
|
with open(phn_path, 'r', encoding='utf-8') as f: |
|
|
phonemes = f.readlines() |
|
|
|
|
|
data = {} |
|
|
|
|
|
for line in lines: |
|
|
split = line.split("|") |
|
|
audio = split[0] |
|
|
text = split[1] |
|
|
|
|
|
data[audio] = { |
|
|
'text': text.strip() |
|
|
} |
|
|
|
|
|
for line in phonemes: |
|
|
split = line.split("|") |
|
|
audio = split[0] |
|
|
text = split[1] |
|
|
|
|
|
data[audio]['phonemes'] = text.strip() |
|
|
|
|
|
with open(path.replace(".txt", ".json"), 'w', encoding='utf-8') as f: |
|
|
f.write(json.dumps(data, indent="\t")) |
|
|
|
|
|
|
|
|
cached_backends = {} |
|
|
|
|
|
def phonemizer( text, language="en-us" ): |
|
|
from phonemizer import phonemize |
|
|
from phonemizer.backend import BACKENDS |
|
|
|
|
|
def _get_backend( language="en-us", backend="espeak" ): |
|
|
key = f'{language}_{backend}' |
|
|
if key in cached_backends: |
|
|
return cached_backends[key] |
|
|
|
|
|
if backend == 'espeak': |
|
|
phonemizer = BACKENDS[backend]( language, preserve_punctuation=True, with_stress=True) |
|
|
elif backend == 'espeak-mbrola': |
|
|
phonemizer = BACKENDS[backend]( language ) |
|
|
else: |
|
|
phonemizer = BACKENDS[backend]( language, preserve_punctuation=True ) |
|
|
|
|
|
cached_backends[key] = phonemizer |
|
|
return phonemizer |
|
|
if language == "en": |
|
|
language = "en-us" |
|
|
|
|
|
backend = _get_backend(language=language, backend=args.phonemizer_backend) |
|
|
if backend is not None: |
|
|
tokens = backend.phonemize( text, strip=True ) |
|
|
else: |
|
|
tokens = phonemize( text, language=language, strip=True, preserve_punctuation=True, with_stress=True ) |
|
|
|
|
|
return tokens[0] if len(tokens) == 1 else tokens |
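# Usage sketch (output is illustrative; actual IPA depends on the espeak |
# backend and phonemizer version, which may expect a list of utterances): |
#   phonemizer("Hello world.")  # => something like "həloʊ wɜːld." |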
|
|
|
|
|
|
|
def should_phonemize(): |
|
|
should = args.tokenizer_json is not None and args.tokenizer_json[-8:] == "ipa.json" |
|
|
if should: |
|
|
try: |
|
|
from phonemizer import phonemize |
|
|
except Exception as e: |
|
|
return False |
|
|
return should |
|
|
|
|
|
def prepare_dataset( voice, use_segments=False, text_length=0, audio_length=0, progress=gr.Progress() ): |
|
|
indir = f'./training/{voice}/' |
|
|
infile = f'{indir}/whisper.json' |
|
|
if not os.path.exists(infile): |
|
|
message = f"Missing dataset: {infile}" |
|
|
print(message) |
|
|
return message |
|
|
|
|
|
results = json.load(open(infile, 'r', encoding="utf-8")) |
|
|
|
|
|
errored = 0 |
|
|
messages = [] |
|
|
normalize = True |
|
|
phonemize = should_phonemize() |
|
|
lines = { 'training': [], 'validation': [] } |
|
|
segments = {} |
|
|
|
|
|
if args.tts_backend != "tortoise": |
|
|
text_length = 0 |
|
|
audio_length = 0 |
|
|
|
|
|
for filename in tqdm(results, desc="Parsing results"): |
|
|
use_segment = use_segments |
|
|
|
|
|
result = results[filename] |
|
|
lang = result['language'] |
|
|
language = LANGUAGES[lang] if lang in LANGUAGES else lang |
|
|
normalizer = EnglishTextNormalizer() if language and language == "english" else BasicTextNormalizer() |
|
|
|
|
|
|
|
|
if not use_segment: |
|
|
if len(result['text']) > 200: |
|
|
message = f"Text length too long (200 < {len(result['text'])}), using segments: {filename}" |
|
|
print(message) |
|
|
messages.append(message) |
|
|
use_segment = True |
|
|
|
|
|
|
|
|
if not use_segment: |
|
|
path = f'{indir}/audio/{filename}' |
|
|
if not os.path.exists(path): |
|
|
messages.append(f"Missing source audio: {filename}") |
|
|
errored += 1 |
|
|
continue |
|
|
|
|
|
metadata = torchaudio.info(path) |
|
|
duration = metadata.num_frames / metadata.sample_rate |
|
|
if duration >= MAX_TRAINING_DURATION: |
|
|
message = f"Audio too large, using segments: {filename}" |
|
|
print(message) |
|
|
messages.append(message) |
|
|
use_segment = True |
|
|
|
|
|
|
|
|
if use_segment and not use_segments: |
|
|
exists = True |
|
|
for segment in result['segments']: |
|
|
duration = segment['end'] - segment['start'] |
|
|
if duration <= MIN_TRAINING_DURATION or MAX_TRAINING_DURATION <= duration: |
|
|
continue |
|
|
|
|
|
path = f'{indir}/audio/' + filename.replace(".wav", f"_{pad(segment['id'], 4)}.wav") |
|
|
if os.path.exists(path): |
|
|
continue |
|
|
exists = False |
|
|
break |
|
|
|
|
|
if not exists: |
|
|
tmp = {} |
|
|
tmp[filename] = result |
|
|
print(f"Audio not segmented, segmenting: {filename}") |
|
|
message = slice_dataset( voice, results=tmp ) |
|
|
print(message) |
|
|
messages = messages + message.split("\n") |
|
|
|
|
|
if not use_segment: |
|
|
segments[filename] = { |
|
|
'text': result['text'], |
|
|
'lang': lang, |
|
|
'language': language, |
|
|
'normalizer': normalizer, |
|
|
'phonemes': result['phonemes'] if 'phonemes' in result else None |
|
|
} |
|
|
else: |
|
|
for segment in result['segments']: |
|
|
duration = segment['end'] - segment['start'] |
|
|
if duration <= MIN_TRAINING_DURATION or MAX_TRAINING_DURATION <= duration: |
|
|
continue |
|
|
|
|
|
segments[filename.replace(".wav", f"_{pad(segment['id'], 4)}.wav")] = { |
|
|
'text': segment['text'], |
|
|
'lang': lang, |
|
|
'language': language, |
|
|
'normalizer': normalizer, |
|
|
'phonemes': segment['phonemes'] if 'phonemes' in segment else None |
|
|
} |
|
|
|
|
|
jobs = { |
|
|
'quantize': [[], []], |
|
|
'phonemize': [[], []], |
|
|
} |
|
|
|
|
|
for file in tqdm(segments, desc="Parsing segments"): |
|
|
result = segments[file] |
|
|
path = f'{indir}/audio/{file}' |
|
|
|
|
|
if not os.path.exists(path): |
|
|
message = f"Missing segment, skipping... {file}" |
|
|
print(message) |
|
|
messages.append(message) |
|
|
errored += 1 |
|
|
continue |
|
|
|
|
|
text = result['text'] |
|
|
lang = result['lang'] |
|
|
language = result['language'] |
|
|
normalizer = result['normalizer'] |
|
|
phonemes = result['phonemes'] |
|
|
if phonemize and phonemes is None: |
|
|
phonemes = phonemizer( text, language=lang ) |
|
|
|
|
|
normalized = normalizer(text) if normalize else text |
|
|
|
|
|
if len(text) > 200: |
|
|
message = f"Text length too long (200 < {len(text)}), skipping... {file}" |
|
|
print(message) |
|
|
messages.append(message) |
|
|
errored += 1 |
|
|
continue |
|
|
|
|
|
waveform, sample_rate = torchaudio.load(path) |
|
|
num_channels, num_frames = waveform.shape |
|
|
duration = num_frames / sample_rate |
|
|
|
|
|
error = validate_waveform( waveform, sample_rate ) |
|
|
if error: |
|
|
message = f"{error}, skipping... {file}" |
|
|
print(message) |
|
|
messages.append(message) |
|
|
errored += 1 |
|
|
continue |
|
|
|
|
|
culled = len(text) < text_length |
|
|
if not culled and audio_length > 0: |
|
|
culled = duration < audio_length |
|
|
|
|
|
line = f'audio/{file}|{phonemes if phonemize and phonemes else text}' |
|
|
|
|
|
lines['training' if not culled else 'validation'].append(line) |
|
|
|
|
|
if culled or args.tts_backend != "vall-e": |
|
|
continue |
|
|
|
|
|
os.makedirs(f'{indir}/valle/', exist_ok=True) |
|
|
|
|
|
qnt_file = f'{indir}/valle/{file.replace(".wav",".qnt.pt")}' |
|
|
if not os.path.exists(qnt_file): |
|
|
jobs['quantize'][0].append(qnt_file) |
|
|
jobs['quantize'][1].append((waveform, sample_rate)) |
|
|
""" |
|
|
quantized = valle_quantize( waveform, sample_rate ).cpu() |
|
|
torch.save(quantized, f'{indir}/valle/{file.replace(".wav",".qnt.pt")}') |
|
|
print("Quantized:", file) |
|
|
""" |
|
|
|
|
|
phn_file = f'{indir}/valle/{file.replace(".wav",".phn.txt")}' |
|
|
if not os.path.exists(phn_file): |
|
|
jobs['phonemize'][0].append(phn_file) |
|
|
jobs['phonemize'][1].append(normalized) |
|
|
""" |
|
|
phonemized = valle_phonemize( normalized ) |
|
|
open(f'{indir}/valle/{file.replace(".wav",".phn.txt")}', 'w', encoding='utf-8').write(" ".join(phonemized)) |
|
|
print("Phonemized:", file, normalized, text) |
|
|
""" |
|
|
|
|
|
for i in tqdm(range(len(jobs['quantize'][0])), desc="Quantizing"): |
|
|
qnt_file = jobs['quantize'][0][i] |
|
|
waveform, sample_rate = jobs['quantize'][1][i] |
|
|
|
|
|
quantized = valle_quantize( waveform, sample_rate ).cpu() |
|
|
torch.save(quantized, qnt_file) |
|
|
print("Quantized:", qnt_file) |
|
|
|
|
|
for i in tqdm(range(len(jobs['phonemize'][0])), desc="Phonemizing"): |
|
|
phn_file = jobs['phonemize'][0][i] |
|
|
normalized = jobs['phonemize'][1][i] |
|
|
|
|
|
try: |
|
|
phonemized = valle_phonemize( normalized ) |
|
|
with open(phn_file, 'w', encoding='utf-8') as f: |
f.write(" ".join(phonemized)) |
|
|
print("Phonemized:", phn_file) |
|
|
except Exception as e: |
|
|
message = f"Failed to phonemize: {phn_file}: {normalized}" |
|
|
messages.append(message) |
|
|
print(message) |
|
|
|
|
|
|
|
|
training_joined = "\n".join(lines['training']) |
|
|
validation_joined = "\n".join(lines['validation']) |
|
|
|
|
|
with open(f'{indir}/train.txt', 'w', encoding="utf-8") as f: |
|
|
f.write(training_joined) |
|
|
|
|
|
with open(f'{indir}/validation.txt', 'w', encoding="utf-8") as f: |
|
|
f.write(validation_joined) |
|
|
|
|
|
messages.append(f"Prepared {len(lines['training'])} lines (validation: {len(lines['validation'])}, culled: {errored}).\n{training_joined}\n\n{validation_joined}") |
|
|
return "\n".join(messages) |
|
|
|
|
|
def calc_iterations( epochs, lines, batch_size ): |
|
|
return int(math.ceil(epochs * math.ceil(lines / batch_size))) |
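# Worked example (hypothetical numbers): 100 dataset lines at batch size 64 |
# gives ceil(100 / 64) = 2 steps per epoch, so 500 epochs iterate for |
# calc_iterations(epochs=500, lines=100, batch_size=64) == 1000 steps. |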
|
|
|
|
|
def schedule_learning_rate( iterations, schedule=LEARNING_RATE_SCHEDULE ): |
|
|
return [int(iterations * d) for d in schedule] |
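# Worked example: the schedule entries are epoch marks scaled by the iteration |
# count passed in (save_training_settings passes iterations per epoch), so at |
# 20 iterations/epoch the default [2, 4, 9, 18, 25, 33, 50] yields |
#   schedule_learning_rate(20) == [40, 80, 180, 360, 500, 660, 1000] |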
|
|
|
|
|
def optimize_training_settings( **kwargs ): |
|
|
messages = [] |
|
|
settings = {} |
|
|
settings.update(kwargs) |
|
|
|
|
|
dataset_path = f"./training/{settings['voice']}/train.txt" |
|
|
with open(dataset_path, 'r', encoding="utf-8") as f: |
|
|
lines = len(f.readlines()) |
|
|
|
|
|
if lines == 0: |
|
|
raise Exception("Empty dataset.") |
|
|
|
|
|
if settings['batch_size'] > lines: |
|
|
settings['batch_size'] = lines |
|
|
messages.append(f"Batch size is larger than your dataset, clamping batch size to: {settings['batch_size']}") |
|
|
|
|
|
""" |
|
|
if lines % settings['batch_size'] != 0: |
|
|
settings['batch_size'] = int(lines / settings['batch_size']) |
|
|
if settings['batch_size'] == 0: |
|
|
settings['batch_size'] = 1 |
|
|
messages.append(f"Batch size not neatly divisible by dataset size, adjusting batch size to: {settings['batch_size']}") |
|
|
""" |
|
|
if settings['gradient_accumulation_size'] == 0: |
|
|
settings['gradient_accumulation_size'] = 1 |
|
|
|
|
|
if settings['batch_size'] / settings['gradient_accumulation_size'] < 2: |
|
|
settings['gradient_accumulation_size'] = int(settings['batch_size'] / 2) |
|
|
if settings['gradient_accumulation_size'] == 0: |
|
|
settings['gradient_accumulation_size'] = 1 |
|
|
|
|
|
messages.append(f"Gradient accumulation size is too large for a given batch size, clamping gradient accumulation size to: {settings['gradient_accumulation_size']}") |
|
|
elif settings['batch_size'] % settings['gradient_accumulation_size'] != 0: |
|
|
settings['gradient_accumulation_size'] -= settings['batch_size'] % settings['gradient_accumulation_size'] |
|
|
if settings['gradient_accumulation_size'] == 0: |
|
|
settings['gradient_accumulation_size'] = 1 |
|
|
|
|
|
messages.append(f"Batch size is not evenly divisible by the gradient accumulation size, adjusting gradient accumulation size to: {settings['gradient_accumulation_size']}") |
|
|
|
|
|
if settings['batch_size'] % settings['gpus'] != 0: |
|
|
settings['batch_size'] -= settings['batch_size'] % settings['gpus'] |
|
|
if settings['batch_size'] == 0: |
|
|
settings['batch_size'] = 1 |
|
|
messages.append(f"Batch size not neatly divisible by GPU count, adjusting batch size to: {settings['batch_size']}") |
|
|
|
|
|
|
|
|
def get_device_batch_size( vram ): |
|
|
DEVICE_BATCH_SIZE_MAP = [ |
|
|
(70, 128), |
|
|
(32, 64), |
|
|
(16, 8), |
|
|
(8, 4), |
|
|
(6, 2), |
|
|
] |
|
|
for k, v in DEVICE_BATCH_SIZE_MAP: |
|
|
if vram > (k-1): |
|
|
return v |
|
|
return 1 |
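# Example reads of the VRAM map above (each tier matches when vram > k-1 GB): |
#   get_device_batch_size(24) == 8   (24 > 15) |
#   get_device_batch_size(8)  == 4   (8 > 7) |
#   get_device_batch_size(4)  == 1   (falls through every tier) |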
|
|
|
|
|
if settings['gpus'] > get_device_count(): |
|
|
settings['gpus'] = get_device_count() |
|
|
messages.append(f"GPU count exceeds defacto GPU count, clamping to: {settings['gpus']}") |
|
|
|
|
|
if settings['gpus'] <= 1: |
|
|
settings['gpus'] = 1 |
|
|
else: |
|
|
messages.append(f"! EXPERIMENTAL ! Multi-GPU training is extremely particular, expect issues.") |
|
|
|
|
|
|
|
|
vram = get_device_vram() * settings['gpus'] |
|
|
batch_ratio = int(settings['batch_size'] / settings['gradient_accumulation_size']) |
|
|
batch_cap = get_device_batch_size(vram) |
|
|
|
|
|
if batch_ratio > batch_cap: |
|
|
settings['gradient_accumulation_size'] = int(settings['batch_size'] / batch_cap) |
|
|
messages.append(f"Batch ratio ({batch_ratio}) is expected to exceed your VRAM capacity ({'{:.3f}'.format(vram)}GB, suggested {batch_cap} batch size cap), adjusting gradient accumulation size to: {settings['gradient_accumulation_size']}") |
|
|
|
|
|
iterations = calc_iterations(epochs=settings['epochs'], lines=lines, batch_size=settings['batch_size']) |
|
|
|
|
|
if settings['epochs'] < settings['save_rate']: |
|
|
settings['save_rate'] = settings['epochs'] |
|
|
messages.append(f"Save rate is too small for the given iteration step, clamping save rate to: {settings['save_rate']}") |
|
|
|
|
|
if settings['epochs'] < settings['validation_rate']: |
|
|
settings['validation_rate'] = settings['epochs'] |
|
|
messages.append(f"Validation rate is too small for the given iteration step, clamping validation rate to: {settings['validation_rate']}") |
|
|
|
|
|
if settings['resume_state'] and not os.path.exists(settings['resume_state']): |
|
|
settings['resume_state'] = None |
|
|
messages.append("Resume path specified, but does not exist. Disabling...") |
|
|
|
|
|
if settings['bitsandbytes']: |
|
|
messages.append("! EXPERIMENTAL ! BitsAndBytes requested.") |
|
|
|
|
|
if settings['half_p']: |
|
|
if settings['bitsandbytes']: |
|
|
settings['half_p'] = False |
|
|
messages.append("Half Precision requested, but BitsAndBytes is also requested. Due to redundancies, disabling half precision...") |
|
|
else: |
|
|
messages.append("! EXPERIMENTAL ! Half Precision requested.") |
|
|
if not os.path.exists(get_halfp_model_path()): |
|
|
convert_to_halfp() |
|
|
|
|
|
steps = int(iterations / settings['epochs']) |
|
|
|
|
|
messages.append(f"For {settings['epochs']} epochs with {lines} lines in batches of {settings['batch_size']}, iterating for {iterations} steps ({steps}) steps per epoch)") |
|
|
|
|
|
return settings, messages |
|
|
|
|
|
def save_training_settings( **kwargs ): |
|
|
messages = [] |
|
|
settings = {} |
|
|
settings.update(kwargs) |
|
|
|
|
|
|
|
|
outjson = f'./training/{settings["voice"]}/train.json' |
|
|
with open(outjson, 'w', encoding="utf-8") as f: |
|
|
f.write(json.dumps(settings, indent='\t') ) |
|
|
|
|
|
settings['dataset_path'] = f"./training/{settings['voice']}/train.txt" |
|
|
settings['validation_path'] = f"./training/{settings['voice']}/validation.txt" |
|
|
|
|
|
with open(settings['dataset_path'], 'r', encoding="utf-8") as f: |
|
|
lines = len(f.readlines()) |
|
|
|
|
|
settings['iterations'] = calc_iterations(epochs=settings['epochs'], lines=lines, batch_size=settings['batch_size']) |
|
|
|
|
|
if not settings['source_model'] or settings['source_model'] == "auto": |
|
|
settings['source_model'] = f"./models/tortoise/autoregressive{'_half' if settings['half_p'] else ''}.pth" |
|
|
|
|
|
if settings['half_p']: |
|
|
if not os.path.exists(get_halfp_model_path()): |
|
|
convert_to_halfp() |
|
|
|
|
|
messages.append(f"For {settings['epochs']} epochs with {lines} lines, iterating for {settings['iterations']} steps") |
|
|
|
|
|
iterations_per_epoch = settings['iterations'] / settings['epochs'] |
|
|
|
|
|
settings['save_rate'] = int(settings['save_rate'] * iterations_per_epoch) |
|
|
settings['validation_rate'] = int(settings['validation_rate'] * iterations_per_epoch) |
|
|
|
|
|
iterations_per_epoch = int(iterations_per_epoch) |
|
|
|
|
|
if settings['save_rate'] < 1: |
|
|
settings['save_rate'] = 1 |
|
|
""" |
|
|
if settings['validation_rate'] < 1: |
|
|
settings['validation_rate'] = 1 |
|
|
""" |
|
|
""" |
|
|
if settings['iterations'] % settings['save_rate'] != 0: |
|
|
adjustment = int(settings['iterations'] / settings['save_rate']) * settings['save_rate'] |
|
|
messages.append(f"Iteration rate is not evenly divisible by save rate, adjusting: {settings['iterations']} => {adjustment}") |
|
|
settings['iterations'] = adjustment |
|
|
""" |
|
|
|
|
|
settings['validation_batch_size'] = int(settings['batch_size'] / settings['gradient_accumulation_size']) |
|
|
if not os.path.exists(settings['validation_path']): |
|
|
settings['validation_enabled'] = False |
|
|
messages.append("Validation not found, disabling validation...") |
|
|
elif settings['validation_batch_size'] == 0: |
|
|
settings['validation_enabled'] = False |
|
|
messages.append("Validation batch size == 0, disabling validation...") |
|
|
else: |
|
|
with open(settings['validation_path'], 'r', encoding="utf-8") as f: |
|
|
validation_lines = len(f.readlines()) |
|
|
|
|
|
if validation_lines < settings['validation_batch_size']: |
|
|
settings['validation_batch_size'] = validation_lines |
|
|
messages.append(f"Batch size exceeds validation dataset size, clamping validation batch size to {validation_lines}") |
|
|
|
|
|
settings['tokenizer_json'] = args.tokenizer_json if args.tokenizer_json else get_tokenizer_jsons()[0] |
|
|
|
|
|
if settings['gpus'] > get_device_count(): |
|
|
settings['gpus'] = get_device_count() |
|
|
|
|
|
|
|
|
settings['optimizer'] = 'adamw' |
|
|
|
|
|
if 'learning_rate_scheme' not in settings or settings['learning_rate_scheme'] not in LEARNING_RATE_SCHEMES: |
|
|
settings['learning_rate_scheme'] = "Multistep" |
|
|
|
|
|
settings['learning_rate_scheme'] = LEARNING_RATE_SCHEMES[settings['learning_rate_scheme']] |
|
|
|
|
|
learning_rate_schema = [f"default_lr_scheme: {settings['learning_rate_scheme']}"] |
|
|
if settings['learning_rate_scheme'] == "MultiStepLR": |
|
|
if not settings['learning_rate_schedule']: |
|
|
settings['learning_rate_schedule'] = LEARNING_RATE_SCHEDULE |
|
|
elif isinstance(settings['learning_rate_schedule'],str): |
|
|
settings['learning_rate_schedule'] = json.loads(settings['learning_rate_schedule']) |
|
|
|
|
|
settings['learning_rate_schedule'] = schedule_learning_rate( iterations_per_epoch, settings['learning_rate_schedule'] ) |
|
|
|
|
|
learning_rate_schema.append(f" gen_lr_steps: {settings['learning_rate_schedule']}") |
|
|
learning_rate_schema.append(f" lr_gamma: 0.5") |
|
|
elif settings['learning_rate_scheme'] == "CosineAnnealingLR_Restart": |
|
|
epochs = settings['epochs'] |
|
|
restarts = settings['learning_rate_restarts'] |
|
|
restart_period = int(epochs / restarts) |
|
|
|
|
|
if 'learning_rate_warmup' not in settings: |
|
|
settings['learning_rate_warmup'] = 0 |
|
|
if 'learning_rate_min' not in settings: |
|
|
settings['learning_rate_min'] = 1e-08 |
|
|
|
|
|
if 'learning_rate_period' not in settings: |
|
|
settings['learning_rate_period'] = [ iterations_per_epoch * restart_period for x in range(restarts) ] |
|
|
|
|
|
settings['learning_rate_restarts'] = [ iterations_per_epoch * (x+1) * restart_period for x in range(restarts) ] |
|
|
|
|
|
if 'learning_rate_restart_weights' not in settings: |
|
|
settings['learning_rate_restart_weights'] = [ ( restarts - x - 1 ) / restarts for x in range(restarts) ] |
|
|
if restarts > 1: |
settings['learning_rate_restart_weights'][-1] = settings['learning_rate_restart_weights'][-2] * 0.5 |
|
|
|
|
|
learning_rate_schema.append(f" T_period: {settings['learning_rate_period']}") |
|
|
learning_rate_schema.append(f" warmup: {settings['learning_rate_warmup']}") |
|
|
learning_rate_schema.append(f" eta_min: !!float {settings['learning_rate_min']}") |
|
|
learning_rate_schema.append(f" restarts: {settings['learning_rate_restarts']}") |
|
|
learning_rate_schema.append(f" restart_weights: {settings['learning_rate_restart_weights']}") |
|
|
settings['learning_rate_scheme'] = "\n".join(learning_rate_schema) |
|
|
|
|
|
if settings['resume_state']: |
|
|
settings['source_model'] = f"# pretrain_model_gpt: '{settings['source_model']}'" |
|
|
settings['resume_state'] = f"resume_state: '{settings['resume_state']}'" |
|
|
else: |
|
|
settings['source_model'] = f"pretrain_model_gpt: '{settings['source_model']}'" |
|
|
settings['resume_state'] = f"# resume_state: '{settings['resume_state']}'" |
|
|
|
|
|
def use_template(template, out): |
|
|
with open(template, 'r', encoding="utf-8") as f: |
|
|
yaml = f.read() |
|
|
|
|
|
|
|
|
for k in settings: |
|
|
if settings[k] is None: |
|
|
continue |
|
|
yaml = yaml.replace(f"${{{k}}}", str(settings[k])) |
|
|
|
|
|
with open(out, 'w', encoding="utf-8") as f: |
|
|
f.write(yaml) |
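# The templates below are assumed to use ${key} placeholders mirroring the |
# settings keys above, e.g. a template line "batch_size: ${batch_size}" |
# becomes "batch_size: 128" after substitution. |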
|
|
|
|
|
if args.tts_backend == "tortoise": |
|
|
use_template(f'./models/.template.dlas.yaml', f'./training/{settings["voice"]}/train.yaml') |
|
|
elif args.tts_backend == "vall-e": |
|
|
settings['model_name'] = "[ 'ar-quarter', 'nar-quarter' ]" |
|
|
use_template(f'./models/.template.valle.yaml', f'./training/{settings["voice"]}/config.yaml') |
|
|
|
|
|
messages.append(f"Saved training output") |
|
|
return settings, messages |
|
|
|
|
|
def import_voices(files, saveAs=None, progress=None): |
|
|
global args |
|
|
|
|
|
if not isinstance(files, list): |
|
|
files = [files] |
|
|
|
|
|
for file in tqdm(files, desc="Importing voice files"): |
|
|
j, latents = read_generate_settings(file, read_latents=True) |
|
|
|
|
|
if j is not None and saveAs is None: |
|
|
saveAs = j['voice'] |
|
|
if saveAs is None or saveAs == "": |
|
|
raise Exception("Specify a voice name") |
|
|
|
|
|
outdir = f'{get_voice_dir()}/{saveAs}/' |
|
|
os.makedirs(outdir, exist_ok=True) |
|
|
|
|
|
if latents: |
|
|
print(f"Importing latents to {latents}") |
|
|
with open(f'{outdir}/cond_latents.pth', 'wb') as f: |
|
|
f.write(latents) |
|
|
latents = f'{outdir}/cond_latents.pth' |
|
|
print(f"Imported latents to {latents}") |
|
|
else: |
|
|
filename = file.name |
|
|
if filename[-4:] != ".wav": |
|
|
raise Exception("Please convert to a WAV first") |
|
|
|
|
|
path = f"{outdir}/{os.path.basename(filename)}" |
|
|
print(f"Importing voice to {path}") |
|
|
|
|
|
waveform, sample_rate = torchaudio.load(filename) |
|
|
|
|
|
if args.voice_fixer: |
|
|
if not voicefixer: |
|
|
load_voicefixer() |
|
|
|
|
|
waveform, sample_rate = resample(waveform, sample_rate, 44100) |
|
|
torchaudio.save(path, waveform, sample_rate) |
|
|
|
|
|
print(f"Running 'voicefixer' on voice sample: {path}") |
|
|
voicefixer.restore( |
|
|
input = path, |
|
|
output = path, |
|
|
cuda=get_device_name() == "cuda" and args.voice_fixer_use_cuda, |
|
|
|
|
|
) |
|
|
else: |
|
|
torchaudio.save(path, waveform, sample_rate) |
|
|
|
|
|
print(f"Imported voice to {path}") |
|
|
|
|
|
def relative_paths( dirs ): |
|
|
return [ './' + os.path.relpath( d ).replace("\\", "/") for d in dirs ] |
|
|
|
|
|
def get_voice( name, dir=get_voice_dir(), load_latents=True ): |
|
|
subj = f'{dir}/{name}/' |
|
|
if not os.path.isdir(subj): |
|
|
return |
|
|
|
|
|
voice = list(glob(f'{subj}/*.wav')) + list(glob(f'{subj}/*.mp3')) + list(glob(f'{subj}/*.flac')) |
|
|
if load_latents: |
|
|
voice = voice + list(glob(f'{subj}/*.pth')) |
|
|
return sorted( voice ) |
|
|
|
|
|
def get_voice_list(dir=get_voice_dir(), append_defaults=False): |
|
|
defaults = [ "random", "microphone" ] |
|
|
os.makedirs(dir, exist_ok=True) |
|
|
|
|
|
|
|
|
res = [] |
|
|
for name in os.listdir(dir): |
|
|
if name in defaults: |
|
|
continue |
|
|
if not os.path.isdir(f'{dir}/{name}'): |
|
|
continue |
|
|
if len(os.listdir(os.path.join(dir, name))) == 0: |
|
|
continue |
|
|
files = get_voice( name, dir=dir ) |
|
|
|
|
|
if len(files) > 0: |
|
|
res.append(name) |
|
|
else: |
|
|
for subdir in os.listdir(f'{dir}/{name}'): |
|
|
if not os.path.isdir(f'{dir}/{name}/{subdir}'): |
|
|
continue |
|
|
files = get_voice( f'{name}/{subdir}', dir=dir ) |
|
|
if len(files) == 0: |
|
|
continue |
|
|
res.append(f'{name}/{subdir}') |
|
|
|
|
|
res = sorted(res) |
|
|
|
|
|
if append_defaults: |
|
|
res = res + defaults |
|
|
|
|
|
return res |
|
|
|
|
|
def get_valle_models(dir="./training/"): |
|
|
return [ f'{dir}/{d}/config.yaml' for d in os.listdir(dir) if os.path.exists(f'{dir}/{d}/config.yaml') ] |
|
|
|
|
|
def get_autoregressive_models(dir="./models/finetunes/", prefixed=False, auto=False): |
|
|
os.makedirs(dir, exist_ok=True) |
|
|
base = [get_model_path('autoregressive.pth')] |
|
|
halfp = get_halfp_model_path() |
|
|
if os.path.exists(halfp): |
|
|
base.append(halfp) |
|
|
|
|
|
additionals = sorted([f'{dir}/{d}' for d in os.listdir(dir) if d[-4:] == ".pth" ]) |
|
|
found = [] |
|
|
for training in os.listdir(f'./training/'): |
|
|
if not os.path.isdir(f'./training/{training}/') or not os.path.isdir(f'./training/{training}/finetune/') or not os.path.isdir(f'./training/{training}/finetune/models/'): |
|
|
continue |
|
|
models = sorted([ int(d[:-8]) for d in os.listdir(f'./training/{training}/finetune/models/') if d[-8:] == "_gpt.pth" ]) |
|
|
found = found + [ f'./training/{training}/finetune/models/{d}_gpt.pth' for d in models ] |
|
|
|
|
|
res = base + additionals + found |
|
|
|
|
|
if prefixed: |
|
|
for i in range(len(res)): |
|
|
path = res[i] |
|
|
hash = hash_file(path) |
|
|
shorthash = hash[:8] |
|
|
|
|
|
res[i] = f'[{shorthash}] {path}' |
|
|
|
|
|
paths = relative_paths(res) |
|
|
if auto: |
|
|
paths = ["auto"] + paths |
|
|
|
|
|
return paths |
|
|
|
|
|
def get_diffusion_models(dir="./models/finetunes/", prefixed=False): |
|
|
return relative_paths([ get_model_path('diffusion_decoder.pth') ]) |
|
|
|
|
|
def get_tokenizer_jsons( dir="./models/tokenizers/" ): |
|
|
additionals = sorted([ f'{dir}/{d}' for d in os.listdir(dir) if d[-5:] == ".json" ]) if os.path.isdir(dir) else [] |
|
|
return relative_paths([ "./modules/tortoise-tts/tortoise/data/tokenizer.json" ] + additionals) |
|
|
|
|
|
def tokenize_text( text, config=None, stringed=True, skip_specials=False ): |
|
|
from tortoise.utils.tokenizer import VoiceBpeTokenizer |
|
|
|
|
|
if not config: |
|
|
config = args.tokenizer_json if args.tokenizer_json else get_tokenizer_jsons()[0] |
|
|
|
|
|
if not tts: |
|
|
tokenizer = VoiceBpeTokenizer(config) |
|
|
else: |
|
|
tokenizer = tts.tokenizer |
|
|
|
|
|
encoded = tokenizer.encode(text) |
|
|
decoded = tokenizer.tokenizer.decode(encoded, skip_special_tokens=skip_specials).split(" ") |
|
|
|
|
|
if stringed: |
|
|
return "\n".join([ str(encoded), str(decoded) ]) |
|
|
|
|
|
return decoded |
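# Usage sketch: tokenize_text("hello") returns the encoded ids and decoded |
# token strings on two lines (exact ids depend on the tokenizer json in use); |
# pass stringed=False to get the raw token list instead. |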
|
|
|
|
|
def get_dataset_list(dir="./training/"): |
|
|
return sorted([d for d in os.listdir(dir) if os.path.isdir(os.path.join(dir, d)) and "train.txt" in os.listdir(os.path.join(dir, d)) ]) |
|
|
|
|
|
def get_training_list(dir="./training/"): |
|
|
if args.tts_backend == "tortoise": |
|
|
return sorted([f'./training/{d}/train.yaml' for d in os.listdir(dir) if os.path.isdir(os.path.join(dir, d)) and "train.yaml" in os.listdir(os.path.join(dir, d)) ]) |
|
|
else: |
|
|
return sorted([f'./training/{d}/config.yaml' for d in os.listdir(dir) if os.path.isdir(os.path.join(dir, d)) and "config.yaml" in os.listdir(os.path.join(dir, d)) ]) |
|
|
|
|
|
def pad(num, zeroes): |
|
|
return str(num).zfill(zeroes+1) |
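# Note: zfill(zeroes + 1) pads to zeroes+1 characters, so pad(3, 4) == "00003", |
# matching the f"_{pad(segment['id'], 4)}.wav" names written when slicing. |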
|
|
|
|
|
def curl(url): |
|
|
try: |
|
|
req = urllib.request.Request(url, headers={'User-Agent': 'Python'}) |
|
|
conn = urllib.request.urlopen(req) |
|
|
data = conn.read() |
|
|
data = data.decode() |
|
|
data = json.loads(data) |
|
|
conn.close() |
|
|
return data |
|
|
except Exception as e: |
|
|
print(e) |
|
|
return None |
|
|
|
|
|
def check_for_updates( dir = None ): |
|
|
if dir is None: |
|
|
check_for_updates("./.git/") |
|
|
check_for_updates("./.git/modules/dlas/") |
|
|
check_for_updates("./.git/modules/tortoise-tts/") |
|
|
return |
|
|
|
|
|
git_dir = dir |
|
|
if not os.path.isfile(f'{git_dir}/FETCH_HEAD'): |
|
|
print(f"Cannot check for updates for {dir}: not from a git repo") |
|
|
return False |
|
|
|
|
|
with open(f'{git_dir}/FETCH_HEAD', 'r', encoding="utf-8") as f: |
|
|
head = f.read() |
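# A FETCH_HEAD line is assumed to look like: |
#   <commit hash>\tbranch 'master' of https://<host>/<owner>/<repo> |
# which the regex below splits into (commit, host, owner, repo). |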
|
|
|
|
|
match = re.findall(r"^([a-f0-9]+).+?https:\/\/(.+?)\/(.+?)\/(.+?)\n", head) |
|
|
if match is None or len(match) == 0: |
|
|
print(f"Cannot check for updates for {dir}: cannot parse FETCH_HEAD") |
|
|
return False |
|
|
|
|
|
match = match[0] |
|
|
|
|
|
local = match[0] |
|
|
host = match[1] |
|
|
owner = match[2] |
|
|
repo = match[3] |
|
|
|
|
|
res = curl(f"https://{host}/api/v1/repos/{owner}/{repo}/branches/") |
|
|
|
|
|
if res is None or len(res) == 0: |
|
|
print(f"Cannot check for updates for {dir}: cannot fetch from remote") |
|
|
return False |
|
|
|
|
|
remote = res[0]["commit"]["id"] |
|
|
|
|
|
if remote != local: |
|
|
print(f"New version found for {dir}: {local[:8]} => {remote[:8]}") |
|
|
return True |
|
|
|
|
|
return False |
|
|
|
|
|
def notify_progress(message, progress=None, verbose=True): |
|
|
if verbose: |
|
|
print(message) |
|
|
|
|
|
if progress is None: |
|
|
tqdm.write(message) |
|
|
else: |
|
|
progress(0, desc=message) |
|
|
|
|
|
def get_args(): |
|
|
global args |
|
|
return args |
|
|
|
|
|
def setup_args(): |
|
|
global args |
|
|
|
|
|
default_arguments = { |
|
|
'share': False, |
|
|
'listen': None, |
|
|
'check-for-updates': False, |
|
|
'models-from-local-only': False, |
|
|
'low-vram': False, |
|
|
'sample-batch-size': None, |
|
|
'unsqueeze-sample-batches': False, |
|
|
'embed-output-metadata': True, |
|
|
'latents-lean-and-mean': True, |
|
|
'voice-fixer': False, |
|
|
'voice-fixer-use-cuda': True, |
|
|
|
|
|
|
|
|
'force-cpu-for-conditioning-latents': False, |
|
|
'defer-tts-load': False, |
|
|
'device-override': None, |
|
|
'prune-nonfinal-outputs': True, |
|
|
'concurrency-count': 2, |
|
|
'autocalculate-voice-chunk-duration-size': 10, |
|
|
|
|
|
'output-sample-rate': 44100, |
|
|
'output-volume': 1, |
|
|
'results-folder': "./results/", |
|
|
|
|
|
'hf-token': None, |
|
|
'tts-backend': TTSES[0], |
|
|
|
|
|
'autoregressive-model': None, |
|
|
'diffusion-model': None, |
|
|
'vocoder-model': VOCODERS[-1], |
|
|
'tokenizer-json': None, |
|
|
|
|
|
'phonemizer-backend': 'espeak', |
|
|
|
|
|
'valle-model': None, |
|
|
|
|
|
'whisper-backend': 'openai/whisper', |
|
|
'whisper-model': "base", |
|
|
'whisper-batchsize': 1, |
|
|
|
|
|
'training-default-halfp': False, |
|
|
'training-default-bnb': True, |
|
|
} |
|
|
|
|
|
if os.path.isfile('./config/exec.json'): |
|
|
with open(f'./config/exec.json', 'r', encoding="utf-8") as f: |
|
|
try: |
|
|
overrides = json.load(f) |
|
|
for k in overrides: |
|
|
default_arguments[k] = overrides[k] |
|
|
except Exception as e: |
|
|
print(e) |
|
|
pass |
|
|
|
|
|
parser = argparse.ArgumentParser() |
|
|
parser.add_argument("--share", action='store_true', default=default_arguments['share'], help="Lets Gradio return a public URL to use anywhere") |
|
|
parser.add_argument("--listen", default=default_arguments['listen'], help="Path for Gradio to listen on") |
|
|
parser.add_argument("--check-for-updates", action='store_true', default=default_arguments['check-for-updates'], help="Checks for update on startup") |
|
|
parser.add_argument("--models-from-local-only", action='store_true', default=default_arguments['models-from-local-only'], help="Only loads models from disk, does not check for updates for models") |
|
|
parser.add_argument("--low-vram", action='store_true', default=default_arguments['low-vram'], help="Disables some optimizations that increases VRAM usage") |
|
|
parser.add_argument("--no-embed-output-metadata", action='store_false', default=not default_arguments['embed-output-metadata'], help="Disables embedding output metadata into resulting WAV files for easily fetching its settings used with the web UI (data is stored in the lyrics metadata tag)") |
|
|
parser.add_argument("--latents-lean-and-mean", action='store_true', default=default_arguments['latents-lean-and-mean'], help="Exports the bare essentials for latents.") |
|
|
parser.add_argument("--voice-fixer", action='store_true', default=default_arguments['voice-fixer'], help="Uses python module 'voicefixer' to improve audio quality, if available.") |
|
|
parser.add_argument("--voice-fixer-use-cuda", action='store_true', default=default_arguments['voice-fixer-use-cuda'], help="Hints to voicefixer to use CUDA, if available.") |
|
|
parser.add_argument("--force-cpu-for-conditioning-latents", default=default_arguments['force-cpu-for-conditioning-latents'], action='store_true', help="Forces computing conditional latents to be done on the CPU (if you constantyl OOM on low chunk counts)") |
|
|
parser.add_argument("--defer-tts-load", default=default_arguments['defer-tts-load'], action='store_true', help="Defers loading TTS model") |
|
|
parser.add_argument("--prune-nonfinal-outputs", default=default_arguments['prune-nonfinal-outputs'], action='store_true', help="Deletes non-final output files on completing a generation") |
|
|
parser.add_argument("--device-override", default=default_arguments['device-override'], help="A device string to override pass through Torch") |
|
|
parser.add_argument("--sample-batch-size", default=default_arguments['sample-batch-size'], type=int, help="Sets how many batches to use during the autoregressive samples pass") |
|
|
parser.add_argument("--unsqueeze-sample-batches", default=default_arguments['unsqueeze-sample-batches'], action='store_true', help="Unsqueezes sample batches to process one by one after sampling") |
|
|
parser.add_argument("--concurrency-count", type=int, default=default_arguments['concurrency-count'], help="How many Gradio events to process at once") |
|
|
parser.add_argument("--autocalculate-voice-chunk-duration-size", type=float, default=default_arguments['autocalculate-voice-chunk-duration-size'], help="Number of seconds to suggest voice chunk size for (for example, 100 seconds of audio at 10 seconds per chunk will suggest 10 chunks)") |
|
|
parser.add_argument("--output-sample-rate", type=int, default=default_arguments['output-sample-rate'], help="Sample rate to resample the output to (from 24KHz)") |
|
|
parser.add_argument("--output-volume", type=float, default=default_arguments['output-volume'], help="Adjusts volume of output") |
|
|
parser.add_argument("--results-folder", type=str, default=default_arguments['results-folder'], help="Sets output directory") |
|
|
|
|
|
parser.add_argument("--hf-token", type=str, default=default_arguments['hf-token'], help="HuggingFace Token") |
|
|
parser.add_argument("--tts-backend", default=default_arguments['tts-backend'], help="Specifies which TTS backend to use.") |
|
|
|
|
|
parser.add_argument("--autoregressive-model", default=default_arguments['autoregressive-model'], help="Specifies which autoregressive model to use for sampling.") |
|
|
parser.add_argument("--diffusion-model", default=default_arguments['diffusion-model'], help="Specifies which diffusion model to use for sampling.") |
|
|
parser.add_argument("--vocoder-model", default=default_arguments['vocoder-model'], action='store_true', help="Specifies with vocoder to use") |
|
|
parser.add_argument("--tokenizer-json", default=default_arguments['tokenizer-json'], help="Specifies which tokenizer json to use for tokenizing.") |
|
|
|
|
|
parser.add_argument("--phonemizer-backend", default=default_arguments['phonemizer-backend'], help="Specifies which phonemizer backend to use.") |
|
|
|
|
|
parser.add_argument("--valle-model", default=default_arguments['valle-model'], help="Specifies which VALL-E model to use for sampling.") |
|
|
|
|
|
parser.add_argument("--whisper-backend", default=default_arguments['whisper-backend'], action='store_true', help="Picks which whisper backend to use (openai/whisper, lightmare/whispercpp)") |
|
|
parser.add_argument("--whisper-model", default=default_arguments['whisper-model'], help="Specifies which whisper model to use for transcription.") |
|
|
parser.add_argument("--whisper-batchsize", type=int, default=default_arguments['whisper-batchsize'], help="Specifies batch size for WhisperX") |
|
|
|
|
|
parser.add_argument("--training-default-halfp", action='store_true', default=default_arguments['training-default-halfp'], help="Training default: halfp") |
|
|
parser.add_argument("--training-default-bnb", action='store_true', default=default_arguments['training-default-bnb'], help="Training default: bnb") |
|
|
|
|
|
parser.add_argument("--os", default="unix", help="Specifies which OS, easily") |
|
|
args = parser.parse_args() |
|
|
|
|
|
args.embed_output_metadata = not args.no_embed_output_metadata |
|
|
|
|
|
if args.device_override: |
|
|
set_device_name(args.device_override) |
|
|
|
|
|
if args.sample_batch_size == 0 and get_device_batch_size() == 1: |
|
|
print("!WARNING! Automatically deduced sample batch size returned 1.") |
|
|
|
|
|
args.listen_host = None |
|
|
args.listen_port = None |
|
|
args.listen_path = None |
|
|
if args.listen: |
|
|
try: |
|
|
match = re.findall(r"^(?:(.+?):(\d+))?(\/.*?)?$", args.listen)[0] |
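# e.g. "--listen 0.0.0.0:8000/ui" parses to host "0.0.0.0", port "8000", |
# path "/ui"; any component may be omitted. |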
|
|
|
|
|
args.listen_host = match[0] if match[0] != "" else "127.0.0.1" |
|
|
args.listen_port = match[1] if match[1] != "" else None |
|
|
args.listen_path = match[2] if match[2] != "" else "/" |
|
|
except Exception as e: |
|
|
pass |
|
|
|
|
|
if args.listen_port is not None: |
|
|
args.listen_port = int(args.listen_port) |
|
|
if args.listen_port == 0: |
|
|
args.listen_port = None |
|
|
|
|
|
return args |
|
|
|
|
|
def get_default_settings( hypenated=True ): |
|
|
settings = { |
|
|
'listen': None if not args.listen else args.listen, |
|
|
'share': args.share, |
|
|
'low-vram':args.low_vram, |
|
|
'check-for-updates':args.check_for_updates, |
|
|
'models-from-local-only':args.models_from_local_only, |
|
|
'force-cpu-for-conditioning-latents': args.force_cpu_for_conditioning_latents, |
|
|
'defer-tts-load': args.defer_tts_load, |
|
|
'prune-nonfinal-outputs': args.prune_nonfinal_outputs, |
|
|
'device-override': args.device_override, |
|
|
'sample-batch-size': args.sample_batch_size, |
|
|
'unsqueeze-sample-batches': args.unsqueeze_sample_batches, |
|
|
'embed-output-metadata': args.embed_output_metadata, |
|
|
'latents-lean-and-mean': args.latents_lean_and_mean, |
|
|
'voice-fixer': args.voice_fixer, |
|
|
'voice-fixer-use-cuda': args.voice_fixer_use_cuda, |
|
|
'concurrency-count': args.concurrency_count, |
|
|
'output-sample-rate': args.output_sample_rate, |
|
|
'autocalculate-voice-chunk-duration-size': args.autocalculate_voice_chunk_duration_size, |
|
|
'output-volume': args.output_volume, |
|
|
'results-folder': args.results_folder, |
|
|
|
|
|
'hf-token': args.hf_token, |
|
|
'tts-backend': args.tts_backend, |
|
|
|
|
|
'autoregressive-model': args.autoregressive_model, |
|
|
'diffusion-model': args.diffusion_model, |
|
|
'vocoder-model': args.vocoder_model, |
|
|
'tokenizer-json': args.tokenizer_json, |
|
|
|
|
|
'phonemizer-backend': args.phonemizer_backend, |
|
|
|
|
|
'valle-model': args.valle_model, |
|
|
|
|
|
'whisper-backend': args.whisper_backend, |
|
|
'whisper-model': args.whisper_model, |
|
|
'whisper-batchsize': args.whisper_batchsize, |
|
|
|
|
|
'training-default-halfp': args.training_default_halfp, |
|
|
'training-default-bnb': args.training_default_bnb, |
|
|
} |
|
|
|
|
|
res = {} |
|
|
for k in settings: |
|
|
res[k.replace("-", "_") if not hypenated else k] = settings[k] |
|
|
return res |
|
|
|
|
|
def update_args( **kwargs ): |
|
|
global args |
|
|
|
|
|
settings = get_default_settings(hypenated=False) |
|
|
settings.update(kwargs) |
|
|
|
|
|
args.listen = settings['listen'] |
|
|
args.share = settings['share'] |
|
|
args.check_for_updates = settings['check_for_updates'] |
|
|
args.models_from_local_only = settings['models_from_local_only'] |
|
|
args.low_vram = settings['low_vram'] |
|
|
args.force_cpu_for_conditioning_latents = settings['force_cpu_for_conditioning_latents'] |
|
|
args.defer_tts_load = settings['defer_tts_load'] |
|
|
args.prune_nonfinal_outputs = settings['prune_nonfinal_outputs'] |
|
|
args.device_override = settings['device_override'] |
|
|
args.sample_batch_size = settings['sample_batch_size'] |
|
|
args.unsqueeze_sample_batches = settings['unsqueeze_sample_batches'] |
|
|
args.embed_output_metadata = settings['embed_output_metadata'] |
|
|
args.latents_lean_and_mean = settings['latents_lean_and_mean'] |
|
|
args.voice_fixer = settings['voice_fixer'] |
|
|
args.voice_fixer_use_cuda = settings['voice_fixer_use_cuda'] |
|
|
args.concurrency_count = settings['concurrency_count'] |
|
|
args.output_sample_rate = settings['output_sample_rate']
|
|
args.autocalculate_voice_chunk_duration_size = settings['autocalculate_voice_chunk_duration_size'] |
|
|
args.output_volume = settings['output_volume'] |
|
|
args.results_folder = settings['results_folder'] |
|
|
|
|
|
args.hf_token = settings['hf_token'] |
|
|
args.tts_backend = settings['tts_backend'] |
|
|
|
|
|
args.autoregressive_model = settings['autoregressive_model'] |
|
|
args.diffusion_model = settings['diffusion_model'] |
|
|
args.vocoder_model = settings['vocoder_model'] |
|
|
args.tokenizer_json = settings['tokenizer_json'] |
|
|
|
|
|
args.phonemizer_backend = settings['phonemizer_backend'] |
|
|
|
|
|
args.valle_model = settings['valle_model'] |
|
|
|
|
|
args.whisper_backend = settings['whisper_backend'] |
|
|
args.whisper_model = settings['whisper_model'] |
|
|
args.whisper_batchsize = settings['whisper_batchsize'] |
|
|
|
|
|
args.training_default_halfp = settings['training_default_halfp'] |
|
|
args.training_default_bnb = settings['training_default_bnb'] |
|
|
|
|
|
save_args_settings() |
|
|
|
|
|
def save_args_settings(): |
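"""Persist the current settings to ./config/exec.json as tab-indented JSON,
creating ./config/ if it does not exist."""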
|
|
global args |
|
|
settings = get_default_settings() |
|
|
|
|
|
os.makedirs('./config/', exist_ok=True) |
|
|
with open('./config/exec.json', 'w', encoding="utf-8") as f:


f.write(json.dumps(settings, indent='\t'))
|
|
|
|
|
|
|
|
def import_generate_settings(file = None): |
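"""Load generation settings from a JSON file (./config/generate.json by
default), overlaying whatever was stored on top of the baseline defaults
below, so a missing or empty file still yields usable settings."""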
|
|
if not file: |
|
|
file = "./config/generate.json" |
|
|
|
|
|
res = { |
|
|
'text': None, |
|
|
'delimiter': None, |
|
|
'emotion': None, |
|
|
'prompt': None, |
|
|
'voice': "random", |
|
|
'mic_audio': None, |
|
|
'voice_latents_chunks': None, |
|
|
'candidates': None, |
|
|
'seed': None, |
|
|
'num_autoregressive_samples': 16, |
|
|
'diffusion_iterations': 30, |
|
|
'temperature': 0.8, |
|
|
'diffusion_sampler': "DDIM", |
|
|
'breathing_room': 8,
|
|
'cvvp_weight': 0.0, |
|
|
'top_p': 0.8, |
|
|
'diffusion_temperature': 1.0, |
|
|
'length_penalty': 1.0, |
|
|
'repetition_penalty': 2.0, |
|
|
'cond_free_k': 2.0, |
|
|
'experimentals': None, |
|
|
} |
|
|
|
|
|
settings, _ = read_generate_settings(file, read_latents=False) |
|
|
|
|
|
if settings is not None: |
|
|
res.update(settings) |
|
|
|
|
|
return res |
|
|
|
|
|
def reset_generate_settings(): |
|
|
with open('./config/generate.json', 'w', encoding="utf-8") as f:


f.write(json.dumps({}, indent='\t'))
|
|
return import_generate_settings() |
|
|
|
|
|
def read_generate_settings(file, read_latents=True): |
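"""Extract embedded generation settings from an output file.

For a .wav, the settings are parsed from the JSON stashed in the file's
'lyrics' metadata tag; for a .json, the file itself is parsed. Returns a
(settings, latents) tuple, where latents holds the base64-decoded
conditioning latents when read_latents is True and the metadata includes
them, and None otherwise.
"""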
|
|
j = None |
|
|
latents = None |
|
|
|
|
|
if isinstance(file, list) and len(file) == 1: |
|
|
file = file[0] |
|
|
|
|
|
try: |
|
|
if file is not None: |
|
|
if hasattr(file, 'name'): |
|
|
file = file.name |
|
|
|
|
|
if file[-4:] == ".wav": |
|
|
metadata = music_tag.load_file(file) |
|
|
if 'lyrics' in metadata: |
|
|
j = json.loads(str(metadata['lyrics'])) |
|
|
elif file[-5:] == ".json": |
|
|
with open(file, 'r') as f: |
|
|
j = json.load(f) |
|
|
except Exception as e: |
|
|
pass |
|
|
|
|
|
if j is not None: |
|
|
if 'latents' in j: |
|
|
if read_latents: |
|
|
latents = base64.b64decode(j['latents']) |
|
|
del j['latents'] |
|
|
|
|
|
|
|
|
if "time" in j: |
|
|
j["time"] = "{:.3f}".format(j["time"]) |
|
|
|
|
|
|
|
|
|
|
|
return ( |
|
|
j, |
|
|
latents, |
|
|
) |
|
|
|
|
|
def version_check_tts( min_version ): |
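"""Return True if the loaded TTS backend reports a version of at least
min_version (a [major, minor, patch] sequence), False for backends that
predate version reporting."""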
|
|
global tts |
|
|
if not tts: |
|
|
raise Exception("TTS is not initialized") |
|
|
|
|
|
if not hasattr(tts, 'version'): |
|
|
return False |
|
|
|
|
|
# Lexicographic comparison of [major, minor, patch] version tuples.


return tuple(tts.version) >= tuple(min_version)
|
|
|
|
|
def load_tts( restart=False, |
|
|
|
|
|
autoregressive_model=None, diffusion_model=None, vocoder_model=None, tokenizer_json=None, |
|
|
|
|
|
valle_model=None, |
|
|
): |
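"""Instantiate the configured TTS backend (tortoise, vall-e, or bark).

Model paths passed in override and persist to the runtime arguments;
omitted ones fall back to the stored settings, with "auto" resolving
through deduce_autoregressive_model(). Pass restart=True to tear down an
already-loaded backend first.
"""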
|
|
global args


global tts


global tts_loading
|
|
|
|
|
if restart: |
|
|
unload_tts() |
|
|
|
|
|
tts_loading = True |
|
|
if args.tts_backend == "tortoise": |
|
|
if autoregressive_model: |
|
|
args.autoregressive_model = autoregressive_model |
|
|
else: |
|
|
autoregressive_model = args.autoregressive_model |
|
|
|
|
|
if autoregressive_model == "auto": |
|
|
autoregressive_model = deduce_autoregressive_model() |
|
|
|
|
|
if diffusion_model: |
|
|
args.diffusion_model = diffusion_model |
|
|
else: |
|
|
diffusion_model = args.diffusion_model |
|
|
|
|
|
if vocoder_model: |
|
|
args.vocoder_model = vocoder_model |
|
|
else: |
|
|
vocoder_model = args.vocoder_model |
|
|
|
|
|
if tokenizer_json: |
|
|
args.tokenizer_json = tokenizer_json |
|
|
else: |
|
|
tokenizer_json = args.tokenizer_json |
|
|
|
|
|
if get_device_name() == "cpu": |
|
|
print("!!!! WARNING !!!! No GPU available in PyTorch. You may need to reinstall PyTorch.") |
|
|
|
|
|
print(f"Loading TorToiSe... (AR: {autoregressive_model}, diffusion: {diffusion_model}, vocoder: {vocoder_model})") |
|
|
tts = TorToise_TTS(
	minor_optimizations=not args.low_vram,
	autoregressive_model_path=autoregressive_model,
	diffusion_model_path=diffusion_model,
	vocoder_model=vocoder_model,
	tokenizer_json=tokenizer_json,
	unsqueeze_sample_batches=args.unsqueeze_sample_batches,
)
|
|
elif args.tts_backend == "vall-e": |
|
|
if valle_model: |
|
|
args.valle_model = valle_model |
|
|
else: |
|
|
valle_model = args.valle_model |
|
|
|
|
|
print(f"Loading VALL-E... (Config: {valle_model})") |
|
|
tts = VALLE_TTS(config=args.valle_model) |
|
|
elif args.tts_backend == "bark": |
|
|
|
|
|
print(f"Loading Bark...") |
|
|
tts = Bark_TTS(small=args.low_vram) |
|
|
|
|
|
print("Loaded TTS, ready for generation.") |
|
|
tts_loading = False |
|
|
return tts |
|
|
|
|
|
def unload_tts(): |
|
|
global tts |
|
|
|
|
|
if tts: |
|
|
del tts |
|
|
tts = None |
|
|
print("Unloaded TTS") |
|
|
do_gc() |
|
|
|
|
|
def reload_tts(): |
|
|
unload_tts() |
|
|
load_tts() |
|
|
|
|
|
def get_current_voice(): |
|
|
global current_voice |
|
|
if current_voice: |
|
|
return current_voice |
|
|
|
|
|
settings, _ = read_generate_settings("./config/generate.json", read_latents=False) |
|
|
|
|
|
if settings and "voice" in settings['voice']: |
|
|
return settings["voice"] |
|
|
|
|
|
return None |
|
|
|
|
|
def deduce_autoregressive_model(voice=None): |
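"""Pick an autoregressive model for the given (or current) voice, in order
of preference: a finetune at ./models/finetunes/{voice}.pth, the newest
checkpoint under ./training/{voice}/finetune/models/, the explicitly
configured model, and finally the stock autoregressive.pth."""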
|
|
if not voice: |
|
|
voice = get_current_voice() |
|
|
|
|
|
if voice: |
|
|
if os.path.exists(f'./models/finetunes/{voice}.pth'): |
|
|
return f'./models/finetunes/{voice}.pth' |
|
|
|
|
|
models_dir = f'./training/{voice}/finetune/models/'


if os.path.isdir(models_dir):


counts = sorted([ int(d[:-8]) for d in os.listdir(models_dir) if d[-8:] == "_gpt.pth" ])


names = [ f'{models_dir}{d}_gpt.pth' for d in counts ]
|
|
if len(names) > 0: |
|
|
return names[-1] |
|
|
|
|
|
if args.autoregressive_model != "auto": |
|
|
return args.autoregressive_model |
|
|
|
|
|
return get_model_path('autoregressive.pth') |
|
|
|
|
|
def update_autoregressive_model(autoregressive_model_path): |
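"""Point the tortoise backend at a new autoregressive model and persist the
choice. Accepts a bare path or a "[12ab34cd] path" string (presumably a
hash-prefixed entry from the model-picker dropdown), and hot-swaps the
model if a TTS instance is already loaded."""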
|
|
if args.tts_backend != "tortoise": |
|
|
raise f"Unsupported backend: {args.tts_backend}" |
|
|
|
|
|
match = re.findall(r'^\[[a-fA-F0-9]{8}\] (.+?)$', autoregressive_model_path) |
|
|
if match: |
|
|
autoregressive_model_path = match[0] |
|
|
|
|
|
if not autoregressive_model_path or not os.path.exists(autoregressive_model_path): |
|
|
print(f"Invalid model: {autoregressive_model_path}") |
|
|
return |
|
|
|
|
|
args.autoregressive_model = autoregressive_model_path |
|
|
save_args_settings() |
|
|
print(f'Stored autoregressive model to settings: {autoregressive_model_path}') |
|
|
|
|
|
global tts |
|
|
if not tts: |
|
|
if tts_loading: |
|
|
raise Exception("TTS is still initializing...") |
|
|
return |
|
|
|
|
|
if hasattr(tts, "loading") and tts.loading: |
|
|
raise Exception("TTS is still initializing...") |
|
|
|
|
|
if autoregressive_model_path == "auto": |
|
|
autoregressive_model_path = deduce_autoregressive_model() |
|
|
|
|
|
if autoregressive_model_path == tts.autoregressive_model_path: |
|
|
return |
|
|
|
|
|
tts.load_autoregressive_model(autoregressive_model_path) |
|
|
|
|
|
do_gc() |
|
|
|
|
|
return autoregressive_model_path |
|
|
|
|
|
def update_diffusion_model(diffusion_model_path): |
|
|
if args.tts_backend != "tortoise": |
|
|
raise f"Unsupported backend: {args.tts_backend}" |
|
|
|
|
|
match = re.findall(r'^\[[a-fA-F0-9]{8}\] (.+?)$', diffusion_model_path) |
|
|
if match: |
|
|
diffusion_model_path = match[0] |
|
|
|
|
|
if not diffusion_model_path or not os.path.exists(diffusion_model_path): |
|
|
print(f"Invalid model: {diffusion_model_path}") |
|
|
return |
|
|
|
|
|
args.diffusion_model = diffusion_model_path |
|
|
save_args_settings() |
|
|
print(f'Stored diffusion model to settings: {diffusion_model_path}') |
|
|
|
|
|
global tts |
|
|
if not tts: |
|
|
if tts_loading: |
|
|
raise Exception("TTS is still initializing...") |
|
|
return |
|
|
|
|
|
if hasattr(tts, "loading") and tts.loading: |
|
|
raise Exception("TTS is still initializing...") |
|
|
|
|
|
if diffusion_model_path == "auto": |
|
|
diffusion_model_path = deduce_diffusion_model() |
|
|
|
|
|
if diffusion_model_path == tts.diffusion_model_path: |
|
|
return |
|
|
|
|
|
tts.load_diffusion_model(diffusion_model_path) |
|
|
|
|
|
do_gc() |
|
|
|
|
|
return diffusion_model_path |
|
|
|
|
|
def update_vocoder_model(vocoder_model): |
|
|
if args.tts_backend != "tortoise": |
|
|
raise f"Unsupported backend: {args.tts_backend}" |
|
|
|
|
|
args.vocoder_model = vocoder_model |
|
|
save_args_settings() |
|
|
print(f'Stored vocoder model to settings: {vocoder_model}') |
|
|
|
|
|
global tts |
|
|
if not tts: |
|
|
if tts_loading: |
|
|
raise Exception("TTS is still initializing...") |
|
|
return |
|
|
|
|
|
if hasattr(tts, "loading") and tts.loading: |
|
|
raise Exception("TTS is still initializing...") |
|
|
|
|
|
print(f"Loading model: {vocoder_model}") |
|
|
tts.load_vocoder_model(vocoder_model) |
|
|
print(f"Loaded model: {tts.vocoder_model}") |
|
|
|
|
|
do_gc() |
|
|
|
|
|
return vocoder_model |
|
|
|
|
|
def update_tokenizer(tokenizer_json): |
|
|
if args.tts_backend != "tortoise": |
|
|
raise f"Unsupported backend: {args.tts_backend}" |
|
|
|
|
|
args.tokenizer_json = tokenizer_json |
|
|
save_args_settings() |
|
|
print(f'Stored tokenizer to settings: {tokenizer_json}') |
|
|
|
|
|
global tts |
|
|
if not tts: |
|
|
if tts_loading: |
|
|
raise Exception("TTS is still initializing...") |
|
|
return |
|
|
|
|
|
if hasattr(tts, "loading") and tts.loading: |
|
|
raise Exception("TTS is still initializing...") |
|
|
|
|
|
print(f"Loading tokenizer vocab: {tokenizer_json}") |
|
|
tts.load_tokenizer_json(tokenizer_json) |
|
|
print(f"Loaded tokenizer vocab: {tts.tokenizer_json}") |
|
|
|
|
|
do_gc() |
|
|
|
|
|
return tokenizer_json
|
|
|
|
|
def load_voicefixer(restart=False): |
|
|
global voicefixer |
|
|
|
|
|
if restart: |
|
|
unload_voicefixer() |
|
|
|
|
|
try: |
|
|
print("Loading Voicefixer") |
|
|
from voicefixer import VoiceFixer |
|
|
voicefixer = VoiceFixer() |
|
|
print("Loaded Voicefixer") |
|
|
except Exception as e: |
|
|
print(f"Error occurred while tring to initialize voicefixer: {e}") |
|
|
if voicefixer: |
|
|
del voicefixer |
|
|
voicefixer = None |
|
|
|
|
|
def unload_voicefixer(): |
|
|
global voicefixer |
|
|
|
|
|
if voicefixer: |
|
|
del voicefixer |
|
|
voicefixer = None |
|
|
print("Unloaded Voicefixer") |
|
|
|
|
|
do_gc() |
|
|
|
|
|
def load_whisper_model(language=None, model_name=None, progress=None): |
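"""Load a transcription model through the configured Whisper backend.

openai/whisper falls back to CPU when GPU loading fails;
lightmare/whispercpp takes the language as a byte string;
m-bain/whisperx additionally loads the pyannote VAD model (which needs an
HF token) and a wav2vec2 alignment model. English inputs swap in the
specialized *.en models where available.
"""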
|
|
global whisper_model |
|
|
global whisper_vad |
|
|
global whisper_diarize |
|
|
global whisper_align_model |
|
|
|
|
|
if args.whisper_backend not in WHISPER_BACKENDS: |
|
|
raise Exception(f"unavailable backend: {args.whisper_backend}") |
|
|
|
|
|
if not model_name: |
|
|
model_name = args.whisper_model |
|
|
else: |
|
|
args.whisper_model = model_name |
|
|
save_args_settings() |
|
|
|
|
|
if language and f'{model_name}.{language}' in WHISPER_SPECIALIZED_MODELS: |
|
|
model_name = f'{model_name}.{language}' |
|
|
print(f"Loading specialized model for language: {language}") |
|
|
|
|
|
notify_progress(f"Loading Whisper model: {model_name}", progress=progress) |
|
|
|
|
|
if args.whisper_backend == "openai/whisper": |
|
|
import whisper |
|
|
try: |
|
|
|
|
|
whisper_model = whisper.load_model(model_name) |
|
|
except Exception:


print("Out of VRAM; falling back to loading Whisper on CPU.")
|
|
whisper_model = whisper.load_model(model_name, device="cpu") |
|
|
elif args.whisper_backend == "lightmare/whispercpp": |
|
|
from whispercpp import Whisper |
|
|
if not language: |
|
|
language = 'auto' |
|
|
|
|
|
b_lang = language.encode('ascii') |
|
|
whisper_model = Whisper(model_name, models_dir='./models/', language=b_lang) |
|
|
elif args.whisper_backend == "m-bain/whisperx": |
|
|
import whisper, whisperx |
|
|
device = "cuda" if get_device_name() == "cuda" else "cpu" |
|
|
try: |
|
|
whisper_model = whisperx.load_model(model_name, device) |
|
|
except Exception as e: |
|
|
whisper_model = whisper.load_model(model_name, device) |
|
|
|
|
|
if not args.hf_token: |
|
|
print("No huggingface token used, needs to be saved in environment variable, otherwise will throw error loading VAD model.") |
|
|
|
|
|
try: |
|
|
from pyannote.audio import Inference, Pipeline |
|
|
whisper_vad = Inference( |
|
|
"pyannote/segmentation", |
|
|
pre_aggregation_hook=lambda segmentation: segmentation, |
|
|
use_auth_token=args.hf_token, |
|
|
device=torch.device(device), |
|
|
) |
|
|
|
|
|
|
|
|
except Exception as e: |
|
|
print(f"Failed to load the VAD model: {e}")
|
|
|
|
|
whisper_align_model = whisperx.load_align_model(model_name="WAV2VEC2_ASR_LARGE_LV60K_960H" if language=="en" else None, language_code=language, device=device) |
|
|
|
|
|
print("Loaded Whisper model") |
|
|
|
|
|
def unload_whisper(): |
|
|
global whisper_model |
|
|
global whisper_vad |
|
|
global whisper_diarize |
|
|
global whisper_align_model |
|
|
|
|
|
if whisper_vad: |
|
|
del whisper_vad |
|
|
whisper_vad = None |
|
|
|
|
|
if whisper_diarize: |
|
|
del whisper_diarize |
|
|
whisper_diarize = None |
|
|
|
|
|
if whisper_align_model: |
|
|
del whisper_align_model |
|
|
whisper_align_model = None |
|
|
|
|
|
if whisper_model: |
|
|
del whisper_model |
|
|
whisper_model = None |
|
|
print("Unloaded Whisper") |
|
|
|
|
|
do_gc() |
|
|
|
|
|
|
|
|
def merge_models( primary_model_name, secondary_model_name, alpha, progress=gr.Progress() ): |
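"""Merge two checkpoints by weighted sum: for every float tensor key,
merged = (1 - alpha) * primary + alpha * secondary; non-float and
blacklisted keys keep the primary model's values. The result is saved
under ./models/finetunes/ and a status message is returned.

Example (hypothetical paths):
    merge_models('./models/finetunes/a.pth', './models/finetunes/b.pth', 0.5)
"""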
|
|
key_blacklist = [] |
|
|
|
|
|
def weighted_sum(theta0, theta1, alpha): |
|
|
return ((1 - alpha) * theta0) + (alpha * theta1) |
|
|
|
|
|
def read_model( filename ): |
|
|
print(f"Loading {filename}") |
|
|
return torch.load(filename) |
|
|
|
|
|
theta_func = weighted_sum |
|
|
|
|
|
theta_0 = read_model(primary_model_name) |
|
|
theta_1 = read_model(secondary_model_name) |
|
|
|
|
|
for key in tqdm(theta_0.keys(), desc="Merging..."): |
|
|
if key in key_blacklist: |
|
|
print("Skipping ignored key:", key) |
|
|
continue |
|
|
|
|
|
a = theta_0[key] |
|
|
b = theta_1[key] |
|
|
|
|
|
if a.dtype != torch.float32 and a.dtype != torch.float16: |
|
|
print("Skipping key:", key, a.dtype) |
|
|
continue |
|
|
|
|
|
if b.dtype != torch.float32 and b.dtype != torch.float16: |
|
|
print("Skipping key:", key, b.dtype) |
|
|
continue |
|
|
|
|
|
theta_0[key] = theta_func(a, b, alpha) |
|
|
|
|
|
del theta_1 |
|
|
|
|
|
primary_basename = os.path.splitext(os.path.basename(primary_model_name))[0] |
|
|
secondary_basename = os.path.splitext(os.path.basename(secondary_model_name))[0] |
|
|
suffix = "{:.3f}".format(alpha) |
|
|
output_path = f'./models/finetunes/{primary_basename}_{secondary_basename}_{suffix}_merge.pth' |
|
|
|
|
|
torch.save(theta_0, output_path) |
|
|
message = f"Saved to {output_path}" |
|
|
print(message) |
|
|
return message |