|
|
|
|
|
#ifndef XTTS_V2_FULL_H |
|
|
#define XTTS_V2_FULL_H |
|
|
|
|
|
#include <ggml.h> |
|
|
#include <ggml-alloc.h> |
|
|
#include <ggml-backend.h> |
|
|
#include <cstdint> |
|
|
#include <string> |
|
|
#include <vector> |
|
|
#include <memory> |
|
|
#include <unordered_map> |
|
|
|
|
|
namespace xtts_v2 { |
|
|
|
|
|
|
|
|
// Compile-time hyperparameters of the XTTS v2 pipeline. These mirror the
// exported model checkpoint; changing any value requires matching
// re-exported weights. All members are static constexpr, so the struct
// carries no per-instance state.
struct XTTSConfig {

    // GPT-style autoregressive text model
    static constexpr int GPT_N_VOCAB = 6681;      // BPE vocabulary size
    static constexpr int GPT_N_CTX = 402;         // max context length in tokens
    static constexpr int GPT_N_EMBD = 1024;       // hidden / embedding width
    static constexpr int GPT_N_HEAD = 16;         // attention heads
    static constexpr int GPT_N_LAYER = 30;        // transformer layers
    static constexpr int GPT_INTERMEDIATE = 4096; // FFN inner width (4x embd)

    // latent decoder / reference encoder
    static constexpr int LATENT_DIM = 1024;
    static constexpr int MEL_CHANNELS = 80;       // mel filterbank bins
    static constexpr int DECODER_LAYERS = 12;
    static constexpr int REF_ENCODER_LAYERS = 6;

    // HiFi-GAN vocoder. The upsample rates multiply to 8*8*2*2 = 256, which
    // equals HOP_LENGTH below: one mel frame expands to one hop of samples.
    static constexpr int HIFIGAN_UPSAMPLE_RATES[4] = {8, 8, 2, 2};
    static constexpr int HIFIGAN_KERNEL_SIZES[4] = {16, 16, 4, 4};
    static constexpr int HIFIGAN_CHANNELS = 512;
    static constexpr int HIFIGAN_RESBLOCK_KERNELS[3] = {3, 7, 11};
    static constexpr int HIFIGAN_RESBLOCK_DILATIONS[3][3] = {
        {1, 3, 5}, {1, 3, 5}, {1, 3, 5}
    };

    // audio front-end parameters
    static constexpr int SAMPLE_RATE = 24000;     // Hz
    static constexpr int HOP_LENGTH = 256;        // samples advanced per mel frame
    static constexpr int WIN_LENGTH = 1024;       // analysis window length

    // conditioning
    static constexpr int N_LANGUAGES = 17;
    static constexpr int SPEAKER_EMBEDDING_DIM = 512;

    static constexpr int COND_LATENT_DIM = 1024;
    static constexpr int MAX_MEL_LENGTH = 605;    // hard cap on mel frames per utterance
    static constexpr int MAX_AUDIO_LENGTH = 155520; // hard cap on output samples (~6.48 s at SAMPLE_RATE)
};
|
|
|
|
|
|
|
|
// Weight tensors for every sub-network of XTTS v2. All ggml_tensor
// pointers are non-owning views whose storage lives in `buffer`; they are
// populated by the GGUF loader and released together with `ctx`/`buffer`.
struct XTTSv2Model {

    // GPT-style transformer encoding text tokens (wte) plus positions (wpe)
    // through a stack of pre-norm attention/FFN blocks.
    struct TextEncoder {
        ggml_tensor* wte;  // token embedding table
        ggml_tensor* wpe;  // positional embedding table

        struct Layer {
            // attention sub-block
            ggml_tensor* ln1_g;     // layer-norm gain
            ggml_tensor* ln1_b;     // layer-norm bias
            ggml_tensor* attn_qkv;  // fused Q/K/V projection
            ggml_tensor* attn_proj; // attention output projection

            // feed-forward sub-block
            ggml_tensor* ln2_g;
            ggml_tensor* ln2_b;
            ggml_tensor* ffn_fc1;   // expand (presumably GPT_N_EMBD -> GPT_INTERMEDIATE)
            ggml_tensor* ffn_fc2;   // contract back
        };
        std::vector<Layer> layers;  // expected size: XTTSConfig::GPT_N_LAYER

        // final layer norm
        ggml_tensor* ln_f_g;
        ggml_tensor* ln_f_b;
    } text_encoder;

    // Encodes reference audio's mel spectrogram into a speaker embedding:
    // conv stack -> GRU -> linear projection.
    struct ReferenceEncoder {
        ggml_tensor* mel_conv1;  // initial conv over the mel input

        struct ConvBlock {
            ggml_tensor* conv;
            ggml_tensor* norm_g;  // normalization gain
            ggml_tensor* norm_b;  // normalization bias
        };
        std::vector<ConvBlock> conv_blocks;  // expected size: XTTSConfig::REF_ENCODER_LAYERS

        // GRU recurrence: input-hidden weights, hidden-hidden weights, bias
        ggml_tensor* gru_ih;
        ggml_tensor* gru_hh;
        ggml_tensor* gru_bias;

        // projects the GRU state to the speaker embedding
        ggml_tensor* speaker_proj;
    } ref_encoder;

    // Transformer decoder producing mel frames from latents plus speaker
    // conditioning, with a stop head to terminate generation.
    struct LatentDecoder {
        ggml_tensor* latent_proj;  // maps incoming latents into decoder space

        struct DecoderLayer {
            // self-attention over already-generated positions
            ggml_tensor* sa_ln_g;
            ggml_tensor* sa_ln_b;
            ggml_tensor* sa_qkv;   // fused Q/K/V
            ggml_tensor* sa_proj;

            // cross-attention into the conditioning sequence;
            // Q is separate, K/V are fused
            ggml_tensor* ca_ln_g;
            ggml_tensor* ca_ln_b;
            ggml_tensor* ca_q;
            ggml_tensor* ca_kv;
            ggml_tensor* ca_proj;

            // feed-forward sub-block
            ggml_tensor* ffn_ln_g;
            ggml_tensor* ffn_ln_b;
            ggml_tensor* ffn_fc1;
            ggml_tensor* ffn_fc2;
        };
        std::vector<DecoderLayer> layers;  // expected size: XTTSConfig::DECODER_LAYERS

        ggml_tensor* mel_head;   // projects hidden state to mel channels
        ggml_tensor* stop_head;  // stop-token / end-of-utterance head
    } decoder;

    // HiFi-GAN style vocoder: mel -> waveform via transposed-conv
    // upsampling stages, each followed by residual blocks
    // (see HIFIGAN_* constants in XTTSConfig).
    struct Vocoder {
        ggml_tensor* conv_pre;  // input conv ahead of the upsampling stack

        struct UpsampleBlock {
            ggml_tensor* conv_transpose;  // one upsampling stage

            struct ResBlock {
                ggml_tensor* conv1;
                ggml_tensor* conv2;
            };
            std::vector<ResBlock> res_blocks;  // presumably one per HIFIGAN_RESBLOCK_KERNELS entry
        };
        std::vector<UpsampleBlock> upsample_blocks;  // presumably 4, matching HIFIGAN_UPSAMPLE_RATES

        ggml_tensor* conv_post;  // final conv producing the waveform
    } vocoder;

    // Global conditioning embedding tables.
    struct Conditioning {
        ggml_tensor* speaker_embedding;
        ggml_tensor* language_embedding;  // presumably N_LANGUAGES rows — confirm against loader
        ggml_tensor* style_embedding;
    } conditioning;

    // ggml bookkeeping: `ctx` owns tensor metadata, `buffer` owns the
    // weight bytes allocated on `backend`.
    ggml_context* ctx = nullptr;
    ggml_backend_t backend = nullptr;
    ggml_backend_buffer_t buffer = nullptr;
    size_t buffer_size = 0;
};
|
|
|
|
|
|
|
|
struct XTTSKVCache { |
|
|
|
|
|
struct { |
|
|
ggml_tensor* k[30]; |
|
|
ggml_tensor* v[30]; |
|
|
int n_cached = 0; |
|
|
} text_cache; |
|
|
|
|
|
|
|
|
struct { |
|
|
ggml_tensor* k[12]; |
|
|
ggml_tensor* v[12]; |
|
|
ggml_tensor* cross_k[12]; |
|
|
ggml_tensor* cross_v[12]; |
|
|
int n_cached = 0; |
|
|
} decoder_cache; |
|
|
}; |
|
|
|
|
|
|
|
|
class XTTSv2Inference { |
|
|
public: |
|
|
XTTSv2Inference(); |
|
|
~XTTSv2Inference(); |
|
|
|
|
|
|
|
|
bool load_model(const std::string& model_path, bool use_mmap = true); |
|
|
|
|
|
|
|
|
std::vector<float> synthesize( |
|
|
const std::string& text, |
|
|
const std::string& language = "en", |
|
|
const std::vector<float>& speaker_wav = {}, |
|
|
float temperature = 0.65f, |
|
|
float length_penalty = 1.0f, |
|
|
float repetition_penalty = 2.0f, |
|
|
float top_k = 50, |
|
|
float top_p = 0.85f, |
|
|
float speed = 1.0f |
|
|
); |
|
|
|
|
|
|
|
|
struct InferenceComponents { |
|
|
std::vector<int32_t> tokens; |
|
|
ggml_tensor* text_embeddings; |
|
|
ggml_tensor* speaker_embedding; |
|
|
ggml_tensor* latents; |
|
|
ggml_tensor* mel_spectrogram; |
|
|
std::vector<float> audio; |
|
|
}; |
|
|
|
|
|
InferenceComponents synthesize_components( |
|
|
const std::string& text, |
|
|
const std::string& language = "en", |
|
|
const std::vector<float>& speaker_wav = {} |
|
|
); |
|
|
|
|
|
|
|
|
class Stream { |
|
|
public: |
|
|
Stream(XTTSv2Inference* parent, const std::string& text, |
|
|
const std::string& language, const std::vector<float>& speaker_wav); |
|
|
~Stream(); |
|
|
|
|
|
std::vector<float> get_chunk(size_t max_samples = 4096); |
|
|
bool is_done() const { return done; } |
|
|
|
|
|
private: |
|
|
XTTSv2Inference* parent; |
|
|
InferenceComponents components; |
|
|
size_t mel_offset = 0; |
|
|
size_t audio_offset = 0; |
|
|
bool done = false; |
|
|
|
|
|
void generate_next_mel_chunk(); |
|
|
std::vector<float> vocoder_chunk(ggml_tensor* mel_chunk); |
|
|
}; |
|
|
|
|
|
std::unique_ptr<Stream> create_stream( |
|
|
const std::string& text, |
|
|
const std::string& language = "en", |
|
|
const std::vector<float>& speaker_wav = {} |
|
|
); |
|
|
|
|
|
private: |
|
|
XTTSConfig config; |
|
|
XTTSv2Model model; |
|
|
XTTSKVCache kv_cache; |
|
|
|
|
|
|
|
|
struct gguf_context* gguf_ctx = nullptr; |
|
|
void* mapped_memory = nullptr; |
|
|
size_t mapped_size = 0; |
|
|
|
|
|
|
|
|
ggml_cgraph* gf = nullptr; |
|
|
ggml_gallocr* allocr = nullptr; |
|
|
|
|
|
|
|
|
std::unordered_map<std::string, int32_t> bpe_vocab; |
|
|
std::vector<std::pair<std::string, std::string>> bpe_merges; |
|
|
|
|
|
|
|
|
bool load_gguf_weights(const std::string& path, bool use_mmap); |
|
|
void init_model_architecture(); |
|
|
|
|
|
|
|
|
std::vector<int32_t> tokenize(const std::string& text); |
|
|
std::vector<std::string> bpe_encode(const std::string& text); |
|
|
|
|
|
|
|
|
ggml_tensor* text_encoder_forward( |
|
|
const std::vector<int32_t>& tokens, |
|
|
const std::string& language |
|
|
); |
|
|
|
|
|
ggml_tensor* reference_encoder_forward( |
|
|
const std::vector<float>& audio_wav |
|
|
); |
|
|
|
|
|
ggml_tensor* decoder_forward( |
|
|
ggml_tensor* text_embeddings, |
|
|
ggml_tensor* speaker_embedding, |
|
|
float temperature, |
|
|
float length_penalty |
|
|
); |
|
|
|
|
|
std::vector<float> vocoder_forward( |
|
|
ggml_tensor* mel_spectrogram |
|
|
); |
|
|
|
|
|
|
|
|
ggml_tensor* multi_head_attention( |
|
|
ggml_tensor* q, ggml_tensor* k, ggml_tensor* v, |
|
|
int n_heads, bool use_cache = true |
|
|
); |
|
|
|
|
|
ggml_tensor* cross_attention( |
|
|
ggml_tensor* queries, |
|
|
ggml_tensor* keys, |
|
|
ggml_tensor* values, |
|
|
int n_heads |
|
|
); |
|
|
|
|
|
|
|
|
ggml_tensor* layer_norm(ggml_tensor* x, ggml_tensor* g, ggml_tensor* b, float eps = 1e-5f); |
|
|
ggml_tensor* gelu(ggml_tensor* x); |
|
|
ggml_tensor* conv1d(ggml_tensor* x, ggml_tensor* w, ggml_tensor* b, int stride, int padding); |
|
|
ggml_tensor* conv_transpose1d(ggml_tensor* x, ggml_tensor* w, ggml_tensor* b, int stride, int padding); |
|
|
|
|
|
|
|
|
std::vector<int32_t> sample_latents( |
|
|
ggml_tensor* logits, |
|
|
float temperature, |
|
|
float top_k, |
|
|
float top_p, |
|
|
float repetition_penalty |
|
|
); |
|
|
}; |
|
|
|
|
|
|
|
|
// Hand-written ARM NEON kernels for quantized inference. Declarations
// only; the definitions live elsewhere. The whole set is compiled out on
// targets without NEON.
namespace kernels {
#ifdef __ARM_NEON

// Quantized GEMM: C (m x n) = dequant(A_q4) * B with per-block scales.
// NOTE(review): the exact 4-bit packing (block size, nibble order) is
// defined by the implementation — confirm before calling directly.
void gemm_q4_neon(
    const uint8_t* a_q4,   // 4-bit quantized matrix A
    const float* b,        // dense input matrix
    float* c,              // dense output matrix
    int m, int k, int n,
    const float* scales    // dequantization scales for A
);

// 1-D convolution with both input and kernel quantized to 8 bits.
void conv1d_q8_neon(
    const uint8_t* input_q8,
    const uint8_t* kernel_q8,
    float* output,
    int batch, int in_c, int out_c,
    int length, int kernel_size,
    int stride, int padding,
    const float* input_scale,   // dequantization scales for the input
    const float* kernel_scale   // dequantization scales for the kernel
);

// Attention over 4-bit quantized Q/K/V tensors, writing a float result.
void attention_q4_neon(
    const uint8_t* q_q4,
    const uint8_t* k_q4,
    const uint8_t* v_q4,
    float* output,
    int seq_len, int n_heads, int head_dim,
    const float* q_scale,
    const float* k_scale,
    const float* v_scale
);

#endif
}  // namespace kernels
|
|
|
|
|
|
|
|
// Flat C ABI for foreign-language bindings. Although this block sits
// inside namespace xtts_v2, extern "C" gives these functions unqualified
// C linkage. Every float* buffer returned here must be released with
// xtts_v2_free_audio(); handles with xtts_v2_free()/xtts_v2_stream_free().
extern "C" {
// Loads a model and returns an opaque engine handle (presumably null on
// failure — confirm against the implementation).
void* xtts_v2_init(const char* model_path, bool use_mmap);

// Full synthesis: returns a heap-allocated sample buffer and writes its
// length to *out_len. speaker_wav may be null when speaker_wav_len is 0.
float* xtts_v2_synthesize(
    void* model,
    const char* text,
    const char* language,
    const float* speaker_wav,
    size_t speaker_wav_len,
    float temperature,
    float speed,
    size_t* out_len
);

// Creates a streaming synthesis session over an existing engine handle.
void* xtts_v2_stream_init(
    void* model,
    const char* text,
    const char* language,
    const float* speaker_wav,
    size_t speaker_wav_len
);

// Pulls the next chunk of samples from a stream and writes the actual
// count to *out_len (a zero count presumably signals end-of-stream —
// confirm against the implementation).
float* xtts_v2_stream_chunk(
    void* stream,
    size_t chunk_size,
    size_t* out_len
);

// Destroys a streaming session.
void xtts_v2_stream_free(void* stream);

// Destroys an engine handle and all resources it owns.
void xtts_v2_free(void* model);

// Frees a sample buffer returned by the synthesize/chunk calls above.
void xtts_v2_free_audio(float* audio);
}
|
|
|
|
|
} |
|
|
|
|
|
#endif |