// Repository path: xtts-gguf / cpp / xtts_v2_full.h
// Uploaded by: bnewton-genmedlabs
// Commit message: Add full XTTS v2 architecture header with GPT-2 encoder and HiFi-GAN vocoder
// Commit: 1667f3a (verified)
// xtts_v2_full.h - Full XTTS v2 GGUF Implementation
#ifndef XTTS_V2_FULL_H
#define XTTS_V2_FULL_H
#include <ggml.h>
#include <ggml-alloc.h>
#include <ggml-backend.h>
#include <cstdint>
#include <string>
#include <vector>
#include <memory>
#include <unordered_map>
namespace xtts_v2 {
/// Compile-time architecture constants for XTTS v2.
///
/// Every member is `static constexpr`, so the struct has no per-instance
/// state; the values are reachable as `XTTSConfig::NAME`.
struct XTTSConfig {
    // ---------------------------------------------------------------------
    // GPT-2 text encoder
    // ---------------------------------------------------------------------
    static constexpr int GPT_N_VOCAB{6681};       ///< BPE vocabulary size
    static constexpr int GPT_N_CTX{402};          ///< Maximum context length
    static constexpr int GPT_N_EMBD{1024};        ///< Hidden dimension
    static constexpr int GPT_N_HEAD{16};          ///< Number of attention heads
    static constexpr int GPT_N_LAYER{30};         ///< Number of transformer layers
    static constexpr int GPT_INTERMEDIATE{4096};  ///< FFN intermediate size

    // ---------------------------------------------------------------------
    // Latent diffusion decoder
    // ---------------------------------------------------------------------
    static constexpr int LATENT_DIM{1024};        ///< Latent vector dimension
    static constexpr int MEL_CHANNELS{80};        ///< Mel spectrogram bins
    static constexpr int DECODER_LAYERS{12};      ///< Decoder depth
    static constexpr int REF_ENCODER_LAYERS{6};   ///< Reference encoder layers

    // ---------------------------------------------------------------------
    // HiFi-GAN vocoder
    // ---------------------------------------------------------------------
    static constexpr int HIFIGAN_UPSAMPLE_RATES[4]{8, 8, 2, 2};
    static constexpr int HIFIGAN_KERNEL_SIZES[4]{16, 16, 4, 4};
    static constexpr int HIFIGAN_CHANNELS{512};
    static constexpr int HIFIGAN_RESBLOCK_KERNELS[3]{3, 7, 11};
    /// One row of three dilation values per entry of HIFIGAN_RESBLOCK_KERNELS.
    static constexpr int HIFIGAN_RESBLOCK_DILATIONS[3][3]{
        {1, 3, 5},
        {1, 3, 5},
        {1, 3, 5},
    };

    // ---------------------------------------------------------------------
    // Audio settings
    // ---------------------------------------------------------------------
    static constexpr int SAMPLE_RATE{24000};
    static constexpr int HOP_LENGTH{256};
    static constexpr int WIN_LENGTH{1024};

    // ---------------------------------------------------------------------
    // Languages (17 supported) and speaker embedding width
    // ---------------------------------------------------------------------
    static constexpr int N_LANGUAGES{17};
    static constexpr int SPEAKER_EMBEDDING_DIM{512};

    // ---------------------------------------------------------------------
    // Conditioning
    // ---------------------------------------------------------------------
    static constexpr int COND_LATENT_DIM{1024};
    static constexpr int MAX_MEL_LENGTH{605};
    static constexpr int MAX_AUDIO_LENGTH{155520};  ///< ~6.5 seconds @ 24kHz
};
/// Full XTTS v2 model components: named weight tensors grouped by sub-network,
/// plus the ggml context/backend/buffer handles they live in.
///
/// Every tensor pointer defaults to nullptr (matching the handles at the
/// bottom of the struct) so a default-constructed model is in a well-defined
/// "nothing loaded" state instead of holding indeterminate pointers.
/// The struct declares no constructor or destructor of its own; it only names
/// the tensors and handles.
struct XTTSv2Model {
    /// Text Encoder (GPT-2 style)
    struct TextEncoder {
        ggml_tensor* wte = nullptr;  // Token embeddings [n_vocab, n_embd]
        ggml_tensor* wpe = nullptr;  // Position embeddings [n_ctx, n_embd]

        /// Per-layer components
        struct Layer {
            // Attention
            ggml_tensor* ln1_g = nullptr;      // LayerNorm gain
            ggml_tensor* ln1_b = nullptr;      // LayerNorm bias
            ggml_tensor* attn_qkv = nullptr;   // Combined QKV projection
            ggml_tensor* attn_proj = nullptr;  // Output projection
            // FFN
            ggml_tensor* ln2_g = nullptr;      // LayerNorm gain
            ggml_tensor* ln2_b = nullptr;      // LayerNorm bias
            ggml_tensor* ffn_fc1 = nullptr;    // FFN first layer
            ggml_tensor* ffn_fc2 = nullptr;    // FFN second layer
        };
        std::vector<Layer> layers;

        ggml_tensor* ln_f_g = nullptr;  // Final LayerNorm gain
        ggml_tensor* ln_f_b = nullptr;  // Final LayerNorm bias
    } text_encoder;

    /// Reference Encoder (for voice cloning)
    struct ReferenceEncoder {
        ggml_tensor* mel_conv1 = nullptr;  // Initial mel convolution

        /// One convolution followed by a normalization (gain + bias).
        struct ConvBlock {
            ggml_tensor* conv = nullptr;
            ggml_tensor* norm_g = nullptr;
            ggml_tensor* norm_b = nullptr;
        };
        std::vector<ConvBlock> conv_blocks;

        ggml_tensor* gru_ih = nullptr;        // GRU input-hidden weights
        ggml_tensor* gru_hh = nullptr;        // GRU hidden-hidden weights
        ggml_tensor* gru_bias = nullptr;      // GRU bias
        ggml_tensor* speaker_proj = nullptr;  // Project to speaker embedding
    } ref_encoder;

    /// Latent Diffusion Decoder
    struct LatentDecoder {
        ggml_tensor* latent_proj = nullptr;  // Project latents to hidden

        /// Weights of one decoder layer: self-attention, cross-attention, FFN.
        struct DecoderLayer {
            // Self-attention
            ggml_tensor* sa_ln_g = nullptr;
            ggml_tensor* sa_ln_b = nullptr;
            ggml_tensor* sa_qkv = nullptr;
            ggml_tensor* sa_proj = nullptr;
            // Cross-attention (to text)
            ggml_tensor* ca_ln_g = nullptr;
            ggml_tensor* ca_ln_b = nullptr;
            ggml_tensor* ca_q = nullptr;
            ggml_tensor* ca_kv = nullptr;
            ggml_tensor* ca_proj = nullptr;
            // FFN
            ggml_tensor* ffn_ln_g = nullptr;
            ggml_tensor* ffn_ln_b = nullptr;
            ggml_tensor* ffn_fc1 = nullptr;
            ggml_tensor* ffn_fc2 = nullptr;
        };
        std::vector<DecoderLayer> layers;

        ggml_tensor* mel_head = nullptr;   // Project to mel spectrogram
        ggml_tensor* stop_head = nullptr;  // Predict stop token
    } decoder;

    /// HiFi-GAN Vocoder
    struct Vocoder {
        ggml_tensor* conv_pre = nullptr;  // Pre-conv [80, 512, 7]

        /// One upsampling stage: a transposed convolution plus its residual blocks.
        struct UpsampleBlock {
            ggml_tensor* conv_transpose = nullptr;  // Transposed convolution

            struct ResBlock {
                ggml_tensor* conv1 = nullptr;
                ggml_tensor* conv2 = nullptr;
            };
            std::vector<ResBlock> res_blocks;
        };
        std::vector<UpsampleBlock> upsample_blocks;

        ggml_tensor* conv_post = nullptr;  // Post-conv [512, 1, 7]
    } vocoder;

    /// Conditioning layers
    struct Conditioning {
        ggml_tensor* speaker_embedding = nullptr;   // Speaker lookup table
        ggml_tensor* language_embedding = nullptr;  // Language embeddings
        ggml_tensor* style_embedding = nullptr;     // Style tokens (optional)
    } conditioning;

    // Model context
    ggml_context* ctx = nullptr;
    ggml_backend_t backend = nullptr;
    ggml_backend_buffer_t buffer = nullptr;
    size_t buffer_size = 0;
};
/// KV cache for autoregressive generation.
///
/// Holds one K and one V tensor pointer per layer for the text encoder and
/// the decoder, plus per-layer cross-attention K/V pointers for the decoder.
/// All pointer arrays are value-initialized to nullptr so a freshly
/// constructed cache never exposes indeterminate pointers; `n_cached` starts
/// at 0 in both sub-caches.
struct XTTSKVCache {
    // Text encoder cache.
    // The array length 30 must stay equal to XTTSConfig::GPT_N_LAYER.
    struct {
        ggml_tensor* k[30] = {};  // K cache per layer
        ggml_tensor* v[30] = {};  // V cache per layer
        int n_cached = 0;
    } text_cache;

    // Decoder cache.
    // The array length 12 must stay equal to XTTSConfig::DECODER_LAYERS.
    struct {
        ggml_tensor* k[12] = {};        // K cache per layer
        ggml_tensor* v[12] = {};        // V cache per layer
        ggml_tensor* cross_k[12] = {};  // Cross-attention K cache
        ggml_tensor* cross_v[12] = {};  // Cross-attention V cache
        int n_cached = 0;
    } decoder_cache;
};
/// Main XTTS v2 inference engine.
///
/// Declares model loading, one-shot synthesis, component-wise synthesis for
/// debugging, and a chunked streaming interface. Only declarations live in
/// this header; the definitions are expected in a separate translation unit.
///
/// The class holds raw handles (GGUF context, mapped memory, compute graph,
/// graph allocator) and declares a destructor, so copying is disabled: an
/// implicit member-wise copy would duplicate those raw pointers.
class XTTSv2Inference {
public:
    XTTSv2Inference();
    ~XTTSv2Inference();

    // Non-copyable: see class comment. (Moves are likewise not generated
    // because a destructor and copy operations are user-declared.)
    XTTSv2Inference(const XTTSv2Inference&) = delete;
    XTTSv2Inference& operator=(const XTTSv2Inference&) = delete;

    /// Load model from a GGUF file.
    /// @param model_path Path to the GGUF file.
    /// @param use_mmap   Forwarded mmap preference (defaults to true).
    /// @return bool status; presumably true on success — confirm against the
    ///         implementation.
    bool load_model(const std::string& model_path, bool use_mmap = true);

    /// High-level TTS interface.
    /// @param text               Input text.
    /// @param language           Language identifier (default "en").
    /// @param speaker_wav        Optional reference audio; empty by default.
    /// @param temperature        Sampling temperature.
    /// @param length_penalty     Length penalty.
    /// @param repetition_penalty Repetition penalty.
    /// @param top_k              Top-k cutoff. NOTE(review): declared as float
    ///                           although the default is the integral value 50;
    ///                           left unchanged so the signature keeps matching
    ///                           the out-of-line definition.
    /// @param top_p              Top-p (nucleus) cutoff.
    /// @param speed              Speed factor.
    /// @return Audio samples as floats; presumably at XTTSConfig::SAMPLE_RATE —
    ///         TODO confirm.
    std::vector<float> synthesize(
        const std::string& text,
        const std::string& language = "en",
        const std::vector<float>& speaker_wav = {},  // Optional reference audio
        float temperature = 0.65f,
        float length_penalty = 1.0f,
        float repetition_penalty = 2.0f,
        float top_k = 50,
        float top_p = 0.85f,
        float speed = 1.0f
    );

    /// Component-wise inference results (for debugging/testing).
    /// Tensor pointers default to nullptr so a default-constructed value
    /// (e.g. the one embedded in Stream) is well defined. Who owns the
    /// pointed-to tensors is not visible from this header.
    struct InferenceComponents {
        std::vector<int32_t> tokens;               // BPE tokens
        ggml_tensor* text_embeddings = nullptr;    // Text encoder output
        ggml_tensor* speaker_embedding = nullptr;  // Speaker embedding
        ggml_tensor* latents = nullptr;            // Decoder latents
        ggml_tensor* mel_spectrogram = nullptr;    // Generated mel
        std::vector<float> audio;                  // Final audio
    };

    /// Runs synthesis and returns the intermediate results alongside the audio.
    InferenceComponents synthesize_components(
        const std::string& text,
        const std::string& language = "en",
        const std::vector<float>& speaker_wav = {}
    );

    /// Streaming interface: audio is pulled chunk by chunk via get_chunk().
    class Stream {
    public:
        /// @param parent Engine used for generation; stored as a raw,
        ///               non-owning pointer, so it must outlive the stream.
        Stream(XTTSv2Inference* parent, const std::string& text,
               const std::string& language, const std::vector<float>& speaker_wav);
        ~Stream();

        // Non-copyable: the stream declares a destructor and carries raw
        // tensor pointers plus a back-pointer to its parent engine.
        Stream(const Stream&) = delete;
        Stream& operator=(const Stream&) = delete;

        /// Returns the next chunk of audio, requesting at most `max_samples`
        /// samples (default 4096).
        std::vector<float> get_chunk(size_t max_samples = 4096);

        /// Reports the `done` flag.
        bool is_done() const { return done; }

    private:
        XTTSv2Inference* parent = nullptr;
        InferenceComponents components;
        size_t mel_offset = 0;
        size_t audio_offset = 0;
        bool done = false;

        void generate_next_mel_chunk();
        std::vector<float> vocoder_chunk(ggml_tensor* mel_chunk);
    };

    /// Creates a heap-allocated Stream owned by the returned unique_ptr.
    std::unique_ptr<Stream> create_stream(
        const std::string& text,
        const std::string& language = "en",
        const std::vector<float>& speaker_wav = {}
    );

private:
    XTTSConfig config;
    XTTSv2Model model;
    XTTSKVCache kv_cache;

    // GGUF file handling
    // NOTE(review): the elaborated specifier `struct gguf_context` relies on
    // the type already being declared by an included ggml header; if it is
    // not, this line would introduce a distinct xtts_v2::gguf_context —
    // verify against the ggml version in use.
    struct gguf_context* gguf_ctx = nullptr;
    void* mapped_memory = nullptr;
    size_t mapped_size = 0;

    // Computation graph
    ggml_cgraph* gf = nullptr;
    ggml_gallocr* allocr = nullptr;

    // Tokenizer
    std::unordered_map<std::string, int32_t> bpe_vocab;
    std::vector<std::pair<std::string, std::string>> bpe_merges;

    // Internal methods
    bool load_gguf_weights(const std::string& path, bool use_mmap);
    void init_model_architecture();

    // Text processing
    std::vector<int32_t> tokenize(const std::string& text);
    std::vector<std::string> bpe_encode(const std::string& text);

    // Model forward passes
    ggml_tensor* text_encoder_forward(
        const std::vector<int32_t>& tokens,
        const std::string& language
    );
    ggml_tensor* reference_encoder_forward(
        const std::vector<float>& audio_wav
    );
    ggml_tensor* decoder_forward(
        ggml_tensor* text_embeddings,
        ggml_tensor* speaker_embedding,
        float temperature,
        float length_penalty
    );
    std::vector<float> vocoder_forward(
        ggml_tensor* mel_spectrogram
    );

    // Attention mechanisms
    ggml_tensor* multi_head_attention(
        ggml_tensor* q, ggml_tensor* k, ggml_tensor* v,
        int n_heads, bool use_cache = true
    );
    ggml_tensor* cross_attention(
        ggml_tensor* queries,
        ggml_tensor* keys,
        ggml_tensor* values,
        int n_heads
    );

    // Helper functions
    ggml_tensor* layer_norm(ggml_tensor* x, ggml_tensor* g, ggml_tensor* b, float eps = 1e-5f);
    ggml_tensor* gelu(ggml_tensor* x);
    ggml_tensor* conv1d(ggml_tensor* x, ggml_tensor* w, ggml_tensor* b, int stride, int padding);
    ggml_tensor* conv_transpose1d(ggml_tensor* x, ggml_tensor* w, ggml_tensor* b, int stride, int padding);

    // Sampling
    // NOTE(review): top_k is a float here as well, mirroring synthesize().
    std::vector<int32_t> sample_latents(
        ggml_tensor* logits,
        float temperature,
        float top_k,
        float top_p,
        float repetition_penalty
    );
};
/// NEON-optimized kernels for ARM.
///
/// The namespace is always present, but its contents are declared only when
/// the compiler defines __ARM_NEON. This header carries declarations only;
/// the kernel definitions live elsewhere.
namespace kernels {

#ifdef __ARM_NEON

/// GEMM kernel taking a packed `a_q4` operand with `scales`, a float `b`
/// operand, and writing floats to `c`.
/// NOTE(review): presumably A is 4-bit quantized with dimensions m x k and
/// B is k x n — confirm layout and packing against the implementation.
void gemm_q4_neon(const uint8_t* a_q4,
                  const float*   b,
                  float*         c,
                  int m, int k, int n,
                  const float*   scales);

/// 1-D convolution kernel taking byte-packed input and kernel buffers with
/// their scale pointers, and writing floats to `output`.
/// NOTE(review): presumably both operands are 8-bit quantized — confirm.
void conv1d_q8_neon(const uint8_t* input_q8,
                    const uint8_t* kernel_q8,
                    float*         output,
                    int batch, int in_c, int out_c,
                    int length, int kernel_size,
                    int stride, int padding,
                    const float*   input_scale,
                    const float*   kernel_scale);

/// Attention kernel taking byte-packed Q, K and V buffers with one scale
/// pointer each, and writing floats to `output`.
/// NOTE(review): presumably Q/K/V are 4-bit quantized — confirm.
void attention_q4_neon(const uint8_t* q_q4,
                       const uint8_t* k_q4,
                       const uint8_t* v_q4,
                       float*         output,
                       int seq_len, int n_heads, int head_dim,
                       const float*   q_scale,
                       const float*   k_scale,
                       const float*   v_scale);

#endif  // __ARM_NEON

}  // namespace kernels
/// C API for React Native / FFI.
///
/// All handles cross the boundary as opaque `void*`. Declarations only; the
/// definitions live elsewhere. Ownership notes below are assumptions drawn
/// from the function names — confirm against the implementation.
extern "C" {

/// Creates a model handle from the GGUF file at `model_path`.
/// Presumably paired with xtts_v2_free() — confirm.
void* xtts_v2_init(const char* model_path, bool use_mmap);

/// One-shot synthesis. `speaker_wav` points at `speaker_wav_len` floats of
/// reference audio; the number of returned samples is reported via `out_len`.
/// Presumably the returned buffer is released with xtts_v2_free_audio() —
/// confirm.
float* xtts_v2_synthesize(void*        model,
                          const char*  text,
                          const char*  language,
                          const float* speaker_wav,
                          size_t       speaker_wav_len,
                          float        temperature,
                          float        speed,
                          size_t*      out_len);

/// Creates a streaming handle for `text`.
/// Presumably paired with xtts_v2_stream_free() — confirm.
void* xtts_v2_stream_init(void*        model,
                          const char*  text,
                          const char*  language,
                          const float* speaker_wav,
                          size_t       speaker_wav_len);

/// Fetches the next chunk from a stream, requesting `chunk_size` samples;
/// the produced length is reported via `out_len`.
float* xtts_v2_stream_chunk(void*   stream,
                            size_t  chunk_size,
                            size_t* out_len);

/// Releases a stream handle.
void xtts_v2_stream_free(void* stream);

/// Releases a model handle.
void xtts_v2_free(void* model);

/// Releases an audio buffer.
void xtts_v2_free_audio(float* audio);

}  // extern "C"
} // namespace xtts_v2
#endif // XTTS_V2_FULL_H