xtts-gguf / cpp /xtts_v2_full.h

Add full XTTS v2 architecture header with GPT-2 encoder and HiFi-GAN vocoder

1667f3a verified 4 months ago

11.8 kB

	// xtts_v2_full.h - Full XTTS v2 GGUF Implementation
	#ifndef XTTS_V2_FULL_H
	#define XTTS_V2_FULL_H

	#include <ggml.h>
	#include <ggml-alloc.h>
	#include <ggml-backend.h>
	#include <cstdint>
	#include <string>
	#include <vector>
	#include <memory>
	#include <unordered_map>

	namespace xtts_v2 {

	// Architecture constants for XTTS v2.
	// All members are static compile-time constants; the struct itself carries
	// no per-instance state.
	struct XTTSConfig {
	    // ---- GPT-2 text encoder -------------------------------------------
	    static constexpr int GPT_N_VOCAB      = 6681;  // BPE vocabulary size
	    static constexpr int GPT_N_CTX        = 402;   // maximum context length
	    static constexpr int GPT_N_EMBD       = 1024;  // hidden dimension
	    static constexpr int GPT_N_HEAD       = 16;    // attention heads
	    static constexpr int GPT_N_LAYER      = 30;    // transformer layers
	    static constexpr int GPT_INTERMEDIATE = 4096;  // FFN intermediate size

	    // ---- Latent diffusion decoder -------------------------------------
	    static constexpr int LATENT_DIM         = 1024;  // latent vector dimension
	    static constexpr int MEL_CHANNELS       = 80;    // mel spectrogram bins
	    static constexpr int DECODER_LAYERS     = 12;    // decoder depth
	    static constexpr int REF_ENCODER_LAYERS = 6;     // reference encoder layers

	    // ---- HiFi-GAN vocoder ---------------------------------------------
	    // One upsample rate / kernel size per upsampling stage (4 stages).
	    static constexpr int HIFIGAN_UPSAMPLE_RATES[4] = {8, 8, 2, 2};
	    static constexpr int HIFIGAN_KERNEL_SIZES[4]   = {16, 16, 4, 4};
	    static constexpr int HIFIGAN_CHANNELS          = 512;
	    // Three residual-block kernel sizes, each with its own row of dilations.
	    static constexpr int HIFIGAN_RESBLOCK_KERNELS[3] = {3, 7, 11};
	    static constexpr int HIFIGAN_RESBLOCK_DILATIONS[3][3] = {
	        {1, 3, 5},
	        {1, 3, 5},
	        {1, 3, 5}
	    };

	    // ---- Audio settings -----------------------------------------------
	    static constexpr int SAMPLE_RATE = 24000;
	    static constexpr int HOP_LENGTH  = 256;
	    static constexpr int WIN_LENGTH  = 1024;

	    // ---- Languages (17 supported) and speaker embedding ---------------
	    static constexpr int N_LANGUAGES           = 17;
	    static constexpr int SPEAKER_EMBEDDING_DIM = 512;

	    // ---- Conditioning -------------------------------------------------
	    static constexpr int COND_LATENT_DIM  = 1024;
	    static constexpr int MAX_MEL_LENGTH   = 605;
	    static constexpr int MAX_AUDIO_LENGTH = 155520;  // ~6.5 seconds @ 24kHz
	};

	// Full XTTS v2 Model Components
	//
	// Plain aggregate of weight-tensor handles grouped by sub-network. Every
	// tensor pointer defaults to nullptr (matching ctx/backend/buffer below), so
	// a freshly constructed model is in a well-defined "nothing loaded" state and
	// a missing weight can be detected instead of reading an indeterminate
	// pointer.
	// NOTE(review): this header does not show who releases ctx, backend, buffer
	// or the tensors - confirm ownership in the implementation file.
	struct XTTSv2Model {
	    // Text Encoder (GPT-2 style)
	    struct TextEncoder {
	        ggml_tensor* wte = nullptr;  // Token embeddings [n_vocab, n_embd]
	        ggml_tensor* wpe = nullptr;  // Position embeddings [n_ctx, n_embd]

	        // Weights of one transformer layer.
	        struct Layer {
	            // Attention
	            ggml_tensor* ln1_g     = nullptr;  // LayerNorm gain
	            ggml_tensor* ln1_b     = nullptr;  // LayerNorm bias
	            ggml_tensor* attn_qkv  = nullptr;  // Combined QKV projection
	            ggml_tensor* attn_proj = nullptr;  // Output projection

	            // FFN
	            ggml_tensor* ln2_g   = nullptr;  // LayerNorm gain
	            ggml_tensor* ln2_b   = nullptr;  // LayerNorm bias
	            ggml_tensor* ffn_fc1 = nullptr;  // FFN first layer
	            ggml_tensor* ffn_fc2 = nullptr;  // FFN second layer
	        };
	        // Empty until populated; presumably one entry per
	        // XTTSConfig::GPT_N_LAYER - TODO confirm against the loader.
	        std::vector<Layer> layers;

	        ggml_tensor* ln_f_g = nullptr;  // Final LayerNorm gain
	        ggml_tensor* ln_f_b = nullptr;  // Final LayerNorm bias
	    } text_encoder;

	    // Reference Encoder (for voice cloning)
	    struct ReferenceEncoder {
	        ggml_tensor* mel_conv1 = nullptr;  // Initial mel convolution

	        // One convolution followed by a normalization (gain + bias).
	        struct ConvBlock {
	            ggml_tensor* conv   = nullptr;
	            ggml_tensor* norm_g = nullptr;
	            ggml_tensor* norm_b = nullptr;
	        };
	        std::vector<ConvBlock> conv_blocks;

	        ggml_tensor* gru_ih   = nullptr;  // GRU input-hidden weights
	        ggml_tensor* gru_hh   = nullptr;  // GRU hidden-hidden weights
	        ggml_tensor* gru_bias = nullptr;  // GRU bias

	        ggml_tensor* speaker_proj = nullptr;  // Project to speaker embedding
	    } ref_encoder;

	    // Latent Diffusion Decoder
	    struct LatentDecoder {
	        ggml_tensor* latent_proj = nullptr;  // Project latents to hidden

	        // Weights of one decoder layer: self-attention, cross-attention, FFN.
	        struct DecoderLayer {
	            // Self-attention
	            ggml_tensor* sa_ln_g = nullptr;
	            ggml_tensor* sa_ln_b = nullptr;
	            ggml_tensor* sa_qkv  = nullptr;
	            ggml_tensor* sa_proj = nullptr;

	            // Cross-attention (to text)
	            ggml_tensor* ca_ln_g = nullptr;
	            ggml_tensor* ca_ln_b = nullptr;
	            ggml_tensor* ca_q    = nullptr;
	            ggml_tensor* ca_kv   = nullptr;
	            ggml_tensor* ca_proj = nullptr;

	            // FFN
	            ggml_tensor* ffn_ln_g = nullptr;
	            ggml_tensor* ffn_ln_b = nullptr;
	            ggml_tensor* ffn_fc1  = nullptr;
	            ggml_tensor* ffn_fc2  = nullptr;
	        };
	        std::vector<DecoderLayer> layers;

	        ggml_tensor* mel_head  = nullptr;  // Project to mel spectrogram
	        ggml_tensor* stop_head = nullptr;  // Predict stop token
	    } decoder;

	    // HiFi-GAN Vocoder
	    struct Vocoder {
	        ggml_tensor* conv_pre = nullptr;  // Pre-conv [80, 512, 7]

	        // One upsampling stage: a transposed convolution plus its residual blocks.
	        struct UpsampleBlock {
	            ggml_tensor* conv_transpose = nullptr;  // Transposed convolution

	            struct ResBlock {
	                ggml_tensor* conv1 = nullptr;
	                ggml_tensor* conv2 = nullptr;
	            };
	            std::vector<ResBlock> res_blocks;
	        };
	        std::vector<UpsampleBlock> upsample_blocks;

	        ggml_tensor* conv_post = nullptr;  // Post-conv [512, 1, 7]
	    } vocoder;

	    // Conditioning layers
	    struct Conditioning {
	        ggml_tensor* speaker_embedding  = nullptr;  // Speaker lookup table
	        ggml_tensor* language_embedding = nullptr;  // Language embeddings
	        ggml_tensor* style_embedding    = nullptr;  // Style tokens (optional)
	    } conditioning;

	    // Model context
	    ggml_context* ctx = nullptr;
	    ggml_backend_t backend = nullptr;
	    ggml_backend_buffer_t buffer = nullptr;
	    size_t buffer_size = 0;
	};

	// KV Cache for autoregressive generation
	//
	// Holds one K and one V tensor handle per layer for the text encoder and the
	// decoder, plus per-layer cross-attention K/V handles for the decoder. All
	// slots are value-initialized to nullptr so an unfilled slot is detectable
	// rather than indeterminate.
	struct XTTSKVCache {
	    // Number of per-layer slots. These must stay equal to
	    // XTTSConfig::GPT_N_LAYER (30) and XTTSConfig::DECODER_LAYERS (12).
	    static constexpr int TEXT_LAYERS = 30;
	    static constexpr int DECODER_LAYERS = 12;

	    // Text encoder cache
	    struct {
	        ggml_tensor* k[TEXT_LAYERS] = {};  // K cache per layer
	        ggml_tensor* v[TEXT_LAYERS] = {};  // V cache per layer
	        int n_cached = 0;
	    } text_cache;

	    // Decoder cache
	    struct {
	        ggml_tensor* k[DECODER_LAYERS] = {};        // K cache per layer
	        ggml_tensor* v[DECODER_LAYERS] = {};        // V cache per layer
	        ggml_tensor* cross_k[DECODER_LAYERS] = {};  // Cross-attention K cache
	        ggml_tensor* cross_v[DECODER_LAYERS] = {};  // Cross-attention V cache
	        int n_cached = 0;
	    } decoder_cache;
	};

	// Main XTTS v2 Inference Engine
	//
	// Declares model loading, one-shot synthesis, component-wise synthesis and a
	// chunked streaming interface. Only declarations live in this header; the
	// definitions are expected in a separate translation unit.
	//
	// The object holds raw handles (gguf context, mapped memory, graph allocator)
	// and declares a destructor, so copying is disabled: an implicit member-wise
	// copy would leave two objects referring to the same handles.
	class XTTSv2Inference {
	public:
	    XTTSv2Inference();
	    ~XTTSv2Inference();

	    // Non-copyable (this also suppresses the implicit move operations).
	    XTTSv2Inference(const XTTSv2Inference&) = delete;
	    XTTSv2Inference& operator=(const XTTSv2Inference&) = delete;

	    // Load model from GGUF file.
	    // Returns a bool; presumably true on success - TODO confirm in the .cpp.
	    bool load_model(const std::string& model_path, bool use_mmap = true);

	    // High-level TTS interface: text in, audio samples out.
	    // NOTE(review): top_k is declared as float although it reads like a count;
	    // the type is kept so the signature matches the out-of-line definition.
	    std::vector<float> synthesize(
	        const std::string& text,
	        const std::string& language = "en",
	        const std::vector<float>& speaker_wav = {}, // Optional reference audio
	        float temperature = 0.65f,
	        float length_penalty = 1.0f,
	        float repetition_penalty = 2.0f,
	        float top_k = 50,
	        float top_p = 0.85f,
	        float speed = 1.0f
	    );

	    // Component-wise inference (for debugging/testing).
	    // Tensor handles default to nullptr so stages that were not run are
	    // distinguishable from stages that produced a tensor.
	    struct InferenceComponents {
	        std::vector<int32_t> tokens;               // BPE tokens
	        ggml_tensor* text_embeddings = nullptr;    // Text encoder output
	        ggml_tensor* speaker_embedding = nullptr;  // Speaker embedding
	        ggml_tensor* latents = nullptr;            // Decoder latents
	        ggml_tensor* mel_spectrogram = nullptr;    // Generated mel
	        std::vector<float> audio;                  // Final audio
	    };

	    InferenceComponents synthesize_components(
	        const std::string& text,
	        const std::string& language = "en",
	        const std::vector<float>& speaker_wav = {}
	    );

	    // Streaming interface: audio is pulled chunk by chunk via get_chunk().
	    // A Stream stores a raw pointer to the engine it was created from.
	    // NOTE(review): presumably the engine must outlive the Stream - confirm.
	    class Stream {
	    public:
	        Stream(XTTSv2Inference* parent, const std::string& text,
	               const std::string& language, const std::vector<float>& speaker_wav);
	        ~Stream();

	        // Non-copyable: the stream has a user-declared destructor and holds
	        // raw tensor handles inside `components`.
	        Stream(const Stream&) = delete;
	        Stream& operator=(const Stream&) = delete;

	        // Returns the next chunk of samples; max_samples is the caller's cap.
	        std::vector<float> get_chunk(size_t max_samples = 4096);
	        bool is_done() const { return done; }

	    private:
	        XTTSv2Inference* parent = nullptr;
	        InferenceComponents components;
	        size_t mel_offset = 0;
	        size_t audio_offset = 0;
	        bool done = false;

	        void generate_next_mel_chunk();
	        std::vector<float> vocoder_chunk(ggml_tensor* mel_chunk);
	    };

	    // Creates a Stream owned by the caller through the returned unique_ptr.
	    std::unique_ptr<Stream> create_stream(
	        const std::string& text,
	        const std::string& language = "en",
	        const std::vector<float>& speaker_wav = {}
	    );

	private:
	    XTTSConfig config;
	    XTTSv2Model model;
	    XTTSKVCache kv_cache;

	    // GGUF file handling
	    // NOTE(review): the elaborated specifier below declares gguf_context in
	    // this namespace if no earlier declaration is visible - verify that the
	    // included ggml headers already declare it.
	    struct gguf_context* gguf_ctx = nullptr;
	    void* mapped_memory = nullptr;
	    size_t mapped_size = 0;

	    // Computation graph
	    ggml_cgraph* gf = nullptr;
	    ggml_gallocr* allocr = nullptr;

	    // Tokenizer
	    std::unordered_map<std::string, int32_t> bpe_vocab;
	    std::vector<std::pair<std::string, std::string>> bpe_merges;

	    // Internal methods
	    bool load_gguf_weights(const std::string& path, bool use_mmap);
	    void init_model_architecture();

	    // Text processing
	    std::vector<int32_t> tokenize(const std::string& text);
	    std::vector<std::string> bpe_encode(const std::string& text);

	    // Model forward passes
	    ggml_tensor* text_encoder_forward(
	        const std::vector<int32_t>& tokens,
	        const std::string& language
	    );

	    ggml_tensor* reference_encoder_forward(
	        const std::vector<float>& audio_wav
	    );

	    ggml_tensor* decoder_forward(
	        ggml_tensor* text_embeddings,
	        ggml_tensor* speaker_embedding,
	        float temperature,
	        float length_penalty
	    );

	    std::vector<float> vocoder_forward(
	        ggml_tensor* mel_spectrogram
	    );

	    // Attention mechanisms
	    ggml_tensor* multi_head_attention(
	        ggml_tensor* q, ggml_tensor* k, ggml_tensor* v,
	        int n_heads, bool use_cache = true
	    );

	    ggml_tensor* cross_attention(
	        ggml_tensor* queries,
	        ggml_tensor* keys,
	        ggml_tensor* values,
	        int n_heads
	    );

	    // Helper functions
	    ggml_tensor* layer_norm(ggml_tensor* x, ggml_tensor* g, ggml_tensor* b, float eps = 1e-5f);
	    ggml_tensor* gelu(ggml_tensor* x);
	    ggml_tensor* conv1d(ggml_tensor* x, ggml_tensor* w, ggml_tensor* b, int stride, int padding);
	    ggml_tensor* conv_transpose1d(ggml_tensor* x, ggml_tensor* w, ggml_tensor* b, int stride, int padding);

	    // Sampling
	    std::vector<int32_t> sample_latents(
	        ggml_tensor* logits,
	        float temperature,
	        float top_k,
	        float top_p,
	        float repetition_penalty
	    );
	};

	// NEON-optimized kernels for ARM.
	// The declarations exist only when the compiler defines __ARM_NEON; on any
	// other target this namespace is empty. No definitions are in this header.
	// NOTE(review): operand layouts, the packing of quantized values and the
	// granularity of the scale arrays are not visible here - confirm against the
	// implementation before calling.
	namespace kernels {
	#ifdef __ARM_NEON

	// Matrix multiply taking a q4 operand `a_q4` with `scales`, a float operand
	// `b`, and writing floats to `c`; m, k, n are the problem dimensions.
	void gemm_q4_neon(const uint8_t* a_q4,
	                  const float* b,
	                  float* c,
	                  int m, int k, int n,
	                  const float* scales);

	// 1-D convolution taking q8 input and kernel (each with its own scale) and
	// writing floats to `output`.
	void conv1d_q8_neon(const uint8_t* input_q8,
	                    const uint8_t* kernel_q8,
	                    float* output,
	                    int batch, int in_c, int out_c,
	                    int length, int kernel_size,
	                    int stride, int padding,
	                    const float* input_scale,
	                    const float* kernel_scale);

	// Attention taking q4 Q/K/V operands (each with its own scale) and writing
	// floats to `output`.
	void attention_q4_neon(const uint8_t* q_q4,
	                       const uint8_t* k_q4,
	                       const uint8_t* v_q4,
	                       float* output,
	                       int seq_len, int n_heads, int head_dim,
	                       const float* q_scale,
	                       const float* k_scale,
	                       const float* v_scale);

	#endif // __ARM_NEON
	} // namespace kernels

	// C API for React Native / FFI.
	// Models and streams cross the boundary as opaque void* handles; audio
	// crosses as float* together with a length written through `out_len`.
	// NOTE(review): presumably buffers returned by xtts_v2_synthesize and
	// xtts_v2_stream_chunk must be released with xtts_v2_free_audio, and handles
	// with xtts_v2_free / xtts_v2_stream_free - confirm in the implementation.
	extern "C" {
	    // Creates a model handle from the GGUF file at `model_path`.
	    void* xtts_v2_init(const char* model_path, bool use_mmap);

	    // One-shot synthesis. `speaker_wav` holds `speaker_wav_len` floats.
	    float* xtts_v2_synthesize(void* model,
	                              const char* text,
	                              const char* language,
	                              const float* speaker_wav,
	                              size_t speaker_wav_len,
	                              float temperature,
	                              float speed,
	                              size_t* out_len);

	    // Creates a streaming handle for `text` on an existing model handle.
	    void* xtts_v2_stream_init(void* model,
	                              const char* text,
	                              const char* language,
	                              const float* speaker_wav,
	                              size_t speaker_wav_len);

	    // Fetches the next chunk from a stream handle; `chunk_size` is the
	    // requested size and the produced length is written to `out_len`.
	    float* xtts_v2_stream_chunk(void* stream,
	                                size_t chunk_size,
	                                size_t* out_len);

	    // Release functions for stream handles, model handles and audio buffers.
	    void xtts_v2_stream_free(void* stream);
	    void xtts_v2_free(void* model);
	    void xtts_v2_free_audio(float* audio);
	}

	} // namespace xtts_v2

	#endif // XTTS_V2_FULL_H