|
|
|
|
|
#ifndef XTTS_INFERENCE_H |
|
|
#define XTTS_INFERENCE_H |
|
|
|
|
|
#include <ggml.h> |
|
|
#include <ggml-alloc.h> |
|
|
#include <ggml-backend.h> |
|
|
#include <cstdint> |
|
|
#include <string> |
|
|
#include <vector> |
|
|
#include <memory> |
|
|
#include <unordered_map> |
|
|
|
|
|
namespace xtts { |
|
|
|
|
|
|
|
|
// Model hyper-parameters for the XTTS architecture.
// Defaults mirror the shipped checkpoint configuration; load_model()
// presumably overwrites them from GGUF metadata — confirm in the .cpp.
struct XTTSHyperParams {
int32_t n_vocab = 256;          // text-token vocabulary size
int32_t n_ctx_text = 402;       // maximum text-token context length
int32_t n_ctx_audio = 605;      // maximum audio-token context length
int32_t n_embd = 1024;          // transformer embedding width
int32_t n_head = 16;            // attention heads per layer
int32_t n_layer = 24;           // transformer layers
int32_t n_mel_channels = 80;    // mel-spectrogram channels for the vocoder
int32_t n_audio_tokens = 1026;  // audio token vocabulary size (likely incl. special tokens — TODO confirm)
int32_t sample_rate = 24000;    // output waveform sample rate, Hz
int32_t n_languages = 17;       // must match the number of Language enum values below
int32_t speaker_emb_dim = 512;  // raw speaker-embedding dimension (see speaker_projection)
};
|
|
|
|
|
|
|
|
// Supported synthesis languages.
// The numeric values are part of the ABI: they are passed as plain `int`
// through the C API below and index the model's language embedding table,
// so the order must stay in sync with XTTSHyperParams::n_languages (17).
enum Language {
LANG_EN = 0,    // English
LANG_ES = 1,    // Spanish
LANG_FR = 2,    // French
LANG_DE = 3,    // German
LANG_IT = 4,    // Italian
LANG_PT = 5,    // Portuguese
LANG_PL = 6,    // Polish
LANG_TR = 7,    // Turkish
LANG_RU = 8,    // Russian
LANG_NL = 9,    // Dutch
LANG_CS = 10,   // Czech
LANG_AR = 11,   // Arabic
LANG_ZH = 12,   // Chinese
LANG_JA = 13,   // Japanese
LANG_KO = 14,   // Korean
LANG_HU = 15,   // Hungarian
LANG_HI = 16    // Hindi
};
|
|
|
|
|
|
|
|
// Forward declarations of ggml/gguf types used below by pointer only.
// (They are already complete via the ggml headers included above; these
// redundant declarations are harmless.)
struct ggml_context;
struct ggml_tensor;
struct gguf_context;
|
|
|
|
|
|
|
|
struct XTTSModel { |
|
|
|
|
|
struct ggml_tensor* text_embedding; |
|
|
struct ggml_tensor* language_embedding; |
|
|
struct ggml_tensor* pos_encoding; |
|
|
|
|
|
|
|
|
std::vector<struct ggml_tensor*> ln1_weight; |
|
|
std::vector<struct ggml_tensor*> ln1_bias; |
|
|
std::vector<struct ggml_tensor*> attn_qkv; |
|
|
std::vector<struct ggml_tensor*> attn_out; |
|
|
std::vector<struct ggml_tensor*> ln2_weight; |
|
|
std::vector<struct ggml_tensor*> ln2_bias; |
|
|
std::vector<struct ggml_tensor*> ffn_up; |
|
|
std::vector<struct ggml_tensor*> ffn_down; |
|
|
|
|
|
|
|
|
struct ggml_tensor* audio_token_predictor; |
|
|
|
|
|
|
|
|
struct ggml_tensor* vocoder_preconv; |
|
|
std::vector<struct ggml_tensor*> vocoder_ups; |
|
|
std::vector<struct ggml_tensor*> vocoder_resblocks; |
|
|
struct ggml_tensor* vocoder_postconv; |
|
|
|
|
|
|
|
|
struct ggml_tensor* speaker_projection; |
|
|
|
|
|
|
|
|
struct ggml_context* ctx = nullptr; |
|
|
ggml_backend_t backend = nullptr; |
|
|
ggml_backend_buffer_t buffer = nullptr; |
|
|
|
|
|
~XTTSModel(); |
|
|
}; |
|
|
|
|
|
|
|
|
// Key/value cache for autoregressive transformer decoding.
// The tensors are non-owning (presumably allocated by the inference
// context — confirm in the .cpp); `n_cached` counts positions filled.
struct XTTSKVCache {
    struct ggml_tensor* k_cache = nullptr;  // cached keys
    struct ggml_tensor* v_cache = nullptr;  // cached values
    int32_t n_cached = 0;                   // number of positions currently cached
};
|
|
|
|
|
|
|
|
class XTTSInference { |
|
|
public: |
|
|
XTTSInference(); |
|
|
~XTTSInference(); |
|
|
|
|
|
|
|
|
bool load_model(const std::string& model_path, bool use_mmap = true); |
|
|
|
|
|
|
|
|
std::vector<float> generate( |
|
|
const std::string& text, |
|
|
Language language = LANG_EN, |
|
|
int speaker_id = 0, |
|
|
float temperature = 0.8f, |
|
|
float speed = 1.0f |
|
|
); |
|
|
|
|
|
|
|
|
class StreamGenerator { |
|
|
public: |
|
|
StreamGenerator(XTTSInference* parent, const std::string& text, Language lang); |
|
|
~StreamGenerator(); |
|
|
|
|
|
|
|
|
std::vector<float> get_next_chunk(size_t chunk_samples = 8192); |
|
|
bool is_done() const { return done; } |
|
|
|
|
|
private: |
|
|
XTTSInference* parent_model; |
|
|
std::vector<int32_t> text_tokens; |
|
|
std::vector<int32_t> audio_tokens; |
|
|
Language language; |
|
|
size_t current_token = 0; |
|
|
bool done = false; |
|
|
|
|
|
void generate_next_tokens(size_t n_tokens); |
|
|
}; |
|
|
|
|
|
|
|
|
std::unique_ptr<StreamGenerator> create_stream( |
|
|
const std::string& text, |
|
|
Language language = LANG_EN |
|
|
); |
|
|
|
|
|
|
|
|
XTTSHyperParams get_params() const { return hparams; } |
|
|
size_t get_memory_usage() const; |
|
|
|
|
|
private: |
|
|
XTTSHyperParams hparams; |
|
|
XTTSModel model; |
|
|
XTTSKVCache kv_cache; |
|
|
|
|
|
|
|
|
struct gguf_context* gguf_ctx = nullptr; |
|
|
void* mapped_memory = nullptr; |
|
|
size_t mapped_size = 0; |
|
|
|
|
|
|
|
|
struct ggml_cgraph* gf = nullptr; |
|
|
struct ggml_gallocr* allocr = nullptr; |
|
|
|
|
|
|
|
|
bool load_gguf_file(const std::string& path, bool use_mmap); |
|
|
void create_computation_graph(); |
|
|
|
|
|
|
|
|
std::vector<int32_t> tokenize(const std::string& text); |
|
|
|
|
|
|
|
|
struct ggml_tensor* encode_text( |
|
|
const std::vector<int32_t>& tokens, |
|
|
Language language, |
|
|
const std::vector<float>& speaker_embedding |
|
|
); |
|
|
|
|
|
std::vector<int32_t> generate_audio_tokens( |
|
|
struct ggml_tensor* text_features, |
|
|
float temperature |
|
|
); |
|
|
|
|
|
std::vector<float> vocoder_forward( |
|
|
const std::vector<int32_t>& audio_tokens |
|
|
); |
|
|
|
|
|
|
|
|
struct ggml_tensor* attention( |
|
|
struct ggml_tensor* x, |
|
|
int layer_idx, |
|
|
bool use_cache = true |
|
|
); |
|
|
|
|
|
|
|
|
struct ggml_tensor* ffn( |
|
|
struct ggml_tensor* x, |
|
|
int layer_idx |
|
|
); |
|
|
|
|
|
|
|
|
struct ggml_tensor* layer_norm( |
|
|
struct ggml_tensor* x, |
|
|
struct ggml_tensor* weight, |
|
|
struct ggml_tensor* bias, |
|
|
float eps = 1e-5f |
|
|
); |
|
|
|
|
|
int32_t sample_token( |
|
|
struct ggml_tensor* logits, |
|
|
float temperature, |
|
|
float top_p = 0.9f |
|
|
); |
|
|
|
|
|
std::vector<float> create_speaker_embedding(int speaker_id); |
|
|
}; |
|
|
|
|
|
|
|
|
// ---------------------------------------------------------------------------
// C API: FFI-friendly wrappers around XTTSInference.
// Note this linkage block sits inside namespace xtts; the functions still
// receive C linkage (unmangled names) for C callers.
// ---------------------------------------------------------------------------
extern "C" {

// Creates an engine and loads the model at `model_path`.
// Returns an opaque handle (presumably a heap-allocated XTTSInference;
// likely NULL on failure — confirm in the .cpp). Release with xtts_free().
void* xtts_init(const char* model_path, bool use_mmap);

// Synthesizes `text`. Returns a heap buffer of `*out_length` float PCM
// samples; free it with xtts_free_audio(). `language` takes a Language
// enum value; remaining parameters mirror XTTSInference::generate().
float* xtts_generate(
void* model_ptr,
const char* text,
int language,
int speaker_id,
float temperature,
float speed,
size_t* out_length
);

// Starts a streaming synthesis session on an engine from xtts_init().
// Returns an opaque stream handle; release with xtts_stream_free().
void* xtts_stream_init(
void* model_ptr,
const char* text,
int language
);

// Fetches the next audio chunk (up to `chunk_size` samples; the actual
// count is written to *out_length). Free the buffer with xtts_free_audio().
float* xtts_stream_next(
void* stream_ptr,
size_t chunk_size,
size_t* out_length
);

// Destroys a stream created by xtts_stream_init().
void xtts_stream_free(void* stream_ptr);

// Destroys an engine created by xtts_init().
void xtts_free(void* model_ptr);

// Frees audio buffers returned by xtts_generate()/xtts_stream_next().
void xtts_free_audio(float* audio_ptr);

}
|
|
|
|
|
} |
|
|
|
|
|
#endif |