#ifndef __LORA_HPP__
#define __LORA_HPP__

#include "ggml_extend.hpp"

#define LORA_GRAPH_SIZE 10240

struct LoraModel : public GGMLRunner {
    float multiplier = 1.0f;
    std::map<std::string, struct ggml_tensor*> lora_tensors;
    std::string file_path;
    ModelLoader model_loader;
    bool load_failed = false;
    bool applied = false;
    std::vector<int> zero_index_vec = {0};
    ggml_tensor* zero_index = NULL;

    LoraModel(ggml_backend_t backend,
              const std::string& file_path = "",
              const std::string prefix = "")
        : GGMLRunner(backend), file_path(file_path) {
        if (!model_loader.init_from_file(file_path, prefix)) {
            load_failed = true;
        }
    }

    std::string get_desc() {
        return "lora";
    }

    bool load_from_file(bool filter_tensor = false) {
        LOG_INFO("loading LoRA from '%s'", file_path.c_str());

        if (load_failed) {
            LOG_ERROR("init lora model loader from file failed: '%s'", file_path.c_str());
            return false;
        }

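        // Load in two passes: the dry run only registers each LoRA tensor so the params
        // buffer can be sized; the second pass then fills the tensors with data.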
        bool dry_run = true;
        auto on_new_tensor_cb = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) -> bool {
            const std::string& name = tensor_storage.name;

            if (filter_tensor && !contains(name, "lora")) {
                // not a LoRA tensor, skip it
                return true;
            }

            if (dry_run) {
                struct ggml_tensor* real = ggml_new_tensor(params_ctx,
                                                           tensor_storage.type,
                                                           tensor_storage.n_dims,
                                                           tensor_storage.ne);
                lora_tensors[name] = real;
            } else {
                auto real = lora_tensors[name];
                *dst_tensor = real;
            }

            return true;
        };

        model_loader.load_tensors(on_new_tensor_cb, backend);
        alloc_params_buffer();

        dry_run = false;
        model_loader.load_tensors(on_new_tensor_cb, backend);

        LOG_DEBUG("finished loading lora");
        return true;
    }

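    // Converts a tensor of any type to F32: flatten it to a single row and read it back
    // through ggml_get_rows with a zero index, which returns F32 and dequantizes on the way.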
    ggml_tensor* to_f32(ggml_context* ctx, ggml_tensor* a) {
        auto out = ggml_reshape_1d(ctx, a, ggml_nelements(a));
        out = ggml_get_rows(ctx, out, zero_index);
        out = ggml_reshape(ctx, out, a);
        return out;
    }

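    // Builds a graph that, for every model weight with a matching LoRA pair, applies
    // W' = W + multiplier * scale * (lora_up x lora_down) in place.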
    struct ggml_cgraph* build_lora_graph(std::map<std::string, struct ggml_tensor*> model_tensors) {
        struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, LORA_GRAPH_SIZE, false);

        zero_index = ggml_new_tensor_1d(compute_ctx, GGML_TYPE_I32, 1);
        set_backend_tensor_data(zero_index, zero_index_vec.data());
        ggml_build_forward_expand(gf, zero_index);

        std::set<std::string> applied_lora_tensors;
        for (auto it : model_tensors) {
            std::string k_tensor = it.first;
            struct ggml_tensor* weight = model_tensors[it.first];

            size_t k_pos = k_tensor.find(".weight");
            if (k_pos == std::string::npos) {
                continue;
            }
            k_tensor = k_tensor.substr(0, k_pos);
            replace_all_chars(k_tensor, '.', '_');

            std::string lora_up_name = "lora." + k_tensor + ".lora_up.weight";
            if (lora_tensors.find(lora_up_name) == lora_tensors.end()) {
                if (k_tensor == "model_diffusion_model_output_blocks_2_2_conv") {
                    // some SDXL LoRAs store this conv block under output_blocks_2_1 instead
                    k_tensor = "model_diffusion_model_output_blocks_2_1_conv";
                    lora_up_name = "lora." + k_tensor + ".lora_up.weight";
                }
            }

            std::string lora_down_name = "lora." + k_tensor + ".lora_down.weight";
            std::string alpha_name = "lora." + k_tensor + ".alpha";
            std::string scale_name = "lora." + k_tensor + ".scale";

            ggml_tensor* lora_up = NULL;
            ggml_tensor* lora_down = NULL;

            if (lora_tensors.find(lora_up_name) != lora_tensors.end()) {
                lora_up = lora_tensors[lora_up_name];
            }

            if (lora_tensors.find(lora_down_name) != lora_tensors.end()) {
                lora_down = lora_tensors[lora_down_name];
            }

            if (lora_up == NULL || lora_down == NULL) {
                continue;
            }

            applied_lora_tensors.insert(lora_up_name);
            applied_lora_tensors.insert(lora_down_name);
            applied_lora_tensors.insert(alpha_name);
            applied_lora_tensors.insert(scale_name);

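            // compute the scale: an explicit .scale tensor wins, otherwise alpha / rank,
            // and the user-supplied multiplier is applied on top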
            int64_t dim = lora_down->ne[ggml_n_dims(lora_down) - 1];
            float scale_value = 1.0f;
            if (lora_tensors.find(scale_name) != lora_tensors.end()) {
                scale_value = ggml_backend_tensor_get_f32(lora_tensors[scale_name]);
            } else if (lora_tensors.find(alpha_name) != lora_tensors.end()) {
                float alpha = ggml_backend_tensor_get_f32(lora_tensors[alpha_name]);
                scale_value = alpha / dim;
            }
            scale_value *= multiplier;

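            // flatten lora_up and lora_down to 2D (conv weights are 4D) so they can be
            // multiplied into a single weight delta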
            int64_t lora_up_rows = lora_up->ne[ggml_n_dims(lora_up) - 1];
            lora_up = ggml_reshape_2d(compute_ctx, lora_up, ggml_nelements(lora_up) / lora_up_rows, lora_up_rows);
            auto lora_down_n_dims = ggml_n_dims(lora_down);
            // round n_dims up to an even value so rank-1 LoRA tensors are handled correctly
            lora_down_n_dims = (lora_down_n_dims + lora_down_n_dims % 2);
            int64_t lora_down_rows = lora_down->ne[lora_down_n_dims - 1];
            lora_down = ggml_reshape_2d(compute_ctx, lora_down, ggml_nelements(lora_down) / lora_down_rows, lora_down_rows);

            // ggml_mul_mat expects its second operand transposed
            lora_down = ggml_cont(compute_ctx, ggml_transpose(compute_ctx, lora_down));
            struct ggml_tensor* updown = ggml_mul_mat(compute_ctx, lora_up, lora_down);
            updown = ggml_cont(compute_ctx, ggml_transpose(compute_ctx, updown));
            updown = ggml_reshape(compute_ctx, updown, weight);
            GGML_ASSERT(ggml_nelements(updown) == ggml_nelements(weight));
            updown = ggml_scale_inplace(compute_ctx, updown, scale_value);
            ggml_tensor* final_weight;
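            // quantized weights cannot be updated in place: convert to F32 first, add the
            // delta, then copy the result back into the original tensor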
            if (weight->type != GGML_TYPE_F32 && weight->type != GGML_TYPE_F16) {
                final_weight = to_f32(compute_ctx, weight);
                final_weight = ggml_add_inplace(compute_ctx, final_weight, updown);
                final_weight = ggml_cpy(compute_ctx, final_weight, weight);
            } else {
                final_weight = ggml_add_inplace(compute_ctx, weight, updown);
            }

            ggml_build_forward_expand(gf, final_weight);
        }

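        // report how many of the loaded LoRA tensors actually matched a model weight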
        size_t total_lora_tensors_count = 0;
        size_t applied_lora_tensors_count = 0;

        for (auto& kv : lora_tensors) {
            total_lora_tensors_count++;
            if (applied_lora_tensors.find(kv.first) == applied_lora_tensors.end()) {
                LOG_WARN("unused lora tensor %s", kv.first.c_str());
            } else {
                applied_lora_tensors_count++;
            }
        }

        if (applied_lora_tensors_count != total_lora_tensors_count) {
            LOG_WARN("Only (%zu / %zu) LoRA tensors have been applied",
                     applied_lora_tensors_count, total_lora_tensors_count);
        } else {
            LOG_DEBUG("(%zu / %zu) LoRA tensors applied successfully",
                      applied_lora_tensors_count, total_lora_tensors_count);
        }

        return gf;
    }

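    // Builds the LoRA graph and runs it once, patching the model weights in place.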
    void apply(std::map<std::string, struct ggml_tensor*> model_tensors, int n_threads) {
        auto get_graph = [&]() -> struct ggml_cgraph* {
            return build_lora_graph(model_tensors);
        };
        GGMLRunner::compute(get_graph, n_threads, true);
    }
};

#endif  // __LORA_HPP__