| |
|
| |
|
| |
|
| | #include "gguf-model-data.h"
|
| |
|
| | #include "common.h"
|
| | #include "gguf.h"
|
| |
|
| | #include <algorithm>
|
| | #include <cstdio>
|
| | #include <cstring>
|
| | #include <filesystem>
|
| | #include <fstream>
|
| |
|
| | #include "http.h"
|
| | #define JSON_ASSERT GGML_ASSERT
|
| | #include <nlohmann/json.hpp>
|
| |
|
| |
|
// Bounds-checked, forward-only cursor over an in-memory byte buffer.
//
// The reader does not own the bytes: it stores a raw pointer into the vector
// given to the constructor, so that vector must outlive the reader and must
// not be reallocated while the reader is in use.
//
// All lengths handled here come straight from the buffer contents, so every
// bounds check is written to be safe against arbitrarily large values.
struct gguf_buf_reader {
    const char * data;  // first byte of the buffer (not owned)
    size_t       size;  // number of bytes available at `data`
    size_t       pos;   // offset of the next unread byte; invariant: pos <= size

    gguf_buf_reader(const std::vector<char> & buf) : data(buf.data()), size(buf.size()), pos(0) {}

    // True when at least `n` unread bytes remain.
    // Expressed as a subtraction: `pos + n` would wrap around for a huge `n`
    // and wrongly report that the bytes are available.
    bool has_n_bytes(size_t n) const {
        return n <= size - pos;
    }

    // Copies sizeof(T) raw bytes from the cursor into `out` and advances.
    // Returns false (cursor unchanged, `out` untouched) if too few bytes remain.
    template <typename T>
    bool read_val(T & out) {
        if (!has_n_bytes(sizeof(T))) {
            return false;
        }
        memcpy(&out, data + pos, sizeof(T));
        pos += sizeof(T);
        return true;
    }

    // Reads a string stored as a 64-bit length followed by that many bytes.
    // Returns false if either part does not fit in the remaining buffer; when
    // only the payload is missing the cursor is left just past the length
    // field and `out` is not modified.
    bool read_str(std::string & out) {
        uint64_t len;
        if (!read_val(len)) {
            return false;
        }
        // Compare in 64 bits before narrowing, so a length that does not fit
        // in size_t is rejected instead of being silently truncated.
        if (len > static_cast<uint64_t>(size - pos)) {
            return false;
        }
        const size_t n = static_cast<size_t>(len);
        out.assign(data + pos, n);
        pos += n;
        return true;
    }

    // Advances the cursor by `n` bytes; returns false (cursor unchanged) if
    // fewer than `n` bytes remain.
    bool skip(size_t n) {
        if (!has_n_bytes(n)) {
            return false;
        }
        pos += n;
        return true;
    }
};
|
| |
|
| | static size_t gguf_val_type_size(int32_t vtype) {
|
| | switch (vtype) {
|
| | case GGUF_TYPE_UINT8: return 1;
|
| | case GGUF_TYPE_INT8: return 1;
|
| | case GGUF_TYPE_UINT16: return 2;
|
| | case GGUF_TYPE_INT16: return 2;
|
| | case GGUF_TYPE_UINT32: return 4;
|
| | case GGUF_TYPE_INT32: return 4;
|
| | case GGUF_TYPE_FLOAT32: return 4;
|
| | case GGUF_TYPE_BOOL: return 1;
|
| | case GGUF_TYPE_UINT64: return 8;
|
| | case GGUF_TYPE_INT64: return 8;
|
| | case GGUF_TYPE_FLOAT64: return 8;
|
| | default: return 0;
|
| | }
|
| | }
|
| |
|
| |
|
| | static bool gguf_skip_value(gguf_buf_reader & r, int32_t vtype) {
|
| | if (vtype == GGUF_TYPE_STRING) {
|
| | std::string tmp;
|
| | return r.read_str(tmp);
|
| | }
|
| | if (vtype == GGUF_TYPE_ARRAY) {
|
| | int32_t elem_type;
|
| | uint64_t count;
|
| | if (!r.read_val(elem_type)) {
|
| | return false;
|
| | }
|
| | if (!r.read_val(count)) {
|
| | return false;
|
| | }
|
| | if (elem_type == GGUF_TYPE_STRING) {
|
| | for (uint64_t i = 0; i < count; i++) {
|
| | std::string tmp;
|
| | if (!r.read_str(tmp)) {
|
| | return false;
|
| | }
|
| | }
|
| | return true;
|
| | }
|
| | if (elem_type == GGUF_TYPE_ARRAY) {
|
| |
|
| | for (uint64_t i = 0; i < count; i++) {
|
| | if (!gguf_skip_value(r, GGUF_TYPE_ARRAY)) {
|
| | return false;
|
| | }
|
| | }
|
| | return true;
|
| | }
|
| | size_t elem_sz = gguf_val_type_size(elem_type);
|
| | if (elem_sz == 0) {
|
| | return false;
|
| | }
|
| | return r.skip((size_t)count * elem_sz);
|
| | }
|
| | size_t sz = gguf_val_type_size(vtype);
|
| | if (sz == 0) {
|
| | return false;
|
| | }
|
| | return r.skip(sz);
|
| | }
|
| |
|
| | static bool gguf_read_uint32_val(gguf_buf_reader & r, int32_t vtype, uint32_t & out) {
|
| | if (vtype == GGUF_TYPE_UINT8) {
|
| | uint8_t v;
|
| | if (!r.read_val(v)) {
|
| | return false;
|
| | }
|
| | out = v;
|
| | return true;
|
| | }
|
| | if (vtype == GGUF_TYPE_INT8) {
|
| | int8_t v;
|
| | if (!r.read_val(v)) {
|
| | return false;
|
| | }
|
| | out = (uint32_t)v;
|
| | return true;
|
| | }
|
| | if (vtype == GGUF_TYPE_UINT16) {
|
| | uint16_t v;
|
| | if (!r.read_val(v)) {
|
| | return false;
|
| | }
|
| | out = v;
|
| | return true;
|
| | }
|
| | if (vtype == GGUF_TYPE_INT16) {
|
| | int16_t v;
|
| | if (!r.read_val(v)) {
|
| | return false;
|
| | }
|
| | out = (uint32_t)v;
|
| | return true;
|
| | }
|
| | if (vtype == GGUF_TYPE_UINT32) {
|
| | uint32_t v;
|
| | if (!r.read_val(v)) {
|
| | return false;
|
| | }
|
| | out = v;
|
| | return true;
|
| | }
|
| | if (vtype == GGUF_TYPE_INT32) {
|
| | int32_t v;
|
| | if (!r.read_val(v)) {
|
| | return false;
|
| | }
|
| | out = (uint32_t)v;
|
| | return true;
|
| | }
|
| | if (vtype == GGUF_TYPE_UINT64) {
|
| | uint64_t v;
|
| | if (!r.read_val(v)) {
|
| | return false;
|
| | }
|
| | out = (uint32_t)v;
|
| | return true;
|
| | }
|
| | if (vtype == GGUF_TYPE_INT64) {
|
| | int64_t v;
|
| | if (!r.read_val(v)) {
|
| | return false;
|
| | }
|
| | out = (uint32_t)v;
|
| | return true;
|
| | }
|
| | return false;
|
| | }
|
| |
|
| |
|
| | static std::optional<gguf_remote_model> gguf_parse_meta(const std::vector<char> & buf) {
|
| | gguf_buf_reader r(buf);
|
| |
|
| |
|
| | uint32_t magic_raw;
|
| | if (!r.read_val(magic_raw)) {
|
| | return std::nullopt;
|
| | }
|
| | if (memcmp(&magic_raw, "GGUF", 4) != 0) {
|
| | fprintf(stderr, "gguf_parse_meta: invalid magic\n");
|
| | return std::nullopt;
|
| | }
|
| |
|
| | uint32_t version;
|
| | if (!r.read_val(version)) {
|
| | return std::nullopt;
|
| | }
|
| | if (version < 2 || version > 3) {
|
| | fprintf(stderr, "gguf_parse_meta: unsupported version %u\n", version);
|
| | return std::nullopt;
|
| | }
|
| |
|
| | int64_t tensor_count_raw;
|
| | int64_t kv_count_raw;
|
| | if (!r.read_val(tensor_count_raw)) {
|
| | return std::nullopt;
|
| | }
|
| | if (!r.read_val(kv_count_raw)) {
|
| | return std::nullopt;
|
| | }
|
| |
|
| | uint64_t tensor_count = (uint64_t)tensor_count_raw;
|
| | uint64_t kv_count = (uint64_t)kv_count_raw;
|
| |
|
| | gguf_remote_model model;
|
| |
|
| | std::string arch_prefix;
|
| |
|
| |
|
| | for (uint64_t i = 0; i < kv_count; i++) {
|
| | std::string key;
|
| | if (!r.read_str(key)) {
|
| | return std::nullopt;
|
| | }
|
| |
|
| | int32_t vtype;
|
| | if (!r.read_val(vtype)) {
|
| | return std::nullopt;
|
| | }
|
| |
|
| | if (key == "general.architecture" && vtype == GGUF_TYPE_STRING) {
|
| | if (!r.read_str(model.architecture)) {
|
| | return std::nullopt;
|
| | }
|
| | arch_prefix = model.architecture + ".";
|
| | continue;
|
| | }
|
| |
|
| |
|
| | if (key == "split.count") {
|
| | uint32_t v;
|
| | if (!gguf_read_uint32_val(r, vtype, v)) {
|
| | return std::nullopt;
|
| | }
|
| | model.n_split = (uint16_t)v;
|
| | continue;
|
| | }
|
| |
|
| |
|
| | if (key == "split.tensors.count") {
|
| | uint32_t v;
|
| | if (!gguf_read_uint32_val(r, vtype, v)) {
|
| | return std::nullopt;
|
| | }
|
| | model.n_split_tensors = v;
|
| | continue;
|
| | }
|
| |
|
| | if (!arch_prefix.empty()) {
|
| | uint32_t * target = nullptr;
|
| |
|
| | if (key == arch_prefix + "embedding_length") { target = &model.n_embd; }
|
| | else if (key == arch_prefix + "feed_forward_length") { target = &model.n_ff; }
|
| | else if (key == arch_prefix + "block_count") { target = &model.n_layer; }
|
| | else if (key == arch_prefix + "attention.head_count") { target = &model.n_head; }
|
| | else if (key == arch_prefix + "attention.head_count_kv") { target = &model.n_head_kv; }
|
| | else if (key == arch_prefix + "expert_count") { target = &model.n_expert; }
|
| | else if (key == arch_prefix + "attention.key_length") { target = &model.n_embd_head_k; }
|
| | else if (key == arch_prefix + "attention.value_length") { target = &model.n_embd_head_v; }
|
| |
|
| | if (target) {
|
| | if (!gguf_read_uint32_val(r, vtype, *target)) {
|
| | return std::nullopt;
|
| | }
|
| | continue;
|
| | }
|
| | }
|
| |
|
| | if (!gguf_skip_value(r, vtype)) {
|
| | return std::nullopt;
|
| | }
|
| | }
|
| |
|
| |
|
| | model.tensors.reserve((size_t)tensor_count);
|
| | for (uint64_t i = 0; i < tensor_count; i++) {
|
| | gguf_remote_tensor t;
|
| |
|
| | if (!r.read_str(t.name)) {
|
| | return std::nullopt;
|
| | }
|
| | if (!r.read_val(t.n_dims)) {
|
| | return std::nullopt;
|
| | }
|
| |
|
| | if (t.n_dims > 4) {
|
| | fprintf(stderr, "gguf_parse_meta: tensor '%s' has %u dims (max 4)\n", t.name.c_str(), t.n_dims);
|
| | return std::nullopt;
|
| | }
|
| |
|
| | for (uint32_t d = 0; d < t.n_dims; d++) {
|
| | if (!r.read_val(t.ne[d])) {
|
| | return std::nullopt;
|
| | }
|
| | }
|
| |
|
| | int32_t type_raw;
|
| | if (!r.read_val(type_raw)) {
|
| | return std::nullopt;
|
| | }
|
| | t.type = (ggml_type)type_raw;
|
| |
|
| | uint64_t offset;
|
| | if (!r.read_val(offset)) {
|
| | return std::nullopt;
|
| | }
|
| |
|
| |
|
| | if (t.name == "token_embd.weight") {
|
| | model.n_vocab = (uint32_t)t.ne[1];
|
| | }
|
| |
|
| | model.tensors.push_back(std::move(t));
|
| | }
|
| |
|
| | return model;
|
| | }
|
| |
|
| |
|
| | static std::string get_default_cache_dir() {
|
| | return fs_get_cache_directory() + "gguf-headers/";
|
| | }
|
| |
|
// Returns a copy of `s` in which every '/', '\\' and ':' is replaced by '_',
// so the result can be used as a single file-name component.
static std::string sanitize_for_path(const std::string & s) {
    std::string cleaned(s);
    std::replace_if(
        cleaned.begin(), cleaned.end(),
        [](char ch) { return ch == '/' || ch == '\\' || ch == ':'; },
        '_');
    return cleaned;
}
|
| |
|
// Loads the whole file at `path` into `out`.
// Returns false if the file cannot be opened, is empty, its size cannot be
// determined, or the read fails. `out` is resized before the read, so its
// contents are unspecified after a failed read.
static bool read_file(const std::string & path, std::vector<char> & out) {
    // Open positioned at the end so the first tellg() yields the file size.
    std::ifstream in(path, std::ios::binary | std::ios::ate);
    if (!in.good()) {
        return false;
    }

    const auto end_pos = in.tellg();
    // Covers both a tellg() failure (-1) and a zero-length file.
    if (end_pos <= 0) {
        return false;
    }

    out.resize((size_t) end_pos);
    in.seekg(0);
    in.read(out.data(), end_pos);
    return in.good();
}
|
| |
|
// Writes `data` to `path`, replacing any existing content.
// Returns true only once the bytes have been handed to the operating system.
static bool write_file(const std::string & path, const std::vector<char> & data) {
    std::ofstream f(path, std::ios::binary | std::ios::trunc);
    if (!f.good()) {
        return false;
    }
    f.write(data.data(), (std::streamsize) data.size());
    // The stream buffers small writes, so a device error (for example a full
    // disk) may only show up when the buffer is flushed. Flush before checking
    // the state; otherwise the failure would surface in the destructor, where
    // it is silently discarded.
    f.flush();
    return f.good();
}
|
| |
|
| |
|
| | static std::pair<long, std::vector<char>> gguf_http_get(
|
| | const std::string & url,
|
| | const httplib::Headers & headers = {},
|
| | int timeout_sec = 60) {
|
| | try {
|
| | auto [cli, parts] = common_http_client(url);
|
| |
|
| | if (timeout_sec > 0) {
|
| | cli.set_read_timeout(timeout_sec, 0);
|
| | cli.set_write_timeout(timeout_sec, 0);
|
| | }
|
| | cli.set_connection_timeout(30, 0);
|
| |
|
| | std::vector<char> body;
|
| | auto res = cli.Get(parts.path, headers,
|
| | [&](const char * data, size_t len) {
|
| | body.insert(body.end(), data, data + len);
|
| | return true;
|
| | }, nullptr);
|
| |
|
| | if (!res) {
|
| | fprintf(stderr, "gguf_fetch: HTTP request failed for %s (error %d)\n",
|
| | url.c_str(), (int)res.error());
|
| | return {-1, {}};
|
| | }
|
| | return {res->status, std::move(body)};
|
| | } catch (const std::exception & e) {
|
| | fprintf(stderr, "gguf_fetch: HTTP error: %s\n", e.what());
|
| | return {-1, {}};
|
| | }
|
| | }
|
| |
|
| |
|
| |
|
| |
|
| | static std::string detect_gguf_filename(const std::string & repo, const std::string & quant,
|
| | std::string & split_prefix) {
|
| | split_prefix.clear();
|
| | std::string api_url = "https://huggingface.co/api/models/" + repo;
|
| |
|
| | auto [code, body] = gguf_http_get(api_url, {}, 30);
|
| | if (code != 200 || body.empty()) {
|
| | fprintf(stderr, "gguf_fetch: failed to query HF API for %s (HTTP %ld)\n", repo.c_str(), code);
|
| | return "";
|
| | }
|
| |
|
| | nlohmann::json j;
|
| | try {
|
| | j = nlohmann::json::parse(body.begin(), body.end());
|
| | } catch (...) {
|
| | fprintf(stderr, "gguf_fetch: failed to parse HF API response\n");
|
| | return "";
|
| | }
|
| |
|
| | if (!j.contains("siblings") || !j["siblings"].is_array()) {
|
| | fprintf(stderr, "gguf_fetch: unexpected HF API response format\n");
|
| | return "";
|
| | }
|
| |
|
| | std::vector<std::string> matches;
|
| | std::string quant_upper = quant;
|
| | for (char & c : quant_upper) { c = (char)toupper(c); }
|
| |
|
| | for (const auto & sibling : j["siblings"]) {
|
| | if (!sibling.contains("rfilename")) { continue; }
|
| | std::string fname = sibling["rfilename"].get<std::string>();
|
| | if (fname.size() < 5 || fname.substr(fname.size() - 5) != ".gguf") {
|
| | continue;
|
| | }
|
| |
|
| | std::string fname_upper = fname;
|
| | for (char & c : fname_upper) { c = (char)toupper(c); }
|
| | if (fname_upper.find(quant_upper) != std::string::npos) {
|
| | matches.push_back(fname);
|
| | }
|
| | }
|
| |
|
| | if (matches.empty()) {
|
| | fprintf(stderr, "gguf_fetch: no .gguf files matching '%s' in %s\n", quant.c_str(), repo.c_str());
|
| | return "";
|
| | }
|
| |
|
| | std::sort(matches.begin(), matches.end());
|
| |
|
| |
|
| | for (const auto & m : matches) {
|
| | if (m.find("-of-") == std::string::npos && m.find("mmproj") == std::string::npos) {
|
| | return m;
|
| | }
|
| | }
|
| |
|
| |
|
| | for (const auto & m : matches) {
|
| | auto pos = m.find("-00001-of-");
|
| | if (pos != std::string::npos) {
|
| | split_prefix = m.substr(0, pos);
|
| | return m;
|
| | }
|
| | }
|
| |
|
| | return matches[0];
|
| | }
|
| |
|
| | static std::optional<gguf_remote_model> fetch_and_parse(
|
| | const std::string & repo,
|
| | const std::string & filename,
|
| | const std::string & cache_path) {
|
| | std::string url = "https://huggingface.co/" + repo + "/resolve/main/" + filename;
|
| |
|
| |
|
| |
|
| | size_t chunk_size = 2 * 1024 * 1024;
|
| | const size_t max_chunk = 64 * 1024 * 1024;
|
| |
|
| | while (chunk_size <= max_chunk) {
|
| | fprintf(stderr, "gguf_fetch: downloading %zu bytes from %s\n", chunk_size, filename.c_str());
|
| |
|
| | char range_buf[64];
|
| | snprintf(range_buf, sizeof(range_buf), "bytes=0-%zu", chunk_size - 1);
|
| | httplib::Headers headers = {{"Range", range_buf}};
|
| |
|
| | auto [code, body] = gguf_http_get(url, headers, 120);
|
| | if (code != 200 && code != 206) {
|
| | fprintf(stderr, "gguf_fetch: HTTP %ld fetching %s\n", code, url.c_str());
|
| | return std::nullopt;
|
| | }
|
| |
|
| | if (body.empty()) {
|
| | fprintf(stderr, "gguf_fetch: empty response\n");
|
| | return std::nullopt;
|
| | }
|
| |
|
| | auto result = gguf_parse_meta(body);
|
| | if (result.has_value()) {
|
| | write_file(cache_path, body);
|
| | return result;
|
| | }
|
| |
|
| | if (code == 200) {
|
| | fprintf(stderr, "gguf_fetch: server returned full response but metadata parse failed\n");
|
| | return std::nullopt;
|
| | }
|
| |
|
| |
|
| | chunk_size *= 2;
|
| | }
|
| |
|
| | fprintf(stderr, "gguf_fetch: metadata exceeds 64MB, giving up\n");
|
| | return std::nullopt;
|
| | }
|
| |
|
| |
|
| | static std::optional<gguf_remote_model> fetch_or_cached(
|
| | const std::string & repo,
|
| | const std::string & filename,
|
| | const std::string & cdir,
|
| | const std::string & repo_part) {
|
| | std::string fname_part = sanitize_for_path(filename);
|
| | std::string cache_path = cdir + "/" + repo_part + "--" + fname_part + ".partial";
|
| |
|
| | {
|
| | std::vector<char> cached;
|
| | if (std::filesystem::exists(cache_path) && read_file(cache_path, cached)) {
|
| | auto result = gguf_parse_meta(cached);
|
| | if (result.has_value()) {
|
| | fprintf(stderr, "gguf_fetch: loaded from cache: %s\n", cache_path.c_str());
|
| | return result;
|
| | }
|
| | }
|
| | }
|
| |
|
| | fs_create_directory_with_parents(cdir);
|
| | return fetch_and_parse(repo, filename, cache_path);
|
| | }
|
| |
|
| | std::optional<gguf_remote_model> gguf_fetch_model_meta(
|
| | const std::string & repo,
|
| | const std::string & quant,
|
| | const std::string & cache_dir) {
|
| | std::string cdir = cache_dir.empty() ? get_default_cache_dir() : cache_dir;
|
| | std::string repo_part = sanitize_for_path(repo);
|
| |
|
| | std::string split_prefix;
|
| | std::string filename = detect_gguf_filename(repo, quant, split_prefix);
|
| | if (filename.empty()) {
|
| | return std::nullopt;
|
| | }
|
| |
|
| | auto model_opt = fetch_or_cached(repo, filename, cdir, repo_part);
|
| | if (!model_opt.has_value()) {
|
| | fprintf(stderr, "gguf_fetch: failed to fetch %s\n", filename.c_str());
|
| | return std::nullopt;
|
| | }
|
| |
|
| | auto & model = model_opt.value();
|
| |
|
| |
|
| | if (model.n_split > 1) {
|
| | if (split_prefix.empty()) {
|
| | fprintf(stderr, "gguf_fetch: model reports %u splits but filename has no split pattern\n", model.n_split);
|
| | return std::nullopt;
|
| | }
|
| |
|
| | fprintf(stderr, "gguf_fetch: split model with %u shards, fetching remaining %u...\n",
|
| | model.n_split, model.n_split - 1);
|
| |
|
| | for (int i = 2; i <= model.n_split; i++) {
|
| | char num_buf[6], total_buf[6];
|
| | snprintf(num_buf, sizeof(num_buf), "%05d", i);
|
| | snprintf(total_buf, sizeof(total_buf), "%05d", (int)model.n_split);
|
| | std::string shard_name = split_prefix + "-" + num_buf + "-of-" + total_buf + ".gguf";
|
| |
|
| | auto shard = fetch_or_cached(repo, shard_name, cdir, repo_part);
|
| | if (!shard.has_value()) {
|
| | fprintf(stderr, "gguf_fetch: failed to fetch shard %d: %s\n", i, shard_name.c_str());
|
| | return std::nullopt;
|
| | }
|
| |
|
| | model.tensors.insert(model.tensors.end(),
|
| | std::make_move_iterator(shard->tensors.begin()),
|
| | std::make_move_iterator(shard->tensors.end()));
|
| | }
|
| |
|
| | if (model.n_split_tensors > 0 && model.tensors.size() != model.n_split_tensors) {
|
| | fprintf(stderr, "gguf_fetch: WARNING: expected %u tensors from split.tensors.count, got %zu\n",
|
| | model.n_split_tensors, model.tensors.size());
|
| | }
|
| | }
|
| |
|
| | return model_opt;
|
| | }
|
| |
|