| | #ifndef MTMD_H
|
| | #define MTMD_H
|
| |
|
| | #include "ggml.h"
|
| | #include "llama.h"
|
| |
|
| | #include <stddef.h>
|
| | #include <stdint.h>
|
| | #include <stdbool.h>
|
| |
|
| | #ifdef __cplusplus
|
| | #include <string>
|
| | #include <vector>
|
| | #include <cinttypes>
|
| | #include <memory>
|
| | #endif
|
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| |
|
// MTMD_API is the annotation placed on every public symbol declared in this header.
//   - LLAMA_SHARED not defined:                 expands to nothing
//   - LLAMA_SHARED on Windows (but not MinGW):  dllexport when LLAMA_BUILD is defined, dllimport otherwise
//   - LLAMA_SHARED on any other toolchain:      default symbol visibility
#if !defined(LLAMA_SHARED)
#    define MTMD_API
#elif defined(_WIN32) && !defined(__MINGW32__)
#    if defined(LLAMA_BUILD)
#        define MTMD_API __declspec(dllexport)
#    else
#        define MTMD_API __declspec(dllimport)
#    endif
#else
#    define MTMD_API __attribute__ ((visibility ("default")))
#endif
|
| |
|
| |
|
// Compile-time marker string for images.
// NOTE(review): presumably the default for mtmd_context_params::image_marker — confirm against the implementation.
#define MTMD_DEFAULT_IMAGE_MARKER "<__image__>"
|
| |
|
| | #ifdef __cplusplus
|
| | extern "C" {
|
| | #endif
|
| |
|
// Kind of payload carried by one input chunk (see mtmd_input_chunk_get_type).
// The numeric values are written out explicitly; they are the same values the
// implicit numbering assigns, so the ABI is unchanged.
enum mtmd_input_chunk_type {
    MTMD_INPUT_CHUNK_TYPE_TEXT  = 0,
    MTMD_INPUT_CHUNK_TYPE_IMAGE = 1,
    MTMD_INPUT_CHUNK_TYPE_AUDIO = 2,
};
|
| |
|
| |
|
// Handle types of the C API. They are only forward-declared in this header, so
// client code can hold pointers to them but cannot look inside or allocate them
// directly; the functions declared below create, query and free them.
struct mtmd_context;
struct mtmd_bitmap;
struct mtmd_image_tokens;
struct mtmd_input_chunk;
struct mtmd_input_chunks;
|
| |
|
// Text part of a tokenization request (passed to mtmd_tokenize as `text`).
struct mtmd_input_text {
    // Prompt text as a C string. The struct stores only the pointer, not a copy.
    const char * text;
    // NOTE(review): both flags are presumably forwarded to the text tokenizer
    // (add special tokens / parse special tokens) — confirm in the implementation.
    bool add_special;
    bool parse_special;
};
|
| |
|
| |
|
| |
|
| |
|
| |
|
// Typedefs that let C code refer to the API types without the `struct` keyword
// (C++ already allows this; the typedefs are harmless there).
typedef struct mtmd_context      mtmd_context;
typedef struct mtmd_bitmap       mtmd_bitmap;
typedef struct mtmd_image_tokens mtmd_image_tokens;
typedef struct mtmd_input_chunk  mtmd_input_chunk;
typedef struct mtmd_input_chunks mtmd_input_chunks;
typedef struct mtmd_input_text   mtmd_input_text;
|
| |
|
| | struct mtmd_context_params {
|
| | bool use_gpu;
|
| | bool print_timings;
|
| | int n_threads;
|
| | const char * image_marker;
|
| | const char * media_marker;
|
| | enum llama_flash_attn_type flash_attn_type;
|
| | bool warmup;
|
| |
|
| |
|
| | int image_min_tokens;
|
| | int image_max_tokens;
|
| |
|
| |
|
| | ggml_backend_sched_eval_callback cb_eval;
|
| | void * cb_eval_user_data;
|
| | };
|
| |
|
| | MTMD_API const char * mtmd_default_marker(void);
|
| |
|
| | MTMD_API struct mtmd_context_params mtmd_context_params_default(void);
|
| |
|
| |
|
| |
|
| | MTMD_API mtmd_context * mtmd_init_from_file(const char * mmproj_fname,
|
| | const struct llama_model * text_model,
|
| | const struct mtmd_context_params ctx_params);
|
| |
|
| | MTMD_API void mtmd_free(mtmd_context * ctx);
|
| |
|
| |
|
| | MTMD_API bool mtmd_decode_use_non_causal(mtmd_context * ctx);
|
| |
|
| |
|
| | MTMD_API bool mtmd_decode_use_mrope(mtmd_context * ctx);
|
| |
|
| |
|
| | MTMD_API bool mtmd_support_vision(mtmd_context * ctx);
|
| |
|
| |
|
| | MTMD_API bool mtmd_support_audio(mtmd_context * ctx);
|
| |
|
| |
|
| |
|
| | MTMD_API int mtmd_get_audio_bitrate(mtmd_context * ctx);
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| | MTMD_API mtmd_bitmap * mtmd_bitmap_init (uint32_t nx, uint32_t ny, const unsigned char * data);
|
| | MTMD_API mtmd_bitmap * mtmd_bitmap_init_from_audio(size_t n_samples, const float * data);
|
| | MTMD_API uint32_t mtmd_bitmap_get_nx (const mtmd_bitmap * bitmap);
|
| | MTMD_API uint32_t mtmd_bitmap_get_ny (const mtmd_bitmap * bitmap);
|
| | MTMD_API const unsigned char * mtmd_bitmap_get_data (const mtmd_bitmap * bitmap);
|
| | MTMD_API size_t mtmd_bitmap_get_n_bytes(const mtmd_bitmap * bitmap);
|
| | MTMD_API bool mtmd_bitmap_is_audio (const mtmd_bitmap * bitmap);
|
| | MTMD_API void mtmd_bitmap_free (mtmd_bitmap * bitmap);
|
| |
|
| |
|
| | MTMD_API const char * mtmd_bitmap_get_id(const mtmd_bitmap * bitmap);
|
| | MTMD_API void mtmd_bitmap_set_id(mtmd_bitmap * bitmap, const char * id);
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| | MTMD_API mtmd_input_chunks * mtmd_input_chunks_init(void);
|
| | MTMD_API size_t mtmd_input_chunks_size(const mtmd_input_chunks * chunks);
|
| | MTMD_API const mtmd_input_chunk * mtmd_input_chunks_get (const mtmd_input_chunks * chunks, size_t idx);
|
| | MTMD_API void mtmd_input_chunks_free(mtmd_input_chunks * chunks);
|
| |
|
| |
|
| |
|
| |
|
| |
|
| | MTMD_API enum mtmd_input_chunk_type mtmd_input_chunk_get_type (const mtmd_input_chunk * chunk);
|
| | MTMD_API const llama_token * mtmd_input_chunk_get_tokens_text (const mtmd_input_chunk * chunk, size_t * n_tokens_output);
|
| | MTMD_API const mtmd_image_tokens * mtmd_input_chunk_get_tokens_image(const mtmd_input_chunk * chunk);
|
| | MTMD_API size_t mtmd_input_chunk_get_n_tokens (const mtmd_input_chunk * chunk);
|
| |
|
| | MTMD_API const char * mtmd_input_chunk_get_id (const mtmd_input_chunk * chunk);
|
| |
|
| | MTMD_API llama_pos mtmd_input_chunk_get_n_pos (const mtmd_input_chunk * chunk);
|
| |
|
| |
|
| |
|
| |
|
| | MTMD_API mtmd_input_chunk * mtmd_input_chunk_copy(const mtmd_input_chunk * chunk);
|
| | MTMD_API void mtmd_input_chunk_free(mtmd_input_chunk * chunk);
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| | MTMD_API size_t mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens);
|
| | MTMD_API size_t mtmd_image_tokens_get_nx (const mtmd_image_tokens * image_tokens);
|
| | MTMD_API size_t mtmd_image_tokens_get_ny (const mtmd_image_tokens * image_tokens);
|
| | MTMD_API const char * mtmd_image_tokens_get_id (const mtmd_image_tokens * image_tokens);
|
| |
|
| | MTMD_API llama_pos mtmd_image_tokens_get_n_pos (const mtmd_image_tokens * image_tokens);
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| | MTMD_API int32_t mtmd_tokenize(mtmd_context * ctx,
|
| | mtmd_input_chunks * output,
|
| | const mtmd_input_text * text,
|
| | const mtmd_bitmap ** bitmaps,
|
| | size_t n_bitmaps);
|
| |
|
| |
|
| |
|
| | MTMD_API int32_t mtmd_encode(mtmd_context * ctx,
|
| | const mtmd_image_tokens * image_tokens);
|
| |
|
| |
|
| | MTMD_API int32_t mtmd_encode_chunk(mtmd_context * ctx,
|
| | const mtmd_input_chunk * chunk);
|
| |
|
| |
|
| |
|
| |
|
| | MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx);
|
| |
|
| |
|
| |
|
| | MTMD_API void mtmd_log_set(ggml_log_callback log_callback, void * user_data);
|
| |
|
| |
|
| |
|
| |
|
| | MTMD_API mtmd_input_chunks * mtmd_test_create_input_chunks(void);
|
| |
|
| | #ifdef __cplusplus
|
| | }
|
| | #endif
|
| |
|
| |
|
| |
|
| |
|
| |
|
| | #ifdef __cplusplus
|
| |
|
| | namespace mtmd {
|
| |
|
| | struct mtmd_context_deleter {
|
| | void operator()(mtmd_context * val) { mtmd_free(val); }
|
| | };
|
| | using context_ptr = std::unique_ptr<mtmd_context, mtmd_context_deleter>;
|
| |
|
| | struct mtmd_bitmap_deleter {
|
| | void operator()(mtmd_bitmap * val) { mtmd_bitmap_free(val); }
|
| | };
|
| | using bitmap_ptr = std::unique_ptr<mtmd_bitmap, mtmd_bitmap_deleter>;
|
| |
|
| | struct mtmd_input_chunks_deleter {
|
| | void operator()(mtmd_input_chunks * val) { mtmd_input_chunks_free(val); }
|
| | };
|
| | using input_chunks_ptr = std::unique_ptr<mtmd_input_chunks, mtmd_input_chunks_deleter>;
|
| |
|
| | struct mtmd_input_chunk_deleter {
|
| | void operator()(mtmd_input_chunk * val) { mtmd_input_chunk_free(val); }
|
| | };
|
| | using input_chunk_ptr = std::unique_ptr<mtmd_input_chunk, mtmd_input_chunk_deleter>;
|
| |
|
| | struct bitmap {
|
| | bitmap_ptr ptr;
|
| | bitmap() : ptr(nullptr) {}
|
| | bitmap(mtmd_bitmap * bitmap) : ptr(bitmap) {}
|
| | bitmap(bitmap && other) noexcept : ptr(std::move(other.ptr)) {}
|
| | bitmap(uint32_t nx, uint32_t ny, const unsigned char * data) {
|
| | ptr.reset(mtmd_bitmap_init(nx, ny, data));
|
| | }
|
| | ~bitmap() = default;
|
| | uint32_t nx() const { return mtmd_bitmap_get_nx(ptr.get()); }
|
| | uint32_t ny() const { return mtmd_bitmap_get_ny(ptr.get()); }
|
| | const unsigned char * data() const { return mtmd_bitmap_get_data(ptr.get()); }
|
| | size_t n_bytes() const { return mtmd_bitmap_get_n_bytes(ptr.get()); }
|
| | std::string id() const { return mtmd_bitmap_get_id(ptr.get()); }
|
| | void set_id(const char * id) const { mtmd_bitmap_set_id(ptr.get(), id); }
|
| | };
|
| |
|
| | struct bitmaps {
|
| | std::vector<bitmap> entries;
|
| | ~bitmaps() = default;
|
| |
|
| |
|
| |
|
| |
|
| | std::vector<const mtmd_bitmap *> c_ptr() {
|
| | std::vector<const mtmd_bitmap *> res(entries.size());
|
| | for (size_t i = 0; i < entries.size(); i++) {
|
| | res[i] = entries[i].ptr.get();
|
| | }
|
| | return res;
|
| | }
|
| | };
|
| |
|
| | struct input_chunks {
|
| | input_chunks_ptr ptr;
|
| | input_chunks() = default;
|
| | input_chunks(mtmd_input_chunks * chunks) : ptr(chunks) {}
|
| | ~input_chunks() = default;
|
| | size_t size() const { return mtmd_input_chunks_size(ptr.get()); }
|
| | const mtmd_input_chunk * operator[](size_t idx) const {
|
| | return mtmd_input_chunks_get(ptr.get(), idx);
|
| | }
|
| | };
|
| |
|
| | }
|
| |
|
| | #endif
|
| |
|
| | #endif
|
| |
|