Conversion to GGUF
First, prepare a Python environment and install the following dependencies:

```shell
pip install torch transformers gguf sentencepiece
```

And then install llama.cpp.
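One way to install it is to build it from source. This is a minimal sketch, assuming a Make-based build (newer llama.cpp versions have moved to CMake, so the exact build command may differ):

```shell
# clone llama.cpp and build it from source
# (older versions build with `make`; newer versions use CMake instead)
git clone https://github.com/ggerganov/llama.cpp
make -C llama.cpp
```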
Then, edit `llama.cpp/examples/llava/convert-image-encoder-to-gguf.py` to support SigLIP: when importing packages, change

```python
from transformers import CLIPModel, CLIPProcessor, CLIPVisionModel
```

to

```python
from transformers import SiglipModel as CLIPModel
from transformers import SiglipProcessor as CLIPProcessor
from transformers import SiglipVisionModel as CLIPVisionModel
```
Then, edit `llama.cpp/convert-hf-to-gguf.py` to skip unknown parts: change

```python
def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", ".bias")) -> str:
    new_name = self.tensor_map.get_name(key=name, try_suffixes=try_suffixes)
    if new_name is None:
        raise ValueError(f"Can not map tensor {name!r}")
    return new_name
```

to

```python
def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", ".bias")) -> str:
    new_name = self.tensor_map.get_name(key=name, try_suffixes=try_suffixes)
    return new_name
```

and change

```python
def write_tensors(self):
    max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,")

    for name, data_torch in self.get_tensors():
        ...
        for new_name, data in ((n, d.squeeze().numpy()) for n, d in self.modify_tensors(data_torch, name, bid)):
            data: np.ndarray = data  # type hint
            n_dims = len(data.shape)
            ...
```

to

```python
def write_tensors(self):
    max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,")

    for name, data_torch in self.get_tensors():
        ...
        for new_name, data in ((n, d.squeeze().numpy()) for n, d in self.modify_tensors(data_torch, name, bid)):
            if new_name is None:
                continue
            data: np.ndarray = data  # type hint
            n_dims = len(data.shape)
            ...
```
converting Bunny-Llama-3-8B-V
```shell
cd llama.cpp/examples/llava
```

Download the weights and put them under `./`.
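If the weights are not already local, one way to fetch them is from the Hugging Face Hub; the repository name below is an assumption, and any download method that places the full checkpoint in `./Bunny-Llama-3-8B-V` works:

```shell
# download the checkpoint into ./Bunny-Llama-3-8B-V (requires git-lfs)
git lfs install
git clone https://huggingface.co/BAAI/Bunny-Llama-3-8B-V
```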
Extract the weights of the vision tower and the multimodal projector:

```shell
python llava-surgery-v2.py -C -m Bunny-Llama-3-8B-V
```

You will find a `llava.projector` and a `llava.clip` file in `Bunny-Llama-3-8B-V`.

Create the visual GGUF model:
Prepare the files:

```shell
cd Bunny-Llama-3-8B-V
mkdir vit
cp llava.clip vit/pytorch_model.bin
cp llava.projector vit/
```

and put `config.json` (see the Appendix) under `vit/`, and then:

```shell
python ../convert-image-encoder-to-gguf.py -m vit --llava-projector vit/llava.projector --output-dir vit --clip-model-is-vision --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5
cd ..
```

You will find a `mmproj-model-f16.gguf` file in `Bunny-Llama-3-8B-V/vit`.
Convert the remaining language part:

Edit `Bunny-Llama-3-8B-V/config.json`: change

```json
"architectures": [
    "BunnyLlamaForCausalLM"
],
"auto_map": {
    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
},
```

to

```json
"architectures": [
    "LlamaForCausalLM"
],
```

And then:

```shell
python ../../convert-hf-to-gguf.py Bunny-Llama-3-8B-V
```

You will find a `ggml-model-f16.gguf` file in `Bunny-Llama-3-8B-V`.
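At this point both GGUF files needed for inference exist. As a quick sanity check they can be loaded with the llava example in llama.cpp; the sketch below assumes the `llava-cli` binary built from the llama.cpp root (binary name and flags may differ across llama.cpp versions, and the image path is a placeholder):

```shell
# from the llama.cpp root; newer versions may name the binary llama-llava-cli
./llava-cli \
    -m examples/llava/Bunny-Llama-3-8B-V/ggml-model-f16.gguf \
    --mmproj examples/llava/Bunny-Llama-3-8B-V/vit/mmproj-model-f16.gguf \
    --image /path/to/an/image.jpg \
    -p "Describe this image in detail."
```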
converting Bunny-v1_0-4B
```shell
cd llama.cpp/examples/llava
```

Download the weights and put them under `./`.

Extract the weights of the vision tower and the multimodal projector:

```shell
python llava-surgery-v2.py -C -m Bunny-v1_0-4B
```

You will find a `llava.projector` and a `llava.clip` file in `Bunny-v1_0-4B`.

Create the visual GGUF model:
Prepare the files:

```shell
cd Bunny-v1_0-4B
mkdir vit
cp llava.clip vit/pytorch_model.bin
cp llava.projector vit/
```

and put `config.json` (see the Appendix) under `vit/`, and then:

```shell
python ../convert-image-encoder-to-gguf.py -m vit --llava-projector vit/llava.projector --output-dir vit --clip-model-is-vision --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5
cd ..
```

You will find a `mmproj-model-f16.gguf` file in `Bunny-v1_0-4B/vit`.
Convert the remaining language part:

Edit `Bunny-v1_0-4B/config.json`: change

```json
"architectures": [
    "BunnyPhi3ForCausalLM"
],
"attention_dropout": 0.0,
"auto_map": {
    "AutoConfig": "configuration_bunny_phi3.BunnyPhi3Config",
    "AutoModelForCausalLM": "modeling_bunny_phi3.BunnyPhi3ForCausalLM"
},
```

to

```json
"architectures": [
    "Phi3ForCausalLM"
],
"attention_dropout": 0.0,
"auto_map": {
    "AutoConfig": "configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM"
},
```

And then:

```shell
python ../../convert-hf-to-gguf.py Bunny-v1_0-4B
```

You will find a `ggml-model-f16.gguf` file in `Bunny-v1_0-4B`.
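The exported language model is stored in f16. Optionally, it can be quantized to reduce memory use. This is a minimal sketch, assuming the `quantize` tool built alongside llama.cpp (newer versions name the binary `llama-quantize`); the mmproj file is left in f16:

```shell
# from the llama.cpp root; quantize only the language model, not the mmproj file
./quantize examples/llava/Bunny-v1_0-4B/ggml-model-f16.gguf \
           examples/llava/Bunny-v1_0-4B/ggml-model-Q4_K_M.gguf Q4_K_M
```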
Appendix
`vit/config.json`:

```json
{
"architectures": [
"SiglipVisionModel"
],
"attention_bias": false,
"attention_dropout": 0.0,
"bos_token_id": 128000,
"eos_token_id": 128001,
"freeze_mm_mlp_adapter": false,
"hidden_act": "gelu_pytorch_tanh",
"hidden_size": 1152,
"image_size": 384,
"image_aspect_ratio": "pad",
"initializer_range": 0.02,
"intermediate_size": 4304,
"layer_norm_eps": 1e-6,
"max_position_embeddings": 8192,
"mm_hidden_size": 1152,
"mm_projector_lr": 1e-05,
"mm_projector_type": "mlp2x_gelu",
"mm_vision_tower": "google/siglip-so400m-patch14-384",
"model_type": "siglip_vision_model",
"num_attention_heads": 16,
"num_hidden_layers": 27,
"num_key_value_heads": 8,
"patch_size": 14,
"pretraining_tp": 1,
"projection_dim": 1152,
"rms_norm_eps": 1e-05,
"rope_scaling": null,
"rope_theta": 500000.0,
"tie_word_embeddings": false,
"tokenizer_model_max_length": 2048,
"tokenizer_padding_side": "right",
"torch_dtype": "float16",
"transformers_version": "4.40.0",
"tune_mm_mlp_adapter": false,
"unfreeze_vision_tower": true,
"use_cache": true,
"use_mm_proj": true,
"vocab_size": 128256
}
```