Upload folder using huggingface_hub

Files changed (8) hide show

.gitattributes CHANGED Viewed

@@ -34,3 +34,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 base/tokenizer.json filter=lfs diff=lfs merge=lfs -text

 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 base/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

@@ -0,0 +1 @@
+.cache/

README.md CHANGED Viewed

@@ -75,11 +75,11 @@ The ~5% per-prompt embedding cosine drift (0.953 mean vs bf16) propagates into ~
 ```
 .
-├── base/                       # bnb-quantized Llama-3-8B + merged MNTP
-│   ├── config.json             # contains quantization_config (auto-applied on load)
-│   ├── model.safetensors       # bnb 4-bit weights
-│   ├── tokenizer.json
-│   └── tokenizer_config.json
 ├── supervised_adapter/         # second LoRA stack (kept un-merged)
 │   ├── adapter_config.json
 │   └── adapter_model.safetensors
@@ -99,12 +99,12 @@ CUDA_VISIBLE_DEVICES=0 \
 TEXT_ENCODER_DEVICE=cuda:0 \
 TEXT_ENCODER_MODE=local \
 LLM2VEC_QUANTIZE=nf4 \
-LLM2VEC_LOCAL_BASE=$HOME/llm2vec-nf4/base \
 LLM2VEC_LOCAL_PEFT=$HOME/llm2vec-nf4/supervised_adapter \
 python kimodo_daemon.py
 ```
-`LLM2VEC_QUANTIZE=nf4` tells the wrapper to honor the bnb config in `base/config.json`. The two `LLM2VEC_LOCAL_*` vars short-circuit the Hub download.
 ## Standalone use (without kimodo)
@@ -118,7 +118,7 @@ from llm2vec import LLM2Vec  # from McGill's llm2vec package
 # Load the bnb-quantized base; quantization_config is in config.json so
 # transformers re-applies bnb automatically.
-base_dir = "<local clone>/base"
 adapter_dir = "<local clone>/supervised_adapter"
 model = LLM2Vec.from_pretrained(

 ```
 .
+├── config.json                 # contains quantization_config (auto-applied on load)
+├── model.safetensors           # bnb 4-bit weights (Llama-3-8B + merged MNTP)
+├── tokenizer.json
+├── tokenizer_config.json
+├── chat_template.jinja
 ├── supervised_adapter/         # second LoRA stack (kept un-merged)
 │   ├── adapter_config.json
 │   └── adapter_model.safetensors
 TEXT_ENCODER_DEVICE=cuda:0 \
 TEXT_ENCODER_MODE=local \
 LLM2VEC_QUANTIZE=nf4 \
+LLM2VEC_LOCAL_BASE=$HOME/llm2vec-nf4 \
 LLM2VEC_LOCAL_PEFT=$HOME/llm2vec-nf4/supervised_adapter \
 python kimodo_daemon.py
 ```
+`LLM2VEC_QUANTIZE=nf4` tells the wrapper to honor the bnb config in `config.json`. The two `LLM2VEC_LOCAL_*` vars short-circuit the Hub download.
 ## Standalone use (without kimodo)
 # Load the bnb-quantized base; quantization_config is in config.json so
 # transformers re-applies bnb automatically.
+base_dir = "<local clone>"
 adapter_dir = "<local clone>/supervised_adapter"
 model = LLM2Vec.from_pretrained(

chat_template.jinja ADDED Viewed

+{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>
+'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>
+' }}{% endif %}

config.json ADDED Viewed

+{
+  "architectures": [
+    "LlamaBiModel"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 128000,
+  "dtype": "bfloat16",
+  "eos_token_id": 128009,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mlp_bias": false,
+  "model_type": "llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pad_token_id": null,
+  "pretraining_tp": 1,
+  "quantization_config": {
+    "_load_in_4bit": true,
+    "_load_in_8bit": false,
+    "bnb_4bit_compute_dtype": "bfloat16",
+    "bnb_4bit_quant_storage": "uint8",
+    "bnb_4bit_quant_type": "nf4",
+    "bnb_4bit_use_double_quant": true,
+    "llm_int8_enable_fp32_cpu_offload": false,
+    "llm_int8_has_fp16_weight": false,
+    "llm_int8_skip_modules": null,
+    "llm_int8_threshold": 6.0,
+    "load_in_4bit": true,
+    "load_in_8bit": false,
+    "quant_method": "bitsandbytes"
+  },
+  "rms_norm_eps": 1e-05,
+  "rope_parameters": {
+    "rope_theta": 500000.0,
+    "rope_type": "default"
+  },
+  "tie_word_embeddings": false,
+  "transformers_version": "5.1.0",
+  "use_cache": true,
+  "vocab_size": 128256
+}

model.safetensors ADDED Viewed

+version https://git-lfs.github.com/spec/v1
+oid sha256:64d4bea7b029f5ea40c52eb84c11cb370ddb4d7d45f4e9f8d3babe92e3665c5d
+size 4652064697

tokenizer.json ADDED Viewed

+version https://git-lfs.github.com/spec/v1
+oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393
+size 17209961

tokenizer_config.json ADDED Viewed

+{
+  "backend": "tokenizers",
+  "bos_token": "<|begin_of_text|>",
+  "clean_up_tokenization_spaces": true,
+  "eos_token": "<|eot_id|>",
+  "is_local": true,
+  "model_input_names": [
+    "input_ids",
+    "attention_mask"
+  ],
+  "model_max_length": 1000000000000000019884624838656,
+  "tokenizer_class": "TokenizersBackend"
+}