Overwrite adapter with checkpoint-125 (r64 epoch-1)
Browse files
- README.md +55 -0
- adapter_config.json +15 -8
- adapter_model.safetensors +2 -2
- stats.json +11 -1
- train_config.json +33 -59
README.md
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
base_model: google/gemma-3-27b-it
|
| 3 |
+
library_name: peft
|
| 4 |
+
pipeline_tag: text-generation
|
| 5 |
+
tags:
|
| 6 |
+
- lora
|
| 7 |
+
- peft
|
| 8 |
+
- gemma
|
| 9 |
+
- entropy
|
| 10 |
+
---
|
| 11 |
+
|
| 12 |
+
# Entropy LoRA (Gemma 3 27B IT) - Updated Adapter
|
| 13 |
+
|
| 14 |
+
This repository contains a PEFT LoRA adapter for `google/gemma-3-27b-it`.
|
| 15 |
+
|
| 16 |
+
This upload replaces the previous `entropy-v1` adapter with the **epoch-1 checkpoint** from the on-prem PEFT run (`r=64`).
|
| 17 |
+
|
| 18 |
+
## vLLM (runtime LoRA)
|
| 19 |
+
|
| 20 |
+
Important: this adapter is **rank 64**, so vLLM must be started with `--max-lora-rank 64` (or higher).
|
| 21 |
+
|
| 22 |
+
Example:
|
| 23 |
+
|
| 24 |
+
```bash
|
| 25 |
+
vllm serve google/gemma-3-27b-it \
|
| 26 |
+
--served-model-name google/gemma-3-27b-it \
|
| 27 |
+
--enable-lora \
|
| 28 |
+
--max-lora-rank 64 \
|
| 29 |
+
--lora-modules entropy-v1=ysong21/entropy-v1-lora
|
| 30 |
+
```
|
| 31 |
+
|
| 32 |
+
## Transformers + PEFT
|
| 33 |
+
|
| 34 |
+
```python
|
| 35 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 36 |
+
from peft import PeftModel
|
| 37 |
+
|
| 38 |
+
base = "google/gemma-3-27b-it"
|
| 39 |
+
adapter = "ysong21/entropy-v1-lora"
|
| 40 |
+
|
| 41 |
+
tok = AutoTokenizer.from_pretrained(base)
|
| 42 |
+
model = AutoModelForCausalLM.from_pretrained(base, device_map="auto")
|
| 43 |
+
model = PeftModel.from_pretrained(model, adapter)
|
| 44 |
+
```
|
| 45 |
+
|
| 46 |
+
## Offline Eval (held-out)
|
| 47 |
+
|
| 48 |
+
Validation set: `data/validation.no_overlap.jsonl` (70 examples).
|
| 49 |
+
|
| 50 |
+
- Base `google/gemma-3-27b-it`: `bits_per_char=0.99565`
|
| 51 |
+
- Previous adapter (`ysong21/entropy-v1-lora`, old): `bits_per_char=0.36646`
|
| 52 |
+
- This adapter (epoch-1, `r=64`): `bits_per_char=0.35877`
|
| 53 |
+
- Baseline (`N8Programs/Unslopper-30B-A3B-bf16`): `bits_per_char=0.37522`
|
| 54 |
+
|
| 55 |
+
Note: token-based `ppl_cond` is not directly comparable across tokenizers/models; we rely on char-normalized `bits_per_char` for cross-model comparisons.
|
adapter_config.json
CHANGED
|
@@ -1,39 +1,46 @@
|
|
| 1 |
{
|
|
|
|
| 2 |
"alpha_pattern": {},
|
|
|
|
| 3 |
"auto_mapping": null,
|
| 4 |
-
"base_model_name_or_path":
|
| 5 |
"bias": "none",
|
| 6 |
"corda_config": null,
|
|
|
|
| 7 |
"eva_config": null,
|
| 8 |
"exclude_modules": null,
|
| 9 |
"fan_in_fan_out": false,
|
| 10 |
-
"inference_mode":
|
| 11 |
"init_lora_weights": true,
|
| 12 |
"layer_replication": null,
|
| 13 |
"layers_pattern": null,
|
| 14 |
"layers_to_transform": null,
|
| 15 |
"loftq_config": {},
|
| 16 |
-
"lora_alpha":
|
| 17 |
"lora_bias": false,
|
| 18 |
"lora_dropout": 0.05,
|
| 19 |
"megatron_config": null,
|
| 20 |
"megatron_core": "megatron.core",
|
| 21 |
"modules_to_save": null,
|
| 22 |
"peft_type": "LORA",
|
| 23 |
-
"
|
|
|
|
|
|
|
| 24 |
"rank_pattern": {},
|
| 25 |
"revision": null,
|
| 26 |
"target_modules": [
|
| 27 |
"up_proj",
|
| 28 |
-
"
|
|
|
|
| 29 |
"gate_proj",
|
| 30 |
"o_proj",
|
| 31 |
-
"
|
| 32 |
-
"
|
| 33 |
-
"down_proj"
|
| 34 |
],
|
|
|
|
| 35 |
"task_type": "CAUSAL_LM",
|
| 36 |
"trainable_token_indices": null,
|
| 37 |
"use_dora": false,
|
|
|
|
| 38 |
"use_rslora": false
|
| 39 |
}
|
|
|
|
| 1 |
{
|
| 2 |
+
"alora_invocation_tokens": null,
|
| 3 |
"alpha_pattern": {},
|
| 4 |
+
"arrow_config": null,
|
| 5 |
"auto_mapping": null,
|
| 6 |
+
"base_model_name_or_path": "google/gemma-3-27b-it",
|
| 7 |
"bias": "none",
|
| 8 |
"corda_config": null,
|
| 9 |
+
"ensure_weight_tying": false,
|
| 10 |
"eva_config": null,
|
| 11 |
"exclude_modules": null,
|
| 12 |
"fan_in_fan_out": false,
|
| 13 |
+
"inference_mode": true,
|
| 14 |
"init_lora_weights": true,
|
| 15 |
"layer_replication": null,
|
| 16 |
"layers_pattern": null,
|
| 17 |
"layers_to_transform": null,
|
| 18 |
"loftq_config": {},
|
| 19 |
+
"lora_alpha": 128,
|
| 20 |
"lora_bias": false,
|
| 21 |
"lora_dropout": 0.05,
|
| 22 |
"megatron_config": null,
|
| 23 |
"megatron_core": "megatron.core",
|
| 24 |
"modules_to_save": null,
|
| 25 |
"peft_type": "LORA",
|
| 26 |
+
"peft_version": "0.18.1",
|
| 27 |
+
"qalora_group_size": 16,
|
| 28 |
+
"r": 64,
|
| 29 |
"rank_pattern": {},
|
| 30 |
"revision": null,
|
| 31 |
"target_modules": [
|
| 32 |
"up_proj",
|
| 33 |
+
"k_proj",
|
| 34 |
+
"down_proj",
|
| 35 |
"gate_proj",
|
| 36 |
"o_proj",
|
| 37 |
+
"q_proj",
|
| 38 |
+
"v_proj"
|
|
|
|
| 39 |
],
|
| 40 |
+
"target_parameters": null,
|
| 41 |
"task_type": "CAUSAL_LM",
|
| 42 |
"trainable_token_indices": null,
|
| 43 |
"use_dora": false,
|
| 44 |
+
"use_qalora": false,
|
| 45 |
"use_rslora": false
|
| 46 |
}
|
adapter_model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ba6ec6b5878095233f6c2c21b1429aa46469ca968a20a6086aef9400048c878e
|
| 3 |
+
size 1864199752
|
stats.json
CHANGED
|
@@ -1 +1,11 @@
|
|
| 1 |
-
{
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"base_model": "google/gemma-3-27b-it",
|
| 3 |
+
"adapter_source": "data/entropy-v2-lora-r64-e5/checkpoint-125",
|
| 4 |
+
"eval": {
|
| 5 |
+
"data": "data/validation.no_overlap.jsonl",
|
| 6 |
+
"examples": 70,
|
| 7 |
+
"bits_per_char": 0.35876985070947004,
|
| 8 |
+
"ppl_cond": 3.075335556196064,
|
| 9 |
+
"loss_mean": 1.123414019271094
|
| 10 |
+
}
|
| 11 |
+
}
|
train_config.json
CHANGED
|
@@ -1,61 +1,35 @@
|
|
| 1 |
{
|
| 2 |
-
"
|
| 3 |
-
"
|
| 4 |
-
"
|
| 5 |
-
"
|
| 6 |
-
"
|
| 7 |
-
"
|
| 8 |
-
"
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
"
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
"
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
"
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
"
|
| 31 |
-
"
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
"precision": null,
|
| 36 |
-
"status_file": "gs://fireworks-fine-tuning-job-status/sftj-maxx1999syp-bybv7vrv-v6vlpnyh-5d74d2ea-e4c7-4e3a-ae4d-deb107b98a9e",
|
| 37 |
-
"billing_file": "gs://fireworks-fine-tuning-metadata/sftj-maxx1999syp-bybv7vrv-v6vlpnyh/billing-5d74d2ea-e4c7-4e3a-ae4d-deb107b98a9e",
|
| 38 |
-
"metrics_file": "gs://fireworks-fine-tuning-metadata/sftj-maxx1999syp-bybv7vrv-v6vlpnyh/metrics.jsonl",
|
| 39 |
-
"trainer_logs_file": null,
|
| 40 |
-
"profile": null,
|
| 41 |
-
"weight_sharding": null,
|
| 42 |
-
"activation_sharding": null,
|
| 43 |
-
"empty_weights": false,
|
| 44 |
-
"nan_ratio_threshold": 0.05,
|
| 45 |
-
"fast_api_port": 80,
|
| 46 |
-
"optimizer": "adamw",
|
| 47 |
-
"optimizer_weight_decay": 0.01,
|
| 48 |
-
"target_shard_size_gb": null,
|
| 49 |
-
"enable_fast_processor": false,
|
| 50 |
-
"peft_addon_dir": null,
|
| 51 |
-
"lora_rank": 32,
|
| 52 |
-
"lora_dropout": 0.05,
|
| 53 |
-
"template_kind": "conversation",
|
| 54 |
-
"template": null,
|
| 55 |
-
"mtp_config": { "enable_mtp": false, "freeze_base_model": false, "num_draft_tokens": 1 },
|
| 56 |
-
"distillation_alpha": null,
|
| 57 |
-
"qat": true,
|
| 58 |
-
"kld": false,
|
| 59 |
-
"teft_tokens": [],
|
| 60 |
-
"skip_dataset_filtering": false
|
| 61 |
}
|
|
|
|
| 1 |
{
|
| 2 |
+
"trainer": "transformers.Trainer",
|
| 3 |
+
"peft": "LoRA",
|
| 4 |
+
"base_model": "google/gemma-3-27b-it",
|
| 5 |
+
"dataset": "N8Programs/unslop-good",
|
| 6 |
+
"objective": "PPL_cond on assistant tokens only; prompt masked up to and including <start_of_turn>model",
|
| 7 |
+
"max_length": 8704,
|
| 8 |
+
"lora": {
|
| 9 |
+
"r": 64,
|
| 10 |
+
"alpha": 128,
|
| 11 |
+
"dropout": 0.05,
|
| 12 |
+
"target_modules": ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
|
| 13 |
+
},
|
| 14 |
+
"optim": {
|
| 15 |
+
"optimizer": "adamw_torch_fused",
|
| 16 |
+
"learning_rate": 0.0001,
|
| 17 |
+
"lr_scheduler": "cosine",
|
| 18 |
+
"warmup_ratio": 0.03,
|
| 19 |
+
"weight_decay": 0.0
|
| 20 |
+
},
|
| 21 |
+
"batching": {
|
| 22 |
+
"per_device_train_batch_size": 1,
|
| 23 |
+
"gradient_accumulation_steps": 8
|
| 24 |
+
},
|
| 25 |
+
"precision": {
|
| 26 |
+
"bf16": true,
|
| 27 |
+
"tf32": true,
|
| 28 |
+
"gradient_checkpointing": true
|
| 29 |
+
},
|
| 30 |
+
"epochs": 5,
|
| 31 |
+
"selected_checkpoint": {
|
| 32 |
+
"checkpoint": "checkpoint-125",
|
| 33 |
+
"epoch": 1
|
| 34 |
+
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
}
|