ysong21 commited on
Commit
2227164
·
verified ·
1 Parent(s): 13a26a2

Overwrite adapter with checkpoint-125 (r64 epoch-1)

Browse files
Files changed (5) hide show
  1. README.md +55 -0
  2. adapter_config.json +15 -8
  3. adapter_model.safetensors +2 -2
  4. stats.json +11 -1
  5. train_config.json +33 -59
README.md ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: google/gemma-3-27b-it
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - lora
7
+ - peft
8
+ - gemma
9
+ - entropy
10
+ ---
11
+
12
+ # Entropy LoRA (Gemma 3 27B IT) - Updated Adapter
13
+
14
+ This repository contains a PEFT LoRA adapter for `google/gemma-3-27b-it`.
15
+
16
+ This upload supersedes the previous `entropy-v1` adapter with the **epoch-1 checkpoint** from the on-prem PEFT run (`r=64`).
17
+
18
+ ## vLLM (runtime LoRA)
19
+
20
+ Important: this adapter is **rank 64**, so vLLM must be started with `--max-lora-rank 64` (or higher).
21
+
22
+ Example:
23
+
24
+ ```bash
25
+ vllm serve google/gemma-3-27b-it \
26
+ --served-model-name google/gemma-3-27b-it \
27
+ --enable-lora \
28
+ --max-lora-rank 64 \
29
+ --lora-modules entropy-v1=ysong21/entropy-v1-lora
30
+ ```
31
+
32
+ ## Transformers + PEFT
33
+
34
+ ```python
35
+ from transformers import AutoModelForCausalLM, AutoTokenizer
36
+ from peft import PeftModel
37
+
38
+ base = "google/gemma-3-27b-it"
39
+ adapter = "ysong21/entropy-v1-lora"
40
+
41
+ tok = AutoTokenizer.from_pretrained(base)
42
+ model = AutoModelForCausalLM.from_pretrained(base, device_map="auto")
43
+ model = PeftModel.from_pretrained(model, adapter)
44
+ ```
45
+
46
+ ## Offline Eval (held-out)
47
+
48
+ Validation set: `data/validation.no_overlap.jsonl` (70 examples).
49
+
50
+ - Base `google/gemma-3-27b-it`: `bits_per_char=0.99565`
51
+ - Previous adapter (`ysong21/entropy-v1-lora`, old): `bits_per_char=0.36646`
52
+ - This adapter (epoch-1, `r=64`): `bits_per_char=0.35877`
53
+ - Baseline (`N8Programs/Unslopper-30B-A3B-bf16`): `bits_per_char=0.37522`
54
+
55
+ Note: token-based `ppl_cond` is not directly comparable across tokenizers/models; we rely on char-normalized `bits_per_char` for cross-model comparisons.
adapter_config.json CHANGED
@@ -1,39 +1,46 @@
1
  {
 
2
  "alpha_pattern": {},
 
3
  "auto_mapping": null,
4
- "base_model_name_or_path": null,
5
  "bias": "none",
6
  "corda_config": null,
 
7
  "eva_config": null,
8
  "exclude_modules": null,
9
  "fan_in_fan_out": false,
10
- "inference_mode": false,
11
  "init_lora_weights": true,
12
  "layer_replication": null,
13
  "layers_pattern": null,
14
  "layers_to_transform": null,
15
  "loftq_config": {},
16
- "lora_alpha": 64,
17
  "lora_bias": false,
18
  "lora_dropout": 0.05,
19
  "megatron_config": null,
20
  "megatron_core": "megatron.core",
21
  "modules_to_save": null,
22
  "peft_type": "LORA",
23
- "r": 32,
 
 
24
  "rank_pattern": {},
25
  "revision": null,
26
  "target_modules": [
27
  "up_proj",
28
- "q_proj",
 
29
  "gate_proj",
30
  "o_proj",
31
- "v_proj",
32
- "k_proj",
33
- "down_proj"
34
  ],
 
35
  "task_type": "CAUSAL_LM",
36
  "trainable_token_indices": null,
37
  "use_dora": false,
 
38
  "use_rslora": false
39
  }
 
1
  {
2
+ "alora_invocation_tokens": null,
3
  "alpha_pattern": {},
4
+ "arrow_config": null,
5
  "auto_mapping": null,
6
+ "base_model_name_or_path": "google/gemma-3-27b-it",
7
  "bias": "none",
8
  "corda_config": null,
9
+ "ensure_weight_tying": false,
10
  "eva_config": null,
11
  "exclude_modules": null,
12
  "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
  "init_lora_weights": true,
15
  "layer_replication": null,
16
  "layers_pattern": null,
17
  "layers_to_transform": null,
18
  "loftq_config": {},
19
+ "lora_alpha": 128,
20
  "lora_bias": false,
21
  "lora_dropout": 0.05,
22
  "megatron_config": null,
23
  "megatron_core": "megatron.core",
24
  "modules_to_save": null,
25
  "peft_type": "LORA",
26
+ "peft_version": "0.18.1",
27
+ "qalora_group_size": 16,
28
+ "r": 64,
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
32
  "up_proj",
33
+ "k_proj",
34
+ "down_proj",
35
  "gate_proj",
36
  "o_proj",
37
+ "q_proj",
38
+ "v_proj"
 
39
  ],
40
+ "target_parameters": null,
41
  "task_type": "CAUSAL_LM",
42
  "trainable_token_indices": null,
43
  "use_dora": false,
44
+ "use_qalora": false,
45
  "use_rslora": false
46
  }
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3e0e196aa158d3a11ef4b0b023921fae6410a0eedf6d74a5b6ff10c5d4dff1d8
3
- size 454197288
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ba6ec6b5878095233f6c2c21b1429aa46469ca968a20a6086aef9400048c878e
3
+ size 1864199752
stats.json CHANGED
@@ -1 +1,11 @@
1
- {"world_size": 4, "epochs": 1, "steps": 94, "seqs": 999, "tokens": 2906680, "last_epoch_steps": 0, "last_epoch_seqs": 0, "last_epoch_tokens": 0, "total_seqs": 999, "nan_in_loss_seqs": 0, "experiment_tracking_run_id": null, "loss_ema": 1.6103686253345062, "loss_sum": 33.14478254318237, "mtp_loss_ema": 0, "mtp_loss_sum": 0, "distillation_loss_ema": 0, "distillation_loss_sum": 0, "hard_loss_ema": 0, "hard_loss_sum": 0, "eval_losses_avg": []}
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "base_model": "google/gemma-3-27b-it",
3
+ "adapter_source": "data/entropy-v2-lora-r64-e5/checkpoint-125",
4
+ "eval": {
5
+ "data": "data/validation.no_overlap.jsonl",
6
+ "examples": 70,
7
+ "bits_per_char": 0.35876985070947004,
8
+ "ppl_cond": 3.075335556196064,
9
+ "loss_mean": 1.123414019271094
10
+ }
11
+ }
train_config.json CHANGED
@@ -1,61 +1,35 @@
1
  {
2
- "comet": false,
3
- "comet_api_key": null,
4
- "comet_workspace": null,
5
- "comet_project": null,
6
- "comet_run_id": "v6vlpnyh",
7
- "wandb": true,
8
- "wandb_entity": "maxsong-carnegie-mellon-university",
9
- "wandb_project": "entropy",
10
- "wandb_run_id": "v6vlpnyh",
11
- "base_model_dir": "/llm-downloader-destination/base/fireworks/gemma-3-27b-it/hf",
12
- "output_model_dir": "gs://fireworks-artifacts-maxx1999syp-bybv7vrv-254f13/tuned-model-v6vlpnyh/5a2aa8/gemma-3-27b-entropy-02082026/checkpoint",
13
- "checkpoint_dir": "/dev/shm/checkpoints",
14
- "gcs_checkpoint_dir": "gs://fireworks-artifacts-maxx1999syp-bybv7vrv-254f13/tuned-model-v6vlpnyh/5a2aa8/gemma-3-27b-entropy-02082026/checkpoints/checkpoints",
15
- "max_checkpoints_to_keep": 1,
16
- "checkpoint_interval": 3600,
17
- "save_final_checkpoint": false,
18
- "train": true,
19
- "learning_rate": 0.0002,
20
- "learning_rate_warmup_steps": 0,
21
- "grad_accum_steps": 1,
22
- "epochs": 1,
23
- "early_stop": false,
24
- "seed": 42,
25
- "dataset_dir": "/mnt/staging/dataset",
26
- "eval_auto_carveout": false,
27
- "eval_dataset_dir": null,
28
- "train_limit": null,
29
- "max_context_len": 8192,
30
- "batch_size": 32768,
31
- "batch_size_samples": null,
32
- "max_data_workers": 0,
33
- "min_evals_per_epoch": 1,
34
- "max_evals_per_epoch": 5,
35
- "precision": null,
36
- "status_file": "gs://fireworks-fine-tuning-job-status/sftj-maxx1999syp-bybv7vrv-v6vlpnyh-5d74d2ea-e4c7-4e3a-ae4d-deb107b98a9e",
37
- "billing_file": "gs://fireworks-fine-tuning-metadata/sftj-maxx1999syp-bybv7vrv-v6vlpnyh/billing-5d74d2ea-e4c7-4e3a-ae4d-deb107b98a9e",
38
- "metrics_file": "gs://fireworks-fine-tuning-metadata/sftj-maxx1999syp-bybv7vrv-v6vlpnyh/metrics.jsonl",
39
- "trainer_logs_file": null,
40
- "profile": null,
41
- "weight_sharding": null,
42
- "activation_sharding": null,
43
- "empty_weights": false,
44
- "nan_ratio_threshold": 0.05,
45
- "fast_api_port": 80,
46
- "optimizer": "adamw",
47
- "optimizer_weight_decay": 0.01,
48
- "target_shard_size_gb": null,
49
- "enable_fast_processor": false,
50
- "peft_addon_dir": null,
51
- "lora_rank": 32,
52
- "lora_dropout": 0.05,
53
- "template_kind": "conversation",
54
- "template": null,
55
- "mtp_config": { "enable_mtp": false, "freeze_base_model": false, "num_draft_tokens": 1 },
56
- "distillation_alpha": null,
57
- "qat": true,
58
- "kld": false,
59
- "teft_tokens": [],
60
- "skip_dataset_filtering": false
61
  }
 
1
  {
2
+ "trainer": "transformers.Trainer",
3
+ "peft": "LoRA",
4
+ "base_model": "google/gemma-3-27b-it",
5
+ "dataset": "N8Programs/unslop-good",
6
+ "objective": "PPL_cond on assistant tokens only; prompt masked up to and including <start_of_turn>model",
7
+ "max_length": 8704,
8
+ "lora": {
9
+ "r": 64,
10
+ "alpha": 128,
11
+ "dropout": 0.05,
12
+ "target_modules": ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
13
+ },
14
+ "optim": {
15
+ "optimizer": "adamw_torch_fused",
16
+ "learning_rate": 0.0001,
17
+ "lr_scheduler": "cosine",
18
+ "warmup_ratio": 0.03,
19
+ "weight_decay": 0.0
20
+ },
21
+ "batching": {
22
+ "per_device_train_batch_size": 1,
23
+ "gradient_accumulation_steps": 8
24
+ },
25
+ "precision": {
26
+ "bf16": true,
27
+ "tf32": true,
28
+ "gradient_checkpointing": true
29
+ },
30
+ "epochs": 5,
31
+ "selected_checkpoint": {
32
+ "checkpoint": "checkpoint-125",
33
+ "epoch": 1
34
+ }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
  }