Deepu1965 committed on
Commit
1f7d75c
·
verified ·
1 Parent(s): 976f3ae

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ lora_moe_training.png filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Bonus 3: LoRA for MoE Experts
2
+
3
+ ## Model
4
+
5
+ Parameter-efficient fine-tuning of Mixture-of-Experts using **LoRA (Low-Rank Adaptation)**.
6
+
7
+ ## Architecture
8
+
9
+ - 4 transformer layers with MoE
10
+ - 8 experts per layer
11
+ - Top-2 routing
12
+ - LoRA rank: 16, alpha: 32
13
+
14
+ ## Parameter Efficiency
15
+
16
+ - **Total Parameters**: 55,228,676
17
+ - **Trainable (LoRA)**: 21,625,092 (39.16%)
18
+ - **Frozen (Base)**: 33,603,584 (60.84%)
19
+ - **Reduction**: 2.6x fewer trainable parameters
20
+
21
+ ## Performance
22
+
23
+ - **Validation Accuracy**: 0.6400
24
+ - **Dataset**: XSum (topic classification)
25
+ - **Training Samples**: 4,000
26
+
27
+ ## LoRA Benefits
28
+
29
+ 1. **Memory Efficient**: Only store small adapter matrices
30
+ 2. **Fast Training**: Fewer parameters to update
31
+ 3. **Task Switching**: Swap LoRA adapters for different tasks
32
+ 4. **Merge Friendly**: Can merge adapters back into base weights
33
+
34
+ ## Files
35
+
36
+ - `model.pt`: Full model checkpoint
37
+ - `lora_adapters.pt`: Only LoRA parameters (smaller file)
38
+ - `metrics.json`: Training metrics and config
39
+ - `history.csv`: Training history
40
+
41
+ ## Usage
42
+
43
+ ```python
44
+ # Load full model
45
+ checkpoint = torch.load('model.pt')
46
+ model.load_state_dict(checkpoint['model_state_dict'])
47
+
48
+ # Or load only LoRA adapters (requires base model)
49
+ lora_checkpoint = torch.load('lora_adapters.pt')
50
+ model.load_state_dict(lora_checkpoint['lora_state_dict'], strict=False)
51
+ ```
history.csv ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ epoch,train_loss,train_accuracy,val_loss,val_accuracy
2
+ 1,0.8074325952529907,0.62525,0.8184478509426117,0.64
3
+ 2,0.7937552418708801,0.637,0.7908735847473145,0.64
4
+ 3,0.7901616661548615,0.6455,0.798002507686615,0.64
5
+ 4,0.7901241521835327,0.6365,0.8332968425750732,0.64
6
+ 5,0.7865016897916793,0.6465,0.7994629460573196,0.64
lora_adapters.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1cc75c50d7fd0e92374fc126b34a569515c0b2dca5e282ccd8466ce563c41d31
3
+ size 6334282
lora_moe_training.png ADDED

Git LFS Details

  • SHA256: 46131a0ccd7250ffecb578c2f6450d229741d14c7be5ed6de2863e6fa196b542
  • Pointer size: 131 Bytes
  • Size of remote file: 121 kB
metrics.json ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "history": [
3
+ {
4
+ "epoch": 1,
5
+ "train_loss": 0.8074325952529907,
6
+ "train_accuracy": 0.62525,
7
+ "val_loss": 0.8184478509426117,
8
+ "val_accuracy": 0.64
9
+ },
10
+ {
11
+ "epoch": 2,
12
+ "train_loss": 0.7937552418708801,
13
+ "train_accuracy": 0.637,
14
+ "val_loss": 0.7908735847473145,
15
+ "val_accuracy": 0.64
16
+ },
17
+ {
18
+ "epoch": 3,
19
+ "train_loss": 0.7901616661548615,
20
+ "train_accuracy": 0.6455,
21
+ "val_loss": 0.798002507686615,
22
+ "val_accuracy": 0.64
23
+ },
24
+ {
25
+ "epoch": 4,
26
+ "train_loss": 0.7901241521835327,
27
+ "train_accuracy": 0.6365,
28
+ "val_loss": 0.8332968425750732,
29
+ "val_accuracy": 0.64
30
+ },
31
+ {
32
+ "epoch": 5,
33
+ "train_loss": 0.7865016897916793,
34
+ "train_accuracy": 0.6465,
35
+ "val_loss": 0.7994629460573196,
36
+ "val_accuracy": 0.64
37
+ }
38
+ ],
39
+ "config": {
40
+ "tokenizer": "bert-base-uncased",
41
+ "max_seq_len": 128,
42
+ "hidden_dim": 512,
43
+ "num_experts": 8,
44
+ "top_k": 2,
45
+ "lora_rank": 16,
46
+ "lora_alpha": 32,
47
+ "batch_size": 16,
48
+ "learning_rate": 0.001,
49
+ "num_epochs": 5,
50
+ "seed": 42,
51
+ "device": "cuda",
52
+ "hf_repo": "Deepu1965/bonus3-lora-moe"
53
+ },
54
+ "param_counts": {
55
+ "trainable": 21625092,
56
+ "frozen": 33603584,
57
+ "total": 55228676
58
+ },
59
+ "expert_usage": [
60
+ 270.3500061035156,
61
+ 583.625,
62
+ 598.9650268554688,
63
+ 359.67999267578125,
64
+ 425.7900085449219,
65
+ 603.489990234375,
66
+ 1022.885009765625,
67
+ 231.21499633789062
68
+ ]
69
+ }
model.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:53be695e8fa32b0f3f871d66bcf00394b365fd99e0c747ad7a2db73979d059cd
3
+ size 221009538
tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
tokenizer/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "100": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "101": {
20
+ "content": "[CLS]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "102": {
28
+ "content": "[SEP]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "103": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "clean_up_tokenization_spaces": true,
45
+ "cls_token": "[CLS]",
46
+ "do_lower_case": true,
47
+ "mask_token": "[MASK]",
48
+ "model_max_length": 512,
49
+ "pad_token": "[PAD]",
50
+ "sep_token": "[SEP]",
51
+ "strip_accents": null,
52
+ "tokenize_chinese_chars": true,
53
+ "tokenizer_class": "BertTokenizer",
54
+ "unk_token": "[UNK]"
55
+ }
tokenizer/vocab.txt ADDED
The diff for this file is too large to render. See raw diff