qwertyuiopasdfg committed on
Commit
ffe372d
·
verified ·
1 Parent(s): 956b539

Upload folder using huggingface_hub

Browse files
.ipynb_checkpoints/config-checkpoint.json CHANGED
@@ -4,15 +4,19 @@
4
  ],
5
  "attention_bias": false,
6
  "attention_dropout": 0.0,
 
 
 
 
7
  "bos_token_id": 128000,
8
  "eos_token_id": [
9
  128001,
10
  128008,
11
  128009
12
  ],
13
- "head_dim": 128,
14
  "hidden_act": "silu",
15
- "hidden_size": 4096,
16
  "id2label": {
17
  "0": "False_Correct:NA",
18
  "1": "False_Misconception:Adding_across",
@@ -81,7 +85,7 @@
81
  "64": "True_Neither:NA"
82
  },
83
  "initializer_range": 0.02,
84
- "intermediate_size": 14336,
85
  "label2id": {
86
  "False_Correct:NA": 0,
87
  "False_Misconception:Adding_across": 1,
@@ -153,35 +157,21 @@
153
  "mlp_bias": false,
154
  "model_type": "llama",
155
  "num_attention_heads": 32,
156
- "num_hidden_layers": 32,
157
  "num_key_value_heads": 8,
158
  "pad_token_id": 128004,
159
  "pretraining_tp": 1,
160
- "quantization_config": {
161
- "_load_in_4bit": true,
162
- "_load_in_8bit": false,
163
- "bnb_4bit_compute_dtype": "bfloat16",
164
- "bnb_4bit_quant_storage": "uint8",
165
- "bnb_4bit_quant_type": "nf4",
166
- "bnb_4bit_use_double_quant": true,
167
- "llm_int8_enable_fp32_cpu_offload": false,
168
- "llm_int8_has_fp16_weight": false,
169
- "llm_int8_skip_modules": null,
170
- "llm_int8_threshold": 6.0,
171
- "load_in_4bit": true,
172
- "load_in_8bit": false,
173
- "quant_method": "bitsandbytes"
174
- },
175
  "rms_norm_eps": 1e-05,
176
  "rope_scaling": {
177
- "factor": 8.0,
178
  "high_freq_factor": 4.0,
179
  "low_freq_factor": 1.0,
180
  "original_max_position_embeddings": 8192,
181
  "rope_type": "llama3"
182
  },
183
  "rope_theta": 500000.0,
184
- "tie_word_embeddings": false,
185
  "torch_dtype": "bfloat16",
186
  "transformers_version": "4.54.1",
187
  "use_cache": false,
 
4
  ],
5
  "attention_bias": false,
6
  "attention_dropout": 0.0,
7
+ "auto_map": {
8
+ "AutoModelForSequenceClassification": "modeling_custom.LlamaForSequenceClassificationWithCustomHead",
9
+ "LlamaForSequenceClassification": "modeling_custom.LlamaForSequenceClassificationWithCustomHead"
10
+ },
11
  "bos_token_id": 128000,
12
  "eos_token_id": [
13
  128001,
14
  128008,
15
  128009
16
  ],
17
+ "head_dim": 64,
18
  "hidden_act": "silu",
19
+ "hidden_size": 2048,
20
  "id2label": {
21
  "0": "False_Correct:NA",
22
  "1": "False_Misconception:Adding_across",
 
85
  "64": "True_Neither:NA"
86
  },
87
  "initializer_range": 0.02,
88
+ "intermediate_size": 8192,
89
  "label2id": {
90
  "False_Correct:NA": 0,
91
  "False_Misconception:Adding_across": 1,
 
157
  "mlp_bias": false,
158
  "model_type": "llama",
159
  "num_attention_heads": 32,
160
+ "num_hidden_layers": 16,
161
  "num_key_value_heads": 8,
162
  "pad_token_id": 128004,
163
  "pretraining_tp": 1,
164
+ "problem_type": "single_label_classification",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
165
  "rms_norm_eps": 1e-05,
166
  "rope_scaling": {
167
+ "factor": 32.0,
168
  "high_freq_factor": 4.0,
169
  "low_freq_factor": 1.0,
170
  "original_max_position_embeddings": 8192,
171
  "rope_type": "llama3"
172
  },
173
  "rope_theta": 500000.0,
174
+ "tie_word_embeddings": true,
175
  "torch_dtype": "bfloat16",
176
  "transformers_version": "4.54.1",
177
  "use_cache": false,
.ipynb_checkpoints/modeling_custom-checkpoint.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ from typing import Optional, Union, Tuple, List
5
+ from transformers import LlamaForSequenceClassification, Cache
6
+ from transformers.modeling_outputs import SequenceClassifierOutputWithPast
7
+
8
+
9
+ # 1. Label smoothing cross entropy loss
10
# 1. Cross entropy with label smoothing: the target distribution mixes the
#    one-hot label with a uniform distribution, weighted by ``eps``.
class LabelSmoothingCrossEntropy(nn.Module):
    """Label-smoothing cross-entropy over raw logits.

    ``eps`` is the fraction of probability mass spread uniformly across all
    classes; the remaining ``1 - eps`` stays on the true class.
    """

    def __init__(self, eps: float = 0.1):
        super().__init__()
        self.eps = eps  # smoothing factor in [0, 1)

    def forward(self, pred, target):
        # pred: [batch, num_classes] logits; target: [batch] class indices.
        log_probs = F.log_softmax(pred, dim=-1)
        n_classes = log_probs.size(-1)
        # Per-sample negative log-likelihood of the true class.
        true_class_loss = F.nll_loss(log_probs, target, reduction='none')
        # Per-sample sum of negative log-probs (uniform-target component).
        uniform_sum = -log_probs.sum(dim=-1)
        combined = (1 - self.eps) * true_class_loss + self.eps * uniform_sum / n_classes
        return combined.mean()
22
+
23
+
24
+ # 2. Custom classification head
25
# 2. Custom classification head: three Linear layers with SELU activations.
class Weights(nn.Module):
    """MLP head mapping pooled hidden states to classification logits.

    The stack is registered as ``self.fc`` (an ``nn.Sequential``) so the
    checkpoint state-dict keys (``fc.0``, ``fc.2``, ``fc.4``) stay stable.
    """

    def __init__(self, hidden_size: int = 4096, num_labels: int = 65):
        super().__init__()
        layers = [
            nn.Linear(hidden_size, hidden_size),
            nn.SELU(),
            nn.Linear(hidden_size, hidden_size),
            nn.SELU(),
            nn.Linear(hidden_size, num_labels),
        ]
        self.fc = nn.Sequential(*layers)

    def forward(self, x):
        # Returns raw logits; softmax is left to the loss function / caller.
        return self.fc(x)
38
+
39
+
40
+ # 3. Modified LLaMA model
41
# 3. Modified LLaMA model: swaps the stock linear ``score`` head for the
#    ``Weights`` MLP and trains with label-smoothing cross entropy.
class LlamaForSequenceClassificationWithCustomHead(LlamaForSequenceClassification):
    """LLaMA sequence classifier with a 3-layer SELU MLP head.

    Pooling follows the upstream convention: the hidden state of the last
    non-padding token of each sequence is fed to the classification head.
    NOTE(review): like upstream, this assumes right-padding — confirm the
    tokenizer pads on the right.
    """

    def __init__(self, config):
        super().__init__(config)
        # Fix: derive the label count from the config instead of hard-coding
        # 65 (this checkpoint's config declares 65 labels via id2label).
        self.weights = Weights(hidden_size=config.hidden_size,
                               num_labels=config.num_labels)
        self.loss_fn = LabelSmoothingCrossEntropy(eps=0.1)

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
        """Run the base model, pool the last non-pad token, classify.

        Arguments mirror ``LlamaForSequenceClassification.forward``.
        ``labels`` (shape ``[batch]``) triggers the label-smoothing loss.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        hidden_states = transformer_outputs[0]  # [batch_size, seq_len, hidden_size]

        if input_ids is not None:
            batch_size = input_ids.shape[0]
        else:
            batch_size = inputs_embeds.shape[0]

        if self.config.pad_token_id is None and batch_size != 1:
            raise ValueError("Padding token ID must be set for batch size > 1.")

        # Index of the last non-padding token per sequence. argmax finds the
        # FIRST pad position; -1 steps back to the last real token. For a row
        # with no padding, argmax is 0, so -1 % seq_len wraps to the last token.
        if self.config.pad_token_id is not None and input_ids is not None:
            sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(dim=-1) - 1
            sequence_lengths = sequence_lengths % input_ids.shape[-1]
        else:
            sequence_lengths = -1  # no pad token defined: assume last token

        pooled_hidden = hidden_states[torch.arange(batch_size, device=hidden_states.device), sequence_lengths]
        logits = self.weights(pooled_hidden)  # [batch_size, num_labels]

        loss = None
        if labels is not None:
            labels = labels.to(logits.device)
            loss = self.loss_fn(logits, labels)

        if not return_dict:
            # Fix: follow the upstream tuple layout — logits first, then the
            # base model's remaining outputs (the original dropped them);
            # prepend the loss when it was computed.
            output = (logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutputWithPast(
            loss=loss,
            logits=logits,  # shape [batch_size, num_labels]
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )
config.json CHANGED
@@ -4,15 +4,19 @@
4
  ],
5
  "attention_bias": false,
6
  "attention_dropout": 0.0,
 
 
 
 
7
  "bos_token_id": 128000,
8
  "eos_token_id": [
9
  128001,
10
  128008,
11
  128009
12
  ],
13
- "head_dim": 128,
14
  "hidden_act": "silu",
15
- "hidden_size": 4096,
16
  "id2label": {
17
  "0": "False_Correct:NA",
18
  "1": "False_Misconception:Adding_across",
@@ -81,7 +85,7 @@
81
  "64": "True_Neither:NA"
82
  },
83
  "initializer_range": 0.02,
84
- "intermediate_size": 14336,
85
  "label2id": {
86
  "False_Correct:NA": 0,
87
  "False_Misconception:Adding_across": 1,
@@ -153,35 +157,21 @@
153
  "mlp_bias": false,
154
  "model_type": "llama",
155
  "num_attention_heads": 32,
156
- "num_hidden_layers": 32,
157
  "num_key_value_heads": 8,
158
  "pad_token_id": 128004,
159
  "pretraining_tp": 1,
160
- "quantization_config": {
161
- "_load_in_4bit": true,
162
- "_load_in_8bit": false,
163
- "bnb_4bit_compute_dtype": "bfloat16",
164
- "bnb_4bit_quant_storage": "uint8",
165
- "bnb_4bit_quant_type": "nf4",
166
- "bnb_4bit_use_double_quant": true,
167
- "llm_int8_enable_fp32_cpu_offload": false,
168
- "llm_int8_has_fp16_weight": false,
169
- "llm_int8_skip_modules": null,
170
- "llm_int8_threshold": 6.0,
171
- "load_in_4bit": true,
172
- "load_in_8bit": false,
173
- "quant_method": "bitsandbytes"
174
- },
175
  "rms_norm_eps": 1e-05,
176
  "rope_scaling": {
177
- "factor": 8.0,
178
  "high_freq_factor": 4.0,
179
  "low_freq_factor": 1.0,
180
  "original_max_position_embeddings": 8192,
181
  "rope_type": "llama3"
182
  },
183
  "rope_theta": 500000.0,
184
- "tie_word_embeddings": false,
185
  "torch_dtype": "bfloat16",
186
  "transformers_version": "4.54.1",
187
  "use_cache": false,
 
4
  ],
5
  "attention_bias": false,
6
  "attention_dropout": 0.0,
7
+ "auto_map": {
8
+ "AutoModelForSequenceClassification": "modeling_custom.LlamaForSequenceClassificationWithCustomHead",
9
+ "LlamaForSequenceClassification": "modeling_custom.LlamaForSequenceClassificationWithCustomHead"
10
+ },
11
  "bos_token_id": 128000,
12
  "eos_token_id": [
13
  128001,
14
  128008,
15
  128009
16
  ],
17
+ "head_dim": 64,
18
  "hidden_act": "silu",
19
+ "hidden_size": 2048,
20
  "id2label": {
21
  "0": "False_Correct:NA",
22
  "1": "False_Misconception:Adding_across",
 
85
  "64": "True_Neither:NA"
86
  },
87
  "initializer_range": 0.02,
88
+ "intermediate_size": 8192,
89
  "label2id": {
90
  "False_Correct:NA": 0,
91
  "False_Misconception:Adding_across": 1,
 
157
  "mlp_bias": false,
158
  "model_type": "llama",
159
  "num_attention_heads": 32,
160
+ "num_hidden_layers": 16,
161
  "num_key_value_heads": 8,
162
  "pad_token_id": 128004,
163
  "pretraining_tp": 1,
164
+ "problem_type": "single_label_classification",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
165
  "rms_norm_eps": 1e-05,
166
  "rope_scaling": {
167
+ "factor": 32.0,
168
  "high_freq_factor": 4.0,
169
  "low_freq_factor": 1.0,
170
  "original_max_position_embeddings": 8192,
171
  "rope_type": "llama3"
172
  },
173
  "rope_theta": 500000.0,
174
+ "tie_word_embeddings": true,
175
  "torch_dtype": "bfloat16",
176
  "transformers_version": "4.54.1",
177
  "use_cache": false,
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:343e6679cb37fbd9fd7ee26287ee71b692cf91f4d02780526d58e33050fa9c89
3
- size 4652605573
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9ce145be6003c8e4f68e8306b761a8285fcd8b362b285d19d737ff39df6b0766
3
+ size 2471911936
modeling_custom.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ from typing import Optional, Union, Tuple, List
5
+ from transformers import LlamaForSequenceClassification, Cache
6
+ from transformers.modeling_outputs import SequenceClassifierOutputWithPast
7
+
8
+
9
+ # 1. Label smoothing cross entropy loss
10
# 1. Cross entropy with label smoothing: the target distribution mixes the
#    one-hot label with a uniform distribution, weighted by ``eps``.
class LabelSmoothingCrossEntropy(nn.Module):
    """Label-smoothing cross-entropy over raw logits.

    ``eps`` is the fraction of probability mass spread uniformly across all
    classes; the remaining ``1 - eps`` stays on the true class.
    """

    def __init__(self, eps: float = 0.1):
        super().__init__()
        self.eps = eps  # smoothing factor in [0, 1)

    def forward(self, pred, target):
        # pred: [batch, num_classes] logits; target: [batch] class indices.
        log_probs = F.log_softmax(pred, dim=-1)
        n_classes = log_probs.size(-1)
        # Per-sample negative log-likelihood of the true class.
        true_class_loss = F.nll_loss(log_probs, target, reduction='none')
        # Per-sample sum of negative log-probs (uniform-target component).
        uniform_sum = -log_probs.sum(dim=-1)
        combined = (1 - self.eps) * true_class_loss + self.eps * uniform_sum / n_classes
        return combined.mean()
22
+
23
+
24
+ # 2. Custom classification head
25
# 2. Custom classification head: three Linear layers with SELU activations.
class Weights(nn.Module):
    """MLP head mapping pooled hidden states to classification logits.

    The stack is registered as ``self.fc`` (an ``nn.Sequential``) so the
    checkpoint state-dict keys (``fc.0``, ``fc.2``, ``fc.4``) stay stable.
    """

    def __init__(self, hidden_size: int = 4096, num_labels: int = 65):
        super().__init__()
        layers = [
            nn.Linear(hidden_size, hidden_size),
            nn.SELU(),
            nn.Linear(hidden_size, hidden_size),
            nn.SELU(),
            nn.Linear(hidden_size, num_labels),
        ]
        self.fc = nn.Sequential(*layers)

    def forward(self, x):
        # Returns raw logits; softmax is left to the loss function / caller.
        return self.fc(x)
38
+
39
+
40
+ # 3. Modified LLaMA model
41
# 3. Modified LLaMA model: swaps the stock linear ``score`` head for the
#    ``Weights`` MLP and trains with label-smoothing cross entropy.
class LlamaForSequenceClassificationWithCustomHead(LlamaForSequenceClassification):
    """LLaMA sequence classifier with a 3-layer SELU MLP head.

    Pooling follows the upstream convention: the hidden state of the last
    non-padding token of each sequence is fed to the classification head.
    NOTE(review): like upstream, this assumes right-padding — confirm the
    tokenizer pads on the right.
    """

    def __init__(self, config):
        super().__init__(config)
        # Fix: derive the label count from the config instead of hard-coding
        # 65 (this checkpoint's config declares 65 labels via id2label).
        self.weights = Weights(hidden_size=config.hidden_size,
                               num_labels=config.num_labels)
        self.loss_fn = LabelSmoothingCrossEntropy(eps=0.1)

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
        """Run the base model, pool the last non-pad token, classify.

        Arguments mirror ``LlamaForSequenceClassification.forward``.
        ``labels`` (shape ``[batch]``) triggers the label-smoothing loss.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        hidden_states = transformer_outputs[0]  # [batch_size, seq_len, hidden_size]

        if input_ids is not None:
            batch_size = input_ids.shape[0]
        else:
            batch_size = inputs_embeds.shape[0]

        if self.config.pad_token_id is None and batch_size != 1:
            raise ValueError("Padding token ID must be set for batch size > 1.")

        # Index of the last non-padding token per sequence. argmax finds the
        # FIRST pad position; -1 steps back to the last real token. For a row
        # with no padding, argmax is 0, so -1 % seq_len wraps to the last token.
        if self.config.pad_token_id is not None and input_ids is not None:
            sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(dim=-1) - 1
            sequence_lengths = sequence_lengths % input_ids.shape[-1]
        else:
            sequence_lengths = -1  # no pad token defined: assume last token

        pooled_hidden = hidden_states[torch.arange(batch_size, device=hidden_states.device), sequence_lengths]
        logits = self.weights(pooled_hidden)  # [batch_size, num_labels]

        loss = None
        if labels is not None:
            labels = labels.to(logits.device)
            loss = self.loss_fn(logits, labels)

        if not return_dict:
            # Fix: follow the upstream tuple layout — logits first, then the
            # base model's remaining outputs (the original dropped them);
            # prepend the loss when it was computed.
            output = (logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutputWithPast(
            loss=loss,
            logits=logits,  # shape [batch_size, num_labels]
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:28d7ddb1196b0200360e3a8e353a50a066a80af2e417d8f4ec2352dc8cc92e68
3
+ size 5304