yezdata committed (verified)
Commit 296800d · 1 Parent(s): 8cc43a7

Upload 9 files

.gitattributes CHANGED
@@ -1 +1,2 @@
  emcoder/model.safetensors filter=lfs diff=lfs merge=lfs -text
+ model.safetensors filter=lfs diff=lfs merge=lfs -text
configuration_emcoder.py ADDED
@@ -0,0 +1,38 @@
+ from transformers import PretrainedConfig
+
+ class EmCoderConfig(PretrainedConfig):
+     model_type = "emcoder"
+
+     def __init__(
+         self,
+         vocab_size=50265,
+         max_seq_len=512,
+         d_model=768,
+         n_head=12,
+         n_layers=6,
+         d_ffn=3072,
+         dropout=0.15,
+         num_labels=28,
+         base_encoder_path="",
+         id2label=None,
+         label2id=None,
+         **kwargs
+     ):
+         # Cast id2label keys to int (JSON serialization turns them into strings)
+         if id2label is not None:
+             id2label = {int(k): v for k, v in id2label.items()}
+
+         super().__init__(
+             id2label=id2label,
+             label2id=label2id,
+             **kwargs
+         )
+         self.vocab_size = vocab_size
+         self.max_seq_len = max_seq_len
+         self.d_model = d_model
+         self.n_head = n_head
+         self.n_layers = n_layers
+         self.d_ffn = d_ffn
+         self.dropout = dropout
+         self.num_labels = num_labels
+         self.base_encoder_path = base_encoder_path
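
The defaults above mirror the shipped model_config.json. A minimal sketch of round-tripping the config, using only standard PretrainedConfig methods (nothing repo-specific):

```python
import json
from configuration_emcoder import EmCoderConfig

config = EmCoderConfig()  # defaults mirror model_config.json (768-dim, 6 layers, 28 labels)

# JSON round trip: serialization turns id2label's int keys into strings;
# EmCoderConfig.__init__ casts them back to int on reload.
restored = EmCoderConfig.from_dict(json.loads(config.to_json_string()))
assert all(isinstance(k, int) for k in restored.id2label)
print(restored.model_type)  # "emcoder"
```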
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f79307191a44f91b6c9b7e2373062bd655a38efef31a16831e7629d18ce33f50
+ size 328565600
model_config.json ADDED
@@ -0,0 +1,78 @@
+ {
+     "auto_map": {
+         "AutoConfig": "configuration_emcoder.EmCoderConfig",
+         "AutoModel": "modeling_emcoder.EmCoder"
+     },
+     "architectures": [
+         "EmCoder"
+     ],
+     "vocab_size": 50265,
+     "max_seq_len": 512,
+     "d_model": 768,
+     "n_head": 12,
+     "n_layers": 6,
+     "d_ffn": 3072,
+     "dropout": 0.15,
+     "num_labels": 28,
+     "id2label": {
+         "0": "admiration",
+         "1": "amusement",
+         "2": "anger",
+         "3": "annoyance",
+         "4": "approval",
+         "5": "caring",
+         "6": "confusion",
+         "7": "curiosity",
+         "8": "desire",
+         "9": "disappointment",
+         "10": "disapproval",
+         "11": "disgust",
+         "12": "embarrassment",
+         "13": "excitement",
+         "14": "fear",
+         "15": "gratitude",
+         "16": "grief",
+         "17": "joy",
+         "18": "love",
+         "19": "nervousness",
+         "20": "optimism",
+         "21": "pride",
+         "22": "realization",
+         "23": "relief",
+         "24": "remorse",
+         "25": "sadness",
+         "26": "surprise",
+         "27": "neutral"
+     },
+     "label2id": {
+         "admiration": 0,
+         "amusement": 1,
+         "anger": 2,
+         "annoyance": 3,
+         "approval": 4,
+         "caring": 5,
+         "confusion": 6,
+         "curiosity": 7,
+         "desire": 8,
+         "disappointment": 9,
+         "disapproval": 10,
+         "disgust": 11,
+         "embarrassment": 12,
+         "excitement": 13,
+         "fear": 14,
+         "gratitude": 15,
+         "grief": 16,
+         "joy": 17,
+         "love": 18,
+         "nervousness": 19,
+         "optimism": 20,
+         "pride": 21,
+         "realization": 22,
+         "relief": 23,
+         "remorse": 24,
+         "sadness": 25,
+         "surprise": 26,
+         "neutral": 27
+     },
+     "base_encoder_path": "models/v1/pretrain/checkpoints/epoch_2/step_40000"
+ }
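
The auto_map block is what lets the Auto classes resolve this custom architecture when loading with trust_remote_code=True. A minimal loading sketch; the repo id is a placeholder, and it assumes the config is exposed under the standard name config.json, which is where AutoConfig looks:

```python
import torch
from transformers import AutoModel, AutoTokenizer

repo = "yezdata/emcoder"  # placeholder repo id

# trust_remote_code=True pulls configuration_emcoder.py / modeling_emcoder.py
# from the repo, as declared in auto_map above.
model = AutoModel.from_pretrained(repo, trust_remote_code=True).eval()
tokenizer = AutoTokenizer.from_pretrained(repo)

batch = tokenizer(["what a pleasant surprise!"], return_tensors="pt", padding=True)
with torch.no_grad():
    logits = model(batch["input_ids"], batch["attention_mask"])  # (1, 28)

# sigmoid assumes a multi-label (GoEmotions-style) head; use softmax instead
# if the classifier was trained single-label.
probs = torch.sigmoid(logits)
print(model.config.id2label[probs.argmax(dim=-1).item()])
```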
model_state.json ADDED
@@ -0,0 +1,4 @@
+ {
+     "train_loss": 0.264223575592041,
+     "eval_loss": 0.2328128303236821
+ }
modeling_emcoder.py ADDED
@@ -0,0 +1,100 @@
+ import torch
+ import torch.nn as nn
+ from transformers import PreTrainedModel
+ from .configuration_emcoder import EmCoderConfig
+
+
+ class EmCoderCore(nn.Module):
+     """The core encoder architecture of EmCoder, without the classification head."""
+
+     def __init__(self, config: EmCoderConfig):
+         super().__init__()
+
+         self.token_embedding = nn.Embedding(config.vocab_size, config.d_model)
+         self.pos_embedding = nn.Embedding(config.max_seq_len, config.d_model)
+         self.embed_norm = nn.LayerNorm(config.d_model)
+
+         encoder_layer = nn.TransformerEncoderLayer(
+             d_model=config.d_model,
+             nhead=config.n_head,
+             dim_feedforward=config.d_ffn,
+             dropout=config.dropout,
+             activation="gelu",
+             norm_first=True,
+             batch_first=True
+         )
+         self.encoder = nn.TransformerEncoder(
+             encoder_layer=encoder_layer,
+             num_layers=config.n_layers
+         )
+
+         self.final_norm = nn.LayerNorm(config.d_model)
+         self.dropout = nn.Dropout(config.dropout)
+
+     def forward(self, x: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
+         """Standard forward pass through the encoder."""
+         seq_len = x.size(1)
+         pos_ids = torch.arange(seq_len, device=x.device).unsqueeze(0)
+
+         x = self.token_embedding(x) + self.pos_embedding(pos_ids)
+
+         x = self.embed_norm(x)
+         x = self.dropout(x)
+
+         # nn.TransformerEncoder expects True at positions that should be ignored.
+         padding_mask = (mask == 0)
+
+         encoded = self.encoder(x, src_key_padding_mask=padding_mask)
+         return self.final_norm(encoded)
+
+
+ class EmCoder(PreTrainedModel):
+     """The full EmCoder model, including the classification head."""
+
+     config_class = EmCoderConfig
+
+     def __init__(self, config: EmCoderConfig):
+         super().__init__(config)
+
+         self.encoder = EmCoderCore(config)
+         self.classifier = nn.Sequential(
+             nn.Linear(config.d_model, config.d_model),
+             nn.GELU(),
+             nn.Dropout(config.dropout),
+             nn.Linear(config.d_model, config.num_labels)
+         )
+
+         self.post_init()
+
+     def _set_mc_dropout(self, active: bool = True):
+         # Toggle only the Dropout modules so MC sampling works while the
+         # rest of the model stays in eval mode.
+         for m in self.modules():
+             if isinstance(m, nn.Dropout):
+                 m.train(active)
+
+     @staticmethod
+     def _masked_mean_pooling(features: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
+         # Cast to float so the clamp below (min=1e-9) is valid on the counts.
+         mask = mask.unsqueeze(-1).float()                      # (B, S, 1)
+         masked_features = features * mask                      # (B, S, D)
+         sum_masked_features = masked_features.sum(dim=1)       # (B, D)
+         count_tokens = torch.clamp(mask.sum(dim=1), min=1e-9)  # (B, 1)
+         return sum_masked_features / count_tokens              # (B, D)
+
+     def mc_forward(self, x: torch.Tensor, mask: torch.Tensor, n_samples: int) -> torch.Tensor:
+         """Performs Monte Carlo Dropout inference to quantify epistemic uncertainty."""
+         self._set_mc_dropout(active=True)
+
+         B, S = x.shape
+         x_stacked = x.repeat(n_samples, 1)  # (n_samples * B, S)
+         mask_stacked = mask.repeat(n_samples, 1)
+
+         features = self.encoder(x_stacked, mask_stacked)
+         pooled = self._masked_mean_pooling(features, mask_stacked)
+         logits = self.classifier(pooled)  # (n_samples * B, num_labels)
+
+         return logits.view(n_samples, B, -1)
+
+     def forward(self, x: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
+         """Standard forward pass without MC Dropout."""
+         features = self.encoder(x, mask)
+         pooled = self._masked_mean_pooling(features, mask)
+         return self.classifier(pooled)
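
mc_forward keeps dropout active across n_samples stochastic passes and returns logits shaped (n_samples, B, num_labels); the mean over samples gives the prediction and the spread a per-label uncertainty estimate. A minimal sketch with random inputs (the relative import in modeling_emcoder.py means these modules normally load as a package, e.g. via trust_remote_code; adjust the imports if running standalone):

```python
import torch
from configuration_emcoder import EmCoderConfig
from modeling_emcoder import EmCoder

config = EmCoderConfig()
model = EmCoder(config).eval()  # eval() disables dropout; _set_mc_dropout re-enables it inside mc_forward

# Dummy batch: 2 sequences of 16 token ids, second one padded after 10 tokens.
x = torch.randint(0, config.vocab_size, (2, 16))
mask = torch.ones(2, 16, dtype=torch.long)
mask[1, 10:] = 0

with torch.no_grad():
    logits = model.mc_forward(x, mask, n_samples=20)  # (20, 2, 28)

probs = torch.sigmoid(logits)   # sigmoid assumes multi-label training
mean_probs = probs.mean(dim=0)  # (2, 28): predictive mean
spread = probs.std(dim=0)       # (2, 28): disagreement across dropout samples
print(mean_probs.shape, spread.max().item())
```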
requirements.txt ADDED
@@ -0,0 +1,3 @@
+ torch>=2.11.0
+ transformers>=5.7.0
+ safetensors>=0.7.0
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
train_config.json ADDED
@@ -0,0 +1,12 @@
+ {
+     "bayesian_train": true,
+     "loss_weights": "log",
+     "tokenized_ds_dir": "data/goemotions_v1_seq512",
+     "encoder_lr": 0.00001,
+     "head_lr": 0.0005,
+     "lr_warmup": 0.05,
+     "weight_decay": 0.01,
+     "batch_size": 32,
+     "gradient_accumulation_steps": 8,
+     "num_epochs": 10
+ }
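
With batch_size 32 and gradient_accumulation_steps 8, the effective optimizer batch is 32 * 8 = 256, and the head learns 50x faster than the pretrained encoder (5e-4 vs 1e-5). The "log" loss-weighting scheme is not defined anywhere in this upload; one plausible reading, shown as an explicitly hypothetical sketch, is log-scaled inverse-frequency class weights:

```python
import torch

def log_class_weights(class_counts: torch.Tensor) -> torch.Tensor:
    # Hypothetical: down-weight frequent classes via log-scaled inverse
    # frequency, normalized to mean 1. The actual formula behind
    # "loss_weights": "log" is not included in this commit.
    weights = torch.log1p(class_counts.sum() / class_counts)
    return weights / weights.mean()

counts = torch.tensor([4130.0, 1567.0, 264.0])  # made-up per-class counts
print(log_class_weights(counts))
```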