phngahn commited on
Commit
e127b3e
·
verified ·
1 Parent(s): 49f06ce

Upload folder using huggingface_hub

Browse files
added_tokens.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "<mask>": 64000
3
+ }
config.json CHANGED
@@ -1,23 +1,13 @@
1
- {
2
- "architectures": [
3
- "RobertaForMaskedLM"
4
- ],
5
- "attention_probs_dropout_prob": 0.1,
6
- "bos_token_id": 0,
7
- "eos_token_id": 2,
8
- "gradient_checkpointing": false,
9
- "hidden_act": "gelu",
10
- "hidden_dropout_prob": 0.1,
11
- "hidden_size": 768,
12
- "initializer_range": 0.02,
13
- "intermediate_size": 3072,
14
- "layer_norm_eps": 1e-05,
15
- "max_position_embeddings": 258,
16
- "model_type": "roberta",
17
- "num_attention_heads": 12,
18
- "num_hidden_layers": 12,
19
- "pad_token_id": 1,
20
- "tokenizer_class": "PhobertTokenizer",
21
- "type_vocab_size": 1,
22
- "vocab_size": 64001
23
- }
 
1
+ {
2
+ "architectures": ["PhoBERTClassifier"],
3
+ "model_type": "phobert_aspect",
4
+ "base_model_name": "vinai/phobert-base",
5
+ "num_labels": 3,
6
+ "num_aspects": 4,
7
+ "dropout": 0.3,
8
+ "dtype": "float32",
9
+ "auto_map": {
10
+ "AutoConfig": "modeling_phobert_aspect.PhoBERTAspectConfig",
11
+ "AutoModel": "modeling_phobert_aspect.PhoBERTClassifier"
12
+ }
13
+ }
 
 
 
 
 
 
 
 
 
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eb579e74bb0081c48014e687c582a4e542154b4042fe56775e3267eb681e0f69
3
+ size 540054608
modeling_phobert_aspect.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import PretrainedConfig, PreTrainedModel, AutoModel
2
+ import torch
3
+ import torch.nn as nn
4
+
5
class PhoBERTAspectConfig(PretrainedConfig):
    """Configuration for the multi-aspect PhoBERT classifier.

    Holds the number of sentiment labels per aspect, the number of
    aspects (one linear head each), the dropout probability applied to
    the pooled representation, and the Hub id of the PhoBERT backbone.
    """

    model_type = "phobert_aspect"

    def __init__(self, num_labels=3, num_aspects=4, dropout=0.3,
                 base_model_name="vinai/phobert-base", **kwargs):
        # Hand any remaining standard HF config options to the base class.
        super().__init__(**kwargs)
        self.base_model_name = base_model_name
        self.dropout = dropout
        self.num_aspects = num_aspects
        self.num_labels = num_labels
21
+
22
+
23
class PhoBERTClassifier(PreTrainedModel):
    """Multi-aspect sentiment classifier on a pretrained PhoBERT encoder.

    One independent linear head per aspect is applied to the first-token
    (<s>/CLS) representation, producing logits stacked along a new aspect
    dimension: shape (batch, num_aspects, num_labels).
    """

    config_class = PhoBERTAspectConfig

    def __init__(self, config):
        super().__init__(config)

        # NOTE(review): calling from_pretrained here means the backbone's
        # base weights are fetched every time this wrapper is constructed;
        # when the full checkpoint is loaded via from_pretrained on this
        # class, those weights are then overwritten. Works, but wasteful —
        # consider AutoModel.from_config for a later revision.
        self.phobert = AutoModel.from_pretrained(config.base_model_name)

        hidden_size = self.phobert.config.hidden_size
        self.dropout = nn.Dropout(config.dropout)

        # One classification head per aspect, all sharing the encoder.
        self.classifiers = nn.ModuleList(
            nn.Linear(hidden_size, config.num_labels)
            for _ in range(config.num_aspects)
        )

        self.post_init()

    def forward(self, input_ids, attention_mask=None):
        """Compute per-aspect classification logits.

        Args:
            input_ids: token-id tensor of shape (batch, seq_len).
            attention_mask: optional padding mask with the same shape as
                ``input_ids``. Generalized to default ``None`` (attend to
                every position), matching the transformers convention;
                existing positional callers are unaffected.

        Returns:
            Tensor of shape (batch, num_aspects, num_labels).
        """
        outputs = self.phobert(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )

        # First-token embedding serves as the sequence summary.
        pooled = self.dropout(outputs.last_hidden_state[:, 0, :])

        logits = [head(pooled) for head in self.classifiers]
        return torch.stack(logits, dim=1)
special_tokens_map.json CHANGED
@@ -1,6 +1,9 @@
1
  {
2
- "unk_token": "<unk>",
3
- "sep_token": "</s>",
4
- "pad_token": "<pad>",
5
- "cls_token": "<s>"
6
- }
 
 
 
 
1
  {
2
+ "bos_token": "<s>",
3
+ "cls_token": "<s>",
4
+ "eos_token": "</s>",
5
+ "mask_token": "<mask>",
6
+ "pad_token": "<pad>",
7
+ "sep_token": "</s>",
8
+ "unk_token": "<unk>"
9
+ }
tokenizer_config.json CHANGED
@@ -1,11 +1,55 @@
1
  {
2
- "do_lower_case": false,
3
- "bos_token": "<s>",
4
- "eos_token": "</s>",
5
- "unk_token": "<unk>",
6
- "pad_token": "<pad>",
7
- "sep_token": "</s>",
8
- "cls_token": "<s>",
9
- "model_max_length": 120,
10
- "tokenizer_class": "RobertaTokenizer"
11
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "<s>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "<pad>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "</s>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "<unk>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "64000": {
36
+ "content": "<mask>",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "bos_token": "<s>",
45
+ "clean_up_tokenization_spaces": false,
46
+ "cls_token": "<s>",
47
+ "eos_token": "</s>",
48
+ "extra_special_tokens": {},
49
+ "mask_token": "<mask>",
50
+ "model_max_length": 1000000000000000019884624838656,
51
+ "pad_token": "<pad>",
52
+ "sep_token": "</s>",
53
+ "tokenizer_class": "PhobertTokenizer",
54
+ "unk_token": "<unk>"
55
+ }