AhmedGaver committed on
Commit
6d46009
·
verified ·
1 Parent(s): f3d444f

Upload v2 of URL classifier model (hybrid BERT + tabular)

Browse files

Training Metrics:
- Eval Loss: 0.07472482323646545
- Eval F1 Macro: 0.9319480242737234
- Eval Accuracy: 0.9817232375979112

config.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "architectures": [
3
- "BertForSequenceClassification"
4
  ],
5
  "attention_probs_dropout_prob": 0.1,
6
  "classifier_dropout": null,
@@ -16,7 +16,7 @@
16
  "model_type": "bert",
17
  "num_attention_heads": 12,
18
  "num_hidden_layers": 12,
19
- "pad_token_id": 0,
20
  "position_embedding_type": "absolute",
21
  "transformers_version": "4.57.0",
22
  "type_vocab_size": 2,
 
1
  {
2
  "architectures": [
3
+ "BertModel"
4
  ],
5
  "attention_probs_dropout_prob": 0.1,
6
  "classifier_dropout": null,
 
16
  "model_type": "bert",
17
  "num_attention_heads": 12,
18
  "num_hidden_layers": 12,
19
+ "pad_token_id": 1,
20
  "position_embedding_type": "absolute",
21
  "transformers_version": "4.57.0",
22
  "type_vocab_size": 2,
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5d9a3e637d2b82a295a3ea5555e3a0040b5645d59093f47df6bdeac1f6486c68
3
- size 442499064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:91f225c4b06118f3c2848477bd64c4b76a2928ce8e330b42ba114faa706dcc0e
3
+ size 442503856
special_tokens_map.json CHANGED
@@ -1,4 +1,10 @@
1
  {
 
 
 
 
 
 
2
  "cls_token": {
3
  "content": "[CLS]",
4
  "lstrip": false,
 
1
  {
2
+ "additional_special_tokens": [
3
+ "[DOMAIN]",
4
+ "[PATH]",
5
+ "[IP]",
6
+ "[IPv6]"
7
+ ],
8
  "cls_token": {
9
  "content": "[CLS]",
10
  "lstrip": false,
tabular_config.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "num_labels": 2,
3
+ "num_tabular_features": 6
4
+ }
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "added_tokens_decoder": {
3
  "0": {
4
- "content": "[PAD]",
5
  "lstrip": false,
6
  "normalized": false,
7
  "rstrip": false,
@@ -9,7 +9,7 @@
9
  "special": true
10
  },
11
  "1": {
12
- "content": "[UNK]",
13
  "lstrip": false,
14
  "normalized": false,
15
  "rstrip": false,
@@ -39,28 +39,53 @@
39
  "rstrip": false,
40
  "single_word": false,
41
  "special": true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
  }
43
  },
 
 
 
 
 
 
44
  "clean_up_tokenization_spaces": true,
45
  "cls_token": "[CLS]",
46
- "do_basic_tokenize": true,
47
- "do_lower_case": true,
48
  "extra_special_tokens": {},
49
  "mask_token": "[MASK]",
50
- "max_len": 64,
51
- "max_length": 64,
52
- "model_max_length": 64,
53
- "never_split": null,
54
- "pad_to_multiple_of": null,
55
  "pad_token": "[PAD]",
56
- "pad_token_type_id": 0,
57
- "padding_side": "right",
58
  "sep_token": "[SEP]",
59
- "stride": 0,
60
- "strip_accents": null,
61
- "tokenize_chinese_chars": true,
62
- "tokenizer_class": "BertTokenizer",
63
- "truncation_side": "right",
64
- "truncation_strategy": "longest_first",
65
  "unk_token": "[UNK]"
66
  }
 
1
  {
2
  "added_tokens_decoder": {
3
  "0": {
4
+ "content": "[UNK]",
5
  "lstrip": false,
6
  "normalized": false,
7
  "rstrip": false,
 
9
  "special": true
10
  },
11
  "1": {
12
+ "content": "[PAD]",
13
  "lstrip": false,
14
  "normalized": false,
15
  "rstrip": false,
 
39
  "rstrip": false,
40
  "single_word": false,
41
  "special": true
42
+ },
43
+ "5": {
44
+ "content": "[DOMAIN]",
45
+ "lstrip": false,
46
+ "normalized": false,
47
+ "rstrip": false,
48
+ "single_word": false,
49
+ "special": true
50
+ },
51
+ "6": {
52
+ "content": "[PATH]",
53
+ "lstrip": false,
54
+ "normalized": false,
55
+ "rstrip": false,
56
+ "single_word": false,
57
+ "special": true
58
+ },
59
+ "7": {
60
+ "content": "[IP]",
61
+ "lstrip": false,
62
+ "normalized": false,
63
+ "rstrip": false,
64
+ "single_word": false,
65
+ "special": true
66
+ },
67
+ "8": {
68
+ "content": "[IPv6]",
69
+ "lstrip": false,
70
+ "normalized": false,
71
+ "rstrip": false,
72
+ "single_word": false,
73
+ "special": true
74
  }
75
  },
76
+ "additional_special_tokens": [
77
+ "[DOMAIN]",
78
+ "[PATH]",
79
+ "[IP]",
80
+ "[IPv6]"
81
+ ],
82
  "clean_up_tokenization_spaces": true,
83
  "cls_token": "[CLS]",
 
 
84
  "extra_special_tokens": {},
85
  "mask_token": "[MASK]",
86
+ "model_max_length": 1024,
 
 
 
 
87
  "pad_token": "[PAD]",
 
 
88
  "sep_token": "[SEP]",
89
+ "tokenizer_class": "PreTrainedTokenizerFast",
 
 
 
 
 
90
  "unk_token": "[UNK]"
91
  }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4bf205e4a14c0b396ee497711eeaf558cb79f3225536ee3e5fea5d95bccf5321
3
+ size 5777