Transformers

Add special tokens for table tokenization

#1
by August-GKD - opened
added_tokens.json CHANGED
@@ -1,13 +1,23 @@
1
  {
 
2
  "</citation>": 128013,
3
  "</code>": 128011,
 
4
  "</judgement>": 128020,
 
 
5
  "</think>": 128009,
6
  "</tool_call>": 128005,
7
  "</tool_response>": 128007,
 
8
  "<citation>": 128012,
9
  "<code>": 128010,
 
 
10
  "<judgement>": 128019,
 
 
 
11
  "<think>": 128008,
12
  "<tool_call>": 128004,
13
  "<tool_response>": 128006,
 
1
  {
2
+ "</cell>": 128264,
3
  "</citation>": 128013,
4
  "</code>": 128011,
5
+ "</header>": 128260,
6
  "</judgement>": 128020,
7
+ "</row>": 128262,
8
+ "</table>": 128258,
9
  "</think>": 128009,
10
  "</tool_call>": 128005,
11
  "</tool_response>": 128007,
12
+ "<cell>": 128263,
13
  "<citation>": 128012,
14
  "<code>": 128010,
15
+ "<empty>": 128265,
16
+ "<header>": 128259,
17
  "<judgement>": 128019,
18
+ "<mask>": 128256,
19
+ "<row>": 128261,
20
+ "<table>": 128257,
21
  "<think>": 128008,
22
  "<tool_call>": 128004,
23
  "<tool_response>": 128006,
special_tokens_map.json CHANGED
@@ -1,4 +1,69 @@
1
  {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  "bos_token": {
3
  "content": "<|begin_of_text|>",
4
  "lstrip": false,
@@ -13,6 +78,13 @@
13
  "rstrip": false,
14
  "single_word": false
15
  },
 
 
 
 
 
 
 
16
  "pad_token": {
17
  "content": "<|end_of_text|>",
18
  "lstrip": false,
 
1
  {
2
+ "additional_special_tokens": [
3
+ {
4
+ "content": "<table>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ },
10
+ {
11
+ "content": "</table>",
12
+ "lstrip": false,
13
+ "normalized": false,
14
+ "rstrip": false,
15
+ "single_word": false
16
+ },
17
+ {
18
+ "content": "<header>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ {
25
+ "content": "</header>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ },
31
+ {
32
+ "content": "<row>",
33
+ "lstrip": false,
34
+ "normalized": false,
35
+ "rstrip": false,
36
+ "single_word": false
37
+ },
38
+ {
39
+ "content": "</row>",
40
+ "lstrip": false,
41
+ "normalized": false,
42
+ "rstrip": false,
43
+ "single_word": false
44
+ },
45
+ {
46
+ "content": "<cell>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false
51
+ },
52
+ {
53
+ "content": "</cell>",
54
+ "lstrip": false,
55
+ "normalized": false,
56
+ "rstrip": false,
57
+ "single_word": false
58
+ },
59
+ {
60
+ "content": "<empty>",
61
+ "lstrip": false,
62
+ "normalized": false,
63
+ "rstrip": false,
64
+ "single_word": false
65
+ }
66
+ ],
67
  "bos_token": {
68
  "content": "<|begin_of_text|>",
69
  "lstrip": false,
 
78
  "rstrip": false,
79
  "single_word": false
80
  },
81
+ "mask_token": {
82
+ "content": "<mask>",
83
+ "lstrip": false,
84
+ "normalized": false,
85
+ "rstrip": false,
86
+ "single_word": false
87
+ },
88
  "pad_token": {
89
  "content": "<|end_of_text|>",
90
  "lstrip": false,
tokenizer.json CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:47a2732154db3c5893e460b2ea810422106e59a0e5689d7285b118fb1eb02b04
3
- size 17209485
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:849280aad4f0bc56ac303b5415adb979de8ea15fd32c4dfd379c633ef0310f23
3
+ size 17211324
tokenizer_config.json CHANGED
@@ -2049,13 +2049,105 @@
2049
  "rstrip": false,
2050
  "single_word": false,
2051
  "special": true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2052
  }
2053
  },
 
 
 
 
 
 
 
 
 
 
 
2054
  "bos_token": "<|begin_of_text|>",
2055
  "clean_up_tokenization_spaces": true,
2056
  "eos_token": "<|im_end|>",
2057
  "errors": "replace",
2058
  "extra_special_tokens": {},
 
2059
  "model_input_names": [
2060
  "input_ids",
2061
  "attention_mask"
 
2049
  "rstrip": false,
2050
  "single_word": false,
2051
  "special": true
2052
+ },
2053
+ "128256": {
2054
+ "content": "<mask>",
2055
+ "lstrip": false,
2056
+ "normalized": false,
2057
+ "rstrip": false,
2058
+ "single_word": false,
2059
+ "special": true
2060
+ },
2061
+ "128257": {
2062
+ "content": "<table>",
2063
+ "lstrip": false,
2064
+ "normalized": false,
2065
+ "rstrip": false,
2066
+ "single_word": false,
2067
+ "special": true
2068
+ },
2069
+ "128258": {
2070
+ "content": "</table>",
2071
+ "lstrip": false,
2072
+ "normalized": false,
2073
+ "rstrip": false,
2074
+ "single_word": false,
2075
+ "special": true
2076
+ },
2077
+ "128259": {
2078
+ "content": "<header>",
2079
+ "lstrip": false,
2080
+ "normalized": false,
2081
+ "rstrip": false,
2082
+ "single_word": false,
2083
+ "special": true
2084
+ },
2085
+ "128260": {
2086
+ "content": "</header>",
2087
+ "lstrip": false,
2088
+ "normalized": false,
2089
+ "rstrip": false,
2090
+ "single_word": false,
2091
+ "special": true
2092
+ },
2093
+ "128261": {
2094
+ "content": "<row>",
2095
+ "lstrip": false,
2096
+ "normalized": false,
2097
+ "rstrip": false,
2098
+ "single_word": false,
2099
+ "special": true
2100
+ },
2101
+ "128262": {
2102
+ "content": "</row>",
2103
+ "lstrip": false,
2104
+ "normalized": false,
2105
+ "rstrip": false,
2106
+ "single_word": false,
2107
+ "special": true
2108
+ },
2109
+ "128263": {
2110
+ "content": "<cell>",
2111
+ "lstrip": false,
2112
+ "normalized": false,
2113
+ "rstrip": false,
2114
+ "single_word": false,
2115
+ "special": true
2116
+ },
2117
+ "128264": {
2118
+ "content": "</cell>",
2119
+ "lstrip": false,
2120
+ "normalized": false,
2121
+ "rstrip": false,
2122
+ "single_word": false,
2123
+ "special": true
2124
+ },
2125
+ "128265": {
2126
+ "content": "<empty>",
2127
+ "lstrip": false,
2128
+ "normalized": false,
2129
+ "rstrip": false,
2130
+ "single_word": false,
2131
+ "special": true
2132
  }
2133
  },
2134
+ "additional_special_tokens": [
2135
+ "<table>",
2136
+ "</table>",
2137
+ "<header>",
2138
+ "</header>",
2139
+ "<row>",
2140
+ "</row>",
2141
+ "<cell>",
2142
+ "</cell>",
2143
+ "<empty>"
2144
+ ],
2145
  "bos_token": "<|begin_of_text|>",
2146
  "clean_up_tokenization_spaces": true,
2147
  "eos_token": "<|im_end|>",
2148
  "errors": "replace",
2149
  "extra_special_tokens": {},
2150
+ "mask_token": "<mask>",
2151
  "model_input_names": [
2152
  "input_ids",
2153
  "attention_mask"