Merge branch 'main' of https://huggingface.co/internlm/internlm2-chat-1_8b-sft into main
Browse files
- special_tokens_map.json +1 -1
- tokenization_internlm2_fast.py +8 -8
- tokenizer_config.json +1 -1
special_tokens_map.json
CHANGED
|
@@ -35,4 +35,4 @@
|
|
| 35 |
"rstrip": false,
|
| 36 |
"single_word": false
|
| 37 |
}
|
| 38 |
-
}
|
|
|
|
| 35 |
"rstrip": false,
|
| 36 |
"single_word": false
|
| 37 |
}
|
| 38 |
+
}
|
tokenization_internlm2_fast.py
CHANGED
|
@@ -56,14 +56,14 @@ class InternLM2Converter(SpmConverter):
|
|
| 56 |
return unk_id
|
| 57 |
|
| 58 |
def decoder(self, replacement, add_prefix_space):
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
)
|
| 67 |
|
| 68 |
def tokenizer(self, proto):
|
| 69 |
model_type = proto.trainer_spec.model_type
|
|
|
|
| 56 |
return unk_id
|
| 57 |
|
| 58 |
def decoder(self, replacement, add_prefix_space):
|
| 59 |
+
decoders_sequence = [
|
| 60 |
+
decoders.Replace("▁", " "),
|
| 61 |
+
decoders.ByteFallback(),
|
| 62 |
+
decoders.Fuse(),
|
| 63 |
+
]
|
| 64 |
+
if self.proto.normalizer_spec.add_dummy_prefix:
|
| 65 |
+
decoders_sequence.append(decoders.Strip(content=" ", left=1))
|
| 66 |
+
return decoders.Sequence(decoders_sequence)
|
| 67 |
|
| 68 |
def tokenizer(self, proto):
|
| 69 |
model_type = proto.trainer_spec.model_type
|
tokenizer_config.json
CHANGED
|
@@ -99,4 +99,4 @@
|
|
| 99 |
"sp_model_kwargs": null,
|
| 100 |
"tokenizer_class": "InternLM2Tokenizer",
|
| 101 |
"unk_token": "<unk>"
|
| 102 |
-
}
|
|
|
|
| 99 |
"sp_model_kwargs": null,
|
| 100 |
"tokenizer_class": "InternLM2Tokenizer",
|
| 101 |
"unk_token": "<unk>"
|
| 102 |
+
}
|