Commit ·
0f0fa1a
1
Parent(s): 653c0e4
update chat template
Browse files
- config.json +1 -1
- configuration.json +1 -0
- generation_utils.py +5 -1
- pytorch_model-00001-of-00003.bin +1 -1
- pytorch_model-00002-of-00003.bin +1 -1
- pytorch_model-00003-of-00003.bin +1 -1
- tokenization_orion.py +0 -14
config.json
CHANGED
|
@@ -28,4 +28,4 @@
|
|
| 28 |
"transformers_version": "4.34.0",
|
| 29 |
"use_cache": true,
|
| 30 |
"vocab_size": 84608
|
| 31 |
-
}
|
|
|
|
| 28 |
"transformers_version": "4.34.0",
|
| 29 |
"use_cache": true,
|
| 30 |
"vocab_size": 84608
|
| 31 |
+
}
|
configuration.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"framework":"Pytorch","task":"text-generation"}
|
generation_utils.py
CHANGED
|
@@ -3,6 +3,10 @@ from queue import Queue
|
|
| 3 |
|
| 4 |
# build chat input prompt
|
| 5 |
def build_chat_input(tokenizer, messages: List[dict]):
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
prompt = "<s>"
|
| 7 |
for msg in messages:
|
| 8 |
role = msg["role"]
|
|
@@ -10,7 +14,7 @@ def build_chat_input(tokenizer, messages: List[dict]):
|
|
| 10 |
if message is None :
|
| 11 |
continue
|
| 12 |
if role == "user":
|
| 13 |
-
prompt += "Human: " + message + "\nAssistant: "
|
| 14 |
if role == "assistant":
|
| 15 |
prompt += message + "</s>"
|
| 16 |
|
|
|
|
| 3 |
|
| 4 |
# build chat input prompt
|
| 5 |
def build_chat_input(tokenizer, messages: List[dict]):
|
| 6 |
+
# chat format:
|
| 7 |
+
# single-turn: <s>Human: Hello!\n\nAssistant: </s>
|
| 8 |
+
# multi-turn: <s>Human: Hello!\n\nAssistant: </s>Hi!</s>Human: How are you?\n\nAssistant: </s>I'm fine</s>
|
| 9 |
+
|
| 10 |
prompt = "<s>"
|
| 11 |
for msg in messages:
|
| 12 |
role = msg["role"]
|
|
|
|
| 14 |
if message is None :
|
| 15 |
continue
|
| 16 |
if role == "user":
|
| 17 |
+
prompt += "Human: " + message + "\n\nAssistant: </s>"
|
| 18 |
if role == "assistant":
|
| 19 |
prompt += message + "</s>"
|
| 20 |
|
pytorch_model-00001-of-00003.bin
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 9937152090
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:50ad84420f47d71980877bb76d3320bd1346374370c79a04ed634f893fc8c333
|
| 3 |
size 9937152090
|
pytorch_model-00002-of-00003.bin
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 9857241994
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f11df7ddc630b02893f71e9a2cfdb4035cd3ac884cec74dbc38a19f592b862e0
|
| 3 |
size 9857241994
|
pytorch_model-00003-of-00003.bin
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 9203166530
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:074a2e42d9ab0024293c7bb4d11c8ebdc689b404f3dc42b2c45f58ebf5f15e76
|
| 3 |
size 9203166530
|
tokenization_orion.py
CHANGED
|
@@ -3,7 +3,6 @@
|
|
| 3 |
import os
|
| 4 |
from shutil import copyfile
|
| 5 |
from typing import Any, Dict, List, Optional, Tuple
|
| 6 |
-
import re
|
| 7 |
|
| 8 |
import sentencepiece as spm
|
| 9 |
from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
|
|
@@ -71,7 +70,6 @@ class OrionTokenizer(PreTrainedTokenizer):
|
|
| 71 |
self.add_eos_token = add_eos_token
|
| 72 |
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
|
| 73 |
self.sp_model.Load(vocab_file)
|
| 74 |
-
|
| 75 |
super().__init__(
|
| 76 |
bos_token=bos_token,
|
| 77 |
eos_token=eos_token,
|
|
@@ -120,8 +118,6 @@ class OrionTokenizer(PreTrainedTokenizer):
|
|
| 120 |
|
| 121 |
def convert_tokens_to_string(self, tokens):
|
| 122 |
"""Converts a sequence of tokens (string) in a single string."""
|
| 123 |
-
zhPattern = re.compile(u'[\u4e00-\u9fa5]+')
|
| 124 |
-
need_convert_punctuation=(",",";","!","?",":","(",")")
|
| 125 |
current_sub_tokens = []
|
| 126 |
out_string = ""
|
| 127 |
prev_is_special = False
|
|
@@ -133,22 +129,12 @@ class OrionTokenizer(PreTrainedTokenizer):
|
|
| 133 |
out_string += self.sp_model.decode(current_sub_tokens) + token
|
| 134 |
prev_is_special = True
|
| 135 |
current_sub_tokens = []
|
| 136 |
-
if any([True if punctuation in token else False for punctuation in need_convert_punctuation]):
|
| 137 |
-
out_string += self.sp_model.decode(current_sub_tokens)
|
| 138 |
-
token=self.sp_model.decode(token)
|
| 139 |
-
if zhPattern.search(out_string[-20:]):
|
| 140 |
-
token = self.to_zh_punctuation(token)
|
| 141 |
-
out_string += token
|
| 142 |
-
current_sub_tokens = []
|
| 143 |
else:
|
| 144 |
current_sub_tokens.append(token)
|
| 145 |
prev_is_special = False
|
| 146 |
out_string += self.sp_model.decode(current_sub_tokens)
|
| 147 |
return out_string
|
| 148 |
|
| 149 |
-
def to_zh_punctuation(self, token):
|
| 150 |
-
return token.replace(",",",").replace(";",";").replace("!","!").replace("?","?").replace(":",":").replace("(","(").replace(")",")")
|
| 151 |
-
|
| 152 |
def save_vocabulary(
|
| 153 |
self, save_directory, filename_prefix: Optional[str] = None
|
| 154 |
) -> Tuple[str]:
|
|
|
|
| 3 |
import os
|
| 4 |
from shutil import copyfile
|
| 5 |
from typing import Any, Dict, List, Optional, Tuple
|
|
|
|
| 6 |
|
| 7 |
import sentencepiece as spm
|
| 8 |
from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
|
|
|
|
| 70 |
self.add_eos_token = add_eos_token
|
| 71 |
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
|
| 72 |
self.sp_model.Load(vocab_file)
|
|
|
|
| 73 |
super().__init__(
|
| 74 |
bos_token=bos_token,
|
| 75 |
eos_token=eos_token,
|
|
|
|
| 118 |
|
| 119 |
def convert_tokens_to_string(self, tokens):
|
| 120 |
"""Converts a sequence of tokens (string) in a single string."""
|
|
|
|
|
|
|
| 121 |
current_sub_tokens = []
|
| 122 |
out_string = ""
|
| 123 |
prev_is_special = False
|
|
|
|
| 129 |
out_string += self.sp_model.decode(current_sub_tokens) + token
|
| 130 |
prev_is_special = True
|
| 131 |
current_sub_tokens = []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 132 |
else:
|
| 133 |
current_sub_tokens.append(token)
|
| 134 |
prev_is_special = False
|
| 135 |
out_string += self.sp_model.decode(current_sub_tokens)
|
| 136 |
return out_string
|
| 137 |
|
|
|
|
|
|
|
|
|
|
| 138 |
def save_vocabulary(
|
| 139 |
self, save_directory, filename_prefix: Optional[str] = None
|
| 140 |
) -> Tuple[str]:
|