from transformers import LlamaTokenizerFast, PreTrainedTokenizer
from http.server import HTTPServer, BaseHTTPRequestHandler
import json
import argparse
from typing import List


def mask_multichar_chinese_tokens(tokenizer: PreTrainedTokenizer):
    """Create a tokenizer wrapper that converts multi-character Chinese tokens to single characters.

    This function creates a wrapper around the provided tokenizer that
    automatically splits multi-character Chinese tokens into individual
    characters. This is useful for ensuring consistent, character-level
    tokenization of Chinese text.

    Args:
        tokenizer: The base tokenizer to wrap

    Returns:
        A CharTokenizerWrapper instance that handles multi-character Chinese tokens

    Example:
        >>> from transformers import LlamaTokenizerFast
        >>> tokenizer = LlamaTokenizerFast.from_pretrained("path/to/tokenizer")
        >>> wrapped_tokenizer = mask_multichar_chinese_tokens(tokenizer)
        >>> token_ids = wrapped_tokenizer("你好世界")
    """
    # Collect every vocabulary entry made of two or more CJK ideographs
    # (U+4E00..U+9FFF); these are the tokens that will be split per character.
    multichar_tokens = {
        token for token in tokenizer.vocab.keys()
        if len(token) >= 2 and all("\u4e00" <= c <= "\u9fff" for c in token)
    }

    class CharTokenizerWrapper:
        """Wrapper class for tokenizers that handles multi-character Chinese tokens.

        This wrapper automatically splits multi-character Chinese tokens into
        individual characters while preserving the original tokenizer's interface.
        """

        def __init__(self, base_tokenizer: PreTrainedTokenizer) -> None:
            """Initialize the wrapper with a base tokenizer.

            Args:
                base_tokenizer: The tokenizer to wrap
            """
            self.tokenizer = base_tokenizer
            # Closed over from the enclosing mask_multichar_chinese_tokens call.
            self.multichar_tokens = multichar_tokens

        def tokenize(self, text: str, **kwargs) -> List[str]:
            """Tokenize text and split multi-character Chinese tokens into single characters.

            Args:
                text: Input text to tokenize
                **kwargs: Additional arguments passed to the base tokenizer

            Returns:
                List of processed tokens with multi-character Chinese tokens split

            Example:
                >>> wrapper = CharTokenizerWrapper(tokenizer)
                >>> tokens = wrapper.tokenize("你好世界")
                >>> # Returns ["你", "好", "世", "界"] instead of ["你好", "世界"]
            """
            if not isinstance(text, str):
                raise TypeError(f"Expected string input, got {type(text)}")

            tokens = self.tokenizer.tokenize(text, **kwargs)
            processed = []

            for token in tokens:
                # Strip the SentencePiece word-boundary marker before the
                # vocabulary lookup.
                clean_token = token.replace("▁", "")

                if clean_token in self.multichar_tokens:
                    # Multi-character Chinese token: emit one token per character.
                    processed.extend(list(clean_token))
                else:
                    processed.append(token)

            return processed

        def __call__(self, text: str, **kwargs) -> List[int]:
            """Call the tokenizer and return token IDs.

            This method provides the same interface as the original tokenizer
            but with multi-character Chinese token handling.

            Args:
                text: Input text to tokenize
                **kwargs: Additional arguments passed to the base tokenizer

            Returns:
                List of token IDs

            Raises:
                TypeError: If input is not a string
                ValueError: If tokenization fails
            """
            try:
                tokens = self.tokenize(text, **kwargs)
                return self.tokenizer.convert_tokens_to_ids(tokens)
            except TypeError:
                # Re-raise the documented non-string-input error unchanged
                # instead of masking it as a ValueError.
                raise
            except Exception as e:
                raise ValueError(f"Tokenization failed: {str(e)}") from e

    return CharTokenizerWrapper(tokenizer)
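
# A quick behavioral check (a sketch, not executed here; assumes "你好" exists
# in the vocabulary as a single multi-character token while "你" and "好" also
# exist as single-character entries, otherwise convert_tokens_to_ids falls
# back to the unknown token):
#
#   base = LlamaTokenizerFast.from_pretrained("./VoxCPM-0.5B")
#   wrapped = mask_multichar_chinese_tokens(base)
#   assert wrapped.tokenize("你好") == ["你", "好"]
#   print(wrapped("你好"))  # ids of "你" and "好", not the id of "你好"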


class Tokenizer_Http:

    def __init__(self):
        tokenizer = LlamaTokenizerFast.from_pretrained("./VoxCPM-0.5B")
        self.tokenizer = mask_multichar_chinese_tokens(tokenizer)

    def encode(self, prompt):
        token_ids = self.tokenizer(prompt)
        return token_ids

    @property
    def bos_id(self):
        # do_GET reads tokenizer.bos_id below; delegate to the wrapped
        # tokenizer's bos_token_id and let a missing value surface as None
        # (served as -1 by the handler).
        return getattr(self.tokenizer.tokenizer, "bos_token_id", None)

    @property
    def eos_id(self):
        return 1773

    @property
    def eos_token(self):
        return "<|eot_id|>"
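
# Sanity check for the hard-coded ids above (a sketch, not executed here;
# assumes the VoxCPM-0.5B vocabulary actually maps "<|eot_id|>" to 1773):
#
#   base = LlamaTokenizerFast.from_pretrained("./VoxCPM-0.5B")
#   assert base.convert_tokens_to_ids("<|eot_id|>") == 1773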


tokenizer = Tokenizer_Http()

print(tokenizer.encode("hello world"))


class Request(BaseHTTPRequestHandler):

    timeout = 5
    server_version = 'Apache'

    def do_GET(self):
        print(self.path)

        self.send_response(200)
        self.send_header("type", "get")
        self.end_headers()

        if self.path == '/bos_id':
            bos_id = tokenizer.bos_id
            if bos_id is None:
                msg = json.dumps({'bos_id': -1})
            else:
                msg = json.dumps({'bos_id': bos_id})
        elif self.path == '/eos_id':
            eos_id = tokenizer.eos_id
            if eos_id is None:
                msg = json.dumps({'eos_id': -1})
            else:
                msg = json.dumps({'eos_id': eos_id})
        else:
            msg = 'error'

        print(msg)
        msg = str(msg).encode()
        self.wfile.write(msg)

    def do_POST(self):
        data = self.rfile.read(int(self.headers['content-length']))
        data = data.decode()

        self.send_response(200)
        self.send_header("type", "post")
        self.end_headers()

        if self.path == '/encode':
            req = json.loads(data)
            prompt = req['text']

            token_ids = tokenizer.encode(prompt)
            if token_ids is None:
                msg = json.dumps({'token_ids': -1})
            else:
                msg = json.dumps({'token_ids': token_ids})
        else:
            msg = 'error'

        print(msg)
        msg = str(msg).encode()
        self.wfile.write(msg)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--host', type=str, default='localhost')
    parser.add_argument('--port', type=int, default=9999)
    args = parser.parse_args()

    host = (args.host, args.port)
    print('http://%s:%s' % host)
    server = HTTPServer(host, Request)
    server.serve_forever()
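
# Example client for the endpoints above (a minimal sketch; assumes the server
# is running with the default host/port; standard library only):
#
#   import json
#   from urllib.request import Request as HttpRequest, urlopen
#
#   # GET /eos_id -> {"eos_id": 1773}
#   print(urlopen("http://localhost:9999/eos_id").read().decode())
#
#   # POST /encode with {"text": ...} -> {"token_ids": [...]}
#   req = HttpRequest("http://localhost:9999/encode",
#                     data=json.dumps({"text": "你好世界"}).encode())
#   print(urlopen(req).read().decode())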