xuqinyang committed
Commit 27a2322 · 1 Parent(s): 89859ea

Upload 5 files
Files changed (5)
  1. __init__.py +1 -0
  2. hf_model.py +102 -0
  3. libfastllm_tools.so +0 -0
  4. llm.py +166 -0
  5. torch2flm.py +89 -0
__init__.py ADDED
@@ -0,0 +1 @@
+ __all__ = ["llm"]
hf_model.py ADDED
@@ -0,0 +1,102 @@
+ import ctypes
+ 
+ import numpy as np
+ import torch
+ 
+ from fastllm_pytools import llm
+ 
+ # fastllm type codes for the supported target dtypes
+ fastllm_data_type_dict = {
+     "int4": 8,
+     "int8": 3,
+     "float16": 7
+ }
+ fastllm_weight_type_dict = {
+     "linear": 1,
+     "embedding": 2
+ }
+ 
+ def create(model,
+            tokenizer = None,
+            pre_prompt = None,
+            user_role = None,
+            bot_role = None,
+            history_sep = None,
+            dtype = "float16"):
+     if dtype not in fastllm_data_type_dict:
+         print("dtype should be one of", list(fastllm_data_type_dict.keys()))
+         exit(1)
+ 
+     # 0.1 model info
+     modelInfo = model.config.__dict__
+     if pre_prompt:
+         modelInfo["pre_prompt"] = pre_prompt
+     if user_role:
+         modelInfo["user_role"] = user_role
+     if bot_role:
+         modelInfo["bot_role"] = bot_role
+     if history_sep:
+         modelInfo["history_sep"] = history_sep
+     if modelInfo["model_type"] == "baichuan" and hasattr(model, "model") and hasattr(model.model, "get_alibi_mask"):
+         # Baichuan 2: uses ALiBi, and chat roles are encoded as special tokens
+         modelInfo["use_alibi"] = "1"
+         modelInfo["pre_prompt"] = ""
+         modelInfo["user_role"] = tokenizer.decode([model.generation_config.user_token_id])
+         modelInfo["bot_role"] = tokenizer.decode([model.generation_config.assistant_token_id])
+         modelInfo["history_sep"] = ""
+ 
+     # map state-dict keys to fastllm weight categories
+     weight_type_dict = {}
+     module_dict = {}
+     for key, m in model.named_modules():
+         if isinstance(m, torch.nn.Linear):
+             weight_type_dict[key + ".weight"] = "linear"
+             module_dict[key + ".weight"] = m
+         if isinstance(m, torch.nn.Embedding):
+             weight_type_dict[key] = "embedding"
+ 
+     model = model.cpu()
+     state_dict = model.state_dict()
+     model_type = model.config.__dict__["model_type"]
+     handle = llm.fastllm_lib.create_empty_llm_model(model_type.encode())
+     for it in modelInfo.keys():
+         llm.fastllm_lib.add_dict_llm_model(handle, str(it).encode(), str(modelInfo[it]).encode())
+ 
+     # 1. vocab
+     if tokenizer:
+         if hasattr(tokenizer, "sp_model"):
+             piece_size = tokenizer.sp_model.piece_size()
+             for i in range(piece_size):
+                 llm.fastllm_lib.add_tokenizer_word_llm_model(handle, tokenizer.sp_model.id_to_piece(i).encode(), i)
+         else:
+             vocab = tokenizer.get_vocab()
+             for v in vocab.keys():
+                 llm.fastllm_lib.add_tokenizer_word_llm_model(handle, v.encode(), vocab[v])
+ 
+     # 2. weights: linear layers are converted to the requested dtype,
+     # everything else stays float32 for now
+     tot = 0
+     for key in state_dict:
+         ori_data_type = 0
+         ori_np_data_type = np.float32
+         cur_weight_type = 0
+         if key in weight_type_dict and weight_type_dict[key] in fastllm_weight_type_dict:
+             cur_weight_type = fastllm_weight_type_dict[weight_type_dict[key]]
+         to_data_type = 0
+ 
+         if cur_weight_type == 1:
+             to_data_type = fastllm_data_type_dict[dtype]
+             if to_data_type == 7:
+                 ori_data_type = 7
+                 ori_np_data_type = np.float16
+         elif cur_weight_type == 2:
+             # TODO bfloat
+             to_data_type = 0
+ 
+         llm.fastllm_lib.add_weight_llm_model(handle, key.encode(),
+                                              len(state_dict[key].shape),
+                                              (ctypes.c_int * len(state_dict[key].shape))(*list(state_dict[key].shape)),
+                                              to_data_type, cur_weight_type, ori_data_type,
+                                              state_dict[key].numpy().astype(ori_np_data_type).ctypes.data_as(ctypes.c_void_p))
+         tot += 1
+         print("convert (", tot, "/", len(state_dict), end = " )\r")
+ 
+     print("")
+     llm.fastllm_lib.init_params_llm_model(handle)
+     llm.fastllm_lib.warmup_llm_model(handle)
+     return llm.model("", id = handle)
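
For reference, hf_model.create is normally reached through llm.from_hf (defined in llm.py below). A minimal usage sketch — the checkpoint name is illustrative, and trust_remote_code is only needed for models that ship custom code:

    from transformers import AutoModelForCausalLM, AutoTokenizer
    from fastllm_pytools import llm

    path = "baichuan-inc/Baichuan2-7B-Chat"  # illustrative checkpoint
    tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code = True)
    hf = AutoModelForCausalLM.from_pretrained(path, trust_remote_code = True).float()

    # convert in memory; linear weights are quantized to int8, the rest stays float32
    flm = llm.from_hf(hf, tokenizer, dtype = "int8")
    print(flm.response("Hello"))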
libfastllm_tools.so ADDED
Binary file (746 kB)
 
llm.py ADDED
@@ -0,0 +1,166 @@
+ import ctypes
+ import os
+ import platform
+ from typing import List, Tuple
+ 
+ # load the native fastllm tools library that sits next to this file
+ if platform.system() == 'Windows':
+     fastllm_lib = ctypes.cdll.LoadLibrary(os.path.join(os.path.split(os.path.realpath(__file__))[0], "fastllm_tools.dll"))
+ else:
+     fastllm_lib = ctypes.cdll.LoadLibrary(os.path.join(os.path.split(os.path.realpath(__file__))[0], "libfastllm_tools.so"))
+ 
+ fastllm_lib.create_llm_model.argtypes = [ctypes.c_char_p]
+ fastllm_lib.create_llm_model.restype = ctypes.c_int
+ 
+ fastllm_lib.launch_response_llm_model.argtypes = [ctypes.c_int, ctypes.c_int, ctypes.c_void_p,
+                                                   ctypes.c_int, ctypes.c_bool, ctypes.c_float, ctypes.c_int,
+                                                   ctypes.c_float, ctypes.c_float]
+ fastllm_lib.launch_response_llm_model.restype = ctypes.c_int
+ 
+ fastllm_lib.fetch_response_llm_model.argtypes = [ctypes.c_int, ctypes.c_int]
+ fastllm_lib.fetch_response_llm_model.restype = ctypes.c_int
+ 
+ fastllm_lib.response_str_llm_model.argtypes = [ctypes.c_int, ctypes.c_char_p,
+                                                ctypes.c_int, ctypes.c_bool, ctypes.c_float, ctypes.c_int,
+                                                ctypes.c_float, ctypes.c_float]
+ fastllm_lib.response_str_llm_model.restype = ctypes.c_char_p
+ 
+ fastllm_lib.launch_response_str_llm_model.argtypes = [ctypes.c_int, ctypes.c_char_p,
+                                                       ctypes.c_int, ctypes.c_bool, ctypes.c_float, ctypes.c_int,
+                                                       ctypes.c_float, ctypes.c_float]
+ fastllm_lib.launch_response_str_llm_model.restype = ctypes.c_int
+ 
+ fastllm_lib.fetch_response_str_llm_model.argtypes = [ctypes.c_int, ctypes.c_int]
+ fastllm_lib.fetch_response_str_llm_model.restype = ctypes.c_char_p
+ 
+ fastllm_lib.make_history_llm_model.argtypes = [ctypes.c_int, ctypes.c_char_p, ctypes.c_int, ctypes.c_char_p, ctypes.c_char_p]
+ fastllm_lib.make_history_llm_model.restype = ctypes.c_char_p
+ 
+ fastllm_lib.make_input_llm_model.argtypes = [ctypes.c_int, ctypes.c_char_p, ctypes.c_int, ctypes.c_char_p]
+ fastllm_lib.make_input_llm_model.restype = ctypes.c_char_p
+ 
+ def set_cpu_threads(threads: int):
+     fastllm_lib.set_cpu_threads(threads)
+ 
+ def get_cpu_threads() -> int:
+     return fastllm_lib.get_cpu_threads()
+ 
+ def print_ins_info():
+     fastllm_lib.print_cpu_ins()
+ 
+ def set_cpu_kvcache(cpu_kvcache):
+     fastllm_lib.set_kvcache_in_cpu(ctypes.c_bool(cpu_kvcache))
+ 
+ def get_cpu_kvcache():
+     return fastllm_lib.get_kvcache_in_cpu()
+ 
+ def set_cpu_low_mem(low_mem):
+     fastllm_lib.set_cpu_low_mem(ctypes.c_bool(low_mem))
+ 
+ def get_cpu_low_mem():
+     return fastllm_lib.get_cpu_low_mem()
+ 
+ def from_hf(model,
+             tokenizer = None,
+             dtype = "float16"):
+     from fastllm_pytools import hf_model
+     return hf_model.create(model, tokenizer, dtype = dtype)
+ 
+ class model:
+     def __init__(self, path: str,
+                  id: int = -99999):
+         if id != -99999:
+             # wrap an already-created native model handle
+             self.model = id
+         else:
+             self.model = fastllm_lib.create_llm_model(path.encode())
+         self.direct_query = False
+ 
+     def get_prompt(self,
+                    query: str,
+                    history: List[Tuple[str, str]] = None) -> str:
+         if not history:
+             history = []
+         prompt = ""
+         for i, (old_query, response) in enumerate(history):
+             prompt = fastllm_lib.make_history_llm_model(self.model, prompt.encode(), i, old_query.encode(), response.encode()).decode()
+         prompt = fastllm_lib.make_input_llm_model(self.model, prompt.encode(), len(history), query.encode()).decode()
+         return prompt
+ 
+     def save(self, path: str):
+         fastllm_lib.save_llm_model(self.model, path.encode())
+ 
+     def response(self,
+                  query: str,
+                  history: List[Tuple[str, str]] = None,
+                  max_length: int = 8192, do_sample = True, top_p = 0.8, top_k = 1, temperature = 1.0, repeat_penalty = 1.0) -> str:
+         prompt = query if self.direct_query else self.get_prompt(query, history)
+         ret = fastllm_lib.response_str_llm_model(self.model, prompt.encode(),
+                                                  max_length, do_sample, top_p, top_k, temperature, repeat_penalty).decode()
+         return ret
+ 
+     def stream_response(self,
+                         query: str,
+                         history: List[Tuple[str, str]] = None,
+                         max_length: int = 8192, do_sample = True, top_p = 0.8, top_k = 1, temperature = 1.0, repeat_penalty = 1.0,
+                         one_by_one = True):
+         prompt = query if self.direct_query else self.get_prompt(query, history)
+         handle = fastllm_lib.launch_response_str_llm_model(self.model, prompt.encode(),
+                                                            ctypes.c_int(max_length), ctypes.c_bool(do_sample), ctypes.c_float(top_p), ctypes.c_int(top_k),
+                                                            ctypes.c_float(temperature), ctypes.c_float(repeat_penalty))
+         res = ""
+         ret = b''
+         while True:
+             ret += fastllm_lib.fetch_response_str_llm_model(self.model, handle)
+             cur = ""
+             try:
+                 # the bytes may end mid-way through a multi-byte UTF-8 character;
+                 # keep buffering until they decode cleanly
+                 cur = ret.decode()
+                 ret = b''
+             except UnicodeDecodeError:
+                 pass
+             # "<flmeos>" is the native library's end-of-stream sentinel
+             if cur == "<flmeos>":
+                 break
+             if one_by_one:
+                 yield cur
+             else:
+                 res += cur
+                 yield res
+ 
+     def chat(self, tokenizer, query: str, history: List[Tuple[str, str]] = None, max_length: int = 8192,
+              do_sample = True, top_p = 0.8, top_k = 1, temperature = 1.0, repeat_penalty = 1.0, **kwargs):
+         if not history:
+             history = []
+         prompt = query if self.direct_query else self.get_prompt(query, history)
+         input_ids = tokenizer.encode(prompt)
+         handle = fastllm_lib.launch_response_llm_model(self.model, len(input_ids), (ctypes.c_int * len(input_ids))(*input_ids),
+                                                        max_length, do_sample, top_p, top_k, temperature, repeat_penalty)
+         result = []
+         while True:
+             cur = fastllm_lib.fetch_response_llm_model(self.model, handle)
+             if cur == -1:
+                 break
+             result.append(cur)
+         response = tokenizer.decode(result)
+         history = history + [(query, response)]
+         return response, history
+ 
+     def stream_chat(self, tokenizer, query: str, history: List[Tuple[str, str]] = None, past_key_values = None,
+                     max_length: int = 8192, do_sample = True, top_p = 0.8, top_k = 1, temperature = 1.0, repeat_penalty = 1.0,
+                     return_past_key_values = False, **kwargs):
+         if not history:
+             history = []
+         prompt = query if self.direct_query else self.get_prompt(query, history)
+         input_ids = tokenizer.encode(prompt)
+         handle = fastllm_lib.launch_response_llm_model(self.model, len(input_ids), (ctypes.c_int * len(input_ids))(*input_ids),
+                                                        max_length, do_sample, top_p, top_k, temperature, repeat_penalty)
+         tokens = []
+         while True:
+             cur = fastllm_lib.fetch_response_llm_model(self.model, handle)
+             if cur == -1:
+                 break
+             tokens.append(cur)
+             response = tokenizer.decode(tokens)
+             new_history = history + [(query, response)]
+             if return_past_key_values:
+                 yield response, new_history, None
+             else:
+                 yield response, new_history
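
A minimal sketch of driving these bindings once a converted model exists on disk (the .flm path and thread count are illustrative):

    from fastllm_pytools import llm

    llm.set_cpu_threads(8)                # size of the native thread pool
    m = llm.model("baichuan2-int8.flm")   # illustrative path to a converted model

    # blocking, single-shot reply
    print(m.response("What is fastllm?"))

    # incremental decoding: pieces are yielded as soon as they decode as valid UTF-8
    for piece in m.stream_response("Tell me a story", one_by_one = True):
        print(piece, end = "", flush = True)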
torch2flm.py ADDED
@@ -0,0 +1,89 @@
+ import struct
+ 
+ import numpy as np
+ 
+ def writeString(fo, s):
+     # length-prefixed UTF-8 string; the length counts bytes, not characters
+     b = s.encode()
+     fo.write(struct.pack('i', len(b)))
+     fo.write(b)
+ 
+ def writeKeyValue(fo, key, value):
+     writeString(fo, key)
+     writeString(fo, value)
+ 
+ def tofile(exportPath,
+            model,
+            tokenizer = None,
+            pre_prompt = None,
+            user_role = None,
+            bot_role = None,
+            history_sep = None):
+     state_dict = model.state_dict()
+     fo = open(exportPath, "wb")
+ 
+     # 0. version id
+     fo.write(struct.pack('i', 2))
+ 
+     # 0.1 model info
+     modelInfo = model.config.__dict__
+     if "model_type" not in modelInfo:
+         print("unknown model_type.")
+         exit(1)
+ 
+     if pre_prompt:
+         modelInfo["pre_prompt"] = pre_prompt
+     if user_role:
+         modelInfo["user_role"] = user_role
+     if bot_role:
+         modelInfo["bot_role"] = bot_role
+     if history_sep:
+         modelInfo["history_sep"] = history_sep
+     if modelInfo["model_type"] == "baichuan" and hasattr(model, "model") and hasattr(model.model, "get_alibi_mask"):
+         # Baichuan 2: uses ALiBi, and chat roles are encoded as special tokens
+         modelInfo["use_alibi"] = "1"
+         modelInfo["pre_prompt"] = ""
+         modelInfo["user_role"] = tokenizer.decode([model.generation_config.user_token_id])
+         modelInfo["bot_role"] = tokenizer.decode([model.generation_config.assistant_token_id])
+         modelInfo["history_sep"] = ""
+ 
+     fo.write(struct.pack('i', len(modelInfo)))
+     for it in modelInfo.keys():
+         writeKeyValue(fo, str(it), str(modelInfo[it]))
+ 
+     # 1. vocab: each entry is (byte count, the bytes as ints, token id)
+     if tokenizer:
+         if hasattr(tokenizer, "sp_model"):
+             piece_size = tokenizer.sp_model.piece_size()
+             fo.write(struct.pack('i', piece_size))
+             for i in range(piece_size):
+                 s = tokenizer.sp_model.id_to_piece(i).encode()
+                 fo.write(struct.pack('i', len(s)))
+                 for c in s:
+                     fo.write(struct.pack('i', c))
+                 fo.write(struct.pack('i', i))
+         else:
+             vocab = tokenizer.get_vocab()
+             fo.write(struct.pack('i', len(vocab)))
+             for v in vocab.keys():
+                 s = v.encode()
+                 fo.write(struct.pack('i', len(s)))
+                 for c in s:
+                     fo.write(struct.pack('i', c))
+                 fo.write(struct.pack('i', vocab[v]))
+     else:
+         fo.write(struct.pack('i', 0))
+ 
+     # 2. weight: name, shape, data type (0 = float32), raw data
+     fo.write(struct.pack('i', len(state_dict)))
+     tot = 0
+     for key in state_dict:
+         cur = state_dict[key].numpy().astype(np.float32)
+         fo.write(struct.pack('i', len(key)))
+         fo.write(key.encode())
+         fo.write(struct.pack('i', len(cur.shape)))
+         for i in cur.shape:
+             fo.write(struct.pack('i', i))
+         fo.write(struct.pack('i', 0))
+         fo.write(cur.data)
+         tot += 1
+         print("output (", tot, "/", len(state_dict), end = " )\r")
+     print("\nfinish.")
+     fo.close()
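
A minimal export sketch, assuming torch2flm.py is importable from the working directory and a Hugging Face checkpoint is already downloaded (the model name is illustrative). Everything is written as float32, so any quantization happens later on the fastllm side:

    from transformers import AutoModelForCausalLM, AutoTokenizer
    import torch2flm

    path = "THUDM/chatglm2-6b"  # illustrative checkpoint
    tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code = True)
    model = AutoModelForCausalLM.from_pretrained(path, trust_remote_code = True).float().cpu()

    # writes version id, config dict, vocab, and float32 weights into one .flm file
    torch2flm.tofile("chatglm2-6b-fp32.flm", model, tokenizer)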