| from jinja2 import Template | |
| from transformers import AutoModel, AutoTokenizer | |
| from .logging import logging_info | |
def initEmbedding(model_name="BAAI/bge-small-zh-v1.5", **model_wargs):
    """Load a pretrained embedding model via ``AutoModel.from_pretrained``.

    Args:
        model_name: Identifier handed to ``from_pretrained``; defaults to
            ``"BAAI/bge-small-zh-v1.5"``.
        **model_wargs: Extra keyword arguments forwarded unchanged to
            ``AutoModel.from_pretrained``.

    Returns:
        Whatever ``AutoModel.from_pretrained`` returns for the given name.
    """
    embedding_model = AutoModel.from_pretrained(model_name, **model_wargs)
    return embedding_model
def initTokenizer(model_name="BAAI/bge-small-zh-v1.5", **model_wargs):
    """Load a pretrained tokenizer via ``AutoTokenizer.from_pretrained``.

    Args:
        model_name: Identifier handed to ``from_pretrained``; defaults to
            ``"BAAI/bge-small-zh-v1.5"``.
        **model_wargs: Extra keyword arguments forwarded unchanged to
            ``AutoTokenizer.from_pretrained``.

    Returns:
        Whatever ``AutoTokenizer.from_pretrained`` returns for the given name.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name, **model_wargs)
    return tokenizer
def detectEncoding(b: bytes):
    """Guess the text encoding of a byte string using ``chardet``.

    The full ``chardet.detect`` result is logged through ``logging_info``
    before the encoding is returned.

    Args:
        b: Raw bytes whose encoding should be guessed.

    Returns:
        The value stored under the ``"encoding"`` key of the
        ``chardet.detect`` result. ``convertToUTF8`` in this file treats it
        as possibly falsy (presumably ``None`` when chardet cannot decide).
    """
    # Imported lazily so chardet is only required when detection is used.
    import chardet

    # Run detection once and reuse the result: the original ran the full
    # (potentially expensive) scan twice, once for logging and once for the
    # return value.
    detection = chardet.detect(b)
    logging_info(f"chardet.detect(b): {detection}")
    return detection["encoding"]
def convertToUTF8(b: bytes) -> str:
    """Decode *b* into a ``str`` using its detected encoding.

    The encoding is guessed with ``detectEncoding``; when no encoding is
    reported (a falsy result), the bytes are decoded as UTF-8 instead.

    Args:
        b: Raw bytes to decode.

    Returns:
        The decoded text.

    Raises:
        UnicodeDecodeError: If the bytes are not valid in the chosen
            encoding (``bytes.decode`` runs with its default strict mode).
        LookupError: If the detected encoding name is not a codec known to
            Python.
    """
    # Detect once and reuse: the original called detectEncoding twice, which
    # repeated the chardet scan and duplicated the log output.
    encoding = detectEncoding(b)
    if not encoding:
        return b.decode("utf-8")
    return b.decode(encoding)