Tyrex_Chatbot / utils /functions.py
pritmanvar-bacancy's picture
initial commit
accb514 verified
import tiktoken
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from app_config import VECTOR_MAX_TOKENS, VECTORS_TOKEN_OVERLAP_SIZE,ROOT_DIR
from dotenv import load_dotenv
from pathlib import Path
import os
# Load environment variables from the `.env` file located in the current
# working directory (the path is relative, so it depends on where the
# process is started from).
env_path = Path(".env")
load_dotenv(dotenv_path=env_path)

# Module-level tiktoken encoder shared by the token-counting helper below.
tokenizer = tiktoken.get_encoding("cl100k_base")
def tiktoken_len(text):
    """Return the number of tokens `text` encodes to with the module tokenizer.

    `disallowed_special=()` is passed so that special-token strings appearing
    in `text` are encoded as ordinary text instead of raising an error.
    """
    return len(tokenizer.encode(text, disallowed_special=()))
def get_vectorstore():
    """Build a Chroma vector store from the text file named by TYREX_DATA_PATH.

    The file at ``ROOT_DIR / $TYREX_DATA_PATH`` is read in full, split into
    chunks whose size and overlap are measured in tokens (via
    ``tiktoken_len``), embedded with the ``BAAI/bge-small-en`` model on CPU,
    and loaded into a new Chroma store.

    Returns:
        The Chroma vector store created from the chunks. No persist directory
        is passed, so the store uses Chroma's defaults.

    Raises:
        FileNotFoundError: If the TYREX_DATA_PATH environment variable is not
            set, or if the file it points to does not exist.
    """
    data_path = os.getenv("TYREX_DATA_PATH")
    if data_path is None:
        # Previously str(None) produced the literal path component "None",
        # which failed later with a misleading "file not found" message.
        raise FileNotFoundError(
            "TYREX_DATA_PATH environment variable is not set; "
            "cannot locate the data file"
        )

    # Use a context manager so the file handle is always closed, even if
    # reading fails (the original left the handle open).
    with open(os.path.join(ROOT_DIR, data_path), "r") as data_file:
        data = data_file.read()

    embeddings = HuggingFaceBgeEmbeddings(
        model_name="BAAI/bge-small-en",
        model_kwargs={"device": "cpu"},
        encode_kwargs={"normalize_embeddings": True},
    )

    # Chunk size and overlap are counted in tokens because the length
    # function is tiktoken_len rather than len.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=VECTOR_MAX_TOKENS,
        chunk_overlap=VECTORS_TOKEN_OVERLAP_SIZE,
        length_function=tiktoken_len,
        separators=["\n\n\n", "\n\n", "\n", " ", ""],
    )
    all_splits = text_splitter.split_text(data)

    return Chroma.from_texts(texts=all_splits, embedding=embeddings)