"""Build a Chroma vector store from the text file named by TYREX_DATA_PATH.

Importing this module loads ``.env`` from the current working directory and
instantiates the ``cl100k_base`` tiktoken encoding used to measure chunk sizes.
"""
import os
from pathlib import Path

import tiktoken
from dotenv import load_dotenv
from langchain_chroma import Chroma
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

from app_config import VECTOR_MAX_TOKENS, VECTORS_TOKEN_OVERLAP_SIZE, ROOT_DIR

# Load environment variables (TYREX_DATA_PATH is read later) from ./.env.
env_path = Path('.') / '.env'
load_dotenv(dotenv_path=env_path)

tokenizer = tiktoken.get_encoding('cl100k_base')


# create the length function
def tiktoken_len(text: str) -> int:
    """Return the number of ``cl100k_base`` tokens in *text*.

    ``disallowed_special=()`` is passed so that text which happens to contain
    special-token markers is encoded instead of being rejected by tiktoken.
    """
    tokens = tokenizer.encode(
        text,
        disallowed_special=(),
    )
    return len(tokens)


def get_vectorstore() -> Chroma:
    """Read the data file, split it into token-bounded chunks and index them.

    The file path is ``ROOT_DIR`` joined with the ``TYREX_DATA_PATH``
    environment variable. Chunks are at most ``VECTOR_MAX_TOKENS`` tokens
    (as measured by :func:`tiktoken_len`) with ``VECTORS_TOKEN_OVERLAP_SIZE``
    tokens of overlap, and are embedded with the ``BAAI/bge-small-en`` model
    on CPU using normalized embeddings.

    Returns:
        A ``Chroma`` vector store built from the chunks. No persist directory
        is passed, so the store is created with Chroma's defaults.

    Raises:
        FileNotFoundError: If ``TYREX_DATA_PATH`` is not set, or the file it
            names does not exist under ``ROOT_DIR``.
    """
    model_name = "BAAI/bge-small-en"
    model_kwargs = {"device": "cpu"}
    encode_kwargs = {"normalize_embeddings": True}
    hf = HuggingFaceBgeEmbeddings(
        model_name=model_name,
        model_kwargs=model_kwargs,
        encode_kwargs=encode_kwargs,
    )

    data_path = os.getenv('TYREX_DATA_PATH')
    if data_path is None:
        # Previously the missing variable was stringified to "None" and the
        # failure surfaced as a confusing "<ROOT_DIR>/None not found" error.
        raise FileNotFoundError(
            "TYREX_DATA_PATH environment variable is not set; "
            "cannot locate the data file"
        )

    # The context manager guarantees the handle is closed even if read() fails.
    # NOTE(review): the file is decoded with the platform default encoding, as
    # before — confirm the data file's encoding and pin it if it is UTF-8.
    with open(os.path.join(ROOT_DIR, data_path), "r") as f:
        data = f.read()

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=VECTOR_MAX_TOKENS,
        chunk_overlap=VECTORS_TOKEN_OVERLAP_SIZE,
        length_function=tiktoken_len,  # sizes are counted in tokens, not chars
        separators=["\n\n\n", "\n\n", "\n", " ", ""],
    )

    all_splits = text_splitter.split_text(data)
    vectorstore = Chroma.from_texts(texts=all_splits, embedding=hf)
    return vectorstore