# NOTE: the lines "Spaces: / Sleeping / Sleeping" below this file's original
# header were a Hugging Face Spaces status banner captured during extraction,
# not part of the source code.
import os
from pathlib import Path

import tiktoken
from dotenv import load_dotenv
from langchain_chroma import Chroma
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

from app_config import ROOT_DIR, VECTOR_MAX_TOKENS, VECTORS_TOKEN_OVERLAP_SIZE

# Pull configuration (e.g. TYREX_DATA_PATH) from a .env file in the
# current working directory into the process environment.
env_path = Path('.') / '.env'
load_dotenv(dotenv_path=env_path)

# Module-wide tokenizer; chunk sizes below are measured in cl100k_base
# tokens rather than characters.
tokenizer = tiktoken.get_encoding('cl100k_base')
| # create the length function | |
def tiktoken_len(text):
    """Return the number of cl100k_base tokens in *text*.

    Used as the ``length_function`` for the text splitter so that chunk
    sizes are bounded in tokens. ``disallowed_special=()`` lets special
    token strings in the input be encoded as plain text instead of raising.
    """
    return len(tokenizer.encode(text, disallowed_special=()))
def get_vectorstore():
    """Build and return an in-memory Chroma vector store for the Tyrex data.

    Reads the text file at ``ROOT_DIR/$TYREX_DATA_PATH``, splits it into
    token-bounded overlapping chunks, embeds each chunk with a BGE model
    on CPU, and returns the populated Chroma store.

    Returns:
        Chroma: vector store containing one embedded entry per text chunk.

    Raises:
        OSError: if the data file cannot be opened or read.
    """
    # CPU-hosted BGE embeddings; normalizing embeddings makes cosine
    # similarity equivalent to inner product for retrieval.
    model_name = "BAAI/bge-small-en"
    model_kwargs = {"device": "cpu"}
    encode_kwargs = {"normalize_embeddings": True}
    hf = HuggingFaceBgeEmbeddings(
        model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
    )

    # NOTE(review): if TYREX_DATA_PATH is unset, str(None) yields the
    # literal path component "None" — consider failing fast instead.
    data_path = os.path.join(ROOT_DIR, str(os.getenv('TYREX_DATA_PATH')))
    # Context manager guarantees the handle is closed (the original code
    # leaked it); explicit encoding avoids platform-dependent decoding.
    with open(data_path, "r", encoding="utf-8") as f:
        data = f.read()

    # Split on progressively finer separators; chunk size and overlap are
    # measured in tokens via tiktoken_len, not characters.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=VECTOR_MAX_TOKENS,
        chunk_overlap=VECTORS_TOKEN_OVERLAP_SIZE,
        length_function=tiktoken_len,
        separators=["\n\n\n", "\n\n", "\n", " ", ""],
    )
    all_splits = text_splitter.split_text(data)

    vectorstore = Chroma.from_texts(texts=all_splits, embedding=hf)
    return vectorstore