File size: 1,362 Bytes
accb514
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
import tiktoken
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_community.embeddings import HuggingFaceBgeEmbeddings

from app_config import VECTOR_MAX_TOKENS, VECTORS_TOKEN_OVERLAP_SIZE,ROOT_DIR
from dotenv import load_dotenv
from pathlib import Path
import os
# Load environment variables (e.g. TYREX_DATA_PATH, read in get_vectorstore)
# from a .env file in the current working directory.
env_path = Path('.') / '.env'
load_dotenv(dotenv_path=env_path)
# Module-level tokenizer used by tiktoken_len to measure chunk length in
# cl100k_base tokens rather than characters.
tokenizer = tiktoken.get_encoding('cl100k_base')

# create the length function
def tiktoken_len(text):
    """Return the number of cl100k_base tokens in *text*.

    Passing ``disallowed_special=()`` means no substring is rejected as a
    special token, so arbitrary input encodes without raising.
    """
    return len(tokenizer.encode(text, disallowed_special=()))

def get_vectorstore():
    """Build and return a Chroma vector store over the TYREX data file.

    Reads the text file at ``ROOT_DIR / $TYREX_DATA_PATH``, splits it into
    token-bounded chunks (measured with ``tiktoken_len``), embeds each chunk
    with the BAAI/bge-small-en model on CPU, and loads the embeddings into a
    Chroma store.

    Returns:
        Chroma: vector store populated with the embedded text chunks.

    Raises:
        FileNotFoundError: if the data file does not exist (including when
            the TYREX_DATA_PATH environment variable is unset, which yields
            the literal path component "None").
    """
    model_name = "BAAI/bge-small-en"
    model_kwargs = {"device": "cpu"}
    # Normalized embeddings make inner-product search equivalent to cosine
    # similarity, which is what BGE models are tuned for.
    encode_kwargs = {"normalize_embeddings": True}
    hf = HuggingFaceBgeEmbeddings(
        model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
    )

    # Fix: the original opened the file without ever closing it (leaked the
    # handle, even on exceptions). Use a context manager and an explicit
    # encoding so the read is deterministic across platforms.
    data_path = os.path.join(ROOT_DIR, str(os.getenv('TYREX_DATA_PATH')))
    with open(data_path, "r", encoding="utf-8") as f:
        data = f.read()

    # chunk_size/chunk_overlap are in tokens because length_function counts
    # cl100k_base tokens, not characters.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=VECTOR_MAX_TOKENS,
        chunk_overlap=VECTORS_TOKEN_OVERLAP_SIZE,
        length_function=tiktoken_len,
        separators=["\n\n\n", "\n\n", "\n", " ", ""]
    )

    all_splits = text_splitter.split_text(data)

    vectorstore = Chroma.from_texts(texts=all_splits, embedding=hf)
    return vectorstore