| from langchain.vectorstores import Chroma | |
| from langchain.embeddings import HuggingFaceEmbeddings | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from langchain.document_loaders import DirectoryLoader, UnstructuredPDFLoader | |
| import tiktoken | |
# Recursively collect every PDF under ./apple_amazon_intel and parse each one
# with UnstructuredPDFLoader, yielding one Document per file.
pdf_loader = DirectoryLoader(
    "./apple_amazon_intel",
    glob="**/*.pdf",
    loader_cls=UnstructuredPDFLoader,
)
documents = pdf_loader.load()
def tiktoken_len(text):
    """Return the number of GPT-4 tokens in *text*.

    Used as the length function for the text splitter, so chunk sizes are
    measured in model tokens rather than characters.

    Args:
        text: The string to tokenize.

    Returns:
        int: Token count under the GPT-4 encoding (cl100k_base).
    """
    # The encoder build is expensive and this function is called for every
    # candidate chunk, so create it once and cache it on the function object.
    enc = getattr(tiktoken_len, "_encoder", None)
    if enc is None:
        enc = tiktoken.encoding_for_model("gpt-4")
        tiktoken_len._encoder = enc
    # disallowed_special=() treats special-token text (e.g. "<|endoftext|>")
    # as ordinary text instead of raising.
    return len(enc.encode(text, disallowed_special=()))
# Split the loaded documents into overlapping chunks. Sizes are measured in
# GPT-4 tokens via tiktoken_len, not characters; the splitter tries each
# separator in order, preferring paragraph breaks over smaller units.
splitter_separators = ["\n\n", "\n", " ", ""]
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=4000,
    chunk_overlap=400,
    length_function=tiktoken_len,
    separators=splitter_separators,
)
texts = text_splitter.split_documents(documents)
# Directory where Chroma writes its on-disk index.
# (Fixes the original misspelling "persist_direcory".)
persist_directory = "db_index"

# Local sentence-transformers model; no API key needed.
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Embed every chunk and build the vector store on disk.
db = Chroma.from_documents(
    texts, embedding=embeddings, persist_directory=persist_directory
)
# Flush the index to disk so it can be reloaded later with
# Chroma(persist_directory=..., embedding_function=...).
db.persist()
print("done")