# -*- coding: utf-8 -*-
"""
Created on Tue Jul 25 10:36:41 2023

This script uses LangChain and Chroma to load, split and store PID data

@author: intern.giwon.kim
"""
import datetime
import os

from langchain.document_loaders import Docx2txtLoader
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import TextLoader
from langchain.document_loaders import UnstructuredURLLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma

# The OpenAI API key is taken from the OPENAI_API_KEY environment variable.
# It is deliberately NOT re-assigned into os.environ here: assigning the None
# that os.getenv() returns for a missing variable raises TypeError at import
# time. preProcess() checks for the key and fails with a clear message instead.

# Alternative ingestion path (currently disabled): load documents from URLs.
# urls = [
#     "https://www.adb.org/sites/default/files/project-documents/49006/49006-003-pcr-en.pdf",
#     "https://www.adb.org/sites/default/files/project-documents/38412/38412-013-38412-023-38412-033-43069-012-pcr-en.pdf",
# ]


def _build_loader(file_name):
    """Return a document loader for *file_name*, or None if it is unsupported.

    The extension check is case-insensitive, so e.g. ``report.PDF`` is handled
    the same way as ``report.pdf``.
    """
    # Keep the exact './DataSource/' prefix: the path string is handed to the
    # loader unchanged (presumably recorded as the document source - verify
    # before changing its spelling).
    path = './DataSource/' + file_name
    lowered = file_name.lower()

    if lowered.endswith('.pdf'):
        return PyPDFLoader(path)
    if lowered.endswith(('.docx', '.doc')):
        # NOTE(review): legacy binary .doc files are routed to Docx2txtLoader
        # as before - confirm that loader can actually read them.
        return Docx2txtLoader(path)
    if lowered.endswith('.txt'):
        return TextLoader(path, encoding='latin-1')
    return None


def preProcess() -> None:
    """Load every supported file in ``DataSource``, chunk it and store it in Chroma.

    Steps, each announced on stdout with the current wall-clock time:

    1. Load ``.pdf``, ``.docx``/``.doc`` and ``.txt`` files found directly in
       the ``DataSource`` directory (other entries are skipped).
    2. Split the loaded documents with ``CharacterTextSplitter``
       (``chunk_size=1000``, ``chunk_overlap=0``).
    3. Embed the chunks with ``OpenAIEmbeddings`` and persist them to the
       ``ChromaDB/`` directory.

    Raises:
        RuntimeError: if the ``OPENAI_API_KEY`` environment variable is unset
            or empty. Checked first so no loading work is done needlessly.
    """
    if not os.getenv("OPENAI_API_KEY"):
        raise RuntimeError("OPENAI_API_KEY environment variable is not set")

    # Data Ingestion
    print("Loading Document - " + str(datetime.datetime.now().time()))
    documents = []
    doc_num = 0
    for file in os.listdir('DataSource'):
        loader = _build_loader(file)
        if loader is None:
            # Unsupported extension: skip it and do not count it as loaded.
            continue
        documents.extend(loader.load())
        doc_num += 1
    print(f"{doc_num} number of document loaded")

    # Document Loading (URL variant, disabled)
    # loader = UnstructuredURLLoader(urls=urls)

    # Document Chunking
    print("Splitting Document - " + str(datetime.datetime.now().time()))
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    documents = text_splitter.split_documents(documents)

    # Save Chroma Vector data
    print("Embedding Document - " + str(datetime.datetime.now().time()))
    embeddings = OpenAIEmbeddings()
    db2 = Chroma.from_documents(documents, embeddings, persist_directory="ChromaDB/")
    db2.persist()