# -*- coding: utf-8 -*-
"""
Created on Tue Jul 25 10:36:41 2023
This script uses LangChain and Chroma to load, split and store PID data
@author: intern.giwon.kim
"""
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import UnstructuredURLLoader
import os
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import Docx2txtLoader
from langchain.document_loaders import TextLoader
import datetime
# Fail fast with a clear message if the OpenAI API key is missing.
# (Assigning os.getenv(...) directly into os.environ would raise an
# opaque TypeError when the variable is unset, since getenv returns None.)
_api_key = os.getenv("OPENAI_API_KEY")
if not _api_key:
    raise RuntimeError("OPENAI_API_KEY environment variable is not set")
os.environ["OPENAI_API_KEY"] = _api_key
# urls = [
# "https://www.adb.org/sites/default/files/project-documents/49006/49006-003-pcr-en.pdf",
# "https://www.adb.org/sites/default/files/project-documents/38412/38412-013-38412-023-38412-033-43069-012-pcr-en.pdf",
# ]
def preProcess():
    """Load documents from ./DataSource, split them into chunks, and
    persist their OpenAI embeddings into a local Chroma vector store.

    Supported extensions: .pdf, .docx, .doc, .txt — anything else is
    skipped. Side effects: writes the vector index to "ChromaDB/" and
    prints timestamped progress messages. Returns None.
    """
    # --- Document loading ---
    now = datetime.datetime.now()
    start_time = now.time()
    print("Loading Document - " + str(start_time))
    documents = []
    doc_num = 0
    for file in os.listdir('DataSource'):
        file_path = os.path.join('.', 'DataSource', file)
        if file.endswith('.pdf'):
            loader = PyPDFLoader(file_path)
        elif file.endswith(('.docx', '.doc')):
            loader = Docx2txtLoader(file_path)
        elif file.endswith('.txt'):
            # latin-1 decodes every possible byte, avoiding UnicodeDecodeError
            # on text files with unknown encodings.
            loader = TextLoader(file_path, encoding='latin-1')
        else:
            continue  # skip unsupported file types
        documents.extend(loader.load())
        # Count only files that were actually loaded (the original counted
        # every directory entry, including unsupported ones).
        doc_num += 1
    print(f"{doc_num} number of document loaded")
    # --- Document chunking ---
    now = datetime.datetime.now()
    print("Splitting Document - " + str(now.time()))
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    documents = text_splitter.split_documents(documents)
    # --- Embedding + persistence (calls the OpenAI API per chunk) ---
    now = datetime.datetime.now()
    print("Embedding Document - " + str(now.time()))
    embeddings = OpenAIEmbeddings()
    db2 = Chroma.from_documents(documents, embeddings, persist_directory="ChromaDB/")
    db2.persist()  # flush the index to disk
    db2 = None  # drop the handle so Chroma finalizes its files
|