gkim93 committed on
Commit
c34bbf2
·
1 Parent(s): a8c56d0

Update PreProcessing.py

Browse files
Files changed (1) hide show
  1. PreProcessing.py +38 -38
PreProcessing.py CHANGED
@@ -22,42 +22,42 @@ os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")
22
  # "https://www.adb.org/sites/default/files/project-documents/49006/49006-003-pcr-en.pdf",
23
  # "https://www.adb.org/sites/default/files/project-documents/38412/38412-013-38412-023-38412-033-43069-012-pcr-en.pdf",
24
  # ]
25
-
26
- # Data Ingestion
27
- now = datetime.datetime.now()
28
- start_time = now.time()
29
- print("Loading Document - " + str(start_time))
30
- documents = []
31
- doc_num = 0
32
- for file in os.listdir('DataSource'):
33
- if file.endswith('.pdf'):
34
- pdf_path = './DataSource/' + file
35
- loader = PyPDFLoader(pdf_path)
36
- documents.extend(loader.load())
37
- elif file.endswith('.docx') or file.endswith('.doc'):
38
- doc_path = './DataSource/' + file
39
- loader = Docx2txtLoader(doc_path, decoding='latin-1')
40
- documents.extend(loader.load())
41
- elif file.endswith('.txt'):
42
- text_path = './DataSource/' + file
43
- loader = TextLoader(text_path)
44
- documents.extend(loader.load())
45
- doc_num = doc_num + 1
46
- print(f"{doc_num} number of document loaded")
47
- #Document Loading
48
- # loader = UnstructuredURLLoader(urls=urls)
49
-
50
- #Document Chunking
51
- now = datetime.datetime.now()
52
- print("Splitting Document - " + str(now.time()))
53
- text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
54
- documents = text_splitter.split_documents(documents)
55
-
56
- #Save Chroma Vector data
57
- now = datetime.datetime.now()
58
- print("Embedding Document - " + str(now.time()))
59
- embeddings = OpenAIEmbeddings()
60
- db2 = Chroma.from_documents(documents, embeddings, persist_directory="ChromaDB/")
61
- db2.persist()
62
- db2 = None
63
 
 
22
  # "https://www.adb.org/sites/default/files/project-documents/49006/49006-003-pcr-en.pdf",
23
  # "https://www.adb.org/sites/default/files/project-documents/38412/38412-013-38412-023-38412-033-43069-012-pcr-en.pdf",
24
  # ]
25
+ def execute():
26
+ # Data Ingestion
27
+ now = datetime.datetime.now()
28
+ start_time = now.time()
29
+ print("Loading Document - " + str(start_time))
30
+ documents = []
31
+ doc_num = 0
32
+ for file in os.listdir('DataSource'):
33
+ if file.endswith('.pdf'):
34
+ pdf_path = './DataSource/' + file
35
+ loader = PyPDFLoader(pdf_path)
36
+ documents.extend(loader.load())
37
+ elif file.endswith('.docx') or file.endswith('.doc'):
38
+ doc_path = './DataSource/' + file
39
+ loader = Docx2txtLoader(doc_path, decoding='latin-1')
40
+ documents.extend(loader.load())
41
+ elif file.endswith('.txt'):
42
+ text_path = './DataSource/' + file
43
+ loader = TextLoader(text_path)
44
+ documents.extend(loader.load())
45
+ doc_num = doc_num + 1
46
+ print(f"{doc_num} number of document loaded")
47
+ #Document Loading
48
+ # loader = UnstructuredURLLoader(urls=urls)
49
+
50
+ #Document Chunking
51
+ now = datetime.datetime.now()
52
+ print("Splitting Document - " + str(now.time()))
53
+ text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
54
+ documents = text_splitter.split_documents(documents)
55
+
56
+ #Save Chroma Vector data
57
+ now = datetime.datetime.now()
58
+ print("Embedding Document - " + str(now.time()))
59
+ embeddings = OpenAIEmbeddings()
60
+ db2 = Chroma.from_documents(documents, embeddings, persist_directory="ChromaDB/")
61
+ db2.persist()
62
+ db2 = None
63