File size: 2,142 Bytes
7b295db |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 |
import os.path
import json
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
class DocProcessor:
    """Load JSON files, split them into text chunks, and cache the chunks as JSONL.

    On :meth:`process_data`, if the cache file ``PATH_SAVE`` already exists the
    chunks are loaded from it; otherwise they are computed from ``LIST_DIR``
    and written to the cache.
    """

    def __init__(self, LIST_DIR, PATH_SAVE):
        # LIST_DIR: iterable of file paths; only entries ending in ".json" are read.
        # PATH_SAVE: path of the JSONL cache file holding the computed chunks.
        self.LIST_DIR = LIST_DIR
        self.PATH_SAVE = PATH_SAVE

    def process_data(self):
        """Populate ``self.chunks`` (from cache when present) and ``self.docs_size``."""
        if os.path.exists(self.PATH_SAVE):
            self.load_docs_from_jsonl()
        else:
            self.create_chuncks()
            self.save_docs_to_jsonl()
        self.docs_size = len(self.chunks)

    def create_chuncks(self, nb_char=1000, chunk_overlap=100):
        """Read every ".json" path in ``LIST_DIR`` and split into overlapping chunks.

        Args:
            nb_char: maximum chunk size, in characters.
            chunk_overlap: number of characters shared by consecutive chunks.

        Side effects: sets ``self.chunks`` to the resulting Document list.
        """
        data = []
        for path in self.LIST_DIR:
            if path.endswith(".json"):
                # Parsed JSON is stringified so the character splitter can
                # treat the whole document as plain text.
                with open(path, 'r', encoding="utf-8") as f:
                    data.append(
                        Document(page_content=str(json.load(f)),
                                 metadata={"source": path})
                    )
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=nb_char,
            chunk_overlap=chunk_overlap,
            add_start_index=True,
            strip_whitespace=True,
            # Prefer JSON structural boundaries ("}", "]") before falling back
            # to whitespace/character splits.
            separators=["}", "]", "\n\n", "\n", ".", " ", ""],
        )
        self.chunks = text_splitter.split_documents(data)
        print("Chunks created")

    def save_docs_to_jsonl(self):
        """Serialize ``self.chunks`` to ``PATH_SAVE``, one JSON document per line."""
        with open(self.PATH_SAVE, 'w', encoding="utf-8") as jsonl_file:
            for doc in self.chunks:
                jsonl_file.write(doc.json() + '\n')
        print("Data saved")

    def load_docs_from_jsonl(self):
        """Rebuild ``self.chunks`` from the JSONL cache at ``PATH_SAVE``."""
        self.chunks = []
        with open(self.PATH_SAVE, 'r', encoding="utf-8") as jsonl_file:
            for line in jsonl_file:
                self.chunks.append(Document(**json.loads(line)))
|