|
|
import os.path |
|
|
import json |
|
|
from langchain.docstore.document import Document |
|
|
from langchain.text_splitter import RecursiveCharacterTextSplitter |
|
|
|
|
|
class DocProcessor:
    """Load JSON files, split them into text chunks, and cache the chunks as JSONL.

    Attributes:
        LIST_DIR: Iterable of input file paths; only ``.json`` files are read.
        PATH_SAVE: Path of the JSONL cache file holding serialized chunks.
        chunks: List of ``Document`` chunks (set by ``process_data``).
        docs_size: Number of chunks (set by ``process_data``).
    """

    def __init__(self, LIST_DIR, PATH_SAVE):
        # Parameter/attribute names kept verbatim for backward compatibility.
        self.LIST_DIR = LIST_DIR
        self.PATH_SAVE = PATH_SAVE

    def process_data(self):
        """Populate ``self.chunks`` and ``self.docs_size``.

        If ``PATH_SAVE`` already exists the chunks are loaded from that cache;
        otherwise they are created from ``LIST_DIR`` and written to the cache.
        """
        if os.path.exists(self.PATH_SAVE):
            self.load_docs_from_jsonl()
        else:
            self.create_chunks()
            self.save_docs_to_jsonl()

        self.docs_size = len(self.chunks)

    def create_chunks(self, nb_char=1000, chunk_overlap=100):
        """Read every ``.json`` file in ``LIST_DIR`` and split it into chunks.

        Args:
            nb_char: Maximum chunk size in characters.
            chunk_overlap: Number of characters shared between adjacent chunks.

        Side effects:
            Sets ``self.chunks`` to the resulting list of ``Document`` objects.
        """
        data = []
        for path in self.LIST_DIR:
            if not path.endswith(".json"):
                continue
            # Explicit encoding avoids platform-dependent default codecs.
            with open(path, 'r', encoding='utf-8') as f:
                data.append(
                    Document(page_content=str(json.load(f)),
                             metadata={"source": path})
                )

        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=nb_char,
            chunk_overlap=chunk_overlap,
            add_start_index=True,
            strip_whitespace=True,
            # JSON-structure separators first, then generic text separators.
            separators=["}", "]", "\n\n", "\n", ".", " ", ""],
        )

        self.chunks = text_splitter.split_documents(data)
        print("Chunks created")

    # Backward-compatible alias for the original (misspelled) method name.
    create_chuncks = create_chunks

    def save_docs_to_jsonl(self):
        """Write ``self.chunks`` to ``PATH_SAVE``, one JSON document per line."""
        with open(self.PATH_SAVE, 'w', encoding='utf-8') as jsonl_file:
            for doc in self.chunks:
                jsonl_file.write(doc.json() + '\n')
        print("Data saved")

    def load_docs_from_jsonl(self):
        """Rebuild ``self.chunks`` from the JSONL cache at ``PATH_SAVE``."""
        self.chunks = []
        with open(self.PATH_SAVE, 'r', encoding='utf-8') as jsonl_file:
            for line in jsonl_file:
                self.chunks.append(Document(**json.loads(line)))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|