# Agent_UB / doc_processor.py
# Uploaded via huggingface_hub by t-pris (commit 7b295db, verified)
import os.path
import json
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
class DocProcessor:
    """Load JSON files, split them into text chunks, and cache the chunks as JSONL.

    Chunks are persisted to ``PATH_SAVE`` so subsequent runs reload the cache
    instead of re-reading and re-splitting the source documents.
    """

    def __init__(self, LIST_DIR, PATH_SAVE):
        """
        Args:
            LIST_DIR: iterable of file paths; only entries ending in ".json"
                are read. (Despite the name, this is a list of paths, not a
                directory — see the loop in ``create_chuncks``.)
            PATH_SAVE: path of the JSONL cache file for serialized chunks.
        """
        self.LIST_DIR = LIST_DIR
        self.PATH_SAVE = PATH_SAVE

    def process_data(self):
        """Load chunks from the cache if present, otherwise create and save them.

        Side effects: sets ``self.chunks`` (list of Document) and
        ``self.docs_size`` (number of chunks).
        """
        if os.path.exists(self.PATH_SAVE):
            self.load_docs_from_jsonl()
        else:
            self.create_chuncks()
            self.save_docs_to_jsonl()
        self.docs_size = len(self.chunks)

    def create_chuncks(self, nb_char=1000, chunk_overlap=100):
        """Read every ".json" path in LIST_DIR and split into overlapping chunks.

        Args:
            nb_char: maximum chunk size in characters.
            chunk_overlap: characters shared between consecutive chunks.

        Side effects: sets ``self.chunks``.
        """
        data = []
        for path in self.LIST_DIR:
            if path.endswith(".json"):
                # Explicit UTF-8: JSON files are UTF-8 by convention and the
                # platform default encoding may differ (e.g. on Windows).
                with open(path, "r", encoding="utf-8") as f:
                    # Each file becomes one Document: the parsed JSON object
                    # stringified wholesale, tagged with its source path.
                    data.append(
                        Document(page_content=str(json.load(f)),
                                 metadata={"source": path})
                    )
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=nb_char,
            chunk_overlap=chunk_overlap,
            add_start_index=True,
            strip_whitespace=True,
            # JSON-aware separators first so splits prefer object/array ends
            # before falling back to paragraph/sentence/word boundaries.
            separators=["}", "]", "\n\n", "\n", ".", " ", ""],
        )
        self.chunks = text_splitter.split_documents(data)
        print("Chunks created")

    # Correctly-spelled alias; ``create_chuncks`` is kept so existing
    # callers are not broken.
    create_chunks = create_chuncks

    def save_docs_to_jsonl(self):
        """Serialize each chunk as one JSON object per line in PATH_SAVE."""
        with open(self.PATH_SAVE, "w", encoding="utf-8") as jsonl_file:
            for doc in self.chunks:
                jsonl_file.write(doc.json() + "\n")
        print("Data saved")

    def load_docs_from_jsonl(self):
        """Rebuild ``self.chunks`` from the JSONL cache at PATH_SAVE."""
        self.chunks = []
        with open(self.PATH_SAVE, "r", encoding="utf-8") as jsonl_file:
            for line in jsonl_file:
                # Each line is a dict of Document fields written by
                # save_docs_to_jsonl.
                self.chunks.append(Document(**json.loads(line)))