# File size: 2,142 Bytes
# 7b295db
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import os.path
import json
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter

class DocProcessor:
    """Load JSON files, split their contents into overlapping text chunks,
    and cache the chunks as a JSONL file so later runs can skip re-chunking.

    Attributes set by process_data():
        chunks     -- list of langchain Document chunks
        docs_size  -- number of chunks produced/loaded
    """

    def __init__(self, LIST_DIR, PATH_SAVE):
        # LIST_DIR: iterable of file paths; only entries ending in ".json"
        #           are read (name suggests a directory, but the code treats
        #           it as a list of paths).
        # PATH_SAVE: path of the JSONL cache file for the computed chunks.
        self.LIST_DIR = LIST_DIR
        self.PATH_SAVE = PATH_SAVE

    def process_data(self):
        """Populate self.chunks and self.docs_size.

        If the cache file at PATH_SAVE exists, load chunks from it;
        otherwise chunk the source files and write the cache.
        """
        if os.path.exists(self.PATH_SAVE):
            self.load_docs_from_jsonl()
        else:
            self.create_chuncks()
            self.save_docs_to_jsonl()

        self.docs_size = len(self.chunks)

    def create_chuncks(self, nb_char=1000, chunk_overlap=100):
        """Read every ".json" path in LIST_DIR and split into chunks.

        nb_char       -- target chunk size in characters.
        chunk_overlap -- number of overlapping characters between chunks.

        Stores the result in self.chunks.
        """
        data = []
        for path in self.LIST_DIR:
            if path.endswith(".json"):
                # NOTE(review): str(json.load(f)) stores the Python repr of
                # the parsed object (single quotes), not the original JSON
                # text. Kept as-is so existing JSONL caches stay valid —
                # confirm before switching to f.read().
                with open(path, 'r', encoding='utf-8') as f:
                    data.append(Document(page_content=str(json.load(f)),
                                         metadata={"source": path}))

        # Separators start with "}" and "]" so splits prefer the ends of
        # JSON objects/arrays before falling back to whitespace.
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=nb_char,
            chunk_overlap=chunk_overlap,
            add_start_index=True,
            strip_whitespace=True,
            separators=["}", "]", "\n\n", "\n", ".", " ", ""],
        )

        self.chunks = text_splitter.split_documents(data)
        print("Chunks created")

    def save_docs_to_jsonl(self):
        """Write self.chunks to PATH_SAVE, one JSON document per line."""
        with open(self.PATH_SAVE, 'w', encoding='utf-8') as jsonl_file:
            for doc in self.chunks:
                jsonl_file.write(doc.json() + '\n')
        print("Data saved")

    def load_docs_from_jsonl(self):
        """Load chunks from the JSONL cache at PATH_SAVE into self.chunks."""
        self.chunks = []
        with open(self.PATH_SAVE, 'r', encoding='utf-8') as jsonl_file:
            for line in jsonl_file:
                self.chunks.append(Document(**json.loads(line)))