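"""Contextual chunking: each source document is summarized once with an LLM
(DeepSeek-V3 via the Hugging Face Inference API), and that summary is
prepended to every chunk split from the document, so each chunk carries
document-level context for retrieval."""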
import json
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from huggingface_hub import InferenceClient
from tqdm import tqdm

from doc_processor import DocProcessor

class ContextualDocProcessor(DocProcessor):
    def __init__(self, LIST_DIR, PATH_SAVE):
        super().__init__(LIST_DIR, PATH_SAVE)

    def create_chuncks(self, nb_char=1000, chunk_overlap=100):
        """Split every JSON document in LIST_DIR into chunks, each prefixed
        with a short LLM-generated summary of its source document.

        NB: the "chuncks" spelling is kept as-is to avoid breaking callers.
        """
        # Target half of nb_char per chunk so that, once the ~500-character
        # summary is prepended, each final chunk stays close to nb_char.
        split_nb_char = nb_char // 2
        data = []
        context = {}
        client = InferenceClient()  # one client reused for all requests
        for path in tqdm(self.LIST_DIR):
            if path.endswith(".json"):
                with open(path, "r") as f:
                    doc = str(json.load(f))

                data.append(Document(page_content=doc, metadata={"source": path}))

                # French prompt: "Summarize the document below in at most
                # 500 characters".
                prompt = f"Résume le document ci-dessous en 500 caractères maximum:\n\nDocument:\n{doc}"
                response = client.chat_completion(
                    model="deepseek-ai/DeepSeek-V3-0324",
                    temperature=0,
                    messages=[
                        {
                            "role": "user",
                            "content": prompt,
                        },
                    ],
                )

                context[path] = response.choices[0].message.content

        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=split_nb_char,
            chunk_overlap=chunk_overlap,
            add_start_index=True,
            strip_whitespace=True,
            separators=["}", "]", "\n\n", "\n", ".", " ", ""],
        )

        chunks = text_splitter.split_documents(data)

        # Prepend each document's summary to every chunk taken from it.
        # "Voici un extrait du document" = "Here is an excerpt of the document".
        for chunk in chunks:
            chunk.page_content = (
                context[chunk.metadata["source"]]
                + "\n\nVoici un extrait du document:\n"
                + chunk.page_content
            )

        self.chunks = chunks
        self.context = context  # kept so the summaries can be persisted below
        print("Chunks with context created")

    def save_context_to_jsonl(self):
        # PATH_SAVE_CONTEXT is assumed to be provided by DocProcessor or set
        # by the caller; it is referenced here but not defined in this class.
        with open(self.PATH_SAVE_CONTEXT, 'w') as jsonl_file:
            for source, summary in self.context.items():
                jsonl_file.write(
                    json.dumps({"source": source, "context": summary}, ensure_ascii=False) + '\n'
                )
        print("Context saved")

    def load_context_from_jsonl(self):
        self.context = {}
        with open(self.PATH_SAVE_CONTEXT, 'r') as jsonl_file:
            for line in jsonl_file:
                data = json.loads(line)
                self.context[data["source"]] = data["context"]
        print("Context loaded")
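

# Minimal usage sketch. The file paths below and PATH_SAVE_CONTEXT are
# hypothetical illustrations (DocProcessor's exact contract is defined
# elsewhere); InferenceClient also expects an HF token in the environment.
if __name__ == "__main__":
    processor = ContextualDocProcessor(
        LIST_DIR=["data/doc_a.json", "data/doc_b.json"],  # hypothetical paths
        PATH_SAVE="chunks.jsonl",                         # hypothetical path
    )
    processor.PATH_SAVE_CONTEXT = "context.jsonl"  # hypothetical path
    processor.create_chuncks(nb_char=1000, chunk_overlap=100)
    processor.save_context_to_jsonl()  # persist the per-document summaries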