File size: 2,175 Bytes
7b295db
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import os.path
import json
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
import ollama
from tqdm import tqdm

from doc_processor import DocProcessor

class ContextualDocProcessor(DocProcessor):
    """DocProcessor variant that also keeps one LLM-written summary per JSON file.

    After the base-class processing runs, a list of ``Document`` objects is
    stored in ``self.context``: one per ``.json`` file found in
    ``self.DATA_DIR``, whose ``page_content`` is the summary returned by the
    Ollama chat model. The list is cached on disk as JSON Lines at
    ``PATH_SAVE_CONTEXT`` so the model is only queried when no cache exists.
    """

    def __init__(self, DATA_DIR, PATH_SAVE, PATH_SAVE_CONTEXT, *, context_model="llama3"):
        """Initialise the processor.

        Args:
            DATA_DIR: Forwarded to ``DocProcessor.__init__``; ``create_context``
                later reads ``self.DATA_DIR`` as the directory holding the
                source ``.json`` files (presumably set by the base class —
                confirm against ``DocProcessor``).
            PATH_SAVE: Forwarded unchanged to ``DocProcessor.__init__``.
            PATH_SAVE_CONTEXT: Path of the JSON Lines file used to cache the
                generated summaries.
            context_model: Name of the Ollama model asked to summarise each
                document. Keyword-only; defaults to the previously hard-coded
                ``"llama3"``.
        """
        super().__init__(DATA_DIR, PATH_SAVE)
        self.PATH_SAVE_CONTEXT = PATH_SAVE_CONTEXT
        self.context_model = context_model

    def process_data(self):
        """Run the base processing, then load the cached context or build and save it."""
        super().process_data()
        if os.path.exists(self.PATH_SAVE_CONTEXT):
            self.load_context_from_jsonl()
        else:
            self.create_context()
            self.save_context_to_jsonl()

    def create_context(self):
        """Summarise every ``.json`` file in ``self.DATA_DIR`` into ``self.context``.

        Each file is parsed as JSON, stringified, and sent to the Ollama chat
        model with a (French) prompt asking for a summary of at most 500
        characters. The reply is wrapped in a ``Document`` whose ``source``
        metadata and ``id`` are both the file path.
        """
        self.context = []
        for filename in tqdm(os.listdir(self.DATA_DIR)):
            if not filename.endswith(".json"):
                continue

            path = os.path.join(self.DATA_DIR, filename)
            # Explicit UTF-8: the locale default (e.g. cp1252 on Windows)
            # would mis-decode or reject accented characters in the JSON.
            with open(path, 'r', encoding="utf-8") as f:
                doc = str(json.load(f))
            # The file is closed before the (slow) model call so the handle is
            # not held open for the duration of the request.

            prompt = f"Résume le document ci-dessous en 500 caractères maximum:\n\nDocument:\n{doc}"
            response = ollama.chat(
                model=self.context_model,
                messages=[
                    {
                        "role": "user",
                        "content": prompt,
                    },
                ],
            )

            self.context.append(
                Document(
                    page_content=response["message"]["content"],
                    metadata={"source": path},
                    id=path,
                )
            )

        print("Chunks created")

    def save_context_to_jsonl(self):
        """Write ``self.context`` to ``PATH_SAVE_CONTEXT``, one JSON document per line."""
        # UTF-8 so that any character the model produces can be written,
        # independent of the platform's locale encoding.
        with open(self.PATH_SAVE_CONTEXT, 'w', encoding="utf-8") as jsonl_file:
            for doc in self.context:
                jsonl_file.write(doc.json() + '\n')
        print("Context saved")

    def load_context_from_jsonl(self):
        """Rebuild ``self.context`` from the JSON Lines file at ``PATH_SAVE_CONTEXT``.

        Blank lines are skipped so that a trailing or stray empty line in the
        cache does not abort loading with a ``JSONDecodeError``.
        """
        self.context = []
        # NOTE(review): a cache written earlier under a non-UTF-8 locale and
        # containing non-ASCII text would need to be regenerated.
        with open(self.PATH_SAVE_CONTEXT, 'r', encoding="utf-8") as jsonl_file:
            for line in jsonl_file:
                if not line.strip():
                    continue
                data = json.loads(line)
                self.context.append(Document(**data))
        print("Context loaded")