import os.path
import json

from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from huggingface_hub import InferenceClient
from tqdm import tqdm

from doc_processor import DocProcessor


class ContextualDocProcessor(DocProcessor):
    """Document processor that prepends an LLM-generated summary to each chunk.

    For every ``.json`` file listed in ``LIST_DIR``, the whole document is
    summarized once via the Hugging Face Inference API; each chunk produced by
    the splitter is then prefixed with that summary so downstream retrieval
    sees the chunk in context ("contextual retrieval").
    """

    def __init__(self, LIST_DIR, PATH_SAVE):
        """Delegate initialization to the base DocProcessor.

        Args:
            LIST_DIR: Iterable of file paths to process (only ``.json`` files
                are used). NOTE(review): despite the name, this looks like a
                list of file paths, not a directory — confirm against callers.
            PATH_SAVE: Destination path handled by the base class.
        """
        super().__init__(LIST_DIR, PATH_SAVE)

    def create_chuncks(self, nb_char=1000, chunk_overlap=100):
        """Split the JSON documents into chunks, each prefixed with a summary.

        Args:
            nb_char: Nominal chunk budget in characters; the splitter is run
                with half this value (``nb_char // 2``) so that the prepended
                summary plus the chunk stays near the budget.
            chunk_overlap: Character overlap between consecutive chunks.

        Side effects:
            Sets ``self.chunks`` to the list of contextualized chunks and
            calls a remote inference endpoint once per document.
        """
        # NOTE(review): method name keeps the original spelling ("chuncks")
        # for backward compatibility with existing callers.
        split_nb_char = nb_char // 2
        data = []
        context = {}
        # One client serves all requests — constructing it per document
        # (as before) is wasteful.
        client = InferenceClient()
        for path in tqdm(self.LIST_DIR):
            # Guard clause: skip anything that is not a JSON file.
            if not path.endswith(".json"):
                continue
            # Explicit encoding: JSON is UTF-8 by spec; the default locale
            # encoding is platform-dependent.
            with open(path, "r", encoding="utf-8") as f:
                doc = str(json.load(f))
            data.append(Document(page_content=doc, metadata={"source": path}))
            prompt = f"Résume le document ci-dessous en 500 caractères maximum:\n\nDocument:\n{doc}"
            response = client.chat_completion(
                model="deepseek-ai/DeepSeek-V3-0324",
                temperature=0,
                messages=[
                    {
                        "role": "user",
                        "content": prompt,
                    },
                ],
            )
            # Map each source path to its summary for the prefixing pass below.
            context[path] = response.choices[0].message.content
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=split_nb_char,
            chunk_overlap=chunk_overlap,
            add_start_index=True,
            strip_whitespace=True,
            # Prefer JSON-ish boundaries ("}", "]") before falling back to
            # paragraph / sentence / word / character splits.
            separators=["}", "]", "\n\n", "\n", ".", " ", ""],
        )
        chunks = text_splitter.split_documents(data)
        # Prepend each chunk with its document's summary so the chunk carries
        # document-level context into retrieval.
        for chunk in chunks:
            chunk.page_content = (
                context[chunk.metadata["source"]]
                + "\n\nVoici un extrait du document:\n"
                + chunk.page_content
            )
        self.chunks = chunks
        print("Chunks with context created")