|
|
import os.path |
|
|
import json |
|
|
from langchain.docstore.document import Document |
|
|
from langchain.text_splitter import RecursiveCharacterTextSplitter |
|
|
from huggingface_hub import InferenceClient |
|
|
from tqdm import tqdm |
|
|
|
|
|
from doc_processor import DocProcessor |
|
|
|
|
|
class ContextualDocProcessor(DocProcessor):
    """Document processor that prefixes every chunk with an LLM-generated
    summary of its whole source document ("contextual retrieval" chunking).

    Only ``.json`` files in ``LIST_DIR`` are processed; each one is loaded,
    summarized once via the Hugging Face Inference API, split into chunks,
    and each chunk is rewritten as ``summary + extract``.
    """

    def __init__(self, LIST_DIR, PATH_SAVE):
        """
        Args:
            LIST_DIR: iterable of document file paths to process.
            PATH_SAVE: save path forwarded to the parent ``DocProcessor``.
        """
        super().__init__(LIST_DIR, PATH_SAVE)

    def create_chuncks(self, nb_char=1000, chunk_overlap=100):
        """Split the JSON documents into contextualized chunks.

        Args:
            nb_char: overall character budget per chunk; the splitter is run
                with ``nb_char // 2`` so the prepended summary (up to ~500
                chars) keeps the final chunk near ``nb_char``.
            chunk_overlap: number of overlapping characters between
                consecutive chunks.

        Side effects:
            Sets ``self.chunks`` to the list of contextualized Document
            chunks and issues one chat-completion API call per document.
        """
        split_nb_char = nb_char // 2
        data = []
        context = {}

        # Hoisted out of the loop: one client serves every request instead
        # of constructing a new InferenceClient per file.
        client = InferenceClient()

        for path in tqdm(self.LIST_DIR):
            # Guard clause: only JSON documents are supported.
            if not path.endswith(".json"):
                continue

            # Explicit encoding avoids platform-dependent decode failures.
            with open(path, 'r', encoding='utf-8') as f:
                doc = str(json.load(f))

            data.append(Document(page_content=doc, metadata={"source": path}))

            # Ask the model for a short (<= 500 chars, French) summary of the
            # full document; it is later prepended to each of its chunks.
            prompt = f"Résume le document ci-dessous en 500 caractères maximum:\n\nDocument:\n{doc}"
            response = client.chat_completion(
                model="deepseek-ai/DeepSeek-V3-0324",
                temperature=0,
                messages=[
                    {
                        "role": "user",
                        "content": prompt,
                    },
                ],
            )
            context[path] = response.choices[0].message.content

        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=split_nb_char,
            chunk_overlap=chunk_overlap,
            add_start_index=True,
            strip_whitespace=True,
            # JSON-closing separators first, then generic text fallbacks.
            separators=["}", "]", "\n\n", "\n", ".", " ", ""],
        )
        chunks = text_splitter.split_documents(data)

        # Prepend each chunk with the summary of the document it came from.
        for chunk in chunks:
            chunk.page_content = (
                context[chunk.metadata['source']]
                + "\n\nVoici un extrait du document:\n"
                + chunk.page_content
            )

        self.chunks = chunks
        print("Chunks with context created")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|