# Agent_UB / contextual_doc_processor.py
# Uploaded via huggingface_hub (commit 7b295db)
import os.path
import json
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from huggingface_hub import InferenceClient
from tqdm import tqdm
from doc_processor import DocProcessor
class ContextualDocProcessor(DocProcessor):
    """Document processor implementing "contextual retrieval": each chunk is
    prefixed with an LLM-generated summary of its source document so the
    chunk remains understandable in isolation."""

    def __init__(self, LIST_DIR, PATH_SAVE):
        """Initialize with a list of file paths and a save path.

        Args:
            LIST_DIR: iterable of document file paths to process.
            PATH_SAVE: destination path used by the base class for saving.
        """
        # Delegate all state setup to the base DocProcessor.
        super().__init__(LIST_DIR, PATH_SAVE)

    def create_chuncks(self, nb_char=1000, chunk_overlap=100):
        """Split every ``.json`` document in ``self.LIST_DIR`` into chunks,
        each prefixed with a per-document summary from DeepSeek-V3.

        Args:
            nb_char: nominal document size; chunks are ``nb_char // 2`` chars.
            chunk_overlap: character overlap between consecutive chunks.

        Side effects:
            Sets ``self.chunks`` to the list of contextualized Documents.
            Calls the Hugging Face Inference API once per document.
        """
        split_nb_char = nb_char // 2
        data = []
        context = {}  # maps source path -> LLM summary of that document
        # Create the client once; the original rebuilt it for every file.
        client = InferenceClient()
        for path in tqdm(self.LIST_DIR):
            if not path.endswith(".json"):
                continue
            # Explicit UTF-8: the prompts/content contain French accents.
            with open(path, "r", encoding="utf-8") as f:
                doc = str(json.load(f))
            data.append(Document(page_content=doc, metadata={"source": path}))
            prompt = f"Résume le document ci-dessous en 500 caractères maximum:\n\nDocument:\n{doc}"
            response = client.chat_completion(
                model="deepseek-ai/DeepSeek-V3-0324",
                temperature=0,
                messages=[
                    {
                        "role": "user",
                        "content": prompt,
                    },
                ],
            )
            context[path] = response.choices[0].message.content
        # JSON-ish separators first so splits tend to land on structure
        # boundaries ("}" / "]") before falling back to text separators.
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=split_nb_char,
            chunk_overlap=chunk_overlap,
            add_start_index=True,
            strip_whitespace=True,
            separators=["}", "]", "\n\n", "\n", ".", " ", ""],
        )
        chunks = text_splitter.split_documents(data)
        # Prepend the document-level summary so each chunk is self-contained.
        for chunk in chunks:
            chunk.page_content = (
                context[chunk.metadata["source"]]
                + "\n\nVoici un extrait du document:\n"
                + chunk.page_content
            )
        self.chunks = chunks
        print("Chunks with context created")
# def save_context_to_jsonl(self):
# with open(self.PATH_SAVE_CONTEXT, 'w') as jsonl_file:
# for doc in self.context:
# jsonl_file.write(doc.json() + '\n')
# print("Context saved")
# def load_context_from_jsonl(self):
# self.context = []
# with open(self.PATH_SAVE_CONTEXT, 'r') as jsonl_file:
# for line in jsonl_file:
# data = json.loads(line)
# obj = Document(**data)
# self.context.append(obj)
# print("Context loaded")