fhoc / src /utils /chunking.py
Francisco Zanartu
refactor: streamline chunk creation by removing total_chunks parameter and updating chunk handling
c48f556
"""Context-Enriched Chunking"""
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_core.prompts import PromptTemplate
def get_context_enriched_chunks(
    document_text,
    llm,
    chunk_size: int = 1000,
    chunk_overlap: int = 200,
):
    """Split a document into chunks and enrich each one with LLM summaries.

    The text is split with ``get_base_chunks``, a single document-level
    overview is produced with ``document_summary``, and every chunk is then
    passed together with that overview to ``create_enriched_document``.

    Args:
        document_text: Full text of the document to chunk.
        llm: Model object forwarded unchanged to the summarisation helpers.
        chunk_size: Chunk size forwarded to ``get_base_chunks``.
        chunk_overlap: Chunk overlap forwarded to ``get_base_chunks``.

    Returns:
        A list with one enriched document per base chunk, in split order.
        The list is empty when the splitter yields no chunks.
    """
    base_chunks = get_base_chunks(document_text, chunk_size, chunk_overlap)
    if not base_chunks:
        # Nothing to enrich: skip the document-level LLM call, whose result
        # would never be attached to any chunk.
        return []

    document_overview = document_summary(document_text, llm)
    total = len(base_chunks)  # loop-invariant, used only for progress output
    enriched_documents = []
    for i, chunk in enumerate(base_chunks):
        print(f"Processing chunk {i+1}/{total}")
        enriched_documents.append(
            create_enriched_document(document_overview, chunk, i, llm)
        )
    return enriched_documents
def create_enriched_document(
    document_overview,
    chunk,
    chunk_id,
    llm,
):
    """Wrap one chunk in a ``Document`` that carries summary metadata.

    Args:
        document_overview: Document-level summary stored in the metadata and
            passed to ``summarize_context`` as global context.
        chunk: Object exposing ``page_content`` and a ``metadata`` mapping.
        chunk_id: Identifier stored under the ``"chunk_id"`` metadata key.
        llm: Model object forwarded to ``summarize_context``.

    Returns:
        A ``Document`` whose ``page_content`` is the chunk text and whose
        metadata holds the chunk id, length, start index (0 when the chunk
        metadata has none), the chunk text, the document summary, and the
        chunk summary.
    """
    text = chunk.page_content
    start = chunk.metadata.get("start_index", 0)
    summary = summarize_context(document_overview, text, llm)
    return Document(
        page_content=text,
        metadata={
            "chunk_id": chunk_id,
            "chunk_length": len(text),
            "start_index": start,
            "chunk": text,
            "document_summary": document_overview,
            "chunk_summary": summary,
        },
    )
def document_summary(document_text, llm, max_chars: int = 10000):
    """Ask the model for a 2-3 sentence overview of the document.

    Args:
        document_text: Full document text; only its first ``max_chars``
            characters are placed in the prompt.
        llm: Object composed with the prompt via ``prompt | llm``.
        max_chars: Number of leading characters of ``document_text`` sent to
            the model. Defaults to 10000, the limit previously hard-coded.

    Returns:
        The ``content`` attribute of the chain's response when it has one
        (chat-model messages); otherwise the response itself (e.g. a plain
        string returned by a completion-style model).
    """
    prompt = PromptTemplate.from_template(
        "Summarize the main topic and purpose of this document in 2-3 sentences: "
        "\n\n"
        "Document:"
        "{document}"
        "\n\n"
        "Summary:"
    )
    chain = prompt | llm
    # Rough sizing: one long essay is ~2k words at ~5 characters per word, so
    # the default 10k-character prefix bounds cost and latency.
    response = chain.invoke({"document": document_text[:max_chars]})
    # Chat models return a message object; plain LLMs return the text directly.
    return getattr(response, "content", response)
def summarize_context(document_overview, context_text, llm):
    """Ask the model for a 1-2 sentence summary of one text passage.

    Args:
        document_overview: Document-level summary inserted into the prompt as
            the overall document context.
        context_text: The passage to summarise.
        llm: Object composed with the prompt via ``prompt | llm``.

    Returns:
        The ``content`` attribute of the chain's response when it has one
        (chat-model messages); otherwise the response itself (e.g. a plain
        string returned by a completion-style model).
    """
    prompt = PromptTemplate.from_template(
        "You are an expert document analyst tasked with creating concise, clear summaries of text passages. "
        "Your summaries help readers quickly grasp the core message of each section.\n\n"
        "Overall Document Context: {global_summary}\n\n"
        "Instructions:\n"
        "- Provide a brief 1-2 sentence summary that states the main claim or idea expressed in the text below\n"
        "- Begin with a concrete subject (e.g., a concept, actor, or phenomenon)\n"
        "- Avoid meta-phrases such as 'this snippet', 'this excerpt', 'the text', or similar references\n"
        "- Focus on what is being discussed, not that something is being discussed\n\n"
        "Text: {text}\n\n"
        "Summary:"
    )
    chain = prompt | llm
    response = chain.invoke(
        {"global_summary": document_overview, "text": context_text}
    )
    # Chat models return a message object; plain LLMs return the text directly.
    return getattr(response, "content", response)
def get_base_chunks(document_text, chunk_size, chunk_overlap):
    """Split raw text into base chunks with a recursive character splitter.

    Args:
        document_text: Text to split.
        chunk_size: Value passed as the splitter's ``chunk_size``.
        chunk_overlap: Value passed as the splitter's ``chunk_overlap``.

    Returns:
        The documents produced by the splitter's ``create_documents`` for
        this single text. The splitter is configured with
        ``add_start_index=True`` and ``strip_whitespace=False``.
    """
    # Paragraph → Line → Sentence → Word
    separator_hierarchy = ["\n\n", "\n", ".", " "]
    splitter = RecursiveCharacterTextSplitter(
        separators=separator_hierarchy,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        add_start_index=True,
        strip_whitespace=False,
    )
    return splitter.create_documents([document_text])