"""Context-Enriched Chunking""" from langchain_text_splitters import RecursiveCharacterTextSplitter from langchain_core.documents import Document from langchain_core.prompts import PromptTemplate def get_context_enriched_chunks( document_text, llm, chunk_size: int = 1000, chunk_overlap: int = 200, ): base_chunks = get_base_chunks(document_text, chunk_size, chunk_overlap) document_overview = document_summary(document_text, llm) enriched_documents = [] for i, chunk in enumerate(base_chunks): print(f"Processing chunk {i+1}/{len(base_chunks)}") doc = create_enriched_document( document_overview, chunk, i, llm, ) enriched_documents.append(doc) return enriched_documents def create_enriched_document( document_overview, chunk, chunk_id, llm, ): metadata = { "chunk_id": chunk_id, "chunk_length": len(chunk.page_content), "start_index": chunk.metadata.get("start_index", 0), "chunk": chunk.page_content, "document_summary": document_overview, } chunk_summary = summarize_context(document_overview, chunk.page_content, llm) metadata["chunk_summary"] = chunk_summary return Document(page_content=chunk.page_content, metadata=metadata) def document_summary(document_text, llm): prompt = PromptTemplate.from_template( "Summarize the main topic and purpose of this document in 2-3 sentences: " "\n\n" "Document:" "{document}" "\n\n" "Summary:" ) chain = prompt | llm # roughly 1 long essay = 2k words; 1 word ~ 5 characters # limit to first 10k characters for cost/speed processing. response = chain.invoke({"document": document_text[:10000]}) return response.content def summarize_context(document_overview, context_text, llm): prompt = PromptTemplate.from_template( "You are an expert document analyst tasked with creating concise, clear summaries of text passages. " "Your summaries help readers quickly grasp the core message of each section.\n\n" "Overall Document Context: {global_summary}\n\n" "Instructions:\n" "- Provide a brief 1-2 sentence summary that states the main claim or idea expressed in the text below\n" "- Begin with a concrete subject (e.g., a concept, actor, or phenomenon)\n" "- Avoid meta-phrases such as 'this snippet', 'this excerpt', 'the text', or similar references\n" "- Focus on what is being discussed, not that something is being discussed\n\n" "Text: {text}\n\n" "Summary:" ) chain = prompt | llm response = chain.invoke({"global_summary": document_overview, "text": context_text}) return response.content def get_base_chunks(document_text, chunk_size, chunk_overlap): text_splitter = RecursiveCharacterTextSplitter( separators=["\n\n", "\n", ".", " "], # Paragraph → Line → Sentence → Word chunk_size=chunk_size, chunk_overlap=chunk_overlap, add_start_index=True, strip_whitespace=False, ) base_chunks = text_splitter.create_documents([document_text]) return base_chunks