Francisco Zanartu
refactor: streamline chunk creation by removing total_chunks parameter and updating chunk handling
c48f556 | """Context-Enriched Chunking""" | |
| from langchain_text_splitters import RecursiveCharacterTextSplitter | |
| from langchain_core.documents import Document | |
| from langchain_core.prompts import PromptTemplate | |
def get_context_enriched_chunks(
    document_text,
    llm,
    chunk_size: int = 1000,
    chunk_overlap: int = 200,
):
    """Split *document_text* into chunks and enrich each with LLM-generated context.

    A whole-document overview is produced once, then every base chunk is
    wrapped in a ``Document`` whose metadata carries that overview plus a
    per-chunk summary (see ``create_enriched_document``).

    Args:
        document_text: Raw text of the source document.
        llm: LangChain-compatible chat model used for all summaries.
        chunk_size: Target character length of each base chunk.
        chunk_overlap: Character overlap between consecutive chunks.

    Returns:
        List of enriched ``Document`` objects, one per base chunk.
    """
    chunks = get_base_chunks(document_text, chunk_size, chunk_overlap)
    overview = document_summary(document_text, llm)

    enriched = []
    total = len(chunks)
    for index, piece in enumerate(chunks):
        # Progress feedback: each chunk costs one LLM round-trip, so this
        # loop can be slow on large documents.
        print(f"Processing chunk {index+1}/{total}")
        enriched.append(create_enriched_document(overview, piece, index, llm))
    return enriched
def create_enriched_document(
    document_overview,
    chunk,
    chunk_id,
    llm,
):
    """Wrap one base chunk in a ``Document`` carrying summary-enriched metadata.

    Args:
        document_overview: Whole-document summary shared by every chunk.
        chunk: Base chunk ``Document`` produced by the splitter.
        chunk_id: Zero-based position of the chunk in the document.
        llm: Model used to summarize this chunk's content.

    Returns:
        New ``Document`` with the same page content and enriched metadata.
    """
    text = chunk.page_content
    # Per-chunk summary conditioned on the global overview.
    summary = summarize_context(document_overview, text, llm)
    meta = {
        "chunk_id": chunk_id,
        "chunk_length": len(text),
        # start_index is set by the splitter when add_start_index=True;
        # fall back to 0 if absent.
        "start_index": chunk.metadata.get("start_index", 0),
        "chunk": text,
        "document_summary": document_overview,
        "chunk_summary": summary,
    }
    return Document(page_content=text, metadata=meta)
def document_summary(document_text, llm):
    """Return a 2-3 sentence LLM overview of *document_text*.

    Only a prefix of the document is sent to the model to bound cost and
    latency; the summary quality assumes the opening captures the topic.
    """
    template = (
        "Summarize the main topic and purpose of this document in 2-3 sentences: "
        "\n\n"
        "Document:"
        "{document}"
        "\n\n"
        "Summary:"
    )
    # ~1 word ≈ 5 characters, so 10k chars ≈ 2k words (roughly one long
    # essay) — enough context while keeping the call cheap and fast.
    prefix = document_text[:10000]
    chain = PromptTemplate.from_template(template) | llm
    return chain.invoke({"document": prefix}).content
def summarize_context(document_overview, context_text, llm):
    """Return a 1-2 sentence LLM summary of *context_text*.

    The prompt supplies *document_overview* as global context so the chunk
    summary stays consistent with the document's overall topic.
    """
    instructions = (
        "You are an expert document analyst tasked with creating concise, clear summaries of text passages. "
        "Your summaries help readers quickly grasp the core message of each section.\n\n"
        "Overall Document Context: {global_summary}\n\n"
        "Instructions:\n"
        "- Provide a brief 1-2 sentence summary that states the main claim or idea expressed in the text below\n"
        "- Begin with a concrete subject (e.g., a concept, actor, or phenomenon)\n"
        "- Avoid meta-phrases such as 'this snippet', 'this excerpt', 'the text', or similar references\n"
        "- Focus on what is being discussed, not that something is being discussed\n\n"
        "Text: {text}\n\n"
        "Summary:"
    )
    pipeline = PromptTemplate.from_template(instructions) | llm
    reply = pipeline.invoke(
        {"global_summary": document_overview, "text": context_text}
    )
    return reply.content
def get_base_chunks(document_text, chunk_size, chunk_overlap):
    """Split raw text into overlapping base chunks.

    Separators are tried coarsest-first (paragraph → line → sentence →
    word) so chunks break at the most natural boundary available.

    Returns:
        List of ``Document`` chunks; each carries a ``start_index`` in its
        metadata giving its character offset in the original text.
    """
    splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", ".", " "],  # paragraph → line → sentence → word
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        add_start_index=True,     # record each chunk's offset in metadata
        strip_whitespace=False,   # preserve original spacing exactly
    )
    return splitter.create_documents([document_text])