Spaces:
Sleeping
Sleeping
| # file: chunking.py | |
| import uuid | |
| from typing import List, Tuple, Dict, Any | |
| from langchain_core.documents import Document | |
| from langchain.storage import InMemoryStore | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
# --- Configuration for Parent-Child Splitting ---
# Parent chunks are the larger documents passed to the LLM for context.
# Sizes are in characters (RecursiveCharacterTextSplitter's default length_function is len).
PARENT_CHUNK_SIZE = 2000
PARENT_CHUNK_OVERLAP = 200
# Child chunks are the smaller, more granular documents used for retrieval.
# They must be smaller than the parent chunks so several children map to one parent.
CHILD_CHUNK_SIZE = 400
CHILD_CHUNK_OVERLAP = 100
def create_parent_child_chunks(
    full_text: str,
    *,
    parent_chunk_size: int = PARENT_CHUNK_SIZE,
    parent_chunk_overlap: int = PARENT_CHUNK_OVERLAP,
    child_chunk_size: int = CHILD_CHUNK_SIZE,
    child_chunk_overlap: int = CHILD_CHUNK_OVERLAP,
) -> Tuple[List[Document], InMemoryStore, Dict[str, str]]:
    """
    Implements the Parent Document strategy for chunking.

    1. Splits the document into larger "parent" chunks.
    2. Splits the parent chunks into smaller "child" chunks.
    3. The child chunks are used for retrieval, while the parent chunks
       are used to provide context to the LLM.

    Each child's metadata carries both its own ``child_id`` and its
    ``parent_id`` so a retriever can map a hit back to its parent document.

    Args:
        full_text: The entire text content of the document.
        parent_chunk_size: Character size of the large "parent" chunks
            (defaults to the module-level PARENT_CHUNK_SIZE).
        parent_chunk_overlap: Character overlap between parent chunks.
        child_chunk_size: Character size of the small "child" chunks.
        child_chunk_overlap: Character overlap between child chunks.

    Returns:
        A tuple containing:
        - A list of the small "child" documents for the vector store.
        - An in-memory store mapping parent document IDs to the parent documents.
        - A dictionary mapping child document IDs to their parent's ID.
    """
    # Empty input: return empty-but-valid structures instead of raising,
    # so callers can handle blank documents uniformly.
    if not full_text:
        print("Warning: Input text for chunking is empty.")
        return [], InMemoryStore(), {}

    print("Creating parent and child chunks...")

    # This splitter creates the large documents that will be stored.
    parent_splitter = RecursiveCharacterTextSplitter(
        chunk_size=parent_chunk_size,
        chunk_overlap=parent_chunk_overlap,
    )
    # This splitter creates the small, granular chunks for retrieval.
    child_splitter = RecursiveCharacterTextSplitter(
        chunk_size=child_chunk_size,
        chunk_overlap=child_chunk_overlap,
    )

    parent_documents = parent_splitter.create_documents([full_text])

    # Generate a unique ID per parent and store the parents under those IDs.
    docstore = InMemoryStore()
    parent_ids = [str(uuid.uuid4()) for _ in parent_documents]
    docstore.mset(list(zip(parent_ids, parent_documents)))

    child_documents: List[Document] = []
    child_to_parent_id_map: Dict[str, str] = {}

    # Split each parent document into smaller child documents, tagging each
    # child with its parent's ID so retrieval hits can be resolved upward.
    for parent_id, p_doc in zip(parent_ids, parent_documents):
        children = child_splitter.split_documents([p_doc])
        for child in children:
            child_id = str(uuid.uuid4())
            child.metadata["parent_id"] = parent_id
            child.metadata["child_id"] = child_id
            child_to_parent_id_map[child_id] = parent_id
        child_documents.extend(children)

    print(f"Created {len(parent_documents)} parent chunks and {len(child_documents)} child chunks.")
    return child_documents, docstore, child_to_parent_id_map