Spaces:

fzanartu
/

fhoc

Runtime error

fhoc / src /utils /chunking.py

Francisco Zanartu

refactor: streamline chunk creation by removing total_chunks parameter and updating chunk handling

c48f556 2 months ago

3.26 kB

	"""Context-Enriched Chunking"""

	from langchain_text_splitters import RecursiveCharacterTextSplitter
	from langchain_core.documents import Document
	from langchain_core.prompts import PromptTemplate


	def get_context_enriched_chunks(
	    document_text,
	    llm,
	    chunk_size: int = 1000,
	    chunk_overlap: int = 200,
	):
	    """Split a document into chunks and enrich each one with LLM context.

	    The text is split with ``get_base_chunks``, a single document-level
	    overview is produced with ``document_summary``, and every chunk is then
	    passed to ``create_enriched_document`` together with that overview and
	    its zero-based position.

	    Args:
	        document_text: Full text of the document to process.
	        llm: Model object forwarded to ``document_summary`` and
	            ``create_enriched_document``.
	        chunk_size: Chunk size forwarded to ``get_base_chunks``.
	        chunk_overlap: Chunk overlap forwarded to ``get_base_chunks``.

	    Returns:
	        A list with one enriched document per base chunk, in split order.
	        The list is empty when splitting yields no chunks.
	    """
	    base_chunks = get_base_chunks(document_text, chunk_size, chunk_overlap)
	    if not base_chunks:
	        # Nothing to enrich: skip the document-level summary so no LLM call
	        # is spent on a result that would be thrown away.
	        return []

	    # One overview for the whole document, shared by every chunk.
	    document_overview = document_summary(document_text, llm)

	    total = len(base_chunks)
	    enriched_documents = []
	    for i, chunk in enumerate(base_chunks):
	        # Progress goes to stdout; each chunk triggers its own LLM call.
	        print(f"Processing chunk {i+1}/{total}")
	        enriched_documents.append(
	            create_enriched_document(document_overview, chunk, i, llm)
	        )

	    return enriched_documents


	def create_enriched_document(
	    document_overview,
	    chunk,
	    chunk_id,
	    llm,
	):
	    """Wrap one chunk in a ``Document`` whose metadata carries its context.

	    Args:
	        document_overview: Document-level summary stored in the metadata and
	            passed to ``summarize_context``.
	        chunk: Object exposing ``page_content`` and a ``metadata`` mapping.
	        chunk_id: Identifier stored under the ``"chunk_id"`` key.
	        llm: Model object forwarded to ``summarize_context``.

	    Returns:
	        A ``Document`` with the chunk's original text as ``page_content``
	        and metadata holding the id, length, start index, raw text,
	        document summary and chunk summary.
	    """
	    text = chunk.page_content
	    length = len(text)
	    # Fall back to 0 when the chunk metadata has no recorded start index.
	    offset = chunk.metadata.get("start_index", 0)

	    summary = summarize_context(document_overview, text, llm)

	    return Document(
	        page_content=text,
	        metadata={
	            "chunk_id": chunk_id,
	            "chunk_length": length,
	            "start_index": offset,
	            "chunk": text,
	            "document_summary": document_overview,
	            "chunk_summary": summary,
	        },
	    )


	def document_summary(document_text, llm, max_chars: int = 10000):
	    """Ask the LLM for a 2-3 sentence summary of the document.

	    Args:
	        document_text: Full text of the document.
	        llm: Model object composed with the prompt via ``|``; the value it
	            returns from ``invoke`` must expose a ``content`` attribute.
	        max_chars: Number of leading characters of ``document_text`` sent to
	            the model. Defaults to 10000.

	    Returns:
	        The ``content`` attribute of the model's response.
	    """
	    prompt = PromptTemplate.from_template(
	        "Summarize the main topic and purpose of this document in 2-3 sentences: "
	        "\n\n"
	        "Document:"
	        "{document}"
	        "\n\n"
	        "Summary:"
	    )
	    # Pipe the formatted prompt into the model.
	    chain = prompt | llm
	    # roughly 1 long essay = 2k words; 1 word ~ 5 characters
	    # the default cap of 10k characters keeps cost and latency bounded.
	    response = chain.invoke({"document": document_text[:max_chars]})
	    return response.content


	def summarize_context(document_overview, context_text, llm):
	    """Ask the LLM for a 1-2 sentence summary of one passage.

	    The prompt includes the document-level overview so the model can place
	    the passage in context.

	    Args:
	        document_overview: Document-level summary, inserted into the prompt
	            as ``global_summary``.
	        context_text: Passage to summarize, inserted into the prompt as
	            ``text``.
	        llm: Model object composed with the prompt via ``|``; the value it
	            returns from ``invoke`` must expose a ``content`` attribute.

	    Returns:
	        The ``content`` attribute of the model's response.
	    """
	    prompt = PromptTemplate.from_template(
	        "You are an expert document analyst tasked with creating concise, clear summaries of text passages. "
	        "Your summaries help readers quickly grasp the core message of each section.\n\n"
	        "Overall Document Context: {global_summary}\n\n"
	        "Instructions:\n"
	        "- Provide a brief 1-2 sentence summary that states the main claim or idea expressed in the text below\n"
	        "- Begin with a concrete subject (e.g., a concept, actor, or phenomenon)\n"
	        "- Avoid meta-phrases such as 'this snippet', 'this excerpt', 'the text', or similar references\n"
	        "- Focus on what is being discussed, not that something is being discussed\n\n"
	        "Text: {text}\n\n"
	        "Summary:"
	    )
	    # Pipe the formatted prompt into the model.
	    chain = prompt | llm
	    response = chain.invoke(
	        {"global_summary": document_overview, "text": context_text}
	    )
	    return response.content


	def get_base_chunks(document_text, chunk_size, chunk_overlap):
	    """Split the document text into base chunks with a recursive splitter.

	    Args:
	        document_text: Full text of the document to split.
	        chunk_size: Passed to the splitter as ``chunk_size``.
	        chunk_overlap: Passed to the splitter as ``chunk_overlap``.

	    Returns:
	        The documents produced by the splitter's ``create_documents`` for
	        the single input text.
	    """
	    # Paragraph → Line → Sentence → Word
	    separator_priority = ["\n\n", "\n", ".", " "]

	    splitter = RecursiveCharacterTextSplitter(
	        separators=separator_priority,
	        chunk_size=chunk_size,
	        chunk_overlap=chunk_overlap,
	        # Ask the splitter to record a start index for each chunk.
	        add_start_index=True,
	        strip_whitespace=False,
	    )

	    return splitter.create_documents([document_text])