Spaces:

Agents-MCP-Hackathon
/

pdf_explainer

Sleeping

App Files Files Community

pdf_explainer / src /processors /text_chunker.py

spagestic

feat: Update audio processing to support parallel chunking and enhance text chunking logic

91e586d 7 months ago

raw

history blame contribute delete

7.01 kB

	"""Text chunking utility for breaking down large text into smaller chunks for audio processing."""

	import re
	from typing import List


	class TextChunker:
	"""Handles intelligent text chunking for audio processing."""

	def __init__(self, max_chunk_size: int = 800, overlap_sentences: int = 0):
	"""
	Initialize the text chunker.

	Args:
	max_chunk_size: Maximum number of characters per chunk
	overlap_sentences: Number of sentences to overlap between chunks for continuity
	"""
	self.max_chunk_size = max_chunk_size
	self.overlap_sentences = overlap_sentences

	def chunk_text(self, text: str) -> List[str]:
	"""
	Break text into smaller chunks based on paragraphs and sentence boundaries.

	Args:
	text: The input text to chunk

	Returns:
	List of text chunks
	"""
	if not text or not text.strip():
	return []

	# Clean the text
	text = text.strip()

	# If text is within the limit, return as single chunk
	if len(text) <= self.max_chunk_size:
	return [text]

	chunks = []

	# First, try to split by paragraphs
	paragraphs = self._split_into_paragraphs(text)

	current_chunk = ""

	for paragraph in paragraphs:
	# If adding this paragraph would exceed the limit
	if len(current_chunk) + len(paragraph) + 1 > self.max_chunk_size:
	# If we have content in current chunk, save it
	if current_chunk.strip():
	chunks.append(current_chunk.strip())
	current_chunk = ""

	# If the paragraph itself is too long, split it by sentences
	if len(paragraph) > self.max_chunk_size:
	sentence_chunks = self._split_paragraph_into_sentences(paragraph)
	for sentence_chunk in sentence_chunks:
	if len(current_chunk) + len(sentence_chunk) + 1 > self.max_chunk_size:
	if current_chunk.strip():
	chunks.append(current_chunk.strip())
	current_chunk = sentence_chunk
	else:
	if current_chunk:
	current_chunk += " " + sentence_chunk
	else:
	current_chunk = sentence_chunk
	else:
	current_chunk = paragraph
	else:
	# Add paragraph to current chunk
	if current_chunk:
	current_chunk += "\n\n" + paragraph
	else:
	current_chunk = paragraph

	# Add any remaining content
	if current_chunk.strip():
	chunks.append(current_chunk.strip())

	# Apply overlap if specified
	if self.overlap_sentences > 0 and len(chunks) > 1:
	chunks = self._add_overlap(chunks)

	return chunks
	def _split_into_paragraphs(self, text: str) -> List[str]:
	"""Split text into paragraphs."""
	# Split by double newlines or multiple spaces
	paragraphs = re.split(r'\n\s\n\|(?:\n\s){2,}', text)
	# Filter out empty paragraphs and strip whitespace
	return [p.strip() for p in paragraphs if p.strip()]

	def _split_paragraph_into_sentences(self, paragraph: str) -> List[str]:
	"""Split a long paragraph into sentence-based chunks."""
	# Split by sentence boundaries
	sentences = re.split(r'(?<=[.!?])\s+', paragraph)

	chunks = []
	current_chunk = ""

	for sentence in sentences:
	# If a single sentence is longer than max_chunk_size, we need to force-split it
	if len(sentence) > self.max_chunk_size:
	# Save current chunk if it has content
	if current_chunk.strip():
	chunks.append(current_chunk.strip())
	current_chunk = ""

	# Force-split the long sentence into smaller pieces
	while len(sentence) > self.max_chunk_size:
	# Find a good breaking point (prefer spaces)
	break_point = self.max_chunk_size
	if ' ' in sentence[:self.max_chunk_size]:
	# Find the last space within the limit
	break_point = sentence[:self.max_chunk_size].rfind(' ')

	chunk_part = sentence[:break_point]
	chunks.append(chunk_part)
	sentence = sentence[break_point:].strip()

	# Add the remaining part of the sentence
	if sentence:
	current_chunk = sentence

	elif len(current_chunk) + len(sentence) + 1 > self.max_chunk_size:
	if current_chunk.strip():
	chunks.append(current_chunk.strip())
	current_chunk = sentence
	else:
	if current_chunk:
	current_chunk += " " + sentence
	else:
	current_chunk = sentence

	if current_chunk.strip():
	chunks.append(current_chunk.strip())

	return chunks

	def _add_overlap(self, chunks: List[str]) -> List[str]:
	"""Add sentence overlap between chunks for better continuity."""
	if len(chunks) <= 1:
	return chunks

	overlapped_chunks = [chunks[0]] # First chunk stays the same

	for i in range(1, len(chunks)):
	# Get last few sentences from previous chunk
	prev_chunk = chunks[i - 1]
	current_chunk = chunks[i]

	prev_sentences = re.split(r'(?<=[.!?])\s+', prev_chunk)
	overlap_text = " ".join(prev_sentences[-self.overlap_sentences:]) if len(prev_sentences) > self.overlap_sentences else ""

	if overlap_text:
	overlapped_chunk = overlap_text + " " + current_chunk
	else:
	overlapped_chunk = current_chunk

	overlapped_chunks.append(overlapped_chunk)

	return overlapped_chunks

	def get_chunk_info(self, chunks: List[str]) -> dict:
	"""Get information about the chunks."""
	return {
	"total_chunks": len(chunks),
	"total_characters": sum(len(chunk) for chunk in chunks),
	"avg_chunk_size": sum(len(chunk) for chunk in chunks) / len(chunks) if chunks else 0,
	"max_chunk_size": max(len(chunk) for chunk in chunks) if chunks else 0,
	"min_chunk_size": min(len(chunk) for chunk in chunks) if chunks else 0
	}