Spaces:

Harshdhsvguyt
/

policy_rag_assistant

Sleeping

Upload 19 files

754d8d3 verified about 2 months ago

1.59 kB

	from typing import List


	def chunk_text(text: str, chunk_size: int = 500, overlap: int = 100) -> List[str]:
	    """
	    Split text into overlapping chunks based on word count.

	    Words are whitespace-delimited tokens (``str.split()``). Text whose word
	    count does not exceed ``chunk_size`` is returned unchanged as a single
	    chunk, original whitespace included; otherwise each chunk is its words
	    re-joined with single spaces.

	    Args:
	        text: Input text to chunk
	        chunk_size: Number of words per chunk
	        overlap: Number of overlapping words between chunks

	    Returns:
	        List of text chunks

	    Raises:
	        ValueError: If the word count exceeds ``chunk_size`` but
	            ``chunk_size`` is not positive or ``overlap`` is not smaller
	            than ``chunk_size``.
	    """
	    words = text.split()
	    if len(words) <= chunk_size:
	        return [text]

	    # Each window starts `step` words after the previous one. A non-positive
	    # step would never advance through the text, so chunking could not
	    # terminate; fail loudly instead of looping forever.
	    step = chunk_size - overlap
	    if chunk_size <= 0 or step <= 0:
	        raise ValueError(
	            f"overlap ({overlap}) must be smaller than a positive "
	            f"chunk_size ({chunk_size})"
	        )

	    # Emit windows up to and including the first one that reaches the end of
	    # the word list; that final window may hold fewer than chunk_size words.
	    start_bound = len(words) - chunk_size + step
	    return [
	        " ".join(words[start:start + chunk_size])
	        for start in range(0, start_bound, step)
	    ]


	def chunk_documents(documents: List[dict], chunk_size: int = 500, overlap: int = 100) -> List[dict]:
	    """
	    Split every document into chunks, carrying its metadata onto each chunk.

	    Each input dict must have a 'text' key and may have a 'metadata' key
	    (treated as empty when absent). The text is split with ``chunk_text``
	    using the given ``chunk_size`` and ``overlap``.

	    Returns:
	        List of dicts with 'text' and 'metadata' keys; each metadata dict is
	        the document's metadata plus 'chunk_id' (position of the chunk within
	        its document) and 'total_chunks' (chunk count for that document).
	    """
	    results = []

	    for document in documents:
	        body = document["text"]
	        base_meta = document.get("metadata", {})
	        pieces = chunk_text(body, chunk_size, overlap)
	        piece_count = len(pieces)

	        # Build a fresh metadata dict per chunk so entries never share state.
	        results.extend(
	            {
	                "text": piece,
	                "metadata": {
	                    **base_meta,
	                    "chunk_id": index,
	                    "total_chunks": piece_count,
	                },
	            }
	            for index, piece in enumerate(pieces)
	        )

	    return results