# Uploaded by Abdullahcoder54 (commit 36425a4)
import markdown
import re
from typing import List, Dict, Any
def semantic_chunking(
    markdown_content: str,
    min_tokens: int = 100,
    max_tokens: int = 512,
    overlap_tokens: int = 50,
) -> List[Dict[str, Any]]:
    """
    Split markdown content into semantic chunks, respecting heading boundaries.

    A new chunk starts at every markdown heading (`#`-prefixed paragraph) and
    whenever adding a paragraph would push the running token count past
    ``max_tokens``. Tokens are approximated by whitespace-separated words.

    Args:
        markdown_content: The full markdown content of a document.
        min_tokens: A trailing chunk smaller than this is merged back into the
            previous chunk when both share the same heading, instead of
            standing alone as a fragment.
        max_tokens: Approximate maximum token (word) count per chunk.
        overlap_tokens: Number of trailing tokens carried over into the next
            chunk when a chunk is closed for size. No overlap is carried
            across heading boundaries (they are semantic breaks).

    Returns:
        A list of dictionaries, one per chunk, each with 'content' and
        'metadata' ({'source', 'heading'}).
    """
    chunks: List[Dict[str, Any]] = []
    current_parts: List[str] = []       # paragraphs of the chunk being built
    current_tokens = 0                  # approximate token count of current_parts
    current_heading = "Introduction"    # heading in effect for current_parts

    def _flush(heading: str, *, with_overlap: bool) -> None:
        """Emit the pending chunk (if non-empty) and reset the buffer."""
        nonlocal current_parts, current_tokens
        content = "\n\n".join(current_parts).strip()
        current_parts = []
        current_tokens = 0
        if not content:
            return
        chunks.append({
            "content": content,
            "metadata": {"source": "unknown", "heading": heading},
        })
        if with_overlap and overlap_tokens > 0:
            # Seed the next chunk with the tail of this one for continuity.
            tail = content.split()[-overlap_tokens:]
            current_parts = [" ".join(tail)]
            current_tokens = len(tail)

    for paragraph in markdown_content.split("\n\n"):
        stripped = paragraph.strip()
        if not stripped:
            continue
        if stripped.startswith("#"):
            # Heading boundary: close the pending chunk under the *old*
            # heading so metadata matches its content (the original code
            # relabeled it with the new heading), then begin a new chunk
            # that keeps the heading line as its first paragraph.
            _flush(current_heading, with_overlap=False)
            current_heading = stripped.lstrip("# ").strip()
            current_parts.append(stripped)
            current_tokens += len(stripped.split())
            continue
        paragraph_tokens = len(stripped.split())
        if current_tokens + paragraph_tokens > max_tokens:
            _flush(current_heading, with_overlap=True)
        current_parts.append(stripped)
        current_tokens += paragraph_tokens

    _flush(current_heading, with_overlap=False)

    # Fold an undersized trailing fragment into the previous chunk, but only
    # when both chunks belong to the same heading (don't blur sections).
    if (
        len(chunks) >= 2
        and len(chunks[-1]["content"].split()) < min_tokens
        and chunks[-1]["metadata"]["heading"] == chunks[-2]["metadata"]["heading"]
    ):
        fragment = chunks.pop()
        chunks[-1]["content"] += "\n\n" + fragment["content"]
    return chunks