# Uploaded by Abdullahcoder54 (commit 36425a4)
import markdown
import re
from typing import List, Dict, Any
def semantic_chunking(
    markdown_content: str,
    min_tokens: int = 100,
    max_tokens: int = 512,
    overlap_tokens: int = 50,
) -> List[Dict[str, Any]]:
    """
    Split markdown content into semantic chunks, respecting heading boundaries.

    A new chunk starts at every markdown heading (`#`-prefixed paragraph) and
    whenever adding a paragraph would push the running token count past
    ``max_tokens``. Tokens are approximated by whitespace-separated words.

    Args:
        markdown_content: The full markdown content of a document.
        min_tokens: A trailing chunk smaller than this is merged back into the
            previous chunk when both share the same heading, instead of
            standing alone as a fragment.
        max_tokens: Approximate maximum token (word) count per chunk.
        overlap_tokens: Number of trailing tokens carried over into the next
            chunk when a chunk is closed for size. No overlap is carried
            across heading boundaries (they are semantic breaks).

    Returns:
        A list of dictionaries, one per chunk, each with 'content' and
        'metadata' ({'source', 'heading'}).
    """
    chunks: List[Dict[str, Any]] = []
    current_parts: List[str] = []       # paragraphs of the chunk being built
    current_tokens = 0                  # approximate token count of current_parts
    current_heading = "Introduction"    # heading in effect for current_parts

    def _flush(heading: str, *, with_overlap: bool) -> None:
        """Emit the pending chunk (if non-empty) and reset the buffer."""
        nonlocal current_parts, current_tokens
        content = "\n\n".join(current_parts).strip()
        current_parts = []
        current_tokens = 0
        if not content:
            return
        chunks.append({
            "content": content,
            "metadata": {"source": "unknown", "heading": heading},
        })
        if with_overlap and overlap_tokens > 0:
            # Seed the next chunk with the tail of this one for continuity.
            tail = content.split()[-overlap_tokens:]
            current_parts = [" ".join(tail)]
            current_tokens = len(tail)

    for paragraph in markdown_content.split("\n\n"):
        stripped = paragraph.strip()
        if not stripped:
            continue
        if stripped.startswith("#"):
            # Heading boundary: close the pending chunk under the *old*
            # heading so metadata matches its content (the original code
            # relabeled it with the new heading), then begin a new chunk
            # that keeps the heading line as its first paragraph.
            _flush(current_heading, with_overlap=False)
            current_heading = stripped.lstrip("# ").strip()
            current_parts.append(stripped)
            current_tokens += len(stripped.split())
            continue
        paragraph_tokens = len(stripped.split())
        if current_tokens + paragraph_tokens > max_tokens:
            _flush(current_heading, with_overlap=True)
        current_parts.append(stripped)
        current_tokens += paragraph_tokens

    _flush(current_heading, with_overlap=False)

    # Fold an undersized trailing fragment into the previous chunk, but only
    # when both chunks belong to the same heading (don't blur sections).
    if (
        len(chunks) >= 2
        and len(chunks[-1]["content"].split()) < min_tokens
        and chunks[-1]["metadata"]["heading"] == chunks[-2]["metadata"]["heading"]
    ):
        fragment = chunks.pop()
        chunks[-1]["content"] += "\n\n" + fragment["content"]
    return chunks