import markdown
import re
from typing import List, Dict, Any
def semantic_chunking(markdown_content: str, min_tokens: int = 100, max_tokens: int = 512, overlap_tokens: int = 50) -> List[Dict[str, Any]]:
    """
    Split markdown content into semantic chunks, respecting heading boundaries.

    Tokens are approximated by whitespace-separated word count; paragraphs are
    delimited by blank lines (``"\\n\\n"``). A new heading always starts a new
    chunk, and the pending text is emitted under the heading it was written
    under (previously it was mislabeled with the *next* heading).

    Args:
        markdown_content: The full markdown content of a document.
        min_tokens: Advisory minimum chunk size. Chunks smaller than this may
            still be produced at heading boundaries and at end of input; it is
            not otherwise enforced by this word-count approximation.
        max_tokens: Approximate maximum token (word) count per chunk.
        overlap_tokens: Number of trailing tokens carried into the next chunk
            when a chunk is split for exceeding ``max_tokens``. No overlap is
            carried across heading boundaries (they are semantic breaks).

    Returns:
        A list of dictionaries, where each dictionary represents a chunk
        and contains 'content' and 'metadata' (e.g., 'source', 'heading').
    """
    chunks: List[Dict[str, Any]] = []
    current_chunk_content = ""
    current_heading = "Introduction"  # Used until the first heading is seen.

    def _flush(carry_overlap: bool) -> str:
        """Emit the pending chunk (if non-empty) under the current heading.

        Returns the seed text for the next chunk: the last ``overlap_tokens``
        words when ``carry_overlap`` is True, otherwise an empty string.
        """
        content = current_chunk_content.strip()
        if content:
            chunks.append({
                "content": content,
                "metadata": {"source": "unknown", "heading": current_heading},
            })
            if carry_overlap and overlap_tokens > 0:
                return " ".join(content.split()[-overlap_tokens:])
        return ""

    for paragraph in markdown_content.split('\n\n'):
        stripped = paragraph.strip()
        if stripped.startswith('#'):  # A heading starts a new semantic chunk.
            # Bug fix: flush pending text under the OLD heading before
            # switching, so chunks are attributed to the correct section.
            _flush(carry_overlap=False)
            current_heading = stripped.lstrip('# ').strip()
            current_chunk_content = paragraph  # Keep the heading line in its chunk.
            continue
        # Simple token count approximation (word count).
        paragraph_tokens = len(paragraph.split())
        if current_chunk_content and len(current_chunk_content.split()) + paragraph_tokens > max_tokens:
            # Size overflow: emit the chunk and seed the next one with overlap.
            seed = _flush(carry_overlap=True)
            current_chunk_content = seed + "\n\n" + paragraph if seed else paragraph
        elif current_chunk_content:
            current_chunk_content += "\n\n" + paragraph
        else:
            current_chunk_content = paragraph
    _flush(carry_overlap=False)  # Emit whatever remains at end of input.
    return chunks