File size: 2,023 Bytes
36425a4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import markdown
import re
from typing import List, Dict, Any

def semantic_chunking(markdown_content: str, min_tokens: int = 100, max_tokens: int = 512, overlap_tokens: int = 50) -> List[Dict[str, Any]]:
    """
    Split markdown content into paragraph-based chunks tagged with a heading.

    The text is split on blank lines into paragraphs, which are packed
    greedily into chunks of at most ``max_tokens`` tokens. Tokens are
    approximated by whitespace-separated words. Each chunk is tagged with
    the most recent heading seen up to and including its last paragraph;
    "Introduction" is used until the first heading appears.

    NOTE: this is still a simplified placeholder implementation:
      * ``min_tokens`` and ``overlap_tokens`` are accepted for interface
        compatibility but are not applied yet.
      * Chunks are not forced to break at headings; a heading only updates
        the label used for the chunk that contains it (and later chunks).
      * A single paragraph longer than ``max_tokens`` is emitted as its own
        oversized chunk rather than being split.

    Args:
        markdown_content: The full markdown content of a document.
        min_tokens: Minimum token count for a chunk (currently unused).
        max_tokens: Maximum token count for a chunk (word-count approximation).
        overlap_tokens: Number of tokens to overlap between chunks
            (currently unused).

    Returns:
        A list of dictionaries, where each dictionary represents a chunk
        and contains 'content' (the chunk text) and 'metadata' (a dict with
        'source', always "unknown" here, and 'heading').
    """
    chunks: List[Dict[str, Any]] = []
    pending: List[str] = []   # paragraphs of the chunk currently being built
    pending_tokens = 0        # running word count of ``pending``
    current_heading = "Introduction"  # Default heading

    def flush() -> None:
        # Emit the pending paragraphs as one chunk, skipping whitespace-only
        # content (e.g. the empty state before the first paragraph).
        content = "\n\n".join(pending).strip()
        if content:
            chunks.append({
                "content": content,
                "metadata": {"source": "unknown", "heading": current_heading}
            })

    for paragraph in markdown_content.split('\n\n'):
        # Simple token count approximation (word count)
        paragraph_tokens = len(paragraph.split())

        if pending_tokens + paragraph_tokens > max_tokens:
            # Close the current chunk BEFORE looking at whether this paragraph
            # is a heading, so the closed chunk keeps its own section label.
            flush()
            pending = [paragraph]
            pending_tokens = paragraph_tokens
        else:
            pending.append(paragraph)
            pending_tokens += paragraph_tokens

        stripped = paragraph.strip()
        if stripped.startswith('#'):  # Detect a new heading
            # Only the first line is the heading; any following lines in the
            # same paragraph are body text and must not leak into the label.
            first_line = stripped.split('\n', 1)[0]
            current_heading = first_line.lstrip('# ').strip()

    flush()
    return chunks