Spaces:
Runtime error
Runtime error
| import markdown | |
| import re | |
| from typing import List, Dict, Any | |
def semantic_chunking(markdown_content: str, min_tokens: int = 100, max_tokens: int = 512, overlap_tokens: int = 50) -> List[Dict[str, Any]]:
    """
    Split markdown content into semantic chunks, respecting heading boundaries.

    Paragraphs (separated by blank lines) are accumulated into a chunk until
    adding the next paragraph would exceed ``max_tokens`` (token count is
    approximated by whitespace word count). Each emitted chunk is tagged with
    the heading that was in effect when the chunk *started*, so content is
    never mislabeled with a heading that appears after it. The last
    ``overlap_tokens`` words of a flushed chunk are carried into the next
    chunk for context continuity, and a trailing chunk smaller than
    ``min_tokens`` is merged into the previous chunk rather than emitted alone.

    Args:
        markdown_content: The full markdown content of a document.
        min_tokens: Minimum token count for a chunk; an undersized final
            chunk is merged into its predecessor (best-effort — the merged
            chunk may then exceed ``max_tokens``).
        max_tokens: Maximum token count for a chunk (approximate).
        overlap_tokens: Number of trailing tokens repeated at the start of
            the following chunk. 0 disables overlap.

    Returns:
        A list of dictionaries, each with 'content' and 'metadata'
        (keys: 'source', 'heading').
    """
    chunks: List[Dict[str, Any]] = []
    paragraphs = markdown_content.split('\n\n')
    current_chunk_content = ""
    current_heading = "Introduction"  # Default heading before any '#' line is seen
    chunk_heading = current_heading   # Heading in effect when the current chunk started

    for paragraph in paragraphs:
        stripped = paragraph.strip()
        if stripped.startswith('#'):  # Detect a new heading
            current_heading = stripped.lstrip('# ').strip()

        # Simple token count approximation (word count).
        paragraph_tokens = len(paragraph.split())

        if current_chunk_content and len(current_chunk_content.split()) + paragraph_tokens > max_tokens:
            # Flush under the heading the chunk was started with — NOT
            # current_heading, which may already point at the new section.
            chunks.append({
                "content": current_chunk_content.strip(),
                "metadata": {"source": "unknown", "heading": chunk_heading}
            })
            if overlap_tokens > 0:
                # Carry the tail of the flushed chunk into the new one.
                tail = current_chunk_content.split()[-overlap_tokens:]
                current_chunk_content = " ".join(tail) + "\n\n" + paragraph
            else:
                current_chunk_content = paragraph
            chunk_heading = current_heading
        elif current_chunk_content:
            current_chunk_content += "\n\n" + paragraph
        else:
            # First paragraph of a fresh chunk: no leading separator,
            # and pin the heading the chunk starts under.
            current_chunk_content = paragraph
            chunk_heading = current_heading

    if current_chunk_content.strip():
        if chunks and len(current_chunk_content.split()) < min_tokens:
            # Undersized tail: fold into the previous chunk instead of
            # emitting a fragment below the minimum size.
            chunks[-1]["content"] += "\n\n" + current_chunk_content.strip()
        else:
            chunks.append({
                "content": current_chunk_content.strip(),
                "metadata": {"source": "unknown", "heading": chunk_heading}
            })
    return chunks