Mexar / backend / utils / semantic_chunker.py
Devrajsinh bharatsinh gohil
Initial commit of MEXAR Ultimate - Phase 2 cleanup complete
b0b150b
"""
MEXAR - Semantic Chunking Module
Smart chunking that preserves semantic units for better retrieval.
"""
import re
from typing import List, Dict, Any
class SemanticChunker:
    """
    Paragraph-aware text chunker for retrieval pipelines.

    - Splits text on blank lines and packs consecutive paragraphs into chunks
      of at most ``target_tokens`` tokens.
    - Breaks a paragraph that alone exceeds the target at sentence boundaries
      (and, as a last resort, between words) so no chunk is unbounded.
    - Starts each new chunk with up to ``overlap_tokens`` tokens repeated from
      the end of the previous chunk for context continuity.

    Tokens are approximated by whitespace-delimited words
    (see ``_count_tokens``); this is a cheap proxy, not a model tokenizer.
    """

    # A blank line (possibly containing whitespace) separates paragraphs.
    _PARAGRAPH_BREAK = re.compile(r'\n\s*\n')
    # Whitespace that follows sentence-ending punctuation.
    _SENTENCE_BREAK = re.compile(r'(?<=[.!?])\s+')

    def __init__(self, target_tokens: int = 400, overlap_tokens: int = 50):
        """
        Initialize chunker.

        Args:
            target_tokens: Maximum tokens per chunk, counted as words.
                Values below 1 are treated as 1 when chunking.
            overlap_tokens: Maximum tokens repeated from the end of one chunk
                at the start of the next. Zero or less disables overlap.
        """
        self.target_tokens = target_tokens
        self.overlap_tokens = overlap_tokens

    def chunk_text(self, text: str, source: str) -> List[Dict[str, Any]]:
        """
        Split unstructured text into semantic chunks.

        Args:
            text: Raw text content
            source: Source file name

        Returns:
            List of chunk dictionaries with the keys ``content``, ``source``,
            ``token_count`` and ``chunk_index``. Empty or whitespace-only
            text yields an empty list.
        """
        if not text or not text.strip():
            return []

        limit = max(1, self.target_tokens)
        chunks: List[Dict[str, Any]] = []
        pending: List[tuple] = []  # (separator, text, tokens) of the open chunk
        pending_tokens = 0

        for piece in self._split_pieces(text, limit):
            piece_tokens = piece[2]

            if pending and pending_tokens + piece_tokens > limit:
                chunks.append(
                    self._make_chunk(pending, pending_tokens, source, len(chunks))
                )
                # The overlap may never push the next chunk past the limit,
                # so it is capped by the room left beside the incoming piece.
                budget = min(self.overlap_tokens, limit - piece_tokens)
                pending = self._overlap_tail(pending, budget)
                pending_tokens = sum(tokens for _, _, tokens in pending)

            pending.append(piece)
            pending_tokens += piece_tokens

        # Don't forget the last chunk
        if pending:
            chunks.append(
                self._make_chunk(pending, pending_tokens, source, len(chunks))
            )

        return chunks

    def chunk_structured_data(self, data: List[Dict], source: str) -> List[Dict[str, Any]]:
        """
        Convert structured data (CSV/JSON rows) into searchable chunks.
        Each row becomes a self-contained, readable chunk.

        Args:
            data: List of dictionaries (rows); ``None`` is treated as empty.
                Items that are not dictionaries are skipped.
            source: Source file name

        Returns:
            List of chunk dictionaries. ``chunk_index`` is the row's position
            in ``data``, so skipped rows leave gaps in the numbering.
        """
        if data is None:
            return []

        chunks = []
        for i, row in enumerate(data):
            if not isinstance(row, dict):
                continue

            # Format row as readable text with context
            content_parts = [f"Entry {i+1} from {source}:"]
            for key, value in row.items():
                # Skip missing and blank values so chunks carry no empty fields.
                if value is not None and str(value).strip():
                    # Clean up the key name for readability
                    clean_key = str(key).replace("_", " ").title()
                    content_parts.append(f" {clean_key}: {value}")

            content = "\n".join(content_parts)
            chunks.append({
                "content": content,
                "source": f"{source}, Entry {i+1}",
                "token_count": self._count_tokens(content),
                "chunk_index": i,
                "row_data": row  # Keep original data for reference
            })

        return chunks

    def _split_pieces(self, text: str, limit: int) -> List[tuple]:
        """
        Break text into packable ``(separator, text, tokens)`` pieces.

        A paragraph within ``limit`` stays whole. A larger paragraph is split
        into sentences, and a sentence that is still too long is cut into
        windows of ``limit`` words. ``separator`` is what joins the piece to
        the one before it: a blank line at a paragraph start, otherwise a
        single space.
        """
        pieces: List[tuple] = []
        for paragraph in self._split_paragraphs(text):
            paragraph_tokens = self._count_tokens(paragraph)
            if paragraph_tokens <= limit:
                pieces.append(("\n\n", paragraph, paragraph_tokens))
                continue

            separator = "\n\n"
            for sentence in self._SENTENCE_BREAK.split(paragraph):
                words = sentence.split()
                if not words:
                    continue
                if len(words) <= limit:
                    fragments = [sentence]
                else:
                    # No usable sentence boundary: fall back to word windows.
                    fragments = [
                        " ".join(words[start:start + limit])
                        for start in range(0, len(words), limit)
                    ]
                for fragment in fragments:
                    pieces.append(
                        (separator, fragment, self._count_tokens(fragment))
                    )
                    separator = " "
        return pieces

    def _overlap_tail(self, pieces: List[tuple], budget: int) -> List[tuple]:
        """
        Return the trailing pieces of a finished chunk that fit in ``budget``.

        Whole pieces are taken from the end while they fit. The first piece
        that does not fit contributes only its trailing sentences that still
        fit, and the walk stops there. Sentences are never cut.
        """
        if budget <= 0:
            return []

        tail: List[tuple] = []
        for separator, body, tokens in reversed(pieces):
            if tokens <= budget:
                tail.append((separator, body, tokens))
                budget -= tokens
                continue

            kept: List[str] = []
            for sentence in reversed(self._SENTENCE_BREAK.split(body)):
                sentence_tokens = self._count_tokens(sentence)
                if sentence_tokens > budget:
                    break
                kept.append(sentence)
                budget -= sentence_tokens
            if kept:
                salvaged = " ".join(reversed(kept))
                tail.append((separator, salvaged, self._count_tokens(salvaged)))
            break

        tail.reverse()
        return tail

    @staticmethod
    def _make_chunk(pieces: List[tuple], tokens: int, source: str, index: int) -> Dict[str, Any]:
        """Build one chunk dictionary from the pieces collected for it."""
        first, rest = pieces[0], pieces[1:]
        # The first piece's separator is dropped: nothing precedes it.
        content = first[1] + "".join(sep + body for sep, body, _ in rest)
        return {
            "content": content,
            "source": source,
            "token_count": tokens,
            "chunk_index": index
        }

    def _split_paragraphs(self, text: str) -> List[str]:
        """Split text on blank lines and return the non-empty, stripped paragraphs."""
        stripped = (part.strip() for part in self._PARAGRAPH_BREAK.split(text))
        return [part for part in stripped if part]

    def _count_tokens(self, text: str) -> int:
        """Approximate the token count as the number of whitespace-delimited words."""
        return len(text.split())
def create_semantic_chunker(target_tokens: int = 400, overlap_tokens: int = 50) -> SemanticChunker:
    """
    Factory function to create a SemanticChunker instance.

    Args:
        target_tokens: Target tokens per chunk, forwarded to the chunker.
        overlap_tokens: Overlap between consecutive chunks, forwarded to the
            chunker. Defaults to the chunker's own default of 50.

    Returns:
        A new SemanticChunker configured with the given values.
    """
    return SemanticChunker(
        target_tokens=target_tokens,
        overlap_tokens=overlap_tokens,
    )