""" MEXAR - Semantic Chunking Module Smart chunking that preserves semantic units for better retrieval. """ import re from typing import List, Dict, Any class SemanticChunker: """ Intelligent text chunking that preserves semantic meaning. - Respects paragraph boundaries - Groups sentences to target token count - Maintains overlap for context continuity """ def __init__(self, target_tokens: int = 400, overlap_tokens: int = 50): """ Initialize chunker. Args: target_tokens: Target tokens per chunk (approx 4 chars/token) overlap_tokens: Overlap between consecutive chunks """ self.target_tokens = target_tokens self.overlap_tokens = overlap_tokens def chunk_text(self, text: str, source: str) -> List[Dict[str, Any]]: """ Split unstructured text into semantic chunks. Args: text: Raw text content source: Source file name Returns: List of chunk dictionaries """ if not text or not text.strip(): return [] paragraphs = self._split_paragraphs(text) chunks = [] current_chunk = [] current_tokens = 0 for para in paragraphs: para_tokens = self._count_tokens(para) # If adding this paragraph exceeds target and we have content, save chunk if current_tokens + para_tokens > self.target_tokens and current_chunk: chunk_text = "\n\n".join(current_chunk) chunks.append({ "content": chunk_text, "source": source, "token_count": current_tokens, "chunk_index": len(chunks) }) # Overlap: keep last paragraph for context continuity if current_chunk: last_para = current_chunk[-1] current_chunk = [last_para] current_tokens = self._count_tokens(last_para) else: current_chunk = [] current_tokens = 0 current_chunk.append(para) current_tokens += para_tokens # Don't forget the last chunk if current_chunk: chunks.append({ "content": "\n\n".join(current_chunk), "source": source, "token_count": current_tokens, "chunk_index": len(chunks) }) return chunks def chunk_structured_data(self, data: List[Dict], source: str) -> List[Dict[str, Any]]: """ Convert structured data (CSV/JSON rows) into searchable chunks. Each row becomes a self-contained, readable chunk. Args: data: List of dictionaries (rows) source: Source file name Returns: List of chunk dictionaries """ chunks = [] for i, row in enumerate(data): if not isinstance(row, dict): continue # Format row as readable text with context content_parts = [f"Entry {i+1} from {source}:"] for key, value in row.items(): if value is not None and str(value).strip(): # Clean up the key name for readability clean_key = str(key).replace("_", " ").title() content_parts.append(f" {clean_key}: {value}") content = "\n".join(content_parts) chunks.append({ "content": content, "source": f"{source}, Entry {i+1}", "token_count": self._count_tokens(content), "chunk_index": i, "row_data": row # Keep original data for reference }) return chunks def _split_paragraphs(self, text: str) -> List[str]: """Split text into paragraphs.""" # Split on double newlines or multiple newlines paragraphs = re.split(r'\n\s*\n', text) # Clean and filter empty paragraphs cleaned = [] for p in paragraphs: p = p.strip() if p: cleaned.append(p) return cleaned def _count_tokens(self, text: str) -> int: """Approximate token count (roughly 4 chars per token).""" return len(text.split()) def create_semantic_chunker(target_tokens: int = 400) -> SemanticChunker: """Factory function to create a SemanticChunker instance.""" return SemanticChunker(target_tokens=target_tokens)