File size: 4,819 Bytes
b0b150b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
"""
MEXAR - Semantic Chunking Module
Smart chunking that preserves semantic units for better retrieval.
"""
import re
from typing import List, Dict, Any


class SemanticChunker:
    """
    Intelligent text chunking that preserves semantic meaning.
    - Respects paragraph boundaries
    - Groups sentences to target token count
    - Maintains overlap for context continuity
    """

    def __init__(self, target_tokens: int = 400, overlap_tokens: int = 50):
        """
        Initialize chunker.

        Args:
            target_tokens: Target tokens per chunk. Tokens are approximated
                as whitespace-separated words (see ``_count_tokens``).
            overlap_tokens: Nominal overlap between consecutive chunks.
                NOTE(review): currently stored but not consulted — the actual
                overlap is always the last paragraph of the previous chunk,
                regardless of this value. Confirm intended semantics.
        """
        self.target_tokens = target_tokens
        self.overlap_tokens = overlap_tokens

    def chunk_text(self, text: str, source: str) -> List[Dict[str, Any]]:
        """
        Split unstructured text into semantic chunks.

        Paragraphs are accumulated until adding the next one would exceed
        ``target_tokens``; each flushed chunk's last paragraph is carried
        into the next chunk for context continuity.

        Args:
            text: Raw text content.
            source: Source file name (stored on each chunk).

        Returns:
            List of chunk dicts with keys ``content``, ``source``,
            ``token_count`` and ``chunk_index``. Empty list for blank input.
        """
        if not text or not text.strip():
            return []

        paragraphs = self._split_paragraphs(text)
        chunks: List[Dict[str, Any]] = []
        current_chunk: List[str] = []
        current_tokens = 0

        for para in paragraphs:
            para_tokens = self._count_tokens(para)

            # If adding this paragraph would exceed the target and we already
            # have content, flush the accumulated paragraphs as one chunk.
            # (Note: the local is not named `chunk_text` to avoid shadowing
            # this method's name.)
            if current_tokens + para_tokens > self.target_tokens and current_chunk:
                chunks.append({
                    "content": "\n\n".join(current_chunk),
                    "source": source,
                    "token_count": current_tokens,
                    "chunk_index": len(chunks)
                })

                # Overlap: keep the last paragraph for context continuity.
                # current_chunk is guaranteed non-empty here (checked by the
                # guard above), so no empty-list fallback is needed.
                last_para = current_chunk[-1]
                current_chunk = [last_para]
                current_tokens = self._count_tokens(last_para)

            current_chunk.append(para)
            current_tokens += para_tokens

        # Flush the final partial chunk.
        if current_chunk:
            chunks.append({
                "content": "\n\n".join(current_chunk),
                "source": source,
                "token_count": current_tokens,
                "chunk_index": len(chunks)
            })

        return chunks

    def chunk_structured_data(self, data: List[Dict], source: str) -> List[Dict[str, Any]]:
        """
        Convert structured data (CSV/JSON rows) into searchable chunks.
        Each row becomes a self-contained, readable chunk.

        Non-dict entries are skipped, but still consume their positional
        index, so ``chunk_index`` / "Entry N" numbering reflects the row's
        position in the input, not the output.

        Args:
            data: List of dictionaries (rows).
            source: Source file name.

        Returns:
            List of chunk dictionaries (includes the original row under
            ``row_data`` for reference).
        """
        chunks = []

        for i, row in enumerate(data):
            if not isinstance(row, dict):
                continue

            # Format row as readable text with context.
            content_parts = [f"Entry {i+1} from {source}:"]

            for key, value in row.items():
                # Skip None and whitespace-only values.
                if value is not None and str(value).strip():
                    # Clean up the key name for readability
                    # (e.g. "first_name" -> "First Name").
                    clean_key = str(key).replace("_", " ").title()
                    content_parts.append(f"  {clean_key}: {value}")

            content = "\n".join(content_parts)

            chunks.append({
                "content": content,
                "source": f"{source}, Entry {i+1}",
                "token_count": self._count_tokens(content),
                "chunk_index": i,
                "row_data": row  # Keep original data for reference
            })

        return chunks

    def _split_paragraphs(self, text: str) -> List[str]:
        """Split text into stripped, non-empty paragraphs."""
        # Split on blank lines (two newlines, possibly with whitespace between).
        paragraphs = re.split(r'\n\s*\n', text)

        # Clean and filter empty paragraphs.
        cleaned = []
        for p in paragraphs:
            p = p.strip()
            if p:
                cleaned.append(p)

        return cleaned

    def _count_tokens(self, text: str) -> int:
        """Approximate token count as the number of whitespace-separated words."""
        return len(text.split())


def create_semantic_chunker(target_tokens: int = 400,
                            overlap_tokens: int = 50) -> SemanticChunker:
    """
    Factory function to create a SemanticChunker instance.

    Args:
        target_tokens: Target tokens per chunk (passed through).
        overlap_tokens: Overlap between consecutive chunks (passed through;
            previously not exposed by this factory — default matches the
            SemanticChunker constructor, so existing callers are unaffected).

    Returns:
        A configured SemanticChunker.
    """
    return SemanticChunker(target_tokens=target_tokens,
                           overlap_tokens=overlap_tokens)