| """ | |
| MEXAR - Semantic Chunking Module | |
| Smart chunking that preserves semantic units for better retrieval. | |
| """ | |
| import re | |
| from typing import List, Dict, Any | |
class SemanticChunker:
    """
    Intelligent text chunking that preserves semantic meaning.

    - Respects paragraph boundaries
    - Groups sentences to a target token count
    - Maintains overlap for context continuity
    """

    def __init__(self, target_tokens: int = 400, overlap_tokens: int = 50):
        """
        Initialize chunker.

        Args:
            target_tokens: Target tokens per chunk (approx. 4 chars/token)
            overlap_tokens: Overlap carried between consecutive chunks
        """
        self.target_tokens = target_tokens
        self.overlap_tokens = overlap_tokens
    def chunk_text(self, text: str, source: str) -> List[Dict[str, Any]]:
        """
        Split unstructured text into semantic chunks.

        Args:
            text: Raw text content
            source: Source file name

        Returns:
            List of chunk dictionaries
        """
        if not text or not text.strip():
            return []

        paragraphs = self._split_paragraphs(text)
        chunks = []
        current_chunk = []
        current_tokens = 0

        for para in paragraphs:
            para_tokens = self._count_tokens(para)

            # If adding this paragraph would exceed the target and we already
            # have content, flush the current chunk.
            if current_tokens + para_tokens > self.target_tokens and current_chunk:
                chunks.append({
                    "content": "\n\n".join(current_chunk),
                    "source": source,
                    "token_count": current_tokens,
                    "chunk_index": len(chunks),
                })

                # Overlap: carry the last paragraph into the next chunk for
                # context continuity, provided it fits the overlap budget.
                last_para = current_chunk[-1]
                last_tokens = self._count_tokens(last_para)
                if last_tokens <= self.overlap_tokens:
                    current_chunk = [last_para]
                    current_tokens = last_tokens
                else:
                    current_chunk = []
                    current_tokens = 0

            current_chunk.append(para)
            current_tokens += para_tokens

        # Don't forget the last chunk
        if current_chunk:
            chunks.append({
                "content": "\n\n".join(current_chunk),
                "source": source,
                "token_count": current_tokens,
                "chunk_index": len(chunks),
            })

        return chunks
    def chunk_structured_data(self, data: List[Dict], source: str) -> List[Dict[str, Any]]:
        """
        Convert structured data (CSV/JSON rows) into searchable chunks.
        Each row becomes a self-contained, readable chunk.

        Args:
            data: List of dictionaries (rows)
            source: Source file name

        Returns:
            List of chunk dictionaries
        """
        chunks = []
        for i, row in enumerate(data):
            if not isinstance(row, dict):
                continue

            # Format the row as readable text with context
            content_parts = [f"Entry {i + 1} from {source}:"]
            for key, value in row.items():
                if value is not None and str(value).strip():
                    # Clean up the key name for readability
                    clean_key = str(key).replace("_", " ").title()
                    content_parts.append(f"  {clean_key}: {value}")

            content = "\n".join(content_parts)
            chunks.append({
                "content": content,
                "source": f"{source}, Entry {i + 1}",
                "token_count": self._count_tokens(content),
                "chunk_index": i,
                "row_data": row,  # Keep original data for reference
            })

        return chunks
    def _split_paragraphs(self, text: str) -> List[str]:
        """Split text into paragraphs."""
        # Split on blank lines (double or multiple newlines)
        paragraphs = re.split(r'\n\s*\n', text)
        # Strip whitespace and drop empty paragraphs
        return [p.strip() for p in paragraphs if p.strip()]
    def _count_tokens(self, text: str) -> int:
        """Approximate token count (roughly 4 chars per token)."""
        return max(1, len(text) // 4)
def create_semantic_chunker(target_tokens: int = 400, overlap_tokens: int = 50) -> SemanticChunker:
    """Factory function to create a SemanticChunker instance."""
    return SemanticChunker(target_tokens=target_tokens, overlap_tokens=overlap_tokens)
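
# Minimal usage sketch (illustrative only): exercises both chunking paths
# with hypothetical sample data. A deliberately small target_tokens is used
# here purely to force multiple chunks and show the paragraph overlap.
if __name__ == "__main__":
    chunker = create_semantic_chunker(target_tokens=20)

    sample_text = (
        "First paragraph about the system architecture.\n\n"
        "Second paragraph describing the retrieval pipeline.\n\n"
        "Third paragraph with notes on evaluation."
    )
    for chunk in chunker.chunk_text(sample_text, source="notes.txt"):
        print(chunk["chunk_index"], chunk["token_count"], repr(chunk["content"]))

    rows = [
        {"product_name": "Widget", "unit_price": 9.99},
        {"product_name": "Gadget", "unit_price": 19.99},
    ]
    for chunk in chunker.chunk_structured_data(rows, source="catalog.csv"):
        print(chunk["source"])
        print(chunk["content"])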