Spaces:

Debito
/

mamba-encoder-swarm_app

Sleeping

mamba-encoder-swarm_app / core /preprocess.py

Upload 8 files

055a9c8 verified 4 months ago

1.9 kB

	# =============================================================================
	# core/preprocess.py
	# =============================================================================
	import re
	import unicodedata
	from typing import Any, Dict, List, Optional

	from config import MambaConfig

	class TextPreprocessor:
	    """Clean raw text and split it into size-bounded chunks.

	    The instance keeps the config object it was built with (``config``) and
	    the value of that config's ``max_seq_len`` attribute (``max_length``),
	    which ``chunk_text`` uses to derive its default chunk size.
	    """

	    # Compiled once when the class is created and reused by every call.
	    _WHITESPACE_RE = re.compile(r'\s+')
	    # Control characters that `\s` does NOT match. VT (\x0B), FF (\x0C) and
	    # \x1C-\x1F are deliberately left out: `\s` treats them as whitespace,
	    # so they are turned into a separating space instead of being deleted
	    # (deleting them would glue the neighbouring words together).
	    _CONTROL_CHARS_RE = re.compile(r'[\x00-\x08\x0E-\x1B\x7F]')

	    # The annotation is quoted so it is not evaluated when the class body
	    # runs; it still refers to the MambaConfig imported at module level.
	    def __init__(self, config: "MambaConfig") -> None:
	        """Store the configuration and cache its maximum sequence length.

	        Args:
	            config: Configuration object. Only its ``max_seq_len`` attribute
	                is read here; the object itself is kept as ``self.config``.
	        """
	        self.config = config
	        self.max_length = config.max_seq_len

	    def clean_text(self, text: str) -> str:
	        """Return ``text`` normalized and collapsed onto a single line.

	        Steps, in order:

	        1. Unicode NFKC normalization.
	        2. Removal of control characters that are not whitespace.
	        3. Every run of whitespace -- including newlines and tabs -- is
	           replaced by a single space.
	        4. Leading and trailing whitespace is stripped.

	        Args:
	            text: The raw input string.

	        Returns:
	            The cleaned string; empty if ``text`` held nothing but
	            whitespace and control characters.
	        """
	        text = unicodedata.normalize('NFKC', text)

	        # Strip control characters BEFORE collapsing whitespace. Doing it the
	        # other way round leaves a double space behind whenever a control
	        # character sat between two spaces ("a \x00 b" -> "a  b").
	        text = self._CONTROL_CHARS_RE.sub('', text)

	        text = self._WHITESPACE_RE.sub(' ', text)

	        return text.strip()

	    def chunk_text(self, text: str, chunk_size: Optional[int] = None) -> List[str]:
	        """Split ``text`` into chunks of whole words for distributed processing.

	        The text is split on whitespace and words are greedily packed into
	        chunks. Words inside a chunk are joined by single spaces, so the
	        original whitespace is not preserved.

	        Args:
	            text: The text to split.
	            chunk_size: Maximum chunk length in characters, counting the
	                joining spaces. Defaults to ``self.max_length // 2``.

	        Returns:
	            The chunks in input order. The list is empty when ``text``
	            contains no words. A single word longer than ``chunk_size`` is
	            never split; it is emitted as its own over-sized chunk.
	        """
	        if chunk_size is None:
	            # NOTE(review): max_length comes from config.max_seq_len, which is
	            # presumably a token count, while the limit below is applied to
	            # characters -- confirm this is the intended unit.
	            chunk_size = self.max_length // 2

	        chunks: List[str] = []
	        current_chunk: List[str] = []
	        current_length = 0  # exact len(' '.join(current_chunk))

	        for word in text.split():
	            if current_chunk:
	                # One extra character for the space that joins the word on.
	                projected_length = current_length + 1 + len(word)
	            else:
	                projected_length = len(word)

	            if current_chunk and projected_length > chunk_size:
	                chunks.append(' '.join(current_chunk))
	                current_chunk = [word]
	                current_length = len(word)
	            else:
	                # Also taken for the first word of a chunk, even if that word
	                # alone exceeds chunk_size (words are never split).
	                current_chunk.append(word)
	                current_length = projected_length

	        if current_chunk:
	            chunks.append(' '.join(current_chunk))

	        return chunks

	    def preprocess_batch(self, texts: List[str]) -> List[str]:
	        """Apply ``clean_text`` to every string in ``texts``.

	        Args:
	            texts: The strings to clean.

	        Returns:
	            A new list with the cleaned strings, in the same order.
	        """
	        return [self.clean_text(text) for text in texts]