soft.engineer
init project
e71fabd
import uuid
import re
import logging
from typing import Dict, Any, List, Optional
from dataclasses import dataclass
import tiktoken
logger = logging.getLogger(__name__)
@dataclass
class Chunk:
    """A single piece of a document together with its identifiers.

    All fields except ``embeddings`` are required and positional, in the
    order declared below. ``embeddings`` defaults to ``None``.
    """

    # Identifier of the document this chunk belongs to (presumably the
    # parent/source document -- confirm against the code that builds chunks).
    doc_id: str
    # Identifier of this chunk itself.
    chunk_id: str
    # Text payload of the chunk.
    content: str
    # Free-form key/value metadata attached to the chunk.
    metadata: Dict[str, Any]
    # Optional list of floats; None until a value is assigned.
    embeddings: Optional[List[float]] = None
class TextProcessor:
    """Text processing utilities: token counting, PII masking and cleanup.

    The regular expressions used by the methods are compiled once as class
    attributes so repeated calls do not look them up again.
    """

    # (pattern, replacement) pairs applied by mask_pii() in this order.
    _PII_PATTERNS = (
        # Email addresses. The TLD class is [A-Za-z]: inside a character
        # class "|" is a literal pipe, not alternation, so it must not be
        # listed or the match would swallow "|" and whatever follows it.
        (re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'), '[EMAIL]'),
        # Phone numbers: 3-3-4 digits, optionally separated by "-" or ".".
        (re.compile(r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b'), '[PHONE]'),
        # Credit card numbers: 4x4 digits, optionally separated by "-" or " ".
        (re.compile(r'\b\d{4}[- ]?\d{4}[- ]?\d{4}[- ]?\d{4}\b'), '[CREDIT_CARD]'),
        # SSN: 3-2-4 digits separated by "-".
        (re.compile(r'\b\d{3}-\d{2}-\d{4}\b'), '[SSN]'),
    )

    # Anything that is not a word character, whitespace or basic punctuation.
    _DISALLOWED_RE = re.compile(r'[^\w\s.,!?;:()\-]')
    _ANY_WHITESPACE_RE = re.compile(r'\s+')
    _INLINE_SPACE_RE = re.compile(r'[ \t]+')
    _EXCESS_NEWLINES_RE = re.compile(r'\n{3,}')

    def __init__(self):
        self.encoding = tiktoken.get_encoding("cl100k_base")

    def count_tokens(self, text: str) -> int:
        """Return the number of tokens ``self.encoding`` produces for *text*."""
        return len(self.encoding.encode(text))

    def mask_pii(self, text: str) -> str:
        """Mask personally identifiable information in *text*.

        Emails, phone numbers, credit card numbers and SSNs matching the
        class-level patterns are replaced with ``[EMAIL]``, ``[PHONE]``,
        ``[CREDIT_CARD]`` and ``[SSN]`` respectively.

        Returns:
            The text with every match replaced by its placeholder.
        """
        for pattern, placeholder in self._PII_PATTERNS:
            text = pattern.sub(placeholder, text)
        return text

    def clean_text(self, text: str) -> str:
        """Clean and normalize *text* onto a single line.

        Characters other than word characters, whitespace and the
        punctuation ``.,!?;:()-`` are removed, every run of whitespace
        (including newlines) becomes one space, and the result is stripped.

        Returns:
            The cleaned, single-spaced text.
        """
        # Drop disallowed characters first: removing them can leave adjacent
        # spaces behind ("a @ b" -> "a  b"), which the collapse below fixes.
        text = self._DISALLOWED_RE.sub('', text)
        text = self._ANY_WHITESPACE_RE.sub(' ', text)
        return text.strip()

    def clean_text_preserve_newlines(self, text: str) -> str:
        """Normalize *text* but preserve paragraph breaks for chunking.

        - Normalize CRLF / CR line endings to LF
        - Remove characters other than word characters, whitespace and
          the punctuation ``.,!?;:()-``
        - Collapse runs of spaces/tabs within lines to a single space
        - Trim whitespace on each line
        - Collapse 3+ newlines to 2 newlines (keep blank lines as separators)

        Returns:
            The normalized text with leading/trailing whitespace stripped.
        """
        # Normalize line endings
        text = text.replace('\r\n', '\n').replace('\r', '\n')
        # Remove disallowed characters before any whitespace normalization,
        # so that lines or gaps emptied by the removal are still trimmed and
        # collapsed by the steps below.
        text = self._DISALLOWED_RE.sub('', text)
        # Collapse multiple spaces within lines
        text = self._INLINE_SPACE_RE.sub(' ', text)
        # Trim spaces on each line
        text = '\n'.join(line.strip() for line in text.split('\n'))
        # Collapse 3+ newlines to 2 newlines
        text = self._EXCESS_NEWLINES_RE.sub('\n\n', text)
        return text.strip()
def generate_id() -> str:
    """Return a fresh random UUID (version 4) in its string form."""
    new_uuid = uuid.uuid4()
    return str(new_uuid)