Spaces:

softblackhole
/

rag-evaluation-system

Sleeping

soft.engineer

init project

e71fabd 4 months ago

2.49 kB

	import uuid
	import re
	import logging
	from typing import Dict, Any, List, Optional
	from dataclasses import dataclass
	import tiktoken

	logger = logging.getLogger(__name__)

	@dataclass
	class Chunk:
	    """One chunk of a document together with its identifiers and metadata.

	    Fields, in positional order:
	        doc_id: identifier string for the document.
	        chunk_id: identifier string for this chunk.
	        content: text of the chunk.
	        metadata: dictionary of string keys to arbitrary values.
	        embeddings: optional list of floats; defaults to ``None``.
	    """

	    # Required fields (no defaults) must come before the optional one.
	    doc_id: str
	    chunk_id: str
	    content: str
	    metadata: Dict[str, Any]
	    # Optional field: omitted embeddings are represented by None.
	    embeddings: Optional[List[float]] = None

	class TextProcessor:
	    """Text processing utilities: token counting, PII masking and cleanup."""

	    # Patterns are compiled once at class creation instead of on every call.
	    # The TLD class is [A-Za-z]; a literal '|' must not be accepted there.
	    _EMAIL_RE = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')
	    _PHONE_RE = re.compile(r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b')
	    _CREDIT_CARD_RE = re.compile(r'\b\d{4}[- ]?\d{4}[- ]?\d{4}[- ]?\d{4}\b')
	    _SSN_RE = re.compile(r'\b\d{3}-\d{2}-\d{4}\b')
	    # Anything that is not a word character, whitespace or basic punctuation.
	    _DISALLOWED_RE = re.compile(r'[^\w\s.,!?;:()\-]')
	    _WHITESPACE_RUN_RE = re.compile(r'\s+')
	    _INLINE_SPACE_RUN_RE = re.compile(r'[ \t]+')
	    _BLANK_LINE_RUN_RE = re.compile(r'\n{3,}')

	    def __init__(self):
	        self.encoding = tiktoken.get_encoding("cl100k_base")

	    def count_tokens(self, text: str) -> int:
	        """Return the number of tokens ``self.encoding`` produces for ``text``."""
	        # tiktoken's default (disallowed_special="all") raises ValueError when
	        # the text contains a special-token string such as "<|endoftext|>".
	        # For counting arbitrary document text, encode those as ordinary text.
	        return len(self.encoding.encode(text, disallowed_special=()))

	    def mask_pii(self, text: str) -> str:
	        """Replace e-mail addresses, phone numbers, card numbers and SSNs.

	        Matches are replaced with the placeholders ``[EMAIL]``, ``[PHONE]``,
	        ``[CREDIT_CARD]`` and ``[SSN]``, applied in that order.

	        Args:
	            text: Input text.

	        Returns:
	            The text with every match substituted by its placeholder.
	        """
	        text = self._EMAIL_RE.sub('[EMAIL]', text)
	        text = self._PHONE_RE.sub('[PHONE]', text)
	        text = self._CREDIT_CARD_RE.sub('[CREDIT_CARD]', text)
	        text = self._SSN_RE.sub('[SSN]', text)
	        return text

	    def clean_text(self, text: str) -> str:
	        """Clean and normalize text onto a single line.

	        Removes every character that is not a word character, whitespace or
	        one of ``.,!?;:()-``, collapses each whitespace run (newlines
	        included) to one space, and strips both ends.

	        Args:
	            text: Input text.

	        Returns:
	            The cleaned text.
	        """
	        # Strip disallowed characters first: doing it after the whitespace
	        # collapse would leave double spaces behind ("a @ b" -> "a  b").
	        text = self._DISALLOWED_RE.sub('', text)
	        text = self._WHITESPACE_RUN_RE.sub(' ', text)
	        return text.strip()

	    def clean_text_preserve_newlines(self, text: str) -> str:
	        r"""Normalize text but preserve paragraph breaks for chunking.

	        - Normalize Windows/old-Mac newlines to ``\n``
	        - Remove disallowed characters (keep basic punctuation)
	        - Collapse multiple spaces/tabs within lines
	        - Trim whitespace on each line
	        - Collapse 3+ newlines -> 2 newlines (keep blank lines as separators)

	        Args:
	            text: Input text.

	        Returns:
	            The normalized text with leading/trailing whitespace stripped.
	        """
	        # Normalize line endings
	        text = text.replace('\r\n', '\n').replace('\r', '\n')
	        # Remove disallowed characters before any whitespace normalization so
	        # that lines emptied by the removal are still trimmed and collapsed.
	        text = self._DISALLOWED_RE.sub('', text)
	        # Collapse multiple spaces within lines
	        text = self._INLINE_SPACE_RUN_RE.sub(' ', text)
	        # Trim spaces on each line
	        text = '\n'.join(line.strip() for line in text.split('\n'))
	        # Collapse 3+ newlines to 2 newlines
	        text = self._BLANK_LINE_RUN_RE.sub('\n\n', text)
	        return text.strip()

	def generate_id() -> str:
	    """Return a newly generated random UUID (``uuid.uuid4``) as a string."""
	    fresh_uuid = uuid.uuid4()
	    return str(fresh_uuid)