Spaces:
Sleeping
Sleeping
| import uuid | |
| import re | |
| import logging | |
| from typing import Dict, Any, List, Optional | |
| from dataclasses import dataclass | |
| import tiktoken | |
| logger = logging.getLogger(__name__) | |
@dataclass
class Chunk:
    """One chunk of a source document, with optional embedding vector.

    Declared as a dataclass so instances can be built with positional or
    keyword arguments and get ``__repr__``/``__eq__`` for free.
    """

    # Identifier of the document this chunk was cut from.
    doc_id: str
    # Identifier of this chunk.
    chunk_id: str
    # Text content of the chunk.
    content: str
    # Free-form metadata attached to the chunk.
    metadata: Dict[str, Any]
    # Embedding vector; stays None until one is assigned.
    embeddings: Optional[List[float]] = None
class TextProcessor:
    """Text processing utilities: token counting, PII masking and cleanup."""

    # Patterns are compiled once at class-definition time and shared by all
    # instances instead of being re-looked-up on every call.
    # The TLD class is [A-Za-z]; a '|' inside a character class is a literal
    # pipe, not alternation, so it must not appear there.
    _EMAIL_RE = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')
    _PHONE_RE = re.compile(r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b')
    _CREDIT_CARD_RE = re.compile(r'\b\d{4}[- ]?\d{4}[- ]?\d{4}[- ]?\d{4}\b')
    _SSN_RE = re.compile(r'\b\d{3}-\d{2}-\d{4}\b')
    # Anything that is not a word character, whitespace or basic punctuation.
    _DISALLOWED_RE = re.compile(r'[^\w\s.,!?;:()\-]')
    _WHITESPACE_RUN_RE = re.compile(r'\s+')
    _INLINE_SPACE_RUN_RE = re.compile(r'[ \t]+')
    _EXTRA_NEWLINES_RE = re.compile(r'\n{3,}')

    def __init__(self) -> None:
        """Load the ``cl100k_base`` tiktoken encoding used for token counts."""
        self.encoding = tiktoken.get_encoding("cl100k_base")

    def count_tokens(self, text: str) -> int:
        """Return the number of tokens ``text`` encodes to.

        Special-token strings that happen to appear in the text (for example
        ``<|endoftext|>``) are encoded as ordinary text. With tiktoken's
        default ``disallowed_special="all"`` such input raises ``ValueError``,
        which is not wanted when counting tokens of arbitrary documents.
        """
        return len(self.encoding.encode(text, disallowed_special=()))

    def mask_pii(self, text: str) -> str:
        """Replace e-mail addresses, phone numbers, card numbers and SSNs.

        Each match is replaced by a placeholder: ``[EMAIL]``, ``[PHONE]``,
        ``[CREDIT_CARD]`` or ``[SSN]``. Substitutions run in that order.
        """
        substitutions = (
            (self._EMAIL_RE, '[EMAIL]'),
            (self._PHONE_RE, '[PHONE]'),
            (self._CREDIT_CARD_RE, '[CREDIT_CARD]'),
            (self._SSN_RE, '[SSN]'),
        )
        for pattern, placeholder in substitutions:
            text = pattern.sub(placeholder, text)
        return text

    def clean_text(self, text: str) -> str:
        """Return ``text`` on a single line with only basic punctuation kept.

        Disallowed characters are removed *before* whitespace is collapsed;
        doing it the other way round leaves a double space wherever a removed
        character sat between two spaces (``"a @ b"``).
        """
        text = self._DISALLOWED_RE.sub('', text)
        text = self._WHITESPACE_RUN_RE.sub(' ', text)
        return text.strip()

    def clean_text_preserve_newlines(self, text: str) -> str:
        """Normalize text but preserve paragraph breaks for chunking.

        - Normalize CRLF / CR line endings to LF
        - Remove characters other than word characters, whitespace and
          basic punctuation
        - Collapse runs of spaces/tabs within lines to one space
        - Trim whitespace at both ends of each line
        - Collapse 3+ newlines to 2 newlines (keep blank lines as separators)

        Characters are removed first so that the whitespace and blank-line
        rules apply to the final text: a line made only of removed characters
        becomes a blank line and is collapsed like any other.
        """
        # Normalize line endings
        text = text.replace('\r\n', '\n').replace('\r', '\n')
        # Drop disallowed characters before any whitespace normalization
        text = self._DISALLOWED_RE.sub('', text)
        # Collapse multiple spaces/tabs within lines
        text = self._INLINE_SPACE_RUN_RE.sub(' ', text)
        # Trim whitespace on each line
        text = '\n'.join(line.strip() for line in text.split('\n'))
        # Collapse 3+ newlines to 2 newlines
        text = self._EXTRA_NEWLINES_RE.sub('\n\n', text)
        return text.strip()
def generate_id() -> str:
    """Return a freshly generated random UUID (version 4) as a string."""
    new_uuid = uuid.uuid4()
    return str(new_uuid)