Spaces:
Configuration error
Configuration error
| import re | |
| from typing import List, Tuple | |
| import logging | |
| logger = logging.getLogger(__name__) | |
class TextPreprocessor:
    """
    A utility class for preprocessing text before embedding.

    Includes cleaning, normalization, and chunking methods. Every method is
    stateless and declared as a ``@staticmethod``, so it can be called either
    on the class (``TextPreprocessor.clean_text(s)``) or on an instance.
    """

    @staticmethod
    def clean_text(text: str) -> str:
        """Clean text by removing special characters and extra whitespace.

        Args:
            text: The raw input text.

        Returns:
            The text with every character other than word characters,
            whitespace and ``. , ! ? ; : - ( )`` replaced by a space, all
            whitespace runs (including newlines) collapsed to a single space,
            and leading/trailing whitespace removed.
        """
        # Replace special characters, keeping only alphanumeric and basic
        # punctuation. Whitespace is preserved here and normalized below, so
        # a separate whitespace pass before this step is not needed.
        text = re.sub(r'[^\w\s\.\,\!\?\;\:\-\(\)]', ' ', text)
        # Collapse whitespace runs (original ones and the spaces just added).
        return re.sub(r'\s+', ' ', text).strip()

    @staticmethod
    def split_by_sentences(text: str) -> List[str]:
        """Split text into sentences.

        The text is split on runs of ``.``, ``!`` and ``?``; the punctuation
        itself is dropped. This is a purely lexical split, so periods inside
        abbreviations or decimal numbers also act as boundaries.

        Args:
            text: The input text.

        Returns:
            The non-empty sentence fragments, stripped of surrounding
            whitespace.
        """
        fragments = (part.strip() for part in re.split(r'[.!?]+', text))
        return [fragment for fragment in fragments if fragment]

    @staticmethod
    def split_by_paragraphs(text: str) -> List[str]:
        """Split text into paragraphs.

        Args:
            text: The input text, with paragraphs separated by a blank line
                (two consecutive newline characters).

        Returns:
            The non-empty paragraphs, stripped of surrounding whitespace.
        """
        paragraphs = (part.strip() for part in text.split('\n\n'))
        return [paragraph for paragraph in paragraphs if paragraph]

    @staticmethod
    def chunk_text(text: str, chunk_size: int = 512, overlap: int = 50) -> List[str]:
        """
        Split text into overlapping chunks of specified size.

        Each chunk preferably ends just after a period, or failing that at a
        space, provided that break point lies in the second half of the
        chunk window; otherwise the chunk is cut at exactly ``chunk_size``
        characters.

        Args:
            text: The input text to chunk
            chunk_size: Maximum size of each chunk (in characters)
            overlap: Number of characters to overlap between chunks

        Returns:
            List of text chunks. Text that already fits in one chunk is
            returned unchanged as a single-element list; otherwise chunks are
            stripped of surrounding whitespace and empty chunks are omitted.

        Raises:
            ValueError: If ``chunk_size`` is not positive or ``overlap`` is
                negative.
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be a positive integer")
        if overlap < 0:
            raise ValueError("overlap must be non-negative")

        text_length = len(text)
        if text_length <= chunk_size:
            return [text]

        chunks = []
        start = 0
        while start < text_length:
            end = min(start + chunk_size, text_length)

            # Try to break at a natural boundary unless this is the last chunk.
            if end < text_length:
                # Only accept boundaries in the second half of the window so
                # chunks do not become unreasonably short.
                earliest_break = start + chunk_size // 2
                sentence_end = text.rfind('.', start, end)
                if sentence_end > earliest_break:
                    end = sentence_end + 1  # keep the period in this chunk
                else:
                    space_end = text.rfind(' ', start, end)
                    if space_end > earliest_break:
                        end = space_end

            chunk = text[start:end].strip()
            if chunk:
                chunks.append(chunk)

            # Once the end of the text is covered we are done; continuing
            # would only emit a suffix of the chunk that was just added.
            if end >= text_length:
                break

            # Step back by the overlap, but always make forward progress:
            # when the overlap is at least as long as the chunk just emitted,
            # continue from the end of that chunk instead of looping forever.
            next_start = end - overlap
            start = next_start if next_start > start else end

        return chunks

    @staticmethod
    def extract_key_info(text: str) -> dict:
        """
        Extract key information from text such as headers, titles, etc.

        This is a simple implementation that looks for common patterns.

        Args:
            text: The input text to inspect.

        Returns:
            A dict containing only the keys for which something was found:
            ``'potential_title'`` (the first line among the first 10 lines
            that is 11-99 characters long and all-caps or title-case),
            ``'emails'`` (up to 5 email addresses) and ``'urls'`` (up to 5
            http/https URLs).
        """
        info = {}

        # Look for potential titles among the first 10 lines. The length
        # bound applies to both the all-caps and the title-case test.
        potential_titles = []
        for line in text.split('\n')[:10]:
            candidate = line.strip()
            if 10 < len(candidate) < 100 and (candidate.isupper() or candidate.istitle()):
                potential_titles.append(candidate)
        if potential_titles:
            info['potential_title'] = potential_titles[0]

        # Extract any email addresses (the TLD is letters only).
        emails = re.findall(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', text)
        if emails:
            info['emails'] = emails[:5]  # Limit to first 5 emails

        # Extract any URLs
        urls = re.findall(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', text)
        if urls:
            info['urls'] = urls[:5]  # Limit to first 5 URLs

        return info