File size: 4,563 Bytes
bec06d9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import re
from typing import List, Tuple
import logging

logger = logging.getLogger(__name__)

class TextPreprocessor:
    """
    A utility class for preprocessing text before embedding.
    Includes cleaning, normalization, and chunking methods.

    All methods are static and stateless; the class acts as a namespace.
    """

    @staticmethod
    def clean_text(text: str) -> str:
        """Clean text by collapsing whitespace and removing unusual characters.

        Keeps word characters, whitespace and basic punctuation
        (. , ! ? ; : - ( )); every other character becomes a space.

        Args:
            text: Raw input text.

        Returns:
            Cleaned, single-spaced, stripped text.
        """
        # Collapse every run of whitespace (tabs, newlines, ...) to one space.
        text = re.sub(r'\s+', ' ', text)
        # Replace disallowed characters with a space (not empty) so adjacent
        # words are not accidentally glued together.
        text = re.sub(r'[^\w\s\.\,\!\?\;\:\-\(\)]', ' ', text)
        # The substitution above may introduce new double spaces; collapse again.
        text = re.sub(r'\s+', ' ', text).strip()
        return text

    @staticmethod
    def split_by_sentences(text: str) -> List[str]:
        """Split text into sentences on runs of '.', '!' or '?'.

        Empty fragments are dropped and each sentence is stripped.
        """
        sentences = re.split(r'[.!?]+', text)
        return [s.strip() for s in sentences if s.strip()]

    @staticmethod
    def split_by_paragraphs(text: str) -> List[str]:
        """Split text into paragraphs on blank lines (double newline).

        Empty fragments are dropped and each paragraph is stripped.
        """
        paragraphs = text.split('\n\n')
        return [p.strip() for p in paragraphs if p.strip()]

    @staticmethod
    def chunk_text(text: str, chunk_size: int = 512, overlap: int = 50) -> List[str]:
        """
        Split text into overlapping chunks of specified size.

        Chunks preferentially end at a sentence boundary ('.') in the second
        half of the window, falling back to the last space, then a hard cut.

        Args:
            text: The input text to chunk
            chunk_size: Maximum size of each chunk (in characters)
            overlap: Number of characters to overlap between chunks

        Returns:
            List of non-empty text chunks.

        Raises:
            ValueError: If chunk_size is not positive.
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be positive")
        if len(text) <= chunk_size:
            return [text]

        chunks: List[str] = []
        text_len = len(text)
        start = 0

        while start < text_len:
            end = min(start + chunk_size, text_len)

            # Only hunt for a nicer break point when not already at the end.
            if end < text_len:
                # Prefer a sentence boundary in the second half of the window.
                sentence_end = text.rfind('.', start, end)
                if sentence_end != -1 and sentence_end > start + chunk_size // 2:
                    end = sentence_end + 1
                else:
                    # Otherwise fall back to the last space in the window.
                    space_end = text.rfind(' ', start, end)
                    if space_end != -1 and space_end > start + chunk_size // 2:
                        end = space_end

            chunk = text[start:end].strip()
            if chunk:
                chunks.append(chunk)

            if end >= text_len:
                break

            # Step back by `overlap` for the next chunk, but never move the
            # window backwards: the original `end - overlap` could re-enter
            # the same window and loop forever whenever the effective advance
            # was <= overlap (BUG FIX — e.g. chunk_size=20, overlap=18 on
            # space-periodic text). Fall back to `end` to guarantee progress.
            next_start = end - overlap
            start = next_start if next_start > start else end

        return chunks

    @staticmethod
    def extract_key_info(text: str) -> dict:
        """
        Extract key information from text such as headers, titles, etc.
        This is a simple implementation that looks for common patterns.

        Returns:
            Dict with optional keys 'potential_title', 'emails', 'urls'.
        """
        info: dict = {}

        # Potential titles: short-ish lines near the top that are all-caps
        # or title-case.  (BUG FIX: the original mixed `and`/`or` without
        # parentheses, so ANY title-case line matched regardless of length.)
        stripped_lines = (line.strip() for line in text.split('\n')[:10])
        potential_titles = [
            s for s in stripped_lines
            if 10 < len(s) < 100 and (s.isupper() or s.istitle())
        ]
        if potential_titles:
            info['potential_title'] = potential_titles[0]

        # Extract email addresses.  (BUG FIX: the original TLD class
        # `[A-Z|a-z]` also matched a literal '|' character.)
        emails = re.findall(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', text)
        if emails:
            info['emails'] = emails[:5]  # Limit to first 5 emails

        # Extract URLs.  NOTE(review): `\\(` / `\\)` inside this raw-string
        # character class also match a literal backslash; kept byte-identical
        # to preserve matching behavior — confirm intent before tightening.
        urls = re.findall(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', text)
        if urls:
            info['urls'] = urls[:5]  # Limit to first 5 URLs

        return info