# Docker_Deploy/src/python/preprocessor.py
# Author: Shaheryar Shah
# Added in commit bec06d9: backend files for RAG Chatbot Docker deployment.
import re
from typing import List, Tuple
import logging
logger = logging.getLogger(__name__)
class TextPreprocessor:
    """
    A utility class for preprocessing text before embedding.
    Includes cleaning, normalization, and chunking methods.

    All methods are stateless ``@staticmethod``s; call them directly on the
    class, e.g. ``TextPreprocessor.clean_text(raw)``.
    """

    @staticmethod
    def clean_text(text: str) -> str:
        """Clean text by collapsing whitespace and dropping special characters.

        Args:
            text: Raw input text.

        Returns:
            Single-line text containing only word characters and basic
            punctuation (``. , ! ? ; : - ( )``), with runs of whitespace
            collapsed to single spaces and the ends stripped.
        """
        # Collapse all whitespace (including newlines) to single spaces.
        text = re.sub(r'\s+', ' ', text)
        # Replace disallowed characters with a space rather than deleting
        # them, so adjacent words are not fused together.
        text = re.sub(r'[^\w\s\.\,\!\?\;\:\-\(\)]', ' ', text)
        # The replacement above can introduce new double spaces; collapse again.
        return re.sub(r'\s+', ' ', text).strip()

    @staticmethod
    def split_by_sentences(text: str) -> List[str]:
        """Split text into sentences on ``.``, ``!`` and ``?`` runs.

        Args:
            text: Input text.

        Returns:
            Non-empty, whitespace-stripped sentence fragments (terminators
            are consumed by the split and not included).
        """
        sentences = re.split(r'[.!?]+', text)
        return [s.strip() for s in sentences if s.strip()]

    @staticmethod
    def split_by_paragraphs(text: str) -> List[str]:
        """Split text into paragraphs separated by blank lines (``\\n\\n``).

        Args:
            text: Input text.

        Returns:
            Non-empty, whitespace-stripped paragraphs.
        """
        paragraphs = text.split('\n\n')
        return [p.strip() for p in paragraphs if p.strip()]

    @staticmethod
    def chunk_text(text: str, chunk_size: int = 512, overlap: int = 50) -> List[str]:
        """
        Split text into overlapping chunks of specified size.

        Chunk boundaries prefer a sentence end (``.``) in the second half of
        the window, then a space, then a hard cut at ``chunk_size``.

        Args:
            text: The input text to chunk.
            chunk_size: Maximum size of each chunk (in characters); must be > 0.
            overlap: Number of characters to overlap between chunks.

        Returns:
            List of non-empty, stripped text chunks ([] for empty input).

        Raises:
            ValueError: If ``chunk_size`` is not positive.
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be positive")
        if not text:
            return []
        if len(text) <= chunk_size:
            return [text]

        chunks: List[str] = []
        text_len = len(text)
        start = 0
        while start < text_len:
            end = start + chunk_size
            if end < text_len:
                # Prefer a sentence boundary, but only in the second half of
                # the window so chunks do not become degenerately small.
                sentence_end = text.rfind('.', start, end)
                if sentence_end != -1 and sentence_end > start + chunk_size // 2:
                    end = sentence_end + 1
                else:
                    # Fall back to a word boundary under the same constraint.
                    space_end = text.rfind(' ', start, end)
                    if space_end != -1 and space_end > start + chunk_size // 2:
                        end = space_end

            chunk = text[start:end].strip()
            if chunk:
                chunks.append(chunk)

            # Step back by `overlap`, but guarantee forward progress: with a
            # large overlap (e.g. overlap >= chunk_size) `end - overlap` can
            # fail to advance past `start`, which previously looped forever.
            next_start = end - overlap
            start = next_start if next_start > start else end

        return chunks

    @staticmethod
    def extract_key_info(text: str) -> dict:
        """
        Extract key information from text such as a title, emails, and URLs.

        This is a simple heuristic implementation that scans the first 10
        lines for a title-like line and regex-matches emails/URLs anywhere.

        Args:
            text: Input text.

        Returns:
            Dict with any of the keys ``potential_title`` (str),
            ``emails`` (up to 5), and ``urls`` (up to 5); keys are omitted
            when nothing is found.
        """
        info = {}

        # A candidate title is a short-ish line near the top that is either
        # ALL CAPS or Title Case. Note: the length check must apply to BOTH
        # caps styles — `and` binds tighter than `or`, so the original
        # unparenthesized condition accepted any title-case line regardless
        # of length.
        candidates = (line.strip() for line in text.split('\n')[:10])
        potential_titles = [
            c for c in candidates
            if 10 < len(c) < 100 and (c.isupper() or c.istitle())
        ]
        if potential_titles:
            info['potential_title'] = potential_titles[0]

        # Extract any email addresses (patterns kept as-is; they define
        # runtime matching behavior).
        emails = re.findall(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', text)
        if emails:
            info['emails'] = emails[:5]  # Limit to first 5 emails

        # Extract any URLs
        urls = re.findall(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', text)
        if urls:
            info['urls'] = urls[:5]  # Limit to first 5 URLs

        return info