Spaces:

muhammadshaheryar
/

Docker_Deploy

Configuration error

Docker_Deploy / src /python /preprocessor.py

Shaheryar Shah

Add backend files for RAG Chatbot Docker deployment

bec06d9 about 2 months ago

4.56 kB

	import re
	from typing import List, Tuple
	import logging

	# Module-level logger, named after this module via __name__.
	logger = logging.getLogger(__name__)

	class TextPreprocessor:
	    """
	    A utility class for preprocessing text before embedding.

	    All methods are static; the class only groups cleaning, splitting,
	    chunking and lightweight metadata-extraction helpers.
	    """

	    # Email addresses: local part, "@", domain, and an alphabetic TLD of at
	    # least two letters.
	    _EMAIL_RE = re.compile(
	        r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
	    )

	    # http/https URLs: scheme followed by one or more URL characters or
	    # %XX escapes. The alternation bars must be unescaped; an escaped bar
	    # would only match a literal "|" and the pattern would never hit a URL.
	    _URL_RE = re.compile(
	        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]'
	        r'|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
	    )

	    @staticmethod
	    def clean_text(text: str) -> str:
	        """
	        Normalize whitespace and drop unsupported characters.

	        Every character that is not a word character, whitespace, or one of
	        ``. , ! ? ; : -`` is replaced by a space; runs of whitespace are then
	        collapsed to a single space and the result is stripped.

	        Args:
	            text: The raw input text.

	        Returns:
	            The cleaned, single-line text.
	        """
	        # Replace disallowed characters with spaces first; one whitespace
	        # collapse afterwards covers both the original whitespace runs and
	        # the gaps left by the replacement.
	        text = re.sub(r'[^\w\s\.\,\!\?\;\:\-]', ' ', text)
	        return re.sub(r'\s+', ' ', text).strip()

	    @staticmethod
	    def split_by_sentences(text: str) -> List[str]:
	        """
	        Split text into sentences on runs of ``.``, ``!`` or ``?``.

	        The terminating punctuation is not kept, and any period counts as a
	        boundary (so decimals and abbreviations are split as well).

	        Args:
	            text: The input text.

	        Returns:
	            The non-empty, stripped sentence fragments in order.
	        """
	        fragments = (part.strip() for part in re.split(r'[.!?]+', text))
	        return [fragment for fragment in fragments if fragment]

	    @staticmethod
	    def split_by_paragraphs(text: str) -> List[str]:
	        """
	        Split text into paragraphs separated by blank lines.

	        A blank line may be empty or contain only whitespace, so Windows
	        line endings (``\\r\\n\\r\\n``) and lines holding stray spaces are
	        recognised as paragraph breaks too.

	        Args:
	            text: The input text.

	        Returns:
	            The non-empty, stripped paragraphs in order.
	        """
	        pieces = (part.strip() for part in re.split(r'\n\s*\n', text))
	        return [piece for piece in pieces if piece]

	    @staticmethod
	    def chunk_text(text: str, chunk_size: int = 512, overlap: int = 50) -> List[str]:
	        """
	        Split text into overlapping chunks of specified size.

	        Each chunk preferably ends just after a period, or failing that at a
	        space, provided that boundary lies in the second half of the chunk
	        window. Chunks are stripped and empty chunks are dropped.

	        Args:
	            text: The input text to chunk
	            chunk_size: Maximum size of each chunk (in characters)
	            overlap: Number of characters to overlap between chunks. When
	                applying the overlap would not move past the start of the
	                current chunk, the next chunk starts at the end of the
	                current one instead so the loop always advances.

	        Returns:
	            List of text chunks. Text no longer than ``chunk_size`` is
	            returned unchanged as a single-element list.

	        Raises:
	            ValueError: If ``chunk_size`` is not positive or ``overlap`` is
	                negative.
	        """
	        if chunk_size <= 0:
	            raise ValueError("chunk_size must be a positive integer")
	        if overlap < 0:
	            raise ValueError("overlap must not be negative")

	        text_len = len(text)
	        if text_len <= chunk_size:
	            return [text]

	        chunks: List[str] = []
	        start = 0

	        while start < text_len:
	            end = min(start + chunk_size, text_len)

	            # Only look for a nicer break point when more text follows.
	            if end < text_len:
	                # A boundary is accepted only in the second half of the
	                # window so chunks do not become too small.
	                earliest_break = start + chunk_size // 2
	                sentence_end = text.rfind('.', start, end)
	                if sentence_end > earliest_break:
	                    end = sentence_end + 1  # keep the period in this chunk
	                else:
	                    space_end = text.rfind(' ', start, end)
	                    if space_end > earliest_break:
	                        end = space_end

	            chunk = text[start:end].strip()
	            if chunk:
	                chunks.append(chunk)

	            # The whole text is consumed; stepping back by the overlap
	            # would only emit a tail already contained in this chunk.
	            if end >= text_len:
	                break

	            # Step back by the overlap, but never to or before the current
	            # start, otherwise the loop would stall or run backwards.
	            next_start = end - overlap
	            start = next_start if next_start > start else end

	        return chunks

	    @staticmethod
	    def extract_key_info(text: str) -> dict:
	        """
	        Extract simple metadata from text using pattern heuristics.

	        Args:
	            text: The input text.

	        Returns:
	            A dict that may contain:
	            ``potential_title``: the first of the first 10 lines whose
	            stripped length is strictly between 10 and 100 characters and
	            which is all upper case or title case;
	            ``emails``: up to the first 5 email addresses found;
	            ``urls``: up to the first 5 http/https URLs found.
	            Keys with no match are omitted.
	        """
	        info = {}

	        # Only the first 10 lines are considered for a title.
	        for line in text.split('\n')[:10]:
	            candidate = line.strip()
	            # The length bound applies to both casing checks; the
	            # parentheses are required because "and" binds tighter than "or".
	            if 10 < len(candidate) < 100 and (
	                candidate.isupper() or candidate.istitle()
	            ):
	                info['potential_title'] = candidate
	                break

	        emails = TextPreprocessor._EMAIL_RE.findall(text)
	        if emails:
	            info['emails'] = emails[:5]  # Limit to first 5 emails

	        urls = TextPreprocessor._URL_RE.findall(text)
	        if urls:
	            info['urls'] = urls[:5]  # Limit to first 5 URLs

	        return info