IncludEd-AI / services /text_processor.py
nkubana0's picture
initial: IncludEd AI service
162cb6f
import re
class TextProcessor:
    """Stateless helpers for cleaning text, splitting it into sentences,
    and counting words."""

    # Patterns are compiled once at class-definition time and shared by
    # every instance.
    _PAGE_MARKER_RE = re.compile(r'Page\s+\d+')
    _WHITESPACE_RE = re.compile(r'\s+')
    _SENTENCE_BOUNDARY_RE = re.compile(r'(?<=[.!?])\s+')

    # Curly double quotes are written as escapes so the mapping survives
    # editors or pipelines that silently rewrite "smart" quotes.
    _QUOTE_TABLE = str.maketrans({'\u201c': '"', '\u201d': '"'})

    def clean_text(self, text: str) -> str:
        """Return *text* with page markers removed, quotes normalized and
        whitespace collapsed.

        Steps, in order:
          1. Remove ``Page <number>`` markers.
          2. Replace curly double quotes (U+201C, U+201D) with ``"``.
          3. Collapse every run of whitespace (including newlines and
             tabs) to a single space and strip both ends.

        Args:
            text: Raw input text.

        Returns:
            The cleaned, single-line string.
        """
        # Page markers are removed *before* collapsing whitespace so that
        # deleting one from the middle of a sentence does not leave a
        # double space behind. ``\s+`` between "Page" and the number keeps
        # markers that were split across a line break matching.
        text = self._PAGE_MARKER_RE.sub('', text)
        text = text.translate(self._QUOTE_TABLE)
        text = self._WHITESPACE_RE.sub(' ', text)
        return text.strip()

    def split_into_sentences(self, text: str) -> list:
        """Split *text* into sentences.

        A boundary is any whitespace run that directly follows ``.``,
        ``!`` or ``?``; the punctuation stays attached to its sentence.
        This is purely pattern-based, so abbreviations such as ``"Dr. X"``
        are split as well.

        Args:
            text: Text to split.

        Returns:
            A list of non-empty, stripped sentence strings (empty list if
            *text* is empty or only whitespace).
        """
        pieces = self._SENTENCE_BOUNDARY_RE.split(text)
        return [piece.strip() for piece in pieces if piece.strip()]

    def count_words(self, text: str) -> int:
        """Return the number of whitespace-separated tokens in *text*."""
        return len(text.split())