""" Text preprocessing and tokenization module for technical documents """ import re import logging from typing import List, Tuple import nltk from nltk.corpus import stopwords from nltk.tokenize import sent_tokenize, word_tokenize # Download required NLTK resources try: nltk.data.find('tokenizers/punkt') except LookupError: nltk.download('punkt', quiet=True) try: nltk.data.find('corpora/stopwords') except LookupError: nltk.download('stopwords', quiet=True) logger = logging.getLogger(__name__) class TextPreprocessor: """Comprehensive text preprocessing for technical documents.""" def __init__(self, remove_stopwords: bool = False): """ Initialize preprocessor. Args: remove_stopwords: Whether to remove English stopwords """ self.remove_stopwords = remove_stopwords self.stop_words = set(stopwords.words('english')) if remove_stopwords else set() def clean_text(self, text: str) -> str: """ Clean technical document text. Args: text: Raw text to clean Returns: Cleaned text """ # Remove extra whitespace text = re.sub(r'\s+', ' ', text) # Remove URLs text = re.sub(r'http\S+|www\S+', '', text) # Remove email addresses text = re.sub(r'\S+@\S+', '', text) # Remove special characters but keep punctuation for sentences text = re.sub(r'[^\w\s.!?,;:\-()]', '', text) # Remove extra spaces created by above operations text = re.sub(r'\s+', ' ', text).strip() return text def remove_citations(self, text: str) -> str: """ Remove citation references from text. Args: text: Text with citations Returns: Text without citations """ # Remove [Author et al., Year] style citations text = re.sub(r'\[\d+\]|\[[\w\s\.]+,\s*\d{4}\]', '', text) # Remove (Author Year) style citations text = re.sub(r'\([\w\s\.]+,?\s*\d{4}\)', '', text) return text def remove_equations(self, text: str) -> str: """ Remove mathematical equations and formulas. Args: text: Text with equations Returns: Text without equations """ # Remove LaTeX equations text = re.sub(r'\$\$.*?\$\$', '', text, flags=re.DOTALL) text = re.sub(r'\$.*?\$', '', text) return text def sent_tokenize(self, text: str) -> List[str]: """ Tokenize text into sentences. Args: text: Input text Returns: List of sentences """ sentences = sent_tokenize(text) return [sent.strip() for sent in sentences if sent.strip()] def word_tokenize(self, text: str) -> List[str]: """ Tokenize text into words. Args: text: Input text Returns: List of words """ tokens = word_tokenize(text.lower()) if self.remove_stopwords: tokens = [t for t in tokens if t not in self.stop_words and t.isalnum()] return tokens def preprocess_document(self, text: str, remove_citations: bool = True, remove_equations: bool = False) -> str: """ Complete preprocessing pipeline. Args: text: Raw document text remove_citations: Whether to remove citations remove_equations: Whether to remove equations Returns: Preprocessed text """ # Clean text text = self.clean_text(text) # Remove citations if requested if remove_citations: text = self.remove_citations(text) # Remove equations if requested if remove_equations: text = self.remove_equations(text) logger.info("Document preprocessing completed") return text class TechnicalDocumentParser: """Parse technical document structure (sections, abstracts, etc.).""" @staticmethod def extract_abstract(text: str) -> Tuple[str, str]: """ Extract abstract from document. Args: text: Full document text Returns: Tuple of (abstract, remaining_text) """ abstract_match = re.search( r'(?:^|\n)(abstract|summary)(.*?)(?:\n(?:introduction|1\.|contents))', text, re.IGNORECASE | re.DOTALL ) if abstract_match: abstract = abstract_match.group(2).strip() remaining = text[:abstract_match.start()] + text[abstract_match.end():] return abstract, remaining return "", text @staticmethod def extract_sections(text: str) -> List[Tuple[str, str]]: """ Extract document sections. Args: text: Document text Returns: List of (section_title, section_content) tuples """ # Match common section patterns section_pattern = r'(?:^|\n)((?:\d+\.\s+)?(?:introduction|methodology|results|discussion|conclusion|references|abstract).*?)(?:\n(?:\d+\.\s+)?(?:[A-Z][^.]*?)(?=\n|$))' sections = [] matches = re.finditer(section_pattern, text, re.IGNORECASE) for match in matches: title = match.group(1).strip() content = match.group(2).strip() if match.lastindex >= 2 else "" sections.append((title, content)) return sections