Spaces:
Running
Running
| """ | |
| Text preprocessing and tokenization module for technical documents | |
| """ | |
| import re | |
| import logging | |
| from typing import List, Tuple | |
| import nltk | |
| from nltk.corpus import stopwords | |
| from nltk.tokenize import sent_tokenize, word_tokenize | |
# Make sure the NLTK data packages used by this module are installed.
# Each resource is looked up locally first; it is fetched (quietly) only
# when the lookup fails.
for _resource_path, _package_name in (
    ('tokenizers/punkt', 'punkt'),
    ('corpora/stopwords', 'stopwords'),
):
    try:
        nltk.data.find(_resource_path)
    except LookupError:
        nltk.download(_package_name, quiet=True)
# Drop the loop variables so the module namespace is left unchanged.
del _resource_path, _package_name

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
class TextPreprocessor:
    """Comprehensive text preprocessing for technical documents.

    Provides regex-based cleaning (URLs, e-mail addresses, special
    characters, citations, LaTeX equations) and thin wrappers around the
    NLTK sentence and word tokenizers.
    """

    # Patterns are compiled once at class-creation time and shared by all
    # instances.
    _WHITESPACE_RE = re.compile(r'\s+')
    # Require "://" (or "www.") so ordinary words such as "https" or
    # "httpd" are not mistaken for URLs and deleted.
    _URL_RE = re.compile(r'https?://\S+|www\.\S+')
    _EMAIL_RE = re.compile(r'\S+@\S+')
    # Everything except word characters, whitespace and sentence punctuation.
    _DISALLOWED_CHARS_RE = re.compile(r'[^\w\s.!?,;:\-()]')
    # "[12]" or "[Author et al., 2020]".
    _BRACKET_CITATION_RE = re.compile(r'\[\d+\]|\[[\w\s\.]+,\s*\d{4}\]')
    # "(Author 2020)" or "(Author, 2020)".
    _PAREN_CITATION_RE = re.compile(r'\([\w\s\.]+,?\s*\d{4}\)')
    # "$$ ... $$" may span several lines; "$ ... $" stays on one line.
    _DISPLAY_MATH_RE = re.compile(r'\$\$.*?\$\$', re.DOTALL)
    _INLINE_MATH_RE = re.compile(r'\$.*?\$')

    def __init__(self, remove_stopwords: bool = False):
        """
        Initialize preprocessor.

        Args:
            remove_stopwords: Whether to remove English stopwords
        """
        self.remove_stopwords = remove_stopwords
        # The stopword corpus is only loaded when it will actually be used.
        self.stop_words = set(stopwords.words('english')) if remove_stopwords else set()

    def clean_text(self, text: str) -> str:
        """
        Clean technical document text.

        Collapses whitespace, removes URLs and e-mail addresses, and strips
        every character that is not a word character, whitespace, or one of
        ``. ! ? , ; : - ( )``.

        Args:
            text: Raw text to clean

        Returns:
            Cleaned text with single spaces and no leading/trailing space
        """
        # Normalize whitespace first so the URL/e-mail patterns see
        # space-delimited tokens.
        text = self._WHITESPACE_RE.sub(' ', text)
        text = self._URL_RE.sub('', text)
        text = self._EMAIL_RE.sub('', text)
        # Remove special characters but keep punctuation for sentences.
        text = self._DISALLOWED_CHARS_RE.sub('', text)
        # The removals above can leave runs of spaces behind.
        return self._WHITESPACE_RE.sub(' ', text).strip()

    def remove_citations(self, text: str) -> str:
        """
        Remove citation references from text.

        Handles numeric ``[12]``, bracketed ``[Author et al., 2020]`` and
        parenthesized ``(Author, 2020)`` / ``(Author 2020)`` forms.
        Surrounding whitespace is left untouched.

        Args:
            text: Text with citations

        Returns:
            Text without citations
        """
        text = self._BRACKET_CITATION_RE.sub('', text)
        return self._PAREN_CITATION_RE.sub('', text)

    def remove_equations(self, text: str) -> str:
        """
        Remove mathematical equations and formulas.

        Only LaTeX dollar-delimited math is recognized.

        Args:
            text: Text with equations

        Returns:
            Text without equations
        """
        # Display math must go first; otherwise the inline pattern would
        # match the empty "$$" delimiters and leave the equation body behind.
        text = self._DISPLAY_MATH_RE.sub('', text)
        return self._INLINE_MATH_RE.sub('', text)

    def sent_tokenize(self, text: str) -> List[str]:
        """
        Tokenize text into sentences.

        Args:
            text: Input text

        Returns:
            List of non-empty sentences with surrounding whitespace removed
        """
        # Inside the method body the bare name resolves to the module-level
        # NLTK function, not to this method.
        stripped = (sentence.strip() for sentence in sent_tokenize(text))
        return [sentence for sentence in stripped if sentence]

    def word_tokenize(self, text: str) -> List[str]:
        """
        Tokenize text into lower-cased words.

        When stopword removal is enabled, stopwords and tokens that are not
        purely alphanumeric are dropped as well.

        Args:
            text: Input text

        Returns:
            List of words
        """
        tokens = word_tokenize(text.lower())
        if self.remove_stopwords:
            tokens = [t for t in tokens if t not in self.stop_words and t.isalnum()]
        return tokens

    def preprocess_document(self, text: str, remove_citations: bool = True,
                            remove_equations: bool = False) -> str:
        """
        Complete preprocessing pipeline.

        Citations and equations are removed from the raw text *before* the
        general cleaning step. ``clean_text`` strips ``[``, ``]`` and ``$``,
        so running it first would leave the citation and equation patterns
        with nothing to match.

        Args:
            text: Raw document text
            remove_citations: Whether to remove citations
            remove_equations: Whether to remove equations

        Returns:
            Preprocessed text
        """
        if remove_citations:
            text = self.remove_citations(text)
        if remove_equations:
            text = self.remove_equations(text)
        # Cleaning last also collapses the gaps left by the removals above.
        text = self.clean_text(text)
        # Looked up by name so the class does not rely on a module global;
        # getLogger returns the same logger object for the same name.
        logging.getLogger(__name__).info("Document preprocessing completed")
        return text
class TechnicalDocumentParser:
    """Parse technical document structure (sections, abstracts, etc.).

    Both parsers are static methods, so they can be called on the class or
    on an instance.
    """

    # An "abstract"/"summary" marker at the start of a line, followed by
    # everything up to (but not including) the next "introduction", "1." or
    # "contents" line. The terminator is a lookahead so that heading stays
    # in the remaining text.
    _ABSTRACT_RE = re.compile(
        r'(?:^|\n)(abstract|summary)(.*?)(?=\n(?:introduction|1\.|contents))',
        re.IGNORECASE | re.DOTALL,
    )
    # A line that starts with an optional "N. " prefix and one of the known
    # section keywords. The rest of the line belongs to the title, so
    # "Conclusions" or "Results and Discussion" are matched too.
    _SECTION_HEADING_RE = re.compile(
        r'^((?:\d+\.[ \t]+)?'
        r'(?:introduction|methodology|results|discussion|conclusion|references|abstract)'
        r'[^\n]*)$',
        re.IGNORECASE | re.MULTILINE,
    )
    # Separators commonly placed right after the word "Abstract".
    _ABSTRACT_SEPARATORS = ':.-\u2013\u2014 \t'

    @staticmethod
    def extract_abstract(text: str) -> Tuple[str, str]:
        """
        Extract abstract from document.

        Args:
            text: Full document text

        Returns:
            Tuple of (abstract, remaining_text). When no abstract is found
            the result is ("", text). The heading that terminates the
            abstract is kept in remaining_text.
        """
        found = TechnicalDocumentParser._ABSTRACT_RE.search(text)
        if found is None:
            return "", text
        # Drop a separator such as ":" or a dash that directly follows the
        # "Abstract" marker.
        abstract = found.group(2).strip().lstrip(
            TechnicalDocumentParser._ABSTRACT_SEPARATORS
        )
        remaining = text[:found.start()] + text[found.end():]
        return abstract, remaining

    @staticmethod
    def extract_sections(text: str) -> List[Tuple[str, str]]:
        """
        Extract document sections.

        A section starts at a line beginning with an optional number and
        one of the recognized keywords (introduction, methodology, results,
        discussion, conclusion, references, abstract) and runs until the
        next such line or the end of the text. Any line starting with a
        keyword counts as a heading, including ordinary sentences.

        Args:
            text: Document text

        Returns:
            List of (section_title, section_content) tuples in document
            order; empty when no heading is found.
        """
        headings = list(TechnicalDocumentParser._SECTION_HEADING_RE.finditer(text))
        # Each section's content ends where the next heading starts; the
        # last one runs to the end of the text.
        stops = [heading.start() for heading in headings[1:]] + [len(text)]
        return [
            (heading.group(1).strip(), text[heading.end():stop].strip())
            for heading, stop in zip(headings, stops)
        ]