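"""Text preprocessing utilities built on NLTK.

Provides cleaning, sentence/word tokenization, stopword removal, and
lemmatization for raw document text.
"""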
import nltk
import re
import string

from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer

class TextProcessor:
    def __init__(self):
        """Initialize the text processor with required NLTK data."""
        self.download_nltk_data()
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()

    def download_nltk_data(self):
        """Download required NLTK data if not already present."""
        required_data = [
            ('tokenizers/punkt_tab', 'punkt_tab'),
            ('tokenizers/punkt', 'punkt'),
            ('corpora/stopwords', 'stopwords'),
            ('corpora/wordnet', 'wordnet'),
            ('corpora/omw-1.4', 'omw-1.4')
        ]
        for path, name in required_data:
            try:
                nltk.data.find(path)
            except LookupError:
                print(f"Downloading NLTK {name}...")
                nltk.download(name)

    def clean_text(self, text):
        """
        Clean and preprocess the input text.

        Args:
            text (str): Raw input text

        Returns:
            str: Cleaned text
        """
        # Remove extra whitespace and normalize
        text = re.sub(r'\s+', ' ', text.strip())
        # Remove common header/footer patterns (e.g., "Page 1 of 10", "Unit 1")
        text = re.sub(r'Page\s+\d+\s+of\s+\d+', '', text, flags=re.IGNORECASE)
        text = re.sub(r'Unit\s+\d+(\.\d+)?', '', text, flags=re.IGNORECASE)
        # Remove standalone numbers (often page numbers or list markers)
        text = re.sub(r'\b\d+\b', '', text)
        # Remove special characters but keep sentence structure:
        # periods, question marks, exclamation points, commas, and hyphens
        text = re.sub(r'[^\w\s\.\?\!,\-]', '', text)
        # Collapse repeated periods and whitespace
        text = re.sub(r'\.+', '.', text)
        text = re.sub(r'\s+', ' ', text).strip()
        return text

    def tokenize_sentences(self, text):
        """
        Tokenize text into sentences.

        Args:
            text (str): Input text

        Returns:
            list: List of sentences
        """
        sentences = sent_tokenize(text)
        # Drop very short sentences (fewer than 5 words)
        filtered_sentences = [s for s in sentences if len(word_tokenize(s)) >= 5]
        return filtered_sentences

    def tokenize_words(self, text):
        """
        Tokenize text into words and remove stopwords.

        Args:
            text (str): Input text

        Returns:
            list: List of processed words
        """
        words = word_tokenize(text.lower())
        # Remove punctuation and stopwords
        words = [word for word in words if word not in string.punctuation]
        words = [word for word in words if word not in self.stop_words]
        # Lemmatize words
        words = [self.lemmatizer.lemmatize(word) for word in words]
        return words

    def preprocess_text(self, text):
        """
        Complete preprocessing pipeline.

        Args:
            text (str): Raw input text

        Returns:
            dict: Processed text components
        """
        cleaned_text = self.clean_text(text)
        sentences = self.tokenize_sentences(cleaned_text)
        words = self.tokenize_words(cleaned_text)
        return {
            'cleaned_text': cleaned_text,
            'sentences': sentences,
            'words': words,
            'word_count': len(words),
            'sentence_count': len(sentences)
        }
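

# --- Usage sketch (illustrative, not part of the original module) ---
# A minimal demo of the full pipeline. The sample text below is an
# assumption, chosen to exercise the header/footer cleanup rules.
if __name__ == "__main__":
    processor = TextProcessor()
    sample = (
        "Page 1 of 10 Unit 2.1 Natural language processing lets computers "
        "analyze human language. It powers search engines, chatbots, and "
        "modern translation systems used around the world today."
    )
    result = processor.preprocess_text(sample)
    print(result['cleaned_text'])
    print(f"Sentences ({result['sentence_count']}):", result['sentences'])
    print(f"Words ({result['word_count']}):", result['words'])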