import nltk
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
class TextProcessor:
    def __init__(self):
        """Initialize the text processor with required NLTK data."""
        self.download_nltk_data()
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()

    def download_nltk_data(self):
        """Download required NLTK data if not already present."""
        required_data = [
            ('tokenizers/punkt_tab', 'punkt_tab'),
            ('tokenizers/punkt', 'punkt'),
            ('corpora/stopwords', 'stopwords'),
            ('corpora/wordnet', 'wordnet'),
            ('corpora/omw-1.4', 'omw-1.4')
        ]
        for path, name in required_data:
            try:
                nltk.data.find(path)
            except LookupError:
                print(f"Downloading NLTK {name}...")
                nltk.download(name)

    def clean_text(self, text):
        """
        Clean and preprocess the input text.

        Args:
            text (str): Raw input text

        Returns:
            str: Cleaned text
        """
        # Remove extra whitespace and normalize
        text = re.sub(r'\s+', ' ', text.strip())

        # Remove common header/footer patterns (e.g., "Page 1 of 10", "Unit 1")
        text = re.sub(r'Page\s+\d+\s+of\s+\d+', '', text, flags=re.IGNORECASE)
        text = re.sub(r'Unit\s+\d+(\.\d+)?', '', text, flags=re.IGNORECASE)

        # Remove standalone numbers (often page numbers or list markers)
        text = re.sub(r'\b\d+\b', '', text)

        # Remove special characters but keep sentence structure
        # Keep periods, question marks, exclamation points, commas, and hyphens
        text = re.sub(r'[^\w\s\.\?\!,\-]', '', text)

        # Collapse repeated periods and extra spaces
        text = re.sub(r'\.+', '.', text)
        text = re.sub(r'\s+', ' ', text).strip()

        return text

    def tokenize_sentences(self, text):
        """
        Tokenize text into sentences.

        Args:
            text (str): Input text

        Returns:
            list: List of sentences
        """
        sentences = sent_tokenize(text)
        # Filter out very short sentences (fewer than 5 words)
        filtered_sentences = [s for s in sentences if len(word_tokenize(s)) >= 5]
        return filtered_sentences

    def tokenize_words(self, text):
        """
        Tokenize text into words and remove stopwords.

        Args:
            text (str): Input text

        Returns:
            list: List of processed words
        """
        words = word_tokenize(text.lower())
        # Remove punctuation and stopwords
        words = [word for word in words if word not in string.punctuation]
        words = [word for word in words if word not in self.stop_words]
        # Lemmatize words
        words = [self.lemmatizer.lemmatize(word) for word in words]
        return words

    def preprocess_text(self, text):
        """
        Complete preprocessing pipeline.

        Args:
            text (str): Raw input text

        Returns:
            dict: Processed text components
        """
        cleaned_text = self.clean_text(text)
        sentences = self.tokenize_sentences(cleaned_text)
        words = self.tokenize_words(cleaned_text)

        return {
            'cleaned_text': cleaned_text,
            'sentences': sentences,
            'words': words,
            'word_count': len(words),
            'sentence_count': len(sentences)
        }
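

# Example usage: a minimal sketch of running the full pipeline end to end.
# The sample text below is made up for illustration and is not part of the
# original module; any raw string can be passed in its place.
if __name__ == "__main__":
    processor = TextProcessor()
    sample = (
        "Unit 3 Photosynthesis. Page 1 of 10. "
        "Photosynthesis is the process by which green plants use sunlight, "
        "water, and carbon dioxide to produce glucose and oxygen. "
        "It takes place mainly in the chloroplasts of leaf cells."
    )
    result = processor.preprocess_text(sample)
    print("Cleaned text:", result['cleaned_text'])
    print("Sentence count:", result['sentence_count'])
    print("Word count:", result['word_count'])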