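"""Text preprocessing utilities built on NLTK: text cleaning, sentence and word
tokenization, stopword removal, and lemmatization."""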
import nltk
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer

class TextProcessor:
    def __init__(self):
        """Initialize the text processor with required NLTK data."""
        self.download_nltk_data()
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()
    
    def download_nltk_data(self):
        """Download required NLTK data if not already present."""
        required_data = [
            ('tokenizers/punkt_tab', 'punkt_tab'),
            ('tokenizers/punkt', 'punkt'),
            ('corpora/stopwords', 'stopwords'),
            ('corpora/wordnet', 'wordnet'),
            ('corpora/omw-1.4', 'omw-1.4')
        ]
        
        for path, name in required_data:
            try:
                nltk.data.find(path)
            except LookupError:
                print(f"Downloading NLTK {name}...")
                nltk.download(name)
    
    def clean_text(self, text):
        """
        Clean and preprocess the input text.
        
        Args:
            text (str): Raw input text
            
        Returns:
            str: Cleaned text
        """
        # Remove extra whitespace and normalize
        text = re.sub(r'\s+', ' ', text.strip())
        
        # Remove common header/footer patterns (e.g., "Page 1 of 10", "Unit 1")
        text = re.sub(r'Page\s+\d+\s+of\s+\d+', '', text, flags=re.IGNORECASE)
        text = re.sub(r'Unit\s+\d+(\.\d+)?', '', text, flags=re.IGNORECASE)
        
        # Remove standalone numbers (often page numbers or list markers)
        text = re.sub(r'\b\d+\b', '', text)
        
        # Remove special characters but keep sentence structure
        # Keep periods, question marks, exclamation points, commas, and hyphens
        text = re.sub(r'[^\w\s\.\?\!,\-]', '', text)
        
        # Remove multiple periods/spaces
        text = re.sub(r'\.+', '.', text)
        text = re.sub(r'\s+', ' ', text).strip()
        
        return text
    
    def tokenize_sentences(self, text):
        """
        Tokenize text into sentences.
        
        Args:
            text (str): Input text
            
        Returns:
            list: List of sentences
        """
        sentences = sent_tokenize(text)
        # Filter out very short sentences (less than 5 words)
        filtered_sentences = [s for s in sentences if len(word_tokenize(s)) >= 5]
        return filtered_sentences
    
    def tokenize_words(self, text):
        """
        Tokenize text into words and remove stopwords.
        
        Args:
            text (str): Input text
            
        Returns:
            list: List of processed words
        """
        words = word_tokenize(text.lower())
        
        # Remove punctuation and stopwords
        words = [word for word in words if word not in string.punctuation]
        words = [word for word in words if word not in self.stop_words]
        
        # Lemmatize words
        words = [self.lemmatizer.lemmatize(word) for word in words]
        
        return words
    
    def preprocess_text(self, text):
        """
        Complete preprocessing pipeline.
        
        Args:
            text (str): Raw input text
            
        Returns:
            dict: Processed text components
        """
        cleaned_text = self.clean_text(text)
        sentences = self.tokenize_sentences(cleaned_text)
        words = self.tokenize_words(cleaned_text)
        
        return {
            'cleaned_text': cleaned_text,
            'sentences': sentences,
            'words': words,
            'word_count': len(words),
            'sentence_count': len(sentences)
        }
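
# Example usage (illustrative sketch): runs the full preprocessing pipeline on a
# short sample string containing header/footer noise; assumes the required NLTK
# data is present or can be downloaded on the first run.
if __name__ == "__main__":
    processor = TextProcessor()
    sample = (
        "Page 1 of 10. Unit 3.2 Photosynthesis converts light energy into "
        "chemical energy. Plants absorb carbon dioxide and release oxygen "
        "during this process."
    )
    result = processor.preprocess_text(sample)
    print(f"Sentences ({result['sentence_count']}): {result['sentences']}")
    print(f"Words ({result['word_count']}): {result['words']}")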