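"""Text preprocessing utilities built on NLTK: text cleaning, sentence and word
tokenization, stopword removal, and lemmatization."""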
import nltk
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer

class TextProcessor:
    def __init__(self):
        """Initialize the text processor with required NLTK data."""
        self.download_nltk_data()
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()
    
    def download_nltk_data(self):
        """Download required NLTK data if not already present."""
        required_data = [
            ('tokenizers/punkt_tab', 'punkt_tab'),
            ('tokenizers/punkt', 'punkt'),
            ('corpora/stopwords', 'stopwords'),
            ('corpora/wordnet', 'wordnet'),
            ('corpora/omw-1.4', 'omw-1.4')
        ]
        
        for path, name in required_data:
            try:
                nltk.data.find(path)
            except LookupError:
                print(f"Downloading NLTK {name}...")
                nltk.download(name)
    
    def clean_text(self, text):
        """
        Clean and preprocess the input text.
        
        Args:
            text (str): Raw input text
            
        Returns:
            str: Cleaned text
        """
        # Remove extra whitespace and normalize
        text = re.sub(r'\s+', ' ', text.strip())
        
        # Remove common header/footer patterns (e.g., "Page 1 of 10", "Unit 1")
        text = re.sub(r'Page\s+\d+\s+of\s+\d+', '', text, flags=re.IGNORECASE)
        text = re.sub(r'Unit\s+\d+(\.\d+)?', '', text, flags=re.IGNORECASE)
        
        # Remove standalone numbers (often page numbers or list markers)
        text = re.sub(r'\b\d+\b', '', text)
        
        # Remove special characters but keep sentence structure
        # Keep periods, question marks, exclamation points, commas, and hyphens
        text = re.sub(r'[^\w\s\.\?\!,\-]', '', text)
        
        # Remove multiple periods/spaces
        text = re.sub(r'\.+', '.', text)
        text = re.sub(r'\s+', ' ', text).strip()
        
        return text
    
    def tokenize_sentences(self, text):
        """
        Tokenize text into sentences.
        
        Args:
            text (str): Input text
            
        Returns:
            list: List of sentences
        """
        sentences = sent_tokenize(text)
        # Filter out very short sentences (less than 5 words)
        filtered_sentences = [s for s in sentences if len(word_tokenize(s)) >= 5]
        return filtered_sentences
    
    def tokenize_words(self, text):
        """
        Tokenize text into words and remove stopwords.
        
        Args:
            text (str): Input text
            
        Returns:
            list: List of processed words
        """
        words = word_tokenize(text.lower())
        
        # Remove punctuation and stopwords
        words = [word for word in words if word not in string.punctuation]
        words = [word for word in words if word not in self.stop_words]
        
        # Lemmatize words
        words = [self.lemmatizer.lemmatize(word) for word in words]
        
        return words
    
    def preprocess_text(self, text):
        """
        Complete preprocessing pipeline.
        
        Args:
            text (str): Raw input text
            
        Returns:
            dict: Processed text components
        """
        cleaned_text = self.clean_text(text)
        sentences = self.tokenize_sentences(cleaned_text)
        words = self.tokenize_words(cleaned_text)
        
        return {
            'cleaned_text': cleaned_text,
            'sentences': sentences,
            'words': words,
            'word_count': len(words),
            'sentence_count': len(sentences)
        }
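
# Example usage (illustrative sketch): runs the full preprocessing pipeline on a
# short sample string containing header/footer noise; assumes the required NLTK
# data is present or can be downloaded on the first run.
if __name__ == "__main__":
    processor = TextProcessor()
    sample = (
        "Page 1 of 10. Unit 3.2 Photosynthesis converts light energy into "
        "chemical energy. Plants absorb carbon dioxide and release oxygen "
        "during this process."
    )
    result = processor.preprocess_text(sample)
    print(f"Sentences ({result['sentence_count']}): {result['sentences']}")
    print(f"Words ({result['word_count']}): {result['words']}")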