# (Extraction residue — "Spaces:" / "Sleeping" / "Sleeping" — not part of the
# original source; preserved here as a comment so the module stays importable.)
# Standard library
import os
import sys
import re
import string
# Third-party
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Add the root directory to sys.path so the sibling logging_config package
# resolves when this file is run as a script from its own directory.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from logging_config.logger_config import get_logger
# Download necessary NLTK data files (no-op if already present in the NLTK
# data directory; requires network access on first run).
nltk.download('stopwords')
nltk.download('wordnet')
# Module-level logger shared by the class and functions below.
logger = get_logger(__name__)
| # Custom Preprocessor Class | |
class TextPreprocessor:
    """Clean raw text for NLP.

    Pipeline applied by :meth:`preprocess_text`: lowercase, strip
    punctuation, strip digits, whitespace-tokenize, drop English
    stopwords, lemmatize with WordNet, re-join with single spaces.
    """

    # Pre-compiled once at class creation instead of rebuilding the
    # patterns on every preprocess_text call (hot path: applied per row).
    _PUNCT_RE = re.compile(f'[{re.escape(string.punctuation)}]')
    _DIGIT_RE = re.compile(r'\d+')

    def __init__(self):
        # set() gives O(1) membership tests in the stopword filter below.
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()
        logger.info("TextPreprocessor initialized.")

    def preprocess_text(self, text):
        """Return *text* cleaned by the full pipeline.

        Parameters
        ----------
        text : str
            Raw input text. Assumes a str — callers should drop NaN rows
            first (load_and_preprocess_data does so via dropna).

        Returns
        -------
        str
            Space-joined lemmatized tokens; may be the empty string when
            nothing survives filtering.
        """
        text = text.lower()
        text = self._PUNCT_RE.sub('', text)
        text = self._DIGIT_RE.sub('', text)
        words = text.split()
        # Drop stopwords, lemmatize what survives.
        words = [self.lemmatizer.lemmatize(word) for word in words
                 if word not in self.stop_words]
        return ' '.join(words)
def load_and_preprocess_data(file_path, output_path=None):
    """Load a CSV, clean its 'statement' column, and save the result.

    Parameters
    ----------
    file_path : str
        Path to the input CSV; must contain a 'statement' column.
    output_path : str, optional
        Destination for the cleaned CSV. Defaults to
        ./data/cleaned_data.csv (the original hard-coded location).

    Returns
    -------
    pandas.DataFrame or None
        The frame with an added 'cleaned_statement' column, or None when
        the required column is missing.
    """
    logger.info(f"Loading data from {file_path}")
    df = pd.read_csv(file_path)

    # Drop rows with any missing value before text cleaning, so
    # preprocess_text never sees NaN.
    logger.info("Dropping missing values")
    df.dropna(inplace=True)

    # Bail out early if the expected text column is absent.
    if 'statement' not in df.columns:
        logger.error("The required column 'statement' is missing from the dataset.")
        return None

    preprocessor = TextPreprocessor()

    logger.info("Starting text preprocessing...")
    df['cleaned_statement'] = df['statement'].apply(preprocessor.preprocess_text)
    logger.info("Text preprocessing completed.")

    # Preserve the original default location; create the directory if it
    # does not exist so to_csv cannot fail on a fresh checkout.
    if output_path is None:
        output_path = os.path.join('./data', 'cleaned_data.csv')
    os.makedirs(os.path.dirname(output_path) or '.', exist_ok=True)
    df.to_csv(output_path, index=False)
    logger.info(f"Cleaned data saved to {output_path}")
    return df
| if __name__ == "__main__": | |
| # Path to the downloaded dataset | |
| dataset_path = os.path.join("./data", "Combined_Data.csv") | |
| # Preprocess the data | |
| load_and_preprocess_data(dataset_path) | |