# src/preprocessing.py
"""Load, clean, and split the text-classification dataset.

Pipeline: load a two-column CSV (category, text) via Hugging Face
``datasets``, normalize the text (lowercase, strip punctuation and
English stop words), optionally subsample, then produce a stratified
80/20 train/test split.
"""

import logging

import nltk
from datasets import load_dataset
from nltk.corpus import stopwords

from src.config import DATA_PATH, LOG_FILE, SAMPLE_FRAC

# Fetch the stop-word corpus once at import time; quiet=True suppresses
# the repeated "already up-to-date" console noise on every import.
nltk.download("stopwords", quiet=True)
stop_words = set(stopwords.words("english"))


def setup_logging():
    """Configure root logging to LOG_FILE at INFO level."""
    logging.basicConfig(
        filename=LOG_FILE,
        level=logging.INFO,
        format="%(asctime)s - %(levelname)s - %(message)s",
    )


def preprocess_text(text):
    """Clean text: lowercase, remove punctuation, stop words.

    Returns the cleaned string. On any failure (e.g. ``text`` is None or
    not a string) the original value is logged and returned unchanged so
    the caller's filter step can drop the bad row.
    """
    try:
        text = text.lower()
        # Keep alphanumerics and whitespace only — drops punctuation.
        text = "".join(c for c in text if c.isalnum() or c.isspace())
        text = " ".join(word for word in text.split() if word not in stop_words)
        # Lazy %-formatting: the slice/format work is skipped unless DEBUG is on.
        logging.debug("Preprocessed text: %s...", text[:50])
        return text
    except Exception as e:
        # Best-effort: record the failure and hand the row back untouched
        # for downstream filtering rather than aborting the whole map().
        logging.error("Error preprocessing text: %s", e)
        return text


def load_and_preprocess_data(sample=False):
    """Load dataset using Hugging Face datasets, preprocess, and sample.

    Args:
        sample: if True, keep a random SAMPLE_FRAC fraction of rows
            (seeded shuffle) before splitting.

    Returns:
        (train_dataset, test_dataset) — a stratified 80/20 split. Note
        that ``category`` is label-encoded (ClassLabel ints) in the
        returned datasets; see the cast below.
    """
    setup_logging()
    logging.info("Loading dataset")
    dataset = load_dataset(
        "csv", data_files=DATA_PATH, column_names=["category", "text"]
    )["train"]
    dataset = dataset.map(
        lambda x: {"text": preprocess_text(x["text"]), "category": x["category"]}
    )
    # Drop rows where either field is missing (preprocess_text passes
    # None through unchanged on failure).
    dataset = dataset.filter(
        lambda x: x["text"] is not None and x["category"] is not None
    )
    if sample:
        logging.info("Sampling %s%% of data", SAMPLE_FRAC * 100)
        dataset = dataset.shuffle(seed=42).select(
            range(int(len(dataset) * SAMPLE_FRAC))
        )
    # BUG FIX: stratify_by_column requires a ClassLabel feature, but the
    # CSV loader types "category" as a plain string Value, which makes
    # train_test_split raise ValueError. Cast it to ClassLabel first.
    dataset = dataset.class_encode_column("category")
    # Split into train/test
    dataset = dataset.train_test_split(
        test_size=0.2, stratify_by_column="category", seed=42
    )
    logging.info(
        "Train size: %d, Test size: %d",
        len(dataset["train"]),
        len(dataset["test"]),
    )
    return dataset["train"], dataset["test"]