Spaces:
No application file
No application file
| # src/preprocessing.py | |
| from datasets import load_dataset | |
| import nltk | |
| from nltk.corpus import stopwords | |
| import logging | |
| from src.config import DATA_PATH, SAMPLE_FRAC, LOG_FILE | |
# Fetch the NLTK stop-word corpus (no-op if already downloaded) and cache
# the English stop words as a set for O(1) membership checks.
nltk.download("stopwords")
stop_words = {word for word in stopwords.words("english")}
def setup_logging():
    """Configure root logging to append timestamped INFO records to LOG_FILE."""
    log_format = "%(asctime)s - %(levelname)s - %(message)s"
    logging.basicConfig(filename=LOG_FILE, level=logging.INFO, format=log_format)
def preprocess_text(text):
    """Clean text: lowercase, strip punctuation, and drop English stop words.

    Args:
        text: Raw document text. Non-string values (e.g. ``None`` from a
            malformed CSV row) are returned unchanged so the caller's
            ``filter`` step can discard them.

    Returns:
        The cleaned string, or the original value if it was not a string.
    """
    # The original wrapped the whole body in a bare `except Exception`, which
    # silently swallowed real bugs. The only realistic failure here is a
    # non-string input, so guard for that explicitly instead.
    if not isinstance(text, str):
        logging.error(f"Error preprocessing text: expected str, got {type(text).__name__}")
        return text
    text = text.lower()
    # Keep only alphanumerics and whitespace (i.e. drop punctuation).
    text = "".join([c for c in text if c.isalnum() or c.isspace()])
    # Drop stop words; relies on the module-level `stop_words` set.
    text = " ".join([word for word in text.split() if word not in stop_words])
    logging.debug(f"Preprocessed text: {text[:50]}...")
    return text
def load_and_preprocess_data(sample=False):
    """Load the CSV dataset, clean the text, and return a stratified train/test split.

    Args:
        sample: When True, keep only a SAMPLE_FRAC fraction of the rows
            (shuffled with a fixed seed) before splitting.

    Returns:
        A ``(train_dataset, test_dataset)`` tuple of Hugging Face Datasets.
        The ``category`` column is encoded as a ``ClassLabel`` (integer ids),
        which stratified splitting requires.
    """
    setup_logging()
    logging.info("Loading dataset")
    # NOTE(review): column_names is supplied explicitly, so if the CSV at
    # DATA_PATH has a header row it will be ingested as a data row — confirm
    # the file is headerless.
    dataset = load_dataset("csv", data_files=DATA_PATH, column_names=["category", "text"])["train"]
    dataset = dataset.map(lambda x: {"text": preprocess_text(x["text"]), "category": x["category"]})
    # Drop rows where either field is missing (preprocess_text passes None through).
    dataset = dataset.filter(lambda x: x["text"] is not None and x["category"] is not None)
    if sample:
        logging.info(f"Sampling {SAMPLE_FRAC*100}% of data")
        dataset = dataset.shuffle(seed=42).select(range(int(len(dataset) * SAMPLE_FRAC)))
    # Bug fix: train_test_split(stratify_by_column=...) requires the column to
    # be a ClassLabel feature; a CSV-loaded string column raises ValueError.
    # Encode it before splitting.
    dataset = dataset.class_encode_column("category")
    dataset = dataset.train_test_split(test_size=0.2, stratify_by_column="category", seed=42)
    logging.info(f"Train size: {len(dataset['train'])}, Test size: {len(dataset['test'])}")
    return dataset["train"], dataset["test"]