Spaces:
No application file
No application file
Update src/preprocessing.py
Browse files- src/preprocessing.py +12 -12
src/preprocessing.py
CHANGED
|
@@ -1,16 +1,15 @@
|
|
| 1 |
# src/preprocessing.py
|
| 2 |
-
|
| 3 |
import nltk
|
| 4 |
from nltk.corpus import stopwords
|
| 5 |
-
from sklearn.model_selection import train_test_split
|
| 6 |
import logging
|
| 7 |
-
from src.config import DATA_PATH, SAMPLE_FRAC
|
| 8 |
|
| 9 |
nltk.download("stopwords")
|
| 10 |
stop_words = set(stopwords.words("english"))
|
| 11 |
|
| 12 |
def setup_logging():
|
| 13 |
-
logging.basicConfig(filename=
|
| 14 |
format="%(asctime)s - %(levelname)s - %(message)s")
|
| 15 |
|
| 16 |
def preprocess_text(text):
|
|
@@ -26,17 +25,18 @@ def preprocess_text(text):
|
|
| 26 |
return text
|
| 27 |
|
| 28 |
def load_and_preprocess_data(sample=False):
|
| 29 |
-
"""Load dataset, preprocess, and
|
| 30 |
setup_logging()
|
| 31 |
logging.info("Loading dataset")
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
|
| 36 |
if sample:
|
| 37 |
logging.info(f"Sampling {SAMPLE_FRAC*100}% of data")
|
| 38 |
-
|
| 39 |
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
|
|
|
|
|
| 1 |
# src/preprocessing.py
|
| 2 |
+
from datasets import load_dataset
|
| 3 |
import nltk
|
| 4 |
from nltk.corpus import stopwords
|
|
|
|
| 5 |
import logging
|
| 6 |
+
from src.config import DATA_PATH, SAMPLE_FRAC, LOG_FILE
|
| 7 |
|
| 8 |
# Fetch the NLTK stopword corpus at import time (no-op if already cached locally).
nltk.download("stopwords")
# Module-level English stopword set, built once so callers don't rebuild it per text.
stop_words = set(stopwords.words("english"))
|
| 10 |
|
| 11 |
def setup_logging():
    """Configure root logging: INFO-level records written to LOG_FILE.

    Uses logging.basicConfig, so it only takes effect the first time the
    root logger is configured in the process.
    """
    fmt = "%(asctime)s - %(levelname)s - %(message)s"
    logging.basicConfig(filename=LOG_FILE, level=logging.INFO, format=fmt)
|
| 14 |
|
| 15 |
def preprocess_text(text):
|
|
|
|
| 25 |
return text
|
| 26 |
|
| 27 |
def load_and_preprocess_data(sample=False):
    """Load the CSV dataset with Hugging Face ``datasets``, clean it, and split it.

    Parameters
    ----------
    sample : bool, optional
        When True, keep only ``SAMPLE_FRAC`` of the rows (shuffled with a
        fixed seed for reproducibility) before splitting.

    Returns
    -------
    tuple
        ``(train_dataset, test_dataset)`` — an 80/20 split stratified by the
        ``category`` column. Note that stratification encodes ``category`` as
        integer ClassLabel ids in the returned datasets.
    """
    setup_logging()
    logging.info("Loading dataset")
    dataset = load_dataset("csv", data_files=DATA_PATH, column_names=["category", "text"])["train"]

    # Drop rows with missing fields BEFORE mapping: otherwise preprocess_text
    # would be called with None and crash before the filter could remove the row.
    dataset = dataset.filter(lambda x: x["text"] is not None and x["category"] is not None)
    dataset = dataset.map(lambda x: {"text": preprocess_text(x["text"]), "category": x["category"]})

    if sample:
        logging.info(f"Sampling {SAMPLE_FRAC*100}% of data")
        # Fixed seed keeps the sampled subset reproducible across runs.
        dataset = dataset.shuffle(seed=42).select(range(int(len(dataset) * SAMPLE_FRAC)))

    # train_test_split(stratify_by_column=...) requires a ClassLabel feature;
    # CSV loading yields plain strings, so encode the column first or the split raises.
    dataset = dataset.class_encode_column("category")

    # Split into train/test
    dataset = dataset.train_test_split(test_size=0.2, stratify_by_column="category", seed=42)
    logging.info(f"Train size: {len(dataset['train'])}, Test size: {len(dataset['test'])}")
    return dataset["train"], dataset["test"]
|