ayush2917 committed on
Commit
b013e35
·
verified ·
1 Parent(s): fcb9629

Update src/preprocessing.py

Browse files
Files changed (1) hide show
  1. src/preprocessing.py +12 -12
src/preprocessing.py CHANGED
@@ -1,16 +1,15 @@
1
  # src/preprocessing.py
2
- import pandas as pd
3
  import nltk
4
  from nltk.corpus import stopwords
5
- from sklearn.model_selection import train_test_split
6
  import logging
7
- from src.config import DATA_PATH, SAMPLE_FRAC
8
 
9
  nltk.download("stopwords")
10
  stop_words = set(stopwords.words("english"))
11
 
12
  def setup_logging():
13
- logging.basicConfig(filename="logs/app.log", level=logging.INFO,
14
  format="%(asctime)s - %(levelname)s - %(message)s")
15
 
16
  def preprocess_text(text):
@@ -26,17 +25,18 @@ def preprocess_text(text):
26
  return text
27
 
28
  def load_and_preprocess_data(sample=False):
29
- """Load dataset, preprocess, and optionally sample."""
30
  setup_logging()
31
  logging.info("Loading dataset")
32
- df = pd.read_csv(DATA_PATH, names=["category", "text"])
33
- df["text"] = df["text"].apply(preprocess_text)
34
- df = df.dropna()
35
 
36
  if sample:
37
  logging.info(f"Sampling {SAMPLE_FRAC*100}% of data")
38
- df = df.groupby("category").apply(lambda x: x.sample(frac=SAMPLE_FRAC)).reset_index(drop=True)
39
 
40
- train_df, test_df = train_test_split(df, test_size=0.2, stratify=df["category"], random_state=42)
41
- logging.info(f"Train size: {len(train_df)}, Test size: {len(test_df)}")
42
- return train_df, test_df
 
 
1
  # src/preprocessing.py
2
+ from datasets import load_dataset
3
  import nltk
4
  from nltk.corpus import stopwords
 
5
  import logging
6
+ from src.config import DATA_PATH, SAMPLE_FRAC, LOG_FILE
7
 
8
# Fetch the NLTK stopword corpus (no-op if already downloaded) and build
# the English stopword set once at import time for reuse by preprocess_text.
nltk.download("stopwords")
english_stopwords = stopwords.words("english")
stop_words = set(english_stopwords)
10
 
11
def setup_logging():
    """Configure root logging to append INFO-and-above records to LOG_FILE.

    Safe to call repeatedly: `logging.basicConfig` is a no-op once the
    root logger already has handlers.
    """
    log_format = "%(asctime)s - %(levelname)s - %(message)s"
    logging.basicConfig(
        filename=LOG_FILE,
        level=logging.INFO,
        format=log_format,
    )
14
 
15
  def preprocess_text(text):
 
25
  return text
26
 
27
def load_and_preprocess_data(sample=False):
    """Load the CSV dataset with Hugging Face `datasets`, clean it, and
    return a stratified 80/20 train/test split.

    Parameters
    ----------
    sample : bool
        When True, keep only a SAMPLE_FRAC fraction of the rows
        (shuffled with seed 42) before splitting.

    Returns
    -------
    (datasets.Dataset, datasets.Dataset)
        Train and test splits. Note: "category" is encoded as a
        ClassLabel (integer) feature; the original string labels are
        recoverable via `dataset.features["category"].int2str`.
    """
    setup_logging()
    logging.info("Loading dataset")
    dataset = load_dataset("csv", data_files=DATA_PATH, column_names=["category", "text"])["train"]

    # Drop rows with missing fields BEFORE mapping, so preprocess_text
    # never receives None (the original order could crash on null text).
    dataset = dataset.filter(lambda x: x["text"] is not None and x["category"] is not None)
    dataset = dataset.map(lambda x: {"text": preprocess_text(x["text"]), "category": x["category"]})

    if sample:
        logging.info(f"Sampling {SAMPLE_FRAC*100}% of data")
        dataset = dataset.shuffle(seed=42).select(range(int(len(dataset) * SAMPLE_FRAC)))

    # stratify_by_column requires a ClassLabel feature; CSV loading yields a
    # plain string Value, so encode the column first or train_test_split
    # raises ValueError.
    dataset = dataset.class_encode_column("category")
    dataset = dataset.train_test_split(test_size=0.2, stratify_by_column="category", seed=42)
    logging.info(f"Train size: {len(dataset['train'])}, Test size: {len(dataset['test'])}")
    return dataset["train"], dataset["test"]