ayush2917 committed on
Commit
b013e35
·
verified ·
1 Parent(s): fcb9629

Update src/preprocessing.py

Browse files
Files changed (1) hide show
  1. src/preprocessing.py +12 -12
src/preprocessing.py CHANGED
@@ -1,16 +1,15 @@
1
  # src/preprocessing.py
2
- import pandas as pd
3
  import nltk
4
  from nltk.corpus import stopwords
5
- from sklearn.model_selection import train_test_split
6
  import logging
7
- from src.config import DATA_PATH, SAMPLE_FRAC
8
 
9
  nltk.download("stopwords")
10
  stop_words = set(stopwords.words("english"))
11
 
12
  def setup_logging():
13
- logging.basicConfig(filename="logs/app.log", level=logging.INFO,
14
  format="%(asctime)s - %(levelname)s - %(message)s")
15
 
16
  def preprocess_text(text):
@@ -26,17 +25,18 @@ def preprocess_text(text):
26
  return text
27
 
28
  def load_and_preprocess_data(sample=False):
29
- """Load dataset, preprocess, and optionally sample."""
30
  setup_logging()
31
  logging.info("Loading dataset")
32
- df = pd.read_csv(DATA_PATH, names=["category", "text"])
33
- df["text"] = df["text"].apply(preprocess_text)
34
- df = df.dropna()
35
 
36
  if sample:
37
  logging.info(f"Sampling {SAMPLE_FRAC*100}% of data")
38
- df = df.groupby("category").apply(lambda x: x.sample(frac=SAMPLE_FRAC)).reset_index(drop=True)
39
 
40
- train_df, test_df = train_test_split(df, test_size=0.2, stratify=df["category"], random_state=42)
41
- logging.info(f"Train size: {len(train_df)}, Test size: {len(test_df)}")
42
- return train_df, test_df
 
 
1
  # src/preprocessing.py
2
+ from datasets import load_dataset
3
  import nltk
4
  from nltk.corpus import stopwords
 
5
  import logging
6
+ from src.config import DATA_PATH, SAMPLE_FRAC, LOG_FILE
7
 
8
# Fetch the NLTK stopword corpus (no-op if already downloaded) and build
# the English stopword set once at import time for reuse by preprocess_text.
nltk.download("stopwords")
english_stopwords = stopwords.words("english")
stop_words = set(english_stopwords)
10
 
11
def setup_logging():
    """Configure root logging to append INFO-and-above records to LOG_FILE.

    Safe to call repeatedly: `logging.basicConfig` is a no-op once the
    root logger already has handlers.
    """
    log_format = "%(asctime)s - %(levelname)s - %(message)s"
    logging.basicConfig(
        filename=LOG_FILE,
        level=logging.INFO,
        format=log_format,
    )
14
 
15
  def preprocess_text(text):
 
25
  return text
26
 
27
def load_and_preprocess_data(sample=False):
    """Load the CSV dataset with Hugging Face `datasets`, clean it, and
    return a stratified 80/20 train/test split.

    Parameters
    ----------
    sample : bool
        When True, keep only a SAMPLE_FRAC fraction of the rows
        (shuffled with seed 42) before splitting.

    Returns
    -------
    (datasets.Dataset, datasets.Dataset)
        Train and test splits. Note: "category" is encoded as a
        ClassLabel (integer) feature; the original string labels are
        recoverable via `dataset.features["category"].int2str`.
    """
    setup_logging()
    logging.info("Loading dataset")
    dataset = load_dataset("csv", data_files=DATA_PATH, column_names=["category", "text"])["train"]

    # Drop rows with missing fields BEFORE mapping, so preprocess_text
    # never receives None (the original order could crash on null text).
    dataset = dataset.filter(lambda x: x["text"] is not None and x["category"] is not None)
    dataset = dataset.map(lambda x: {"text": preprocess_text(x["text"]), "category": x["category"]})

    if sample:
        logging.info(f"Sampling {SAMPLE_FRAC*100}% of data")
        dataset = dataset.shuffle(seed=42).select(range(int(len(dataset) * SAMPLE_FRAC)))

    # stratify_by_column requires a ClassLabel feature; CSV loading yields a
    # plain string Value, so encode the column first or train_test_split
    # raises ValueError.
    dataset = dataset.class_encode_column("category")
    dataset = dataset.train_test_split(test_size=0.2, stratify_by_column="category", seed=42)
    logging.info(f"Train size: {len(dataset['train'])}, Test size: {len(dataset['test'])}")
    return dataset["train"], dataset["test"]