Spaces:
Sleeping
Sleeping
| import os | |
| import re | |
| from typing import Dict, Tuple | |
| from warnings import filterwarnings | |
| import pandas as pd | |
| from sklearn.model_selection import train_test_split | |
| import torch | |
| from newsclassifier.config.config import Cfg, logger | |
| from torch.utils.data import Dataset | |
| from transformers import RobertaTokenizer | |
| filterwarnings("ignore") | |
def load_dataset(filepath: str, print_i: int = 0) -> pd.DataFrame:
    """Read a CSV file into a Pandas DataFrame.

    Args:
        filepath (str): Location of the CSV file.
        print_i (int): Number of leading rows to print; nothing is printed when 0.

    Returns:
        pd.DataFrame: The loaded data.
    """
    logger.info("Loading Data.")
    frame = pd.read_csv(filepath)
    if not print_i:
        return frame
    # Preview the first `print_i` rows, followed by a blank line.
    print(frame.head(print_i), "\n")
    return frame
def prepare_data(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Select the model features and split off the "Headlines" instances.

    Keeps only the "Title" and "Category" columns, renames "Title" to "Text",
    and separates rows whose category is "Headlines" from the rest.

    Args:
        df: Original dataframe; must contain "Title" and "Category" columns.

    Returns:
        df: New dataframe ("Text", "Category") without "Headlines" rows.
        headlines_df: Dataframe containing only the "Headlines" rows.

    Raises:
        Exception: Any error raised while selecting/renaming/filtering (for
            example ``KeyError`` when a required column is missing) is logged
            and re-raised.
    """
    logger.info("Preparing Data.")
    try:
        # rename() without inplace returns a new frame, so neither the caller's
        # dataframe nor a temporary column slice is mutated.
        selected = df[["Title", "Category"]].rename(columns={"Title": "Text"})
        headlines_df = selected[selected["Category"] == "Headlines"].reset_index(drop=True)
        df = selected[selected["Category"] != "Headlines"].reset_index(drop=True)
    except Exception as e:
        # Surface the real failure; falling through would only raise a
        # misleading UnboundLocalError at the return statement.
        logger.error(e)
        raise
    return df, headlines_df
def clean_text(text: str) -> str:
    """Clean text: lower-case, drop stopwords, strip punctuation and extra blanks.

    Args:
        text (str): Raw input text.

    Returns:
        str: Lower-cased text with the stopwords in ``Cfg.STOPWORDS`` removed,
            every run of non-alphanumeric characters replaced by a single
            space, and no leading or trailing whitespace.
    """
    logger.info("Cleaning input text.")
    # Lower-case first: the stopword list is expected to be lower case.
    text = text.lower()

    # Remove stopwords. Each word is escaped so regex metacharacters in the
    # list are matched literally. The guard avoids an empty alternation, which
    # would match at every word boundary and swallow the following whitespace.
    stopwords = Cfg.STOPWORDS
    if stopwords:
        stp_pattern = re.compile(r"\b(?:" + "|".join(map(re.escape, stopwords)) + r")\b\s*")
        text = stp_pattern.sub("", text)

    # Replace every run of non-alphanumeric characters (whitespace included)
    # with one space, then strip. Stripping last matters: punctuation at either
    # end of the text becomes a space that must not survive in the result.
    text = re.sub("[^A-Za-z0-9]+", " ", text)
    return text.strip()
def preprocess(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame, Dict, Dict]:
    """Preprocess the data: feature selection, text cleaning and label encoding.

    Args:
        df: Dataframe on which the preprocessing steps need to be performed.

    Returns:
        df: Preprocessed data with a cleaned "Text" column and an integer
            encoded "Category" column.
        headlines_df: The "Headlines" instances split off by ``prepare_data``
            (not cleaned or encoded here).
        class_to_index: Class labels to indices mapping.
        index_to_class: Indices to class labels mapping.

    Raises:
        Exception: An error during label encoding is logged and re-raised.
    """
    df, headlines_df = prepare_data(df)

    # Indices follow the order in which categories first appear in the data.
    cats = df["Category"].unique().tolist()
    class_to_index = {tag: i for i, tag in enumerate(cats)}
    index_to_class = {i: tag for tag, i in class_to_index.items()}

    # Work on an explicit copy so the column assignments below never target a
    # slice of another frame.
    df = df[["Text", "Category"]].copy()
    df["Text"] = df["Text"].apply(clean_text)  # clean text
    try:
        df["Category"] = df["Category"].map(class_to_index)  # label encoding
    except Exception as e:
        # Do not hand back a frame with un-encoded labels.
        logger.error(e)
        raise
    return df, headlines_df, class_to_index, index_to_class
def data_split(
    df: pd.DataFrame,
    split_size: float = 0.2,
    stratify_on_target: bool = True,
    save_dfs: bool = False,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Split data into train and test sets.

    Args:
        df (pd.DataFrame): Data to be split.
        split_size (float): Fraction of the data placed in the test set.
        stratify_on_target (bool): Whether to stratify the split on the
            "Category" column.
        save_dfs (bool): Whether to write the splits as "train.csv" and
            "test.csv" under ``Cfg.preprocessed_data_path``.

    Returns:
        Tuple[pd.DataFrame, pd.DataFrame]: The train and test dataframes.

    Raises:
        Exception: Any error raised while splitting is logged and re-raised.
            Errors while saving the splits are only logged.
    """
    logger.info("Splitting Data.")
    try:
        stra = df["Category"] if stratify_on_target else None
        # Fixed random_state keeps the split reproducible between runs.
        train, test = train_test_split(df, test_size=split_size, random_state=42, stratify=stra)
    except Exception as e:
        # Without a split there is nothing to return; re-raise the real error
        # instead of failing later with an UnboundLocalError.
        logger.error(e)
        raise

    train_ds = pd.DataFrame(train, columns=df.columns)
    test_ds = pd.DataFrame(test, columns=df.columns)

    if save_dfs:
        logger.info("Saving and storing data splits.")
        try:
            os.makedirs(Cfg.preprocessed_data_path, exist_ok=True)
            train_ds.to_csv(os.path.join(Cfg.preprocessed_data_path, "train.csv"))
            test_ds.to_csv(os.path.join(Cfg.preprocessed_data_path, "test.csv"))
        except Exception as e:
            # Saving is best-effort: the in-memory splits are still returned.
            logger.error(e)
    return train_ds, test_ds
def prepare_input(tokenizer: RobertaTokenizer, text: str) -> Dict:
    """Encode one text with the given tokenizer and convert the result to tensors.

    Args:
        tokenizer (RobertaTokenizer): Tokenizer used to encode the input.
        text (str): The input text to be tokenized.

    Returns:
        inputs (dict): The mapping returned by ``tokenizer.encode_plus`` with
            every value converted to a ``torch.long`` tensor.
    """
    logger.info("Tokenizing input text.")
    # All encoding options come from the project configuration.
    # NOTE(review): `pad_to_max_length` appears to be deprecated in recent
    # transformers releases in favour of `padding=...` - confirm before upgrading.
    encode_options = dict(
        return_tensors=None,
        add_special_tokens=Cfg.add_special_tokens,
        max_length=Cfg.max_len,
        pad_to_max_length=Cfg.pad_to_max_length,
        truncation=Cfg.truncation,
    )
    inputs = tokenizer.encode_plus(text, **encode_options)
    # Replace each encoded field in place with its long-tensor equivalent.
    for field in list(inputs.keys()):
        inputs[field] = torch.tensor(inputs[field], dtype=torch.long)
    return inputs
class NewsDataset(Dataset):
    """Torch dataset yielding (tokenized text, label) pairs from a dataframe."""

    def __init__(self, ds, tokenizer=None):
        """Store the texts, the labels and the tokenizer.

        Args:
            ds: Dataframe with a "Text" column and a "Category" column.
            tokenizer: Optional tokenizer to reuse. When omitted, the
                "roberta-base" tokenizer is loaded once here rather than on
                every item access.
        """
        self.texts = ds["Text"].values
        self.labels = ds["Category"].values
        # Loading a pretrained tokenizer is expensive; do it once per dataset.
        if tokenizer is None:
            tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
        self.tokenizer = tokenizer

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, item):
        """Return the tokenized inputs and the label tensor for index `item`."""
        inputs = prepare_input(self.tokenizer, self.texts[item])
        # NOTE(review): labels are emitted as float tensors - confirm that the
        # training loss expects float rather than long class indices.
        labels = torch.tensor(self.labels[item], dtype=torch.float)
        return inputs, labels
def collate(inputs: Dict) -> Dict:
    """Trim every tensor of a batch to the longest real sequence in that batch.

    The length of each sequence is the number of attended positions, i.e. the
    row sum of ``attention_mask``. (Summing ``input_ids`` would add up token
    ids, not positions, and never trim anything.)

    Args:
        inputs (dict): Batch of 2-D tensors keyed by input name; must contain
            "attention_mask". Assumes padding sits at the end of each sequence.

    Returns:
        modified_inputs (dict): The same mapping, updated in place, with every
            tensor cut to the longest attended length along dimension 1.
    """
    max_len = int(inputs["attention_mask"].sum(axis=1).max())
    for k, v in inputs.items():
        inputs[k] = v[:, :max_len]
    return inputs
if __name__ == "__main__":
    # Manual smoke run: load, preprocess, split (saving the splits), then
    # print the first dataset item.
    raw_df = load_dataset(Cfg.dataset_loc)
    df, headlines_df, class_to_index, index_to_class = preprocess(raw_df)
    print(df)
    print(class_to_index)

    train_ds, val_ds = data_split(df, save_dfs=True)

    dataset = NewsDataset(df)
    print(dataset[0])