Spaces:

ManishW
/

News-Classifier

Sleeping

App Files Files Community

News-Classifier / newsclassifier /data.py

ManishW

Upload folder using huggingface_hub

022acf4 over 2 years ago

raw

history blame contribute delete

6.43 kB

	import os
	import re
	from typing import Dict, Tuple
	from warnings import filterwarnings

	import pandas as pd
	from sklearn.model_selection import train_test_split

	import torch
	from newsclassifier.config.config import Cfg, logger
	from torch.utils.data import Dataset
	from transformers import RobertaTokenizer

	# Install a global "ignore" filter: no category, message or module argument
	# is given, so it matches every warning raised after this point.
	filterwarnings("ignore")


	def load_dataset(filepath: str, print_i: int = 0) -> pd.DataFrame:
	    """Read a CSV file into a Pandas DataFrame.

	    Args:
	        filepath (str): location of the CSV file.
	        print_i (int): number of leading rows to print; nothing is printed
	            when it is 0.

	    Returns:
	        pd.DataFrame: the data read from ``filepath``.
	    """
	    logger.info("Loading Data.")
	    frame = pd.read_csv(filepath)
	    if not print_i:
	        return frame
	    preview = frame.head(print_i)
	    print(preview, "\n")
	    return frame


	def prepare_data(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
	    """Select the model features and split off the "Headlines" rows.

	    Keeps only the "Title" and "Category" columns, renames "Title" to "Text",
	    and partitions the rows on whether "Category" equals "Headlines".

	    Args:
	        df: original dataframe; it must contain "Title" and "Category" columns.

	    Returns:
	        df: rows whose category is not "Headlines", re-indexed from 0.
	        headlines_df: rows whose category is "Headlines", re-indexed from 0.

	    Raises:
	        Exception: any error hit while selecting or splitting (for example a
	            KeyError for a missing column) is logged and then re-raised.
	    """
	    logger.info("Preparing Data.")
	    try:
	        # rename() returns a new frame, so neither the caller's dataframe nor a
	        # temporary column slice is modified in place.
	        features = df[["Title", "Category"]].rename(columns={"Title": "Text"})
	        headlines_df = features[features["Category"] == "Headlines"].reset_index(drop=True)
	        df = features[features["Category"] != "Headlines"].reset_index(drop=True)
	    except Exception as e:
	        logger.error(e)
	        # Re-raise: continuing would only fail at the return below with an
	        # unrelated UnboundLocalError that hides the real cause.
	        raise

	    return df, headlines_df


	def clean_text(text: str) -> str:
	    """Clean text (lower-casing, stopword removal, punctuation and blank space removal).

	    Args:
	        text (str): raw input text.

	    Returns:
	        str: lower-cased text with the words in ``Cfg.STOPWORDS`` removed, every
	        run of non-alphanumeric characters (spaces included) collapsed to a
	        single space, and no leading or trailing whitespace.
	    """
	    logger.info("Cleaning input text.")
	    # lower-case first; the stopword list is expected to be lower case
	    text = text.lower()

	    # remove stopwords
	    # Words are joined with a bare "|" (regex alternation). An escaped r"\|"
	    # matches a literal pipe character, which would leave every stopword in
	    # place. re.escape keeps characters such as "'" or "." literal.
	    # NOTE(review): a model trained on text cleaned without stopword removal
	    # will see different inputs after this change - confirm or retrain.
	    stopwords = [re.escape(word) for word in Cfg.STOPWORDS if word]
	    if stopwords:
	        # Guarded because an empty alternation matches at every word boundary
	        # and its trailing \s* would delete the spaces between words.
	        stp_pattern = re.compile(r"\b(?:" + "|".join(stopwords) + r")\b\s*")
	        text = stp_pattern.sub("", text)

	    # custom cleaning
	    # Spaces are non-alphanumeric too, so this one substitution both drops
	    # punctuation and collapses repeated spaces.
	    text = re.sub("[^A-Za-z0-9]+", " ", text)

	    # Strip last: punctuation at either end has just been turned into a space.
	    return text.strip()


	def preprocess(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame, Dict, Dict]:
	    """Preprocess the data.

	    Runs prepare_data, builds the label mappings from the remaining
	    categories, cleans the text column and label-encodes the category column.

	    Args:
	        df: Dataframe on which the preprocessing steps need to be performed.

	    Returns:
	        df: Preprocessed data with cleaned "Text" and index-encoded "Category".
	        headlines_df: the "Headlines" rows returned by prepare_data; they are
	            not cleaned or encoded here.
	        class_to_index: class labels to indices mapping.
	        index_to_class: indices to class labels mapping.

	    Raises:
	        Exception: an error during label encoding is logged and re-raised.
	    """
	    df, headlines_df = prepare_data(df)

	    # Indices follow the order in which categories first appear in the data.
	    cats = df["Category"].unique().tolist()
	    class_to_index = {tag: i for i, tag in enumerate(cats)}
	    index_to_class = {i: tag for tag, i in class_to_index.items()}

	    # Work on an explicit copy so the column assignments below never write
	    # into a slice of another frame.
	    df = df[["Text", "Category"]].copy()
	    df["Text"] = df["Text"].apply(clean_text)  # clean text
	    try:
	        df["Category"] = df["Category"].map(class_to_index)  # label encoding
	    except Exception as e:
	        logger.error(e)
	        # Re-raise rather than return a frame whose labels are still strings.
	        raise
	    return df, headlines_df, class_to_index, index_to_class


	def data_split(
	    df: pd.DataFrame,
	    split_size: float = 0.2,
	    stratify_on_target: bool = True,
	    save_dfs: bool = False,
	) -> Tuple[pd.DataFrame, pd.DataFrame]:
	    """Split data into train and test sets.

	    Args:
	        df (pd.DataFrame): Data to be split.
	        split_size (float): train-test split ratio (test ratio).
	        stratify_on_target (bool): Whether to stratify the split on the
	            "Category" column.
	        save_dfs (bool): Whether to save the splits as "train.csv" and
	            "test.csv" under ``Cfg.preprocessed_data_path``.

	    Returns:
	        The train and test dataframes, in that order.

	    Raises:
	        Exception: an error while splitting is logged and re-raised. An error
	            while saving is only logged; the splits are still returned.
	    """
	    logger.info("Splitting Data.")
	    try:
	        stra = df["Category"] if stratify_on_target else None

	        # Fixed random_state keeps the split reproducible between runs.
	        train, test = train_test_split(df, test_size=split_size, random_state=42, stratify=stra)
	        train_ds = pd.DataFrame(train, columns=df.columns)
	        test_ds = pd.DataFrame(test, columns=df.columns)
	    except Exception as e:
	        logger.error(e)
	        # Re-raise: there is nothing to return, and falling through would
	        # fail with an unrelated UnboundLocalError.
	        raise

	    if save_dfs:
	        # Saving stays best-effort: a failure is logged and the splits are
	        # still handed back to the caller.
	        try:
	            logger.info("Saving and storing data splits.")

	            os.makedirs(Cfg.preprocessed_data_path, exist_ok=True)
	            train.to_csv(os.path.join(Cfg.preprocessed_data_path, "train.csv"))
	            test.to_csv(os.path.join(Cfg.preprocessed_data_path, "test.csv"))
	        except Exception as e:
	            logger.error(e)

	    return train_ds, test_ds


	def prepare_input(tokenizer: RobertaTokenizer, text: str) -> Dict:
	    """Encode one text with the given tokenizer and turn each field into a tensor.

	    Args:
	        tokenizer (RobertaTokenizer): tokenizer whose ``encode_plus`` is called.
	        text (str): The input text to be tokenized.

	    Returns:
	        inputs (dict): the object returned by ``encode_plus`` (keys such as
	        'input_ids', 'attention_mask', etc.) with every value replaced by a
	        ``torch.long`` tensor.
	    """
	    logger.info("Tokenizing input text.")
	    # Encoding options all come from the project configuration.
	    encode_options = dict(
	        return_tensors=None,
	        add_special_tokens=Cfg.add_special_tokens,
	        max_length=Cfg.max_len,
	        pad_to_max_length=Cfg.pad_to_max_length,
	        truncation=Cfg.truncation,
	    )
	    inputs = tokenizer.encode_plus(text, **encode_options)
	    # Values are replaced in place so the tokenizer's own return object is
	    # what gets handed back.
	    for field in list(inputs.keys()):
	        inputs[field] = torch.tensor(inputs[field], dtype=torch.long)
	    return inputs


	class NewsDataset(Dataset):
	    """Dataset yielding the tokenized text and the label of one row at a time.

	    Args:
	        ds: table with "Text" and "Category" columns; ``ds[column].values`` is
	            read for both.
	        tokenizer: optional tokenizer passed to ``prepare_input``. When it is
	            omitted, the "roberta-base" RobertaTokenizer is loaded on the
	            first item access and reused afterwards.
	    """

	    def __init__(self, ds, tokenizer=None):
	        self.texts = ds["Text"].values
	        self.labels = ds["Category"].values
	        # Cached tokenizer; stays None until the first item is requested.
	        self._tokenizer = tokenizer

	    def __len__(self):
	        """Return the number of texts."""
	        return len(self.texts)

	    def _get_tokenizer(self):
	        """Return the cached tokenizer, loading "roberta-base" on first use."""
	        # Loading once instead of on every __getitem__ avoids re-reading the
	        # tokenizer files for each sample.
	        if self._tokenizer is None:
	            self._tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
	        return self._tokenizer

	    def __getitem__(self, item):
	        """Return ``(inputs, label)`` for the row at position ``item``.

	        ``inputs`` is the result of ``prepare_input``; the label is a
	        ``torch.float`` tensor.
	        """
	        inputs = prepare_input(self._get_tokenizer(), self.texts[item])
	        labels = torch.tensor(self.labels[item], dtype=torch.float)
	        return inputs, labels


	def collate(inputs: Dict) -> Dict:
	    """Trim every tensor of a batch to the longest unpadded sequence in it.

	    The length is taken from "attention_mask": the largest per-row sum of the
	    mask is the number of real tokens in the longest sequence.

	    Args:
	        inputs (dict): batch of 2-D tensors indexed as ``[row, position]``,
	            including "attention_mask". Assumes padding sits at the end of
	            each row - TODO confirm against the tokenizer's padding side.

	    Returns:
	        modified_inputs (dict): the same dictionary object, with each tensor
	        sliced to the first ``max_len`` positions. A batch without an
	        "attention_mask" entry is returned unchanged.
	    """
	    if "attention_mask" not in inputs:
	        return inputs

	    # Sum the mask, not "input_ids": summing token ids yields a number far
	    # larger than any sequence length, so nothing would ever be trimmed.
	    max_len = int(inputs["attention_mask"].sum(dim=1).max())
	    for key in list(inputs.keys()):
	        inputs[key] = inputs[key][:, :max_len]
	    return inputs


	if __name__ == "__main__":
	    # Manual smoke run: load, preprocess, split (saving the splits), then
	    # print the first dataset item.
	    raw_df = load_dataset(Cfg.dataset_loc)
	    df, headlines_df, class_to_index, index_to_class = preprocess(raw_df)
	    print(df)
	    print(class_to_index)
	    train_ds, val_ds = data_split(df, save_dfs=True)
	    dataset = NewsDataset(df)
	    first_item = dataset[0]
	    print(first_item)