from typing import Optional, Tuple

import pandas as pd
import torch


class RegressionDataset:
    """Tabular regression data loaded from a CSV file.

    After construction the instance exposes:

    * ``data`` -- the raw ``DataFrame`` returned by ``pandas.read_csv``.
    * ``X`` -- NumPy array of every column except the target.
    * ``y`` -- NumPy array of the target column.

    Both ``X`` and ``y`` are converted with ``pd.to_numeric(errors='coerce')``,
    so any cell that cannot be parsed as a number becomes ``NaN`` instead of
    raising.
    """

    def __init__(self, path: str, file_name: str = 'train.csv', delimiter: str = ',',
                 header: int = 0, encoding: str = 'utf-8',
                 target_col: Optional[str] = None):
        """
        Initializes the RegressionDataset by loading data from a CSV file.

        Args:
            path (str): Path to the directory containing the CSV file. An empty
                string means "relative to the current working directory".
            file_name (str): Name of the CSV file. Defaults to 'train.csv'.
            delimiter (str): Delimiter used in the CSV file. Defaults to ','.
            header (int): Row number to use as the column names. Defaults to 0.
            encoding (str): Encoding of the CSV file. Defaults to 'utf-8'.
            target_col (str): Name of the target column. If None, the last
                column is used.

        Raises:
            ValueError: If the loaded table is empty, or if ``target_col`` is
                not one of its columns.
        """
        # Join with '/' exactly as before, except when the directory is empty:
        # "" + "/" + name would otherwise point at the filesystem root.
        directory = str(path)
        csv_path = f"{directory}/{file_name}" if directory else file_name

        self.data = pd.read_csv(csv_path, delimiter=delimiter, header=header,
                                encoding=encoding)
        if self.data.empty:
            raise ValueError("CSV file is empty.")

        if target_col is None:
            target_col = self.data.columns[-1]
        if target_col not in self.data.columns:
            raise ValueError(f"CSV must contain a column named '{target_col}'.")

        # errors='coerce' turns unparseable cells into NaN rather than raising.
        features = self.data.drop(columns=[target_col])
        self.X = features.apply(pd.to_numeric, errors='coerce').values
        self.y = pd.to_numeric(self.data[target_col], errors='coerce').values

    def get_data(self) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Returns the data as float32 PyTorch tensors (X, y).
        """
        X_tensor = torch.tensor(self.X, dtype=torch.float32)
        y_tensor = torch.tensor(self.y, dtype=torch.float32)
        return X_tensor, y_tensor

    def get_numpy(self):
        """
        Returns the data as NumPy arrays (useful for sympy and R² calculations).

        The stored ``X`` and ``y`` arrays are returned directly, not copies.
        """
        return self.X, self.y