| import pandas as pd | |
| import torch | |
class RegressionDataset:
    """Regression data loaded from a CSV file and split into features/target.

    After construction the instance exposes:
        data: the raw ``pandas.DataFrame`` as returned by ``pd.read_csv``.
        X: NumPy array holding every column except the target.
        y: NumPy array holding the target column.

    Both ``X`` and ``y`` are passed through ``pd.to_numeric(errors='coerce')``,
    so any cell that cannot be parsed as a number becomes NaN instead of
    raising.
    """

    def __init__(self, path: str, file_name: str = 'train.csv', delimiter: str = ',', header: int = 0,
                 encoding: str = 'utf-8', target_col: str = None):
        """
        Initializes the RegressionDataset by loading data from a CSV file.

        Args:
            path (str): Path to the directory containing the CSV file. An empty
                value means the file is looked up relative to the current
                working directory.
            file_name (str): Name of the CSV file. Defaults to 'train.csv'.
            delimiter (str): Delimiter used in the CSV file. Defaults to ','.
            header (int): Row number to use as the column names. Defaults to 0.
            encoding (str): Encoding of the CSV file. Defaults to 'utf-8'.
            target_col (str): Name of the target column. If None, the last column is used.

        Raises:
            ValueError: If the parsed frame is empty, or if ``target_col`` is
                not one of its columns.
        """
        # Joining an empty directory with "/" would yield "/<file_name>", i.e. a
        # path at the filesystem root; use the bare file name in that case.
        # The explicit "/" join is kept otherwise so URL-style paths keep working.
        csv_location = f"{path}/{file_name}" if path else file_name
        self.data = pd.read_csv(csv_location, delimiter=delimiter, header=header, encoding=encoding)

        # Covers files that parse successfully but yield no rows (header only).
        if self.data.empty:
            raise ValueError("CSV file is empty.")

        if target_col is None:
            target_col = self.data.columns[-1]
        if target_col not in self.data.columns:
            raise ValueError(f"CSV must contain a column named '{target_col}'.")

        # errors='coerce' turns unparseable cells into NaN rather than failing.
        features = self.data.drop(columns=[target_col])
        self.X = features.apply(pd.to_numeric, errors='coerce').values
        self.y = pd.to_numeric(self.data[target_col], errors='coerce').values

    def get_data(self):
        """
        Returns the data as PyTorch tensors (X, y).

        Both tensors are created with dtype ``torch.float32``.
        """
        X_tensor = torch.tensor(self.X, dtype=torch.float32)
        y_tensor = torch.tensor(self.y, dtype=torch.float32)
        return X_tensor, y_tensor

    def get_numpy(self):
        """
        Returns the data as NumPy arrays (useful for sympy and R² calculations).

        The arrays are the ones stored on the instance (no copy is made).
        """
        return self.X, self.y