# Page header captured together with this file (not part of the code):
#   augustocsc's picture
#   GPT-2 Medium trained on prefix dataset (682K)
#   3742716 verified
import pandas as pd
import torch
class RegressionDataset:
    """Regression data (features and target) loaded from a CSV file.

    After construction the instance exposes:

    * ``data`` -- the DataFrame exactly as returned by ``pd.read_csv``.
    * ``X``    -- NumPy array built from every column except the target.
    * ``y``    -- NumPy array built from the target column.

    Both ``X`` and ``y`` are converted with ``pd.to_numeric(errors='coerce')``,
    so cells that cannot be parsed as numbers become NaN instead of raising.
    """

    def __init__(self, path: str, file_name: str = 'train.csv', delimiter: str = ',', header: int = 0,
                 encoding: str = 'utf-8', target_col: str = None):
        """
        Initializes the RegressionDataset by loading data from a CSV file.

        Args:
            path (str): Path to the directory containing the CSV file. An empty
                string means ``file_name`` is used as given (i.e. relative to
                the current working directory).
            file_name (str): Name of the CSV file. Defaults to 'train.csv'.
            delimiter (str): Delimiter used in the CSV file. Defaults to ','.
            header (int): Row number to use as the column names. Defaults to 0.
            encoding (str): Encoding of the CSV file. Defaults to 'utf-8'.
            target_col (str): Name of the target column. If None, the last column is used.

        Raises:
            ValueError: If the loaded table is empty, or if ``target_col`` is
                not one of its columns.
        """
        # Joining an empty directory with "/" would yield "/<file_name>",
        # i.e. a file at the filesystem root; use the bare file name instead.
        csv_path = f"{path}/{file_name}" if path else file_name
        self.data = pd.read_csv(csv_path, delimiter=delimiter, header=header, encoding=encoding)

        if self.data.empty:
            raise ValueError("CSV file is empty.")

        # Default to the last column as the regression target.
        if target_col is None:
            target_col = self.data.columns[-1]
        if target_col not in self.data.columns:
            raise ValueError(f"CSV must contain a column named '{target_col}'.")

        # errors='coerce' turns unparseable cells into NaN rather than raising.
        self.X = self.data.drop(columns=[target_col]).apply(pd.to_numeric, errors='coerce').values
        self.y = pd.to_numeric(self.data[target_col], errors='coerce').values

    def get_data(self):
        """
        Returns the data as PyTorch tensors (X, y).

        Returns:
            tuple: ``(X_tensor, y_tensor)``, both created with dtype
            ``torch.float32`` from ``self.X`` and ``self.y``.
        """
        X_tensor = torch.tensor(self.X, dtype=torch.float32)
        y_tensor = torch.tensor(self.y, dtype=torch.float32)
        return X_tensor, y_tensor

    def get_numpy(self):
        """
        Returns the data as NumPy arrays (useful for sympy and R² calculations).

        Returns:
            tuple: ``(self.X, self.y)`` -- the stored arrays themselves, not copies.
        """
        return self.X, self.y