augustocsc
/

gpt2_medium_prefix_682k

Model card Files Files and versions

gpt2_medium_prefix_682k / classes /dataset.py

augustocsc's picture

GPT-2 Medium trained on prefix dataset (682K)

3742716 verified 4 days ago

history blame contribute delete

1.86 kB

	import pandas as pd
	import torch

	class RegressionDataset:
	    """Tabular regression data loaded from a single CSV file.

	    Every column except the target column is treated as a feature. All values
	    are converted with ``pd.to_numeric(..., errors='coerce')``, so cells that
	    cannot be parsed as numbers end up as NaN rather than raising.

	    Attributes:
	        data: The DataFrame exactly as returned by ``pd.read_csv``.
	        X: NumPy array holding the feature columns.
	        y: NumPy array holding the target column.
	    """

	    def __init__(self, path: str, file_name: str = 'train.csv', delimiter: str = ',', header: int = 0,
	                 encoding: str = 'utf-8', target_col: str = None):
	        """
	        Initializes the RegressionDataset by loading data from a CSV file.

	        Args:
	            path (str): Path to the directory containing the CSV file. An empty
	                string means the current working directory.
	            file_name (str): Name of the CSV file. Defaults to 'train.csv'.
	            delimiter (str): Delimiter used in the CSV file. Defaults to ','.
	            header (int): Row number to use as the column names. Defaults to 0.
	            encoding (str): Encoding of the CSV file. Defaults to 'utf-8'.
	            target_col (str): Name of the target column. If None, the last column is used.

	        Raises:
	            ValueError: If the loaded table is empty, or if ``target_col`` is
	                not one of its columns.
	        """
	        # Imported locally so this class does not depend on a module-level import.
	        import os

	        # Joining with os.path.join instead of f"{path}/{file_name}" keeps an
	        # empty ``path`` relative to the working directory (the f-string would
	        # produce "/train.csv", i.e. the filesystem root) and avoids a doubled
	        # separator when ``path`` already ends with one.
	        csv_path = os.path.join(path, file_name)
	        self.data = pd.read_csv(csv_path, delimiter=delimiter, header=header, encoding=encoding)

	        if self.data.empty:
	            raise ValueError("CSV file is empty.")

	        # Fall back to the last column when the caller did not name a target.
	        if target_col is None:
	            target_col = self.data.columns[-1]

	        if target_col not in self.data.columns:
	            raise ValueError(f"CSV must contain a column named '{target_col}'.")

	        # Non-numeric cells are coerced to NaN, not rejected.
	        features = self.data.drop(columns=[target_col])
	        self.X = features.apply(pd.to_numeric, errors='coerce').to_numpy()
	        self.y = pd.to_numeric(self.data[target_col], errors='coerce').to_numpy()

	    def get_data(self):
	        """
	        Returns the data as a pair of float32 PyTorch tensors (X, y).
	        """
	        X_tensor = torch.tensor(self.X, dtype=torch.float32)
	        y_tensor = torch.tensor(self.y, dtype=torch.float32)
	        return X_tensor, y_tensor

	    def get_numpy(self):
	        """
	        Returns the data as NumPy arrays (useful for sympy and R² calculations).
	        """
	        return self.X, self.y