# 42Cummer's picture
# Upload 15 files
# 2f560eb verified
import pandas as pd # type: ignore
import numpy as np # type: ignore
import torch # type: ignore
from torch.utils.data import Dataset # type: ignore
from sklearn.preprocessing import StandardScaler # type: ignore
class VFVDataset(Dataset):
    """Sliding windows of standardized log returns read from a price CSV.

    The CSV's ``Close`` column is converted to log returns, z-scored with a
    ``StandardScaler`` (kept on ``self.scaler`` so callers can invert the
    transform), and cut into overlapping windows of ``window_size``
    consecutive returns. Each item is one window as a 1-D float32 tensor.

    Args:
        csv_path: Path (or anything ``pandas.read_csv`` accepts) of the CSV.
        window_size: Number of consecutive returns per window. Must be >= 1.

    Raises:
        ValueError: If ``window_size`` is smaller than 1, or if the file
            yields fewer than two usable prices (so no return can be formed).
        KeyError: If the CSV has no ``Close`` column.
    """

    def __init__(self, csv_path, window_size=15):
        super().__init__()
        if window_size < 1:
            raise ValueError(f"window_size must be >= 1, got {window_size}")

        # 1. Load the CSV. Lines 1 and 2 (0-indexed, line 0 being the header)
        #    are skipped; per the original note they hold the 'Ticker' row and
        #    an empty 'Datetime' row rather than data.
        df = pd.read_csv(csv_path, skiprows=[1, 2])

        # 2. Force 'Close' to numeric; entries that fail to parse become NaN
        #    and are dropped.
        prices = (
            pd.to_numeric(df['Close'], errors='coerce')
            .dropna()
            .to_numpy(dtype=np.float64)
        )

        # 3. Log returns: log(P_t / P_{t-1}).
        returns = self._log_returns(prices)
        if returns.size == 0:
            # Fail here with a clear message instead of letting the scaler
            # complain about an empty array.
            raise ValueError(
                "Need at least two valid 'Close' prices to compute returns."
            )

        # 4. Normalize to z-scores (mean=0, std=1). The fitted scaler is kept
        #    so the scaling can be undone or re-applied later.
        self.scaler = StandardScaler()
        returns_scaled = self.scaler.fit_transform(
            returns.reshape(-1, 1)
        ).flatten()

        # 5. Build every contiguous window, including the final one.
        self.windows = torch.tensor(
            self._make_windows(returns_scaled, window_size),
            dtype=torch.float32,
        )
        # NOTE(review): the message says "minutes"; that presumes one row per
        # minute in the CSV -- confirm against the data source.
        print(f"Dataset Loaded: {len(self.windows)} windows of {window_size} minutes.")

    @staticmethod
    def _log_returns(prices):
        """Return the finite log returns ``log(P_t / P_{t-1})`` of *prices*."""
        prices = np.asarray(prices, dtype=np.float64)
        if prices.size < 2:
            return np.empty(0, dtype=np.float64)
        # Non-positive prices make log() produce NaN/-inf; silence the
        # warnings and drop those entries rather than feed them to the scaler.
        with np.errstate(divide='ignore', invalid='ignore'):
            returns = np.diff(np.log(prices))
        return returns[np.isfinite(returns)]

    @staticmethod
    def _make_windows(series, window_size):
        """Return all length-``window_size`` contiguous windows of *series*.

        The result has shape ``(len(series) - window_size + 1, window_size)``,
        or ``(0, window_size)`` when the series is shorter than one window.
        """
        series = np.asarray(series)
        n_windows = series.shape[0] - window_size + 1
        if n_windows <= 0:
            return np.empty((0, window_size), dtype=series.dtype)
        # Row i of the index grid is [i, i+1, ..., i+window_size-1].
        starts = np.arange(n_windows)[:, None]
        offsets = np.arange(window_size)[None, :]
        return series[starts + offsets]

    def __len__(self):
        """Return the number of windows."""
        return len(self.windows)

    def __getitem__(self, idx):
        """Return the window at position *idx*."""
        return self.windows[idx]