import pandas as pd  # type: ignore
import numpy as np  # type: ignore
import torch  # type: ignore
from torch.utils.data import Dataset  # type: ignore
from sklearn.preprocessing import StandardScaler  # type: ignore


class VFVDataset(Dataset):
    """Overlapping windows of z-scored close-to-close returns read from a CSV.

    Pipeline: read the CSV, coerce the ``Close`` column to numbers, turn the
    prices into simple percentage returns, standardise the returns to
    mean 0 / std 1, and slice the result into overlapping windows of
    ``window_size`` consecutive values (stride 1).

    Attributes:
        scaler: The fitted ``StandardScaler``. Kept so callers can map scaled
            values back to raw returns with ``scaler.inverse_transform``.
        windows: ``torch.float32`` tensor of shape
            ``(num_windows, window_size)``.
    """

    def __init__(self, csv_path, window_size: int = 15) -> None:
        """Load ``csv_path`` and build the window tensor.

        Args:
            csv_path: Path (or anything else ``pandas.read_csv`` accepts) of a
                CSV file that has a ``Close`` column.
            window_size: Number of consecutive returns per window. Must be at
                least 1.

        Raises:
            ValueError: If ``window_size`` is smaller than 1, or if the file
                yields fewer than two numeric ``Close`` prices (no return can
                be computed).
            KeyError: If the CSV has no ``Close`` column.
        """
        super().__init__()

        if window_size < 1:
            raise ValueError(f"window_size must be >= 1, got {window_size}")

        # 1. Load the CSV. File lines 1 and 2 (0-indexed; line 0 is the
        #    header) are skipped. Per the original author's note these are the
        #    'Ticker' row and the empty 'Datetime' row of the export.
        df = pd.read_csv(csv_path, skiprows=[1, 2])

        # 2. Force 'Close' to numeric; entries that fail to parse become NaN
        #    and are dropped. (Original note: the 'Price' column of this CSV
        #    actually holds the datetime, so it is not used here.)
        prices = pd.to_numeric(df["Close"], errors="coerce").dropna().to_numpy()

        if prices.size < 2:
            raise ValueError(
                "Need at least two numeric 'Close' prices to compute returns, "
                f"got {prices.size}"
            )

        # 3. Simple returns: P_t / P_{t-1} - 1. The first element has no
        #    predecessor, comes back as NaN, and is dropped.
        #    NOTE: these are NOT log returns; for small moves
        #    log(P_t / P_{t-1}) is approximately equal to this value.
        returns = pd.Series(prices).pct_change().dropna().to_numpy()

        # 4. Normalise to z-scores (mean=0, std=1). Original rationale: the
        #    downstream quantum circuits are sensitive to input scale.
        self.scaler = StandardScaler()
        returns_scaled = self.scaler.fit_transform(returns.reshape(-1, 1)).flatten()

        # 5. Overlapping windows of `window_size` returns, stride 1.
        #    A series of n values holds n - window_size + 1 complete windows;
        #    the "+ 1" keeps the final window, which ends on the last return.
        n_windows = returns_scaled.size - window_size + 1
        if n_windows > 0:
            # sliding_window_view returns a read-only strided view;
            # np.array() below turns it into an owned, writable copy.
            windows = np.lib.stride_tricks.sliding_window_view(
                returns_scaled, window_size
            )
        else:
            # Too little data for even one window: keep a well-formed 2-D
            # shape so downstream code can still rely on `.shape[1]`.
            windows = np.empty((0, window_size), dtype=returns_scaled.dtype)

        self.windows = torch.tensor(np.array(windows), dtype=torch.float32)
        print(f"Dataset Loaded: {len(self.windows)} windows of {window_size} minutes.")

    def __len__(self) -> int:
        """Return the number of windows in the dataset."""
        return len(self.windows)

    def __getitem__(self, idx) -> torch.Tensor:
        """Return the window(s) selected by ``idx`` from the window tensor."""
        return self.windows[idx]