Spaces:
Sleeping
Sleeping
File size: 1,702 Bytes
2f560eb |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 |
import pandas as pd # type: ignore
import numpy as np # type: ignore
import torch # type: ignore
from torch.utils.data import Dataset # type: ignore
from sklearn.preprocessing import StandardScaler # type: ignore
class VFVDataset(Dataset):
    """Sliding-window dataset of standardized log returns of a 'Close' series.

    The CSV at ``csv_path`` is read with pandas, its ``Close`` column is
    converted to numbers, turned into log returns, z-scored with a
    ``StandardScaler`` and finally cut into overlapping windows of
    ``window_size`` consecutive returns (stride 1).

    Attributes set by ``__init__``:
        scaler: the ``StandardScaler`` fitted on the (unscaled) log returns;
            keep it to map model outputs back to return space.
        windows: ``torch.float32`` tensor of shape
            ``(num_windows, window_size)``.
    """

    def __init__(self, csv_path, window_size: int = 15) -> None:
        """Load the CSV and build the window tensor.

        Args:
            csv_path: Path (or anything ``pandas.read_csv`` accepts) of the
                price CSV. It must contain a ``Close`` column.
            window_size: Number of consecutive returns per window.

        Raises:
            ValueError: If ``window_size`` is smaller than 1, if fewer than
                two valid prices are available, or if a price is not
                strictly positive (log returns are undefined then).
            KeyError: If the CSV has no ``Close`` column.
        """
        super().__init__()

        if window_size < 1:
            raise ValueError(
                f"window_size must be a positive integer, got {window_size!r}"
            )

        # File lines 1 and 2 (0-indexed; line 0 is the header) are skipped.
        # Per the original author's note these are the extra 'Ticker' and
        # empty 'Datetime' rows of the export, and the column labelled
        # 'Price' actually holds the datetime.
        df = pd.read_csv(csv_path, skiprows=[1, 2])

        # Coerce to numeric; anything that fails to parse becomes NaN and is
        # dropped so that it cannot poison the return calculation.
        prices = (
            pd.to_numeric(df["Close"], errors="coerce")
            .dropna()
            .to_numpy(dtype=float)
        )

        # Log returns log(P_t / P_{t-1}). The previous implementation used
        # pct_change() (simple returns) although log returns were documented.
        returns = self._log_returns(prices)
        if returns.size == 0:
            raise ValueError(
                "Need at least two valid 'Close' prices to compute returns."
            )

        # Z-score the returns (mean 0, std 1); the original note stresses
        # that the downstream model is sensitive to the input scale.
        self.scaler = StandardScaler()
        returns_scaled = self.scaler.fit_transform(
            returns.reshape(-1, 1)
        ).flatten()

        # Every full window is kept, including the final one that the old
        # `range(len(x) - window_size)` loop silently dropped.
        self.windows = torch.tensor(
            self._sliding_windows(returns_scaled, window_size),
            dtype=torch.float32,
        )
        # NOTE: the message says "minutes"; that is only accurate when the
        # CSV holds one-minute bars.
        print(f"Dataset Loaded: {len(self.windows)} windows of {window_size} minutes.")

    @staticmethod
    def _log_returns(prices):
        """Return ``log(P_t / P_{t-1})`` for a 1-D sequence of prices."""
        prices = np.asarray(prices, dtype=float)
        if np.any(prices <= 0):
            raise ValueError("Log returns require strictly positive prices.")
        # diff of logs == log of ratios; yields an empty array for < 2 prices.
        return np.diff(np.log(prices))

    @staticmethod
    def _sliding_windows(values, window_size):
        """Return all stride-1 windows of ``values`` as a 2-D array.

        The result has shape ``(len(values) - window_size + 1, window_size)``,
        or ``(0, window_size)`` when ``values`` is shorter than one window.
        """
        values = np.asarray(values)
        if values.shape[0] < window_size:
            return np.empty((0, window_size), dtype=values.dtype)
        # sliding_window_view returns a read-only strided view; copy it so
        # the caller owns an ordinary contiguous array.
        return np.lib.stride_tricks.sliding_window_view(
            values, window_size
        ).copy()

    def __len__(self) -> int:
        """Return the number of windows in the dataset."""
        return len(self.windows)

    def __getitem__(self, idx):
        """Return the window(s) selected by ``idx`` from the window tensor."""
        return self.windows[idx]