# starx/model/preprocess/preprocess.py
# Uploaded by recorderlegend1 via huggingface_hub (rev e418c5a, verified)
import numpy as np
import pandas as pd
import torch
import sys
import os
from sklearn.preprocessing import StandardScaler,MinMaxScaler
np.set_printoptions(suppress=True)

# Standardize the sensor features; min-max the label channels to [0, 1].
ss, mm = StandardScaler(), MinMaxScaler()

# Raw recordings for one session (sys.argv[1] names the run). Column 0 is
# dropped (presumably a timestamp/counter — TODO confirm), leaving 12 data
# columns per sensor board.
run = sys.argv[1]
thigh = pd.read_csv(f"../../data/unprocessed/{run}/front.txt", delimiter=',',
                    usecols=[i for i in range(13) if i != 0])
shin = pd.read_csv(f"../../data/unprocessed/{run}/back.txt", delimiter=',',
                   usecols=[i for i in range(13) if i != 0])
thigh, shin = thigh.dropna(), shin.dropna()

# Align the two streams on their tails: keep the last n rows of each so both
# recordings end together, whichever one started earlier. (Previously only
# a longer `thigh` was handled, and a negative delta silently truncated it.)
# Both indexes are reset (drop=True) so pd.concat(axis=1) below aligns
# row-for-row instead of on the post-dropna CSV index.
n = min(len(thigh), len(shin))
thigh = thigh.iloc[-n:].reset_index(drop=True)
shin = shin.iloc[-n:].reset_index(drop=True)

# Cap the session length and tag every column by sensor location.
thigh, shin = thigh.iloc[:55000], shin.iloc[:55000]
thigh = thigh.add_suffix("_th")
shin = shin.add_suffix("_sh")

# 'p*' columns are the label channels (presumably pressure — confirm against
# the recorder docs); 's*' columns are the feature channels.
p_columns_th = [col for col in thigh.columns if col.startswith('p')]
s_columns_th = [col for col in thigh.columns if col.startswith('s')]
p_columns_sh = [col for col in shin.columns if col.startswith('p')]
s_columns_sh = [col for col in shin.columns if col.startswith('s')]
features = pd.concat([thigh[s_columns_th], shin[s_columns_sh]], axis=1)
labels = pd.concat([thigh[p_columns_th], shin[p_columns_sh]], axis=1)
features_scaled = pd.DataFrame(ss.fit_transform(features), columns=features.columns)
labels_scaled = pd.DataFrame(mm.fit_transform(labels), columns=labels.columns)

out_dir = f"../../data/processed/{run}"
os.makedirs(out_dir, exist_ok=True)
# NOTE: the *unscaled* frames are written to CSV while the *scaled* frames
# feed the tensor export — keep that in mind when loading these CSVs.
features.to_csv(f"{out_dir}/features.csv")
labels.to_csv(f"{out_dir}/labels.csv")
def preprocess_data(features_df, labels_df, lookback_window, predict_window, output_file, samples_per_step=150):
    """Slice aligned feature/label frames into sliding windows and save tensors.

    For every valid position ``i``, ``x`` holds the ``lookback_window`` steps of
    history ending at ``i`` and ``y`` holds the ``predict_window`` steps starting
    at ``i``. The result is saved with ``torch.save`` as ``{"x": ..., "y": ...}``.

    Parameters
    ----------
    features_df : pandas.DataFrame
        Feature rows, one row per sample.
    labels_df : pandas.DataFrame
        Label rows, aligned row-for-row with ``features_df``.
    lookback_window : int
        History length in steps; multiplied by ``samples_per_step`` rows.
    predict_window : int
        Prediction horizon in steps; multiplied by ``samples_per_step`` rows.
    output_file : str
        Destination path for the saved tensor archive.
    samples_per_step : int, optional
        Rows per step (default 150 — presumably the sensor sampling rate;
        TODO confirm against the recording hardware).

    Raises
    ------
    ValueError
        If the frames are too short to yield at least one window.
    """
    lookback_window *= samples_per_step
    predict_window *= samples_per_step
    total_samples = len(features_df) - lookback_window - predict_window
    if total_samples <= 0:
        # Previously this fell through to an opaque negative-dimension error
        # from torch.zeros; fail with a clear message instead.
        raise ValueError(
            f"Need more than {lookback_window + predict_window} rows, "
            f"got {len(features_df)}"
        )
    x_data = torch.zeros((total_samples, lookback_window, features_df.shape[1]))
    y_data = torch.zeros((total_samples, predict_window, labels_df.shape[1]))
    for idx, i in enumerate(range(lookback_window, len(features_df) - predict_window)):
        if idx % 1000 == 0:
            print(f"Processing sample {idx}/{total_samples}...")
        x_data[idx] = torch.tensor(features_df.iloc[i - lookback_window:i].values, dtype=torch.float32)
        y_data[idx] = torch.tensor(labels_df.iloc[i:i + predict_window].values, dtype=torch.float32)
    torch.save({"x": x_data, "y": y_data}, output_file)
    print(f"Preprocessed data saved to {output_file}")
# Export 3-step lookback / 3-step prediction windows from the scaled frames
# (each step is expanded by the per-step sample count inside preprocess_data).
preprocess_data(features_scaled,labels_scaled,3,3,f"../../data/processed/{sys.argv[1]}/data.pt")