Spaces:
Build error
Build error
| import pandas as pd | |
| import numpy as np | |
| from src.utils.helper_functions import save_parquet | |
| import os | |
| from config import Config | |
# Expose the attributes defined on the Config class as a mapping so they can be
# looked up by key (get_data reads config["fold_input_directory"]).
config = vars(Config)
def get_data(
    data_dir='../data/raw/input/',
    file_name='demand_data_IFX.csv',
    date_columns=None,
    split_local_test=False,
    target='demand',
    fixed_columns=None,
    prediction_interval=('2023-11-01', '2024-07-01'),
):
    """Load the raw demand CSV and build the train / test frames.

    Args:
        data_dir: Directory that contains the input CSV.
        file_name: Name of the CSV file inside ``data_dir``.
        date_columns: Value forwarded to ``pd.read_csv(parse_dates=...)``.
            ``None`` (the default) means ``['reporting_month_start']``.
        split_local_test: When true, the loaded data is split with
            ``split_train_test`` and the held-out part supplies the test
            dates and the ground truth. When false, all rows are used for
            training and the test dates come from ``prediction_interval``.
        target: Name of the target column.
        fixed_columns: Per-product attribute columns copied onto the
            generated test rows. Must contain ``'product_id'`` because the
            merge is keyed on it. ``None`` (the default) selects the five
            standard product attribute columns.
        prediction_interval: ``(start, end)`` of the generated test period,
            used only when ``split_local_test`` is false.

    Returns:
        A tuple ``(dataframe, generated_train, generated_test, y_test)``:
        the full loaded frame sorted by ``date``; the training rows
        restricted to ``target``, ``'date'`` and ``fixed_columns``; the
        generated product x month test frame without the target column; and
        the target series aligned with ``generated_test`` (all NaN when
        ``split_local_test`` is false).

    Raises:
        ValueError: If ``split_local_test`` is true and the held-out window
            contains no rows.
    """
    # None sentinels instead of list literals in the signature: default
    # objects are shared between calls, so mutable ones are avoided.
    if date_columns is None:
        date_columns = ['reporting_month_start']
    if fixed_columns is None:
        fixed_columns = [
            'product_id',
            'product_application',
            'product_marketing_name',
            'product_main_family',
            'planning_method_latest',
        ]
    else:
        # Work on a private list so any iterable of column names is accepted
        # and the caller's object is never shared.
        fixed_columns = list(fixed_columns)

    print('Loading data...')
    dataframe = pd.read_csv(os.path.join(data_dir, file_name), parse_dates=date_columns)
    # NOTE(review): 'reporting_month_start' is hard-coded here, so it has to
    # be parsed as a datetime column (i.e. be covered by date_columns).
    dataframe['date'] = pd.to_datetime(dataframe['reporting_month_start'].dt.date)
    dataframe.sort_values(by='date', inplace=True)

    if split_local_test:
        train, test = split_train_test(dataframe=dataframe)
        if test.empty:
            # Without this guard the NaT min/max would surface as an opaque
            # error from pd.date_range further down.
            raise ValueError(
                'split_local_test=True but the held-out window contains no rows'
            )
        test_min_date, test_max_date = test['date'].min(), test['date'].max()
    else:
        train, test = dataframe, None
        test_min_date, test_max_date = prediction_interval

    generated_test = generate_test_data(
        start_date=test_min_date,
        end_date=test_max_date,
        product_ids=train.product_id.unique(),
    )
    generated_test['date'] = pd.to_datetime(generated_test['date'])

    # Merge the fixed columns. The de-duplicated frame is built once and
    # reused for both the merge and the parquet export.
    # NOTE(review): a product_id with more than one distinct combination of
    # fixed_columns in train yields one test row per combination here.
    fixed_frame = train[fixed_columns].drop_duplicates(subset=fixed_columns)
    generated_test = pd.merge(fixed_frame, generated_test, on=['product_id'], how='right')
    save_parquet(
        dataframe=fixed_frame,
        path=f'{config["fold_input_directory"]}/fixed_columns.parquet',
    )

    # Merge the ground truth.
    if split_local_test:
        generated_test = pd.merge(
            test[[target, 'date', 'product_id']],
            generated_test,
            on=['product_id', 'date'],
            how='right',
        )
        # Product/month pairs absent from the held-out data get a target of 0.
        generated_test[target] = generated_test[target].fillna(0)
    else:
        generated_test[target] = np.nan

    # Generate fixed train.
    generated_train = train[[target, 'date'] + fixed_columns]

    # pop() returns the target column and removes it from the frame in place.
    y_test = generated_test.pop(target)
    return dataframe, generated_train, generated_test, y_test
def split_train_test(dataframe, test_start='2022-11-01', test_end='2023-07-01'):
    """Split ``dataframe`` by its ``date`` column into train and test parts.

    Args:
        dataframe: Frame with a datetime ``date`` column.
        test_start: First date (inclusive) of the test window; anything
            accepted by ``pd.to_datetime``. Rows strictly before it form the
            training set.
        test_end: Last date (inclusive) of the test window.

    Returns:
        ``(train, test)``. Rows dated after ``test_end`` belong to neither.

    Raises:
        ValueError: If ``test_start`` is later than ``test_end``.
    """
    window_start = pd.to_datetime(test_start)
    window_end = pd.to_datetime(test_end)
    if window_start > window_end:
        raise ValueError('test_start must not be later than test_end')

    dates = dataframe['date']
    train = dataframe[dates < window_start]
    # between() is inclusive on both ends by default.
    test = dataframe[dates.between(window_start, window_end)]
    return train, test
def generate_test_data(start_date, end_date, product_ids):
    """Return one row per (product, month start) pair.

    Args:
        start_date: Start of the period passed to ``pd.date_range``.
        end_date: End of the period passed to ``pd.date_range``.
        product_ids: Product identifiers to expand over the period.

    Returns:
        A frame with columns ``product_id`` and ``date`` holding every
        combination of a product and a month-start date in the period.
    """
    ids_frame = pd.DataFrame(product_ids, columns=['product_id'])

    # 'MS' yields the first day of each month within the period.
    month_index = pd.date_range(start=start_date, end=end_date, freq='MS')
    dates_frame = pd.DataFrame(month_index, columns=['date'])

    # A cross join pairs every product with every month start.
    return pd.merge(ids_frame, dates_frame, how='cross')