import datetime
import pickle
import warnings

import numpy as np
import pandas as pd

warnings.filterwarnings('ignore')
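
# Pipeline: clean_csv() reads the raw events.csv, keeps rows with a positive year,
# fills missing month/day with 1, adds a normalised event_timestamp and a single
# event_type, and writes volcano.csv. make_dataset() then groups volcano.csv by
# volcano_name, converts each volcano's history into an event sequence via
# make_seq(), and pickles train/dev/test splits with make_pkl().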

def make_datetime(year, month, day):
    """Convert year/month/day fields into a shifted POSIX timestamp (seconds)."""
    try:
        date = datetime.datetime(int(year), int(month), int(day))
    except ValueError as e:
        if e.args[0] == 'day is out of range for month':
            # Some rows record a day just past the end of the month; retry one day earlier.
            date = datetime.datetime(int(year), int(month), int(day) - 1)
        else:
            # Any other invalid date is a genuine data error, so re-raise it.
            raise
    # Fixed offset kept as-is from the original script (roughly 1960 years in seconds).
    return date.timestamp() + 61851630000

def clean_csv():
    """Clean the raw events.csv and write the result to volcano.csv."""
    source_dir = 'events.csv'

    df = pd.read_csv(source_dir, header=0)

    # Keep only rows with a known, positive (CE) year; datetime cannot represent year <= 0.
    df = df[~df['event_date_year'].isna()]
    df = df[df['event_date_year'] > 0]
    # Default missing month/day to January / the 1st.
    df['event_date_month'] = df['event_date_month'].fillna(1)
    df['event_date_day'] = df['event_date_day'].fillna(1)
    df.drop_duplicates(inplace=True)

    # Rescale timestamps (1e6 seconds per unit) to keep the values small.
    norm_const = 1000000
    df['event_timestamp'] = df.apply(
        lambda x: make_datetime(x['event_date_year'], x['event_date_month'], x['event_date_day']),
        axis=1) / norm_const
    df.sort_values(by=['event_date_year', 'event_date_month', 'event_date_day'], inplace=True)
    # Single mark: every record gets event type 0.
    df['event_type'] = 0

    df.to_csv('volcano.csv', index=False, header=True)
    return

def make_seq(df):
    """Convert one volcano's time-sorted rows into a list of event dicts."""
    seq = []
    df['time_diff'] = df['event_timestamp'].diff()
    df.index = np.arange(len(df))
    for index, row in df.iterrows():
        if index == 0:
            # First event: both relative times are zero; remember the start time.
            event_dict = {"time_since_last_event": 0.0,
                          "time_since_start": 0.0,
                          "type_event": row['event_type']
                          }
            start_event_time = row['event_timestamp']
        else:
            event_dict = {"time_since_last_event": row['time_diff'],
                          "time_since_start": row['event_timestamp'] - start_event_time,
                          "type_event": row['event_type']
                          }
        seq.append(event_dict)

    return seq
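
# Each sequence handed to make_pkl() below is a plain list of dicts of the form
#     {"time_since_last_event": ..., "time_since_start": ..., "type_event": 0}
# with times expressed in the normalised units produced by clean_csv().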

def make_pkl(target_dir, dim_process, split, seqs):
    with open(target_dir, "wb") as f_out:
        pickle.dump(
            {
                "dim_process": dim_process,
                split: seqs
            }, f_out
        )
    return
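
# Illustrative helper, not used by the pipeline below: a minimal sketch of how a
# consumer could read back a split written by make_pkl(). The name load_split and
# its signature are additions for illustration only.
def load_split(path, split):
    with open(path, "rb") as f_in:
        data = pickle.load(f_in)
    # Returns the number of event types and the list of sequences for this split.
    return data["dim_process"], data[split]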

def make_dataset(source_dir):
    """Build one event sequence per volcano and write train/dev/test pickles."""
    df = pd.read_csv(source_dir, header=0)

    vols = np.unique(df['volcano_name'])
    total_seq = []
    for vol in vols:
        # Copy the per-volcano slice so the columns added in make_seq() do not
        # trigger chained-assignment warnings on a view of df.
        df_ = df[df['volcano_name'] == vol].copy()
        df_.sort_values('event_timestamp', inplace=True)
        total_seq.append(make_seq(df_))

    print(len(total_seq))
    # Fixed split by volcano index: first 400 sequences for training,
    # the next 50 for validation, and the remainder for testing.
    make_pkl('train.pkl', 1, 'train', total_seq[:400])
    count_seq(total_seq[:400])
    make_pkl('dev.pkl', 1, 'dev', total_seq[400:450])
    count_seq(total_seq[400:450])
    make_pkl('test.pkl', 1, 'test', total_seq[450:])
    count_seq(total_seq[450:])

    return

def count_seq(seqs):
    """Print the mean and total number of events across the given sequences."""
    total_len = [len(seq) for seq in seqs]
    print(np.mean(total_len))
    print(np.sum(total_len))

    return
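
# Note: clean_csv() is not invoked here; run it beforehand if volcano.csv has
# not yet been generated from the raw events.csv.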
if __name__ == '__main__':
    make_dataset('volcano.csv')