|
|
import pickle |
|
|
import warnings |
|
|
from datetime import datetime |
|
|
|
|
|
import numpy as np |
|
|
import pandas as pd |
|
|
|
|
|
# Globally suppress ALL warnings for the whole process (e.g. pandas
# chained-assignment noise from slicing in this script).
# NOTE(review): this is heavy-handed — it also hides genuinely useful
# warnings; consider narrowing to specific categories.
warnings.filterwarnings('ignore')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def event_type_map(mag):
    """Bucket an earthquake magnitude into one of 7 ordinal classes.

    Class boundaries are 2.75, 3.0, 3.5, 4.0, 4.5 and 5.0: magnitudes
    below 2.75 map to 0, and magnitudes of 5.0 or more map to 6.

    Args:
        mag: earthquake magnitude (numeric).

    Returns:
        Integer class label in [0, 6].
    """
    thresholds = (2.75, 3.0, 3.5, 4.0, 4.5, 5.0)
    for label, bound in enumerate(thresholds):
        if mag < bound:
            return label
    # At or above the largest threshold (also reached for NaN input,
    # matching the original elif chain's else branch).
    return len(thresholds)
|
|
|
|
|
|
|
|
def clean_csv(source_dir, target_dir='earthquake.csv'):
    """Normalize a raw event catalog CSV into the cleaned two-column form.

    Drops duplicate rows, sorts chronologically by the 'time' column,
    keeps only the 'time' and 'mag' columns, derives a discrete
    'event_type' label from the magnitude, and writes the result to
    ``target_dir``.

    Args:
        source_dir: path of the raw input CSV (must contain 'time' and
            'mag' columns).
        target_dir: output path for the cleaned CSV; defaults to
            'earthquake.csv' to preserve the original behavior.
    """
    df = pd.read_csv(source_dir, header=0)
    df.drop_duplicates(inplace=True)
    df.sort_values(by=['time'], inplace=True)
    print(len(df))  # number of unique events retained after de-duplication

    df = df[['time', 'mag']]
    # Bucket magnitudes into the 7 ordinal event-type classes.
    df['event_type'] = df['mag'].apply(event_type_map)

    df.to_csv(target_dir, index=False, header=True)
    return
|
|
|
|
|
|
|
|
def make_seq(df):
    """Turn a contiguous slice of the event table into a list of event dicts.

    Each dict carries "time_since_last_event", "time_since_start" and
    "type_event"; the first event in the slice anchors the sequence at
    time zero. Mutates ``df`` in place: adds a 'time_diff' column and
    resets the index to 0..n-1.

    Args:
        df: DataFrame with 'event_time' and 'event_type' columns.

    Returns:
        List of per-event dicts in row order.
    """
    df['time_diff'] = df['event_time'].diff()
    df.index = np.arange(len(df))

    events = []
    origin = None  # event_time of the first row, set on the first iteration
    # iterrows (not itertuples) is kept deliberately: it upcasts each row
    # to a common dtype, exactly as the original implementation did.
    for pos, row in df.iterrows():
        if pos == 0:
            origin = row['event_time']
            gap, elapsed = 0.0, 0.0
        else:
            gap = row['time_diff']
            elapsed = row['event_time'] - origin
        events.append({"time_since_last_event": gap,
                       "time_since_start": elapsed,
                       "type_event": row['event_type']
                       })

    return events
|
|
|
|
|
|
|
|
def make_pkl(target_dir, dim_process, split, seqs):
    """Serialize a dataset split to a pickle file.

    The file contains a single dict with the number of event types under
    "dim_process" and the sequences under the split name (e.g. "train").

    Args:
        target_dir: output path for the pickle file.
        dim_process: number of distinct event types.
        split: split name used as the dict key ('train', 'dev' or 'test').
        seqs: list of event sequences to store.
    """
    payload = {
        "dim_process": dim_process,
        split: seqs,
    }
    with open(target_dir, "wb") as sink:
        pickle.dump(payload, sink)
    return
|
|
|
|
|
|
|
|
def make_dataset(source_dir):
    """Build the train/dev/test pickle datasets of event sequences.

    Reads the cleaned catalog CSV (columns: time, mag, event_type),
    converts timestamps to scaled epoch seconds, chops the event stream
    into 4300 consecutive sequences of random length (15-18 events each),
    and writes three pickles: 3000 train / 400 dev / 900 test sequences.

    Args:
        source_dir: path of the cleaned CSV produced by clean_csv().
    """
    df = pd.read_csv(source_dir, header=0)
    df['time'] = pd.to_datetime(df['time'])

    # Scale epoch seconds down so inter-event gaps are numerically small.
    norm_const = 10000
    df['event_time'] = df['time'].apply(lambda x: datetime.timestamp(x)) / norm_const

    # NOTE(review): no RNG seed is set, so the splits are not reproducible
    # across runs — add np.random.seed(...) if that matters.
    seq_len = np.random.randint(15, 19, 4300)
    print(np.sum(seq_len))  # total number of events the sequences consume

    # Consecutive, non-overlapping [start, stop) row ranges, one per
    # sequence. Fixes an off-by-one in the original slicing (starts were
    # cumsum - 1 while stops are exclusive), which shortened the first
    # sequence by one event, shifted every later sequence by one row, and
    # silently dropped the final event.
    bounds = np.concatenate(([0], np.cumsum(seq_len)))
    total_seq = [make_seq(df.iloc[start:stop, :])
                 for start, stop in zip(bounds[:-1], bounds[1:])]

    print(len(total_seq))
    make_pkl('train.pkl', 7, 'train', total_seq[:3000])
    print(np.sum(seq_len[:3000]))
    make_pkl('dev.pkl', 7, 'dev', total_seq[3000:3400])
    print(np.sum(seq_len[3000:3400]))
    make_pkl('test.pkl', 7, 'test', total_seq[3400:])
    print(np.sum(seq_len[3400:]))

    return
|
|
|
|
|
|
|
|
if __name__ == '__main__':


    # Script entry point: build the pickled train/dev/test splits from the
    # cleaned catalog. Assumes clean_csv() has already produced
    # 'earthquake.csv' in the current working directory.
    make_dataset('earthquake.csv')
|
|
|