Abigail99216's picture
Upload folder using huggingface_hub
f43af3c verified
import pickle
import warnings
from datetime import datetime
import numpy as np
import pandas as pd
warnings.filterwarnings('ignore')
# data source: https://earthquake.usgs.gov/earthquakes/search/
def event_type_map(mag):
    """Bucket an earthquake magnitude into an integer class label.

    The label is the number of thresholds that ``mag`` has reached:

        mag < 2.75         -> 0
        2.75 <= mag < 3.0  -> 1
        3.0  <= mag < 3.5  -> 2
        3.5  <= mag < 4.0  -> 3
        4.0  <= mag < 4.5  -> 4
        4.5  <= mag < 5.0  -> 5
        otherwise          -> 6

    Note that a NaN magnitude fails every ``<`` comparison and therefore
    falls through to the last label (6).
    """
    # Exclusive upper bound of each class, in ascending order; the position
    # of the first bound that exceeds ``mag`` is the class label.
    upper_bounds = (2.75, 3.0, 3.5, 4.0, 4.5, 5.0)
    for label, bound in enumerate(upper_bounds):
        if mag < bound:
            return label
    return len(upper_bounds)
def clean_csv(source_dir, target_dir='earthquake.csv'):
    """Reduce a raw earthquake catalogue CSV to the columns used downstream.

    Reads the CSV at ``source_dir``, drops exact duplicate rows, orders the
    rows by the ``time`` column, prints the number of remaining rows, keeps
    only ``time`` and ``mag``, adds an ``event_type`` column computed with
    ``event_type_map`` and writes the result as CSV.

    Args:
        source_dir: Path of the input CSV file (despite the name this is a
            file path, it is passed straight to ``pd.read_csv``). The file
            must have a header row with ``time`` and ``mag`` columns.
        target_dir: Path of the CSV file to write. Defaults to
            ``'earthquake.csv'``, the file name that was previously
            hard-coded.

    Returns:
        None. The result is written to ``target_dir``.
    """
    df = pd.read_csv(source_dir, header=0)
    df = df.drop_duplicates().sort_values(by=['time'])
    print(len(df))
    # Work on an explicit copy so the new column is added to an independent
    # frame rather than to a sub-selection of the full catalogue.
    df = df[['time', 'mag']].copy()
    df['event_type'] = df['mag'].apply(event_type_map)
    df.to_csv(target_dir, index=False, header=True)
    return
def make_seq(df):
    """Convert a frame of consecutive events into a list of event dicts.

    Args:
        df: DataFrame with an ``event_time`` column (numeric timestamps, in
            row order) and an ``event_type`` column. The frame is only read;
            no column is added and its index is left untouched.

    Returns:
        A list with one dict per row, in row order, with the keys

        * ``"time_since_last_event"``: ``event_time`` minus the previous
          row's ``event_time`` (``0.0`` for the first row),
        * ``"time_since_start"``: ``event_time`` minus the first row's
          ``event_time`` (``0.0`` for the first row),
        * ``"type_event"``: the row's ``event_type``.

        Values are plain Python scalars. An empty frame yields ``[]``.
    """
    # Pull the two columns out once instead of iterating rows: row-wise
    # iteration builds a Series per row and coerces all values of a row to
    # a common dtype, which can turn integer event types into floats.
    event_times = df['event_time'].to_numpy(dtype=float)
    event_types = df['event_type'].tolist()
    if event_times.size == 0:
        return []

    since_start = event_times - event_times[0]
    since_last = np.zeros_like(event_times)
    since_last[1:] = np.diff(event_times)
    # The first event is the origin of the sequence by definition, so both
    # of its offsets are exactly 0.0 regardless of its absolute timestamp.
    since_start[0] = 0.0

    return [
        {
            "time_since_last_event": gap,
            "time_since_start": elapsed,
            "type_event": kind,
        }
        for gap, elapsed, kind in zip(
            since_last.tolist(), since_start.tolist(), event_types
        )
    ]
def make_pkl(target_dir, dim_process, split, seqs):
    """Pickle one dataset split to disk.

    The file at ``target_dir`` (a file path, opened in binary write mode)
    receives a single dict with two entries: ``"dim_process"`` mapped to
    ``dim_process`` and the value of ``split`` (used as the key) mapped to
    ``seqs``.
    """
    payload = {"dim_process": dim_process}
    payload[split] = seqs
    with open(target_dir, "wb") as sink:
        pickle.dump(payload, sink)
def make_dataset(source_dir, seed=None):
    """Cut the cleaned event CSV into sequences and write train/dev/test pickles.

    The CSV at ``source_dir`` is read, its ``time`` column is converted to
    POSIX seconds divided by 10000 (stored as ``event_time``), and the rows
    are partitioned, in order, into 4300 consecutive sequences whose lengths
    are drawn uniformly from 15..18. Each sequence is converted with
    ``make_seq``. The first 3000 sequences go to ``train.pkl``, the next 400
    to ``dev.pkl`` and the remaining 900 to ``test.pkl`` (all written to the
    current working directory with ``dim_process`` 7).

    Args:
        source_dir: Path of the input CSV file; it must provide the ``time``
            column read here and whatever columns ``make_seq`` needs.
        seed: Optional seed for the sequence-length draw. ``None`` (the
            default) draws from NumPy's global random state, exactly as
            before; an integer makes the split reproducible without touching
            the global state.

    Returns:
        None. Progress counts are printed and three pickle files are written.
    """
    df = pd.read_csv(source_dir, header=0)
    # Parse as UTC: ``datetime.timestamp`` interprets naive datetimes in the
    # machine's local timezone, which would make the output depend on where
    # the script runs. Timezone-aware inputs keep the same instant.
    df['time'] = pd.to_datetime(df['time'], utc=True)
    norm_const = 10000
    df['event_time'] = df['time'].apply(lambda x: datetime.timestamp(x)) / norm_const

    rng = np.random if seed is None else np.random.RandomState(seed)
    seq_len = rng.randint(15, 19, 4300)
    print(np.sum(seq_len))

    # Half-open [start, end) row windows that tile the frame back to back,
    # so sequence i holds exactly seq_len[i] rows and the event counts
    # printed below match what is written to disk.
    # NOTE(review): if the CSV has fewer rows than np.sum(seq_len), iloc
    # slicing silently yields shorter (possibly empty) trailing sequences.
    seq_end_idx = np.cumsum(seq_len)
    seq_start_idx = seq_end_idx - seq_len
    total_seq = [make_seq(df.iloc[start_idx:end_idx, :]) for (start_idx, end_idx) in
                 zip(seq_start_idx, seq_end_idx)]
    print(len(total_seq))

    make_pkl('train.pkl', 7, 'train', total_seq[:3000])
    print(np.sum(seq_len[:3000]))
    make_pkl('dev.pkl', 7, 'dev', total_seq[3000:3400])
    print(np.sum(seq_len[3000:3400]))
    make_pkl('test.pkl', 7, 'test', total_seq[3400:])
    print(np.sum(seq_len[3400:]))
    # Printed output recorded from one unseeded run:
    # 70794
    # 4300
    # 49364
    # 6612
    # 14818
    return
if __name__ == '__main__':
    # Script entry point: build the train/dev/test pickles from the cleaned CSV.
    # The one-off cleaning step below is left disabled; note that clean_csv
    # requires the path of the raw catalogue CSV as its argument.
    # clean_csv()
    make_dataset('earthquake.csv')