csun22's picture
Upload 59 files
ca1888b verified
#!/usr/bin/env python
"""
log_parser
tools to parse log_train and log_err
"""
from __future__ import absolute_import
from __future__ import print_function
import numpy as np
import os
import re
import sys
__author__ = "Xin Wang"
__email__ = "wangxin@nii.ac.jp"
__copyright__ = "Copyright 2020, Xin Wang"
def f_read_log_err(file_path, train_num, val_num):
    """Read a log_err file and average the error values per epoch.

    log_train, log_val = f_read_log_err(log_err, num_train_utt, num_val_utt)

    A line is used as data when it does not contain 'skip' and its first
    character can be converted by int(); it is then parsed as
    comma-separated float values. Consecutive data lines are grouped into
    epochs of (train_num + val_num) rows: the first train_num rows of each
    group are averaged as the training set, the following val_num rows as
    the validation set. Trailing rows that do not fill a whole epoch are
    ignored. The number of data rows is printed to stdout.

    input:
    -----
      file_path: path to the log_err file
      train_num: how many training utterances per epoch
      val_num: how many validation utterances per epoch

    output:
    ------
      log_train: np.array, average error values per epoch on training set
      log_val: np.array, average error values per epoch on validation set
      Both have shape [epoch_num, column_num]. When the file holds no
      data line, both are empty arrays of shape [0, 0].

    raise:
    -----
      ValueError: if train_num + val_num is not positive
    """
    total_num = train_num + val_num
    if total_num <= 0:
        # Without this check the epoch count below divides by zero.
        raise ValueError("train_num + val_num must be positive")

    data_str = []
    with open(file_path, 'r') as file_ptr:
        for line in file_ptr:
            if 'skip' in line:
                continue
            # Only lines whose first character is a digit carry values.
            try:
                int(line[0])
            except ValueError:
                continue
            data_str.append(line)

    if not data_str:
        # Nothing to parse: return empty results instead of failing on
        # data_str[0].
        return np.zeros([0, 0]), np.zeros([0, 0])

    # The first data line fixes the number of columns; a later line with
    # a different number of values raises ValueError on assignment.
    col = len(np.fromstring(data_str[0].strip(), dtype=np.float32, sep=','))
    data = np.zeros([len(data_str), col])
    for idx, line in enumerate(data_str):
        # strip() removes the trailing newline before numpy parses the text
        data[idx, :] = np.fromstring(line.strip(), dtype=np.float32, sep=',')
    print(data.shape[0])

    epoch_num = data.shape[0] // total_num
    data_train = np.zeros([epoch_num, data.shape[1]])
    data_val = np.zeros([epoch_num, data.shape[1]])
    for epoch in range(epoch_num):
        epoch_data = data[epoch * total_num:(epoch + 1) * total_num, :]
        data_train[epoch, :] = np.mean(epoch_data[:train_num, :], axis=0)
        data_val[epoch, :] = np.mean(epoch_data[train_num:total_num, :],
                                     axis=0)
    return data_train, data_val
def f_read_log_train(file_path):
    """Read the per-epoch table of a log_train file.

    data_train, data_val, time_per_epoch = f_read_log_train(path_to_log_train)

    Parsing starts after the first line that contains 'Duration'. Every
    later line with more than two '|' characters is one output row. The
    row is split on '|': field 1 is the duration, field 2 the training
    loss(es) and field 3 the validation loss(es). Several loss terms in
    one field are separated by '/'. A row whose duration field is not a
    number (for example a separator line) is kept as an all-zero row.

    input:
    -----
      file_path: path to the log_train file

    output:
    ------
      data_train: np.array, error values per epoch on training set
      data_val: np.array, error values per epoch on validation set
      time_per_epoch: np.array, training time per epoch
      data_train and data_val have shape [row, col], where col is 3 or
      the largest number of loss terms found in a row, whichever is
      larger. Unused entries are zero.

    raise:
    -----
      ValueError: if a loss term of a row with a valid duration is not
                  a number
    """
    data_str = []
    read_flag = False
    with open(file_path, 'r') as file_ptr:
        for line in file_ptr:
            # The flag is checked before it is updated, so the header
            # line containing 'Duration' is never taken as data.
            if read_flag and line.count('|') > 2:
                data_str.append(line)
            if 'Duration' in line:
                read_flag = True

    # Parse every row once. None marks a row without a valid duration.
    parsed = []
    for line in data_str:
        fields = line.split('|')
        try:
            duration = float(fields[1])
        except ValueError:
            parsed.append(None)
            continue
        trn_loss = [float(x) for x in fields[2].split('/')]
        val_loss = [float(x) for x in fields[3].split('/')]
        parsed.append((duration, trn_loss, val_loss))

    # At least 3 columns as before; more when a row has more loss terms,
    # so that no term falls outside the arrays.
    col = 3
    for item in parsed:
        if item is not None:
            col = max(col, len(item[1]), len(item[2]))

    row = len(data_str)
    data_train = np.zeros([row, col])
    data_val = np.zeros([row, col])
    time_per_epoch = np.zeros(row)
    for idx, item in enumerate(parsed):
        if item is None:
            continue
        duration, trn_loss, val_loss = item
        time_per_epoch[idx] = duration
        # The two columns are filled independently because they may hold
        # a different number of loss terms.
        data_train[idx, :len(trn_loss)] = trn_loss
        data_val[idx, :len(val_loss)] = val_loss
    return data_train, data_val, time_per_epoch
def read_log_err_pytorch(file_path, merge_epoch=False):
    """Read per-utterance loss values from a Pytorch-style log_err file.

    trn_data, val_data = read_log_err_pytorch(path_to_log_err, merge_epoch)

    Only lines containing 'Time:' are used. For such a line
      * the set size is the integer between the first '/' and the next ','
      * the loss values are taken from the comma-separated fields that
        contain 'Loss:' (the number after the first ':' of the field)
    The set size of the first data line identifies the training set. A
    different set size identifies the validation set. If more than two
    distinct sizes occur, the most recently seen non-training size is
    used as the validation size when the rows are collected.

    A short summary of the sizes is printed to stdout.

    input:
    -----
      file_path: path to the log_err file
      merge_epoch: if True, average the loss values over each complete
                   epoch (set-size consecutive rows); default False

    output:
    ------
      trn_data: np.array, loss values on training set,
                one row per utterance, or per epoch if merge_epoch
      val_data: np.array, loss values on validation set, same layout;
                it has zero rows when the log has no validation data
      None is returned (and "Cannot parse file" printed) when the file
      contains no data line.
    """
    def set_size(line):
        return int(line.split('/')[1].split(',')[0])

    def data_line(line):
        return "Time:" in line

    def get_data(line):
        return [float(x.split(":")[1])
                for x in line.split(',') if "Loss:" in x]

    def epoch_mean(data, utt_num):
        # Average every block of utt_num consecutive rows. A missing
        # set (utt_num is None) yields an array with zero rows.
        if not utt_num:
            return np.zeros([0, data.shape[1]])
        epoch_num = data.shape[0] // utt_num
        merged = np.zeros([epoch_num, data.shape[1]])
        for idx in range(epoch_num):
            merged[idx, :] = data[idx * utt_num:(idx + 1) * utt_num,
                                  :].mean(axis=0)
        return merged

    # Parse the file once and keep (set size, loss values) per data line.
    records = []
    with open(file_path, 'r') as file_ptr:
        for line in file_ptr:
            if data_line(line):
                records.append((set_size(line), get_data(line)))

    # Identify training / validation set sizes and count their rows.
    trn_utt_num = None
    val_utt_num = None
    trn_total_num = 0
    val_total_num = 0
    for temp_num, _ in records:
        if trn_utt_num is None:
            trn_utt_num = temp_num
        if temp_num != val_utt_num and temp_num != trn_utt_num:
            val_utt_num = temp_num
        if trn_utt_num == temp_num:
            trn_total_num += 1
        if val_utt_num == temp_num:
            val_total_num += 1

    if trn_utt_num is None:
        print("Cannot parse file")
        return
    if val_utt_num is None:
        print("Trn %d, no val" % (trn_utt_num))
    else:
        print("Trn %d, val %d" % (trn_utt_num, val_utt_num))
    print("Trn data %d, val data %d" % (trn_total_num, val_total_num))

    # The number of columns follows the last data line; a row with a
    # different number of loss values raises ValueError on assignment.
    col_num = len(records[-1][1])
    trn_data = np.zeros([trn_total_num, col_num])
    val_data = np.zeros([val_total_num, col_num])
    trn_utt_cnt = 0
    val_utt_cnt = 0
    for temp_num, data in records:
        if trn_utt_num == temp_num:
            trn_data[trn_utt_cnt, :] = np.array(data)
            trn_utt_cnt += 1
        if val_utt_num == temp_num:
            val_data[val_utt_cnt, :] = np.array(data)
            val_utt_cnt += 1

    if merge_epoch:
        # Each set is merged on its own, so a finished training epoch is
        # reported even when its validation epoch is missing or absent.
        return (epoch_mean(trn_data, trn_utt_num),
                epoch_mean(val_data, val_utt_num))
    return trn_data, val_data
# Executed only when the file is run as a script: it just prints the
# module name.
if __name__ == "__main__":
    print("logParser")