Spaces:
Sleeping
Sleeping
| ## Necessary Packages | |
| import scipy.stats | |
| import numpy as np | |
| import seaborn as sns | |
| import matplotlib.pyplot as plt | |
| from sklearn.manifold import TSNE | |
| from sklearn.decomposition import PCA | |
def display_scores(results):
    """Print and return the mean of `results` with a 95% confidence half-width.

    The half-width is the standard error of the mean multiplied by the
    two-sided Student-t critical value for ``len(results) - 1`` degrees of
    freedom.

    Args:
    - results: sequence of numeric scores

    Returns:
    - mean: arithmetic mean of the scores
    - sigma: half-width of the 95% confidence interval around the mean
    """
    mean = np.mean(results)
    # Degrees of freedom follow the actual number of scores instead of
    # assuming exactly five of them.
    dof = len(results) - 1
    sigma = scipy.stats.sem(results) * scipy.stats.t.ppf((1 + 0.95) / 2.0, dof)
    print("Final Score: ", f"{mean} \xB1 {sigma}")
    return mean, sigma
def train_test_divide(data_x, data_x_hat, data_t, data_t_hat, train_rate=0.8):
    """Randomly split original and synthetic data into train and test parts.

    The original and the synthetic data are shuffled independently; within
    each of them the samples and their time information share one shuffle.

    Args:
    - data_x: original data
    - data_x_hat: generated data
    - data_t: original time
    - data_t_hat: generated time
    - train_rate: fraction of each data set assigned to the training part

    Returns:
    - (train_x, train_x_hat, test_x, test_x_hat,
       train_t, train_t_hat, test_t, test_t_hat), each a list
    """

    def _shuffle_split(samples, times):
        # One permutation drives both the samples and their time values.
        count = len(samples)
        order = np.random.permutation(count)
        cut = int(count * train_rate)
        head, tail = order[:cut], order[cut:]
        return (
            [samples[j] for j in head],
            [samples[j] for j in tail],
            [times[j] for j in head],
            [times[j] for j in tail],
        )

    # Original data is split first, synthetic data second, so the random
    # draws happen in that order.
    train_x, test_x, train_t, test_t = _shuffle_split(data_x, data_t)
    train_x_hat, test_x_hat, train_t_hat, test_t_hat = _shuffle_split(
        data_x_hat, data_t_hat
    )

    return (
        train_x,
        train_x_hat,
        test_x,
        test_x_hat,
        train_t,
        train_t_hat,
        test_t,
        test_t_hat,
    )
def extract_time(data):
    """Collect the length of every sequence and the longest length.

    Args:
    - data: original data; each item is indexed as ``item[:, 0]``

    Returns:
    - time: list with the length of each sequence
    - max_seq_len: maximum sequence length (0 when `data` is empty)
    """
    lengths = [len(data[k][:, 0]) for k in range(len(data))]
    longest = max(lengths, default=0)
    return lengths, longest
def visualization(ori_data, generated_data, analysis, compare=3000, output_label=""):
    """Plot original against generated data using PCA, t-SNE or a density estimate.

    Every sample is reduced to one value per sequence position by averaging
    over the last axis. The resulting plot is shown and then written to
    ``./figures/{output_label}_{analysis}.pdf``.

    Args:
    - ori_data: original data, indexed as [sample, seq_len, dim]
    - generated_data: generated synthetic data, indexed the same way
    - analysis: "tsne", "pca" or "kernel"; any other value produces no plot
    - compare: maximum number of samples used from each data set
    - output_label: prefix of the output PDF file name
    """
    import os

    from matplotlib.backends.backend_pdf import PdfPages

    # The same random subset is taken from both arrays.  Bounding the index
    # range by the smaller array keeps `generated_data[idx]` in range when
    # fewer generated than original samples are supplied.
    n_available = min(ori_data.shape[0], generated_data.shape[0])
    anal_sample_no = min(compare, n_available)
    idx = np.random.permutation(n_available)[:anal_sample_no]

    # Average over the last axis in one vectorized call -> (samples, seq_len),
    # instead of growing the arrays one row at a time.
    prep_data = np.mean(ori_data[idx], axis=2)
    prep_data_hat = np.mean(generated_data[idx], axis=2)

    # Visualization parameter
    colors = [
        "#F4A582",
        "#0571B0",
        "#5E4FA2",
        "#54278F",
    ]
    ori_colors = [colors[0]] * anal_sample_no
    gen_colors = [colors[1]] * anal_sample_no

    fig = None
    if analysis == "pca":
        # The projection is fitted on the original data only and then applied
        # to both data sets.
        pca = PCA(n_components=2)
        pca.fit(prep_data)
        pca_results = pca.transform(prep_data)
        pca_hat_results = pca.transform(prep_data_hat)

        fig, ax = plt.subplots(1, figsize=(8, 6))
        plt.scatter(
            pca_results[:, 0],
            pca_results[:, 1],
            c=ori_colors,
            alpha=0.5,
            label="Original",
        )
        plt.scatter(
            pca_hat_results[:, 0],
            pca_hat_results[:, 1],
            c=gen_colors,
            alpha=0.5,
            label="Generated",
        )
        ax.legend()
        plt.title("PCA plot")
        plt.xlabel("x")
        plt.ylabel("y")
    elif analysis == "tsne":
        # Both data sets are embedded together; the first `anal_sample_no`
        # rows of the result belong to the original data.
        prep_data_final = np.concatenate((prep_data, prep_data_hat), axis=0)
        # NOTE(review): newer scikit-learn releases rename `n_iter` to
        # `max_iter` -- confirm against the installed version.
        tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300)
        tsne_results = tsne.fit_transform(prep_data_final)

        fig, ax = plt.subplots(1, figsize=(8, 6))
        plt.scatter(
            tsne_results[:anal_sample_no, 0],
            tsne_results[:anal_sample_no, 1],
            c=ori_colors,
            alpha=0.5,
            label="Original",
        )
        plt.scatter(
            tsne_results[anal_sample_no:, 0],
            tsne_results[anal_sample_no:, 1],
            c=gen_colors,
            alpha=0.5,
            label="Generated",
        )
        ax.legend()
        plt.title("t-SNE plot")
        plt.xlabel("x")
        plt.ylabel("y")
    elif analysis == "kernel":
        fig, ax = plt.subplots(1, figsize=(8, 6))
        # NOTE(review): seaborn marks `distplot` as deprecated; it is kept so
        # the rendered curves stay the same.
        sns.distplot(
            prep_data,
            hist=False,
            kde=True,
            kde_kws={"linewidth": 2},
            label="Original",
            color=colors[0],
        )
        sns.distplot(
            prep_data_hat,
            hist=False,
            kde=True,
            kde_kws={"linewidth": 2, "linestyle": "--"},
            label="Generated",
            color=colors[1],
        )
        plt.legend()
        plt.xlabel("Data Value")
        plt.ylabel("Data Density Estimate")

    if fig is not None:
        plt.show()
        pdf_path = f"./figures/{output_label}_{analysis}.pdf"
        # Create the output directory on demand so saving does not fail on a
        # fresh checkout.
        os.makedirs(os.path.dirname(pdf_path), exist_ok=True)
        # The context manager closes the PDF even if saving raises.
        with PdfPages(pdf_path) as pdf:
            pdf.savefig(fig)
    # Release the current figure so repeated calls do not accumulate figures.
    plt.close()
def visualization_control(data, analysis, compare=3000, output_label=""):
    """Plot original data against several generated data sets.

    Every sample is reduced to one value per sequence position by averaging
    over the last axis. The plot (PCA, t-SNE or density estimate) is shown and
    then written to ``./figures/{output_label}_{analysis}.pdf``. The `data`
    dictionary passed in is not modified.

    Args:
    - data: dictionary holding the original data under "ori_data" and one
      generated data set per remaining key; the key is used as legend label.
      Every array is indexed as [sample, seq_len, dim].
    - analysis: "tsne", "pca" or "kernel"; any other value produces no plot
    - compare: maximum number of samples used from each data set
    - output_label: prefix of the output PDF file name
    """
    import os

    from matplotlib.backends.backend_pdf import PdfPages

    ori_data = data.get("ori_data")
    keys = list(data.keys())
    keys.remove("ori_data")

    # The same random subset is taken from every array.  Bounding the index
    # range by the smallest array keeps the indexing in range when a generated
    # data set has fewer samples than the original data.
    n_available = min([ori_data.shape[0]] + [data[key].shape[0] for key in keys])
    anal_sample_no = min(compare, n_available)
    idx = np.random.permutation(n_available)[:anal_sample_no]

    # Average over the last axis in one vectorized call -> (samples, seq_len).
    # The subsampled arrays are kept in a local dict so the caller's `data`
    # stays untouched.
    prep_data = np.mean(ori_data[idx], axis=2)
    preprossed_data = {key: np.mean(data[key][idx], axis=2) for key in keys}

    # Visualization parameter
    colors = [
        "#CA0020",
        "#F4A582",
        "#92C5DE",
        "#0571B0",
        "#5E4FA2",
        "#54278F",
        "#6A3D9A",
        "#9E0142",
        "#D53E4F",
        "#F46D43",
        "#FDAE61",
        "#FEE08B",
    ] * 3

    def _series_color(position):
        # Color 0 is reserved for the original data; wrap around instead of
        # running past the end of the palette.
        return colors[(position + 1) % len(colors)]

    fig = None
    if analysis == "pca":
        # The projection is fitted on the original data only and then applied
        # to every generated data set.
        pca = PCA(n_components=2)
        pca.fit(prep_data)
        pca_results = pca.transform(prep_data)
        pca_control_results = {
            key: pca.transform(preprossed_data[key]) for key in keys
        }

        fig, ax = plt.subplots(1, figsize=(8, 6))
        plt.scatter(
            pca_results[:, 0],
            pca_results[:, 1],
            c=[colors[0]] * anal_sample_no,
            alpha=0.5,
            label="Original",
        )
        for i, key in enumerate(keys):
            plt.scatter(
                pca_control_results[key][:, 0],
                pca_control_results[key][:, 1],
                c=[_series_color(i)] * anal_sample_no,
                alpha=0.5,
                label=key,
            )
        ax.legend()
        plt.title("PCA plot")
        plt.xlabel("x")
        plt.ylabel("y")
    elif analysis == "tsne":
        # All data sets are embedded together: rows [0, n) are the original
        # data and rows [(i + 1) * n, (i + 2) * n) belong to the i-th key.
        prep_data_final = np.concatenate(
            [prep_data] + [preprossed_data[key] for key in keys], axis=0
        )
        # NOTE(review): newer scikit-learn releases rename `n_iter` to
        # `max_iter` -- confirm against the installed version.
        tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300)
        tsne_results = tsne.fit_transform(prep_data_final)

        fig, ax = plt.subplots(1, figsize=(8, 6))
        plt.scatter(
            tsne_results[:anal_sample_no, 0],
            tsne_results[:anal_sample_no, 1],
            c=[colors[0]] * anal_sample_no,
            alpha=0.5,
            label="Original",
        )
        for i, key in enumerate(keys):
            start = (i + 1) * anal_sample_no
            stop = (i + 2) * anal_sample_no
            plt.scatter(
                tsne_results[start:stop, 0],
                tsne_results[start:stop, 1],
                c=[_series_color(i)] * anal_sample_no,
                alpha=0.5,
                label=key,
            )
        ax.legend()
        plt.title("t-SNE plot")
        plt.xlabel("x")
        plt.ylabel("y")
    elif analysis == "kernel":
        fig, ax = plt.subplots(1, figsize=(8, 6))
        # NOTE(review): seaborn marks `distplot` as deprecated; it is kept so
        # the rendered curves stay the same.
        sns.distplot(
            prep_data,
            hist=False,
            kde=True,
            kde_kws={"linewidth": 2},
            label="Original",
            color=colors[0],
        )
        for i, key in enumerate(keys):
            sns.distplot(
                preprossed_data[key],
                hist=False,
                kde=True,
                kde_kws={"linewidth": 2, "linestyle": "--"},
                label=key,
                color=_series_color(i),
            )
        plt.legend()
        plt.xlabel("Data Value")
        plt.ylabel("Data Density Estimate")

    if fig is not None:
        plt.show()
        pdf_path = f"./figures/{output_label}_{analysis}.pdf"
        # Create the output directory on demand so saving does not fail on a
        # fresh checkout.
        os.makedirs(os.path.dirname(pdf_path), exist_ok=True)
        # The context manager closes the PDF even if saving raises.
        with PdfPages(pdf_path) as pdf:
            pdf.savefig(fig)
    # Release the current figure so repeated calls do not accumulate figures.
    plt.close()
def save_pdf(fig, path):
    """Write `fig` to `path` as a PDF with a tight bounding box.

    Args:
    - fig: object providing a ``savefig`` method
    - path: destination passed through to ``fig.savefig``
    """
    savefig_options = {"format": "pdf", "bbox_inches": "tight"}
    fig.savefig(path, **savefig_options)
# Running this module directly does nothing; it only provides helper functions.
if __name__ == "__main__":
    pass