# import matplotlib.pyplot as plt # import numpy as np # import subprocess # import json # from umap import UMAP # from tqdm import tqdm # def count_lines_shell(file_path): # result = subprocess.run(["wc", "-l", file_path], capture_output=True, text=True) # return int(result.stdout.split()[0]) # def load_chunk(file_path,chunk_size): # lines = count_lines_shell(file_path) # with open(file_path,'r') as file: # dataset = [] # # embed = [] # for i in tqdm(file,total=lines): # data = json.loads(i) # key = list(data.keys())[0] # dataset.append([key,data[key][0]]) # # embed.append(data[key][1]) # if len(dataset)==chunk_size: # return dataset # dataset=[] # # embed=[] # if len(dataset)!=0: # return dataset # if __name__ == '__main__': # file_name = "pocketfm_pure_textlossless_data_stats.json" # bs = -1 # # data = load_chunk(file_name,-1) # embed = np.load("/nlsasfs/home/dubverse/varshulg/work/NeuralSpeak/T2S/pocketfm_embeddings.npy") # print(embed.shape) # plt.scatter(embed[:,0],embed[:,1]) # # plt.imsave("gst_embed.png") # plt.savefig('gst_embed_pocketfm.png')#, dpi=300, bbox_inches='tight')