import epitran
from tqdm import tqdm
import pickle as pkl

'''
Statistical analysis: tokenize the parallel corpora, then use the tokenized
results to run frequency statistics and build an IPA (phonetic) dictionary.
'''


def analyse_by_IPA_statistic(file_lo, file_th, statistic_conclusion_exist=False):
    """Build or analyse IPA token-frequency dictionaries for a Lao/Thai parallel corpus.

    Args:
        file_lo: path to the Lao side of the corpus (one sentence per line).
        file_th: path to the Thai side of the corpus (one sentence per line).
        statistic_conclusion_exist: when True, load the previously pickled
            'IPA_lo_dict' / 'IPA_th_dict' files, drop digit-containing keys,
            and pickle the overlap of the two vocabularies to 'same_list'.
            When False, tokenize both corpora, transliterate every token to
            IPA, count frequencies, and pickle the two dictionaries.
    """
    # Local import: heavy dependency only needed by this function.
    from transformers import AutoTokenizer

    if statistic_conclusion_exist:
        # BUGFIX: files opened via pkl.load(open(...)) were never closed;
        # use context managers throughout.
        with open('IPA_lo_dict', 'rb') as f:
            IPA_lo_dict = pkl.load(f)
        with open('IPA_th_dict', 'rb') as f:
            IPA_th_dict = pkl.load(f)
        # Work on copies so keys can be deleted while iterating the originals.
        IPA_lo_dict_cop = IPA_lo_dict.copy()
        IPA_th_dict_cop = IPA_th_dict.copy()
        # Drop any IPA key containing a digit (numerals / tokenizer artefacts).
        for key_ in IPA_th_dict:
            if any(ch.isdigit() for ch in key_):
                del IPA_th_dict_cop[key_]
        for key_ in IPA_lo_dict:
            if any(ch.isdigit() for ch in key_):
                del IPA_lo_dict_cop[key_]
        # BUGFIX: the lo/th ranking lists were built from the *opposite*
        # dictionaries (lo from th, th from lo); each now uses its own.
        sorted_IPA_lo_tp = sorted(IPA_lo_dict_cop.items(),
                                  key=lambda x: x[1], reverse=True)
        sorted_IPA_th_tp = sorted(IPA_th_dict_cop.items(),
                                  key=lambda x: x[1], reverse=True)
        sorted_IPA_lo = [t[0] for t in sorted_IPA_lo_tp]
        sorted_IPA_th = [t[0] for t in sorted_IPA_th_tp]
        same_list = []
        for idx, ipa in enumerate(sorted_IPA_lo):
            if ipa in sorted_IPA_th:
                # IPA string present in both vocabularies: record it with its
                # frequency rank in each list and its raw counts.
                same_list.append([ipa, idx, sorted_IPA_th.index(ipa),
                                  IPA_lo_dict[ipa], IPA_th_dict[ipa]])
        with open('same_list', 'wb') as f:
            pkl.dump(same_list, f)
        return
    else:
        plm_tokenizer = AutoTokenizer.from_pretrained(
            r'../foundation/E5')
        with open(file_lo, 'r', encoding='utf-8') as f:
            data_lo = f.readlines()
        with open(file_th, 'r', encoding='utf-8') as f:
            data_th = f.readlines()
        IPA_lo_dict = {}
        IPA_th_dict = {}
        print(len(data_lo))
        print(len(data_th))
        # Hoisted out of the loop: Epitran construction is expensive and
        # loop-invariant (was previously rebuilt for every sentence pair).
        epi_lo = epitran.Epitran("lao-Laoo")
        epi_th = epitran.Epitran("tha-Thai")
        for line_lo, line_th in tqdm(zip(data_lo, data_th)):
            # [2:-1] strips special tokens added by the tokenizer.
            # NOTE(review): assumes two leading special tokens — confirm
            # against this tokenizer's output.
            tked_lo = plm_tokenizer(line_lo, max_length=512, padding=True,
                                    truncation=True, return_tensors='pt'
                                    ).encodings[0].tokens[2:-1]
            tked_th = plm_tokenizer(line_th, max_length=512, padding=True,
                                    truncation=True, return_tensors='pt'
                                    ).encodings[0].tokens[2:-1]
            for tok in tked_lo:
                IPA_lo = epi_lo.transliterate(tok)
                # BUGFIX: counts previously started at 2 (get(..., 1) + 1);
                # a first occurrence now counts as 1.
                IPA_lo_dict[IPA_lo] = IPA_lo_dict.get(IPA_lo, 0) + 1
            for tok in tked_th:
                IPA_th = epi_th.transliterate(tok)
                IPA_th_dict[IPA_th] = IPA_th_dict.get(IPA_th, 0) + 1
        with open('IPA_lo_dict', 'wb') as f:
            pkl.dump(IPA_lo_dict, f)
        with open('IPA_th_dict', 'wb') as f:
            pkl.dump(IPA_th_dict, f)


def spliteKeyWord(in_str):
    """Return the set of characters in *in_str* (character-level shingling)."""
    # BUGFIX: original read `return set(list(in_str))4` — the stray trailing
    # '4' was a SyntaxError; `set(in_str)` is the equivalent valid form.
    return set(in_str)


def minhash(str_a, str_b):
    """Character-level Jaccard similarity of two strings, in [0, 1].

    Prints a notice and returns 0.0 when both strings are empty (the union
    of the character sets is empty, so the ratio is undefined).
    """
    score = 0.0
    jaccard_distance = lambda seta, setb: len(seta & setb) / float(len(seta | setb))
    try:
        score = jaccard_distance(spliteKeyWord(str_a), spliteKeyWord(str_b))
    except ZeroDivisionError:
        print('ZeroDivisionError')
    return score


if __name__ == "__main__":
    analyse_by_IPA_statistic('../data/triple/data_lo.txt',
                             '../data/triple/data_th.txt',
                             statistic_conclusion_exist=False)