File size: 3,613 Bytes
9d0d562
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
import epitran
from tqdm import tqdm
import pickle as pkl

''' Statistical analysis: tokenize the corpus, then use the tokenization results to run frequency statistics and build an IPA (phonetic transcription) dictionary. '''


def analyse_by_IPA_statistic(file_lo, file_th, statistic_conclusion_exist=False):
    """Build and compare IPA frequency statistics for a Lao/Thai parallel corpus.

    Two modes, selected by ``statistic_conclusion_exist``:

    * ``False`` (default): read both corpus files line-by-line in parallel,
      tokenize each line with the E5 tokenizer, transliterate every token to
      IPA with epitran, count IPA-string frequencies per language, and pickle
      the two frequency dicts to ``IPA_lo_dict`` / ``IPA_th_dict``.
    * ``True``: load the previously pickled dicts, drop IPA keys containing
      digits, sort each language's IPA strings by descending frequency, and
      pickle to ``same_list`` every IPA string shared by both languages,
      together with its rank in each language and its raw counts.

    Parameters
    ----------
    file_lo : str
        Path to the Lao side of the parallel corpus (UTF-8, one line per pair).
    file_th : str
        Path to the Thai side of the parallel corpus.
    statistic_conclusion_exist : bool
        True to analyse the pre-computed pickled dictionaries instead of
        recomputing them.
    """
    from transformers import AutoTokenizer
    if statistic_conclusion_exist:
        # Use context managers so the pickle file handles are always closed
        # (the original left them open).
        with open('IPA_lo_dict', 'rb') as f:
            IPA_lo_dict = pkl.load(f)
        with open('IPA_th_dict', 'rb') as f:
            IPA_th_dict = pkl.load(f)

        # Drop IPA keys that contain any digit (e.g. tone markers); a dict
        # comprehension replaces the original copy-then-delete-while-iterating
        # pattern.
        IPA_lo_clean = {k: v for k, v in IPA_lo_dict.items()
                        if not any(ch.isdigit() for ch in k)}
        IPA_th_clean = {k: v for k, v in IPA_th_dict.items()
                        if not any(ch.isdigit() for ch in k)}

        # BUG FIX: the original swapped the two assignments — sorted_IPA_lo_tp
        # was built from the *Thai* dict and sorted_IPA_th_tp from the *Lao*
        # dict, so the rank comparison below mixed up the languages.
        sorted_IPA_lo = [k for k, _ in sorted(IPA_lo_clean.items(),
                                              key=lambda kv: kv[1], reverse=True)]
        sorted_IPA_th = [k for k, _ in sorted(IPA_th_clean.items(),
                                              key=lambda kv: kv[1], reverse=True)]

        # Precompute Thai ranks once: dict lookup is O(1) versus the original
        # O(n) list.index() per shared element.
        th_rank = {ipa: rank for rank, ipa in enumerate(sorted_IPA_th)}

        same_list = []
        for lo_rank, ipa in enumerate(sorted_IPA_lo):
            if ipa in th_rank:
                # IPA string present in both languages: record the string, its
                # frequency rank in each language, and its raw counts.
                same_list.append([ipa, lo_rank, th_rank[ipa],
                                  IPA_lo_dict[ipa], IPA_th_dict[ipa]])

        with open('same_list', 'wb') as f:
            pkl.dump(same_list, f)
        return
    else:
        plm_tokenizer = AutoTokenizer.from_pretrained(
            r'../foundation/E5')

        with open(file_lo, 'r', encoding='utf-8') as f:
            data_lo = f.readlines()
        with open(file_th, 'r', encoding='utf-8') as f:
            data_th = f.readlines()

        # Hoisted out of the loop: constructing an Epitran instance is
        # expensive and loop-invariant (the original rebuilt both per line).
        epi_lo = epitran.Epitran("lao-Laoo")
        epi_th = epitran.Epitran("tha-Thai")

        IPA_lo_dict = {}
        IPA_th_dict = {}
        print(len(data_lo))
        print(len(data_th))

        for line_lo, line_th in tqdm(zip(data_lo, data_th)):
            # tokens[2:-1] strips the leading special tokens and trailing EOS
            # added by the tokenizer.
            tked_lo = \
                plm_tokenizer(line_lo, max_length=512, padding=True, truncation=True,
                              return_tensors='pt').encodings[0].tokens[2:-1]
            tked_th = \
                plm_tokenizer(line_th, max_length=512, padding=True, truncation=True,
                              return_tensors='pt').encodings[0].tokens[2:-1]

            for tok in tked_lo:
                ipa = epi_lo.transliterate(tok)
                # BUG FIX: the default must be 0, not 1 — the original
                # counted every IPA string's first occurrence as 2.
                IPA_lo_dict[ipa] = IPA_lo_dict.get(ipa, 0) + 1
            for tok in tked_th:
                ipa = epi_th.transliterate(tok)
                IPA_th_dict[ipa] = IPA_th_dict.get(ipa, 0) + 1

        with open('IPA_lo_dict', 'wb') as f:
            pkl.dump(IPA_lo_dict, f)
        with open('IPA_th_dict', 'wb') as f:
            pkl.dump(IPA_th_dict, f)


def spliteKeyWord(in_str):
    """Return the set of distinct characters in *in_str*.

    Used as the shingle set for the character-level Jaccard similarity
    computed in minhash().

    BUG FIX: the original line read ``return set(list(in_str))4`` — the
    stray trailing ``4`` is a SyntaxError that prevented the whole module
    from importing. The redundant ``list()`` wrapper is also dropped:
    ``set()`` accepts any iterable, including a string.
    """
    return set(in_str)


def minhash(str_a, str_b):
    """Character-level Jaccard similarity between two strings.

    Returns a score in [0, 1]: |A ∩ B| / |A ∪ B| over the character sets
    of the two inputs. When both strings are empty the union is empty;
    the resulting ZeroDivisionError is reported and 0.0 is returned.
    """
    score = 0.0
    chars_a = spliteKeyWord(str_a)
    chars_b = spliteKeyWord(str_b)
    try:
        score = len(chars_a & chars_b) / float(len(chars_a | chars_b))
    except ZeroDivisionError:
        print('ZeroDivisionError')

    return score


if __name__ == "__main__":
    analyse_by_IPA_statistic('../data/triple/data_lo.txt', '../data/triple/data_th.txt',
                             statistic_conclusion_exist=False)