| import mojimoji | |
| import pandas as pd | |
| from rapidfuzz import fuzz, process | |
class EntityDictionary:
    """CSV-backed table mapping candidate terms to their normalized forms.

    The file is loaded once with ``pandas.read_csv`` and kept in ``self.df``.
    Both columns are addressed by integer position (they are used with
    ``DataFrame.iloc``).
    """

    def __init__(self, path, candidate_column, normalization_column):
        """Load the dictionary file and remember which columns to use.

        Args:
            path: Location of the CSV file, handed to ``pandas.read_csv``.
            candidate_column: Position of the column holding the candidate
                terms that inputs are matched against.
            normalization_column: Position of the column holding the
                normalized form of each candidate.

        Raises:
            ValueError: If any of the three arguments is ``None``.
        """
        if path is None:
            raise ValueError('Path to dictionary file is not specified.')
        if candidate_column is None:
            raise ValueError('Candidate column is not specified.')
        if normalization_column is None:
            raise ValueError('Normalization column is not specified.')

        self.df = pd.read_csv(path)
        self.candidate_column = candidate_column
        self.normalization_column = normalization_column

    def get_candidates_list(self):
        """Return every value of the candidate column as a list, in row order."""
        return self.df.iloc[:, self.candidate_column].to_list()

    def get_normalization_list(self):
        """Return every value of the normalization column as a list, in row order."""
        return self.df.iloc[:, self.normalization_column].to_list()

    def get_normalized_term(self, term):
        """Return the normalization-column value of the row whose candidate equals ``term``.

        When the same candidate is listed on several rows, the first row wins.

        Args:
            term: Candidate term, compared with ``==`` against the candidate column.

        Returns:
            The value stored in the normalization column (may be NaN when the
            cell is empty in the CSV).

        Raises:
            ValueError: If ``term`` does not occur in the candidate column.
        """
        candidates = self.df.iloc[:, self.candidate_column]
        matches = self.df[candidates == term].iloc[:, self.normalization_column]
        if matches.empty:
            raise ValueError('Term {!r} is not in the dictionary.'.format(term))
        # ``.item()`` only accepts a single-element Series; trimming to the first
        # row keeps it valid (and keeps its scalar conversion) for duplicated candidates.
        return matches.head(1).item()
class DefaultDiseaseDict(EntityDictionary):
    """Dictionary read from ``dictionaries/disease_dict.csv``.

    Candidates are taken from column 0 and normalized terms from column 2.
    """

    def __init__(self):
        super().__init__(
            path='dictionaries/disease_dict.csv',
            candidate_column=0,
            normalization_column=2,
        )
class DefaultDrugDict(EntityDictionary):
    """Dictionary read from ``dictionaries/drug_dict.csv``.

    Candidates are taken from column 0 and normalized terms from column 2.
    """

    def __init__(self):
        super().__init__(
            path='dictionaries/drug_dict.csv',
            candidate_column=0,
            normalization_column=2,
        )
class EntityNormalizer:
    """Map a free-text term to the normalized form of its closest dictionary entry.

    Matching is done with ``rapidfuzz.process.extractOne`` after both the
    input term and the dictionary candidates have been passed through
    ``mojimoji.han_to_zen``.
    """

    def __init__(self, database: EntityDictionary, matching_method=fuzz.ratio, matching_threshold=0):
        """Prepare the candidate list used for matching.

        Args:
            database: Dictionary providing the candidates and their normalized forms.
            matching_method: Scorer passed to ``process.extractOne``.
            matching_threshold: A match is accepted only when its score is
                strictly greater than this value.
        """
        self.database = database
        self.matching_method = matching_method
        self.matching_threshold = matching_threshold
        # Keep the untouched dictionary entries next to the converted ones:
        # ``get_normalized_term`` compares against the strings as stored in the
        # dictionary, so the converted form cannot be used for the lookup.
        self._raw_candidates = self.database.get_candidates_list()
        self.candidates = [mojimoji.han_to_zen(x) for x in self._raw_candidates]

    def normalize(self, term):
        """Return ``(normalized_term, score)`` for the best-matching candidate.

        Args:
            term: Text to normalize.

        Returns:
            A tuple of the normalized term and the match score. The term is
            ``''`` when the score does not exceed ``matching_threshold``, when
            the dictionary holds no normalized value for the match, or when
            there are no candidates at all (score ``0`` in that last case).
        """
        term = mojimoji.han_to_zen(term)
        best = process.extractOne(term, self.candidates, scorer=self.matching_method)
        if best is None:
            # extractOne yields None when there is nothing to match against.
            return '', 0

        _, score, index = best
        if score > self.matching_threshold:
            # ``index`` is the position in ``self.candidates``, which is aligned
            # with ``self._raw_candidates``; look up the original spelling.
            ret = self.database.get_normalized_term(self._raw_candidates[index])
            return ('' if pd.isna(ret) else ret), score
        return '', score