Spaces:
Running
Running
| from utils.common.data_record import read_json, write_json | |
| import requests | |
| import random | |
| import hashlib | |
| import tqdm | |
| import time | |
# Shared HTTP session so repeated API calls reuse the same TCP connection.
session = requests.Session()


def translate(sentence, from_lang='en', to_lang='zh'):
    """Translate *sentence* via the Baidu Fanyi REST API.

    Args:
        sentence: Source text to translate.
        from_lang: Source language code (default ``'en'``).
        to_lang: Target language code (default ``'zh'``).

    Returns:
        Tuple ``(src, dst)``: the first result's source text and its translation.

    Raises:
        RuntimeError: If the API response contains no ``'trans_result'``
            (e.g. bad signature, quota exceeded); the raw payload is printed
            first so the error code is visible.
    """
    # SECURITY: credentials are hard-coded; move to env vars / config before sharing.
    app_id = '20221004001369410'
    key = 'XEsBS6babmp9wz5bcoEs'
    salt = str(random.randint(1000000000, 9999999999))
    # Baidu signature scheme: MD5 over appid + query + salt + secret key.
    sign = hashlib.md5(f'{app_id}{sentence}{salt}{key}'.encode('utf8')).hexdigest()
    # Use the module-level session (the original created it but never used it)
    # and bound the wait so a stalled request cannot hang the batch job.
    response = session.get(
        'https://fanyi-api.baidu.com/api/trans/vip/translate',
        params={
            'q': sentence,
            'from': from_lang,
            'to': to_lang,
            'appid': app_id,
            'salt': salt,
            'sign': sign,
        },
        timeout=30,
    ).json()
    if 'trans_result' not in response:
        print(response)
        raise RuntimeError(f'Baidu translate API error: {response}')
    first = response['trans_result'][0]
    return first['src'], first['dst']
def gen_label_from_sen_cls_json(sen_cls_json_path):
    """Generate Chinese translations for a sentence-classification JSON file.

    Reads the annotation dict at *sen_cls_json_path*, deduplicates the
    ``'sentence'`` fields, translates each unique sentence, and writes a list
    of ``{'src': ..., 'dst': ...}`` records to
    ``<sen_cls_json_path>.translate_data``.

    Args:
        sen_cls_json_path: Path to a JSON file whose values each carry a
            single-line ``'sentence'`` string.
    """
    anns = read_json(sen_cls_json_path)
    texts = []
    for v in anns.values():
        sentence = v['sentence']
        # Downstream storage assumes single-line sentences.
        assert '\n' not in sentence
        texts.append(sentence)
    # Deduplicate, then sort: plain list(set(...)) iterates in a
    # hash-randomized order, so output order differed between runs.
    texts = sorted(set(texts))
    res_json = []
    for text in tqdm.tqdm(texts):
        time.sleep(1.2)  # throttle to stay under the API's rate limit
        src_text, dst_text = translate(text)
        res_json.append({'src': src_text, 'dst': dst_text})
    write_json(sen_cls_json_path + '.translate_data', res_json, backup=False)
if __name__ == '__main__':
    # res = translate('I am a doctor.\nHello world!')
    # print(res)
    import os

    # Map dataset keys to their on-disk directories. Built with sequential
    # insertion so the key order matches the original merged-dict literal.
    data_dir_paths = {}
    for k in ['HL5Domains-ApexAD2600Progressive', 'HL5Domains-CanonG3',
              'HL5Domains-CreativeLabsNomadJukeboxZenXtra40GB',
              'HL5Domains-NikonCoolpix4300', 'HL5Domains-Nokia6610']:
        data_dir_paths[k] = f'/data/zql/datasets/nlp_asc_19_domains/dat/absa/Bing5Domains/asc/{k.split("-")[1]}'
    for k in ['Liu3Domains-Computer', 'Liu3Domains-Router', 'Liu3Domains-Speaker']:
        data_dir_paths[k] = f'/data/zql/datasets/nlp_asc_19_domains/dat/absa/Bing3Domains/asc/{k.split("-")[1]}'
    for d in os.listdir('/data/zql/datasets/nlp_asc_19_domains/dat/absa/Bing9Domains/asc'):
        k = f'Ding9Domains-{d}'
        data_dir_paths[k] = f'/data/zql/datasets/nlp_asc_19_domains/dat/absa/Bing9Domains/asc/{k.split("-")[1]}'
    for k in ['laptop', 'rest']:
        data_dir_paths[f'SemEval-{k[0].upper()}{k[1:]}'] = f'/data/zql/datasets/nlp_asc_19_domains/dat/absa/XuSemEval/asc/14/{k}'

    # Expand every dataset directory into its train/dev/test JSON files.
    json_paths = []
    for dir_path in data_dir_paths.values():
        for split in ['train', 'dev', 'test']:
            json_paths.append(os.path.join(dir_path, f'{split}.json'))
    assert all(os.path.exists(p) for p in json_paths)
    # print(len(json_paths))
    # exit()
    # NOTE(review): the [23:] slice presumably resumes a previously
    # interrupted run — confirm before reusing this script from scratch.
    for p in tqdm.tqdm(json_paths[23:]):
        print(p)
        gen_label_from_sen_cls_json(p)