"""Convert selected CMMLU multiple-choice CSV files into conversation records.

Each CSV row becomes one record with a human turn (question plus options A-D)
and a gpt turn (the answer). The new records are prepended to the records
loaded from ``political_data_with_extra_data.json`` and the combined list is
written to ``cmmlu_political_data_gaokao.json``.
"""
import json
import os

import pandas as pd

# Existing conversation records; the CMMLU records are placed in front of them.
with open('./political_data_with_extra_data.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Ids of the new records continue counting from the number of existing records.
# (Kept in its own name so the builtin ``len`` is not overwritten.)
next_id = len(data)

final_folder = 'playground/data/cmmlu'

# Subject files, relative to ``final_folder``, that are converted.
selected_files = [
    'combined_anatomy.csv',
    'combined_ancient_chinese.csv',
    'combined_arts.csv',
    'combined_chinese_civil_service_exam.csv',
    'combined_chinese_foreign_policy.csv',
    'combined_chinese_history.csv',
    'combined_college_education.csv',
    'combined_college_engineering_hydrology.csv',
    'combined_college_mathematics.csv',
    'combined_college_medicine.csv',
    'combined_conceptual_physics.csv',
    'combined_electrical_engineering.csv',
    'combined_elementary_mathematics.csv',
    'combined_food_science.csv',
    'combined_genetics.csv',
    'combined_high_school_biology.csv',
    'combined_high_school_chemistry.csv',
    'combined_high_school_geography.csv',
    'combined_high_school_mathematics.csv',
    'combined_high_school_physics.csv',
    'combined_high_school_politics.csv',
    'combined_legal_and_moral_basis.csv',
    'combined_management.csv',
    'combined_marxist_theory.csv',
    'combined_modern_chinese.csv',
    'combined_philosophy.csv',
    'combined_virology.csv',
    'combined_world_history.csv',
]

cmmlu_list = []
for file_name in selected_files:
    path = os.path.join(final_folder, file_name)
    # Read every cell as text and keep empty cells as "" instead of NaN:
    # with default type inference a numeric-only option column (or an empty /
    # "NA" cell) is not a str, and building the prompt would raise TypeError.
    df = pd.read_csv(path, dtype=str, keep_default_na=False)

    for _, row in df.iterrows():
        # Every option label is followed by a period, including D.
        question = (
            f"{row['Question']}\nA.{row['A']}\nB.{row['B']}"
            f"\nC.{row['C']}\nD.{row['D']}\n"
        )
        dict_item = {
            'id': str(next_id),
            'image': "",
            'conversations': [
                {'from': 'human', 'value': question},
                {'from': 'gpt', 'value': "答案是:" + row['Answer']},
            ],
        }
        next_id += 1

        print(dict_item)
        cmmlu_list.append(dict_item)

# CMMLU records come first, followed by the records loaded above.
data = cmmlu_list + data

with open('cmmlu_political_data_gaokao.json', 'w', encoding='utf-8') as file:
    json.dump(data, file, ensure_ascii=False, indent=4)