import json

import pandas as pd

# Load the original LLaVA-1.5 instruction-tuning mixture (665k samples).
with open('./playground/data/llava_v1_5_mix665k.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# New samples get ids that continue after the last original index.
# (Use a dedicated counter instead of shadowing the built-in len().)
next_id = len(data)

# Local Hugging Face cache snapshots of the AGIEval Gaokao test splits.
paths = [
    "/root/.cache/huggingface/hub/datasets--dmayhem93--agieval-gaokao-biology/snapshots/91e58112d4022523e02d07cfbc96a950eac9219f/data/test-00000-of-00001-de5aacbbef2a047d.parquet",
    "/root/.cache/huggingface/hub/datasets--dmayhem93--agieval-gaokao-geography/snapshots/bea8c9da6c3ecf4c07a423b36914f1daa1ba6a1e/data/test-00000-of-00001-82c3eb504d984b0c.parquet",
    "/root/.cache/huggingface/hub/datasets--dmayhem93--agieval-gaokao-chemistry/snapshots/2fb33cf46ce4aeea9409ea3600a3b1d7e5216536/data/test-00000-of-00001-79e3d766a5e30db5.parquet",
    "/root/.cache/huggingface/hub/datasets--dmayhem93--agieval-gaokao-chinese/snapshots/d47e4d2c79b7280a7fb9990a11b036dfb8cdd89b/data/test-00000-of-00001-cb21ebb290e0161f.parquet",
    "/root/.cache/huggingface/hub/datasets--dmayhem93--agieval-gaokao-english/snapshots/691d13566972917f1cdc82f4fa1bad1a5b197cab/data/test-00000-of-00001-8025cecb3b3c0c99.parquet",
    "/root/.cache/huggingface/hub/datasets--dmayhem93--agieval-gaokao-history/snapshots/41252f835bf3198590df5f4d488d64f78b6fd595/data/test-00000-of-00001-92728bc55381f2f3.parquet",
    "/root/.cache/huggingface/hub/datasets--dmayhem93--agieval-gaokao-mathqa/snapshots/54ede00f50d90dab8295c0163e16912ee52f8068/data/test-00000-of-00001-31399d80475862e0.parquet",
    "/root/.cache/huggingface/hub/datasets--dmayhem93--agieval-gaokao-physics/snapshots/3f82847f19ead1a682f0b27cc5c829ac964586bb/data/test-00000-of-00001-d34ffce230cd958a.parquet",
]

extra_data = []
for path in paths:
    df = pd.read_parquet(path)
    for _, row in df.iterrows():
        # Build one sample in LLaVA conversation format.
        human = {'from': 'human', 'value': row['query']}
        # 'gold' lists the 0-based indices of the correct options; map
        # each index to its letter. "答案是:" means "The answer is:".
        answer = "答案是:"
        for option in row['gold']:
            answer += chr(ord('A') + option)
        gpt = {'from': 'gpt', 'value': answer}
        extra_data.append({
            'id': str(next_id),
            'image': "",  # text-only samples, so no image is attached
            'conversations': [human, gpt],
        })
        next_id += 1

data = data + extra_data

# Write the merged dataset with json.dump(); ensure_ascii=False keeps
# the Chinese text readable in the output file.
with open('data_with_extra_data.json', 'w', encoding='utf-8') as file:
    json.dump(data, file, ensure_ascii=False, indent=4)

# Also write a half-sized variant (every second sample).
new_data = data[::2]
with open('data_with_extra_data_half.json', 'w', encoding='utf-8') as file:
    json.dump(new_data, file, ensure_ascii=False, indent=4)
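
# Optional sanity check (a minimal sketch, assuming it runs in the same
# session so next_id is still in scope): reload the merged file and confirm
# that the record count matches the ids issued and that an appended record
# follows the LLaVA schema ('id', 'image', 'conversations').
with open('data_with_extra_data.json', 'r', encoding='utf-8') as file:
    merged = json.load(file)
assert len(merged) == next_id, "expected one record per id issued"
assert set(merged[-1]) == {'id', 'image', 'conversations'}
print(f"total samples: {len(merged)}; last id: {merged[-1]['id']}")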