|
|
import json |
|
|
import pandas as pd |
|
|
|
|
|
from llava.train.train import train |
|
|
|
|
|
# Load the base LLaVA v1.5 instruction-tuning mix, append AGIEval Gaokao
# multiple-choice questions converted to the same conversation format, then
# write the merged dataset plus a half-sized subset.
with open('./playground/data/llava_v1_5_mix665k.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Next integer id to assign to newly converted samples. Starts at the size of
# the base set (NOTE(review): assumes base ids are 0..len(data)-1 so new ids
# don't collide — confirm against the base file's 'id' scheme).
next_id = len(data)

# Locally cached parquet shards of the AGIEval Gaokao test splits.
paths = ["/root/.cache/huggingface/hub/datasets--dmayhem93--agieval-gaokao-biology/snapshots/91e58112d4022523e02d07cfbc96a950eac9219f/data/test-00000-of-00001-de5aacbbef2a047d.parquet",
         "/root/.cache/huggingface/hub/datasets--dmayhem93--agieval-gaokao-geography/snapshots/bea8c9da6c3ecf4c07a423b36914f1daa1ba6a1e/data/test-00000-of-00001-82c3eb504d984b0c.parquet",
         "/root/.cache/huggingface/hub/datasets--dmayhem93--agieval-gaokao-chemistry/snapshots/2fb33cf46ce4aeea9409ea3600a3b1d7e5216536/data/test-00000-of-00001-79e3d766a5e30db5.parquet",
         "/root/.cache/huggingface/hub/datasets--dmayhem93--agieval-gaokao-chinese/snapshots/d47e4d2c79b7280a7fb9990a11b036dfb8cdd89b/data/test-00000-of-00001-cb21ebb290e0161f.parquet",
         "/root/.cache/huggingface/hub/datasets--dmayhem93--agieval-gaokao-english/snapshots/691d13566972917f1cdc82f4fa1bad1a5b197cab/data/test-00000-of-00001-8025cecb3b3c0c99.parquet",
         "/root/.cache/huggingface/hub/datasets--dmayhem93--agieval-gaokao-history/snapshots/41252f835bf3198590df5f4d488d64f78b6fd595/data/test-00000-of-00001-92728bc55381f2f3.parquet",
         "/root/.cache/huggingface/hub/datasets--dmayhem93--agieval-gaokao-mathqa/snapshots/54ede00f50d90dab8295c0163e16912ee52f8068/data/test-00000-of-00001-31399d80475862e0.parquet",
         "/root/.cache/huggingface/hub/datasets--dmayhem93--agieval-gaokao-physics/snapshots/3f82847f19ead1a682f0b27cc5c829ac964586bb/data/test-00000-of-00001-d34ffce230cd958a.parquet"]

datas = []
for path in paths:
    df = pd.read_parquet(path)

    dict_list = []
    for index, row in df.iterrows():
        dict_item = {}
        dict_item['id'] = str(next_id)
        next_id += 1

        # These are text-only questions, so the image field stays empty.
        dict_item['image'] = ""

        # Build the two-turn conversation: the question as the human turn,
        # the gold answer as the gpt turn.
        human = {'from': 'human', 'value': row['query']}

        # 'gold' holds 0-based indices of correct options; render them as
        # letters after the prefix "答案是:" ("the answer is:").
        result = "答案是:"
        for option in row['gold']:
            result += chr(ord('A') + option)
        gpt = {'from': 'gpt', 'value': result}

        dict_item['conversations'] = [human, gpt]

        print(dict_item)

        dict_list.append(dict_item)

    # extend() instead of `datas = datas + dict_list`, which re-copied the
    # whole accumulator on every shard (accidental quadratic).
    datas.extend(dict_list)

data = data + datas

with open('data_with_extra_data.json', 'w', encoding='utf-8') as file:
    json.dump(data, file, ensure_ascii=False, indent=4)

# Keep every second sample for a cheaper half-sized training set.
new_data = data[::2]

with open('data_with_extra_data_half.json', 'w', encoding='utf-8') as file:
    json.dump(new_data, file, ensure_ascii=False, indent=4)
|
|
|