# Source: my-llava-moss2 / LLaVA-MOSS2 / add_extra_data.py
# Author: GSheep — commit "add politics" (cb9f2b6)
import json
import pandas as pd
from llava.train.train import train
# Load the base LLaVA v1.5 mix-665k instruction-tuning dataset.
with open('./playground/data/llava_v1_5_mix665k.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Next unused sample id. The base data is assumed to occupy ids [0, len(data));
# new records continue numbering from there. (The original code bound this to
# the name `len`, shadowing the builtin — renamed to avoid that bug.)
next_id = len(data)

# AGIEval gaokao test splits (one parquet per subject) to append as extra
# text-only training samples.
paths = ["/root/.cache/huggingface/hub/datasets--dmayhem93--agieval-gaokao-biology/snapshots/91e58112d4022523e02d07cfbc96a950eac9219f/data/test-00000-of-00001-de5aacbbef2a047d.parquet",
"/root/.cache/huggingface/hub/datasets--dmayhem93--agieval-gaokao-geography/snapshots/bea8c9da6c3ecf4c07a423b36914f1daa1ba6a1e/data/test-00000-of-00001-82c3eb504d984b0c.parquet",
"/root/.cache/huggingface/hub/datasets--dmayhem93--agieval-gaokao-chemistry/snapshots/2fb33cf46ce4aeea9409ea3600a3b1d7e5216536/data/test-00000-of-00001-79e3d766a5e30db5.parquet",
"/root/.cache/huggingface/hub/datasets--dmayhem93--agieval-gaokao-chinese/snapshots/d47e4d2c79b7280a7fb9990a11b036dfb8cdd89b/data/test-00000-of-00001-cb21ebb290e0161f.parquet",
"/root/.cache/huggingface/hub/datasets--dmayhem93--agieval-gaokao-english/snapshots/691d13566972917f1cdc82f4fa1bad1a5b197cab/data/test-00000-of-00001-8025cecb3b3c0c99.parquet",
"/root/.cache/huggingface/hub/datasets--dmayhem93--agieval-gaokao-history/snapshots/41252f835bf3198590df5f4d488d64f78b6fd595/data/test-00000-of-00001-92728bc55381f2f3.parquet",
"/root/.cache/huggingface/hub/datasets--dmayhem93--agieval-gaokao-mathqa/snapshots/54ede00f50d90dab8295c0163e16912ee52f8068/data/test-00000-of-00001-31399d80475862e0.parquet",
"/root/.cache/huggingface/hub/datasets--dmayhem93--agieval-gaokao-physics/snapshots/3f82847f19ead1a682f0b27cc5c829ac964586bb/data/test-00000-of-00001-d34ffce230cd958a.parquet"]

# Convert every parquet row into a LLaVA-style conversation record.
# Appending into one flat list replaces the original per-file
# `datas = datas + dict_list` concatenation, which was quadratic.
extra_records = []
for path in paths:
    df = pd.read_parquet(path)
    for _, row in df.iterrows():
        # Map the 0-based gold option indices to letters, e.g. [0, 2] -> "AC".
        answer = "答案是:" + "".join(chr(ord('A') + option) for option in row['gold'])
        record = {
            'id': str(next_id),
            'image': "",  # text-only sample: no associated image
            'conversations': [
                {'from': 'human', 'value': row['query']},
                {'from': 'gpt', 'value': answer},
            ],
        }
        next_id += 1
        print(record)  # progress/debug trace, kept from the original script
        extra_records.append(record)

data = data + extra_records

# Write the merged dataset.
with open('data_with_extra_data.json', 'w', encoding='utf-8') as file:
    json.dump(data, file, ensure_ascii=False, indent=4)

# Also write a half-size variant (every other sample) for cheaper runs.
new_data = data[::2]
with open('data_with_extra_data_half.json', 'w', encoding='utf-8') as file:
    json.dump(new_data, file, ensure_ascii=False, indent=4)