ocr_task / dataset_creation.py
kirchik47's picture
Model code modifications
dcc8a5e
raw
history blame contribute delete
504 Bytes
import pandas as pd
import json
import os
# Build dataset.json from the CSV index: one record per CSV row, pairing the
# row's text with the full path of its image file.
dataset = pd.read_csv('data_80k/data.csv')
labels = dataset['image_file']  # image file name for each row
text = dataset['text']  # text paired with each image

# Directory that every image file name is joined onto.
images_path = '/kaggle/input/hindi-english-images/data_80k/output_images/'

# Walk both columns in lockstep instead of looking rows up by integer label,
# so the pairing does not depend on the Series index being exactly 0..n-1.
json_data = [
    {
        "query": "<image>",
        "response": caption,
        "images": [os.path.join(images_path, image_file)],
    }
    for image_file, caption in zip(labels, text)
]

# Write UTF-8 explicitly and keep non-ASCII characters as-is instead of
# \uXXXX escapes (the text is presumably Hindi/English, judging by the image
# directory name); the parsed JSON content is the same either way.
with open('dataset.json', 'w', encoding='utf-8') as f:
    json.dump(json_data, f, ensure_ascii=False)