import json
import pickle
import os
from tqdm import tqdm

def build_referring_dataset(instance_path, refs_path, split, save_path):
    assert os.path.exists(instance_path), f'Path not found: {instance_path}'
    assert os.path.exists(refs_path), f'Path not found: {refs_path}'

    with open(instance_path) as f:
        instance = json.load(f)
    with open(refs_path, 'rb') as f:
        refs = pickle.load(f)

    images = instance['images']
    annotations = instance['annotations']

    img_id2info = {}
    for image in images:
        img_id2info[image['id']] = image
    anno_id2info = {}
    for annotation in annotations:
        anno_id2info[annotation['id']] = annotation

    outputs = []
    new_img_id = 0
    for sample in tqdm(refs):
        if sample['split'] != split:
            continue
        sample_annotation = anno_id2info[sample['ann_id']]
        sample_image = img_id2info[sample['image_id']]
        outputs.append(
            {
                'image': sample_image['file_name'],
                'image_info': sample_image,
                'instruction': sample['sentences'],
                'anns': [sample_annotation],
                'new_img_id': new_img_id,
            }
        )
        new_img_id += 1

    with open(save_path, 'w') as f:
        json.dump(outputs, f)
    print(f'Saving at {save_path}. Total sample: {len(outputs)}.')

    

if __name__ == '__main__':
    # Change root path to your own directory
    root_path = 'datasets/refer_seg'
    datasets = ['refcoco', 'refcoco+', 'refcocog']
    splits = ['train', 'val', 'testA', 'testB']
    for dataset in datasets:
        if dataset == 'refcocog':
            splits = ['train', 'val', 'test']

        for split in splits:
            instance_path = os.path.join(root_path, f'{dataset}', 'instances.json')
            if dataset == 'refcocog':
                refs_name = 'refs(umd).p'
            else:
                refs_name = 'refs(unc).p'
            refs_path = os.path.join(root_path, f'{dataset}', refs_name)
            save_path = os.path.join(root_path, f'{dataset}', f'{split}_psalm.json')
            print(f'Processing {dataset}: {split}...')

            build_referring_dataset(instance_path, refs_path, split, save_path)

    print(f'Done')