import os
import json

def get_local_images(root_dir):
    """
    Collect the relative paths of all local ``.jpg`` images under ``root_dir``.

    Only top-level entries of ``root_dir`` whose name starts with ``"p1"``
    (e.g. ``p10`` ... ``p19``) are scanned; each of them is walked
    recursively.

    Args:
        root_dir: Directory that contains the ``pXX`` folders.

    Returns:
        A set of paths relative to ``root_dir``, written with forward
        slashes (``p10/.../xxx.jpg``) so they can be compared with the
        ``image_path`` values of the VQA json.
    """
    found = set()

    # Pick the top-level entries to scan up front.
    selected = [name for name in os.listdir(root_dir) if name.startswith("p1")]

    for name in selected:
        subtree = os.path.join(root_dir, name)

        for dirpath, _dirnames, filenames in os.walk(subtree):
            for filename in filenames:
                if not filename.endswith(".jpg"):
                    continue

                # Express the path relative to root_dir and normalise
                # Windows separators to match the VQA format.
                relative = os.path.relpath(
                    os.path.join(dirpath, filename), root_dir
                )
                found.add(relative.replace("\\", "/"))

    return found


def get_vqa_images(vqa_json_path):
    """
    Collect every ``image_path`` value from a VQA json file.

    Args:
        vqa_json_path: Path to a UTF-8 json file whose top-level value is
            iterated item by item.

    Returns:
        A set with the ``image_path`` value of every item that has that
        key; items without the key are skipped.
    """
    with open(vqa_json_path, "r", encoding="utf-8") as handle:
        records = json.load(handle)

    return {
        record["image_path"]
        for record in records
        if "image_path" in record
    }


def main(root_dir, vqa_json_path):
    """
    Report how many VQA images are available locally and save the matches.

    Scans ``root_dir`` for local images, reads the image paths referenced
    by the VQA json, prints the size of both sets, the size of their
    intersection and the fraction of VQA images that exist locally, then
    writes the matched paths (one per line, sorted) to
    ``matched_images.txt`` in the current working directory.

    Args:
        root_dir: Directory containing the local ``pXX`` image folders.
        vqa_json_path: Path to the VQA json file.
    """
    print("Đang quét ảnh local...")
    local_images = get_local_images(root_dir)
    print(f"Số ảnh local: {len(local_images)}")

    print("Đang đọc VQA json...")
    vqa_images = get_vqa_images(vqa_json_path)
    print(f"Số ảnh trong VQA: {len(vqa_images)}")

    # Images present both on disk and in the VQA file.
    matched = local_images & vqa_images

    # A VQA file without any image_path would otherwise raise
    # ZeroDivisionError; report zero coverage instead.
    coverage = len(matched) / len(vqa_images) if vqa_images else 0.0

    print("\n===== KẾT QUẢ =====")
    print(f"Số ảnh trùng: {len(matched)}")
    print(f"Tỷ lệ cover VQA: {coverage:.4f}")

    # Persist the matched list. UTF-8 mirrors the encoding used to read the
    # json, and sorting makes the file reproducible between runs (set
    # iteration order is arbitrary).
    with open("matched_images.txt", "w", encoding="utf-8") as f:
        f.writelines(f"{path}\n" for path in sorted(matched))

    print("Đã lưu danh sách vào matched_images.txt")


if __name__ == "__main__":
    # Split names used to build the two input paths below.
    # NOTE(review): the local folder split and the VQA json split differ
    # here ("train" vs "valid") — confirm this is intended.
    local_split = "train"
    vqa_split = "valid"

    # Folder holding the local pXX image directories, e.g. D:/mimic-cxr
    root_dir = r"D:\USTH\KLTN\data\{split}".format(split=local_split)

    # VQA annotation file for the chosen split, e.g. D:/vqa/train.json
    # (adjacent raw-string pieces are concatenated into one path).
    vqa_json = (
        r"D:\USTH\KLTN\data"
        r"\mimic-ext-mimic-cxr-vqa-a-complex-diverse-and-large-scale"
        r"-visual-question-answering-dataset-for-chest-x-ray-images-1.0.0"
        r"\MIMIC-Ext-MIMIC-CXR-VQA\dataset\{split}.json"
    ).format(split=vqa_split)

    main(root_dir, vqa_json)