| import os |
| import json |
|
|
def get_local_images(root_dir):
    """Collect the relative paths of all local ``.jpg`` images.

    Only the entries directly under ``root_dir`` whose name starts with
    ``"p1"`` are scanned; each of them is walked recursively.

    Args:
        root_dir: Directory whose ``p1*`` entries are searched.

    Returns:
        A set of paths relative to ``root_dir``, always written with
        forward slashes (e.g. ``"p10/.../image.jpg"``).
    """
    # Narrow the listing first so the walk below only sees relevant entries.
    top_level = [name for name in os.listdir(root_dir) if name.startswith("p1")]

    found = set()
    for name in top_level:
        start = os.path.join(root_dir, name)
        for dirpath, _dirnames, filenames in os.walk(start):
            # Backslashes are rewritten to "/" so the result does not depend
            # on the platform's path separator.
            found.update(
                os.path.relpath(os.path.join(dirpath, fname), root_dir).replace("\\", "/")
                for fname in filenames
                if fname.endswith(".jpg")
            )

    return found
|
|
|
|
def get_vqa_images(vqa_json_path):
    """Collect every ``image_path`` value found in a VQA JSON file.

    Args:
        vqa_json_path: Path to a UTF-8 encoded JSON file. The loaded value is
            iterated, and entries without an ``"image_path"`` key are skipped.

    Returns:
        A set containing the distinct ``image_path`` values.
    """
    with open(vqa_json_path, "r", encoding="utf-8") as handle:
        records = json.load(handle)

    # A set comprehension removes duplicates: many records may point at the
    # same image.
    return {record["image_path"] for record in records if "image_path" in record}
|
|
|
|
def main(root_dir, vqa_json_path):
    """Compare local images against the images referenced by a VQA JSON file.

    Prints the number of local images, the number of images referenced by the
    VQA file, how many appear in both, and the fraction of VQA images that
    are available locally. The matched paths are then written, one per line,
    to ``matched_images.txt`` in the current working directory.

    Args:
        root_dir: Directory scanned by ``get_local_images``.
        vqa_json_path: JSON file read by ``get_vqa_images``.
    """
    print("Đang quét ảnh local...")
    local_images = get_local_images(root_dir)
    print(f"Số ảnh local: {len(local_images)}")

    print("Đang đọc VQA json...")
    vqa_images = get_vqa_images(vqa_json_path)
    print(f"Số ảnh trong VQA: {len(vqa_images)}")

    matched = local_images & vqa_images

    # An empty VQA set would otherwise raise ZeroDivisionError; report zero
    # coverage instead so the summary and the output file are still produced.
    coverage = len(matched) / len(vqa_images) if vqa_images else 0.0

    print("\n===== KẾT QUẢ =====")
    print(f"Số ảnh trùng: {len(matched)}")
    print(f"Tỷ lệ cover VQA: {coverage:.4f}")

    # Explicit UTF-8 matches how the JSON is read and avoids encode errors
    # under a non-UTF-8 locale; sorting makes the file reproducible across
    # runs, since set iteration order is not stable.
    with open("matched_images.txt", "w", encoding="utf-8") as f:
        f.writelines(f"{path}\n" for path in sorted(matched))

    print("Đã lưu danh sách vào matched_images.txt")
|
|
|
|
if __name__ == "__main__":
    # Sub-directory of the data folder that holds the local images.
    x = "train"
    # Base name of the VQA JSON file to compare against.
    y = "valid"

    root_dir = rf"D:\USTH\KLTN\data\{x}"
    # Adjacent literals are concatenated; each piece starts with the
    # backslash because a raw string cannot end with one.
    vqa_json = (
        r"D:\USTH\KLTN\data"
        r"\mimic-ext-mimic-cxr-vqa-a-complex-diverse-and-large-scale-visual-question-answering-dataset-for-chest-x-ray-images-1.0.0"
        r"\MIMIC-Ext-MIMIC-CXR-VQA\dataset"
        rf"\{y}.json"
    )

    main(root_dir, vqa_json)