File size: 6,107 Bytes
5626a1a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
import os
import csv
import json
from PIL import Image
from DataModels.AnnotatedData import AnnotatedData 
from DataModels.Region import Region

def crop_and_save_regions(image_path: str, regions: "list[Region]", output_folder: str, base_filename: str):
    """
    Crop every annotated region out of one image and save each crop as a JPEG.

    Each crop is converted to RGB and written to ``output_folder`` as
    ``{base_filename}_{n}.jpg``, where ``n`` is the 1-based position of the
    region in ``regions``.

    The label of a region is the first non-empty attribute among
    ``medicine_name``, ``dosage``, ``dignostic``, ``symptoms``,
    ``personal_info``, ``numeric_data`` and ``text``; the position of that
    attribute in this list (0-6) is the type code. A region with none of them
    set gets an empty label and type code 0. The language code is 0 when the
    region's language is "English" and 1 otherwise.

    The label is sanitized before it is returned: newlines and double quotes
    are removed and commas are replaced with backticks.

    Args:
        image_path: Path of the source image.
        regions: Annotated regions; each needs ``shape_attributes``
            (x, y, width, height) and ``region_attributes``.
        output_folder: Folder the cropped images are written to.
        base_filename: Prefix used for the cropped image file names.

    Returns:
        A list of ``(cropped_image_path, text, type_code, language_code)``
        tuples, one per region that was cropped and saved successfully. The
        list is empty when ``image_path`` does not exist.
    """
    cropped_data = []
    if not os.path.exists(image_path):
        print(f"Skipping {image_path}: Image file not found.")
        return cropped_data

    # Label attributes in priority order; the index of an entry is its type code.
    label_fields = (
        "medicine_name",
        "dosage",
        "dignostic",  # spelled exactly like the attribute on the region object
        "symptoms",
        "personal_info",
        "numeric_data",
        "text",
    )

    # The context manager closes the image file once all regions are handled,
    # instead of leaving one open handle behind per processed image.
    with Image.open(image_path) as img:
        for idx, region in enumerate(regions):
            try:
                # Bounding box of the region as (left, upper, right, lower)
                shape = region.shape_attributes
                x, y = shape.x, shape.y
                box = (x, y, x + shape.width, y + shape.height)

                # Crop the region and convert it to RGB before writing the .jpg
                cropped_img = img.crop(box).convert("RGB")

                cropped_image_name = f"{base_filename}_{idx + 1}.jpg"
                cropped_image_path = os.path.join(output_folder, cropped_image_name)
                cropped_img.save(cropped_image_path)

                attrs = region.region_attributes
                language_code = 0 if attrs.language == "English" else 1

                # The first populated attribute wins; fall back to an empty label.
                int_type, text = 0, ""
                for code, field in enumerate(label_fields):
                    value = getattr(attrs, field)
                    if value:
                        int_type, text = code, value
                        break

                # str.replace returns a new string, so the result must be
                # assigned back; otherwise the sanitization is a no-op.
                text = text.replace("\n", "").replace("\"", "").replace(",", "`")

                cropped_data.append((cropped_image_path, text, int_type, language_code))
            except Exception as e:
                # Best effort: one bad region must not abort the remaining ones.
                print(f"Error cropping region {idx + 1} from {image_path}: {e}")

    return cropped_data

def process_folders_to_csv_and_crop(base_folder: str, output_csv: str, cropped_images_folder: str):
    """
    Build one CSV of cropped regions from all 'dr' folders under base_folder.

    A sub-folder is processed when its name starts with 'dr' and it contains
    a JSON annotation file named after the folder. Every region of every
    annotated image is cropped into cropped_images_folder, and one row per
    crop is collected under the header row and written to output_csv.
    """
    # The crops need somewhere to go before the first image is handled.
    os.makedirs(cropped_images_folder, exist_ok=True)

    # Header first; data rows are appended as the folders are walked.
    rows = [["Cropped Image Path", "Text","type","language"]]

    for entry in os.listdir(base_folder):
        entry_path = os.path.join(base_folder, entry)
        is_dr_folder = os.path.isdir(entry_path) and entry.startswith("dr")
        if not is_dr_folder:
            continue

        # The annotation file carries the same name as its folder.
        annotation_file = os.path.join(entry_path, f"{entry}.json")
        if not os.path.exists(annotation_file):
            print(f"Skipping {entry_path}: No JSON file found.")
            continue

        with open(annotation_file, "r", encoding="utf-8") as handle:
            annotated_data = AnnotatedData(json.load(handle))

        for _image_id, image_meta in annotated_data.metadata.items():
            source_image = os.path.join(entry_path, image_meta.filename)
            # File name without its extension becomes the prefix of the crops.
            stem = os.path.splitext(image_meta.filename)[0]
            rows.extend(
                crop_and_save_regions(source_image, image_meta.regions, cropped_images_folder, stem)
            )

    # Everything goes into a single CSV file at the end.
    with open(output_csv, "w", newline="", encoding="utf-8") as out:
        csv.writer(out).writerows(rows)

    print(f"CSV file created: {output_csv}")

def clean_second_column(overall_output_csv, output_file):
    """
    Write a cleaned copy of a CSV file and delete the original.

    For every row with at least two columns:
      * column 1 (the text) has commas and double quotes removed and
        newlines replaced by a space;
      * column 0 (the path) has every "./" removed and backslashes turned
        into forward slashes.

    Rows whose cleaned text is empty or whitespace-only are dropped, and so
    are rows with fewer than two columns (for example blank lines), since
    they carry no text at all.

    Args:
        overall_output_csv: Path of the CSV file to read. It is removed after
            the cleaned file has been written.
        output_file: Path of the cleaned CSV file to create.
    """
    with open(overall_output_csv, mode='r', newline='', encoding='utf-8') as infile, \
         open(output_file, mode='w', newline='', encoding='utf-8') as outfile:

        reader = csv.reader(infile)
        writer = csv.writer(outfile)

        for row in reader:
            # csv.reader yields [] for blank lines and short lists for
            # malformed rows; indexing row[1] on those would raise IndexError.
            if len(row) < 2:
                continue

            row[1] = row[1].replace(',', '').replace('"', '').replace('\n', ' ')
            row[0] = row[0].replace("./", "").replace("\\", "/")

            # Keep only rows that still have some text after cleaning.
            if row[1].strip():
                writer.writerow(row)

    # The uncleaned file is only an intermediate artifact.
    os.remove(overall_output_csv)

# Example run: crop every annotated region under ./base_data and build the CSV.
cropped_images_folder = "./all_cropped_images"  # Destination folder for all cropped region images
overall_output_csv_cleaned = "./all_cropped_data_cleaned.csv"  # Final CSV, written by the clean-up step
overall_output_csv = "./all_cropped_data.csv"  # Intermediate CSV, deleted by the clean-up step
base_folder = "./base_data"  # Root directory that holds the dr folders

# Step 1: crop the regions and collect them into the intermediate CSV.
process_folders_to_csv_and_crop(
    base_folder,
    overall_output_csv,
    cropped_images_folder,
)
# Step 2: normalise paths and text, dropping rows that have no text.
clean_second_column(overall_output_csv, overall_output_csv_cleaned)