# Snapshot metadata: revision 5626a1a, original file size 6,107 bytes.
import os
import csv
import json
from PIL import Image
from DataModels.AnnotatedData import AnnotatedData
from DataModels.Region import Region
# Region attribute fields in priority order, paired with the integer type code
# recorded for the region. The first non-empty field supplies the region text.
# NOTE: "dignostic" is spelled exactly as the attribute is named on the model.
_TEXT_FIELD_TYPE_CODES = (
    ("medicine_name", 0),
    ("dosage", 1),
    ("dignostic", 2),
    ("symptoms", 3),
    ("personal_info", 4),
    ("numeric_data", 5),
    ("text", 6),
)


def crop_and_save_regions(image_path: str, regions: list[Region], output_folder: str, base_filename: str):
    """
    Crop every annotated region out of an image and save each crop as a JPEG.

    Args:
        image_path: Path of the source image.
        regions: Annotated regions; each provides ``shape_attributes``
            (x, y, width, height) and ``region_attributes`` (language plus
            the text fields).
        output_folder: Folder the crops are written to.
        base_filename: Prefix for crop names; crops are saved as
            ``{base_filename}_{n}.jpg`` with ``n`` starting at 1.

    Returns:
        A list of ``(cropped_image_path, text, int_type, language_int)``
        tuples, one per successfully processed region. ``int_type`` is the
        code of the first non-empty text field (0 when none is set) and
        ``language_int`` is 0 for "English", 1 for anything else. The list is
        empty when the image file does not exist.
    """
    cropped_data = []
    if not os.path.exists(image_path):
        print(f"Skipping {image_path}: Image file not found.")
        return cropped_data

    # Open the image in a context manager so the file handle is released
    # once all regions have been processed.
    with Image.open(image_path) as img:
        for idx, region in enumerate(regions):
            try:
                # Extract region coordinates
                shape = region.shape_attributes
                x, y, width, height = shape.x, shape.y, shape.width, shape.height

                # Crop the region; convert to RGB before saving as JPEG
                cropped_img = img.crop((x, y, x + width, y + height)).convert("RGB")

                # Generate the cropped image name and save it
                cropped_image_name = f"{base_filename}_{idx + 1}.jpg"
                cropped_image_path = os.path.join(output_folder, cropped_image_name)
                cropped_img.save(cropped_image_path)

                attrs = region.region_attributes
                language_int = 0 if attrs.language == "English" else 1

                # Pick the first non-empty text field, in priority order
                int_type, text = 0, ""
                for field_name, type_code in _TEXT_FIELD_TYPE_CODES:
                    value = getattr(attrs, field_name)
                    if value:
                        int_type, text = type_code, value
                        break

                # str.replace returns a new string, so the result must be
                # assigned back for the sanitisation to take effect.
                text = text.replace("\n", "").replace("\"", "").replace(",", "`")

                # Add to the list of cropped data
                cropped_data.append((cropped_image_path, text, int_type, language_int))
            except Exception as e:
                # Best effort: report the failing region and continue with the rest
                print(f"Error cropping region {idx + 1} from {image_path}: {e}")
    return cropped_data
def process_folders_to_csv_and_crop(base_folder: str, output_csv: str, cropped_images_folder: str):
    """
    Walk every 'dr*' sub-folder of ``base_folder``, crop the annotated regions
    of its images into ``cropped_images_folder`` and write one combined CSV.

    A folder is processed only if it is a directory whose name starts with
    'dr' and it contains a JSON file named after the folder. The CSV starts
    with a header row followed by one row per cropped region.
    """
    # The destination for the crops must exist before anything is saved.
    os.makedirs(cropped_images_folder, exist_ok=True)

    header = ["Cropped Image Path", "Text","type","language"]
    rows = [header]

    for entry in os.listdir(base_folder):
        entry_path = os.path.join(base_folder, entry)
        is_dr_folder = os.path.isdir(entry_path) and entry.startswith("dr")
        if not is_dr_folder:
            continue  # Not a dr folder

        annotation_path = os.path.join(entry_path, f"{entry}.json")
        if not os.path.exists(annotation_path):
            print(f"Skipping {entry_path}: No JSON file found.")
            continue

        with open(annotation_path, "r", encoding="utf-8") as handle:
            raw_annotations = json.load(handle)
        annotated_data = AnnotatedData(raw_annotations)

        # One metadata entry per annotated image
        for _image_id, metadata in annotated_data.metadata.items():
            source_image = os.path.join(entry_path, metadata.filename)
            stem, _extension = os.path.splitext(metadata.filename)
            rows.extend(
                crop_and_save_regions(source_image, metadata.regions, cropped_images_folder, stem)
            )

    # Everything collected above goes into a single CSV file
    with open(output_csv, "w", newline="", encoding="utf-8") as handle:
        csv.writer(handle).writerows(rows)
    print(f"CSV file created: {output_csv}")
def clean_second_column(overall_output_csv, output_file):
    """
    Copy ``overall_output_csv`` to ``output_file`` with cleaned rows, then
    delete ``overall_output_csv``.

    For each row with at least two columns: commas and double quotes are
    removed from the second column and newlines become spaces; in the first
    column "./" is removed and backslashes become forward slashes. Rows whose
    cleaned second column is blank, and rows with fewer than two columns, are
    not written.
    """
    # (substring to remove, replacement) pairs applied to the second column
    text_substitutions = ((',', ''), ('"', ''), ('\n', ' '))

    with open(overall_output_csv, mode='r', newline='', encoding='utf-8') as infile, \
            open(output_file, mode='w', newline='', encoding='utf-8') as outfile:
        writer = csv.writer(outfile)
        for row in csv.reader(infile):
            if len(row) <= 1:
                continue  # No second column to clean

            cleaned_text = row[1]
            for unwanted, substitute in text_substitutions:
                cleaned_text = cleaned_text.replace(unwanted, substitute)
            row[1] = cleaned_text

            # Normalise the image path to a relative, forward-slash form
            row[0] = row[0].replace("./","").replace("\\","/")

            if cleaned_text.strip():
                writer.writerow(row)

    # The uncleaned CSV is only an intermediate artefact
    os.remove(overall_output_csv)
# Usage example: builds the combined CSV and the cropped images, then writes
# the cleaned CSV. These statements run at module load.
base_folder = "./base_data"  # Base directory containing the dr folders
overall_output_csv = "./all_cropped_data.csv"  # Intermediate CSV, removed by clean_second_column
overall_output_csv_cleaned = "./all_cropped_data_cleaned.csv"  # Final cleaned CSV
cropped_images_folder = "./all_cropped_images"  # Folder to save all cropped images
process_folders_to_csv_and_crop(base_folder, overall_output_csv, cropped_images_folder)
clean_second_column(overall_output_csv, overall_output_csv_cleaned)