Spaces:

Umer2762
/

Doctor_Handwriting_Text_Detection

Running

App Files Files Community

Doctor_Handwriting_Text_Detection / OutputMetadataCreator.py

Umer2762

Upload folder using huggingface_hub

5626a1a verified 12 months ago

raw

history blame contribute delete

6.11 kB

	import os
	import csv
	import json
	from PIL import Image
	from DataModels.AnnotatedData import AnnotatedData
	from DataModels.Region import Region

	def crop_and_save_regions(image_path: str, regions: list[Region], output_folder: str, base_filename: str):
	"""
	Crops regions from an image and saves them to the output folder.
	Returns a list of tuples containing the cropped image path and the corresponding text.
	"""
	cropped_data = []
	if not os.path.exists(image_path):
	print(f"Skipping {image_path}: Image file not found.")
	return cropped_data

	img = Image.open(image_path)
	for idx, region in enumerate(regions):
	try:
	# Extract region coordinates
	x, y, width, height = (
	region.shape_attributes.x,
	region.shape_attributes.y,
	region.shape_attributes.width,
	region.shape_attributes.height,
	)
	# Crop the region
	cropped_img = img.crop((x, y, x + width, y + height))
	cropped_img = cropped_img.convert("RGB")

	# Generate the cropped image name
	cropped_image_name = f"{base_filename}_{idx + 1}.jpg"
	cropped_image_path = os.path.join(output_folder, cropped_image_name)

	# Save the cropped image
	cropped_img.save(cropped_image_path)
	languageInt = 0
	# Extract text from region attributes
	if region.region_attributes.language == "English":
	languageInt = 0
	else:
	languageInt = 1
	int_type = 0
	text = ""
	if region.region_attributes.medicine_name:
	int_type = 0
	text = region.region_attributes.medicine_name
	elif region.region_attributes.dosage:
	int_type = 1
	text = region.region_attributes.dosage
	elif region.region_attributes.dignostic:
	int_type = 2
	text = region.region_attributes.dignostic
	elif region.region_attributes.symptoms:
	int_type = 3
	text = region.region_attributes.symptoms
	elif region.region_attributes.personal_info:
	int_type = 4
	text = region.region_attributes.personal_info
	elif region.region_attributes.numeric_data:
	int_type = 5
	text = region.region_attributes.numeric_data
	elif region.region_attributes.text:
	int_type = 6
	text = region.region_attributes.text
	text.replace("\n","").replace("\"","").replace(",","`")
	# Add to the list of cropped data
	cropped_data.append((cropped_image_path, text, int_type, languageInt))
	except Exception as e:
	print(f"Error cropping region {idx + 1} from {image_path}: {e}")

	return cropped_data

	def process_folders_to_csv_and_crop(base_folder: str, output_csv: str, cropped_images_folder: str):
	"""
	Processes multiple dr folders containing JSON annotations and images.
	Crops regions from images, saves them to a folder, and consolidates into a single CSV file.
	"""
	os.makedirs(cropped_images_folder, exist_ok=True) # Ensure cropped images folder exists

	# Initialize CSV data
	csv_data = [["Cropped Image Path", "Text","type","language"]]

	# Loop through all folders starting with 'dr'
	for folder_name in os.listdir(base_folder):
	folder_path = os.path.join(base_folder, folder_name)
	if not os.path.isdir(folder_path) or not folder_name.startswith("dr"):
	continue # Skip if not a valid dr folder

	json_path = os.path.join(folder_path, f"{folder_name}.json")
	if not os.path.exists(json_path):
	print(f"Skipping {folder_path}: No JSON file found.")
	continue

	with open(json_path, "r", encoding="utf-8") as file:
	data = json.load(file)
	annotated_data = AnnotatedData(data)

	# Process each image in the annotated data
	for image_id, metadata in annotated_data.metadata.items():
	image_path = os.path.join(folder_path, metadata.filename)
	base_filename = os.path.splitext(metadata.filename)[0] # Remove file extension

	# Crop regions and save to folder
	cropped_data = crop_and_save_regions(image_path, metadata.regions, cropped_images_folder, base_filename)
	# Add cropped data to CSV data
	csv_data.extend(cropped_data)

	# Write to a single CSV file
	with open(output_csv, "w", newline="", encoding="utf-8") as file:
	writer = csv.writer(file)
	writer.writerows(csv_data)

	print(f"CSV file created: {output_csv}")

	def clean_second_column(overall_output_csv, output_file):
	with open(overall_output_csv, mode='r', newline='', encoding='utf-8') as infile, \
	open(output_file, mode='w', newline='', encoding='utf-8') as outfile:

	reader = csv.reader(infile)
	writer = csv.writer(outfile)

	for row in reader:
	if len(row) > 1: # Ensure the second column exists
	row[1] = row[1].replace(',', '').replace('"', '').replace('\n', ' ')
	row[0] = row[0].replace("./","").replace("\\","/")
	if len(row[1].strip()) > 0:
	writer.writerow(row)
	os.remove(overall_output_csv)

	# Usage Example
	base_folder = "./base_data" # Base directory containing dr folders
	overall_output_csv = "./all_cropped_data.csv" # Single output CSV file
	overall_output_csv_cleaned = "./all_cropped_data_cleaned.csv" # Single output CSV file
	cropped_images_folder = "./all_cropped_images" # Folder to save all cropped images

	process_folders_to_csv_and_crop(base_folder, overall_output_csv, cropped_images_folder)
	clean_second_column(overall_output_csv,overall_output_csv_cleaned)