"""
Data Preparation Module for Diatom Classification.
This script parses Pascal VOC XML annotation files to extract bounding box coordinates,
crops the individual diatoms from the raw microscopic images, and organizes them
into subdirectories based on their Genus for downstream deep learning classification.
"""
import xml.etree.ElementTree as ET
import logging
from pathlib import Path
from PIL import Image
# Emit timestamped, level-tagged log lines at INFO and above.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=logging.INFO,
)

# Project layout, anchored two directory levels above this file.
BASE_DIR: Path = Path(__file__).resolve().parent.parent
# Raw inputs: source images and their XML annotations.
RAW_DIR: Path = BASE_DIR.joinpath('data', 'raw')
IMG_DIR: Path = RAW_DIR.joinpath('images')
XML_DIR: Path = RAW_DIR.joinpath('xmls')
# Destination root for the per-genus crop folders.
OUT_DIR: Path = BASE_DIR.joinpath('data', 'processed')
def get_genus(scientific_name: str) -> str:
    """
    Extracts the Genus (the first word) from a full scientific name.

    Words may be separated by any whitespace (spaces, tabs, newlines), and
    leading/trailing whitespace is ignored.

    Args:
        scientific_name (str): The full species name (e.g., 'Encyonema ventricosum').

    Returns:
        str: The extracted Genus (e.g., 'Encyonema'), or an empty string when
        the name is empty or contains only whitespace.
    """
    # split() without a separator splits on any run of whitespace and ignores
    # leading/trailing whitespace; split(' ') would miss tabs and newlines.
    words = scientific_name.split(maxsplit=1)
    return words[0] if words else ''
def _parse_bbox(bbox: ET.Element) -> "tuple[int, int, int, int] | None":
    """Return (xmin, ymin, xmax, ymax) as ints, or None if a coordinate is missing or not numeric."""
    coords = []
    for tag in ('xmin', 'ymin', 'xmax', 'ymax'):
        text = bbox.findtext(tag)
        if text is None:
            return None
        try:
            coords.append(int(text))
        except ValueError:
            # Accept decimal coordinates such as "12.0" by truncating them.
            try:
                coords.append(int(float(text)))
            except (ValueError, OverflowError):
                return None
    return tuple(coords)


def _expand_box(box, img_width: int, img_height: int, margin_pct: float):
    """Grow a box by margin_pct of its size on every side, clamped to the image bounds."""
    xmin, ymin, xmax, ymax = box
    margin_x = int((xmax - xmin) * margin_pct)
    margin_y = int((ymax - ymin) * margin_pct)
    return (
        max(0, xmin - margin_x),
        max(0, ymin - margin_y),
        min(img_width, xmax + margin_x),
        min(img_height, ymax + margin_y),
    )


def process_dataset(margin_pct: float = 0.15) -> None:
    """
    Iterates through XML annotations, crops the corresponding images,
    and saves them categorized by Genus.

    For every '*.xml' file in XML_DIR the '<filename>' tag names a PNG in
    IMG_DIR. Each '<object>' under '<objects>' is cropped (bounding box plus
    margin) and written to OUT_DIR/<Genus>/<filename>_diatom_<index>.png,
    where <index> is the object's position in the annotation file.

    Malformed objects (missing or non-numeric coordinates, empty or unsafe
    genus, boxes with no area inside the image) are skipped individually with
    a warning, so one bad annotation does not discard the rest of the file.
    Any other error while handling a file is logged and processing continues
    with the next file.

    Args:
        margin_pct (float): Fraction of the box width/height added as extra
            context on each side of the crop. Defaults to 0.15 (15%).
    """
    # Ensure the output directory exists
    OUT_DIR.mkdir(parents=True, exist_ok=True)

    # Sorted so that runs (and their logs) are reproducible across platforms.
    xml_files = sorted(XML_DIR.glob('*.xml'))
    logging.info(f"Found {len(xml_files)} XML files. Starting extraction...")

    processed_count = 0
    missing_images = 0
    skipped_objects = 0

    for xml_path in xml_files:
        try:
            root = ET.parse(xml_path).getroot()

            # Locate the filename definition in the XML
            filename_node = root.find('filename')
            base_name = (filename_node.text or '').strip() if filename_node is not None else ''
            if not base_name:
                logging.warning(f"No filename tag found in {xml_path.name}. Skipping.")
                continue

            img_path = IMG_DIR / f"{base_name}.png"
            if not img_path.exists():
                logging.warning(f"Image not found: {img_path.name}. Skipping.")
                missing_images += 1
                continue

            objects = root.find('objects')
            if objects is None:
                continue

            with Image.open(img_path) as img:
                # idx counts every <object>, including skipped ones, so output
                # names always match the object's position in the XML.
                for idx, obj in enumerate(objects.findall('object')):
                    name_node = obj.find('name')
                    if name_node is None or not name_node.text:
                        continue

                    genus = get_genus(name_node.text)
                    # The genus becomes a directory name: reject values that are
                    # empty or that would escape OUT_DIR.
                    if not genus or genus in {'.', '..'} or '/' in genus or '\\' in genus:
                        logging.warning(
                            f"Unusable genus name for object {idx} in {xml_path.name}. Skipping object."
                        )
                        skipped_objects += 1
                        continue

                    bbox = obj.find('bbox')
                    if bbox is None:
                        continue

                    box = _parse_bbox(bbox)
                    if box is None:
                        logging.warning(
                            f"Invalid bounding box for object {idx} in {xml_path.name}. Skipping object."
                        )
                        skipped_objects += 1
                        continue

                    xmin, ymin, xmax, ymax = _expand_box(box, img.width, img.height, margin_pct)
                    # A box that is inverted or lies outside the image has no
                    # pixels to crop and cannot be saved.
                    if xmax <= xmin or ymax <= ymin:
                        logging.warning(
                            f"Empty bounding box for object {idx} in {xml_path.name}. Skipping object."
                        )
                        skipped_objects += 1
                        continue

                    # PIL expects a tuple: (left, upper, right, lower)
                    cropped_img = img.crop((xmin, ymin, xmax, ymax))

                    # Create a specific directory for this Genus if it doesn't exist
                    genus_dir = OUT_DIR / genus
                    genus_dir.mkdir(exist_ok=True)

                    # Save the cropped diatom
                    out_filename = f"{base_name}_diatom_{idx}.png"
                    cropped_img.save(genus_dir / out_filename)
                    processed_count += 1
        except Exception as e:
            # Best-effort batch job: report the failing file and keep going.
            logging.error(f"Error processing {xml_path.name}: {e}")

    logging.info(f"Extraction complete! Successfully processed {processed_count} diatoms.")
    if missing_images > 0:
        logging.info(f"Total missing images skipped: {missing_images}")
    if skipped_objects > 0:
        logging.info(f"Total malformed objects skipped: {skipped_objects}")
# Run the extraction only when this file is executed as a script, not on import.
if __name__ == "__main__":
    process_dataset()
|