""" Data Preparation Module for Diatom Classification. This script parses Pascal VOC XML annotation files to extract bounding box coordinates, crops the individual diatoms from the raw microscopic images, and organizes them into subdirectories based on their Genus for downstream deep learning classification. """ import xml.etree.ElementTree as ET import logging from pathlib import Path from PIL import Image # Configure logging for professional output tracking logging.basicConfig( level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S" ) # Define paths relative to the script's location BASE_DIR: Path = Path(__file__).resolve().parent.parent RAW_DIR: Path = BASE_DIR / 'data' / 'raw' IMG_DIR: Path = RAW_DIR / 'images' XML_DIR: Path = RAW_DIR / 'xmls' OUT_DIR: Path = BASE_DIR / 'data' / 'processed' def get_genus(scientific_name: str) -> str: """ Extracts the Genus (the first word) from a full scientific name. Args: scientific_name (str): The full species name (e.g., 'Encyonema ventricosum'). Returns: str: The extracted Genus (e.g., 'Encyonema'). """ return scientific_name.strip().split(' ')[0] def process_dataset() -> None: """ Iterates through XML annotations, crops the corresponding images, and saves them categorized by Genus. """ # Ensure the output directory exists OUT_DIR.mkdir(parents=True, exist_ok=True) xml_files = list(XML_DIR.glob('*.xml')) logging.info(f"Found {len(xml_files)} XML files. Starting extraction...") processed_count = 0 missing_images = 0 for xml_path in xml_files: try: # Parse the XML file tree = ET.parse(xml_path) root = tree.getroot() # Locate the filename definition in the XML filename_node = root.find('filename') if filename_node is None or not filename_node.text: logging.warning(f"No filename tag found in {xml_path.name}. Skipping.") continue base_name = filename_node.text img_path = IMG_DIR / f"{base_name}.png" if not img_path.exists(): logging.warning(f"Image not found: {img_path.name}. Skipping.") missing_images += 1 continue # Process the corresponding image with Image.open(img_path) as img: objects = root.find('objects') if objects is None: continue # Iterate through every annotated diatom in the image for idx, obj in enumerate(objects.findall('object')): name_node = obj.find('name') if name_node is None or not name_node.text: continue genus = get_genus(name_node.text) # Extract original bounding box coordinates bbox = obj.find('bbox') if bbox is None: continue orig_xmin = int(bbox.find('xmin').text) orig_ymin = int(bbox.find('ymin').text) orig_xmax = int(bbox.find('xmax').text) orig_ymax = int(bbox.find('ymax').text) # Calculate width and height of the diatom box_width = orig_xmax - orig_xmin box_height = orig_ymax - orig_ymin # Define a margin (e.g., 15% extra space on all sides) margin_pct = 0.15 margin_x = int(box_width * margin_pct) margin_y = int(box_height * margin_pct) # Apply margin, ensuring we don't go outside the image boundaries # img.width and img.height come from the PIL Image object xmin = max(0, orig_xmin - margin_x) ymin = max(0, orig_ymin - margin_y) xmax = min(img.width, orig_xmax + margin_x) ymax = min(img.height, orig_ymax + margin_y) # Crop the image with the new wider boundaries (PIL expects a tuple: left, upper, right, lower) cropped_img = img.crop((xmin, ymin, xmax, ymax)) # Create a specific directory for this Genus if it doesn't exist genus_dir = OUT_DIR / genus genus_dir.mkdir(exist_ok=True) # Save the cropped diatom out_filename = f"{base_name}_diatom_{idx}.png" cropped_img.save(genus_dir / out_filename) processed_count += 1 except Exception as e: logging.error(f"Error processing {xml_path.name}: {e}") logging.info(f"Extraction complete! Successfully processed {processed_count} diatoms.") if missing_images > 0: logging.info(f"Total missing images skipped: {missing_images}") if __name__ == "__main__": process_dataset()