File size: 5,052 Bytes
490c4a2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5319cf6
490c4a2
 
 
 
5319cf6
 
 
 
490c4a2
5319cf6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
490c4a2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
"""
Data Preparation Module for Diatom Classification.

This script parses Pascal VOC-style XML annotation files (a `<filename>` element plus
`<objects>/<object>` entries, each carrying a `<name>` and a `<bbox>`) to extract bounding
box coordinates, crops the individual diatoms from the raw microscopic images, and organizes
the crops into subdirectories based on their Genus for downstream deep learning classification.
"""

import xml.etree.ElementTree as ET
import logging
from pathlib import Path
from PIL import Image

# Root logger setup: INFO and above, every record stamped with date and time
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=logging.INFO,
)

# Project layout, anchored at the parent of the directory containing this script
BASE_DIR: Path = Path(__file__).resolve().parent.parent
RAW_DIR: Path = BASE_DIR.joinpath('data', 'raw')
IMG_DIR: Path = RAW_DIR.joinpath('images')
XML_DIR: Path = RAW_DIR.joinpath('xmls')
OUT_DIR: Path = BASE_DIR.joinpath('data', 'processed')


def get_genus(scientific_name: str) -> str:
    """
    Extracts the Genus (the first word) from a full scientific name.

    Words may be separated by any whitespace (spaces, tabs, newlines), so names
    taken from pretty-printed XML text are handled as well.

    Args:
        scientific_name (str): The full species name (e.g., 'Encyonema ventricosum').

    Returns:
        str: The extracted Genus (e.g., 'Encyonema'), or an empty string when
        the name is empty or contains only whitespace.
    """
    # split() with no argument splits on runs of any whitespace and ignores
    # leading/trailing whitespace; split(' ') would only split on single spaces.
    words = scientific_name.split()
    return words[0] if words else ''


def _read_bbox(bbox: ET.Element) -> "tuple[int, int, int, int] | None":
    """
    Reads the four integer corner coordinates from a <bbox> element.

    Args:
        bbox (ET.Element): Element expected to hold <xmin>, <ymin>, <xmax>
            and <ymax> children.

    Returns:
        tuple | None: (xmin, ymin, xmax, ymax) as ints, or None when any
        coordinate is missing, empty, or not an integer literal.
    """
    coords = []
    for tag in ('xmin', 'ymin', 'xmax', 'ymax'):
        try:
            # findtext() yields None for a missing tag (TypeError in int())
            # and '' for an empty one (ValueError in int()).
            coords.append(int(bbox.findtext(tag)))
        except (TypeError, ValueError):
            return None
    return tuple(coords)


def _padded_crop_box(box, img_width: int, img_height: int, margin_pct: float):
    """
    Expands a bounding box by a relative margin and clamps it to the image.

    Args:
        box (tuple): Original (xmin, ymin, xmax, ymax) coordinates.
        img_width (int): Width of the source image in pixels.
        img_height (int): Height of the source image in pixels.
        margin_pct (float): Fraction of the box width/height added on each side.

    Returns:
        tuple | None: Clamped (left, upper, right, lower) crop box, or None
        when the resulting box has no area (inverted, empty, or entirely
        outside the image).
    """
    orig_xmin, orig_ymin, orig_xmax, orig_ymax = box
    margin_x = int((orig_xmax - orig_xmin) * margin_pct)
    margin_y = int((orig_ymax - orig_ymin) * margin_pct)

    xmin = max(0, orig_xmin - margin_x)
    ymin = max(0, orig_ymin - margin_y)
    xmax = min(img_width, orig_xmax + margin_x)
    ymax = min(img_height, orig_ymax + margin_y)

    if xmax <= xmin or ymax <= ymin:
        return None
    return xmin, ymin, xmax, ymax


def process_dataset(margin_pct: float = 0.15) -> None:
    """
    Iterates through XML annotations, crops the corresponding images,
    and saves them categorized by Genus.

    Every '*.xml' file in XML_DIR is parsed; the image named by its <filename>
    element is looked up in IMG_DIR as '<filename>.png'. Each annotated object
    is cropped (with a margin, clamped to the image) and written to
    OUT_DIR/<Genus>/<filename>_diatom_<index>.png, where <index> is the
    object's position in the annotation file.

    An object with an unusable name or bounding box is skipped on its own so
    that the remaining objects of the same image are still extracted. Any other
    error while handling a file is logged and processing moves on to the next file.

    Args:
        margin_pct (float): Fraction of the box width/height added on every
            side before cropping. Defaults to 0.15 (15%).
    """
    # Ensure the output directory exists
    OUT_DIR.mkdir(parents=True, exist_ok=True)

    xml_files = list(XML_DIR.glob('*.xml'))
    logging.info("Found %d XML files. Starting extraction...", len(xml_files))

    processed_count = 0
    missing_images = 0

    for xml_path in xml_files:
        try:
            root = ET.parse(xml_path).getroot()

            # Locate the filename definition in the XML; surrounding whitespace
            # from pretty-printed XML must not end up in the image path.
            base_name = (root.findtext('filename') or '').strip()
            if not base_name:
                logging.warning("No filename tag found in %s. Skipping.", xml_path.name)
                continue

            img_path = IMG_DIR / f"{base_name}.png"
            if not img_path.exists():
                logging.warning("Image not found: %s. Skipping.", img_path.name)
                missing_images += 1
                continue

            objects = root.find('objects')
            if objects is None:
                continue

            with Image.open(img_path) as img:
                # Iterate through every annotated diatom in the image
                for idx, obj in enumerate(objects.findall('object')):
                    genus = get_genus(obj.findtext('name') or '')
                    if not genus:
                        # An empty genus would write the crop into OUT_DIR itself.
                        continue

                    bbox = obj.find('bbox')
                    if bbox is None:
                        continue

                    box = _read_bbox(bbox)
                    if box is None:
                        logging.warning(
                            "Invalid bbox coordinates for object %d in %s. Skipping object.",
                            idx, xml_path.name,
                        )
                        continue

                    # PIL expects a tuple: (left, upper, right, lower)
                    crop_box = _padded_crop_box(box, img.width, img.height, margin_pct)
                    if crop_box is None:
                        logging.warning(
                            "Empty crop region for object %d in %s. Skipping object.",
                            idx, xml_path.name,
                        )
                        continue

                    # Create a specific directory for this Genus if it doesn't exist
                    genus_dir = OUT_DIR / genus
                    genus_dir.mkdir(exist_ok=True)

                    # Save the cropped diatom
                    out_filename = f"{base_name}_diatom_{idx}.png"
                    img.crop(crop_box).save(genus_dir / out_filename)
                    processed_count += 1

        except Exception as e:
            # Per-file boundary: one unreadable annotation or image must not
            # abort the whole extraction run.
            logging.error("Error processing %s: %s", xml_path.name, e)

    logging.info("Extraction complete! Successfully processed %d diatoms.", processed_count)
    if missing_images > 0:
        logging.info("Total missing images skipped: %d", missing_images)


# Script entry point: run the extraction only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    process_dataset()