""" This script is meant to be run in Google Colab for fine-tuning the YOLOv10 model on the BCCD dataset. It contains all the steps needed for training and should be run before deploying the application. """ import os import glob import zipfile import requests import xml.etree.ElementTree as ET from pathlib import Path import shutil import ultralytics from ultralytics import YOLO import numpy as np import time def download_bccd_dataset(): """ Downloads the BCCD dataset from the GitHub repository. Returns the path to the dataset directory. """ # Install dependencies if needed os.system('pip install ultralytics gdown') # Clone the repository os.system('git clone https://github.com/Shenggan/BCCD_Dataset.git') # Verify download dataset_dir = Path('BCCD_Dataset') if not dataset_dir.exists(): print("Failed to download dataset using git. Trying alternative download...") # Alternative download method using direct download links os.makedirs('BCCD_Dataset/BCCD', exist_ok=True) url = "https://github.com/Shenggan/BCCD_Dataset/archive/refs/heads/master.zip" r = requests.get(url, allow_redirects=True) with open('bccd_dataset.zip', 'wb') as f: f.write(r.content) # Extract the zipfile with zipfile.ZipFile('bccd_dataset.zip', 'r') as zip_ref: zip_ref.extractall('.') # Move contents to the expected location extracted_dir = Path('BCCD_Dataset-master') if extracted_dir.exists(): # Copy contents to the BCCD_Dataset directory for item in extracted_dir.glob('*'): if item.is_dir(): shutil.copytree(item, dataset_dir / item.name) else: shutil.copy(item, dataset_dir / item.name) print("Dataset downloaded successfully.") return dataset_dir def setup_dataset_for_yolo(dataset_path): """ Prepares the BCCD dataset for YOLO format. Args: dataset_path: Path to the downloaded dataset Returns: Path to the processed dataset """ yolo_dir = Path('BCCD_YOLO') os.makedirs(yolo_dir, exist_ok=True) # Create directory structure for split in ['train', 'val', 'test']: os.makedirs(yolo_dir / split / 'images', exist_ok=True) os.makedirs(yolo_dir / split / 'labels', exist_ok=True) # Map sources to destinations splits = { 'train': dataset_path / 'BCCD' / 'train', 'val': dataset_path / 'BCCD' / 'val', 'test': dataset_path / 'BCCD' / 'test' } # Process each split for split_name, split_dir in splits.items(): image_files = list(split_dir.glob('*.jpg')) for img_file in image_files: # Copy image shutil.copy(img_file, yolo_dir / split_name / 'images' / img_file.name) # Convert annotation xml_file = split_dir / f"{img_file.stem}.xml" if xml_file.exists(): txt_file = yolo_dir / split_name / 'labels' / f"{img_file.stem}.txt" convert_annotations(xml_file, txt_file) return yolo_dir def convert_annotations(xml_path, txt_path): """ Converts XML annotations to YOLO format TXT files. Args: xml_path: Path to XML annotation file txt_path: Path to output TXT file """ tree = ET.parse(xml_path) root = tree.getroot() # Get image dimensions size = root.find('size') img_width = int(size.find('width').text) img_height = int(size.find('height').text) # Map class names to IDs class_map = {'RBC': 0, 'WBC': 1, 'Platelets': 2} with open(txt_path, 'w') as f: for obj in root.findall('object'): cls_name = obj.find('name').text if cls_name not in class_map: continue cls_id = class_map[cls_name] # Get bounding box coordinates bbox = obj.find('bndbox') x_min = float(bbox.find('xmin').text) y_min = float(bbox.find('ymin').text) x_max = float(bbox.find('xmax').text) y_max = float(bbox.find('ymax').text) # Convert to YOLO format: center_x, center_y, width, height x_center = (x_min + x_max) / (2.0 * img_width) y_center = (y_min + y_max) / (2.0 * img_height) width = (x_max - x_min) / img_width height = (y_max - y_min) / img_height # Write to file f.write(f"{cls_id} {x_center} {y_center} {width} {height}\n") def create_dataset_yaml(dataset_path): """ Creates the YAML file required by YOLOv10 for training. Args: dataset_path: Path to the processed dataset """ yaml_content = f""" # YOLOv10 dataset config for BCCD path: {dataset_path.absolute()} # Root directory train: train/images # Train images relative to path val: val/images # Validation images relative to path test: test/images # Test images relative to path # Classes names: 0: RBC 1: WBC 2: Platelets # Number of classes nc: 3 """ yaml_path = dataset_path / 'bccd.yaml' with open(yaml_path, 'w') as f: f.write(yaml_content) return yaml_path def train_model(dataset_path): """ Trains YOLOv10 on the BCCD dataset. Args: dataset_path: Path to the processed dataset Returns: Path to the trained model """ # Create YAML config file yaml_path = create_dataset_yaml(dataset_path) # Import required modules import torch # Load a pretrained YOLOv10 model # Note: Use 'yolov10n.pt' for faster training, or 'yolov10s.pt' for better accuracy model = YOLO('yolov10n.pt') # Nano model # Train the model device = '0' if torch.cuda.is_available() else 'cpu' print(f"Training on device: {device}") results = model.train( data=str(yaml_path), epochs=50, # Number of epochs imgsz=640, # Image size batch=16, # Batch size patience=15, # Early stopping patience device=device, project='BCCD_Training', name='yolov10_bccd', seed=42, workers=8 if torch.cuda.is_available() else 1 ) # Get the path to the best model best_model_path = Path('BCCD_Training/yolov10_bccd/weights/best.pt') # Export the model to other formats if needed model.export(format='onnx') # Copy model to Google Drive if running in Colab try: from google.colab import drive drive_path = Path('/content/drive/MyDrive/BCCD_Model') drive_path.mkdir(exist_ok=True, parents=True) model_save_path = drive_path / 'yolov10_bccd.pt' shutil.copy(best_model_path, model_save_path) print(f"Model saved to Google Drive at {model_save_path}") except: print("Not running in Colab or couldn't mount Google Drive.") return best_model_path def main(): """ Main function to execute the fine-tuning process. """ start_time = time.time() print("Step 1: Downloading BCCD dataset...") dataset_path = download_bccd_dataset() print("Step 2: Setting up dataset in YOLO format...") yolo_dataset_path = setup_dataset_for_yolo(dataset_path) print("Step 3: Training YOLOv10 model...") trained_model_path = train_model(yolo_dataset_path) elapsed_time = (time.time() - start_time) / 60 print(f"Training completed in {elapsed_time:.2f} minutes.") print(f"Trained model saved at: {trained_model_path}") return trained_model_path if __name__ == "__main__": main()