"""Auto-label a folder of images with a trained YOLO segmentation model.

The script runs the model over every image in ``INPUT_IMG_DIR``, writes one
YOLO-format polygon label file per image, copies the image next to its label
in a train/val folder layout under ``OUTPUT_DATASET_DIR``, and generates a
``data.yaml`` describing the resulting dataset.
"""

import json
import os
import random
import shutil

from tqdm import tqdm
from ultralytics import YOLO

# Weights of the already-trained segmentation model used to produce labels.
MODEL_PATH = r"C:\Users\charu\Documents\goyam\roboflow\runs\segment\yolo26_real_v1\weights\best.pt"
# Flat folder holding the unlabeled images.
INPUT_IMG_DIR = r"C:\Users\charu\Desktop\all new\40000\all_images"
# Root of the dataset that will be created (images/, labels/, data.yaml).
OUTPUT_DATASET_DIR = r"C:\Users\charu\Desktop\all new\40000\goyam_v2_dataset"

# Confidence value forwarded to model.predict(conf=...).
CONF_THRESHOLD = 0.30
# Fraction of the shuffled images assigned to the "train" split; the rest
# go to "val".
SPLIT_RATIO = 0.85
# Batch size forwarded to model.predict(batch=...).
BATCH_SIZE = 16


def _yaml_quote(text):
    """Return *text* as a double-quoted YAML scalar.

    A JSON string is also a valid YAML double-quoted scalar, so this keeps
    backslashes, quotes, ``#`` and ``: `` in paths or class names from being
    misread by the YAML parser.
    """
    return json.dumps(str(text), ensure_ascii=False)


def _polygon_label_lines(result):
    """Build the YOLO segmentation label lines for one prediction result.

    Each line is ``<class_id> x1 y1 x2 y2 ...`` using the normalized polygon
    coordinates from ``result.masks.xyn``. Polygons with fewer than three
    points are skipped because they do not describe a region and would
    yield a malformed label line.
    """
    if result.masks is None or result.boxes is None:
        return []

    class_ids = result.boxes.cls.tolist()
    label_lines = []
    for class_id, polygon in zip(class_ids, result.masks.xyn):
        if len(polygon) < 3:
            continue
        coords = " ".join(f"{x:.6f} {y:.6f}" for x, y in polygon)
        label_lines.append(f"{int(class_id)} {coords}")
    return label_lines


def setup_directories():
    """Create the images/labels x train/val folders under OUTPUT_DATASET_DIR."""
    # NOTE(review): the leading characters in this message look like a
    # mis-decoded emoji -- confirm the file's encoding before changing it.
    print("šŸ“ Creating dataset directories...")
    for kind in ("images", "labels"):
        for split in ("train", "val"):
            os.makedirs(
                os.path.join(OUTPUT_DATASET_DIR, kind, split),
                exist_ok=True,
            )


def generate_yaml(model):
    """Write ``data.yaml`` for the generated dataset.

    Args:
        model: Loaded model object whose ``names`` attribute maps class ids
            to class names.

    The file lists the absolute train/val image folders, the class count and
    the class names keyed by class id. Paths and names are written as quoted
    scalars and the file is encoded as UTF-8 so that unusual characters
    survive a round trip through a YAML parser.
    """
    yaml_path = os.path.join(OUTPUT_DATASET_DIR, "data.yaml")
    names_dict = model.names
    train_dir = os.path.join(OUTPUT_DATASET_DIR, "images", "train")
    val_dir = os.path.join(OUTPUT_DATASET_DIR, "images", "val")

    yaml_lines = [
        f"train: {_yaml_quote(train_dir)}",
        f"val: {_yaml_quote(val_dir)}",
        "",
        f"nc: {len(names_dict)}",
        "names:",
    ]
    # Keyed by class id so the id -> name association is explicit instead of
    # relying on the iteration order of the mapping.
    yaml_lines.extend(
        f"  {class_id}: {_yaml_quote(class_name)}"
        for class_id, class_name in names_dict.items()
    )

    with open(yaml_path, "w", encoding="utf-8") as f:
        f.write("\n".join(yaml_lines) + "\n")
    print(f"Created data.yaml at {yaml_path}")


def auto_label_and_split(seed=None):
    """Label every image in INPUT_IMG_DIR and distribute them into train/val.

    Args:
        seed: Optional seed for the shuffle that decides the split. ``None``
            (the default) gives a different split on every run; any other
            value makes the split reproducible for the same set of files.

    Side effects:
        Creates the dataset folders and ``data.yaml``, writes one ``.txt``
        label file per image (empty when nothing was detected) and copies
        each image into the matching ``images/<split>`` folder.
    """
    setup_directories()

    print(f"Loading : {MODEL_PATH}")
    model = YOLO(MODEL_PATH)
    generate_yaml(model)

    valid_extensions = ('.jpg', '.jpeg', '.png', '.bmp', '.webp')
    # Sorted first so that a given seed always produces the same split,
    # independent of the order os.listdir happens to return.
    all_images = sorted(
        f for f in os.listdir(INPUT_IMG_DIR)
        if f.lower().endswith(valid_extensions)
    )
    total_images = len(all_images)
    print(f"Found {total_images} images. Shuffling and Splitting...")

    if not all_images:
        # Nothing to predict on; avoid handing an empty source to the model.
        print(f"No images with extensions {valid_extensions} in {INPUT_IMG_DIR}")
        return

    random.Random(seed).shuffle(all_images)
    split_idx = int(total_images * SPLIT_RATIO)
    train_images = set(all_images[:split_idx])
    val_images = set(all_images[split_idx:])

    print(f"Starting Auto-Labeling (Batch Size: {BATCH_SIZE})...")
    # stream=True yields results one at a time instead of building a list
    # with every prediction.
    results = model.predict(
        source=INPUT_IMG_DIR,
        stream=True,
        batch=BATCH_SIZE,
        conf=CONF_THRESHOLD,
        verbose=False,
        device="cuda:0",
    )

    for result in tqdm(results, total=total_images, desc="Labeling"):
        img_path = result.path
        filename = os.path.basename(img_path)

        if filename in train_images:
            folder_type = "train"
        elif filename in val_images:
            folder_type = "val"
        else:
            # The predictor enumerates the directory on its own and may pick
            # up files that were filtered out of the split above; those must
            # not silently end up in the validation set.
            continue

        dest_img_path = os.path.join(
            OUTPUT_DATASET_DIR, "images", folder_type, filename
        )
        txt_filename = os.path.splitext(filename)[0] + ".txt"
        dest_txt_path = os.path.join(
            OUTPUT_DATASET_DIR, "labels", folder_type, txt_filename
        )

        lines = _polygon_label_lines(result)
        # An empty label file is still written for images without detections.
        with open(dest_txt_path, "w", encoding="utf-8") as f:
            f.write("\n".join(lines))

        shutil.copy2(img_path, dest_img_path)

    # NOTE(review): same suspected mis-decoded emoji as in setup_directories.
    print("\nšŸŽ‰ Auto-Labeling Complete!")
    print(f"Dataset ready at: {OUTPUT_DATASET_DIR}")
    print("You can now train your V2 model using the newly generated data.yaml!")


if __name__ == "__main__":
    auto_label_and_split()