import os import random import glob import shutil from ultralytics import YOLO from pathlib import Path from tqdm import tqdm def find_best_model(): base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) candidates = [ os.path.join(base_dir, "runs", "detect", "train2", "weights", "best.pt"), os.path.join(base_dir, "..", "runs", "detect", "train2", "weights", "best.pt"), os.path.join(base_dir, "runs", "detect", "train", "weights", "best.pt"), os.path.join(base_dir, "..", "runs", "detect", "train", "weights", "best.pt"), ] for path in candidates: if os.path.exists(path): return path return None def auto_annotate(num_samples=1000, confidence_threshold=0.6): backend_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) model_path = find_best_model() if not model_path: print("Error: best.pt model not found! Please ensure it exists.") return print(f"Loading YOLO model from: {model_path}") model = YOLO(model_path) raw_images_dir = os.path.join(backend_dir, "scraper", "data", "raw_images") output_dataset_dir = os.path.join(backend_dir, "yolo_dataset", "auto_generated") images_out_dir = os.path.join(output_dataset_dir, "images") labels_out_dir = os.path.join(output_dataset_dir, "labels") os.makedirs(images_out_dir, exist_ok=True) os.makedirs(labels_out_dir, exist_ok=True) # Get all jpg images all_images = glob.glob(os.path.join(raw_images_dir, "*.jpg")) if not all_images: print(f"No images found in {raw_images_dir}") return print(f"Found {len(all_images)} raw images.") # Shuffle and pick num_samples random.shuffle(all_images) samples = all_images[:num_samples] print(f"Starting auto-annotation for {len(samples)} images...") successful_annotated = 0 for img_path in tqdm(samples): filename = os.path.basename(img_path) img_name, ext = os.path.splitext(filename) try: results = model(img_path, conf=confidence_threshold, verbose=False) # If nothing was detected above threshold, skip saving this image # Or you might want to save it as empty background. For now, we only save if detected. has_detections = False label_lines = [] for result in results: for box in result.boxes: cls_id = int(box.cls) # YOLO normalized coordinates: center_x center_y width height # xywhn is normalized 0-1 x, y, w, h = box.xywhn[0] label_lines.append(f"{cls_id} {float(x)} {float(y)} {float(w)} {float(h)}") has_detections = True if has_detections: # 1. Copy image dest_img = os.path.join(images_out_dir, filename) shutil.copy2(img_path, dest_img) # 2. Save label label_path = os.path.join(labels_out_dir, f"{img_name}.txt") with open(label_path, "w", encoding="utf-8") as f: f.write("\n".join(label_lines) + "\n") successful_annotated += 1 except Exception as e: print(f"Error processing {filename}: {e}") print(f"Auto-annotation complete! {successful_annotated} images successfully extracted to {output_dataset_dir}") if __name__ == "__main__": # Test run with 1000 images auto_annotate(num_samples=1000, confidence_threshold=0.6)