import json
import re
import uuid
from pathlib import Path
from typing import Any, List, Tuple, Iterator, Dict

from .base_adapter import DatasetAdapter
from models.dataset import (
    UniversalDatasetItem,
    DatasetContentType,
    UniversalAnnotation,
    UniversalAnnotationType,
    DatasetTask,
)
from .annotation_parser import (
    YOLOParser,
    COCOParser,
    VOCParser,
    RoboflowTXTParser,
    _img_dimensions,
)

# ``*.txt`` files that are skipped when looking for a YOLO label file.
_NON_LABEL_TXT_NAMES = frozenset(
    ("classes.txt", "obj.names", "README.txt", "LICENSE.txt", "README.roboflow.txt")
)

# An integer followed by four numeric fields, separated by whitespace.
_YOLO_LABEL_LINE = re.compile(r"^\d+\s+[\d\.]+\s+[\d\.]+\s+[\d\.]+\s+[\d\.]+")


def _read_head(path: Path, max_chars: int) -> str:
    """Return at most *max_chars* leading characters of *path*.

    The file is decoded as UTF-8 with undecodable bytes replaced, and only the
    head is read, so large annotation files are not loaded in full just to be
    sniffed.
    """
    with path.open(encoding="utf-8", errors="replace") as handle:
        return handle.read(max_chars)


def _image_record(
    image_id: str,
    rel_path: Any,
    width: Any,
    height: Any,
    split: str,
    anns: List[Dict[str, Any]],
) -> Dict[str, Any]:
    """Build the image record dict that every adapter yields from ``iter_items``."""
    return {
        "id": image_id,
        "filename": Path(rel_path).name,
        "rel_path": str(rel_path),
        "width": width,
        "height": height,
        "split": split,
        "ann_count": len(anns),
    }


def _load_createml_items(json_path: Path) -> List[Dict[str, Any]]:
    """Return the dict entries of a JSON file whose top level is a list.

    Returns an empty list when the file cannot be read, is not valid UTF-8
    JSON, or does not hold a list at the top level. Non-dict entries are
    dropped.
    """
    try:
        data = json.loads(json_path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # ValueError covers both JSONDecodeError and UnicodeDecodeError.
        return []
    if not isinstance(data, list):
        return []
    return [item for item in data if isinstance(item, dict)]


def _createml_box(coord: Any, img_w: Any, img_h: Any):
    """Convert a centre-based pixel box to a normalized top-left ``(x, y, w, h)``.

    Returns ``None`` when the image size is missing or non-positive, or when
    *coord* lacks any of ``x``/``y``/``width``/``height`` or holds non-numeric
    values.
    """
    if not isinstance(coord, dict):
        return None
    if not (img_w and img_h and img_w > 0 and img_h > 0):
        return None
    try:
        cx, cy, bw, bh = coord["x"], coord["y"], coord["width"], coord["height"]
        return (
            (cx - bw / 2) / img_w,
            (cy - bh / 2) / img_h,
            bw / img_w,
            bh / img_h,
        )
    except (KeyError, TypeError):
        return None


class YOLOAdapter(DatasetAdapter):
    """Adapter for YOLO-style datasets, read through ``YOLOParser``."""

    def detect(self, dataset_path: Path) -> bool:
        """Return True if *dataset_path* looks like a YOLO dataset.

        A ``data.yaml`` anywhere under the path is sufficient. Otherwise the
        first ``*.txt`` file that is not a known non-label file is inspected,
        and its first line must match the YOLO label pattern.
        """
        if any(dataset_path.rglob("data.yaml")):
            return True

        first_label = next(
            (
                txt
                for txt in dataset_path.rglob("*.txt")
                if txt.name not in _NON_LABEL_TXT_NAMES
            ),
            None,
        )
        if first_label is None:
            return False

        try:
            first_line = first_label.read_text(encoding="utf-8").strip().split("\n")[0]
        except (OSError, UnicodeError):
            # An unreadable or non-UTF-8 file is treated as "not YOLO".
            return False
        return _YOLO_LABEL_LINE.match(first_line) is not None

    def get_task(self, dataset_path: Path) -> DatasetTask:
        """Return the detection task."""
        return DatasetTask.detection

    def get_class_names(self, dataset_path: Path) -> List[str]:
        """Return the class map loaded by ``YOLOParser.load_class_map``."""
        return YOLOParser.load_class_map(dataset_path)

    def iter_items(
        self, dataset_id: str, dataset_path: Path
    ) -> Iterator[Tuple[Dict[str, Any], List[Dict[str, Any]]]]:
        """Yield ``(image_record, annotations)`` for each item from ``YOLOParser``."""
        class_map = self.get_class_names(dataset_path)
        for rel_path, image_id, split, anns in YOLOParser.iter_dataset(
            dataset_path, dataset_id, class_map
        ):
            w, h = _img_dimensions(dataset_path / rel_path)
            yield _image_record(image_id, rel_path, w, h, split, anns), anns


class COCOAdapter(DatasetAdapter):
    """Adapter for COCO-style JSON datasets, read through ``COCOParser``."""

    def detect(self, dataset_path: Path) -> bool:
        """Return True if a JSON file's first 2048 characters name both COCO keys.

        The keys looked for are ``"images"`` and ``"annotations"``.
        """
        for jf in dataset_path.rglob("*.json"):
            try:
                snippet = _read_head(jf, 2048)
            except OSError:
                continue
            if '"images"' in snippet and '"annotations"' in snippet:
                return True
        return False

    def get_task(self, dataset_path: Path) -> DatasetTask:
        """Return the segmentation task."""
        # Roboflow COCO often implies segmentation
        return DatasetTask.segmentation

    def get_class_names(self, dataset_path: Path) -> List[str]:
        """Return class names from all annotation files, de-duplicated in first-seen order."""
        merged: Dict[Any, None] = {}
        for ann_file in COCOParser.find_annotation_files(dataset_path):
            classes, _ = COCOParser.parse_file(ann_file, "dummy")
            # dict.update keeps the position of keys that are already present.
            merged.update(dict.fromkeys(classes))
        return list(merged)

    def iter_items(
        self, dataset_id: str, dataset_path: Path
    ) -> Iterator[Tuple[Dict[str, Any], List[Dict[str, Any]]]]:
        """Yield ``(image_record, annotations)`` for every image in every annotation file."""
        for ann_file in COCOParser.find_annotation_files(dataset_path):
            _, coco_results = COCOParser.parse_file(ann_file, dataset_id)
            for rel_path, image_id, split, anns in coco_results:
                w, h = _img_dimensions(dataset_path / rel_path)
                yield _image_record(image_id, rel_path, w, h, split, anns), anns


class VOCAdapter(DatasetAdapter):
    """Adapter for XML-annotated datasets, read through ``VOCParser``."""

    def detect(self, dataset_path: Path) -> bool:
        """Return True if an XML file's first 512 characters contain ``<annotation``."""
        for xf in dataset_path.rglob("*.xml"):
            try:
                snippet = _read_head(xf, 512)
            except OSError:
                continue
            # The original tested `"" in snippet`, which is always true and so
            # accepted any XML file. NOTE(review): the marker is assumed to be
            # the Pascal VOC root element, lost to markup stripping - confirm.
            if "<annotation" in snippet:
                return True
        return False

    def get_task(self, dataset_path: Path) -> DatasetTask:
        """Return the detection task."""
        return DatasetTask.detection

    def get_class_names(self, dataset_path: Path) -> List[str]:
        """Return the sorted set of annotation labels found by ``VOCParser``."""
        classes = set()
        for _, _, _, _, _, anns in VOCParser.iter_dataset(dataset_path, "dummy"):
            for ann in anns:
                classes.add(ann["label"])
        return sorted(classes)

    def iter_items(
        self, dataset_id: str, dataset_path: Path
    ) -> Iterator[Tuple[Dict[str, Any], List[Dict[str, Any]]]]:
        """Yield ``(image_record, annotations)``; width and height come from the parser."""
        for rel_path, image_id, split, w, h, anns in VOCParser.iter_dataset(
            dataset_path, dataset_id
        ):
            yield _image_record(image_id, rel_path, w, h, split, anns), anns


class CreateMLAdapter(DatasetAdapter):
    """Adapter for CreateML-style JSON: a list of ``{"image", "annotations"}`` items."""

    def detect(self, dataset_path: Path) -> bool:
        """Return True if a JSON file's first 1024 characters look like CreateML.

        The head must contain ``"image"``, ``"annotations"`` and a ``[``.
        """
        for jf in dataset_path.rglob("*.json"):
            try:
                snippet = _read_head(jf, 1024)
            except OSError:
                continue
            if '"image"' in snippet and '"annotations"' in snippet and "[" in snippet:
                return True
        return False

    def get_task(self, dataset_path: Path) -> DatasetTask:
        """Return the detection task."""
        return DatasetTask.detection

    def get_class_names(self, dataset_path: Path) -> List[str]:
        """Return the sorted set of string labels found in every list-shaped JSON file.

        Unreadable files, non-dict annotations and non-string labels are
        skipped instead of aborting the scan.
        """
        classes = set()
        for jf in dataset_path.rglob("*.json"):
            for item in _load_createml_items(jf):
                for ann in item.get("annotations") or []:
                    if not isinstance(ann, dict):
                        continue
                    label = ann.get("label")
                    if isinstance(label, str):
                        classes.add(label)
        return sorted(classes)

    def iter_items(
        self, dataset_id: str, dataset_path: Path
    ) -> Iterator[Tuple[Dict[str, Any], List[Dict[str, Any]]]]:
        """Yield ``(image_record, annotations)`` for each item whose image file exists.

        The image is looked up relative to the JSON file first, then relative
        to *dataset_path*. Items whose image is missing or lies outside
        *dataset_path* are skipped. Boxes are converted from centre-based
        pixels to normalized top-left form; malformed boxes are dropped.
        """
        from .annotation_parser import _make_ann

        for jf in dataset_path.rglob("*.json"):
            # Only the load is guarded. No try block wraps the yield below, so
            # closing this generator early cannot be swallowed.
            items = _load_createml_items(jf)
            if not items:
                continue

            # Determine split from path
            parts = jf.parts
            if "val" in parts or "valid" in parts:
                split = "val"
            elif "test" in parts:
                split = "test"
            else:
                split = "train"

            for item in items:
                rel_img_path = item.get("image")
                if not rel_img_path or not isinstance(rel_img_path, str):
                    continue

                # Try to find the image relative to JSON or root
                img_path = jf.parent / rel_img_path
                if not img_path.exists():
                    img_path = dataset_path / rel_img_path
                if not img_path.exists():
                    continue

                try:
                    rel_path = img_path.relative_to(dataset_path)
                except ValueError:
                    # e.g. an absolute "image" value pointing outside the dataset.
                    continue

                image_id = f"img-{uuid.uuid4().hex[:12]}"
                w, h = _img_dimensions(img_path)

                anns = []
                for ca in item.get("annotations") or []:
                    if not isinstance(ca, dict):
                        continue
                    # CreateML coords are usually center-based pixels: {x, y, width, height}
                    box = _createml_box(ca.get("coordinates", {}), w, h)
                    if box is None:
                        continue
                    label = ca.get("label", "unknown")
                    anns.append(_make_ann(image_id, dataset_id, label, box))

                yield _image_record(image_id, rel_path, w, h, split, anns), anns
class NLPAdapter(DatasetAdapter):
    """Adapter selected for datasets containing CSV or TSV files; parsing is a stub."""

    def detect(self, dataset_path: Path) -> bool:
        """Return True if any ``*.csv`` or ``*.tsv`` file exists under *dataset_path*."""
        for pattern in ("*.csv", "*.tsv"):
            if any(dataset_path.rglob(pattern)):
                return True
        return False

    def get_task(self, dataset_path: Path) -> DatasetTask:
        """Return the NLP task."""
        return DatasetTask.nlp

    def get_class_names(self, dataset_path: Path) -> List[str]:
        """Return an empty list; class-name extraction is not implemented here."""
        return []

    def iter_items(
        self, dataset_id: str, dataset_path: Path
    ) -> Iterator[Tuple[Dict[str, Any], List[Dict[str, Any]]]]:
        """Yield a single empty record with no annotations (stub implementation)."""
        yield {}, []


class TabularAdapter(DatasetAdapter):
    """Placeholder adapter for tabular datasets; it never matches a dataset."""

    def detect(self, dataset_path: Path) -> bool:
        """Always return False; detection is not implemented."""
        return False

    def get_task(self, dataset_path: Path) -> DatasetTask:
        """Return the classification task."""
        return DatasetTask.classification

    def get_class_names(self, dataset_path: Path) -> List[str]:
        """Return an empty list."""
        return []

    def iter_items(
        self, dataset_id: str, dataset_path: Path
    ) -> Iterator[Tuple[Dict[str, Any], List[Dict[str, Any]]]]:
        """Yield a single empty record with no annotations (stub implementation)."""
        yield {}, []


class RoboflowClassificationAdapter(DatasetAdapter):
    """Adapter for classification datasets, read through ``RoboflowTXTParser``."""

    def detect(self, dataset_path: Path) -> bool:
        """Return True for an ``_annotations.txt`` export or a folder-per-class layout.

        The folder layout matches when a ``train``, ``valid`` or ``test``
        directory has at least one subdirectory and none of its subdirectories
        is named ``images`` or ``labels`` (case-insensitive).
        """
        if any(dataset_path.rglob("_annotations.txt")):
            return True

        reserved_names = ("images", "labels")
        for split_name in ("train", "valid", "test"):
            split_dir = dataset_path / split_name
            if not split_dir.is_dir():
                continue
            child_dirs = [entry for entry in split_dir.iterdir() if entry.is_dir()]
            if child_dirs and all(
                entry.name.lower() not in reserved_names for entry in child_dirs
            ):
                return True
        return False

    def get_task(self, dataset_path: Path) -> DatasetTask:
        """Return the classification task."""
        return DatasetTask.classification

    def get_class_names(self, dataset_path: Path) -> List[str]:
        """Return the sorted set of labels over all annotations from the parser."""
        labels = {
            ann["label"]
            for _, _, _, anns in RoboflowTXTParser.iter_dataset(dataset_path, "dummy")
            for ann in anns
        }
        return sorted(labels)

    def iter_items(
        self, dataset_id: str, dataset_path: Path
    ) -> Iterator[Tuple[Dict[str, Any], List[Dict[str, Any]]]]:
        """Yield ``(image_record, annotations)`` for each item from the parser."""
        parsed = RoboflowTXTParser.iter_dataset(dataset_path, dataset_id)
        for rel_path, image_id, split, anns in parsed:
            width, height = _img_dimensions(dataset_path / rel_path)
            record = {
                "id": image_id,
                "filename": Path(rel_path).name,
                "rel_path": str(rel_path),
                "width": width,
                "height": height,
                "split": split,
                "ann_count": len(anns),
            }
            yield record, anns