Spaces:

senthil2421
/

mlforge

Sleeping

File size: 10,751 Bytes

e10cda2

from pathlib import Path
import json
import re
from typing import Any, List, Tuple, Iterator, Dict
from .base_adapter import DatasetAdapter
from models.dataset import UniversalDatasetItem, DatasetContentType, UniversalAnnotation, UniversalAnnotationType, DatasetTask
from .annotation_parser import YOLOParser, COCOParser, VOCParser, RoboflowTXTParser, _img_dimensions

class YOLOAdapter(DatasetAdapter):
    """Adapter for YOLO-style datasets (``data.yaml`` and/or ``.txt`` label files)."""

    # Text files that commonly sit next to labels but are never label files.
    _NON_LABEL_TXT = frozenset(
        {"classes.txt", "obj.names", "README.txt", "LICENSE.txt", "README.roboflow.txt"}
    )
    # "<int> <num> <num> <num> <num>" at the start of a row.
    _LABEL_ROW = re.compile(r"^\d+\s+[\d\.]+\s+[\d\.]+\s+[\d\.]+\s+[\d\.]+")
    # Upper bound on how many candidate files detect() opens.
    _MAX_PROBES = 25

    def detect(self, dataset_path: Path) -> bool:
        """Return True if *dataset_path* looks like a YOLO dataset.

        A ``data.yaml`` anywhere under the path is sufficient. Otherwise up to
        ``_MAX_PROBES`` candidate ``.txt`` files are inspected and the dataset
        is accepted as soon as one of them starts with a label-shaped row.
        Probing several files (rather than only the first one found) keeps
        detection from failing just because the first file is empty or
        unreadable.
        """
        if next(dataset_path.rglob("data.yaml"), None) is not None:
            return True

        probed = 0
        for txt_file in dataset_path.rglob("*.txt"):
            if txt_file.name in self._NON_LABEL_TXT:
                continue
            if probed >= self._MAX_PROBES:
                break
            probed += 1
            first_row = self._first_nonblank_line(txt_file)
            if first_row is not None and self._LABEL_ROW.match(first_row):
                return True
        return False

    @staticmethod
    def _first_nonblank_line(path: Path):
        """Return the first non-blank line of *path* (stripped), or None.

        None is returned when the file is empty, cannot be read, or is not
        valid UTF-8. The file is read line by line so only its beginning is
        loaded.
        """
        try:
            with path.open(encoding="utf-8") as fh:
                for line in fh:
                    stripped = line.strip()
                    if stripped:
                        return stripped
        except (OSError, UnicodeDecodeError):
            return None
        return None

    def get_task(self, dataset_path: Path) -> DatasetTask:
        """Return the task for this format; always ``DatasetTask.detection``."""
        return DatasetTask.detection

    def get_class_names(self, dataset_path: Path) -> List[str]:
        """Return the class names as loaded by ``YOLOParser.load_class_map``."""
        return YOLOParser.load_class_map(dataset_path)

    def iter_items(self, dataset_id: str, dataset_path: Path) -> Iterator[Tuple[Dict[str, Any], List[Dict[str, Any]]]]:
        """Yield ``(image_record, annotations)`` for every item the parser reports.

        The image record carries ``id``, ``filename``, ``rel_path``, ``width``,
        ``height``, ``split`` and ``ann_count``. Width and height come from
        ``_img_dimensions`` applied to the image path under *dataset_path*.
        """
        class_map = self.get_class_names(dataset_path)
        for rel_path, image_id, split, anns in YOLOParser.iter_dataset(dataset_path, dataset_id, class_map):
            abs_path = dataset_path / rel_path
            w, h = _img_dimensions(abs_path)
            img_rec = {
                "id": image_id, "filename": Path(rel_path).name,
                "rel_path": str(rel_path), "width": w, "height": h,
                "split": split, "ann_count": len(anns),
            }
            yield img_rec, anns

class COCOAdapter(DatasetAdapter):
    """Adapter for COCO-style JSON annotation files."""

    def detect(self, dataset_path: Path) -> bool:
        """Return True if any ``*.json`` under *dataset_path* looks like COCO.

        A file qualifies when both ``"images"`` and ``"annotations"`` appear in
        its first 2048 characters. Only that prefix is read, so large
        annotation files are not loaded into memory just to be sniffed.
        Unreadable files are skipped.
        """
        for jf in dataset_path.rglob("*.json"):
            try:
                # errors="replace" means decoding itself cannot fail here.
                with jf.open(encoding="utf-8", errors="replace") as fh:
                    snippet = fh.read(2048)
            except OSError:
                continue
            if '"images"' in snippet and '"annotations"' in snippet:
                return True
        return False

    def get_task(self, dataset_path: Path) -> DatasetTask:
        """Return the task for this format; always ``DatasetTask.segmentation``."""
        return DatasetTask.segmentation # Roboflow COCO often implies segmentation

    def get_class_names(self, dataset_path: Path) -> List[str]:
        """Return class names from all annotation files, de-duplicated.

        Names keep the order in which they are first encountered across the
        files returned by ``COCOParser.find_annotation_files``.
        """
        # A dict preserves first-seen order while dropping repeats.
        seen: Dict[str, None] = {}
        for ann_file in COCOParser.find_annotation_files(dataset_path):
            # "dummy" stands in for the dataset id; only the class list is used.
            classes, _ = COCOParser.parse_file(ann_file, "dummy")
            seen.update(dict.fromkeys(classes))
        return list(seen)

    def iter_items(self, dataset_id: str, dataset_path: Path) -> Iterator[Tuple[Dict[str, Any], List[Dict[str, Any]]]]:
        """Yield ``(image_record, annotations)`` for every parsed COCO image.

        The image record carries ``id``, ``filename``, ``rel_path``, ``width``,
        ``height``, ``split`` and ``ann_count``. Width and height come from
        ``_img_dimensions`` applied to the image path under *dataset_path*.
        """
        ann_files = COCOParser.find_annotation_files(dataset_path)
        for ann_file in ann_files:
            _, coco_results = COCOParser.parse_file(ann_file, dataset_id)
            for rel_path, image_id, split, anns in coco_results:
                abs_path = dataset_path / rel_path
                w, h = _img_dimensions(abs_path)
                img_rec = {
                    "id": image_id, "filename": Path(rel_path).name,
                    "rel_path": str(rel_path), "width": w, "height": h,
                    "split": split, "ann_count": len(anns),
                }
                yield img_rec, anns

class VOCAdapter(DatasetAdapter):
    """Adapter for Pascal-VOC-style XML annotation files."""

    def detect(self, dataset_path: Path) -> bool:
        """Return True if any ``*.xml`` under *dataset_path* looks like VOC.

        A file qualifies when ``<annotation>`` appears in its first 512
        characters. Only that prefix is read; unreadable files are skipped.
        """
        for xf in dataset_path.rglob("*.xml"):
            try:
                # errors="replace" means decoding itself cannot fail here.
                with xf.open(encoding="utf-8", errors="replace") as fh:
                    snippet = fh.read(512)
            except OSError:
                continue
            if "<annotation>" in snippet:
                return True
        return False

    def get_task(self, dataset_path: Path) -> DatasetTask:
        """Return the task for this format; always ``DatasetTask.detection``."""
        return DatasetTask.detection

    def get_class_names(self, dataset_path: Path) -> List[str]:
        """Return the sorted set of annotation labels found in the dataset."""
        classes = set()
        # "dummy" stands in for the dataset id; only the labels are used.
        for _, _, _, _, _, anns in VOCParser.iter_dataset(dataset_path, "dummy"):
            classes.update(ann["label"] for ann in anns)
        return sorted(classes)

    def iter_items(self, dataset_id: str, dataset_path: Path) -> Iterator[Tuple[Dict[str, Any], List[Dict[str, Any]]]]:
        """Yield ``(image_record, annotations)`` for every item the parser reports.

        Unlike the adapters that call ``_img_dimensions``, width and height are
        taken directly from the values ``VOCParser.iter_dataset`` yields.
        """
        for rel_path, image_id, split, w, h, anns in VOCParser.iter_dataset(dataset_path, dataset_id):
            img_rec = {
                "id": image_id, "filename": Path(rel_path).name,
                "rel_path": str(rel_path), "width": w, "height": h,
                "split": split, "ann_count": len(anns),
            }
            yield img_rec, anns

class CreateMLAdapter(DatasetAdapter):
    """Adapter for CreateML-style JSON.

    The expected shape is a top-level list whose entries have an ``"image"``
    path and an ``"annotations"`` list of ``{"label", "coordinates"}`` objects.
    """

    def detect(self, dataset_path: Path) -> bool:
        """Return True if any ``*.json`` under *dataset_path* looks like CreateML.

        A file qualifies when ``"image"``, ``"annotations"`` and ``[`` all
        appear in its first 1024 characters. Only that prefix is read;
        unreadable files are skipped.
        """
        for jf in dataset_path.rglob("*.json"):
            try:
                # errors="replace" means decoding itself cannot fail here.
                with jf.open(encoding="utf-8", errors="replace") as fh:
                    snippet = fh.read(1024)
            except OSError:
                continue
            if '"image"' in snippet and '"annotations"' in snippet and "[" in snippet:
                return True
        return False

    def get_task(self, dataset_path: Path) -> DatasetTask:
        """Return the task for this format; always ``DatasetTask.detection``."""
        return DatasetTask.detection

    def get_class_names(self, dataset_path: Path) -> List[str]:
        """Return the sorted set of labels found in list-shaped JSON files.

        Best effort: a file that cannot be read or parsed, or whose entries
        are not shaped as expected, stops contributing at the point of
        failure; labels collected before that point are kept.
        """
        classes = set()
        for jf in dataset_path.rglob("*.json"):
            try:
                data = json.loads(jf.read_text(encoding="utf-8"))
                if isinstance(data, list):
                    for item in data:
                        for ann in item.get("annotations", []):
                            if "label" in ann:
                                classes.add(ann["label"])
            except (OSError, ValueError, AttributeError, TypeError):
                # OSError: unreadable file. ValueError: bad UTF-8 or bad JSON.
                # AttributeError/TypeError: entries that are not dict-shaped.
                continue
        return sorted(classes)

    def iter_items(self, dataset_id: str, dataset_path: Path) -> Iterator[Tuple[Dict[str, Any], List[Dict[str, Any]]]]:
        """Yield ``(image_record, annotations)`` for every resolvable image.

        Each ``*.json`` holding a top-level list is read. An entry is emitted
        when its ``"image"`` path exists next to the JSON file or under
        *dataset_path*. Boxes are converted from centre-based pixel
        coordinates to top-left normalised ``(x, y, w, h)``.

        Unreadable or unparsable files and malformed entries are skipped
        individually. No ``yield`` sits inside a ``try`` block, so closing the
        generator early is never intercepted by an exception handler.
        ``_img_dimensions`` and ``_make_ann`` are called unguarded, as in the
        other adapters in this file.
        """
        # Local import: the module header does not import uuid.
        import uuid
        from .annotation_parser import _make_ann

        for jf in dataset_path.rglob("*.json"):
            try:
                data = json.loads(jf.read_text(encoding="utf-8"))
            except (OSError, ValueError):
                # ValueError covers both invalid UTF-8 and invalid JSON.
                continue
            if not isinstance(data, list):
                continue

            split = self._split_from_path(jf)
            for item in data:
                img_path = self._resolve_image(item, jf, dataset_path)
                if img_path is None:
                    continue
                try:
                    rel_path = str(img_path.relative_to(dataset_path))
                except ValueError:
                    # The image path is not located under the dataset root.
                    continue

                image_id = f"img-{uuid.uuid4().hex[:12]}"
                w, h = _img_dimensions(img_path)
                anns = self._collect_boxes(item, image_id, dataset_id, w, h, _make_ann)
                img_rec = {
                    "id": image_id, "filename": img_path.name,
                    "rel_path": rel_path,
                    "width": w, "height": h, "split": split, "ann_count": len(anns)
                }
                yield img_rec, anns

    @staticmethod
    def _split_from_path(jf: Path) -> str:
        """Derive the split name from the components of the JSON file's path."""
        if "val" in jf.parts or "valid" in jf.parts:
            return "val"
        if "test" in jf.parts:
            return "test"
        return "train"

    @staticmethod
    def _resolve_image(item: Any, jf: Path, dataset_path: Path):
        """Return the existing image path for *item*, or None if unresolved.

        The ``"image"`` value is tried relative to the JSON file's directory
        first, then relative to *dataset_path*.
        """
        if not isinstance(item, dict):
            return None
        rel_img_path = item.get("image")
        if not isinstance(rel_img_path, str) or not rel_img_path:
            return None
        for base in (jf.parent, dataset_path):
            candidate = base / rel_img_path
            if candidate.exists():
                return candidate
        return None

    def _collect_boxes(self, item: Dict[str, Any], image_id: str, dataset_id: str,
                       w: Any, h: Any, make_ann: Any) -> List[Dict[str, Any]]:
        """Build annotation records for every well-formed box in *item*.

        Returns an empty list when the image dimensions are not positive,
        because boxes cannot be normalised without them.
        """
        if not (w and h and w > 0 and h > 0):
            return []
        raw_anns = item.get("annotations")
        if not isinstance(raw_anns, list):
            return []

        anns = []
        for ca in raw_anns:
            if not isinstance(ca, dict):
                continue
            box = self._normalized_box(ca.get("coordinates"), w, h)
            if box is None:
                continue
            anns.append(make_ann(image_id, dataset_id, ca.get("label", "unknown"), box))
        return anns

    @staticmethod
    def _normalized_box(coord: Any, w: Any, h: Any):
        """Convert centre-based pixel coords to top-left normalised, or None.

        None is returned when *coord* is not a dict, lacks any of ``x``,
        ``y``, ``width``, ``height``, or holds non-numeric values.
        """
        if not isinstance(coord, dict):
            return None
        try:
            cx = float(coord["x"])
            cy = float(coord["y"])
            bw = float(coord["width"])
            bh = float(coord["height"])
        except (KeyError, TypeError, ValueError):
            return None
        return ((cx - bw / 2) / w, (cy - bh / 2) / h, bw / w, bh / h)

class NLPAdapter(DatasetAdapter):
    """Adapter selected for datasets that contain CSV or TSV files."""

    def detect(self, dataset_path: Path) -> bool:
        """Return True if a ``*.csv`` or ``*.tsv`` file exists under the path."""
        for pattern in ("*.csv", "*.tsv"):
            first_match = next(dataset_path.rglob(pattern), None)
            if first_match is not None:
                return True
        return False

    def get_task(self, dataset_path: Path) -> DatasetTask:
        """Return the task for this format; always ``DatasetTask.nlp``."""
        task = DatasetTask.nlp
        return task

    def get_class_names(self, dataset_path: Path) -> List[str]:
        """Return the class names; currently always an empty list."""
        no_classes: List[str] = []
        return no_classes

    def iter_items(self, dataset_id: str, dataset_path: Path) -> Iterator[Tuple[Dict[str, Any], List[Dict[str, Any]]]]:
        """Yield a single pair of an empty record and an empty annotation list."""
        empty_record: Dict[str, Any] = {}
        no_annotations: List[Dict[str, Any]] = []
        yield empty_record, no_annotations

class TabularAdapter(DatasetAdapter):
    """Placeholder adapter for tabular datasets; it never claims a dataset."""

    def detect(self, dataset_path: Path) -> bool:
        """Return False unconditionally (detection is a placeholder)."""
        claimed = False
        return claimed

    def get_task(self, dataset_path: Path) -> DatasetTask:
        """Return the task for this format; always ``DatasetTask.classification``."""
        task = DatasetTask.classification
        return task

    def get_class_names(self, dataset_path: Path) -> List[str]:
        """Return the class names; currently always an empty list."""
        no_classes: List[str] = []
        return no_classes

    def iter_items(self, dataset_id: str, dataset_path: Path) -> Iterator[Tuple[Dict[str, Any], List[Dict[str, Any]]]]:
        """Yield a single pair of an empty record and an empty annotation list."""
        empty_record: Dict[str, Any] = {}
        no_annotations: List[Dict[str, Any]] = []
        yield empty_record, no_annotations

class RoboflowClassificationAdapter(DatasetAdapter):
    """Adapter for classification exports: ``_annotations.txt`` or class folders."""

    def detect(self, dataset_path: Path) -> bool:
        """Return True for an ``_annotations.txt`` file or a class-folder layout.

        The folder layout is recognised when a ``train``, ``valid`` or ``test``
        directory has sub-directories and none of them is named ``images`` or
        ``labels`` (case-insensitive).
        """
        annotation_files = list(dataset_path.rglob("_annotations.txt"))
        if annotation_files:
            return True

        reserved_names = ["images", "labels"]
        for split_name in ("train", "valid", "test"):
            candidate = dataset_path / split_name
            # is_dir() is False for a missing path, so it covers both checks.
            if not candidate.is_dir():
                continue
            class_dirs = [entry for entry in candidate.iterdir() if entry.is_dir()]
            if not class_dirs:
                continue
            if all(entry.name.lower() not in reserved_names for entry in class_dirs):
                return True
        return False

    def get_task(self, dataset_path: Path) -> DatasetTask:
        """Return the task for this format; always ``DatasetTask.classification``."""
        task = DatasetTask.classification
        return task

    def get_class_names(self, dataset_path: Path) -> List[str]:
        """Return the sorted set of annotation labels found in the dataset."""
        labels = set()
        # "dummy" stands in for the dataset id; only the labels are used.
        for _rel, _image_id, _split, anns in RoboflowTXTParser.iter_dataset(dataset_path, "dummy"):
            labels.update(ann["label"] for ann in anns)
        ordered = list(labels)
        ordered.sort()
        return ordered

    def iter_items(self, dataset_id: str, dataset_path: Path) -> Iterator[Tuple[Dict[str, Any], List[Dict[str, Any]]]]:
        """Yield ``(image_record, annotations)`` for every item the parser reports.

        The image record carries ``id``, ``filename``, ``rel_path``, ``width``,
        ``height``, ``split`` and ``ann_count``. Width and height come from
        ``_img_dimensions`` applied to the image path under *dataset_path*.
        """
        parsed = RoboflowTXTParser.iter_dataset(dataset_path, dataset_id)
        for rel_path, image_id, split, anns in parsed:
            width, height = _img_dimensions(dataset_path / rel_path)
            record = dict(
                id=image_id,
                filename=Path(rel_path).name,
                rel_path=str(rel_path),
                width=width,
                height=height,
                split=split,
                ann_count=len(anns),
            )
            yield record, anns