Spaces:

senthil2421
/

mlforge

Sleeping

mlforge / datasets /format_adapters.py

senthil2421

Refactor cloud_backend: remove local execution routes and fix missing modules

e10cda2 29 days ago

10.8 kB

	from pathlib import Path
	import json
	import re
	from typing import Any, List, Tuple, Iterator, Dict
	from .base_adapter import DatasetAdapter
	from models.dataset import UniversalDatasetItem, DatasetContentType, UniversalAnnotation, UniversalAnnotationType, DatasetTask
	from .annotation_parser import YOLOParser, COCOParser, VOCParser, RoboflowTXTParser, _img_dimensions

	class YOLOAdapter(DatasetAdapter):
	    """Adapter for YOLO-style detection datasets.

	    A dataset is recognised either by a ``data.yaml`` file anywhere under the
	    root, or by a ``.txt`` file whose first line looks like a YOLO label row.
	    """

	    # Text files that commonly ship with a dataset but never hold box labels.
	    _NON_LABEL_TXT = ("classes.txt", "obj.names", "README.txt", "LICENSE.txt", "README.roboflow.txt")
	    # Prefix of a label row: an integer followed by four numeric fields.
	    _LABEL_LINE = re.compile(r"^\d+\s+[\d\.]+\s+[\d\.]+\s+[\d\.]+\s+[\d\.]+")

	    def detect(self, dataset_path: Path) -> bool:
	        """Return True when *dataset_path* looks like a YOLO dataset.

	        Only the first candidate ``.txt`` file (in ``rglob`` order) is probed;
	        an unreadable or non-UTF-8 file counts as "not YOLO" instead of raising.
	        """
	        # any()/next() stop at the first hit instead of walking the whole tree.
	        if any(dataset_path.rglob("data.yaml")):
	            return True

	        candidates = (
	            txt for txt in dataset_path.rglob("*.txt")
	            if txt.name not in self._NON_LABEL_TXT
	        )
	        first_label = next(candidates, None)
	        if first_label is None:
	            return False

	        try:
	            first_line = first_label.read_text(encoding="utf-8").strip().split("\n")[0]
	        except (OSError, UnicodeError):
	            # Best-effort probe: an unreadable file simply does not match.
	            return False
	        return self._LABEL_LINE.match(first_line) is not None

	    def get_task(self, dataset_path: Path) -> DatasetTask:
	        """Return the task type for YOLO datasets (always detection)."""
	        return DatasetTask.detection

	    def get_class_names(self, dataset_path: Path) -> List[str]:
	        """Return the class map loaded by ``YOLOParser.load_class_map``."""
	        return YOLOParser.load_class_map(dataset_path)

	    def iter_items(self, dataset_id: str, dataset_path: Path) -> Iterator[Tuple[Dict[str, Any], List[Dict[str, Any]]]]:
	        """Yield ``(image_record, annotations)`` pairs for every parsed image.

	        The image record carries ``id``, ``filename``, ``rel_path``, ``width``,
	        ``height``, ``split`` and ``ann_count``; annotations are passed through
	        unchanged from ``YOLOParser.iter_dataset``.
	        """
	        class_map = self.get_class_names(dataset_path)
	        for rel_path, image_id, split, anns in YOLOParser.iter_dataset(dataset_path, dataset_id, class_map):
	            width, height = _img_dimensions(dataset_path / rel_path)
	            img_rec = {
	                "id": image_id,
	                "filename": Path(rel_path).name,
	                "rel_path": str(rel_path),
	                "width": width,
	                "height": height,
	                "split": split,
	                "ann_count": len(anns),
	            }
	            yield img_rec, anns

	class COCOAdapter(DatasetAdapter):
	    """Adapter for COCO-style JSON annotation files."""

	    def detect(self, dataset_path: Path) -> bool:
	        """Return True if any ``*.json`` file mentions both COCO top-level keys.

	        Only the first 2048 characters of each file are inspected. Files that
	        cannot be opened are skipped instead of aborting detection.
	        """
	        for json_file in dataset_path.rglob("*.json"):
	            try:
	                # Read just the prefix; annotation files can be very large.
	                with json_file.open(encoding="utf-8", errors="replace") as fh:
	                    snippet = fh.read(2048)
	            except OSError:
	                continue
	            # NOTE(review): if "annotations" sits past the first 2048
	            # characters of a real COCO file this probe misses it — confirm
	            # whether the window is intentionally this small.
	            if '"images"' in snippet and '"annotations"' in snippet:
	                return True
	        return False

	    def get_task(self, dataset_path: Path) -> DatasetTask:
	        """Return the task type for COCO datasets (always segmentation)."""
	        return DatasetTask.segmentation  # Roboflow COCO often implies segmentation

	    def get_class_names(self, dataset_path: Path) -> List[str]:
	        """Return class names from all annotation files, de-duplicated.

	        Names keep first-seen order across files.
	        """
	        # A dict keeps insertion order and de-duplicates in one pass, without
	        # rebuilding the list for every annotation file.
	        ordered: Dict[str, None] = {}
	        for ann_file in COCOParser.find_annotation_files(dataset_path):
	            classes, _ = COCOParser.parse_file(ann_file, "dummy")
	            ordered.update(dict.fromkeys(classes))
	        return list(ordered)

	    def iter_items(self, dataset_id: str, dataset_path: Path) -> Iterator[Tuple[Dict[str, Any], List[Dict[str, Any]]]]:
	        """Yield ``(image_record, annotations)`` pairs from every annotation file.

	        The image record carries ``id``, ``filename``, ``rel_path``, ``width``,
	        ``height``, ``split`` and ``ann_count``.
	        """
	        for ann_file in COCOParser.find_annotation_files(dataset_path):
	            _, coco_results = COCOParser.parse_file(ann_file, dataset_id)
	            for rel_path, image_id, split, anns in coco_results:
	                width, height = _img_dimensions(dataset_path / rel_path)
	                img_rec = {
	                    "id": image_id,
	                    "filename": Path(rel_path).name,
	                    "rel_path": str(rel_path),
	                    "width": width,
	                    "height": height,
	                    "split": split,
	                    "ann_count": len(anns),
	                }
	                yield img_rec, anns

	class VOCAdapter(DatasetAdapter):
	    """Adapter for Pascal-VOC-style XML annotation files."""

	    def detect(self, dataset_path: Path) -> bool:
	        """Return True if any ``*.xml`` file contains ``<annotation>`` near its start.

	        Only the first 512 characters of each file are inspected. Files that
	        cannot be opened are skipped instead of aborting detection.
	        """
	        for xml_file in dataset_path.rglob("*.xml"):
	            try:
	                # Read just the prefix instead of loading the whole document.
	                with xml_file.open(encoding="utf-8", errors="replace") as fh:
	                    snippet = fh.read(512)
	            except OSError:
	                continue
	            if "<annotation>" in snippet:
	                return True
	        return False

	    def get_task(self, dataset_path: Path) -> DatasetTask:
	        """Return the task type for VOC datasets (always detection)."""
	        return DatasetTask.detection

	    def get_class_names(self, dataset_path: Path) -> List[str]:
	        """Return the sorted set of annotation labels found in the dataset.

	        This walks the full dataset through ``VOCParser.iter_dataset`` using a
	        placeholder dataset id.
	        """
	        labels = {
	            ann["label"]
	            for _, _, _, _, _, anns in VOCParser.iter_dataset(dataset_path, "dummy")
	            for ann in anns
	        }
	        return sorted(labels)

	    def iter_items(self, dataset_id: str, dataset_path: Path) -> Iterator[Tuple[Dict[str, Any], List[Dict[str, Any]]]]:
	        """Yield ``(image_record, annotations)`` pairs for every parsed image.

	        Width and height come straight from ``VOCParser.iter_dataset``, so the
	        image file itself is not opened here.
	        """
	        for rel_path, image_id, split, width, height, anns in VOCParser.iter_dataset(dataset_path, dataset_id):
	            img_rec = {
	                "id": image_id,
	                "filename": Path(rel_path).name,
	                "rel_path": str(rel_path),
	                "width": width,
	                "height": height,
	                "split": split,
	                "ann_count": len(anns),
	            }
	            yield img_rec, anns

	class CreateMLAdapter(DatasetAdapter):
	    """Adapter for CreateML-style JSON: a top-level list of image entries.

	    Each entry is expected to have an ``"image"`` path and an
	    ``"annotations"`` list of ``{"label", "coordinates"}`` objects.
	    """

	    def detect(self, dataset_path: Path) -> bool:
	        """Return True if any ``*.json`` file looks like a CreateML manifest.

	        Only the first 1024 characters of each file are inspected. Files that
	        cannot be opened are skipped instead of aborting detection.
	        """
	        for json_file in dataset_path.rglob("*.json"):
	            try:
	                # Read just the prefix instead of loading the whole file.
	                with json_file.open(encoding="utf-8", errors="replace") as fh:
	                    snippet = fh.read(1024)
	            except OSError:
	                continue
	            if '"image"' in snippet and '"annotations"' in snippet and "[" in snippet:
	                return True
	        return False

	    def get_task(self, dataset_path: Path) -> DatasetTask:
	        """Return the task type for CreateML datasets (always detection)."""
	        return DatasetTask.detection

	    def get_class_names(self, dataset_path: Path) -> List[str]:
	        """Return the sorted set of labels found across all JSON manifests.

	        Unreadable or malformed files and non-dict entries are skipped.
	        """
	        labels = set()
	        for json_file in dataset_path.rglob("*.json"):
	            for item in self._load_entries(json_file):
	                for ann in self._annotations_of(item):
	                    if "label" in ann:
	                        labels.add(ann["label"])
	        return sorted(labels)

	    def iter_items(self, dataset_id: str, dataset_path: Path) -> Iterator[Tuple[Dict[str, Any], List[Dict[str, Any]]]]:
	        """Yield ``(image_record, annotations)`` for every entry whose image exists.

	        Boxes are converted from centre-based pixel coordinates to top-left
	        coordinates normalised by the image size. A malformed entry or
	        annotation is skipped on its own; it no longer discards the rest of
	        its file.
	        """
	        # Function-scope import: the module never imports uuid at top level,
	        # so the id generation below previously failed with a NameError that
	        # the old blanket ``except`` hid.
	        import uuid
	        from .annotation_parser import _make_ann

	        for json_file in dataset_path.rglob("*.json"):
	            entries = self._load_entries(json_file)
	            if not entries:
	                continue
	            split = self._split_for(json_file)

	            for item in entries:
	                img_path = self._resolve_image(item, json_file, dataset_path)
	                if img_path is None:
	                    continue
	                try:
	                    rel_path = str(img_path.relative_to(dataset_path))
	                except ValueError:
	                    # Image resolved outside the dataset root; no usable rel_path.
	                    continue

	                image_id = f"img-{uuid.uuid4().hex[:12]}"
	                width, height = _img_dimensions(img_path)

	                anns = []
	                # Boxes cannot be normalised without positive image dimensions.
	                if width > 0 and height > 0:
	                    for ann in self._annotations_of(item):
	                        box = self._normalized_box(ann.get("coordinates"), width, height)
	                        if box is not None:
	                            anns.append(_make_ann(image_id, dataset_id, ann.get("label", "unknown"), box))

	                img_rec = {
	                    "id": image_id,
	                    "filename": img_path.name,
	                    "rel_path": rel_path,
	                    "width": width,
	                    "height": height,
	                    "split": split,
	                    "ann_count": len(anns),
	                }
	                # No try/except surrounds this yield, so closing the generator
	                # early is never intercepted.
	                yield img_rec, anns

	    @staticmethod
	    def _load_entries(json_file: Path) -> List[Dict[str, Any]]:
	        """Return the dict entries of a top-level JSON list, or [] if unusable."""
	        try:
	            data = json.loads(json_file.read_text(encoding="utf-8"))
	        except (OSError, ValueError):
	            # ValueError covers both JSON and UTF-8 decode errors.
	            return []
	        if not isinstance(data, list):
	            return []
	        return [entry for entry in data if isinstance(entry, dict)]

	    @staticmethod
	    def _annotations_of(item: Dict[str, Any]) -> List[Dict[str, Any]]:
	        """Return the dict-shaped annotations of one manifest entry."""
	        raw = item.get("annotations", [])
	        if not isinstance(raw, list):
	            return []
	        return [ann for ann in raw if isinstance(ann, dict)]

	    @staticmethod
	    def _split_for(json_file: Path) -> str:
	        """Derive the split name from the directory names in *json_file*'s path."""
	        parts = json_file.parts
	        if "val" in parts or "valid" in parts:
	            return "val"
	        if "test" in parts:
	            return "test"
	        return "train"

	    @staticmethod
	    def _resolve_image(item: Dict[str, Any], json_file: Path, dataset_path: Path):
	        """Return the existing image path for *item*, or None if it cannot be found."""
	        rel_img_path = item.get("image")
	        if not rel_img_path or not isinstance(rel_img_path, str):
	            return None
	        # Try next to the manifest first, then relative to the dataset root.
	        for base in (json_file.parent, dataset_path):
	            candidate = base / rel_img_path
	            if candidate.exists():
	                return candidate
	        return None

	    @staticmethod
	    def _normalized_box(coord: Any, width: float, height: float):
	        """Convert a centre-based pixel box to normalised top-left ``(x, y, w, h)``.

	        Returns None when *coord* is not a dict with numeric ``x``, ``y``,
	        ``width`` and ``height`` values.
	        """
	        if not isinstance(coord, dict):
	            return None
	        try:
	            cx, cy, bw, bh = (float(coord[key]) for key in ("x", "y", "width", "height"))
	        except (KeyError, TypeError, ValueError):
	            return None
	        return ((cx - bw / 2) / width, (cy - bh / 2) / height, bw / width, bh / height)

	class NLPAdapter(DatasetAdapter):
	    """Adapter for text datasets stored as CSV/TSV files (item parsing is a stub)."""

	    def detect(self, dataset_path: Path) -> bool:
	        """Return True if any ``.csv`` or ``.tsv`` file exists under *dataset_path*."""
	        # The patterns need the "*" wildcard: a bare ".csv" only matches a
	        # file literally named ".csv", so real datasets were never detected.
	        return any(dataset_path.rglob("*.csv")) or any(dataset_path.rglob("*.tsv"))

	    def get_task(self, dataset_path: Path) -> DatasetTask:
	        """Return the task type for NLP datasets."""
	        return DatasetTask.nlp

	    def get_class_names(self, dataset_path: Path) -> List[str]:
	        """Return class names; not implemented yet, so always an empty list."""
	        # Implementation for NLP class names
	        return []

	    def iter_items(self, dataset_id: str, dataset_path: Path) -> Iterator[Tuple[Dict[str, Any], List[Dict[str, Any]]]]:
	        """Yield a single empty ``({}, [])`` placeholder pair.

	        NOTE(review): the placeholder record lacks the ``id``/``rel_path``
	        keys the image adapters emit — confirm consumers tolerate it.
	        """
	        # Implementation for NLP items
	        yield {}, []

	class TabularAdapter(DatasetAdapter):
	    """Placeholder adapter for tabular datasets; it never claims a dataset."""

	    def detect(self, dataset_path: Path) -> bool:
	        """Always report that *dataset_path* is not a tabular dataset."""
	        return False  # Placeholder

	    def get_task(self, dataset_path: Path) -> DatasetTask:
	        """Return the task type associated with tabular data (classification)."""
	        return DatasetTask.classification

	    def get_class_names(self, dataset_path: Path) -> List[str]:
	        """Return an empty class list (nothing is parsed yet)."""
	        no_classes: List[str] = []
	        return no_classes

	    def iter_items(self, dataset_id: str, dataset_path: Path) -> Iterator[Tuple[Dict[str, Any], List[Dict[str, Any]]]]:
	        """Yield one empty ``({}, [])`` placeholder pair."""
	        empty_record: Dict[str, Any] = {}
	        empty_annotations: List[Dict[str, Any]] = []
	        yield empty_record, empty_annotations

	class RoboflowClassificationAdapter(DatasetAdapter):
	    """Adapter for Roboflow classification exports.

	    Recognises either an ``_annotations.txt`` file or split folders whose
	    sub-directories are not the ``images``/``labels`` pair.
	    """

	    def detect(self, dataset_path: Path) -> bool:
	        """Return True when the layout matches a Roboflow classification export."""
	        # Check for _annotations.txt or folder-based classification
	        if any(dataset_path.rglob("_annotations.txt")):
	            return True

	        reserved = ("images", "labels")
	        for split_name in ("train", "valid", "test"):
	            split_dir = dataset_path / split_name
	            if not (split_dir.exists() and split_dir.is_dir()):
	                continue
	            child_dirs = [entry for entry in split_dir.iterdir() if entry.is_dir()]
	            if not child_dirs:
	                continue
	            # A split with sub-folders, none named images/labels, qualifies.
	            if all(entry.name.lower() not in reserved for entry in child_dirs):
	                return True
	        return False

	    def get_task(self, dataset_path: Path) -> DatasetTask:
	        """Return the task type for this format (always classification)."""
	        return DatasetTask.classification

	    def get_class_names(self, dataset_path: Path) -> List[str]:
	        """Return the sorted set of labels yielded by ``RoboflowTXTParser``."""
	        labels = {
	            ann["label"]
	            for _, _, _, anns in RoboflowTXTParser.iter_dataset(dataset_path, "dummy")
	            for ann in anns
	        }
	        return sorted(labels)

	    def iter_items(self, dataset_id: str, dataset_path: Path) -> Iterator[Tuple[Dict[str, Any], List[Dict[str, Any]]]]:
	        """Yield ``(image_record, annotations)`` pairs for every parsed image."""
	        parsed = RoboflowTXTParser.iter_dataset(dataset_path, dataset_id)
	        for rel_path, image_id, split, anns in parsed:
	            width, height = _img_dimensions(dataset_path / rel_path)
	            img_rec = {
	                "id": image_id,
	                "filename": Path(rel_path).name,
	                "rel_path": str(rel_path),
	                "width": width,
	                "height": height,
	                "split": split,
	                "ann_count": len(anns),
	            }
	            yield img_rec, anns