Spaces:
Running
Running
| """ | |
| download_data.py | |
| Arac hasar tespiti MVP icin veri seti indirme orchestratoru. | |
| Kullanim: | |
| python scripts/download_data.py --help | |
| python scripts/download_data.py --cardd-hf | |
| python scripts/download_data.py --carparts-ultra | |
| python scripts/download_data.py --roboflow-severity | |
| python scripts/download_data.py --cardd-manual C:/Downloads/CarDD_release.zip | |
| python scripts/download_data.py --all | |
| python scripts/download_data.py --all --dry-run | |
| Notlar: | |
| - Tum cikti yollari `services/ml/data/` altinda toplanir; mevcut | |
| `services/ml/prepare_data.py` ve `prepare_parts_data.py` ile uyumludur. | |
| - CarDD ana set form basvurusu gerektirir (https://cardd-ustc.github.io). | |
| HF mirror (`harpreetsahota/CarDD`) form gerektirmez ama lisansi ayni | |
| (academic non-commercial). Ticari kullanim icin yazarlardan izin gerekir. | |
| - Roboflow setleri icin ROBOFLOW_API_KEY environment variable veya .env | |
| dosyasinda tanimli olmali. | |
| """ | |
| from __future__ import annotations | |
| import argparse | |
| import hashlib | |
| import logging | |
| import os | |
| import shutil | |
| import sys | |
| import time | |
| import zipfile | |
| from dataclasses import dataclass | |
| from pathlib import Path | |
| from typing import Iterable, Optional | |
| # Proje koku (scripts/ klasorunun bir ust seviyesi) | |
| PROJECT_ROOT = Path(__file__).resolve().parent.parent | |
| ML_ROOT = PROJECT_ROOT / "services" / "ml" | |
| DATA_ROOT = ML_ROOT / "data" | |
| LOG_DIR = PROJECT_ROOT / "scripts" / ".logs" | |
| # Hedef alt klasorler (prepare_data.py / prepare_parts_data.py ile uyumlu) | |
| CARDD_HF_DIR = DATA_ROOT / "cardd_hf" # HuggingFace mirror | |
| CARDD_RELEASE_DIR = DATA_ROOT / "CarDD_release" # Manuel/form ana set | |
| CARDD_YOLO_DIR = DATA_ROOT / "cardd_yolo" # prepare_data.py ciktisi | |
| PARTS_YOLO_DIR = DATA_ROOT / "parts_yolo" # prepare_parts_data.py ciktisi | |
| SEVERITY_ROBOFLOW_DIR = DATA_ROOT / "severity_roboflow" | |
| # Disk gereksinim tahminleri (GB) | |
| DISK_ESTIMATES_GB = { | |
| "cardd_hf": 6.5, | |
| "cardd_release": 6.5, | |
| "carparts_ultra": 1.2, | |
| "severity_roboflow": 0.5, | |
| "yolo_outputs": 7.0, # prepare_data.py kopyalari da dahil | |
| } | |
| # ----------------------------------------------------------------------------- | |
| # Logging | |
| # ----------------------------------------------------------------------------- | |
| def setup_logger(name: str) -> logging.Logger: | |
| LOG_DIR.mkdir(parents=True, exist_ok=True) | |
| log_file = LOG_DIR / f"{name}_{time.strftime('%Y%m%d_%H%M%S')}.log" | |
| logger = logging.getLogger(name) | |
| logger.setLevel(logging.INFO) | |
| logger.handlers.clear() | |
| fmt = logging.Formatter("[%(asctime)s] %(levelname)s %(message)s", | |
| datefmt="%H:%M:%S") | |
| fh = logging.FileHandler(log_file, encoding="utf-8") | |
| fh.setFormatter(fmt) | |
| logger.addHandler(fh) | |
| sh = logging.StreamHandler(sys.stdout) | |
| sh.setFormatter(fmt) | |
| logger.addHandler(sh) | |
| logger.info("Log dosyasi: %s", log_file) | |
| return logger | |
| # ----------------------------------------------------------------------------- | |
| # Yardimcilar | |
| # ----------------------------------------------------------------------------- | |
| class DownloadPlan: | |
| name: str | |
| target: Path | |
| est_gb: float | |
| requires_auth: bool | |
| requires_manual: bool | |
| notes: str | |
| def free_disk_gb(path: Path) -> float: | |
| """Belirtilen path icin bos disk alanini GB cinsinden dondurur.""" | |
| try: | |
| path.mkdir(parents=True, exist_ok=True) | |
| usage = shutil.disk_usage(str(path)) | |
| return usage.free / (1024 ** 3) | |
| except Exception: | |
| return -1.0 | |
| def sha256_file(path: Path, chunk: int = 1024 * 1024) -> str: | |
| h = hashlib.sha256() | |
| with open(path, "rb") as f: | |
| for buf in iter(lambda: f.read(chunk), b""): | |
| h.update(buf) | |
| return h.hexdigest() | |
| def write_hash_sidecar(path: Path, logger: logging.Logger) -> str: | |
| """Bir dosyanin SHA256'sini hesaplayip yanina .sha256 olarak yazar.""" | |
| digest = sha256_file(path) | |
| sidecar = path.with_suffix(path.suffix + ".sha256") | |
| sidecar.write_text(f"{digest} {path.name}\n", encoding="utf-8") | |
| logger.info("SHA256 %s = %s", path.name, digest) | |
| return digest | |
| def verify_hash(path: Path, expected: Optional[str], logger: logging.Logger) -> bool: | |
| if not expected: | |
| return True | |
| actual = sha256_file(path) | |
| ok = actual.lower() == expected.lower() | |
| if ok: | |
| logger.info("Hash dogrulandi: %s", path.name) | |
| else: | |
| logger.error("Hash UYUSMUYOR: %s\n beklenen: %s\n bulunan : %s", | |
| path.name, expected, actual) | |
| return ok | |
| def load_dotenv_if_present() -> None: | |
| """Basit .env loader (python-dotenv olmadan).""" | |
| env_path = PROJECT_ROOT / ".env" | |
| if not env_path.exists(): | |
| return | |
| for line in env_path.read_text(encoding="utf-8").splitlines(): | |
| line = line.strip() | |
| if not line or line.startswith("#") or "=" not in line: | |
| continue | |
| key, _, val = line.partition("=") | |
| key = key.strip() | |
| val = val.strip().strip('"').strip("'") | |
| os.environ.setdefault(key, val) | |
| def confirm_disk(target: Path, need_gb: float, logger: logging.Logger, | |
| dry_run: bool) -> bool: | |
| free = free_disk_gb(target) | |
| logger.info("Hedef: %s | Gereken ~%.1f GB | Bos: %.1f GB", target, need_gb, free) | |
| if free < 0: | |
| logger.warning("Disk alani okunamadi, devam ediliyor.") | |
| return True | |
| if free < need_gb * 1.2: | |
| logger.error("Yetersiz disk alani (%.1f < %.1f * 1.2).", free, need_gb) | |
| return False | |
| if dry_run: | |
| logger.info("[DRY-RUN] %s indirilecek (~%.1f GB)", target, need_gb) | |
| return True | |
| # ----------------------------------------------------------------------------- | |
| # 1) CarDD - HuggingFace mirror | |
| # ----------------------------------------------------------------------------- | |
| def download_cardd_hf(logger: logging.Logger, dry_run: bool = False) -> Optional[Path]: | |
| """harpreetsahota/CarDD setini HF Hub'dan indirir. | |
| Strateji: | |
| a) huggingface_hub.snapshot_download (resume destekli, yeniden calistirilabilir) | |
| b) datasets.load_dataset fallback | |
| """ | |
| target = CARDD_HF_DIR | |
| if not confirm_disk(target, DISK_ESTIMATES_GB["cardd_hf"], logger, dry_run): | |
| return None | |
| if dry_run: | |
| logger.info("[DRY-RUN] HF repo: harpreetsahota/CarDD -> %s", target) | |
| return target | |
| target.mkdir(parents=True, exist_ok=True) | |
| repo_id = "harpreetsahota/CarDD" | |
| try: | |
| from huggingface_hub import snapshot_download | |
| except ImportError: | |
| logger.error("huggingface_hub yok. Kur: pip install -r scripts/requirements.txt") | |
| return None | |
| logger.info("HF snapshot baslat: %s -> %s", repo_id, target) | |
| try: | |
| path = snapshot_download( | |
| repo_id=repo_id, | |
| repo_type="dataset", | |
| local_dir=str(target), | |
| local_dir_use_symlinks=False, # Windows uyumu icin kopya | |
| resume_download=True, | |
| ) | |
| logger.info("Tamamlandi: %s", path) | |
| return Path(path) | |
| except Exception as e: | |
| logger.warning("snapshot_download basarisiz: %s", e) | |
| logger.info("Fallback: datasets.load_dataset deneniyor...") | |
| try: | |
| from datasets import load_dataset | |
| ds = load_dataset(repo_id, cache_dir=str(target / ".hf_cache")) | |
| logger.info("datasets ile yuklendi: %s", list(ds.keys())) | |
| return target | |
| except Exception as e2: | |
| logger.error("HF indirme basarisiz: %s", e2) | |
| logger.error("Manuel: https://huggingface.co/datasets/%s", repo_id) | |
| return None | |
| # ----------------------------------------------------------------------------- | |
| # 2) Ultralytics CarParts-Seg | |
| # ----------------------------------------------------------------------------- | |
| def trigger_carparts_ultra(logger: logging.Logger, dry_run: bool = False) -> Optional[Path]: | |
| """Ultralytics CarParts-Seg veri setini ON-CACHE eder. | |
| Ultralytics datasets klasorune kendi indirir. Biz sadece pre-fetch tetikliyoruz. | |
| Kullanici sonrasinda: | |
| python services/ml/prepare_parts_data.py --use_ultralytics \\ | |
| --output_dir services/ml/data/parts_yolo | |
| """ | |
| if dry_run: | |
| logger.info("[DRY-RUN] Ultralytics 'carparts-seg.yaml' auto-download tetiklenecek") | |
| logger.info("[DRY-RUN] Tahmini boyut: ~%.1f GB", DISK_ESTIMATES_GB["carparts_ultra"]) | |
| return None | |
| try: | |
| from ultralytics import YOLO | |
| from ultralytics.utils import SETTINGS | |
| except ImportError: | |
| logger.error("ultralytics yok. Kur: pip install ultralytics") | |
| return None | |
| datasets_dir = Path(SETTINGS.get("datasets_dir", ".")) | |
| target = datasets_dir / "carparts-seg" | |
| logger.info("Ultralytics datasets_dir: %s", datasets_dir) | |
| logger.info("Hedef: %s", target) | |
| if not confirm_disk(datasets_dir, DISK_ESTIMATES_GB["carparts_ultra"], logger, False): | |
| return None | |
| if target.exists() and any(target.iterdir()): | |
| logger.info("Mevcut, atlandi: %s", target) | |
| else: | |
| logger.info("CarParts-Seg indirme tetikleniyor (CPU val on minimal config)...") | |
| try: | |
| m = YOLO("yolo11n-seg.pt") # backbone auto-iner; Ultralytics dataset ZIP indirir | |
| try: | |
| m.val(data="carparts-seg.yaml", batch=1, device="cpu", | |
| workers=0, verbose=False) | |
| except Exception as ve: | |
| # Indirme tetiklemek icin val'a guveniyoruz; val hatasi onemsiz | |
| logger.info("val tetikleyici tamamlandi (hata bekleniyordu): %s", | |
| type(ve).__name__) | |
| except Exception as e: | |
| logger.error("Ultralytics indirme basarisiz: %s", e) | |
| logger.error("Manuel: https://docs.ultralytics.com/datasets/segment/carparts-seg/") | |
| return None | |
| logger.info("\nSonraki adim:\n python services/ml/prepare_parts_data.py " | |
| "--use_ultralytics --output_dir %s", PARTS_YOLO_DIR) | |
| return target | |
| # ----------------------------------------------------------------------------- | |
| # 3) Roboflow severity dataset | |
| # ----------------------------------------------------------------------------- | |
| def download_roboflow_severity(logger: logging.Logger, | |
| workspace: str = "car-damage-detection-cardd", | |
| project: str = "car-damage-severity", | |
| version: int = 1, | |
| fmt: str = "yolov8", | |
| dry_run: bool = False) -> Optional[Path]: | |
| """Roboflow Universe'den severity (minor/moderate/severe) seti indirir. | |
| NOT: workspace/project/version isimleri Roboflow Universe arama ile | |
| DOGRULANMALIDIR. Kullanici kendi se?tigi seti vermeli. | |
| """ | |
| target = SEVERITY_ROBOFLOW_DIR | |
| if not confirm_disk(target, DISK_ESTIMATES_GB["severity_roboflow"], logger, dry_run): | |
| return None | |
| load_dotenv_if_present() | |
| api_key = os.environ.get("ROBOFLOW_API_KEY") | |
| if dry_run: | |
| logger.info("[DRY-RUN] Roboflow: %s/%s v%d -> %s", | |
| workspace, project, version, target) | |
| logger.info("[DRY-RUN] API key durumu: %s", | |
| "MEVCUT" if api_key else "EKSIK") | |
| return target | |
| if not api_key: | |
| logger.error("ROBOFLOW_API_KEY tanimli degil.") | |
| logger.error("Cozum:") | |
| logger.error(" 1) https://app.roboflow.com/settings/api adresinden key al") | |
| logger.error(" 2) %s\\.env dosyasina ekle:", PROJECT_ROOT) | |
| logger.error(" ROBOFLOW_API_KEY=xxxxxxxxxxxxxxxx") | |
| logger.error(" 3) Veya manuel indir:") | |
| logger.error(" https://universe.roboflow.com -> 'car damage severity' arat") | |
| logger.error(" -> Download Dataset -> YOLOv8 format -> %s altina ac", target) | |
| return None | |
| try: | |
| from roboflow import Roboflow | |
| except ImportError: | |
| logger.error("roboflow yok. Kur: pip install -r scripts/requirements.txt") | |
| return None | |
| target.mkdir(parents=True, exist_ok=True) | |
| logger.info("Roboflow indirme: %s/%s v%d (format=%s)", | |
| workspace, project, version, fmt) | |
| try: | |
| rf = Roboflow(api_key=api_key) | |
| proj = rf.workspace(workspace).project(project) | |
| ver = proj.version(version) | |
| # Roboflow paketi indirme yapip path dondurur | |
| ds = ver.download(fmt, location=str(target)) | |
| logger.info("Indirildi: %s", ds.location) | |
| return Path(ds.location) | |
| except Exception as e: | |
| logger.error("Roboflow indirme basarisiz: %s", e) | |
| logger.error("Manuel alternatif: Roboflow Universe arayuzunden ZIP indir,") | |
| logger.error(" %s altina cikart, prepare benzeri scripte yonlendir.", target) | |
| return None | |
| # ----------------------------------------------------------------------------- | |
| # 4) CarDD manuel (form sonrasi ZIP) | |
| # ----------------------------------------------------------------------------- | |
| def install_cardd_manual(zip_or_dir: Path, logger: logging.Logger, | |
| dry_run: bool = False) -> Optional[Path]: | |
| """Kullanicinin form sonrasi indirdigi CarDD_release ZIP/klasorunu yerlestirir. | |
| Kabul edilen kaynaklar: | |
| - .zip dosyasi (icinde CarDD_release/ veya CarDD_COCO/) | |
| - Acilmis dizin (CarDD_release/ veya direkt CarDD_COCO/) | |
| """ | |
| src = Path(zip_or_dir).expanduser().resolve() | |
| target = CARDD_RELEASE_DIR | |
| target.mkdir(parents=True, exist_ok=True) | |
| if not src.exists(): | |
| logger.error("Kaynak bulunamadi: %s", src) | |
| return None | |
| if not confirm_disk(target, DISK_ESTIMATES_GB["cardd_release"], logger, dry_run): | |
| return None | |
| if dry_run: | |
| logger.info("[DRY-RUN] %s -> %s", src, target) | |
| return target | |
| if src.is_file() and src.suffix.lower() == ".zip": | |
| logger.info("ZIP cikartiliyor: %s -> %s", src, target) | |
| try: | |
| write_hash_sidecar(src, logger) | |
| except Exception as e: | |
| logger.warning("Hash hesaplanamadi: %s", e) | |
| with zipfile.ZipFile(src, "r") as zf: | |
| members = zf.namelist() | |
| logger.info("Icerik: %d dosya", len(members)) | |
| zf.extractall(target) | |
| elif src.is_dir(): | |
| logger.info("Dizin kopyalaniyor: %s -> %s", src, target) | |
| # Tum icerigi merge et | |
| for item in src.iterdir(): | |
| dst = target / item.name | |
| if dst.exists(): | |
| logger.info(" atlandi (var): %s", dst.name) | |
| continue | |
| if item.is_dir(): | |
| shutil.copytree(item, dst) | |
| else: | |
| shutil.copy2(item, dst) | |
| else: | |
| logger.error("Desteklenmeyen kaynak: %s", src) | |
| return None | |
| # CarDD_COCO klasorunu bul (iki seviye derinlik kontrol) | |
| candidates = list(target.glob("**/CarDD_COCO")) | |
| cardd_coco = candidates[0] if candidates else target / "CarDD_COCO" | |
| logger.info("CarDD_COCO yolu: %s (var: %s)", cardd_coco, cardd_coco.exists()) | |
| logger.info("\nSonraki adim:\n python services/ml/prepare_data.py " | |
| "\\\n --cardd_root %s \\\n --output_dir %s", | |
| cardd_coco, CARDD_YOLO_DIR) | |
| return cardd_coco | |
| # ----------------------------------------------------------------------------- | |
| # Planning / dry-run raporu | |
| # ----------------------------------------------------------------------------- | |
| def build_plans(args: argparse.Namespace) -> Iterable[DownloadPlan]: | |
| plans = [] | |
| if args.cardd_hf or args.all: | |
| plans.append(DownloadPlan( | |
| "CarDD (HuggingFace mirror)", | |
| CARDD_HF_DIR, | |
| DISK_ESTIMATES_GB["cardd_hf"], | |
| requires_auth=False, | |
| requires_manual=False, | |
| notes="Pretrained-friendly, otomatik resume.", | |
| )) | |
| if args.carparts_ultra or args.all: | |
| plans.append(DownloadPlan( | |
| "Ultralytics CarParts-Seg", | |
| Path("<ultralytics datasets_dir>/carparts-seg"), | |
| DISK_ESTIMATES_GB["carparts_ultra"], | |
| requires_auth=False, | |
| requires_manual=False, | |
| notes="Ultralytics auto-download tetiklenir.", | |
| )) | |
| if args.roboflow_severity or args.all: | |
| plans.append(DownloadPlan( | |
| "Roboflow severity", | |
| SEVERITY_ROBOFLOW_DIR, | |
| DISK_ESTIMATES_GB["severity_roboflow"], | |
| requires_auth=True, | |
| requires_manual=False, | |
| notes="ROBOFLOW_API_KEY gerekli (.env).", | |
| )) | |
| if args.cardd_manual: | |
| plans.append(DownloadPlan( | |
| "CarDD (manuel form sonrasi)", | |
| CARDD_RELEASE_DIR, | |
| DISK_ESTIMATES_GB["cardd_release"], | |
| requires_auth=False, | |
| requires_manual=True, | |
| notes="https://cardd-ustc.github.io form basvurusu sonrasi.", | |
| )) | |
| elif args.all: | |
| plans.append(DownloadPlan( | |
| "CarDD (manuel form sonrasi)", | |
| CARDD_RELEASE_DIR, | |
| DISK_ESTIMATES_GB["cardd_release"], | |
| requires_auth=False, | |
| requires_manual=True, | |
| notes="--all otomatik indirmez; form bekliyor.", | |
| )) | |
| return plans | |
| def print_plan_table(plans: Iterable[DownloadPlan], logger: logging.Logger) -> None: | |
| logger.info("=" * 78) | |
| logger.info("INDIRME PLANI") | |
| logger.info("=" * 78) | |
| total = 0.0 | |
| for p in plans: | |
| flags = [] | |
| if p.requires_auth: | |
| flags.append("AUTH") | |
| if p.requires_manual: | |
| flags.append("MANUEL") | |
| flag_str = ",".join(flags) if flags else "-" | |
| logger.info("- %-35s ~%5.1f GB [%s]", p.name, p.est_gb, flag_str) | |
| logger.info(" hedef: %s", p.target) | |
| logger.info(" not : %s", p.notes) | |
| total += p.est_gb | |
| logger.info("-" * 78) | |
| logger.info("Toplam tahmini: ~%.1f GB", total) | |
| logger.info("=" * 78) | |
| # ----------------------------------------------------------------------------- | |
| # CLI | |
| # ----------------------------------------------------------------------------- | |
| def build_parser() -> argparse.ArgumentParser: | |
| p = argparse.ArgumentParser( | |
| description=( | |
| "Arac hasar tespiti MVP veri seti indirici. " | |
| "Tum ciktilar services/ml/data/ altina kaydedilir." | |
| ), | |
| formatter_class=argparse.RawDescriptionHelpFormatter, | |
| epilog=__doc__, | |
| ) | |
| p.add_argument("--cardd-hf", action="store_true", | |
| help="HuggingFace harpreetsahota/CarDD setini indir") | |
| p.add_argument("--carparts-ultra", action="store_true", | |
| help="Ultralytics CarParts-Seg setini pre-fetch et") | |
| p.add_argument("--roboflow-severity", action="store_true", | |
| help="Roboflow severity (minor/moderate/severe) setini indir") | |
| p.add_argument("--cardd-manual", type=str, default=None, metavar="PATH", | |
| help="Form sonrasi indirilen CarDD_release ZIP/klasor yolu") | |
| p.add_argument("--all", action="store_true", | |
| help="Mumkun olan setlerin hepsini indir (manuel adimlar haric)") | |
| p.add_argument("--rf-workspace", default="car-damage-detection-cardd", | |
| help="Roboflow workspace slug (DOGRULA)") | |
| p.add_argument("--rf-project", default="car-damage-severity", | |
| help="Roboflow project slug (DOGRULA)") | |
| p.add_argument("--rf-version", type=int, default=1, | |
| help="Roboflow version no") | |
| p.add_argument("--rf-format", default="yolov8", | |
| help="Roboflow export format (yolov8/yolov11/coco)") | |
| p.add_argument("--dry-run", action="store_true", | |
| help="Sadece plan/disk raporu; indirme yapma") | |
| return p | |
| def main(argv: Optional[list] = None) -> int: | |
| parser = build_parser() | |
| args = parser.parse_args(argv) | |
| if not any([args.cardd_hf, args.carparts_ultra, args.roboflow_severity, | |
| args.cardd_manual, args.all]): | |
| parser.print_help() | |
| print("\nHIC SUBCOMMAND VERILMEDI. Ornek: --all --dry-run") | |
| return 2 | |
| logger = setup_logger("download_data") | |
| logger.info("Proje koku: %s", PROJECT_ROOT) | |
| logger.info("Veri koku: %s", DATA_ROOT) | |
| DATA_ROOT.mkdir(parents=True, exist_ok=True) | |
| plans = list(build_plans(args)) | |
| print_plan_table(plans, logger) | |
| if args.dry_run: | |
| logger.info("[DRY-RUN] Hicbir indirme yapilmadi.") | |
| return 0 | |
| rc = 0 | |
| if args.cardd_hf or args.all: | |
| if download_cardd_hf(logger) is None: | |
| rc = max(rc, 1) | |
| if args.carparts_ultra or args.all: | |
| if trigger_carparts_ultra(logger) is None: | |
| rc = max(rc, 1) | |
| if args.roboflow_severity or args.all: | |
| if download_roboflow_severity( | |
| logger, | |
| workspace=args.rf_workspace, | |
| project=args.rf_project, | |
| version=args.rf_version, | |
| fmt=args.rf_format) is None: | |
| rc = max(rc, 1) | |
| if args.cardd_manual: | |
| if install_cardd_manual(Path(args.cardd_manual), logger) is None: | |
| rc = max(rc, 1) | |
| elif args.all: | |
| logger.info("") | |
| logger.info("MANUEL ADIM (CarDD ana set):") | |
| logger.info(" 1) https://cardd-ustc.github.io adresinde forma basvur (1-2 gun).") | |
| logger.info(" 2) ZIP gelince:") | |
| logger.info(" python scripts/download_data.py --cardd-manual <ZIP_YOLU>") | |
| logger.info(" 3) Bu arada HF mirror'i kullanarak pretrained baselineu egit.") | |
| logger.info("Bitti. RC=%d", rc) | |
| return rc | |
| if __name__ == "__main__": | |
| sys.exit(main()) | |