hasari-api / scripts /download_data.py
erdoganpeker's picture
v0.3.0 — multimodal vehicle damage MVP
e327f0d
"""
download_data.py
Arac hasar tespiti MVP icin veri seti indirme orchestratoru.
Kullanim:
python scripts/download_data.py --help
python scripts/download_data.py --cardd-hf
python scripts/download_data.py --carparts-ultra
python scripts/download_data.py --roboflow-severity
python scripts/download_data.py --cardd-manual C:/Downloads/CarDD_release.zip
python scripts/download_data.py --all
python scripts/download_data.py --all --dry-run
Notlar:
- Tum cikti yollari `services/ml/data/` altinda toplanir; mevcut
`services/ml/prepare_data.py` ve `prepare_parts_data.py` ile uyumludur.
- CarDD ana set form basvurusu gerektirir (https://cardd-ustc.github.io).
HF mirror (`harpreetsahota/CarDD`) form gerektirmez ama lisansi ayni
(academic non-commercial). Ticari kullanim icin yazarlardan izin gerekir.
- Roboflow setleri icin ROBOFLOW_API_KEY environment variable veya .env
dosyasinda tanimli olmali.
"""
from __future__ import annotations
import argparse
import hashlib
import logging
import os
import shutil
import sys
import time
import zipfile
from dataclasses import dataclass
from pathlib import Path
from typing import Iterable, Optional
# Proje koku (scripts/ klasorunun bir ust seviyesi)
PROJECT_ROOT = Path(__file__).resolve().parent.parent
ML_ROOT = PROJECT_ROOT / "services" / "ml"
DATA_ROOT = ML_ROOT / "data"
LOG_DIR = PROJECT_ROOT / "scripts" / ".logs"
# Hedef alt klasorler (prepare_data.py / prepare_parts_data.py ile uyumlu)
CARDD_HF_DIR = DATA_ROOT / "cardd_hf" # HuggingFace mirror
CARDD_RELEASE_DIR = DATA_ROOT / "CarDD_release" # Manuel/form ana set
CARDD_YOLO_DIR = DATA_ROOT / "cardd_yolo" # prepare_data.py ciktisi
PARTS_YOLO_DIR = DATA_ROOT / "parts_yolo" # prepare_parts_data.py ciktisi
SEVERITY_ROBOFLOW_DIR = DATA_ROOT / "severity_roboflow"
# Disk gereksinim tahminleri (GB)
DISK_ESTIMATES_GB = {
"cardd_hf": 6.5,
"cardd_release": 6.5,
"carparts_ultra": 1.2,
"severity_roboflow": 0.5,
"yolo_outputs": 7.0, # prepare_data.py kopyalari da dahil
}
# -----------------------------------------------------------------------------
# Logging
# -----------------------------------------------------------------------------
def setup_logger(name: str) -> logging.Logger:
LOG_DIR.mkdir(parents=True, exist_ok=True)
log_file = LOG_DIR / f"{name}_{time.strftime('%Y%m%d_%H%M%S')}.log"
logger = logging.getLogger(name)
logger.setLevel(logging.INFO)
logger.handlers.clear()
fmt = logging.Formatter("[%(asctime)s] %(levelname)s %(message)s",
datefmt="%H:%M:%S")
fh = logging.FileHandler(log_file, encoding="utf-8")
fh.setFormatter(fmt)
logger.addHandler(fh)
sh = logging.StreamHandler(sys.stdout)
sh.setFormatter(fmt)
logger.addHandler(sh)
logger.info("Log dosyasi: %s", log_file)
return logger
# -----------------------------------------------------------------------------
# Yardimcilar
# -----------------------------------------------------------------------------
@dataclass
class DownloadPlan:
name: str
target: Path
est_gb: float
requires_auth: bool
requires_manual: bool
notes: str
def free_disk_gb(path: Path) -> float:
"""Belirtilen path icin bos disk alanini GB cinsinden dondurur."""
try:
path.mkdir(parents=True, exist_ok=True)
usage = shutil.disk_usage(str(path))
return usage.free / (1024 ** 3)
except Exception:
return -1.0
def sha256_file(path: Path, chunk: int = 1024 * 1024) -> str:
h = hashlib.sha256()
with open(path, "rb") as f:
for buf in iter(lambda: f.read(chunk), b""):
h.update(buf)
return h.hexdigest()
def write_hash_sidecar(path: Path, logger: logging.Logger) -> str:
"""Bir dosyanin SHA256'sini hesaplayip yanina .sha256 olarak yazar."""
digest = sha256_file(path)
sidecar = path.with_suffix(path.suffix + ".sha256")
sidecar.write_text(f"{digest} {path.name}\n", encoding="utf-8")
logger.info("SHA256 %s = %s", path.name, digest)
return digest
def verify_hash(path: Path, expected: Optional[str], logger: logging.Logger) -> bool:
if not expected:
return True
actual = sha256_file(path)
ok = actual.lower() == expected.lower()
if ok:
logger.info("Hash dogrulandi: %s", path.name)
else:
logger.error("Hash UYUSMUYOR: %s\n beklenen: %s\n bulunan : %s",
path.name, expected, actual)
return ok
def load_dotenv_if_present() -> None:
"""Basit .env loader (python-dotenv olmadan)."""
env_path = PROJECT_ROOT / ".env"
if not env_path.exists():
return
for line in env_path.read_text(encoding="utf-8").splitlines():
line = line.strip()
if not line or line.startswith("#") or "=" not in line:
continue
key, _, val = line.partition("=")
key = key.strip()
val = val.strip().strip('"').strip("'")
os.environ.setdefault(key, val)
def confirm_disk(target: Path, need_gb: float, logger: logging.Logger,
dry_run: bool) -> bool:
free = free_disk_gb(target)
logger.info("Hedef: %s | Gereken ~%.1f GB | Bos: %.1f GB", target, need_gb, free)
if free < 0:
logger.warning("Disk alani okunamadi, devam ediliyor.")
return True
if free < need_gb * 1.2:
logger.error("Yetersiz disk alani (%.1f < %.1f * 1.2).", free, need_gb)
return False
if dry_run:
logger.info("[DRY-RUN] %s indirilecek (~%.1f GB)", target, need_gb)
return True
# -----------------------------------------------------------------------------
# 1) CarDD - HuggingFace mirror
# -----------------------------------------------------------------------------
def download_cardd_hf(logger: logging.Logger, dry_run: bool = False) -> Optional[Path]:
"""harpreetsahota/CarDD setini HF Hub'dan indirir.
Strateji:
a) huggingface_hub.snapshot_download (resume destekli, yeniden calistirilabilir)
b) datasets.load_dataset fallback
"""
target = CARDD_HF_DIR
if not confirm_disk(target, DISK_ESTIMATES_GB["cardd_hf"], logger, dry_run):
return None
if dry_run:
logger.info("[DRY-RUN] HF repo: harpreetsahota/CarDD -> %s", target)
return target
target.mkdir(parents=True, exist_ok=True)
repo_id = "harpreetsahota/CarDD"
try:
from huggingface_hub import snapshot_download
except ImportError:
logger.error("huggingface_hub yok. Kur: pip install -r scripts/requirements.txt")
return None
logger.info("HF snapshot baslat: %s -> %s", repo_id, target)
try:
path = snapshot_download(
repo_id=repo_id,
repo_type="dataset",
local_dir=str(target),
local_dir_use_symlinks=False, # Windows uyumu icin kopya
resume_download=True,
)
logger.info("Tamamlandi: %s", path)
return Path(path)
except Exception as e:
logger.warning("snapshot_download basarisiz: %s", e)
logger.info("Fallback: datasets.load_dataset deneniyor...")
try:
from datasets import load_dataset
ds = load_dataset(repo_id, cache_dir=str(target / ".hf_cache"))
logger.info("datasets ile yuklendi: %s", list(ds.keys()))
return target
except Exception as e2:
logger.error("HF indirme basarisiz: %s", e2)
logger.error("Manuel: https://huggingface.co/datasets/%s", repo_id)
return None
# -----------------------------------------------------------------------------
# 2) Ultralytics CarParts-Seg
# -----------------------------------------------------------------------------
def trigger_carparts_ultra(logger: logging.Logger, dry_run: bool = False) -> Optional[Path]:
"""Ultralytics CarParts-Seg veri setini ON-CACHE eder.
Ultralytics datasets klasorune kendi indirir. Biz sadece pre-fetch tetikliyoruz.
Kullanici sonrasinda:
python services/ml/prepare_parts_data.py --use_ultralytics \\
--output_dir services/ml/data/parts_yolo
"""
if dry_run:
logger.info("[DRY-RUN] Ultralytics 'carparts-seg.yaml' auto-download tetiklenecek")
logger.info("[DRY-RUN] Tahmini boyut: ~%.1f GB", DISK_ESTIMATES_GB["carparts_ultra"])
return None
try:
from ultralytics import YOLO
from ultralytics.utils import SETTINGS
except ImportError:
logger.error("ultralytics yok. Kur: pip install ultralytics")
return None
datasets_dir = Path(SETTINGS.get("datasets_dir", "."))
target = datasets_dir / "carparts-seg"
logger.info("Ultralytics datasets_dir: %s", datasets_dir)
logger.info("Hedef: %s", target)
if not confirm_disk(datasets_dir, DISK_ESTIMATES_GB["carparts_ultra"], logger, False):
return None
if target.exists() and any(target.iterdir()):
logger.info("Mevcut, atlandi: %s", target)
else:
logger.info("CarParts-Seg indirme tetikleniyor (CPU val on minimal config)...")
try:
m = YOLO("yolo11n-seg.pt") # backbone auto-iner; Ultralytics dataset ZIP indirir
try:
m.val(data="carparts-seg.yaml", batch=1, device="cpu",
workers=0, verbose=False)
except Exception as ve:
# Indirme tetiklemek icin val'a guveniyoruz; val hatasi onemsiz
logger.info("val tetikleyici tamamlandi (hata bekleniyordu): %s",
type(ve).__name__)
except Exception as e:
logger.error("Ultralytics indirme basarisiz: %s", e)
logger.error("Manuel: https://docs.ultralytics.com/datasets/segment/carparts-seg/")
return None
logger.info("\nSonraki adim:\n python services/ml/prepare_parts_data.py "
"--use_ultralytics --output_dir %s", PARTS_YOLO_DIR)
return target
# -----------------------------------------------------------------------------
# 3) Roboflow severity dataset
# -----------------------------------------------------------------------------
def download_roboflow_severity(logger: logging.Logger,
workspace: str = "car-damage-detection-cardd",
project: str = "car-damage-severity",
version: int = 1,
fmt: str = "yolov8",
dry_run: bool = False) -> Optional[Path]:
"""Roboflow Universe'den severity (minor/moderate/severe) seti indirir.
NOT: workspace/project/version isimleri Roboflow Universe arama ile
DOGRULANMALIDIR. Kullanici kendi se?tigi seti vermeli.
"""
target = SEVERITY_ROBOFLOW_DIR
if not confirm_disk(target, DISK_ESTIMATES_GB["severity_roboflow"], logger, dry_run):
return None
load_dotenv_if_present()
api_key = os.environ.get("ROBOFLOW_API_KEY")
if dry_run:
logger.info("[DRY-RUN] Roboflow: %s/%s v%d -> %s",
workspace, project, version, target)
logger.info("[DRY-RUN] API key durumu: %s",
"MEVCUT" if api_key else "EKSIK")
return target
if not api_key:
logger.error("ROBOFLOW_API_KEY tanimli degil.")
logger.error("Cozum:")
logger.error(" 1) https://app.roboflow.com/settings/api adresinden key al")
logger.error(" 2) %s\\.env dosyasina ekle:", PROJECT_ROOT)
logger.error(" ROBOFLOW_API_KEY=xxxxxxxxxxxxxxxx")
logger.error(" 3) Veya manuel indir:")
logger.error(" https://universe.roboflow.com -> 'car damage severity' arat")
logger.error(" -> Download Dataset -> YOLOv8 format -> %s altina ac", target)
return None
try:
from roboflow import Roboflow
except ImportError:
logger.error("roboflow yok. Kur: pip install -r scripts/requirements.txt")
return None
target.mkdir(parents=True, exist_ok=True)
logger.info("Roboflow indirme: %s/%s v%d (format=%s)",
workspace, project, version, fmt)
try:
rf = Roboflow(api_key=api_key)
proj = rf.workspace(workspace).project(project)
ver = proj.version(version)
# Roboflow paketi indirme yapip path dondurur
ds = ver.download(fmt, location=str(target))
logger.info("Indirildi: %s", ds.location)
return Path(ds.location)
except Exception as e:
logger.error("Roboflow indirme basarisiz: %s", e)
logger.error("Manuel alternatif: Roboflow Universe arayuzunden ZIP indir,")
logger.error(" %s altina cikart, prepare benzeri scripte yonlendir.", target)
return None
# -----------------------------------------------------------------------------
# 4) CarDD manuel (form sonrasi ZIP)
# -----------------------------------------------------------------------------
def install_cardd_manual(zip_or_dir: Path, logger: logging.Logger,
dry_run: bool = False) -> Optional[Path]:
"""Kullanicinin form sonrasi indirdigi CarDD_release ZIP/klasorunu yerlestirir.
Kabul edilen kaynaklar:
- .zip dosyasi (icinde CarDD_release/ veya CarDD_COCO/)
- Acilmis dizin (CarDD_release/ veya direkt CarDD_COCO/)
"""
src = Path(zip_or_dir).expanduser().resolve()
target = CARDD_RELEASE_DIR
target.mkdir(parents=True, exist_ok=True)
if not src.exists():
logger.error("Kaynak bulunamadi: %s", src)
return None
if not confirm_disk(target, DISK_ESTIMATES_GB["cardd_release"], logger, dry_run):
return None
if dry_run:
logger.info("[DRY-RUN] %s -> %s", src, target)
return target
if src.is_file() and src.suffix.lower() == ".zip":
logger.info("ZIP cikartiliyor: %s -> %s", src, target)
try:
write_hash_sidecar(src, logger)
except Exception as e:
logger.warning("Hash hesaplanamadi: %s", e)
with zipfile.ZipFile(src, "r") as zf:
members = zf.namelist()
logger.info("Icerik: %d dosya", len(members))
zf.extractall(target)
elif src.is_dir():
logger.info("Dizin kopyalaniyor: %s -> %s", src, target)
# Tum icerigi merge et
for item in src.iterdir():
dst = target / item.name
if dst.exists():
logger.info(" atlandi (var): %s", dst.name)
continue
if item.is_dir():
shutil.copytree(item, dst)
else:
shutil.copy2(item, dst)
else:
logger.error("Desteklenmeyen kaynak: %s", src)
return None
# CarDD_COCO klasorunu bul (iki seviye derinlik kontrol)
candidates = list(target.glob("**/CarDD_COCO"))
cardd_coco = candidates[0] if candidates else target / "CarDD_COCO"
logger.info("CarDD_COCO yolu: %s (var: %s)", cardd_coco, cardd_coco.exists())
logger.info("\nSonraki adim:\n python services/ml/prepare_data.py "
"\\\n --cardd_root %s \\\n --output_dir %s",
cardd_coco, CARDD_YOLO_DIR)
return cardd_coco
# -----------------------------------------------------------------------------
# Planning / dry-run raporu
# -----------------------------------------------------------------------------
def build_plans(args: argparse.Namespace) -> Iterable[DownloadPlan]:
plans = []
if args.cardd_hf or args.all:
plans.append(DownloadPlan(
"CarDD (HuggingFace mirror)",
CARDD_HF_DIR,
DISK_ESTIMATES_GB["cardd_hf"],
requires_auth=False,
requires_manual=False,
notes="Pretrained-friendly, otomatik resume.",
))
if args.carparts_ultra or args.all:
plans.append(DownloadPlan(
"Ultralytics CarParts-Seg",
Path("<ultralytics datasets_dir>/carparts-seg"),
DISK_ESTIMATES_GB["carparts_ultra"],
requires_auth=False,
requires_manual=False,
notes="Ultralytics auto-download tetiklenir.",
))
if args.roboflow_severity or args.all:
plans.append(DownloadPlan(
"Roboflow severity",
SEVERITY_ROBOFLOW_DIR,
DISK_ESTIMATES_GB["severity_roboflow"],
requires_auth=True,
requires_manual=False,
notes="ROBOFLOW_API_KEY gerekli (.env).",
))
if args.cardd_manual:
plans.append(DownloadPlan(
"CarDD (manuel form sonrasi)",
CARDD_RELEASE_DIR,
DISK_ESTIMATES_GB["cardd_release"],
requires_auth=False,
requires_manual=True,
notes="https://cardd-ustc.github.io form basvurusu sonrasi.",
))
elif args.all:
plans.append(DownloadPlan(
"CarDD (manuel form sonrasi)",
CARDD_RELEASE_DIR,
DISK_ESTIMATES_GB["cardd_release"],
requires_auth=False,
requires_manual=True,
notes="--all otomatik indirmez; form bekliyor.",
))
return plans
def print_plan_table(plans: Iterable[DownloadPlan], logger: logging.Logger) -> None:
logger.info("=" * 78)
logger.info("INDIRME PLANI")
logger.info("=" * 78)
total = 0.0
for p in plans:
flags = []
if p.requires_auth:
flags.append("AUTH")
if p.requires_manual:
flags.append("MANUEL")
flag_str = ",".join(flags) if flags else "-"
logger.info("- %-35s ~%5.1f GB [%s]", p.name, p.est_gb, flag_str)
logger.info(" hedef: %s", p.target)
logger.info(" not : %s", p.notes)
total += p.est_gb
logger.info("-" * 78)
logger.info("Toplam tahmini: ~%.1f GB", total)
logger.info("=" * 78)
# -----------------------------------------------------------------------------
# CLI
# -----------------------------------------------------------------------------
def build_parser() -> argparse.ArgumentParser:
p = argparse.ArgumentParser(
description=(
"Arac hasar tespiti MVP veri seti indirici. "
"Tum ciktilar services/ml/data/ altina kaydedilir."
),
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog=__doc__,
)
p.add_argument("--cardd-hf", action="store_true",
help="HuggingFace harpreetsahota/CarDD setini indir")
p.add_argument("--carparts-ultra", action="store_true",
help="Ultralytics CarParts-Seg setini pre-fetch et")
p.add_argument("--roboflow-severity", action="store_true",
help="Roboflow severity (minor/moderate/severe) setini indir")
p.add_argument("--cardd-manual", type=str, default=None, metavar="PATH",
help="Form sonrasi indirilen CarDD_release ZIP/klasor yolu")
p.add_argument("--all", action="store_true",
help="Mumkun olan setlerin hepsini indir (manuel adimlar haric)")
p.add_argument("--rf-workspace", default="car-damage-detection-cardd",
help="Roboflow workspace slug (DOGRULA)")
p.add_argument("--rf-project", default="car-damage-severity",
help="Roboflow project slug (DOGRULA)")
p.add_argument("--rf-version", type=int, default=1,
help="Roboflow version no")
p.add_argument("--rf-format", default="yolov8",
help="Roboflow export format (yolov8/yolov11/coco)")
p.add_argument("--dry-run", action="store_true",
help="Sadece plan/disk raporu; indirme yapma")
return p
def main(argv: Optional[list] = None) -> int:
parser = build_parser()
args = parser.parse_args(argv)
if not any([args.cardd_hf, args.carparts_ultra, args.roboflow_severity,
args.cardd_manual, args.all]):
parser.print_help()
print("\nHIC SUBCOMMAND VERILMEDI. Ornek: --all --dry-run")
return 2
logger = setup_logger("download_data")
logger.info("Proje koku: %s", PROJECT_ROOT)
logger.info("Veri koku: %s", DATA_ROOT)
DATA_ROOT.mkdir(parents=True, exist_ok=True)
plans = list(build_plans(args))
print_plan_table(plans, logger)
if args.dry_run:
logger.info("[DRY-RUN] Hicbir indirme yapilmadi.")
return 0
rc = 0
if args.cardd_hf or args.all:
if download_cardd_hf(logger) is None:
rc = max(rc, 1)
if args.carparts_ultra or args.all:
if trigger_carparts_ultra(logger) is None:
rc = max(rc, 1)
if args.roboflow_severity or args.all:
if download_roboflow_severity(
logger,
workspace=args.rf_workspace,
project=args.rf_project,
version=args.rf_version,
fmt=args.rf_format) is None:
rc = max(rc, 1)
if args.cardd_manual:
if install_cardd_manual(Path(args.cardd_manual), logger) is None:
rc = max(rc, 1)
elif args.all:
logger.info("")
logger.info("MANUEL ADIM (CarDD ana set):")
logger.info(" 1) https://cardd-ustc.github.io adresinde forma basvur (1-2 gun).")
logger.info(" 2) ZIP gelince:")
logger.info(" python scripts/download_data.py --cardd-manual <ZIP_YOLU>")
logger.info(" 3) Bu arada HF mirror'i kullanarak pretrained baselineu egit.")
logger.info("Bitti. RC=%d", rc)
return rc
if __name__ == "__main__":
sys.exit(main())