Spaces:
Sleeping
Sleeping
File size: 1,424 Bytes
import logging
from pathlib import Path
from src.config import DATASET_DIR, CLASS_TO_IDX
# Module-level logger, named after this module so handlers can filter on it.
logger = logging.getLogger(__name__)
# File suffixes accepted as images. Entries are lowercase with a leading dot;
# collect_image_paths() lowercases each path's suffix before the lookup.
VALID_EXTENSIONS = {".jpg", ".jpeg", ".png", ".webp"}
def collect_image_paths():
    """Collect ``(path, label)`` pairs for every image in the dataset.

    For each ``class_name -> label`` entry of ``CLASS_TO_IDX`` the folder
    ``DATASET_DIR / class_name`` is scanned (non-recursively) for regular
    files whose lowercased suffix is in ``VALID_EXTENSIONS``.

    Classes are visited in ``CLASS_TO_IDX`` iteration order and the files
    of each class are sorted by path, so the result is deterministic for a
    given directory content.

    Returns:
        A list of ``(str(image_path), label)`` tuples, where ``label`` is
        the value stored in ``CLASS_TO_IDX`` for the image's class.

    Raises:
        FileNotFoundError: If ``DATASET_DIR`` does not exist.
        ValueError: If no valid image was found in any class folder.
    """
    logger.info("Starting dataset ingestion...")

    if not DATASET_DIR.exists():
        raise FileNotFoundError(f"Dataset directory not found: {DATASET_DIR}")

    samples = []
    for class_name, label in CLASS_TO_IDX.items():
        class_dir = DATASET_DIR / class_name
        # is_dir() rather than exists(): a stray regular file carrying the
        # class name would otherwise make iterdir() raise NotADirectoryError.
        if not class_dir.is_dir():
            logger.warning("Missing class folder: %s", class_dir)
            continue

        # iterdir() yields entries in arbitrary order; sort them so repeated
        # runs produce the same sample list. is_file() keeps sub-directories
        # that merely look like images (e.g. "thumbs.jpg/") out of the result.
        image_paths = sorted(
            entry
            for entry in class_dir.iterdir()
            if entry.is_file() and entry.suffix.lower() in VALID_EXTENSIONS
        )
        samples.extend((str(image_path), label) for image_path in image_paths)
        logger.info("%s: %d images found", class_name, len(image_paths))

    if not samples:
        raise ValueError("No valid images found in dataset.")

    logger.info("Total images collected: %d", len(samples))
    return samples
if __name__ == "__main__":
    # Manual smoke run: enable timestamped INFO logging, ingest the dataset,
    # then print the total and a short preview of the collected samples.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(message)s",
        level=logging.INFO,
    )

    data = collect_image_paths()
    total = len(data)
    preview = data[:5]

    print(f"\nTotal samples: {total}")
    print("First 5 samples:")
    for entry in preview:
        print(entry)