"""
Data Setup: Scan raw dataset folders, build manifest CSVs, stratified train/val/test split.
Usage:
python -m src.data_setup
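(run from the repository root so the src package is importable)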
Expects dataset extracted to data/raw/ with structure:
data/raw/
├── Ajwa/
│   ├── img001.jpg
│   └── ...
├── Galaxy/
├── Medjool/
└── ...
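Configuration (loaded via src.utils.load_config) must provide, under "data":
raw_dir: path to the raw dataset folder (e.g. data/raw/)
seed:    random seed used for the stratified split
splits:  list of [train, val, test] ratios, e.g. [0.70, 0.15, 0.15]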
"""
import sys
from pathlib import Path
from collections import Counter
import pandas as pd
from sklearn.model_selection import train_test_split
from PIL import Image
from src.utils import load_config, seed_everything
VALID_EXTENSIONS = {".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".webp"}
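# Compared case-insensitively against Path.suffix in scan_dataset(), so .JPG etc. are also accepted.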
def scan_dataset(raw_dir: str) -> pd.DataFrame:
"""
Scan the raw dataset directory and build a manifest DataFrame.
Returns DataFrame with columns: image_path, variety, label_idx
"""
raw_path = Path(raw_dir)
if not raw_path.exists():
print(f"ERROR: Raw data directory not found: {raw_path.resolve()}")
print("\nPlease download the dataset from Kaggle:")
print(" https://www.kaggle.com/datasets/wadhasnalhamdan/date-fruit-image-dataset-in-controlled-environment")
print(f"\nExtract the ZIP so variety folders are directly inside: {raw_path.resolve()}/")
sys.exit(1)
# Discover variety folders
variety_dirs = sorted([
d for d in raw_path.iterdir()
if d.is_dir() and not d.name.startswith(".")
])
if len(variety_dirs) == 0:
print(f"ERROR: No variety folders found in {raw_path.resolve()}")
print("Expected folders like: Ajwa/, Galaxy/, Medjool/, etc.")
sys.exit(1)
# Build class-to-index mapping
class_names = [d.name for d in variety_dirs]
class_to_idx = {name: idx for idx, name in enumerate(class_names)}
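# Alphabetical folder order fixes the label indices, e.g. {"Ajwa": 0, "Galaxy": 1, "Medjool": 2, ...}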
print(f"Found {len(class_names)} varieties: {class_names}")
# Collect all image paths
records = []
skipped = 0
for variety_dir in variety_dirs:
variety = variety_dir.name
# Walk recursively to handle nested structures
for img_path in sorted(variety_dir.rglob("*")):
if img_path.suffix.lower() not in VALID_EXTENSIONS:
continue
# Verify image is readable
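# PIL's verify() checks file integrity without decoding the full image;
# broken or truncated files raise an exception and are counted as skipped.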
try:
with Image.open(img_path) as img:
img.verify()
records.append({
"image_path": str(img_path),
"variety": variety,
"label_idx": class_to_idx[variety],
})
except Exception:
skipped += 1
if skipped > 0:
print(f"Warning: Skipped {skipped} corrupted/unreadable images")
df = pd.DataFrame(records)
print(f"Total valid images: {len(df)}")
return df
def stratified_split(
df: pd.DataFrame,
train_ratio: float = 0.70,
val_ratio: float = 0.15,
test_ratio: float = 0.15,
seed: int = 42,
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
"""
Split dataset into train/val/test with stratification by variety.
"""
assert abs(train_ratio + val_ratio + test_ratio - 1.0) < 1e-6, \
"Split ratios must sum to 1.0"
# First split: train vs (val + test)
train_df, temp_df = train_test_split(
df,
test_size=(val_ratio + test_ratio),
stratify=df["variety"],
random_state=seed,
)
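# stratify=df["variety"] keeps each variety's proportion (approximately) the same in both subsets.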
# Second split: val vs test
relative_test_ratio = test_ratio / (val_ratio + test_ratio)
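# e.g. with val_ratio = test_ratio = 0.15 this is 0.15 / 0.30 = 0.5, so temp_df is split evenly.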
val_df, test_df = train_test_split(
temp_df,
test_size=relative_test_ratio,
stratify=temp_df["variety"],
random_state=seed,
)
return train_df, val_df, test_df
def print_split_summary(train_df: pd.DataFrame, val_df: pd.DataFrame, test_df: pd.DataFrame) -> None:
"""Print detailed split statistics."""
print("\n" + "=" * 60)
print("DATASET SPLIT SUMMARY")
print("=" * 60)
print(f" Train: {len(train_df):>5} images ({len(train_df)/len(train_df)+len(val_df)+len(test_df):.0%}... )")
print(f" Val: {len(val_df):>5} images")
print(f" Test: {len(test_df):>5} images")
total = len(train_df) + len(val_df) + len(test_df)
print(f" Total: {total:>5} images")
print(f"\n{'Variety':<15} {'Train':>6} {'Val':>6} {'Test':>6} {'Total':>6}")
print("-" * 45)
all_varieties = sorted(train_df["variety"].unique())
for variety in all_varieties:
n_train = len(train_df[train_df["variety"] == variety])
n_val = len(val_df[val_df["variety"] == variety])
n_test = len(test_df[test_df["variety"] == variety])
n_total = n_train + n_val + n_test
print(f" {variety:<13} {n_train:>6} {n_val:>6} {n_test:>6} {n_total:>6}")
print("=" * 60)
def main():
"""Main data setup pipeline."""
config = load_config()
seed_everything(config["data"]["seed"])
# Step 1: Scan dataset
print("Step 1: Scanning dataset...")
df = scan_dataset(config["data"]["raw_dir"])
# Step 2: Stratified split
print("\nStep 2: Splitting dataset...")
splits = config["data"]["splits"]
train_df, val_df, test_df = stratified_split(
df,
train_ratio=splits[0],
val_ratio=splits[1],
test_ratio=splits[2],
seed=config["data"]["seed"],
)
# Step 3: Save CSVs
data_dir = Path("data")
data_dir.mkdir(parents=True, exist_ok=True)
train_df.to_csv(data_dir / "train.csv", index=False)
val_df.to_csv(data_dir / "val.csv", index=False)
test_df.to_csv(data_dir / "test.csv", index=False)
print(f"\nSaved: data/train.csv ({len(train_df)} rows)")
print(f"Saved: data/val.csv ({len(val_df)} rows)")
print(f"Saved: data/test.csv ({len(test_df)} rows)")
# Step 4: Print summary
print_split_summary(train_df, val_df, test_df)
# Print the class-to-index mapping for reference (this script does not write it to disk)
class_names = sorted(df["variety"].unique())
class_map = {name: idx for idx, name in enumerate(class_names)}
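# Optional sketch (not part of this script as written): persist the mapping for
# downstream tools, assuming a data/class_map.json path:
#   import json
#   (data_dir / "class_map.json").write_text(json.dumps(class_map, indent=2))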
print(f"\nClass mapping: {class_map}")
print("\nData setup complete. Ready for training.")
if __name__ == "__main__":
main()