File size: 3,258 Bytes
d2885a7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
"""
This file builds processed datasets for posture and phone-related training stages.
It keeps your data preparation logic separate from model training logic.
Right now it prepares clean folder-based outputs and metadata summaries.
Later it can be extended for train/val/test split generation and manifest files.
"""

from pathlib import Path

from src.entity.config_entity import PathsConfig
from src.utils.common import create_directories, save_json
from src.utils.helpers import list_image_files, split_list
from src.utils.logger import get_logger


class DatasetBuilder:
    """
    Build simple dataset metadata and split summaries for downstream pipelines.

    Each manifest is a JSON document saved under ``paths_config.metrics_dir``.
    It lists the image files found in one source folder together with the
    train/val/test subset each file was assigned to.
    """

    def __init__(
        self,
        paths_config: PathsConfig,
        log_dir: Path | None = None,
        log_level: str = "INFO",
    ) -> None:
        """
        Store the path configuration and create a logger named after the class.

        Args:
            paths_config: Project paths. This class reads ``metrics_dir``,
                ``raw_image_dir``, ``phone_dataset_dir``,
                ``posture_feature_dir`` and ``hand_crop_dir`` from it.
            log_dir: Forwarded to ``get_logger`` as ``log_dir``.
            log_level: Forwarded to ``get_logger`` as ``level``.
        """
        self.paths_config = paths_config
        self.logger = get_logger(
            self.__class__.__name__, log_dir=log_dir, level=log_level
        )

    def build_image_dataset_manifest(
        self,
        source_dir: Path,
        manifest_name: str,
        train_ratio: float = 0.70,
        val_ratio: float = 0.15,
        test_ratio: float = 0.15,
        *,
        shuffle: bool = True,
        seed: int = 42,
    ) -> dict[str, object]:
        """
        Create a simple dataset manifest from a folder of images.

        The images returned by ``list_image_files(source_dir)`` are divided
        with ``split_list`` and the resulting manifest is saved as
        ``metrics_dir / manifest_name``.

        Args:
            source_dir: Folder whose image files are listed.
            manifest_name: File name of the manifest inside ``metrics_dir``.
            train_ratio: Share of items for the training subset.
            val_ratio: Share of items for the validation subset.
            test_ratio: Share of items for the test subset.
            shuffle: Forwarded to ``split_list``; defaults to ``True``.
            seed: Forwarded to ``split_list``; defaults to ``42`` so repeated
                runs request the same split unless a caller overrides it.

        Returns:
            The manifest dictionary: the source folder, the total and
            per-subset counts, and the per-subset file paths as strings.
        """
        # NOTE(review): the ratios are passed through unchanged; any
        # validation (e.g. that they sum to 1) is left to split_list.
        images = list_image_files(source_dir)
        train_items, val_items, test_items = split_list(
            items=images,
            train_ratio=train_ratio,
            val_ratio=val_ratio,
            test_ratio=test_ratio,
            shuffle=shuffle,
            seed=seed,
        )

        manifest: dict[str, object] = {
            "source_dir": str(source_dir),
            "total_images": len(images),
            "train_count": len(train_items),
            "val_count": len(val_items),
            "test_count": len(test_items),
            # Paths are stored as plain strings so the manifest holds only
            # basic types when handed to save_json.
            "train_files": [str(path) for path in train_items],
            "val_files": [str(path) for path in val_items],
            "test_files": [str(path) for path in test_items],
        }

        # Ensure the output folder exists here as well, so this method can be
        # called on its own without going through run().
        metrics_dir = self.paths_config.metrics_dir
        create_directories([metrics_dir])
        save_json(metrics_dir / manifest_name, manifest)

        self.logger.info("Saved dataset manifest: %s", manifest_name)
        return manifest

    def run(self) -> dict[str, int]:
        """
        Build manifests for available processed datasets.

        Creates the output directories, then writes one manifest for
        ``raw_image_dir`` and one for ``phone_dataset_dir`` using the default
        split settings.

        Returns:
            A summary with the total image count of each manifest.
        """
        create_directories(
            [
                self.paths_config.posture_feature_dir,
                self.paths_config.hand_crop_dir,
                self.paths_config.phone_dataset_dir,
                self.paths_config.metrics_dir,
            ]
        )

        raw_manifest = self.build_image_dataset_manifest(
            source_dir=self.paths_config.raw_image_dir,
            manifest_name="raw_image_manifest.json",
        )
        phone_manifest = self.build_image_dataset_manifest(
            source_dir=self.paths_config.phone_dataset_dir,
            manifest_name="phone_dataset_manifest.json",
        )

        summary = {
            # The key name is kept for existing consumers; the count comes
            # from the raw image folder manifest.
            "posture_manifest_total": raw_manifest["total_images"],
            "phone_manifest_total": phone_manifest["total_images"],
        }

        self.logger.info("Dataset builder summary: %s", summary)
        return summary