Spaces:
Runtime error
Runtime error
Define DVC pipeline for data ingestion and training
Browse files- .dvc/config +4 -0
- .github/workflows/.gitkeep +0 -0
- app.py +0 -0
- config/config.yaml +18 -0
- dvc.yaml +20 -0
- main.py +23 -0
- params.yaml +9 -0
- readme.md +0 -0
- requirements.txt +34 -0
- research/trials.ipynb +0 -0
- setup.py +28 -0
- src/cnnClassifier/__init__.py +0 -0
- src/cnnClassifier/components/__init__.py +0 -0
- src/cnnClassifier/components/data_ingestion.py +34 -0
- src/cnnClassifier/components/model_trainer.py +173 -0
- src/cnnClassifier/config/__init__.py +0 -0
- src/cnnClassifier/config/configuration.py +58 -0
- src/cnnClassifier/constants/__init__.py +4 -0
- src/cnnClassifier/entity/__init__.py +0 -0
- src/cnnClassifier/entity/config_entity.py +29 -0
- src/cnnClassifier/pipeline/__init__.py +0 -0
- src/cnnClassifier/pipeline/stage_01_data_ingestion.py +27 -0
- src/cnnClassifier/pipeline/stage_02_model_training.py +26 -0
- src/cnnClassifier/utils/__init__.py +0 -0
- src/cnnClassifier/utils/common.py +43 -0
- template.py +44 -0
- templates/index.html +0 -0
.dvc/config
CHANGED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[core]
|
| 2 |
+
remote = myremote
|
| 3 |
+
['remote "myremote"']
|
| 4 |
+
url = ../../dvc-storage
|
.github/workflows/.gitkeep
ADDED
|
File without changes
|
app.py
ADDED
|
File without changes
|
config/config.yaml
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
artifacts_root: artifacts
|
| 2 |
+
|
| 3 |
+
data_ingestion:
|
| 4 |
+
root_dir: artifacts/data_ingestion
|
| 5 |
+
dataset_name: frabbisw/facial-age
|
| 6 |
+
local_data_file: artifacts/data_ingestion/data.zip
|
| 7 |
+
unzip_dir: artifacts/data_ingestion
|
| 8 |
+
|
| 9 |
+
data_preparation:
|
| 10 |
+
root_dir: artifacts/data_preparation
|
| 11 |
+
data_path: artifacts/data_ingestion/face_age
|
| 12 |
+
dataset_name: facial_age_prepared_dataset
|
| 13 |
+
|
| 14 |
+
model_trainer:
|
| 15 |
+
root_dir: artifacts/model_trainer
|
| 16 |
+
trained_model_path: artifacts/model_trainer/facial_age_detector_model
|
| 17 |
+
# Using EfficientFormer-L1, a much lighter model than ViT
|
| 18 |
+
model_name: "snap-research/efficientformer-l1"
|
dvc.yaml
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
stages:
|
| 2 |
+
data_ingestion:
|
| 3 |
+
cmd: python src/cnnClassifier/pipeline/stage_01_data_ingestion.py
|
| 4 |
+
deps:
|
| 5 |
+
- src/cnnClassifier/pipeline/stage_01_data_ingestion.py
|
| 6 |
+
- src/cnnClassifier/components/data_ingestion.py
|
| 7 |
+
- config/config.yaml
|
| 8 |
+
outs:
|
| 9 |
+
- artifacts/data_ingestion
|
| 10 |
+
|
| 11 |
+
model_training:
|
| 12 |
+
cmd: python src/cnnClassifier/pipeline/stage_02_model_training.py
|
| 13 |
+
deps:
|
| 14 |
+
- src/cnnClassifier/pipeline/stage_02_model_training.py
|
| 15 |
+
- src/cnnClassifier/components/model_trainer.py
|
| 16 |
+
- config/config.yaml
|
| 17 |
+
- params.yaml
|
| 18 |
+
- artifacts/data_ingestion # Depends on the output of the previous stage
|
| 19 |
+
outs:
|
| 20 |
+
- artifacts/model_trainer
|
main.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from cnnClassifier import logger
from cnnClassifier.pipeline.stage_01_data_ingestion import DataIngestionTrainingPipeline
from cnnClassifier.pipeline.stage_02_model_training import ModelTrainingPipeline


def _run_stage(stage_name, pipeline_cls):
    """Run one pipeline stage with uniform start/finish logging.

    Args:
        stage_name: human-readable stage label used in the log lines.
        pipeline_cls: pipeline class exposing a no-arg constructor and ``main()``.

    Raises:
        Exception: re-raises whatever the stage raised, after logging it,
            so a failed stage aborts the whole run (DVC relies on the exit code).
    """
    try:
        logger.info(f">>>>>> stage {stage_name} started <<<<<<")
        pipeline_cls().main()
        logger.info(f">>>>>> stage {stage_name} completed <<<<<<\n\nx==========x")
    except Exception as e:
        logger.exception(e)
        raise e


# Stages run in order; a failure in an earlier stage stops the later ones.
_run_stage("Data Ingestion stage", DataIngestionTrainingPipeline)
_run_stage("Model Training stage", ModelTrainingPipeline)
|
params.yaml
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Training Parameters
|
| 2 |
+
IMAGE_SIZE: 224 # EfficientFormer-L1 was trained on 224x224
|
| 3 |
+
LEARNING_RATE: 2e-5
|
| 4 |
+
BATCH_SIZE: 32
|
| 5 |
+
NUM_TRAIN_EPOCHS: 20 # Adjust as needed
|
| 6 |
+
WEIGHT_DECAY: 0.01
|
| 7 |
+
WARMUP_STEPS: 100
|
| 8 |
+
TEST_SPLIT_SIZE: 0.2
|
| 9 |
+
RANDOM_STATE: 42
|
readme.md
ADDED
|
File without changes
|
requirements.txt
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# For PyTorch with CUDA 11.8 - MUST be installed with the extra index URL
|
| 2 |
+
--extra-index-url https://download.pytorch.org/whl/cu118
|
| 3 |
+
torch==2.1.0+cu118
|
| 4 |
+
torchvision==0.16.0+cu118
|
| 5 |
+
torchaudio==2.1.0
|
| 6 |
+
|
| 7 |
+
# Pin NumPy to a version compatible with Torch 2.1.0
|
| 8 |
+
numpy<2.0
|
| 9 |
+
|
| 10 |
+
# Hugging Face
|
| 11 |
+
transformers
|
| 12 |
+
datasets>=2.14.5
|
| 13 |
+
evaluate
|
| 14 |
+
accelerate>=0.27
|
| 15 |
+
|
| 16 |
+
# MLOps and Utilities
|
| 17 |
+
mlflow
|
| 18 |
+
dvc[s3] # Assuming you might use S3 with DVC for AWS
|
| 19 |
+
python-box
|
| 20 |
+
PyYAML
|
| 21 |
+
ensure
|
| 22 |
+
pandas
|
| 23 |
+
scikit-learn
|
| 24 |
+
Pillow
|
| 25 |
+
tqdm
|
| 26 |
+
imbalanced-learn  # correct PyPI name; 'imblearn' is only a deprecated alias package
|
| 27 |
+
|
| 28 |
+
# Frontend and Real-time Processing
|
| 29 |
+
streamlit
|
| 30 |
+
opencv-python
|
| 31 |
+
mtcnn
|
| 32 |
+
|
| 33 |
+
# AWS Deployment
|
| 34 |
+
boto3
|
research/trials.ipynb
ADDED
|
File without changes
|
setup.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os

import setuptools

# The repository's readme file is named "readme.md" (lowercase); accept either
# casing so the build does not crash on case-sensitive filesystems.
long_description = ""
for _readme in ("README.md", "readme.md"):
    if os.path.exists(_readme):
        with open(_readme, "r", encoding="utf-8") as f:
            long_description = f.read()
        break

__version__ = "0.0.0"

REPO_NAME = "Facial-Age-Detection"
AUTHOR_USER_NAME = "AlyyanAhmed21"  # Change this
SRC_REPO = "cnnClassifier"
AUTHOR_EMAIL = "alyyanawan19@gmail.com"  # Change this


setuptools.setup(
    name=SRC_REPO,
    version=__version__,
    author=AUTHOR_USER_NAME,
    author_email=AUTHOR_EMAIL,
    description="A python package for facial age detection app",
    long_description=long_description,
    # Fixed: was misspelled 'long_description_content', which setuptools
    # silently ignores, so PyPI would not render the markdown readme.
    long_description_content_type="text/markdown",
    url=f"https://github.com/{AUTHOR_USER_NAME}/{REPO_NAME}",
    project_urls={
        "Bug Tracker": f"https://github.com/{AUTHOR_USER_NAME}/{REPO_NAME}/issues",
    },
    package_dir={"": "src"},
    packages=setuptools.find_packages(where="src")
)
|
src/cnnClassifier/__init__.py
ADDED
|
File without changes
|
src/cnnClassifier/components/__init__.py
ADDED
|
File without changes
|
src/cnnClassifier/components/data_ingestion.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import zipfile
|
| 3 |
+
from cnnClassifier import logger
|
| 4 |
+
from cnnClassifier.entity.config_entity import DataIngestionConfig
|
| 5 |
+
|
| 6 |
+
class DataIngestion:
    """Downloads the configured Kaggle dataset and extracts it into the artifacts tree."""

    def __init__(self, config: DataIngestionConfig):
        self.config = config

    def download_file(self):
        """
        Downloads the dataset from Kaggle via the `kaggle` CLI.

        Make sure to have your kaggle.json file in ~/.kaggle/ or set
        KAGGLE_USERNAME and KAGGLE_KEY env variables.

        Raises:
            RuntimeError: if the kaggle CLI exits with a non-zero status.
        """
        try:
            logger.info(f"Downloading dataset from kaggle: {self.config.dataset_name}")
            download_dir = os.path.dirname(self.config.local_data_file)
            # Check the CLI exit status: os.system does not raise on failure,
            # so a silent download error would only surface later at unzip time.
            exit_code = os.system(
                f"kaggle datasets download {self.config.dataset_name} -p {download_dir}"
            )
            if exit_code != 0:
                raise RuntimeError(f"kaggle CLI exited with status {exit_code}")
            # Kaggle names the archive after the dataset slug (e.g.
            # 'frabbisw/facial-age' -> 'facial-age.zip'); derive it instead of
            # hard-coding so other datasets work too, then move it to the path
            # the rest of the pipeline expects.
            slug = self.config.dataset_name.split('/')[-1]
            downloaded_zip_path = os.path.join(download_dir, f"{slug}.zip")
            if os.path.abspath(downloaded_zip_path) != os.path.abspath(self.config.local_data_file):
                # os.replace overwrites an existing target, so re-runs don't
                # fail where os.rename would (e.g. on Windows).
                os.replace(downloaded_zip_path, self.config.local_data_file)
            logger.info(f"Dataset downloaded and saved at {self.config.local_data_file}")
        except Exception as e:
            logger.error(f"Failed to download dataset. Error: {e}")
            raise e

    def extract_zip_file(self):
        """
        Extracts the zip file into the data directory
        """
        unzip_path = self.config.unzip_dir
        os.makedirs(unzip_path, exist_ok=True)
        with zipfile.ZipFile(self.config.local_data_file, 'r') as zip_ref:
            zip_ref.extractall(unzip_path)
        logger.info(f"Dataset extracted to {unzip_path}")
|
src/cnnClassifier/components/model_trainer.py
ADDED
|
@@ -0,0 +1,173 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
import pandas as pd
from pathlib import Path
from tqdm import tqdm
from datasets import Dataset, Image, ClassLabel
from imblearn.over_sampling import RandomOverSampler
from transformers import (
    EfficientFormerImageProcessor,
    EfficientFormerForImageClassification,
    TrainingArguments,
    Trainer,
    DefaultDataCollator
)
from torchvision.transforms import (
    Compose,
    Normalize,
    RandomRotation,
    RandomResizedCrop,
    RandomHorizontalFlip,
    Resize,
    ToTensor
)
import evaluate
from cnnClassifier import logger  # was missing: ModelTrainer calls logger.info/logger.error
from cnnClassifier.entity.config_entity import ModelTrainerConfig
|
| 25 |
+
|
| 26 |
+
class ModelTrainer:
    """Fine-tunes an EfficientFormer image classifier on the facial-age dataset.

    The raw dataset stores one directory per exact age (e.g. '025'). Ages are
    bucketed into coarser range labels (e.g. '21-25') before training, and class
    imbalance is corrected with random oversampling of image paths.
    """

    def __init__(self, config: ModelTrainerConfig):
        self.config = config
        # Populated by _prepare_data(); map between class names and integer ids.
        self.label2id = None
        self.id2label = None

    def _prepare_data(self):
        """Build an oversampled, label-encoded, stratified train/test split.

        Returns:
            DatasetDict with 'train' and 'test' splits; each row has an
            'image' column (lazy-decoded) and an integer 'label' column.
        """
        logger.info("Preparing data...")
        # Map exact-age folder names to coarser age-range class labels.
        label_dict = {'001': '01', '002': '02', '003': '03', '004': '04', '005': '05',
                      '006': '06-07', '007': '06-07', '008': '08-09', '009': '08-09',
                      '010': '10-12', '011': '10-12', '012': '10-12', '013': '13-15',
                      '014': '13-15', '015': '13-15', '016': '16-20', '017': '16-20',
                      '018': '16-20', '019': '16-20', '020': '16-20', '021': '21-25',
                      '022': '21-25', '023': '21-25', '024': '21-25', '025': '21-25',
                      '026': '26-30', '027': '26-30', '028': '26-30', '029': '26-30',
                      '030': '26-30', '031': '31-35', '032': '31-35', '033': '31-35',
                      '034': '31-35', '035': '31-35', '036': '36-40', '037': '36-40',
                      '038': '36-40', '039': '36-40', '040': '36-40', '041': '41-45',
                      '042': '41-45', '043': '41-45', '044': '41-45', '045': '41-45',
                      '046': '46-50', '047': '46-50', '048': '46-50', '049': '46-50',
                      '050': '46-50', '051': '51-55', '052': '51-55', '053': '51-55',
                      '054': '51-55', '055': '51-55', '056': '56-60', '057': '56-60',
                      '058': '56-60', '059': '56-60', '060': '56-60', '061': '61-65',
                      '062': '61-65', '063': '61-65', '064': '61-65', '065': '61-65',
                      '066': '66-70', '067': '66-70', '068': '66-70', '069': '66-70',
                      '070': '66-70', '071': '71-80', '072': '71-80', '073': '71-80',
                      '074': '71-80', '075': '71-80', '076': '71-80', '077': '71-80',
                      '078': '71-80', '079': '71-80', '080': '71-80', '081': '81-90',
                      '082': '81-90', '083': '81-90', '084': '81-90', '085': '81-90',
                      '086': '81-90', '087': '81-90', '088': '81-90', '089': '81-90',
                      '090': '81-90', '091': '90+', '092': '90+', '093': '90+',
                      '095': '90+', '096': '90+', '099': '90+', '100': '90+',
                      '101': '90+', '110': '90+'}

        file_names, labels = [], []
        data_path = Path(self.config.data_path)
        for file in tqdm(sorted(data_path.glob('*/*.*'))):
            # Parent directory name is the exact-age folder. Using
            # file.parent.name (not str(file).split('/')) keeps this
            # correct on Windows, where the path separator is '\\'.
            labels.append(label_dict[file.parent.name])
            file_names.append(str(file))

        df = pd.DataFrame.from_dict({"image": file_names, "label": labels})

        # Random oversampling: duplicates image *paths* of minority classes
        # so every age bucket has equal representation.
        ros = RandomOverSampler(random_state=self.config.random_state)
        df_resampled, y_resampled = ros.fit_resample(df[['image']], df['label'])
        df = pd.concat([df_resampled, y_resampled], axis=1)

        # Cast the path column to Image() so decoding happens lazily per batch.
        dataset = Dataset.from_pandas(df).cast_column("image", Image())

        labels_list = sorted(list(set(labels)))
        self.label2id = {label: i for i, label in enumerate(labels_list)}
        self.id2label = {i: label for i, label in enumerate(labels_list)}

        ClassLabels = ClassLabel(num_classes=len(labels_list), names=labels_list)
        dataset = dataset.map(lambda x: {'label': ClassLabels.str2int(x['label'])}, batched=True)
        dataset = dataset.cast_column('label', ClassLabels)

        # Stratify so train and test keep the same class balance.
        return dataset.train_test_split(test_size=self.config.test_split_size, shuffle=True, stratify_by_column="label")

    def train(self):
        """Run the full fine-tuning loop and save the best checkpoint.

        Side effects: writes checkpoints/logs under config.root_dir and the
        final model under config.trained_model_path.
        """
        device = "cuda" if torch.cuda.is_available() else "cpu"
        logger.info(f"Using device: {device}")

        split_dataset = self._prepare_data()
        train_data = split_dataset['train']
        test_data = split_dataset['test']

        processor = EfficientFormerImageProcessor.from_pretrained(self.config.model_name)

        # Normalize with the pretrained model's statistics so inputs match
        # what the backbone saw during pretraining.
        image_mean, image_std = processor.image_mean, processor.image_std
        size = self.config.image_size

        normalize = Normalize(mean=image_mean, std=image_std)
        _train_transforms = Compose([
            Resize((size, size)),
            RandomRotation(15),
            RandomHorizontalFlip(0.5),
            ToTensor(),
            normalize
        ])
        # Validation uses no augmentation — deterministic resize only.
        _val_transforms = Compose([
            Resize((size, size)),
            ToTensor(),
            normalize
        ])

        def train_transforms(examples):
            examples['pixel_values'] = [_train_transforms(image.convert("RGB")) for image in examples['image']]
            return examples

        def val_transforms(examples):
            examples['pixel_values'] = [_val_transforms(image.convert("RGB")) for image in examples['image']]
            return examples

        # set_transform applies lazily at access time, so augmentation is
        # re-sampled every epoch.
        train_data.set_transform(train_transforms)
        test_data.set_transform(val_transforms)

        def collate_fn(examples):
            pixel_values = torch.stack([example["pixel_values"] for example in examples])
            labels = torch.tensor([example['label'] for example in examples])
            return {"pixel_values": pixel_values, "labels": labels}

        model = EfficientFormerForImageClassification.from_pretrained(
            self.config.model_name,
            num_labels=len(self.id2label),
            id2label=self.id2label,
            label2id=self.label2id,
            ignore_mismatched_sizes=True  # replaces the pretrained head for our label set
        ).to(device)

        accuracy = evaluate.load("accuracy")

        def compute_metrics(eval_pred):
            predictions, label_ids = eval_pred
            predicted_labels = predictions.argmax(axis=1)
            return accuracy.compute(predictions=predicted_labels, references=label_ids)

        # NOTE(review): 'evaluation_strategy' was renamed to 'eval_strategy'
        # in newer transformers releases — confirm against the installed version.
        args = TrainingArguments(
            output_dir=self.config.root_dir,
            logging_dir=f'{self.config.root_dir}/logs',
            evaluation_strategy="epoch",
            learning_rate=self.config.learning_rate,
            per_device_train_batch_size=self.config.batch_size,
            per_device_eval_batch_size=self.config.batch_size,
            num_train_epochs=self.config.num_train_epochs,
            weight_decay=self.config.weight_decay,
            warmup_steps=self.config.warmup_steps,
            save_strategy='epoch',
            load_best_model_at_end=True,
            metric_for_best_model="accuracy",
            save_total_limit=1,  # keep only the best checkpoint on disk
            report_to="none"
        )

        trainer = Trainer(
            model=model,
            args=args,
            train_dataset=train_data,
            eval_dataset=test_data,
            data_collator=collate_fn,
            compute_metrics=compute_metrics,
            tokenizer=processor,  # saves the processor alongside the model
        )

        trainer.train()

        logger.info(f"Saving best model to {self.config.trained_model_path}")
        trainer.save_model(self.config.trained_model_path)
|
src/cnnClassifier/config/__init__.py
ADDED
|
File without changes
|
src/cnnClassifier/config/configuration.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from cnnClassifier.constants import *
|
| 2 |
+
from cnnClassifier.utils.common import read_yaml, create_directories
|
| 3 |
+
from cnnClassifier.entity.config_entity import DataIngestionConfig, DataPreparationConfig, ModelTrainerConfig
|
| 4 |
+
|
| 5 |
+
class ConfigurationManager:
    """Loads the project YAML files and builds per-stage configuration objects.

    The YAML files are read once, up front; each getter then creates the
    stage's working directory and returns an immutable config dataclass.
    """

    def __init__(
            self,
            config_filepath = CONFIG_FILE_PATH,
            params_filepath = PARAMS_FILE_PATH):
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        create_directories([self.config.artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Build the config for the download/extract stage."""
        cfg = self.config.data_ingestion
        create_directories([cfg.root_dir])
        return DataIngestionConfig(
            root_dir=cfg.root_dir,
            dataset_name=cfg.dataset_name,
            local_data_file=cfg.local_data_file,
            unzip_dir=cfg.unzip_dir,
        )

    def get_data_preparation_config(self) -> DataPreparationConfig:
        """Build the config for the dataset-preparation stage."""
        cfg = self.config.data_preparation
        create_directories([cfg.root_dir])
        return DataPreparationConfig(
            root_dir=cfg.root_dir,
            data_path=cfg.data_path,
            dataset_name=cfg.dataset_name,
        )

    def get_model_trainer_config(self) -> ModelTrainerConfig:
        """Build the training-stage config, merging paths with hyperparameters."""
        cfg = self.config.model_trainer
        hp = self.params
        create_directories([cfg.root_dir])
        return ModelTrainerConfig(
            root_dir=Path(cfg.root_dir),
            trained_model_path=Path(cfg.trained_model_path),
            model_name=cfg.model_name,
            image_size=hp.IMAGE_SIZE,
            learning_rate=hp.LEARNING_RATE,
            batch_size=hp.BATCH_SIZE,
            num_train_epochs=hp.NUM_TRAIN_EPOCHS,
            weight_decay=hp.WEIGHT_DECAY,
            warmup_steps=hp.WARMUP_STEPS,
            test_split_size=hp.TEST_SPLIT_SIZE,
            random_state=hp.RANDOM_STATE,
        )
|
src/cnnClassifier/constants/__init__.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pathlib import Path

# Project-wide file locations, resolved relative to the repository root
# (pipeline scripts are run from the repo root, e.g. via dvc.yaml).
CONFIG_FILE_PATH = Path("config/config.yaml")  # structural config: paths, artifact dirs
PARAMS_FILE_PATH = Path("params.yaml")  # tunable training hyperparameters
|
src/cnnClassifier/entity/__init__.py
ADDED
|
File without changes
|
src/cnnClassifier/entity/config_entity.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from dataclasses import dataclass
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
|
| 4 |
+
@dataclass(frozen=True)
class DataIngestionConfig:
    """Immutable settings for the Kaggle download/extract stage (from config.yaml)."""
    root_dir: Path  # working directory for this stage's artifacts
    dataset_name: str  # Kaggle dataset slug, e.g. 'frabbisw/facial-age'
    local_data_file: Path  # where the downloaded zip is stored
    unzip_dir: Path  # directory the archive is extracted into
|
| 10 |
+
|
| 11 |
+
@dataclass(frozen=True)
class DataPreparationConfig:
    """Immutable settings for the dataset-preparation stage (from config.yaml)."""
    root_dir: Path  # working directory for this stage's artifacts
    data_path: Path  # path to the extracted raw image folders
    dataset_name: str  # name for the prepared dataset artifact
|
| 16 |
+
|
| 17 |
+
@dataclass(frozen=True)
class ModelTrainerConfig:
    """Immutable settings for the training stage (paths from config.yaml,
    hyperparameters from params.yaml)."""
    root_dir: Path  # output dir for checkpoints and logs
    trained_model_path: Path  # where the final best model is saved
    model_name: str  # Hugging Face model id to fine-tune
    image_size: int  # square input resolution fed to the model
    learning_rate: float
    batch_size: int  # per-device train and eval batch size
    num_train_epochs: int
    weight_decay: float
    warmup_steps: int
    test_split_size: float  # fraction of data held out for evaluation
    random_state: int  # seed for oversampling / splitting reproducibility
|
src/cnnClassifier/pipeline/__init__.py
ADDED
|
File without changes
|
src/cnnClassifier/pipeline/stage_01_data_ingestion.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from cnnClassifier.config.configuration import ConfigurationManager
|
| 2 |
+
from cnnClassifier.components.data_ingestion import DataIngestion
|
| 3 |
+
from cnnClassifier import logger
|
| 4 |
+
|
| 5 |
+
STAGE_NAME = "Data Ingestion stage"
|
| 6 |
+
|
| 7 |
+
class DataIngestionTrainingPipeline:
    """Orchestrates the data-ingestion stage: download the archive, then unpack it."""

    def __init__(self):
        pass

    def main(self):
        """Wire config into the DataIngestion component and run its two steps."""
        manager = ConfigurationManager()
        ingestion_cfg = manager.get_data_ingestion_config()
        ingestion = DataIngestion(config=ingestion_cfg)
        ingestion.download_file()
        ingestion.extract_zip_file()
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
if __name__ == '__main__':
    # Allow this stage to run standalone (e.g. invoked directly by dvc repro).
    try:
        logger.info(f">>>>>> stage {STAGE_NAME} started <<<<<<")
        DataIngestionTrainingPipeline().main()
        logger.info(f">>>>>> stage {STAGE_NAME} completed <<<<<<\n\nx==========x")
    except Exception as exc:
        logger.exception(exc)
        raise exc
|
src/cnnClassifier/pipeline/stage_02_model_training.py
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from cnnClassifier.config.configuration import ConfigurationManager
|
| 2 |
+
from cnnClassifier.components.model_trainer import ModelTrainer
|
| 3 |
+
from cnnClassifier import logger
|
| 4 |
+
|
| 5 |
+
STAGE_NAME = "Model Training stage"
|
| 6 |
+
|
| 7 |
+
class ModelTrainingPipeline:
    """Orchestrates the model-training stage end to end."""

    def __init__(self):
        pass

    def main(self):
        """Build the trainer config and run fine-tuning."""
        manager = ConfigurationManager()
        trainer_cfg = manager.get_model_trainer_config()
        trainer = ModelTrainer(config=trainer_cfg)
        trainer.train()
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
if __name__ == '__main__':
    # Allow this stage to run standalone (e.g. invoked directly by dvc repro).
    try:
        logger.info(f">>>>>> stage {STAGE_NAME} started <<<<<<")
        ModelTrainingPipeline().main()
        logger.info(f">>>>>> stage {STAGE_NAME} completed <<<<<<\n\nx==========x")
    except Exception as exc:
        logger.exception(exc)
        raise exc
|
src/cnnClassifier/utils/__init__.py
ADDED
|
File without changes
|
src/cnnClassifier/utils/common.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from box.exceptions import BoxValueError
|
| 3 |
+
import yaml
|
| 4 |
+
from cnnClassifier import logger
|
| 5 |
+
import json
|
| 6 |
+
import joblib
|
| 7 |
+
from ensure import ensure_annotations
|
| 8 |
+
from box import ConfigBox
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
from typing import Any
|
| 11 |
+
|
| 12 |
+
@ensure_annotations
def read_yaml(path_to_yaml: Path) -> ConfigBox:
    """Parse a YAML file into a dot-accessible ConfigBox.

    Args:
        path_to_yaml (Path): location of the YAML file.

    Raises:
        ValueError: if the yaml file is empty.

    Returns:
        ConfigBox: dot-accessible view of the parsed YAML mapping.
    """
    try:
        with open(path_to_yaml) as yaml_file:
            parsed = yaml.safe_load(yaml_file)
        logger.info(f"yaml file: {path_to_yaml} loaded successfully")
        # An empty file parses to None; ConfigBox(None) raises BoxValueError,
        # which we surface as a clearer ValueError below.
        return ConfigBox(parsed)
    except BoxValueError:
        raise ValueError("yaml file is empty")
    except Exception as e:
        raise e
|
| 32 |
+
|
| 33 |
+
@ensure_annotations
def create_directories(path_to_directories: list, verbose=True):
    """Create every directory in the list (no error if one already exists).

    Args:
        path_to_directories (list): directory paths to create.
        verbose (bool, optional): log each created directory. Defaults to True.
    """
    for path in path_to_directories:
        os.makedirs(path, exist_ok=True)
        if not verbose:
            continue
        logger.info(f"created directory at: {path}")
|
template.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
from pathlib import Path
import logging

logging.basicConfig(level=logging.INFO, format='[%(asctime)s]: %(message)s:')

project_name = "cnnClassifier"

# Project skeleton: every path listed here is created (empty) if it does not
# already exist; parent directories are created as needed.
list_of_files = [
    ".github/workflows/.gitkeep",
    f"src/{project_name}/__init__.py",
    f"src/{project_name}/components/__init__.py",
    f"src/{project_name}/utils/__init__.py",
    f"src/{project_name}/utils/common.py",
    f"src/{project_name}/config/__init__.py",
    f"src/{project_name}/config/configuration.py",
    f"src/{project_name}/pipeline/__init__.py",
    f"src/{project_name}/entity/__init__.py",
    f"src/{project_name}/entity/config_entity.py",
    f"src/{project_name}/constants/__init__.py",
    "config/config.yaml",
    "dvc.yaml",
    "params.yaml",
    "requirements.txt",
    "setup.py",
    "research/trials.ipynb",
    "templates/index.html",
    "app.py"  # For Streamlit
]

for filepath in list_of_files:
    filepath = Path(filepath)
    filedir, filename = os.path.split(filepath)

    if filedir != "":
        os.makedirs(filedir, exist_ok=True)
        # Fixed: log messages previously had broken placeholders.
        logging.info(f"Creating directory: {filedir} for the file: {filename}")

    # Only (re)create a file when it is missing or empty, so files that
    # already have content are never clobbered.
    if (not os.path.exists(filepath)) or (os.path.getsize(filepath) == 0):
        with open(filepath, "w") as f:
            pass
        logging.info(f"Creating empty file: {filepath}")
    else:
        logging.info(f"{filename} already exists")
|
templates/index.html
ADDED
|
File without changes
|