Spaces:
Runtime error
Runtime error
Define DVC pipeline for data ingestion and training
Browse files- .dvc/config +4 -0
- .github/workflows/.gitkeep +0 -0
- app.py +0 -0
- config/config.yaml +18 -0
- dvc.yaml +20 -0
- main.py +23 -0
- params.yaml +9 -0
- readme.md +0 -0
- requirements.txt +34 -0
- research/trials.ipynb +0 -0
- setup.py +28 -0
- src/cnnClassifier/__init__.py +0 -0
- src/cnnClassifier/components/__init__.py +0 -0
- src/cnnClassifier/components/data_ingestion.py +34 -0
- src/cnnClassifier/components/model_trainer.py +173 -0
- src/cnnClassifier/config/__init__.py +0 -0
- src/cnnClassifier/config/configuration.py +58 -0
- src/cnnClassifier/constants/__init__.py +4 -0
- src/cnnClassifier/entity/__init__.py +0 -0
- src/cnnClassifier/entity/config_entity.py +29 -0
- src/cnnClassifier/pipeline/__init__.py +0 -0
- src/cnnClassifier/pipeline/stage_01_data_ingestion.py +27 -0
- src/cnnClassifier/pipeline/stage_02_model_training.py +26 -0
- src/cnnClassifier/utils/__init__.py +0 -0
- src/cnnClassifier/utils/common.py +43 -0
- template.py +44 -0
- templates/index.html +0 -0
.dvc/config
CHANGED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[core]
|
| 2 |
+
remote = myremote
|
| 3 |
+
['remote "myremote"']
|
| 4 |
+
url = ../../dvc-storage
|
.github/workflows/.gitkeep
ADDED
|
File without changes
|
app.py
ADDED
|
File without changes
|
config/config.yaml
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
artifacts_root: artifacts
|
| 2 |
+
|
| 3 |
+
data_ingestion:
|
| 4 |
+
root_dir: artifacts/data_ingestion
|
| 5 |
+
dataset_name: frabbisw/facial-age
|
| 6 |
+
local_data_file: artifacts/data_ingestion/data.zip
|
| 7 |
+
unzip_dir: artifacts/data_ingestion
|
| 8 |
+
|
| 9 |
+
data_preparation:
|
| 10 |
+
root_dir: artifacts/data_preparation
|
| 11 |
+
data_path: artifacts/data_ingestion/face_age
|
| 12 |
+
dataset_name: facial_age_prepared_dataset
|
| 13 |
+
|
| 14 |
+
model_trainer:
|
| 15 |
+
root_dir: artifacts/model_trainer
|
| 16 |
+
trained_model_path: artifacts/model_trainer/facial_age_detector_model
|
| 17 |
+
# Using EfficientFormer-L1, a much lighter model than ViT
|
| 18 |
+
model_name: "snap-research/efficientformer-l1"
|
dvc.yaml
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
stages:
|
| 2 |
+
data_ingestion:
|
| 3 |
+
cmd: python src/cnnClassifier/pipeline/stage_01_data_ingestion.py
|
| 4 |
+
deps:
|
| 5 |
+
- src/cnnClassifier/pipeline/stage_01_data_ingestion.py
|
| 6 |
+
- src/cnnClassifier/components/data_ingestion.py
|
| 7 |
+
- config/config.yaml
|
| 8 |
+
outs:
|
| 9 |
+
- artifacts/data_ingestion
|
| 10 |
+
|
| 11 |
+
model_training:
|
| 12 |
+
cmd: python src/cnnClassifier/pipeline/stage_02_model_training.py
|
| 13 |
+
deps:
|
| 14 |
+
- src/cnnClassifier/pipeline/stage_02_model_training.py
|
| 15 |
+
- src/cnnClassifier/components/model_trainer.py
|
| 16 |
+
- config/config.yaml
|
| 17 |
+
- params.yaml
|
| 18 |
+
- artifacts/data_ingestion # Depends on the output of the previous stage
|
| 19 |
+
outs:
|
| 20 |
+
- artifacts/model_trainer
|
main.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from cnnClassifier import logger
from cnnClassifier.pipeline.stage_01_data_ingestion import DataIngestionTrainingPipeline
from cnnClassifier.pipeline.stage_02_model_training import ModelTrainingPipeline


def _run_stage(stage_name, pipeline_cls):
    """Run one pipeline stage with uniform start/finish logging.

    Args:
        stage_name: human-readable stage label used in the log lines.
        pipeline_cls: pipeline class exposing a no-arg constructor and ``main()``.

    Raises:
        Exception: re-raises whatever the stage raised, after logging it,
            so a failed stage aborts the whole run (DVC relies on the exit code).
    """
    try:
        logger.info(f">>>>>> stage {stage_name} started <<<<<<")
        pipeline_cls().main()
        logger.info(f">>>>>> stage {stage_name} completed <<<<<<\n\nx==========x")
    except Exception as e:
        logger.exception(e)
        raise e


# Stages run in order; a failure in an earlier stage stops the later ones.
_run_stage("Data Ingestion stage", DataIngestionTrainingPipeline)
_run_stage("Model Training stage", ModelTrainingPipeline)
|
params.yaml
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Training Parameters
|
| 2 |
+
IMAGE_SIZE: 224 # EfficientFormer-L1 was trained on 224x224
|
| 3 |
+
LEARNING_RATE: 2e-5
|
| 4 |
+
BATCH_SIZE: 32
|
| 5 |
+
NUM_TRAIN_EPOCHS: 20 # Adjust as needed
|
| 6 |
+
WEIGHT_DECAY: 0.01
|
| 7 |
+
WARMUP_STEPS: 100
|
| 8 |
+
TEST_SPLIT_SIZE: 0.2
|
| 9 |
+
RANDOM_STATE: 42
|
readme.md
ADDED
|
File without changes
|
requirements.txt
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# For PyTorch with CUDA 11.8 - MUST be installed with the extra index URL
|
| 2 |
+
--extra-index-url https://download.pytorch.org/whl/cu118
|
| 3 |
+
torch==2.1.0+cu118
|
| 4 |
+
torchvision==0.16.0+cu118
|
| 5 |
+
torchaudio==2.1.0
|
| 6 |
+
|
| 7 |
+
# Pin NumPy to a version compatible with Torch 2.1.0
|
| 8 |
+
numpy<2.0
|
| 9 |
+
|
| 10 |
+
# Hugging Face
|
| 11 |
+
transformers
|
| 12 |
+
datasets>=2.14.5
|
| 13 |
+
evaluate
|
| 14 |
+
accelerate>=0.27
|
| 15 |
+
|
| 16 |
+
# MLOps and Utilities
|
| 17 |
+
mlflow
|
| 18 |
+
dvc[s3] # Assuming you might use S3 with DVC for AWS
|
| 19 |
+
python-box
|
| 20 |
+
PyYAML
|
| 21 |
+
ensure
|
| 22 |
+
pandas
|
| 23 |
+
scikit-learn
|
| 24 |
+
Pillow
|
| 25 |
+
tqdm
|
| 26 |
+
imbalanced-learn  # correct PyPI name; 'imblearn' is only a deprecated alias package
|
| 27 |
+
|
| 28 |
+
# Frontend and Real-time Processing
|
| 29 |
+
streamlit
|
| 30 |
+
opencv-python
|
| 31 |
+
mtcnn
|
| 32 |
+
|
| 33 |
+
# AWS Deployment
|
| 34 |
+
boto3
|
research/trials.ipynb
ADDED
|
File without changes
|
setup.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os

import setuptools

# The repository's readme file is named "readme.md" (lowercase); accept either
# casing so the build does not crash on case-sensitive filesystems.
long_description = ""
for _readme in ("README.md", "readme.md"):
    if os.path.exists(_readme):
        with open(_readme, "r", encoding="utf-8") as f:
            long_description = f.read()
        break

__version__ = "0.0.0"

REPO_NAME = "Facial-Age-Detection"
AUTHOR_USER_NAME = "AlyyanAhmed21"  # Change this
SRC_REPO = "cnnClassifier"
AUTHOR_EMAIL = "alyyanawan19@gmail.com"  # Change this


setuptools.setup(
    name=SRC_REPO,
    version=__version__,
    author=AUTHOR_USER_NAME,
    author_email=AUTHOR_EMAIL,
    description="A python package for facial age detection app",
    long_description=long_description,
    # Fixed: was misspelled 'long_description_content', which setuptools
    # silently ignores, so PyPI would not render the markdown readme.
    long_description_content_type="text/markdown",
    url=f"https://github.com/{AUTHOR_USER_NAME}/{REPO_NAME}",
    project_urls={
        "Bug Tracker": f"https://github.com/{AUTHOR_USER_NAME}/{REPO_NAME}/issues",
    },
    package_dir={"": "src"},
    packages=setuptools.find_packages(where="src")
)
|
src/cnnClassifier/__init__.py
ADDED
|
File without changes
|
src/cnnClassifier/components/__init__.py
ADDED
|
File without changes
|
src/cnnClassifier/components/data_ingestion.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import zipfile
|
| 3 |
+
from cnnClassifier import logger
|
| 4 |
+
from cnnClassifier.entity.config_entity import DataIngestionConfig
|
| 5 |
+
|
| 6 |
+
class DataIngestion:
    """Downloads the configured Kaggle dataset and extracts it into the artifacts tree."""

    def __init__(self, config: DataIngestionConfig):
        self.config = config

    def download_file(self):
        """
        Downloads the dataset from Kaggle via the `kaggle` CLI.

        Make sure to have your kaggle.json file in ~/.kaggle/ or set
        KAGGLE_USERNAME and KAGGLE_KEY env variables.

        Raises:
            RuntimeError: if the kaggle CLI exits with a non-zero status.
        """
        try:
            logger.info(f"Downloading dataset from kaggle: {self.config.dataset_name}")
            download_dir = os.path.dirname(self.config.local_data_file)
            # Check the CLI exit status: os.system does not raise on failure,
            # so a silent download error would only surface later at unzip time.
            exit_code = os.system(
                f"kaggle datasets download {self.config.dataset_name} -p {download_dir}"
            )
            if exit_code != 0:
                raise RuntimeError(f"kaggle CLI exited with status {exit_code}")
            # Kaggle names the archive after the dataset slug (e.g.
            # 'frabbisw/facial-age' -> 'facial-age.zip'); derive it instead of
            # hard-coding so other datasets work too, then move it to the path
            # the rest of the pipeline expects.
            slug = self.config.dataset_name.split('/')[-1]
            downloaded_zip_path = os.path.join(download_dir, f"{slug}.zip")
            if os.path.abspath(downloaded_zip_path) != os.path.abspath(self.config.local_data_file):
                # os.replace overwrites an existing target, so re-runs don't
                # fail where os.rename would (e.g. on Windows).
                os.replace(downloaded_zip_path, self.config.local_data_file)
            logger.info(f"Dataset downloaded and saved at {self.config.local_data_file}")
        except Exception as e:
            logger.error(f"Failed to download dataset. Error: {e}")
            raise e

    def extract_zip_file(self):
        """
        Extracts the zip file into the data directory
        """
        unzip_path = self.config.unzip_dir
        os.makedirs(unzip_path, exist_ok=True)
        with zipfile.ZipFile(self.config.local_data_file, 'r') as zip_ref:
            zip_ref.extractall(unzip_path)
        logger.info(f"Dataset extracted to {unzip_path}")
|
src/cnnClassifier/components/model_trainer.py
ADDED
|
@@ -0,0 +1,173 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
import pandas as pd
from pathlib import Path
from tqdm import tqdm
from datasets import Dataset, Image, ClassLabel
from imblearn.over_sampling import RandomOverSampler
from transformers import (
    EfficientFormerImageProcessor,
    EfficientFormerForImageClassification,
    TrainingArguments,
    Trainer,
    DefaultDataCollator
)
from torchvision.transforms import (
    Compose,
    Normalize,
    RandomRotation,
    RandomResizedCrop,
    RandomHorizontalFlip,
    Resize,
    ToTensor
)
import evaluate
from cnnClassifier import logger  # was missing: ModelTrainer calls logger.info/logger.error
from cnnClassifier.entity.config_entity import ModelTrainerConfig
|
| 25 |
+
|
| 26 |
+
class ModelTrainer:
    """Fine-tunes an EfficientFormer image classifier on the facial-age dataset.

    The raw dataset stores one directory per exact age (e.g. '025'). Ages are
    bucketed into coarser range labels (e.g. '21-25') before training, and class
    imbalance is corrected with random oversampling of image paths.
    """

    def __init__(self, config: ModelTrainerConfig):
        self.config = config
        # Populated by _prepare_data(); map between class names and integer ids.
        self.label2id = None
        self.id2label = None

    def _prepare_data(self):
        """Build an oversampled, label-encoded, stratified train/test split.

        Returns:
            DatasetDict with 'train' and 'test' splits; each row has an
            'image' column (lazy-decoded) and an integer 'label' column.
        """
        logger.info("Preparing data...")
        # Map exact-age folder names to coarser age-range class labels.
        label_dict = {'001': '01', '002': '02', '003': '03', '004': '04', '005': '05',
                      '006': '06-07', '007': '06-07', '008': '08-09', '009': '08-09',
                      '010': '10-12', '011': '10-12', '012': '10-12', '013': '13-15',
                      '014': '13-15', '015': '13-15', '016': '16-20', '017': '16-20',
                      '018': '16-20', '019': '16-20', '020': '16-20', '021': '21-25',
                      '022': '21-25', '023': '21-25', '024': '21-25', '025': '21-25',
                      '026': '26-30', '027': '26-30', '028': '26-30', '029': '26-30',
                      '030': '26-30', '031': '31-35', '032': '31-35', '033': '31-35',
                      '034': '31-35', '035': '31-35', '036': '36-40', '037': '36-40',
                      '038': '36-40', '039': '36-40', '040': '36-40', '041': '41-45',
                      '042': '41-45', '043': '41-45', '044': '41-45', '045': '41-45',
                      '046': '46-50', '047': '46-50', '048': '46-50', '049': '46-50',
                      '050': '46-50', '051': '51-55', '052': '51-55', '053': '51-55',
                      '054': '51-55', '055': '51-55', '056': '56-60', '057': '56-60',
                      '058': '56-60', '059': '56-60', '060': '56-60', '061': '61-65',
                      '062': '61-65', '063': '61-65', '064': '61-65', '065': '61-65',
                      '066': '66-70', '067': '66-70', '068': '66-70', '069': '66-70',
                      '070': '66-70', '071': '71-80', '072': '71-80', '073': '71-80',
                      '074': '71-80', '075': '71-80', '076': '71-80', '077': '71-80',
                      '078': '71-80', '079': '71-80', '080': '71-80', '081': '81-90',
                      '082': '81-90', '083': '81-90', '084': '81-90', '085': '81-90',
                      '086': '81-90', '087': '81-90', '088': '81-90', '089': '81-90',
                      '090': '81-90', '091': '90+', '092': '90+', '093': '90+',
                      '095': '90+', '096': '90+', '099': '90+', '100': '90+',
                      '101': '90+', '110': '90+'}

        file_names, labels = [], []
        data_path = Path(self.config.data_path)
        for file in tqdm(sorted(data_path.glob('*/*.*'))):
            # Parent directory name is the exact-age folder. Using
            # file.parent.name (not str(file).split('/')) keeps this
            # correct on Windows, where the path separator is '\\'.
            labels.append(label_dict[file.parent.name])
            file_names.append(str(file))

        df = pd.DataFrame.from_dict({"image": file_names, "label": labels})

        # Random oversampling: duplicates image *paths* of minority classes
        # so every age bucket has equal representation.
        ros = RandomOverSampler(random_state=self.config.random_state)
        df_resampled, y_resampled = ros.fit_resample(df[['image']], df['label'])
        df = pd.concat([df_resampled, y_resampled], axis=1)

        # Cast the path column to Image() so decoding happens lazily per batch.
        dataset = Dataset.from_pandas(df).cast_column("image", Image())

        labels_list = sorted(list(set(labels)))
        self.label2id = {label: i for i, label in enumerate(labels_list)}
        self.id2label = {i: label for i, label in enumerate(labels_list)}

        ClassLabels = ClassLabel(num_classes=len(labels_list), names=labels_list)
        dataset = dataset.map(lambda x: {'label': ClassLabels.str2int(x['label'])}, batched=True)
        dataset = dataset.cast_column('label', ClassLabels)

        # Stratify so train and test keep the same class balance.
        return dataset.train_test_split(test_size=self.config.test_split_size, shuffle=True, stratify_by_column="label")

    def train(self):
        """Run the full fine-tuning loop and save the best checkpoint.

        Side effects: writes checkpoints/logs under config.root_dir and the
        final model under config.trained_model_path.
        """
        device = "cuda" if torch.cuda.is_available() else "cpu"
        logger.info(f"Using device: {device}")

        split_dataset = self._prepare_data()
        train_data = split_dataset['train']
        test_data = split_dataset['test']

        processor = EfficientFormerImageProcessor.from_pretrained(self.config.model_name)

        # Normalize with the pretrained model's statistics so inputs match
        # what the backbone saw during pretraining.
        image_mean, image_std = processor.image_mean, processor.image_std
        size = self.config.image_size

        normalize = Normalize(mean=image_mean, std=image_std)
        _train_transforms = Compose([
            Resize((size, size)),
            RandomRotation(15),
            RandomHorizontalFlip(0.5),
            ToTensor(),
            normalize
        ])
        # Validation uses no augmentation — deterministic resize only.
        _val_transforms = Compose([
            Resize((size, size)),
            ToTensor(),
            normalize
        ])

        def train_transforms(examples):
            examples['pixel_values'] = [_train_transforms(image.convert("RGB")) for image in examples['image']]
            return examples

        def val_transforms(examples):
            examples['pixel_values'] = [_val_transforms(image.convert("RGB")) for image in examples['image']]
            return examples

        # set_transform applies lazily at access time, so augmentation is
        # re-sampled every epoch.
        train_data.set_transform(train_transforms)
        test_data.set_transform(val_transforms)

        def collate_fn(examples):
            pixel_values = torch.stack([example["pixel_values"] for example in examples])
            labels = torch.tensor([example['label'] for example in examples])
            return {"pixel_values": pixel_values, "labels": labels}

        model = EfficientFormerForImageClassification.from_pretrained(
            self.config.model_name,
            num_labels=len(self.id2label),
            id2label=self.id2label,
            label2id=self.label2id,
            ignore_mismatched_sizes=True  # replaces the pretrained head for our label set
        ).to(device)

        accuracy = evaluate.load("accuracy")

        def compute_metrics(eval_pred):
            predictions, label_ids = eval_pred
            predicted_labels = predictions.argmax(axis=1)
            return accuracy.compute(predictions=predicted_labels, references=label_ids)

        # NOTE(review): 'evaluation_strategy' was renamed to 'eval_strategy'
        # in newer transformers releases — confirm against the installed version.
        args = TrainingArguments(
            output_dir=self.config.root_dir,
            logging_dir=f'{self.config.root_dir}/logs',
            evaluation_strategy="epoch",
            learning_rate=self.config.learning_rate,
            per_device_train_batch_size=self.config.batch_size,
            per_device_eval_batch_size=self.config.batch_size,
            num_train_epochs=self.config.num_train_epochs,
            weight_decay=self.config.weight_decay,
            warmup_steps=self.config.warmup_steps,
            save_strategy='epoch',
            load_best_model_at_end=True,
            metric_for_best_model="accuracy",
            save_total_limit=1,  # keep only the best checkpoint on disk
            report_to="none"
        )

        trainer = Trainer(
            model=model,
            args=args,
            train_dataset=train_data,
            eval_dataset=test_data,
            data_collator=collate_fn,
            compute_metrics=compute_metrics,
            tokenizer=processor,  # saves the processor alongside the model
        )

        trainer.train()

        logger.info(f"Saving best model to {self.config.trained_model_path}")
        trainer.save_model(self.config.trained_model_path)
|
src/cnnClassifier/config/__init__.py
ADDED
|
File without changes
|
src/cnnClassifier/config/configuration.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from cnnClassifier.constants import *
|
| 2 |
+
from cnnClassifier.utils.common import read_yaml, create_directories
|
| 3 |
+
from cnnClassifier.entity.config_entity import DataIngestionConfig, DataPreparationConfig, ModelTrainerConfig
|
| 4 |
+
|
| 5 |
+
class ConfigurationManager:
    """Loads the project YAML files and builds per-stage configuration objects.

    The YAML files are read once, up front; each getter then creates the
    stage's working directory and returns an immutable config dataclass.
    """

    def __init__(
            self,
            config_filepath = CONFIG_FILE_PATH,
            params_filepath = PARAMS_FILE_PATH):
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        create_directories([self.config.artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Build the config for the download/extract stage."""
        cfg = self.config.data_ingestion
        create_directories([cfg.root_dir])
        return DataIngestionConfig(
            root_dir=cfg.root_dir,
            dataset_name=cfg.dataset_name,
            local_data_file=cfg.local_data_file,
            unzip_dir=cfg.unzip_dir,
        )

    def get_data_preparation_config(self) -> DataPreparationConfig:
        """Build the config for the dataset-preparation stage."""
        cfg = self.config.data_preparation
        create_directories([cfg.root_dir])
        return DataPreparationConfig(
            root_dir=cfg.root_dir,
            data_path=cfg.data_path,
            dataset_name=cfg.dataset_name,
        )

    def get_model_trainer_config(self) -> ModelTrainerConfig:
        """Build the training-stage config, merging paths with hyperparameters."""
        cfg = self.config.model_trainer
        hp = self.params
        create_directories([cfg.root_dir])
        return ModelTrainerConfig(
            root_dir=Path(cfg.root_dir),
            trained_model_path=Path(cfg.trained_model_path),
            model_name=cfg.model_name,
            image_size=hp.IMAGE_SIZE,
            learning_rate=hp.LEARNING_RATE,
            batch_size=hp.BATCH_SIZE,
            num_train_epochs=hp.NUM_TRAIN_EPOCHS,
            weight_decay=hp.WEIGHT_DECAY,
            warmup_steps=hp.WARMUP_STEPS,
            test_split_size=hp.TEST_SPLIT_SIZE,
            random_state=hp.RANDOM_STATE,
        )
|
src/cnnClassifier/constants/__init__.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pathlib import Path

# Project-wide file locations, resolved relative to the repository root
# (pipeline scripts are run from the repo root, e.g. via dvc.yaml).
CONFIG_FILE_PATH = Path("config/config.yaml")  # structural config: paths, artifact dirs
PARAMS_FILE_PATH = Path("params.yaml")  # tunable training hyperparameters
|
src/cnnClassifier/entity/__init__.py
ADDED
|
File without changes
|
src/cnnClassifier/entity/config_entity.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from dataclasses import dataclass
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
|
| 4 |
+
@dataclass(frozen=True)
class DataIngestionConfig:
    """Immutable settings for the Kaggle download/extract stage (from config.yaml)."""
    root_dir: Path  # working directory for this stage's artifacts
    dataset_name: str  # Kaggle dataset slug, e.g. 'frabbisw/facial-age'
    local_data_file: Path  # where the downloaded zip is stored
    unzip_dir: Path  # directory the archive is extracted into
|
| 10 |
+
|
| 11 |
+
@dataclass(frozen=True)
class DataPreparationConfig:
    """Immutable settings for the dataset-preparation stage (from config.yaml)."""
    root_dir: Path  # working directory for this stage's artifacts
    data_path: Path  # path to the extracted raw image folders
    dataset_name: str  # name for the prepared dataset artifact
|
| 16 |
+
|
| 17 |
+
@dataclass(frozen=True)
class ModelTrainerConfig:
    """Immutable settings for the training stage (paths from config.yaml,
    hyperparameters from params.yaml)."""
    root_dir: Path  # output dir for checkpoints and logs
    trained_model_path: Path  # where the final best model is saved
    model_name: str  # Hugging Face model id to fine-tune
    image_size: int  # square input resolution fed to the model
    learning_rate: float
    batch_size: int  # per-device train and eval batch size
    num_train_epochs: int
    weight_decay: float
    warmup_steps: int
    test_split_size: float  # fraction of data held out for evaluation
    random_state: int  # seed for oversampling / splitting reproducibility
|
src/cnnClassifier/pipeline/__init__.py
ADDED
|
File without changes
|
src/cnnClassifier/pipeline/stage_01_data_ingestion.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from cnnClassifier.config.configuration import ConfigurationManager
|
| 2 |
+
from cnnClassifier.components.data_ingestion import DataIngestion
|
| 3 |
+
from cnnClassifier import logger
|
| 4 |
+
|
| 5 |
+
STAGE_NAME = "Data Ingestion stage"
|
| 6 |
+
|
| 7 |
+
class DataIngestionTrainingPipeline:
    """Orchestrates the data-ingestion stage: download the archive, then unpack it."""

    def __init__(self):
        pass

    def main(self):
        """Wire config into the DataIngestion component and run its two steps."""
        manager = ConfigurationManager()
        ingestion_cfg = manager.get_data_ingestion_config()
        ingestion = DataIngestion(config=ingestion_cfg)
        ingestion.download_file()
        ingestion.extract_zip_file()
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
if __name__ == '__main__':
    # Allow this stage to run standalone (e.g. invoked directly by dvc repro).
    try:
        logger.info(f">>>>>> stage {STAGE_NAME} started <<<<<<")
        DataIngestionTrainingPipeline().main()
        logger.info(f">>>>>> stage {STAGE_NAME} completed <<<<<<\n\nx==========x")
    except Exception as exc:
        logger.exception(exc)
        raise exc
|
src/cnnClassifier/pipeline/stage_02_model_training.py
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from cnnClassifier.config.configuration import ConfigurationManager
|
| 2 |
+
from cnnClassifier.components.model_trainer import ModelTrainer
|
| 3 |
+
from cnnClassifier import logger
|
| 4 |
+
|
| 5 |
+
STAGE_NAME = "Model Training stage"
|
| 6 |
+
|
| 7 |
+
class ModelTrainingPipeline:
    """Orchestrates the model-training stage end to end."""

    def __init__(self):
        pass

    def main(self):
        """Build the trainer config and run fine-tuning."""
        manager = ConfigurationManager()
        trainer_cfg = manager.get_model_trainer_config()
        trainer = ModelTrainer(config=trainer_cfg)
        trainer.train()
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
if __name__ == '__main__':
    # Allow this stage to run standalone (e.g. invoked directly by dvc repro).
    try:
        logger.info(f">>>>>> stage {STAGE_NAME} started <<<<<<")
        ModelTrainingPipeline().main()
        logger.info(f">>>>>> stage {STAGE_NAME} completed <<<<<<\n\nx==========x")
    except Exception as exc:
        logger.exception(exc)
        raise exc
|
src/cnnClassifier/utils/__init__.py
ADDED
|
File without changes
|
src/cnnClassifier/utils/common.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from box.exceptions import BoxValueError
|
| 3 |
+
import yaml
|
| 4 |
+
from cnnClassifier import logger
|
| 5 |
+
import json
|
| 6 |
+
import joblib
|
| 7 |
+
from ensure import ensure_annotations
|
| 8 |
+
from box import ConfigBox
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
from typing import Any
|
| 11 |
+
|
| 12 |
+
@ensure_annotations
def read_yaml(path_to_yaml: Path) -> ConfigBox:
    """Parse a YAML file into a dot-accessible ConfigBox.

    Args:
        path_to_yaml (Path): location of the YAML file.

    Raises:
        ValueError: if the yaml file is empty.

    Returns:
        ConfigBox: dot-accessible view of the parsed YAML mapping.
    """
    try:
        with open(path_to_yaml) as yaml_file:
            parsed = yaml.safe_load(yaml_file)
        logger.info(f"yaml file: {path_to_yaml} loaded successfully")
        # An empty file parses to None; ConfigBox(None) raises BoxValueError,
        # which we surface as a clearer ValueError below.
        return ConfigBox(parsed)
    except BoxValueError:
        raise ValueError("yaml file is empty")
    except Exception as e:
        raise e
|
| 32 |
+
|
| 33 |
+
@ensure_annotations
def create_directories(path_to_directories: list, verbose=True):
    """Create every directory in the list (no error if one already exists).

    Args:
        path_to_directories (list): directory paths to create.
        verbose (bool, optional): log each created directory. Defaults to True.
    """
    for path in path_to_directories:
        os.makedirs(path, exist_ok=True)
        if not verbose:
            continue
        logger.info(f"created directory at: {path}")
|
template.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
from pathlib import Path
import logging

logging.basicConfig(level=logging.INFO, format='[%(asctime)s]: %(message)s:')

project_name = "cnnClassifier"

# Project skeleton: every path listed here is created (empty) if it does not
# already exist; parent directories are created as needed.
list_of_files = [
    ".github/workflows/.gitkeep",
    f"src/{project_name}/__init__.py",
    f"src/{project_name}/components/__init__.py",
    f"src/{project_name}/utils/__init__.py",
    f"src/{project_name}/utils/common.py",
    f"src/{project_name}/config/__init__.py",
    f"src/{project_name}/config/configuration.py",
    f"src/{project_name}/pipeline/__init__.py",
    f"src/{project_name}/entity/__init__.py",
    f"src/{project_name}/entity/config_entity.py",
    f"src/{project_name}/constants/__init__.py",
    "config/config.yaml",
    "dvc.yaml",
    "params.yaml",
    "requirements.txt",
    "setup.py",
    "research/trials.ipynb",
    "templates/index.html",
    "app.py"  # For Streamlit
]

for filepath in list_of_files:
    filepath = Path(filepath)
    filedir, filename = os.path.split(filepath)

    if filedir != "":
        os.makedirs(filedir, exist_ok=True)
        # Fixed: log messages previously had broken placeholders.
        logging.info(f"Creating directory: {filedir} for the file: {filename}")

    # Only (re)create a file when it is missing or empty, so files that
    # already have content are never clobbered.
    if (not os.path.exists(filepath)) or (os.path.getsize(filepath) == 0):
        with open(filepath, "w") as f:
            pass
        logging.info(f"Creating empty file: {filepath}")
    else:
        logging.info(f"{filename} already exists")
|
templates/index.html
ADDED
|
File without changes
|