Spaces: Sleeping
committing backend files
Browse files
- .github/workflows/.gitkeep +0 -0
- .gitignore +95 -197
- config/config.yaml +39 -0
- dvc.lock +109 -0
- dvc.yaml +44 -0
- gpu +57 -0
- main.py +23 -0
- params.yaml +7 -0
- requirements.txt +21 -0
- research/trials.ipynb +0 -0
- setup.py +27 -0
- src/vitClassifier/__init__.py +26 -0
- src/vitClassifier/components/__init__.py +0 -0
- src/vitClassifier/components/data_ingestion.py +67 -0
- src/vitClassifier/components/data_transformation.py +80 -0
- src/vitClassifier/components/model_evaluation.py +89 -0
- src/vitClassifier/components/model_training.py +74 -0
- src/vitClassifier/config/__init__.py +0 -0
- src/vitClassifier/config/configuration.py +76 -0
- src/vitClassifier/constants/__init__.py +4 -0
- src/vitClassifier/entity/__init__.py +0 -0
- src/vitClassifier/entity/config_entity.py +43 -0
- src/vitClassifier/pipeline/__init__.py +0 -0
- src/vitClassifier/pipeline/stage_01_data_ingestion.py +27 -0
- src/vitClassifier/pipeline/stage_02_data_transformation.py +32 -0
- src/vitClassifier/pipeline/stage_03_model_training.py +24 -0
- src/vitClassifier/pipeline/stage_04_model_evaluation.py +26 -0
- src/vitClassifier/utils/__init__.py +0 -0
- src/vitClassifier/utils/common.py +28 -0
- template.py +40 -0
.github/workflows/.gitkeep
ADDED
File without changes
.gitignore
CHANGED
@@ -1,207 +1,105 @@
Removed lines (context lines shown without a minus):
 __pycache__/
 *$py.class
-build/
-develop-eggs/
-dist/
-downloads/
-eggs/
-lib/
-lib64/
-parts/
-sdist/
-var/
-wheels/
-share/python-wheels/
 *.egg-info/
 .installed.cfg
 *.egg
-MANIFEST
-# PyInstaller
-# Usually these files are written by a python script from a template
-# before PyInstaller builds the exe, so as to inject date/other infos into it.
 *.manifest
 *.spec
-# Installer logs
-pip-log.txt
-pip-delete-this-directory.txt
-# Unit test / coverage reports
-htmlcov/
-.tox/
-.nox/
-.coverage
-.coverage.*
-.cache
-nosetests.xml
-coverage.xml
-*.cover
-*.py.cover
-.hypothesis/
-.pytest_cache/
-cover/
-# Translations
-*.mo
-*.pot
-# Django stuff:
-*.log
-local_settings.py
-db.sqlite3
-db.sqlite3-journal
-# Flask stuff:
-instance/
-.webassets-cache
-# Scrapy stuff:
-.scrapy
-# Sphinx documentation
-docs/_build/
-# PyBuilder
-.pybuilder/
-target/
-# Jupyter Notebook
-.ipynb_checkpoints
-# IPython
-profile_default/
-ipython_config.py
-# pyenv
-# For a library or package, you might want to ignore these files since the code is
-# intended to run in multiple environments; otherwise, check them in:
-# .python-version
-# pipenv
-# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
-# However, in case of collaboration, if having platform-specific dependencies or dependencies
-# having no cross-platform support, pipenv may install dependencies that don't work, or not
-# install all needed dependencies.
-#Pipfile.lock
-# UV
-# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
-# This is especially recommended for binary packages to ensure reproducibility, and is more
-# commonly ignored for libraries.
-#uv.lock
-# poetry
-# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
-# This is especially recommended for binary packages to ensure reproducibility, and is more
-# commonly ignored for libraries.
-# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
-#poetry.lock
-#poetry.toml
-# pdm
-# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
-# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
-# https://pdm-project.org/en/latest/usage/project/#working-with-version-control
-#pdm.lock
-#pdm.toml
-.pdm-python
-.pdm-build/
-# pixi
-# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
-#pixi.lock
-# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
-# in the .venv directory. It is recommended not to include this directory in version control.
-.pixi
-# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
-__pypackages__/
-# Celery stuff
-celerybeat-schedule
-celerybeat.pid
-# SageMath parsed files
-*.sage.py
-# Environments
-.env
-.envrc
-.venv
-env/
-venv/
-ENV/
-env.bak/
-venv.bak/
-# PyPI configuration file
-.pypirc
-# Cursor
-# Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
-# exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
-# refer to https://docs.cursor.com/context/ignore-files
-.cursorignore
-.cursorindexingignore
-# Marimo
-marimo/_static/
-marimo/_lsp/
-__marimo__/
Added lines (context lines shown without a plus):
+# =============================================================================
+# Python Virtual Environments
+# Never commit the virtual environment folder.
+# =============================================================================
+/venv/
+.venv/
+env/
+.env
+ENV/
+env.bak/
+venv.bak/
+
+# =============================================================================
+# DVC and MLflow Artifacts
+# DVC tracks data, so we don't need Git to. We only commit the .dvc files.
+# The `artifacts` and `logs` directories will be generated by the pipeline.
+# =============================================================================
+/artifacts/
+/logs/
+/mlruns/
+# DVC's internal cache should NEVER be committed.
+/.dvc/cache/
+# DVC's temporary directories
+/.dvc/tmp/
+
+# =============================================================================
+# Python Cache and Compiled Files
+# These are generated automatically by Python and don't need to be versioned.
+# =============================================================================
 __pycache__/
+*.py[cod]
 *$py.class

+# =============================================================================
+# Build and Distribution Artifacts
+# Generated when building a Python package.
+# =============================================================================
+/build/
+/develop-eggs/
+/dist/
+/downloads/
+/eggs/
+/.eggs/
+/lib/
+/lib64/
+/parts/
+/sdist/
+/var/
+/wheels/
 *.egg-info/
 .installed.cfg
 *.egg
 *.manifest
 *.spec

+# =============================================================================
+# IDE and Editor-Specific Files
+# Ignore configuration files from common editors like VSCode, PyCharm, etc.
+# =============================================================================
+.vscode/
+.idea/
+.project
+.pydevproject
+.classpath
+*.swp
+*.swo
+
+# =============================================================================
+# OS-specific Files
+# Ignore files generated by macOS, Windows, and Linux.
+# =============================================================================
+.DS_Store
+Thumbs.db
+Desktop.ini
+
+# =============================================================================
+# Jupyter Notebook Checkpoints
+# Ignore the checkpoints directory created by Jupyter.
+# =============================================================================
+.ipynb_checkpoints/
+
+# =============================================================================
+# Kaggle API Credentials
+# IMPORTANT: Never commit your API keys or secrets.
+# =============================================================================
+kaggle.json
+
+# =============================================================================
+# Other
+# Any other miscellaneous files that shouldn't be in the repo.
+# =============================================================================
+*.log
+*.tmp
+*.bak
+*.local
+eval_output/  # Temporary directory created by the evaluation component
+.env
config/config.yaml
ADDED
@@ -0,0 +1,39 @@
# config/config.yaml

artifacts_root: artifacts

data_ingestion:
  root_dir: artifacts/data_ingestion
  source_kaggle_dataset_id: "paultimothymooney/chest-xray-pneumonia"
  unzip_dir: artifacts/data_ingestion/
  # We will create these three files now
  train_df_path: artifacts/data_ingestion/train_df.csv
  test_df_path: artifacts/data_ingestion/test_df.csv
  val_df_path: artifacts/data_ingestion/val_df.csv

data_transformation:
  root_dir: artifacts/data_transformation
  # We now have three sources
  train_data_path: artifacts/data_ingestion/train_df.csv
  test_data_path: artifacts/data_ingestion/test_df.csv
  val_data_path: artifacts/data_ingestion/val_df.csv
  # And will create three outputs
  train_dataset_path: artifacts/data_transformation/train_dataset
  test_dataset_path: artifacts/data_transformation/test_dataset
  val_dataset_path: artifacts/data_transformation/val_dataset

model_training:
  root_dir: artifacts/model_training
  trained_model_path: artifacts/model_training/model
  model_name: "google/vit-base-patch16-224-in21k"
  # We'll use the validation set for evaluation during training
  train_dataset_path: artifacts/data_transformation/train_dataset
  val_dataset_path: artifacts/data_transformation/val_dataset

model_evaluation:
  root_dir: artifacts/model_evaluation
  model_path: artifacts/model_training/model
  # Final evaluation is done on the unseen test set
  test_dataset_path: artifacts/data_transformation/test_dataset
  metrics_file_name: artifacts/model_evaluation/metrics.json
  mlflow_uri: "https://dagshub.com/AlyyanAhmed21/Chest-X-ray-Pneumonia-Detection-with-ViT.mlflow"
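The nested keys above are consumed through the project's read_yaml helper, which wraps the parsed YAML in a python-box ConfigBox so sections can be read with attribute syntax. A minimal sketch of that access pattern, assuming it is run from the repository root (the script name is illustrative and not part of the commit):

# check_config.py (hypothetical helper)
from vitClassifier.utils.common import read_yaml
from vitClassifier.constants import CONFIG_FILE_PATH

config = read_yaml(CONFIG_FILE_PATH)           # returns a ConfigBox
# Dotted access works because ConfigBox exposes dict keys as attributes
print(config.artifacts_root)                   # artifacts
print(config.data_ingestion.root_dir)          # artifacts/data_ingestion
print(config.model_training.model_name)        # google/vit-base-patch16-224-in21k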
dvc.lock
ADDED
@@ -0,0 +1,109 @@
schema: '2.0'
stages:
  data_ingestion:
    cmd: python src/vitClassifier/pipeline/stage_01_data_ingestion.py
    deps:
    - path: config/config.yaml
      hash: md5
      md5: 9a45c00c11b9a8a0b4e396952a6b19a8
      size: 1591
    - path: src/vitClassifier/pipeline/stage_01_data_ingestion.py
      hash: md5
      md5: 2486829b866dffe25a752248afa95f4f
      size: 968
    outs:
    - path: artifacts/data_ingestion
      hash: md5
      md5: 9921508de1b9f2e8a5a4150d063e178d.dir
      size: 2484429974
      nfiles: 17594
  data_transformation:
    cmd: python src/vitClassifier/pipeline/stage_02_data_transformation.py
    deps:
    - path: artifacts/data_ingestion/test_df.csv
      hash: md5
      md5: 95cbf91a4d0719e528c74879d6da0e34
      size: 53272
    - path: artifacts/data_ingestion/train_df.csv
      hash: md5
      md5: af0d24afd4d9092b64bb1db986d38f76
      size: 460017
    - path: artifacts/data_ingestion/val_df.csv
      hash: md5
      md5: 575134cde7f8113c2a51dd4fac3e4c5e
      size: 1389
    - path: config/config.yaml
      hash: md5
      md5: 9a45c00c11b9a8a0b4e396952a6b19a8
      size: 1591
    - path: params.yaml
      hash: md5
      md5: cc525f2481819601bb93ec5d7f008dda
      size: 127
    - path: src/vitClassifier/pipeline/stage_02_data_transformation.py
      hash: md5
      md5: 095fcfa8843d6b94a05d9f2172522b32
      size: 1237
    outs:
    - path: artifacts/data_transformation
      hash: md5
      md5: b91edf7d0a4b4b0022f2d33d3f2176fa.dir
      size: 5074488112
      nfiles: 18
  model_training:
    cmd: python src/vitClassifier/pipeline/stage_03_model_training.py
    deps:
    - path: artifacts/data_transformation/train_dataset
      hash: md5
      md5: 64425f8e57c16ac250c2ea73b78b7aa2.dir
      size: 4687397188
      nfiles: 12
    - path: artifacts/data_transformation/val_dataset
      hash: md5
      md5: e19e673f104f2efec21df351b3d4869c.dir
      size: 9678966
      nfiles: 3
    - path: config/config.yaml
      hash: md5
      md5: 9a45c00c11b9a8a0b4e396952a6b19a8
      size: 1591
    - path: params.yaml
      hash: md5
      md5: cc525f2481819601bb93ec5d7f008dda
      size: 127
    - path: src/vitClassifier/pipeline/stage_03_model_training.py
      hash: md5
      md5: 5e9cde1828fc4b2608e9f4e92b134a07
      size: 815
    outs:
    - path: artifacts/model_training/model
      hash: md5
      md5: 9f0765ff59616eddac47fcaf7a5e7387.dir
      size: 343230531
      nfiles: 4
  model_evaluation:
    cmd: python src/vitClassifier/pipeline/stage_04_model_evaluation.py
    deps:
    - path: artifacts/data_transformation/test_dataset
      hash: md5
      md5: 41a8f95d5075f06bef31fbf55d838cca.dir
      size: 377411958
      nfiles: 3
    - path: artifacts/model_training/model
      hash: md5
      md5: 9f0765ff59616eddac47fcaf7a5e7387.dir
      size: 343230531
      nfiles: 4
    - path: config/config.yaml
      hash: md5
      md5: 9224d2383ec670f1738b47139f250ad4
      size: 1659
    - path: src/vitClassifier/pipeline/stage_04_model_evaluation.py
      hash: md5
      md5: e31c602e23dbfa62f6453ca44b621d0a
      size: 863
    outs:
    - path: artifacts/model_evaluation/metrics.json
      hash: md5
      md5: 26b4e3326f589929e4a6e34833cc187f
      size: 150
dvc.yaml
ADDED
@@ -0,0 +1,44 @@
stages:
  data_ingestion:
    cmd: python src/vitClassifier/pipeline/stage_01_data_ingestion.py
    deps:
      - src/vitClassifier/pipeline/stage_01_data_ingestion.py
      - config/config.yaml
    outs:
      - artifacts/data_ingestion

  data_transformation:
    cmd: python src/vitClassifier/pipeline/stage_02_data_transformation.py
    deps:
      - src/vitClassifier/pipeline/stage_02_data_transformation.py
      # --- THIS IS THE FIX ---
      # Remove the old dependency and add the three new ones.
      - artifacts/data_ingestion/train_df.csv
      - artifacts/data_ingestion/test_df.csv
      - artifacts/data_ingestion/val_df.csv
      - config/config.yaml
      - params.yaml
    outs:
      - artifacts/data_transformation

  model_training:
    cmd: python src/vitClassifier/pipeline/stage_03_model_training.py
    deps:
      - src/vitClassifier/pipeline/stage_03_model_training.py
      - artifacts/data_transformation/train_dataset
      - artifacts/data_transformation/val_dataset  # Added dependency on val dataset
      - config/config.yaml
      - params.yaml
    outs:
      - artifacts/model_training/model

  model_evaluation:
    cmd: python src/vitClassifier/pipeline/stage_04_model_evaluation.py
    deps:
      - src/vitClassifier/pipeline/stage_04_model_evaluation.py
      - artifacts/data_transformation/test_dataset
      - artifacts/model_training/model
      - config/config.yaml
    metrics:
      - artifacts/model_evaluation/metrics.json:
          cache: false
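With the stages declared above, DVC rebuilds only what changed and refreshes dvc.lock. A minimal sketch of driving it from Python (the wrapper script is purely illustrative; `dvc repro` can of course be run directly from the shell):

# run_dvc.py (hypothetical wrapper, not part of the commit)
import subprocess

# Re-runs only the stages whose dependencies changed, then updates dvc.lock
subprocess.run(["dvc", "repro"], check=True)

# Display the tracked metrics file produced by the model_evaluation stage
subprocess.run(["dvc", "metrics", "show"], check=True)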
gpu
ADDED
@@ -0,0 +1,57 @@
# check_gpu.py

import sys
import torch

def check_gpu_environment():
    """
    This script checks the system's Python and PyTorch GPU environment.
    It prints detailed information about the setup.
    """
    print("--- System and Python Information ---")
    print(f"Python Version: {sys.version}")
    print("\n--- PyTorch and CUDA Information ---")

    try:
        print(f"PyTorch Version: {torch.__version__}")

        # Check if CUDA (GPU support) is available
        cuda_available = torch.cuda.is_available()
        print(f"CUDA Available: {cuda_available}")

        if not cuda_available:
            print("\nWARNING: PyTorch was not built with CUDA support. GPU will not be used.")
            return

        # Get the number of available GPUs
        gpu_count = torch.cuda.device_count()
        print(f"Number of GPUs Available: {gpu_count}")

        # Get details for each GPU
        for i in range(gpu_count):
            print(f"\n--- GPU Details (Device {i}) ---")
            gpu_name = torch.cuda.get_device_name(i)
            print(f"  GPU Name: {gpu_name}")

            cuda_capability = torch.cuda.get_device_capability(i)
            print(f"  Compute Capability: {cuda_capability[0]}.{cuda_capability[1]}")

            total_mem = torch.cuda.get_device_properties(i).total_memory / (1024**3)  # Convert bytes to GB
            print(f"  Total Memory: {total_mem:.2f} GB")

        # Check for cuDNN
        cudnn_available = torch.backends.cudnn.is_available()
        print("\n--- cuDNN Information ---")
        print(f"cuDNN Available: {cudnn_available}")
        if cudnn_available:
            cudnn_version = torch.backends.cudnn.version()
            print(f"cuDNN Version: {cudnn_version}")
        else:
            print("\nWARNING: cuDNN is not available. Training will be significantly slower.")

    except Exception as e:
        print(f"\nAn error occurred: {e}")
        print("Please ensure PyTorch is installed correctly.")

if __name__ == "__main__":
    check_gpu_environment()
main.py
ADDED
@@ -0,0 +1,23 @@
from vitClassifier import logger
from vitClassifier.pipeline.stage_01_data_ingestion import DataIngestionTrainingPipeline
from vitClassifier.pipeline.stage_02_data_transformation import DataTransformationTrainingPipeline
from vitClassifier.pipeline.stage_03_model_training import ModelTrainingPipeline
from vitClassifier.pipeline.stage_04_model_evaluation import ModelEvaluationPipeline
from dotenv import load_dotenv
load_dotenv()

def run_pipeline(stage_name, pipeline_class):
    try:
        logger.info(f">>>>>> stage {stage_name} started <<<<<<")
        pipeline = pipeline_class()
        pipeline.main()
        logger.info(f">>>>>> stage {stage_name} completed <<<<<<\n\nx==========x")
    except Exception as e:
        logger.exception(e)
        raise e

if __name__ == '__main__':
    run_pipeline("Data Ingestion stage", DataIngestionTrainingPipeline)
    run_pipeline("Data Transformation stage", DataTransformationTrainingPipeline)
    run_pipeline("Model Training stage", ModelTrainingPipeline)
    run_pipeline("Model Evaluation stage", ModelEvaluationPipeline)
params.yaml
ADDED
@@ -0,0 +1,7 @@
LEARNING_RATE: 2.0e-5
BATCH_SIZE: 32
EPOCHS: 3
WEIGHT_DECAY: 0.01
WARMUP_STEPS: 100
RANDOM_STATE: 42
TEST_SPLIT_SIZE: 0.2
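One detail worth keeping when editing these values: PyYAML (used by read_yaml) only resolves scientific notation to a float when it is written with a decimal point and a signed exponent, so LEARNING_RATE stays 2.0e-5 rather than 2e-5. A quick, purely illustrative check:

# yaml_float_check.py (hypothetical check, not part of the commit)
import yaml

print(type(yaml.safe_load("lr: 2.0e-5")["lr"]))  # <class 'float'> - parsed as a number
print(type(yaml.safe_load("lr: 2e-5")["lr"]))    # <class 'str'>   - would reach TrainingArguments as a string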
requirements.txt
ADDED
@@ -0,0 +1,21 @@
pandas
numpy
torch
torchvision
transformers
datasets>=2.14.5
evaluate
accelerate>=0.27
mlflow
scikit-learn
imblearn
python-box
PyYAML
ensure
tqdm
pathlib
dvc
matplotlib
Pillow
kaggle
python-dotenv
research/trials.ipynb
ADDED
File without changes
setup.py
ADDED
@@ -0,0 +1,27 @@
import setuptools

with open("README.md", "r", encoding="utf-8") as f:
    long_description = f.read()

__version__ = "0.0.1"

REPO_NAME = "Chest-Xray-Pneumonia-ViT-MLflow-DVC"
AUTHOR_USER_NAME = "your-github-username"  # CHANGE THIS
SRC_REPO = "vitClassifier"
AUTHOR_EMAIL = "your-email@example.com"  # CHANGE THIS

setuptools.setup(
    name=SRC_REPO,
    version=__version__,
    author=AUTHOR_USER_NAME,
    author_email=AUTHOR_EMAIL,
    description="An end-to-end ML project for Chest X-ray Pneumonia classification using ViT.",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url=f"https://github.com/{AUTHOR_USER_NAME}/{REPO_NAME}",
    project_urls={
        "Bug Tracker": f"https://github.com/{AUTHOR_USER_NAME}/{REPO_NAME}/issues",
    },
    package_dir={"": "src"},
    packages=setuptools.find_packages(where="src")
)
src/vitClassifier/__init__.py
ADDED
@@ -0,0 +1,26 @@
# src/vitClassifier/__init__.py

import os
import sys
import logging

# Define the logging format
logging_str = "[%(asctime)s: %(levelname)s: %(module)s: %(message)s]"

# Define the directory for log files
log_dir = "logs"
log_filepath = os.path.join(log_dir, "running_logs.log")
os.makedirs(log_dir, exist_ok=True)

# Configure the logging
logging.basicConfig(
    level=logging.INFO,
    format=logging_str,
    handlers=[
        logging.FileHandler(log_filepath),  # Log to a file
        logging.StreamHandler(sys.stdout)   # Also log to the console
    ]
)

# Create a logger object that can be imported by other modules
logger = logging.getLogger("vitClassifierLogger")
src/vitClassifier/components/__init__.py
ADDED
File without changes
src/vitClassifier/components/data_ingestion.py
ADDED
@@ -0,0 +1,67 @@
import os
import pandas as pd
from pathlib import Path
from vitClassifier import logger
from vitClassifier.entity.config_entity import DataIngestionConfig
import kaggle

class DataIngestion:
    def __init__(self, config: DataIngestionConfig):
        self.config = config

    def download_dataset(self):
        try:
            # ... (download logic remains exactly the same)
            logger.info("Authenticating with Kaggle API...")
            kaggle.api.authenticate()
            logger.info("Authentication successful.")

            dataset_id = self.config.source_kaggle_dataset_id
            download_path = self.config.unzip_dir

            expected_data_folder = download_path / "chest_xray"
            if expected_data_folder.exists():
                logger.info(f"Dataset already exists at {expected_data_folder}. Skipping download.")
                return

            logger.info(f"Downloading dataset '{dataset_id}' to '{download_path}'...")
            kaggle.api.dataset_download_files(
                dataset=dataset_id, path=download_path, unzip=True, quiet=False
            )
            logger.info("Dataset downloaded and unzipped successfully.")

        except Exception as e:
            logger.error(f"Failed to download dataset from Kaggle. Error: {e}")
            raise e

    def create_dataframes(self):
        """
        Scans train, test, and val directories and creates separate DataFrames.
        """
        source_root = self.config.unzip_dir / "chest_xray"

        # Helper function to create a dataframe for a given split (train/test/val)
        def _create_df_for_split(split_name: str, save_path: Path):
            split_path = source_root / split_name
            file_names, labels = [], []

            # Using .glob to find all .jpeg files in NORMAL and PNEUMONIA subfolders
            for file in sorted(split_path.glob('*/*.jpeg')):
                label = file.parent.name  # NORMAL or PNEUMONIA
                labels.append(label)
                file_names.append(str(file))

            df = pd.DataFrame({"image": file_names, "label": labels})
            df.to_csv(save_path, index=False)
            logger.info(f"Created and saved {split_name} DataFrame to {save_path}")

        # Create DataFrames for each split
        _create_df_for_split("train", self.config.train_df_path)
        _create_df_for_split("test", self.config.test_df_path)
        _create_df_for_split("val", self.config.val_df_path)

    def ingest_data(self):
        logger.info("Starting data ingestion process.")
        self.download_dataset()
        self.create_dataframes()
        logger.info("Data ingestion process completed.")
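kaggle.api.authenticate() expects credentials to already be available, either as ~/.kaggle/kaggle.json or as environment variables; since kaggle.json is excluded by the new .gitignore, a local .env loaded with python-dotenv is one way to supply them. A minimal sketch, assuming the documented variable names KAGGLE_USERNAME and KAGGLE_KEY (script name illustrative only):

# kaggle_auth_check.py (hypothetical helper, not part of the commit)
import os
from dotenv import load_dotenv

load_dotenv()  # expects KAGGLE_USERNAME and KAGGLE_KEY in a local .env file

# Import after the variables are loaded, since the kaggle client reads
# credentials when it authenticates
import kaggle
kaggle.api.authenticate()
print("Kaggle authentication OK for user:", os.environ.get("KAGGLE_USERNAME"))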
src/vitClassifier/components/data_transformation.py
ADDED
@@ -0,0 +1,80 @@
# src/vitClassifier/components/data_transformation.py

import pandas as pd
from datasets import Dataset, Image, ClassLabel
from imblearn.over_sampling import RandomOverSampler
from vitClassifier.entity.config_entity import DataTransformationConfig
from vitClassifier import logger
# --- NEW IMPORTS ---
from transformers import ViTImageProcessor
from torchvision.transforms import (Compose, Resize, ToTensor, Normalize, RandomRotation, RandomHorizontalFlip)

class DataTransformation:
    def __init__(self, config: DataTransformationConfig, random_state: int, model_name: str):
        self.config = config
        self.random_state = random_state
        self.model_name = model_name  # <-- Need model_name to load the correct processor

    def transform_data(self):
        # --- 1. Load DataFrames and apply Oversampling (same as before) ---
        train_df = pd.read_csv(self.config.train_data_path)
        test_df = pd.read_csv(self.config.test_data_path)
        val_df = pd.read_csv(self.config.val_data_path)

        y = train_df[['label']]
        X = train_df.drop(['label'], axis=1)
        ros = RandomOverSampler(random_state=self.random_state)
        X_resampled, y_resampled = ros.fit_resample(X, y)
        train_df_balanced = pd.concat([X_resampled, y_resampled], axis=1)

        train_dataset = Dataset.from_pandas(train_df_balanced).cast_column("image", Image())
        test_dataset = Dataset.from_pandas(test_df).cast_column("image", Image())
        val_dataset = Dataset.from_pandas(val_df).cast_column("image", Image())

        # --- 2. Label Encoding (same as before) ---
        labels_list = train_df_balanced['label'].unique().tolist()
        class_labels = ClassLabel(num_classes=len(labels_list), names=labels_list)

        def map_label2id(example):
            example['label'] = class_labels.str2int(example['label'])
            return example

        train_dataset = train_dataset.map(map_label2id, batched=True).cast_column('label', class_labels)
        test_dataset = test_dataset.map(map_label2id, batched=True).cast_column('label', class_labels)
        val_dataset = val_dataset.map(map_label2id, batched=True).cast_column('label', class_labels)

        # --- 3. THE NEW LOGIC: Preprocess images with .map() ---
        logger.info("Starting image preprocessing with .map(). This may take a few minutes...")
        processor = ViTImageProcessor.from_pretrained(self.model_name)
        image_mean, image_std = processor.image_mean, processor.image_std
        size = processor.size["height"]
        normalize = Normalize(mean=image_mean, std=image_std)

        # Define transforms
        _train_transforms = Compose([Resize((size, size)), RandomRotation(15), RandomHorizontalFlip(), ToTensor(), normalize])
        _val_test_transforms = Compose([Resize((size, size)), ToTensor(), normalize])

        def apply_train_transforms(examples):
            examples['pixel_values'] = [_train_transforms(image.convert("RGB")) for image in examples['image']]
            return examples

        def apply_val_test_transforms(examples):
            examples['pixel_values'] = [_val_test_transforms(image.convert("RGB")) for image in examples['image']]
            return examples

        # Use .map() to apply transforms and create 'pixel_values' column
        train_dataset = train_dataset.map(apply_train_transforms, batched=True)
        test_dataset = test_dataset.map(apply_val_test_transforms, batched=True)
        val_dataset = val_dataset.map(apply_val_test_transforms, batched=True)

        # Remove the original 'image' column to save space
        train_dataset = train_dataset.remove_columns(['image'])
        test_dataset = test_dataset.remove_columns(['image'])
        val_dataset = val_dataset.remove_columns(['image'])

        # --- 4. Save the fully processed datasets ---
        train_dataset.save_to_disk(str(self.config.train_dataset_path))
        test_dataset.save_to_disk(str(self.config.test_dataset_path))
        val_dataset.save_to_disk(str(self.config.val_dataset_path))

        logger.info("Data Transformation complete. Fully preprocessed datasets saved.")
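Because the transforms are applied eagerly with .map() and persisted via save_to_disk, downstream stages only need load_from_disk. A small sanity-check sketch (paths follow config.yaml; the script itself is illustrative and not part of the commit):

# inspect_dataset.py (hypothetical check)
from datasets import load_from_disk

ds = load_from_disk("artifacts/data_transformation/train_dataset")
print(ds)                                   # features should include pixel_values and label
ds.set_format("torch", columns=["pixel_values", "label"])
sample = ds[0]
print(sample["pixel_values"].shape)         # expected torch.Size([3, 224, 224]) for ViT-base
print(sample["label"])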
src/vitClassifier/components/model_evaluation.py
ADDED
@@ -0,0 +1,89 @@
# src/vitClassifier/components/model_evaluation.py

import mlflow
import mlflow.pytorch
import torch
import json
from pathlib import Path
from datasets import load_from_disk
from transformers import (ViTForImageClassification, ViTImageProcessor, Trainer, TrainingArguments, DefaultDataCollator)
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from vitClassifier.entity.config_entity import EvaluationConfig
from vitClassifier.utils.common import read_yaml  # Keep this if you need it, but it's not used here
from vitClassifier import logger

class ModelEvaluation:
    def __init__(self, config: EvaluationConfig):
        self.config = config

    def evaluate(self):
        # Determine device
        device = "cuda" if torch.cuda.is_available() else "cpu"

        # Load the best model from the training stage and move it to the correct device
        model_path = str(self.config.path_of_model)
        model = ViTForImageClassification.from_pretrained(model_path).to(device)

        # Load the pre-processed test dataset
        test_data = load_from_disk(str(self.config.test_dataset_path))

        # We DO NOT need transforms here because the data is already processed
        # test_data.set_transform(...)  # REMOVED

        # Use the default collator which handles 'pixel_values' and 'label'
        data_collator = DefaultDataCollator()

        # Dummy trainer for running predictions
        eval_args = TrainingArguments(
            output_dir="./eval_output",  # Temporary directory
            per_device_eval_batch_size=self.config.batch_size,
            report_to="none"
        )
        trainer = Trainer(
            model=model,
            args=eval_args,
            data_collator=data_collator
        )

        # --- Run Predictions ---
        logger.info("Running final evaluation on the test set...")
        outputs = trainer.predict(test_data)
        y_true = outputs.label_ids
        y_pred = outputs.predictions.argmax(1)

        # --- Calculate Metrics ---
        scores = {
            "accuracy": accuracy_score(y_true, y_pred),
            "f1_score": f1_score(y_true, y_pred, average='macro'),
            "precision": precision_score(y_true, y_pred, average='macro'),
            "recall": recall_score(y_true, y_pred, average='macro')
        }
        logger.info(f"Test Set Metrics: {scores}")

        # --- Save Metrics to a JSON file ---
        metrics_path = Path(self.config.metrics_file_name)

        # Now create the directory
        metrics_path.parent.mkdir(parents=True, exist_ok=True)

        with open(metrics_path, 'w') as f:
            json.dump(scores, f, indent=4)
        logger.info(f"Metrics saved to {metrics_path}")

        # --- Log to MLflow ---
        mlflow.set_tracking_uri(self.config.mlflow_uri)
        mlflow.set_experiment("Pneumonia-ViT-Classification")

        with mlflow.start_run():
            logger.info("Logging parameters and metrics to MLflow...")
            mlflow.log_params(self.config.all_params)
            mlflow.log_metrics(scores)

            # --- THIS IS THE FINAL FIX ---
            # Instead of logging the model object, log the directory where the
            # trained model was already saved by the Trainer.
            # `mlflow.log_artifact` is a simple upload and will not cause registry errors.
            model_dir_path = str(self.config.path_of_model)
            mlflow.log_artifact(model_dir_path, artifact_path="model")

            logger.info("Successfully logged artifacts to MLflow.")
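The DagsHub tracking URI above requires authentication at run time; since the evaluation stage already calls load_dotenv(), one common approach is to keep MLflow's standard credential variables in the local .env file (which the new .gitignore keeps out of Git). A sketch, assuming MLFLOW_TRACKING_USERNAME and MLFLOW_TRACKING_PASSWORD are the variables used:

# mlflow_env_check.py (hypothetical helper, not part of the commit)
import os
from dotenv import load_dotenv

load_dotenv()  # .env should define MLFLOW_TRACKING_USERNAME / MLFLOW_TRACKING_PASSWORD

missing = [name for name in ("MLFLOW_TRACKING_USERNAME", "MLFLOW_TRACKING_PASSWORD")
           if not os.environ.get(name)]
if missing:
    raise EnvironmentError(f"Missing MLflow credentials in .env: {missing}")
print("MLflow credentials found; remote logging to DagsHub should work.")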
src/vitClassifier/components/model_training.py
ADDED
@@ -0,0 +1,74 @@
# src/vitClassifier/components/model_training.py

import torch
from datasets import load_from_disk
from transformers import (ViTImageProcessor, ViTForImageClassification, TrainingArguments, Trainer, DefaultDataCollator)
from vitClassifier.entity.config_entity import TrainingConfig
from vitClassifier import logger
import evaluate

class ModelTraining:
    def __init__(self, config: TrainingConfig):
        self.config = config

    def train(self):
        # --- NEW: Explicitly define the device ---
        device = "cuda" if torch.cuda.is_available() else "cpu"
        logger.info(f"Using device: {device}")

        # --- Load datasets (no change) ---
        train_data = load_from_disk(str(self.config.train_dataset_path))
        val_data = load_from_disk(str(self.config.val_dataset_path))

        id2label = {i: label for i, label in enumerate(train_data.features['label'].names)}
        label2id = {label: i for i, label in id2label.items()}

        model = ViTForImageClassification.from_pretrained(
            self.config.model_name, num_labels=len(id2label), id2label=id2label,
            label2id=label2id, ignore_mismatched_sizes=True
        )

        # --- NEW: Move the model to the correct device ---
        model.to(device)

        # --- TrainingArguments (no change) ---
        args = TrainingArguments(
            output_dir=str(self.config.root_dir),
            learning_rate=self.config.learning_rate,
            per_device_train_batch_size=self.config.batch_size,
            per_device_eval_batch_size=self.config.batch_size,
            num_train_epochs=self.config.epochs,
            weight_decay=self.config.weight_decay,
            warmup_steps=self.config.warmup_steps,
            save_strategy='epoch',
            eval_strategy='epoch',
            load_best_model_at_end=True,
            metric_for_best_model="accuracy",
            save_total_limit=1,
            report_to="none"
        )

        metric = evaluate.load("accuracy")

        def compute_metrics(eval_pred):
            predictions, labels = eval_pred
            predictions = predictions.argmax(axis=1)
            return metric.compute(predictions=predictions, references=labels)

        data_collator = DefaultDataCollator()
        processor = ViTImageProcessor.from_pretrained(self.config.model_name)

        trainer = Trainer(
            model,  # The model is now already on the GPU
            args,
            train_dataset=train_data,
            eval_dataset=val_data,
            data_collator=data_collator,
            compute_metrics=compute_metrics,
            tokenizer=processor,
        )

        logger.info("Starting model fine-tuning with validation...")
        trainer.train()
        trainer.save_model(str(self.config.trained_model_path))
        logger.info("Model fine-tuning complete and best model saved.")
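After trainer.save_model(), the directory at artifacts/model_training/model can be reloaded directly for inference. A minimal single-image sketch (the image path is a placeholder; the processor is loaded from the same base checkpoint, matching how training preprocessed the data; the script is illustrative only):

# predict_one.py (hypothetical inference sketch)
import torch
from PIL import Image
from transformers import ViTForImageClassification, ViTImageProcessor

model = ViTForImageClassification.from_pretrained("artifacts/model_training/model")
processor = ViTImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k")

image = Image.open("some_chest_xray.jpeg").convert("RGB")   # placeholder path
inputs = processor(images=image, return_tensors="pt")

with torch.no_grad():
    logits = model(**inputs).logits
pred_id = int(logits.argmax(-1))
print(model.config.id2label[pred_id])                        # NORMAL or PNEUMONIA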
src/vitClassifier/config/__init__.py
ADDED
File without changes
src/vitClassifier/config/configuration.py
ADDED
@@ -0,0 +1,76 @@
# src/vitClassifier/config/configuration.py

from vitClassifier.constants import CONFIG_FILE_PATH, PARAMS_FILE_PATH  # <-- THIS IMPORT IS THE FIX
from vitClassifier.utils.common import read_yaml, create_directories
from vitClassifier.entity.config_entity import (DataIngestionConfig,
                                                DataTransformationConfig,
                                                TrainingConfig,
                                                EvaluationConfig)
from pathlib import Path
import os

class ConfigurationManager:
    def __init__(self, config_filepath=None, params_filepath=None):

        # If no path is provided when creating an instance, use the imported constants
        if config_filepath is None:
            config_filepath = CONFIG_FILE_PATH
        if params_filepath is None:
            params_filepath = PARAMS_FILE_PATH

        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        create_directories([self.config.artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        config = self.config.data_ingestion
        create_directories([config.root_dir])
        return DataIngestionConfig(
            root_dir=Path(config.root_dir),
            source_kaggle_dataset_id=config.source_kaggle_dataset_id,
            unzip_dir=Path(config.unzip_dir),
            train_df_path=Path(config.train_df_path),
            test_df_path=Path(config.test_df_path),
            val_df_path=Path(config.val_df_path)
        )

    def get_data_transformation_config(self) -> DataTransformationConfig:
        config = self.config.data_transformation
        create_directories([config.root_dir])
        return DataTransformationConfig(
            root_dir=Path(config.root_dir),
            train_data_path=Path(config.train_data_path),
            test_data_path=Path(config.test_data_path),
            val_data_path=Path(config.val_data_path),
            train_dataset_path=Path(config.train_dataset_path),
            test_dataset_path=Path(config.test_dataset_path),
            val_dataset_path=Path(config.val_dataset_path)
        )

    def get_training_config(self) -> TrainingConfig:
        training = self.config.model_training
        params = self.params
        create_directories([Path(training.root_dir)])
        return TrainingConfig(
            root_dir=Path(training.root_dir),
            trained_model_path=Path(training.trained_model_path),
            model_name=training.model_name,
            train_dataset_path=Path(training.train_dataset_path),
            val_dataset_path=Path(training.val_dataset_path),
            learning_rate=params.LEARNING_RATE,
            batch_size=params.BATCH_SIZE,
            epochs=params.EPOCHS,
            weight_decay=params.WEIGHT_DECAY,
            warmup_steps=params.WARMUP_STEPS,
        )

    def get_evaluation_config(self) -> EvaluationConfig:
        eval_config = self.config.model_evaluation
        return EvaluationConfig(
            path_of_model=Path(eval_config.model_path),
            test_dataset_path=Path(eval_config.test_dataset_path),
            mlflow_uri=eval_config.mlflow_uri,
            all_params=self.params,
            batch_size=self.params.BATCH_SIZE,
            metrics_file_name=Path(eval_config.metrics_file_name)  # <--- MAKE SURE THIS LINE EXISTS
        )
src/vitClassifier/constants/__init__.py
ADDED
@@ -0,0 +1,4 @@
from pathlib import Path

CONFIG_FILE_PATH = Path("config/config.yaml")
PARAMS_FILE_PATH = Path("params.yaml")
src/vitClassifier/entity/__init__.py
ADDED
File without changes
src/vitClassifier/entity/config_entity.py
ADDED
@@ -0,0 +1,43 @@
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataIngestionConfig:
    root_dir: Path
    source_kaggle_dataset_id: str
    unzip_dir: Path
    train_df_path: Path   # New
    test_df_path: Path    # New
    val_df_path: Path     # New

@dataclass(frozen=True)
class DataTransformationConfig:
    root_dir: Path
    train_data_path: Path   # New
    test_data_path: Path    # New
    val_data_path: Path     # New
    train_dataset_path: Path
    test_dataset_path: Path
    val_dataset_path: Path  # New

@dataclass(frozen=True)
class TrainingConfig:
    root_dir: Path
    trained_model_path: Path
    model_name: str
    train_dataset_path: Path  # New
    val_dataset_path: Path    # New
    learning_rate: float
    batch_size: int
    epochs: int
    weight_decay: float
    warmup_steps: int

@dataclass(frozen=True)
class EvaluationConfig:
    path_of_model: Path
    test_dataset_path: Path
    mlflow_uri: str
    all_params: dict
    batch_size: int
    metrics_file_name: Path
src/vitClassifier/pipeline/__init__.py
ADDED
File without changes
src/vitClassifier/pipeline/stage_01_data_ingestion.py
ADDED
@@ -0,0 +1,27 @@
# src/vitClassifier/pipeline/stage_01_data_ingestion.py

from vitClassifier.config.configuration import ConfigurationManager
from vitClassifier.components.data_ingestion import DataIngestion
from vitClassifier import logger

STAGE_NAME = "Data Ingestion stage"

class DataIngestionTrainingPipeline:
    def __init__(self):
        pass
    def main(self):
        config = ConfigurationManager()
        data_ingestion_config = config.get_data_ingestion_config()
        data_ingestion = DataIngestion(config=data_ingestion_config)
        data_ingestion.ingest_data()

# <<< ADD THIS BLOCK TO MAKE THE SCRIPT RUNNABLE >>>
if __name__ == '__main__':
    try:
        logger.info(f">>>>>> stage {STAGE_NAME} started <<<<<<")
        obj = DataIngestionTrainingPipeline()
        obj.main()
        logger.info(f">>>>>> stage {STAGE_NAME} completed <<<<<<\n\nx==========x")
    except Exception as e:
        logger.exception(e)
        raise e
src/vitClassifier/pipeline/stage_02_data_transformation.py
ADDED
@@ -0,0 +1,32 @@
from vitClassifier.config.configuration import ConfigurationManager
from vitClassifier.components.data_transformation import DataTransformation
from vitClassifier import logger

STAGE_NAME = "Data Transformation stage"

class DataTransformationTrainingPipeline:
    def __init__(self):
        pass
    def main(self):
        config_manager = ConfigurationManager()
        data_transformation_config = config_manager.get_data_transformation_config()
        params = config_manager.params
        # Get model_name from the training config section
        model_name = config_manager.config.model_training.model_name

        data_transformation = DataTransformation(
            config=data_transformation_config,
            random_state=params.RANDOM_STATE,
            model_name=model_name  # Pass the model name
        )
        data_transformation.transform_data()

if __name__ == '__main__':
    try:
        logger.info(f">>>>>> stage {STAGE_NAME} started <<<<<<")
        obj = DataTransformationTrainingPipeline()
        obj.main()
        logger.info(f">>>>>> stage {STAGE_NAME} completed <<<<<<\n\nx==========x")
    except Exception as e:
        logger.exception(e)
        raise e
src/vitClassifier/pipeline/stage_03_model_training.py
ADDED
@@ -0,0 +1,24 @@
from vitClassifier.config.configuration import ConfigurationManager
from vitClassifier.components.model_training import ModelTraining
from vitClassifier import logger

STAGE_NAME = "Model Training stage"

class ModelTrainingPipeline:
    def __init__(self):
        pass
    def main(self):
        config = ConfigurationManager()
        training_config = config.get_training_config()
        model_training = ModelTraining(config=training_config)
        model_training.train()

if __name__ == '__main__':
    try:
        logger.info(f">>>>>> stage {STAGE_NAME} started <<<<<<")
        obj = ModelTrainingPipeline()
        obj.main()
        logger.info(f">>>>>> stage {STAGE_NAME} completed <<<<<<\n\nx==========x")
    except Exception as e:
        logger.exception(e)
        raise e
src/vitClassifier/pipeline/stage_04_model_evaluation.py
ADDED
@@ -0,0 +1,26 @@
from vitClassifier.config.configuration import ConfigurationManager
from vitClassifier.components.model_evaluation import ModelEvaluation
from vitClassifier import logger
from dotenv import load_dotenv
load_dotenv()

STAGE_NAME = "Model Evaluation stage"

class ModelEvaluationPipeline:
    def __init__(self):
        pass
    def main(self):
        config = ConfigurationManager()
        eval_config = config.get_evaluation_config()
        evaluation = ModelEvaluation(config=eval_config)
        evaluation.evaluate()

if __name__ == '__main__':
    try:
        logger.info(f">>>>>> stage {STAGE_NAME} started <<<<<<")
        obj = ModelEvaluationPipeline()
        obj.main()
        logger.info(f">>>>>> stage {STAGE_NAME} completed <<<<<<\n\nx==========x")
    except Exception as e:
        logger.exception(e)
        raise e
src/vitClassifier/utils/__init__.py
ADDED
File without changes
src/vitClassifier/utils/common.py
ADDED
@@ -0,0 +1,28 @@
import os
import yaml
import json
from box import ConfigBox
from box.exceptions import BoxValueError
from ensure import ensure_annotations
from pathlib import Path
from typing import Any
from vitClassifier import logger

@ensure_annotations
def read_yaml(path_to_yaml: Path) -> ConfigBox:
    try:
        with open(path_to_yaml) as yaml_file:
            content = yaml.safe_load(yaml_file)
            logger.info(f"yaml file: {path_to_yaml} loaded successfully")
            return ConfigBox(content)
    except BoxValueError:
        raise ValueError("yaml file is empty")
    except Exception as e:
        raise e

@ensure_annotations
def create_directories(path_to_directories: list, verbose=True):
    for path in path_to_directories:
        os.makedirs(path, exist_ok=True)
        if verbose:
            logger.info(f"created directory at: {path}")
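These two helpers are what every stage's ConfigurationManager builds on. A short usage sketch (paths illustrative; the script is not part of the commit):

# common_usage.py (hypothetical example)
from pathlib import Path
from vitClassifier.utils.common import read_yaml, create_directories

params = read_yaml(Path("params.yaml"))     # ConfigBox with attribute access
print(params.BATCH_SIZE, params.EPOCHS)

# Creates the folders if missing and logs each one
create_directories(["artifacts", "artifacts/model_evaluation"])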
template.py
ADDED
@@ -0,0 +1,40 @@
import os
from pathlib import Path
import logging

logging.basicConfig(level=logging.INFO, format='[%(asctime)s]: %(message)s:')

project_name = "vitClassifier"

list_of_files = [
    ".github/workflows/.gitkeep",
    f"src/{project_name}/__init__.py",
    f"src/{project_name}/components/__init__.py",
    f"src/{project_name}/utils/__init__.py",
    f"src/{project_name}/config/__init__.py",
    f"src/{project_name}/config/configuration.py",
    f"src/{project_name}/pipeline/__init__.py",
    f"src/{project_name}/entity/__init__.py",
    f"src/{project_name}/constants/__init__.py",
    "config/config.yaml",
    "dvc.yaml",
    "params.yaml",
    "requirements.txt",
    "setup.py",
    "research/trials.ipynb"
]

for filepath in list_of_files:
    filepath = Path(filepath)
    filedir, filename = os.path.split(filepath)

    if filedir != "":
        os.makedirs(filedir, exist_ok=True)
        logging.info(f"Creating directory; {filedir} for the file: {filename}")

    if (not os.path.exists(filepath)) or (os.path.getsize(filepath) == 0):
        with open(filepath, "w") as f:
            pass
        logging.info(f"Creating empty file: {filepath}")
    else:
        logging.info(f"{filename} already exists")