ALYYAN committed on
Commit
4821854
·
1 Parent(s): bb52f51

committing backend files

Browse files
.github/workflows/.gitkeep ADDED
File without changes
.gitignore CHANGED
@@ -1,207 +1,105 @@
1
- # Byte-compiled / optimized / DLL files
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  __pycache__/
3
- *.py[codz]
4
  *$py.class
5
 
6
- # C extensions
7
- *.so
8
-
9
- # Distribution / packaging
10
- .Python
11
- build/
12
- develop-eggs/
13
- dist/
14
- downloads/
15
- eggs/
16
- .eggs/
17
- lib/
18
- lib64/
19
- parts/
20
- sdist/
21
- var/
22
- wheels/
23
- share/python-wheels/
24
  *.egg-info/
25
  .installed.cfg
26
  *.egg
27
- MANIFEST
28
-
29
- # PyInstaller
30
- # Usually these files are written by a python script from a template
31
- # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
  *.manifest
33
  *.spec
34
 
35
- # Installer logs
36
- pip-log.txt
37
- pip-delete-this-directory.txt
38
-
39
- # Unit test / coverage reports
40
- htmlcov/
41
- .tox/
42
- .nox/
43
- .coverage
44
- .coverage.*
45
- .cache
46
- nosetests.xml
47
- coverage.xml
48
- *.cover
49
- *.py.cover
50
- .hypothesis/
51
- .pytest_cache/
52
- cover/
53
-
54
- # Translations
55
- *.mo
56
- *.pot
57
-
58
- # Django stuff:
59
- *.log
60
- local_settings.py
61
- db.sqlite3
62
- db.sqlite3-journal
63
-
64
- # Flask stuff:
65
- instance/
66
- .webassets-cache
67
-
68
- # Scrapy stuff:
69
- .scrapy
70
-
71
- # Sphinx documentation
72
- docs/_build/
73
-
74
- # PyBuilder
75
- .pybuilder/
76
- target/
77
-
78
- # Jupyter Notebook
79
- .ipynb_checkpoints
80
-
81
- # IPython
82
- profile_default/
83
- ipython_config.py
84
-
85
- # pyenv
86
- # For a library or package, you might want to ignore these files since the code is
87
- # intended to run in multiple environments; otherwise, check them in:
88
- # .python-version
89
-
90
- # pipenv
91
- # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
- # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
- # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
- # install all needed dependencies.
95
- #Pipfile.lock
96
-
97
- # UV
98
- # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
99
- # This is especially recommended for binary packages to ensure reproducibility, and is more
100
- # commonly ignored for libraries.
101
- #uv.lock
102
-
103
- # poetry
104
- # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105
- # This is especially recommended for binary packages to ensure reproducibility, and is more
106
- # commonly ignored for libraries.
107
- # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108
- #poetry.lock
109
- #poetry.toml
110
-
111
- # pdm
112
- # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
113
- # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
114
- # https://pdm-project.org/en/latest/usage/project/#working-with-version-control
115
- #pdm.lock
116
- #pdm.toml
117
- .pdm-python
118
- .pdm-build/
119
-
120
- # pixi
121
- # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
122
- #pixi.lock
123
- # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
124
- # in the .venv directory. It is recommended not to include this directory in version control.
125
- .pixi
126
-
127
- # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
128
- __pypackages__/
129
-
130
- # Celery stuff
131
- celerybeat-schedule
132
- celerybeat.pid
133
-
134
- # SageMath parsed files
135
- *.sage.py
136
-
137
- # Environments
138
- .env
139
- .envrc
140
- .venv
141
- env/
142
- venv/
143
- ENV/
144
- env.bak/
145
- venv.bak/
146
 
147
- # Spyder project settings
148
- .spyderproject
149
- .spyproject
150
-
151
- # Rope project settings
152
- .ropeproject
153
-
154
- # mkdocs documentation
155
- /site
156
-
157
- # mypy
158
- .mypy_cache/
159
- .dmypy.json
160
- dmypy.json
161
-
162
- # Pyre type checker
163
- .pyre/
164
-
165
- # pytype static type analyzer
166
- .pytype/
167
-
168
- # Cython debug symbols
169
- cython_debug/
170
-
171
- # PyCharm
172
- # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
173
- # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
174
- # and can be added to the global gitignore or merged into this file. For a more nuclear
175
- # option (not recommended) you can uncomment the following to ignore the entire idea folder.
176
- #.idea/
177
-
178
- # Abstra
179
- # Abstra is an AI-powered process automation framework.
180
- # Ignore directories containing user credentials, local state, and settings.
181
- # Learn more at https://abstra.io/docs
182
- .abstra/
183
-
184
- # Visual Studio Code
185
- # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
186
- # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
187
- # and can be added to the global gitignore or merged into this file. However, if you prefer,
188
- # you could uncomment the following to ignore the entire vscode folder
189
- # .vscode/
190
-
191
- # Ruff stuff:
192
- .ruff_cache/
193
-
194
- # PyPI configuration file
195
- .pypirc
196
-
197
- # Cursor
198
- # Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
199
- # exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
200
- # refer to https://docs.cursor.com/context/ignore-files
201
- .cursorignore
202
- .cursorindexingignore
203
-
204
- # Marimo
205
- marimo/_static/
206
- marimo/_lsp/
207
- __marimo__/
 
1
+ # =============================================================================
2
+ # Python Virtual Environments
3
+ # Never commit the virtual environment folder.
4
+ # =============================================================================
5
+ /venv/
6
+ .venv/
7
+ env/
8
+ .env
9
+ ENV/
10
+ env.bak/
11
+ venv.bak/
12
+
13
+
14
+ # =============================================================================
15
+ # DVC and MLflow Artifacts
16
+ # DVC tracks data, so we don't need Git to. We only commit the .dvc files.
17
+ # The `artifacts` and `logs` directories will be generated by the pipeline.
18
+ # =============================================================================
19
+ /artifacts/
20
+ /logs/
21
+ /mlruns/
22
+ # DVC's internal cache should NEVER be committed.
23
+ /.dvc/cache/
24
+ # DVC's temporary directories
25
+ /.dvc/tmp/
26
+
27
+
28
+ # =============================================================================
29
+ # Python Cache and Compiled Files
30
+ # These are generated automatically by Python and don't need to be versioned.
31
+ # =============================================================================
32
  __pycache__/
33
+ *.py[cod]
34
  *$py.class
35
 
36
+
37
+ # =============================================================================
38
+ # Build and Distribution Artifacts
39
+ # Generated when building a Python package.
40
+ # =============================================================================
41
+ /build/
42
+ /develop-eggs/
43
+ /dist/
44
+ /downloads/
45
+ /eggs/
46
+ /.eggs/
47
+ /lib/
48
+ /lib64/
49
+ /parts/
50
+ /sdist/
51
+ /var/
52
+ /wheels/
 
53
  *.egg-info/
54
  .installed.cfg
55
  *.egg
 
 
 
 
 
56
  *.manifest
57
  *.spec
58
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
 
60
+ # =============================================================================
61
+ # IDE and Editor-Specific Files
62
+ # Ignore configuration files from common editors like VSCode, PyCharm, etc.
63
+ # =============================================================================
64
+ .vscode/
65
+ .idea/
66
+ .project
67
+ .pydevproject
68
+ .classpath
69
+ *.swp
70
+ *.swo
71
+
72
+
73
+ # =============================================================================
74
+ # OS-specific Files
75
+ # Ignore files generated by macOS, Windows, and Linux.
76
+ # =============================================================================
77
+ .DS_Store
78
+ Thumbs.db
79
+ Desktop.ini
80
+
81
+
82
+ # =============================================================================
83
+ # Jupyter Notebook Checkpoints
84
+ # Ignore the checkpoints directory created by Jupyter.
85
+ # =============================================================================
86
+ .ipynb_checkpoints/
87
+
88
+
89
+ # =============================================================================
90
+ # Kaggle API Credentials
91
+ # IMPORTANT: Never commit your API keys or secrets.
92
+ # =============================================================================
93
+ kaggle.json
94
+
95
+
96
+ # =============================================================================
97
+ # Other
98
+ # Any other miscellaneous files that shouldn't be in the repo.
99
+ # =============================================================================
100
+ *.log
101
+ *.tmp
102
+ *.bak
103
+ *.local
104
+ eval_output/ # Temporary directory created by the evaluation component
105
+ .env
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
config/config.yaml ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # config/config.yaml
2
+
3
+ artifacts_root: artifacts
4
+
5
+ data_ingestion:
6
+ root_dir: artifacts/data_ingestion
7
+ source_kaggle_dataset_id: "paultimothymooney/chest-xray-pneumonia"
8
+ unzip_dir: artifacts/data_ingestion/
9
+ # We will create these three files now
10
+ train_df_path: artifacts/data_ingestion/train_df.csv
11
+ test_df_path: artifacts/data_ingestion/test_df.csv
12
+ val_df_path: artifacts/data_ingestion/val_df.csv
13
+
14
+ data_transformation:
15
+ root_dir: artifacts/data_transformation
16
+ # We now have three sources
17
+ train_data_path: artifacts/data_ingestion/train_df.csv
18
+ test_data_path: artifacts/data_ingestion/test_df.csv
19
+ val_data_path: artifacts/data_ingestion/val_df.csv
20
+ # And will create three outputs
21
+ train_dataset_path: artifacts/data_transformation/train_dataset
22
+ test_dataset_path: artifacts/data_transformation/test_dataset
23
+ val_dataset_path: artifacts/data_transformation/val_dataset
24
+
25
+ model_training:
26
+ root_dir: artifacts/model_training
27
+ trained_model_path: artifacts/model_training/model
28
+ model_name: "google/vit-base-patch16-224-in21k"
29
+ # We'll use the validation set for evaluation during training
30
+ train_dataset_path: artifacts/data_transformation/train_dataset
31
+ val_dataset_path: artifacts/data_transformation/val_dataset
32
+
33
+ model_evaluation:
34
+ root_dir: artifacts/model_evaluation
35
+ model_path: artifacts/model_training/model
36
+ # Final evaluation is done on the unseen test set
37
+ test_dataset_path: artifacts/data_transformation/test_dataset
38
+ metrics_file_name: artifacts/model_evaluation/metrics.json
39
+ mlflow_uri: "https://dagshub.com/AlyyanAhmed21/Chest-X-ray-Pneumonia-Detection-with-ViT.mlflow"
dvc.lock ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ schema: '2.0'
2
+ stages:
3
+ data_ingestion:
4
+ cmd: python src/vitClassifier/pipeline/stage_01_data_ingestion.py
5
+ deps:
6
+ - path: config/config.yaml
7
+ hash: md5
8
+ md5: 9a45c00c11b9a8a0b4e396952a6b19a8
9
+ size: 1591
10
+ - path: src/vitClassifier/pipeline/stage_01_data_ingestion.py
11
+ hash: md5
12
+ md5: 2486829b866dffe25a752248afa95f4f
13
+ size: 968
14
+ outs:
15
+ - path: artifacts/data_ingestion
16
+ hash: md5
17
+ md5: 9921508de1b9f2e8a5a4150d063e178d.dir
18
+ size: 2484429974
19
+ nfiles: 17594
20
+ data_transformation:
21
+ cmd: python src/vitClassifier/pipeline/stage_02_data_transformation.py
22
+ deps:
23
+ - path: artifacts/data_ingestion/test_df.csv
24
+ hash: md5
25
+ md5: 95cbf91a4d0719e528c74879d6da0e34
26
+ size: 53272
27
+ - path: artifacts/data_ingestion/train_df.csv
28
+ hash: md5
29
+ md5: af0d24afd4d9092b64bb1db986d38f76
30
+ size: 460017
31
+ - path: artifacts/data_ingestion/val_df.csv
32
+ hash: md5
33
+ md5: 575134cde7f8113c2a51dd4fac3e4c5e
34
+ size: 1389
35
+ - path: config/config.yaml
36
+ hash: md5
37
+ md5: 9a45c00c11b9a8a0b4e396952a6b19a8
38
+ size: 1591
39
+ - path: params.yaml
40
+ hash: md5
41
+ md5: cc525f2481819601bb93ec5d7f008dda
42
+ size: 127
43
+ - path: src/vitClassifier/pipeline/stage_02_data_transformation.py
44
+ hash: md5
45
+ md5: 095fcfa8843d6b94a05d9f2172522b32
46
+ size: 1237
47
+ outs:
48
+ - path: artifacts/data_transformation
49
+ hash: md5
50
+ md5: b91edf7d0a4b4b0022f2d33d3f2176fa.dir
51
+ size: 5074488112
52
+ nfiles: 18
53
+ model_training:
54
+ cmd: python src/vitClassifier/pipeline/stage_03_model_training.py
55
+ deps:
56
+ - path: artifacts/data_transformation/train_dataset
57
+ hash: md5
58
+ md5: 64425f8e57c16ac250c2ea73b78b7aa2.dir
59
+ size: 4687397188
60
+ nfiles: 12
61
+ - path: artifacts/data_transformation/val_dataset
62
+ hash: md5
63
+ md5: e19e673f104f2efec21df351b3d4869c.dir
64
+ size: 9678966
65
+ nfiles: 3
66
+ - path: config/config.yaml
67
+ hash: md5
68
+ md5: 9a45c00c11b9a8a0b4e396952a6b19a8
69
+ size: 1591
70
+ - path: params.yaml
71
+ hash: md5
72
+ md5: cc525f2481819601bb93ec5d7f008dda
73
+ size: 127
74
+ - path: src/vitClassifier/pipeline/stage_03_model_training.py
75
+ hash: md5
76
+ md5: 5e9cde1828fc4b2608e9f4e92b134a07
77
+ size: 815
78
+ outs:
79
+ - path: artifacts/model_training/model
80
+ hash: md5
81
+ md5: 9f0765ff59616eddac47fcaf7a5e7387.dir
82
+ size: 343230531
83
+ nfiles: 4
84
+ model_evaluation:
85
+ cmd: python src/vitClassifier/pipeline/stage_04_model_evaluation.py
86
+ deps:
87
+ - path: artifacts/data_transformation/test_dataset
88
+ hash: md5
89
+ md5: 41a8f95d5075f06bef31fbf55d838cca.dir
90
+ size: 377411958
91
+ nfiles: 3
92
+ - path: artifacts/model_training/model
93
+ hash: md5
94
+ md5: 9f0765ff59616eddac47fcaf7a5e7387.dir
95
+ size: 343230531
96
+ nfiles: 4
97
+ - path: config/config.yaml
98
+ hash: md5
99
+ md5: 9224d2383ec670f1738b47139f250ad4
100
+ size: 1659
101
+ - path: src/vitClassifier/pipeline/stage_04_model_evaluation.py
102
+ hash: md5
103
+ md5: e31c602e23dbfa62f6453ca44b621d0a
104
+ size: 863
105
+ outs:
106
+ - path: artifacts/model_evaluation/metrics.json
107
+ hash: md5
108
+ md5: 26b4e3326f589929e4a6e34833cc187f
109
+ size: 150
dvc.yaml ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ stages:
2
+ data_ingestion:
3
+ cmd: python src/vitClassifier/pipeline/stage_01_data_ingestion.py
4
+ deps:
5
+ - src/vitClassifier/pipeline/stage_01_data_ingestion.py
6
+ - config/config.yaml
7
+ outs:
8
+ - artifacts/data_ingestion
9
+
10
+ data_transformation:
11
+ cmd: python src/vitClassifier/pipeline/stage_02_data_transformation.py
12
+ deps:
13
+ - src/vitClassifier/pipeline/stage_02_data_transformation.py
14
+ # --- THIS IS THE FIX ---
15
+ # Remove the old dependency and add the three new ones.
16
+ - artifacts/data_ingestion/train_df.csv
17
+ - artifacts/data_ingestion/test_df.csv
18
+ - artifacts/data_ingestion/val_df.csv
19
+ - config/config.yaml
20
+ - params.yaml
21
+ outs:
22
+ - artifacts/data_transformation
23
+
24
+ model_training:
25
+ cmd: python src/vitClassifier/pipeline/stage_03_model_training.py
26
+ deps:
27
+ - src/vitClassifier/pipeline/stage_03_model_training.py
28
+ - artifacts/data_transformation/train_dataset
29
+ - artifacts/data_transformation/val_dataset # Added dependency on val dataset
30
+ - config/config.yaml
31
+ - params.yaml
32
+ outs:
33
+ - artifacts/model_training/model
34
+
35
+ model_evaluation:
36
+ cmd: python src/vitClassifier/pipeline/stage_04_model_evaluation.py
37
+ deps:
38
+ - src/vitClassifier/pipeline/stage_04_model_evaluation.py
39
+ - artifacts/data_transformation/test_dataset
40
+ - artifacts/model_training/model
41
+ - config/config.yaml
42
+ metrics:
43
+ - artifacts/model_evaluation/metrics.json:
44
+ cache: false
gpu ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # check_gpu.py
2
+
3
+ import sys
4
+ import torch
5
+
6
+ def check_gpu_environment():
7
+ """
8
+ This script checks the system's Python and PyTorch GPU environment.
9
+ It prints detailed information about the setup.
10
+ """
11
+ print("--- System and Python Information ---")
12
+ print(f"Python Version: {sys.version}")
13
+ print("\n--- PyTorch and CUDA Information ---")
14
+
15
+ try:
16
+ print(f"PyTorch Version: {torch.__version__}")
17
+
18
+ # Check if CUDA (GPU support) is available
19
+ cuda_available = torch.cuda.is_available()
20
+ print(f"CUDA Available: {cuda_available}")
21
+
22
+ if not cuda_available:
23
+ print("\nWARNING: PyTorch was not built with CUDA support. GPU will not be used.")
24
+ return
25
+
26
+ # Get the number of available GPUs
27
+ gpu_count = torch.cuda.device_count()
28
+ print(f"Number of GPUs Available: {gpu_count}")
29
+
30
+ # Get details for each GPU
31
+ for i in range(gpu_count):
32
+ print(f"\n--- GPU Details (Device {i}) ---")
33
+ gpu_name = torch.cuda.get_device_name(i)
34
+ print(f" GPU Name: {gpu_name}")
35
+
36
+ cuda_capability = torch.cuda.get_device_capability(i)
37
+ print(f" Compute Capability: {cuda_capability[0]}.{cuda_capability[1]}")
38
+
39
+ total_mem = torch.cuda.get_device_properties(i).total_memory / (1024**3) # Convert bytes to GB
40
+ print(f" Total Memory: {total_mem:.2f} GB")
41
+
42
+ # Check for cuDNN
43
+ cudnn_available = torch.backends.cudnn.is_available()
44
+ print("\n--- cuDNN Information ---")
45
+ print(f"cuDNN Available: {cudnn_available}")
46
+ if cudnn_available:
47
+ cudnn_version = torch.backends.cudnn.version()
48
+ print(f"cuDNN Version: {cudnn_version}")
49
+ else:
50
+ print("\nWARNING: cuDNN is not available. Training will be significantly slower.")
51
+
52
+ except Exception as e:
53
+ print(f"\nAn error occurred: {e}")
54
+ print("Please ensure PyTorch is installed correctly.")
55
+
56
+ if __name__ == "__main__":
57
+ check_gpu_environment()
main.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from vitClassifier import logger
2
+ from vitClassifier.pipeline.stage_01_data_ingestion import DataIngestionTrainingPipeline
3
+ from vitClassifier.pipeline.stage_02_data_transformation import DataTransformationTrainingPipeline
4
+ from vitClassifier.pipeline.stage_03_model_training import ModelTrainingPipeline
5
+ from vitClassifier.pipeline.stage_04_model_evaluation import ModelEvaluationPipeline
6
+ from dotenv import load_dotenv
7
+ load_dotenv()
8
+
9
+ def run_pipeline(stage_name, pipeline_class):
10
+ try:
11
+ logger.info(f">>>>>> stage {stage_name} started <<<<<<")
12
+ pipeline = pipeline_class()
13
+ pipeline.main()
14
+ logger.info(f">>>>>> stage {stage_name} completed <<<<<<\n\nx==========x")
15
+ except Exception as e:
16
+ logger.exception(e)
17
+ raise e
18
+
19
+ if __name__ == '__main__':
20
+ run_pipeline("Data Ingestion stage", DataIngestionTrainingPipeline)
21
+ run_pipeline("Data Transformation stage", DataTransformationTrainingPipeline)
22
+ run_pipeline("Model Training stage", ModelTrainingPipeline)
23
+ run_pipeline("Model Evaluation stage", ModelEvaluationPipeline)
params.yaml ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ LEARNING_RATE: 2.0e-5
2
+ BATCH_SIZE: 32
3
+ EPOCHS: 3
4
+ WEIGHT_DECAY: 0.01
5
+ WARMUP_STEPS: 100
6
+ RANDOM_STATE: 42
7
+ TEST_SPLIT_SIZE: 0.2
requirements.txt ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ pandas
2
+ numpy
3
+ torch
4
+ torchvision
5
+ transformers
6
+ datasets>=2.14.5
7
+ evaluate
8
+ accelerate>=0.27
9
+ mlflow
10
+ scikit-learn
11
+ imblearn
12
+ python-box
13
+ PyYAML
14
+ ensure
15
+ tqdm
16
+ pathlib
17
+ dvc
18
+ matplotlib
19
+ Pillow
20
+ kaggle
21
+ python-dotenv
research/trials.ipynb ADDED
File without changes
setup.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import setuptools
2
+
3
+ with open("README.md", "r", encoding="utf-8") as f:
4
+ long_description = f.read()
5
+
6
+ __version__ = "0.0.1"
7
+
8
+ REPO_NAME = "Chest-Xray-Pneumonia-ViT-MLflow-DVC"
9
+ AUTHOR_USER_NAME = "your-github-username" # CHANGE THIS
10
+ SRC_REPO = "vitClassifier"
11
+ AUTHOR_EMAIL = "your-email@example.com" # CHANGE THIS
12
+
13
+ setuptools.setup(
14
+ name=SRC_REPO,
15
+ version=__version__,
16
+ author=AUTHOR_USER_NAME,
17
+ author_email=AUTHOR_EMAIL,
18
+ description="An end-to-end ML project for Chest X-ray Pneumonia classification using ViT.",
19
+ long_description=long_description,
20
+ long_description_content_type="text/markdown",
21
+ url=f"https://github.com/{AUTHOR_USER_NAME}/{REPO_NAME}",
22
+ project_urls={
23
+ "Bug Tracker": f"https://github.com/{AUTHOR_USER_NAME}/{REPO_NAME}/issues",
24
+ },
25
+ package_dir={"": "src"},
26
+ packages=setuptools.find_packages(where="src")
27
+ )
src/vitClassifier/__init__.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # src/vitClassifier/__init__.py
2
+
3
+ import os
4
+ import sys
5
+ import logging
6
+
7
+ # Define the logging format
8
+ logging_str = "[%(asctime)s: %(levelname)s: %(module)s: %(message)s]"
9
+
10
+ # Define the directory for log files
11
+ log_dir = "logs"
12
+ log_filepath = os.path.join(log_dir, "running_logs.log")
13
+ os.makedirs(log_dir, exist_ok=True)
14
+
15
+ # Configure the logging
16
+ logging.basicConfig(
17
+ level=logging.INFO,
18
+ format=logging_str,
19
+ handlers=[
20
+ logging.FileHandler(log_filepath), # Log to a file
21
+ logging.StreamHandler(sys.stdout) # Also log to the console
22
+ ]
23
+ )
24
+
25
+ # Create a logger object that can be imported by other modules
26
+ logger = logging.getLogger("vitClassifierLogger")
src/vitClassifier/components/__init__.py ADDED
File without changes
src/vitClassifier/components/data_ingestion.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pandas as pd
3
+ from pathlib import Path
4
+ from vitClassifier import logger
5
+ from vitClassifier.entity.config_entity import DataIngestionConfig
6
+ import kaggle
7
+
8
+ class DataIngestion:
9
+ def __init__(self, config: DataIngestionConfig):
10
+ self.config = config
11
+
12
+ def download_dataset(self):
13
+ try:
14
+ # ... (download logic remains exactly the same)
15
+ logger.info("Authenticating with Kaggle API...")
16
+ kaggle.api.authenticate()
17
+ logger.info("Authentication successful.")
18
+
19
+ dataset_id = self.config.source_kaggle_dataset_id
20
+ download_path = self.config.unzip_dir
21
+
22
+ expected_data_folder = download_path / "chest_xray"
23
+ if expected_data_folder.exists():
24
+ logger.info(f"Dataset already exists at {expected_data_folder}. Skipping download.")
25
+ return
26
+
27
+ logger.info(f"Downloading dataset '{dataset_id}' to '{download_path}'...")
28
+ kaggle.api.dataset_download_files(
29
+ dataset=dataset_id, path=download_path, unzip=True, quiet=False
30
+ )
31
+ logger.info("Dataset downloaded and unzipped successfully.")
32
+
33
+ except Exception as e:
34
+ logger.error(f"Failed to download dataset from Kaggle. Error: {e}")
35
+ raise e
36
+
37
+ def create_dataframes(self):
38
+ """
39
+ Scans train, test, and val directories and creates separate DataFrames.
40
+ """
41
+ source_root = self.config.unzip_dir / "chest_xray"
42
+
43
+ # Helper function to create a dataframe for a given split (train/test/val)
44
+ def _create_df_for_split(split_name: str, save_path: Path):
45
+ split_path = source_root / split_name
46
+ file_names, labels = [], []
47
+
48
+ # Using .glob to find all .jpeg files in NORMAL and PNEUMONIA subfolders
49
+ for file in sorted(split_path.glob('*/*.jpeg')):
50
+ label = file.parent.name # NORMAL or PNEUMONIA
51
+ labels.append(label)
52
+ file_names.append(str(file))
53
+
54
+ df = pd.DataFrame({"image": file_names, "label": labels})
55
+ df.to_csv(save_path, index=False)
56
+ logger.info(f"Created and saved {split_name} DataFrame to {save_path}")
57
+
58
+ # Create DataFrames for each split
59
+ _create_df_for_split("train", self.config.train_df_path)
60
+ _create_df_for_split("test", self.config.test_df_path)
61
+ _create_df_for_split("val", self.config.val_df_path)
62
+
63
+ def ingest_data(self):
64
+ logger.info("Starting data ingestion process.")
65
+ self.download_dataset()
66
+ self.create_dataframes()
67
+ logger.info("Data ingestion process completed.")
src/vitClassifier/components/data_transformation.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # src/vitClassifier/components/data_transformation.py
2
+
3
+ import pandas as pd
4
+ from datasets import Dataset, Image, ClassLabel
5
+ from imblearn.over_sampling import RandomOverSampler
6
+ from vitClassifier.entity.config_entity import DataTransformationConfig
7
+ from vitClassifier import logger
8
+ # --- NEW IMPORTS ---
9
+ from transformers import ViTImageProcessor
10
+ from torchvision.transforms import (Compose, Resize, ToTensor, Normalize, RandomRotation, RandomHorizontalFlip)
11
+
12
+ class DataTransformation:
13
+ def __init__(self, config: DataTransformationConfig, random_state: int, model_name: str):
14
+ self.config = config
15
+ self.random_state = random_state
16
+ self.model_name = model_name # <-- Need model_name to load the correct processor
17
+
18
+ def transform_data(self):
19
+ # --- 1. Load DataFrames and apply Oversampling (same as before) ---
20
+ train_df = pd.read_csv(self.config.train_data_path)
21
+ test_df = pd.read_csv(self.config.test_data_path)
22
+ val_df = pd.read_csv(self.config.val_data_path)
23
+
24
+ y = train_df[['label']]
25
+ X = train_df.drop(['label'], axis=1)
26
+ ros = RandomOverSampler(random_state=self.random_state)
27
+ X_resampled, y_resampled = ros.fit_resample(X, y)
28
+ train_df_balanced = pd.concat([X_resampled, y_resampled], axis=1)
29
+
30
+ train_dataset = Dataset.from_pandas(train_df_balanced).cast_column("image", Image())
31
+ test_dataset = Dataset.from_pandas(test_df).cast_column("image", Image())
32
+ val_dataset = Dataset.from_pandas(val_df).cast_column("image", Image())
33
+
34
+ # --- 2. Label Encoding (same as before) ---
35
+ labels_list = train_df_balanced['label'].unique().tolist()
36
+ class_labels = ClassLabel(num_classes=len(labels_list), names=labels_list)
37
+
38
+ def map_label2id(example):
39
+ example['label'] = class_labels.str2int(example['label'])
40
+ return example
41
+
42
+ train_dataset = train_dataset.map(map_label2id, batched=True).cast_column('label', class_labels)
43
+ test_dataset = test_dataset.map(map_label2id, batched=True).cast_column('label', class_labels)
44
+ val_dataset = val_dataset.map(map_label2id, batched=True).cast_column('label', class_labels)
45
+
46
+ # --- 3. THE NEW LOGIC: Preprocess images with .map() ---
47
+ logger.info("Starting image preprocessing with .map(). This may take a few minutes...")
48
+ processor = ViTImageProcessor.from_pretrained(self.model_name)
49
+ image_mean, image_std = processor.image_mean, processor.image_std
50
+ size = processor.size["height"]
51
+ normalize = Normalize(mean=image_mean, std=image_std)
52
+
53
+ # Define transforms
54
+ _train_transforms = Compose([Resize((size, size)), RandomRotation(15), RandomHorizontalFlip(), ToTensor(), normalize])
55
+ _val_test_transforms = Compose([Resize((size, size)), ToTensor(), normalize])
56
+
57
+ def apply_train_transforms(examples):
58
+ examples['pixel_values'] = [_train_transforms(image.convert("RGB")) for image in examples['image']]
59
+ return examples
60
+
61
+ def apply_val_test_transforms(examples):
62
+ examples['pixel_values'] = [_val_test_transforms(image.convert("RGB")) for image in examples['image']]
63
+ return examples
64
+
65
+ # Use .map() to apply transforms and create 'pixel_values' column
66
+ train_dataset = train_dataset.map(apply_train_transforms, batched=True)
67
+ test_dataset = test_dataset.map(apply_val_test_transforms, batched=True)
68
+ val_dataset = val_dataset.map(apply_val_test_transforms, batched=True)
69
+
70
+ # Remove the original 'image' column to save space
71
+ train_dataset = train_dataset.remove_columns(['image'])
72
+ test_dataset = test_dataset.remove_columns(['image'])
73
+ val_dataset = val_dataset.remove_columns(['image'])
74
+
75
+ # --- 4. Save the fully processed datasets ---
76
+ train_dataset.save_to_disk(str(self.config.train_dataset_path))
77
+ test_dataset.save_to_disk(str(self.config.test_dataset_path))
78
+ val_dataset.save_to_disk(str(self.config.val_dataset_path))
79
+
80
+ logger.info("Data Transformation complete. Fully preprocessed datasets saved.")
src/vitClassifier/components/model_evaluation.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # src/vitClassifier/components/model_evaluation.py
2
+
3
+ import mlflow
4
+ import mlflow.pytorch
5
+ import torch
6
+ import json
7
+ from pathlib import Path
8
+ from datasets import load_from_disk
9
+ from transformers import (ViTForImageClassification, ViTImageProcessor, Trainer, TrainingArguments, DefaultDataCollator)
10
+ from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
11
+ from vitClassifier.entity.config_entity import EvaluationConfig
12
+ from vitClassifier.utils.common import read_yaml # Keep this if you need it, but it's not used here
13
+ from vitClassifier import logger
14
+
15
+ class ModelEvaluation:
16
+ def __init__(self, config: EvaluationConfig):
17
+ self.config = config
18
+
19
+ def evaluate(self):
20
+ # Determine device
21
+ device = "cuda" if torch.cuda.is_available() else "cpu"
22
+
23
+ # Load the best model from the training stage and move it to the correct device
24
+ model_path = str(self.config.path_of_model)
25
+ model = ViTForImageClassification.from_pretrained(model_path).to(device)
26
+
27
+ # Load the pre-processed test dataset
28
+ test_data = load_from_disk(str(self.config.test_dataset_path))
29
+
30
+ # We DO NOT need transforms here because the data is already processed
31
+ # test_data.set_transform(...) # REMOVED
32
+
33
+ # Use the default collator which handles 'pixel_values' and 'label'
34
+ data_collator = DefaultDataCollator()
35
+
36
+ # Dummy trainer for running predictions
37
+ eval_args = TrainingArguments(
38
+ output_dir="./eval_output", # Temporary directory
39
+ per_device_eval_batch_size=self.config.batch_size,
40
+ report_to="none"
41
+ )
42
+ trainer = Trainer(
43
+ model=model,
44
+ args=eval_args,
45
+ data_collator=data_collator
46
+ )
47
+
48
+ # --- Run Predictions ---
49
+ logger.info("Running final evaluation on the test set...")
50
+ outputs = trainer.predict(test_data)
51
+ y_true = outputs.label_ids
52
+ y_pred = outputs.predictions.argmax(1)
53
+
54
+ # --- Calculate Metrics ---
55
+ scores = {
56
+ "accuracy": accuracy_score(y_true, y_pred),
57
+ "f1_score": f1_score(y_true, y_pred, average='macro'),
58
+ "precision": precision_score(y_true, y_pred, average='macro'),
59
+ "recall": recall_score(y_true, y_pred, average='macro')
60
+ }
61
+ logger.info(f"Test Set Metrics: {scores}")
62
+
63
+ # --- Save Metrics to a JSON file ---
64
+ metrics_path = Path(self.config.metrics_file_name)
65
+
66
+ # Now create the directory
67
+ metrics_path.parent.mkdir(parents=True, exist_ok=True)
68
+
69
+ with open(metrics_path, 'w') as f:
70
+ json.dump(scores, f, indent=4)
71
+ logger.info(f"Metrics saved to {metrics_path}")
72
+
73
+ # --- Log to MLflow ---
74
+ mlflow.set_tracking_uri(self.config.mlflow_uri)
75
+ mlflow.set_experiment("Pneumonia-ViT-Classification")
76
+
77
+ with mlflow.start_run():
78
+ logger.info("Logging parameters and metrics to MLflow...")
79
+ mlflow.log_params(self.config.all_params)
80
+ mlflow.log_metrics(scores)
81
+
82
+ # --- THIS IS THE FINAL FIX ---
83
+ # Instead of logging the model object, log the directory where the
84
+ # trained model was already saved by the Trainer.
85
+ # `mlflow.log_artifact` is a simple upload and will not cause registry errors.
86
+ model_dir_path = str(self.config.path_of_model)
87
+ mlflow.log_artifact(model_dir_path, artifact_path="model")
88
+
89
+ logger.info("Successfully logged artifacts to MLflow.")
src/vitClassifier/components/model_training.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # src/vitClassifier/components/model_training.py
2
+
3
+ import torch
4
+ from datasets import load_from_disk
5
+ from transformers import (ViTImageProcessor, ViTForImageClassification, TrainingArguments, Trainer, DefaultDataCollator)
6
+ from vitClassifier.entity.config_entity import TrainingConfig
7
+ from vitClassifier import logger
8
+ import evaluate
9
+
10
class ModelTraining:
    """Fine-tunes a pretrained ViT checkpoint for image classification."""

    def __init__(self, config: TrainingConfig):
        # TrainingConfig carries dataset paths, the checkpoint name and hyperparameters.
        self.config = config

    def train(self):
        """Run fine-tuning with per-epoch validation and persist the best checkpoint."""
        # Resolve the compute device up front so it is recorded in the logs.
        device = "cuda" if torch.cuda.is_available() else "cpu"
        logger.info(f"Using device: {device}")

        # Re-hydrate the pre-processed splits written by the transformation stage.
        train_data = load_from_disk(str(self.config.train_dataset_path))
        val_data = load_from_disk(str(self.config.val_dataset_path))

        # Derive the label <-> index mappings from the dataset metadata.
        class_names = train_data.features['label'].names
        id2label = dict(enumerate(class_names))
        label2id = {name: idx for idx, name in id2label.items()}

        model = ViTForImageClassification.from_pretrained(
            self.config.model_name,
            num_labels=len(id2label),
            id2label=id2label,
            label2id=label2id,
            ignore_mismatched_sizes=True,
        )
        # Place the model on the chosen device before handing it to the Trainer.
        model.to(device)

        args = TrainingArguments(
            output_dir=str(self.config.root_dir),
            learning_rate=self.config.learning_rate,
            per_device_train_batch_size=self.config.batch_size,
            per_device_eval_batch_size=self.config.batch_size,
            num_train_epochs=self.config.epochs,
            weight_decay=self.config.weight_decay,
            warmup_steps=self.config.warmup_steps,
            save_strategy='epoch',
            eval_strategy='epoch',
            load_best_model_at_end=True,
            metric_for_best_model="accuracy",
            save_total_limit=1,
            report_to="none",
        )

        accuracy_metric = evaluate.load("accuracy")

        def compute_metrics(eval_pred):
            # eval_pred is (logits, labels); reduce logits to predicted class indices.
            logits, labels = eval_pred
            return accuracy_metric.compute(
                predictions=logits.argmax(axis=1), references=labels
            )

        trainer = Trainer(
            model,  # already moved to the target device above
            args,
            train_dataset=train_data,
            eval_dataset=val_data,
            data_collator=DefaultDataCollator(),
            compute_metrics=compute_metrics,
            tokenizer=ViTImageProcessor.from_pretrained(self.config.model_name),
        )

        logger.info("Starting model fine-tuning with validation...")
        trainer.train()
        trainer.save_model(str(self.config.trained_model_path))
        logger.info("Model fine-tuning complete and best model saved.")
src/vitClassifier/config/__init__.py ADDED
File without changes
src/vitClassifier/config/configuration.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # src/vitClassifier/config/configuration.py
2
+
3
+ from vitClassifier.constants import CONFIG_FILE_PATH, PARAMS_FILE_PATH # <-- THIS IMPORT IS THE FIX
4
+ from vitClassifier.utils.common import read_yaml, create_directories
5
+ from vitClassifier.entity.config_entity import (DataIngestionConfig,
6
+ DataTransformationConfig,
7
+ TrainingConfig,
8
+ EvaluationConfig)
9
+ from pathlib import Path
10
+ import os
11
+
12
class ConfigurationManager:
    """Reads the config/params YAML files and builds per-stage config entities."""

    def __init__(self, config_filepath=None, params_filepath=None):
        # Fall back to the project-wide constant paths when none are supplied.
        self.config = read_yaml(CONFIG_FILE_PATH if config_filepath is None else config_filepath)
        self.params = read_yaml(PARAMS_FILE_PATH if params_filepath is None else params_filepath)
        create_directories([self.config.artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Build the data-ingestion stage config and ensure its root dir exists."""
        section = self.config.data_ingestion
        create_directories([section.root_dir])
        return DataIngestionConfig(
            root_dir=Path(section.root_dir),
            source_kaggle_dataset_id=section.source_kaggle_dataset_id,
            unzip_dir=Path(section.unzip_dir),
            train_df_path=Path(section.train_df_path),
            test_df_path=Path(section.test_df_path),
            val_df_path=Path(section.val_df_path),
        )

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the data-transformation stage config and ensure its root dir exists."""
        section = self.config.data_transformation
        create_directories([section.root_dir])
        return DataTransformationConfig(
            root_dir=Path(section.root_dir),
            train_data_path=Path(section.train_data_path),
            test_data_path=Path(section.test_data_path),
            val_data_path=Path(section.val_data_path),
            train_dataset_path=Path(section.train_dataset_path),
            test_dataset_path=Path(section.test_dataset_path),
            val_dataset_path=Path(section.val_dataset_path),
        )

    def get_training_config(self) -> TrainingConfig:
        """Combine the model_training config section with hyperparameters from params.yaml."""
        section = self.config.model_training
        hp = self.params
        create_directories([Path(section.root_dir)])
        return TrainingConfig(
            root_dir=Path(section.root_dir),
            trained_model_path=Path(section.trained_model_path),
            model_name=section.model_name,
            train_dataset_path=Path(section.train_dataset_path),
            val_dataset_path=Path(section.val_dataset_path),
            learning_rate=hp.LEARNING_RATE,
            batch_size=hp.BATCH_SIZE,
            epochs=hp.EPOCHS,
            weight_decay=hp.WEIGHT_DECAY,
            warmup_steps=hp.WARMUP_STEPS,
        )

    def get_evaluation_config(self) -> EvaluationConfig:
        """Build the model-evaluation stage config (includes MLflow settings)."""
        section = self.config.model_evaluation
        return EvaluationConfig(
            path_of_model=Path(section.model_path),
            test_dataset_path=Path(section.test_dataset_path),
            mlflow_uri=section.mlflow_uri,
            all_params=self.params,
            batch_size=self.params.BATCH_SIZE,
            metrics_file_name=Path(section.metrics_file_name),
        )
src/vitClassifier/constants/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
from pathlib import Path

# Project-relative locations of the two pipeline configuration files.
# CONFIG_FILE_PATH holds per-stage paths/settings; PARAMS_FILE_PATH holds
# the training hyperparameters (learning rate, batch size, epochs, ...).
CONFIG_FILE_PATH = Path("config/config.yaml")
PARAMS_FILE_PATH = Path("params.yaml")
src/vitClassifier/entity/__init__.py ADDED
File without changes
src/vitClassifier/entity/config_entity.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataIngestionConfig:
    """Immutable settings for the data-ingestion pipeline stage."""
    root_dir: Path                   # working directory for this stage
    source_kaggle_dataset_id: str    # Kaggle dataset identifier to download
    unzip_dir: Path                  # extraction target for the downloaded archive
    train_df_path: Path              # output path for the train split dataframe
    test_df_path: Path               # output path for the test split dataframe
    val_df_path: Path                # output path for the validation split dataframe

@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings for the data-transformation pipeline stage."""
    root_dir: Path                   # working directory for this stage
    train_data_path: Path            # input train split (from ingestion)
    test_data_path: Path             # input test split (from ingestion)
    val_data_path: Path              # input validation split (from ingestion)
    train_dataset_path: Path         # output processed train dataset
    test_dataset_path: Path          # output processed test dataset
    val_dataset_path: Path           # output processed validation dataset

@dataclass(frozen=True)
class TrainingConfig:
    """Immutable settings and hyperparameters for the model-training stage."""
    root_dir: Path                   # Trainer output/checkpoint directory
    trained_model_path: Path         # where the best fine-tuned model is saved
    model_name: str                  # pretrained checkpoint identifier
    train_dataset_path: Path         # processed train dataset (from transformation)
    val_dataset_path: Path           # processed validation dataset (from transformation)
    learning_rate: float
    batch_size: int
    epochs: int
    weight_decay: float
    warmup_steps: int

@dataclass(frozen=True)
class EvaluationConfig:
    """Immutable settings for the model-evaluation stage (incl. MLflow)."""
    path_of_model: Path              # directory of the trained model to evaluate
    test_dataset_path: Path          # processed test dataset to score
    mlflow_uri: str                  # MLflow tracking server URI
    all_params: dict                 # full params.yaml contents, logged to MLflow
    batch_size: int
    metrics_file_name: Path          # JSON file the computed metrics are written to
src/vitClassifier/pipeline/__init__.py ADDED
File without changes
src/vitClassifier/pipeline/stage_01_data_ingestion.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # src/vitClassifier/pipeline/stage_01_data_ingestion.py
2
+
3
+ from vitClassifier.config.configuration import ConfigurationManager
4
+ from vitClassifier.components.data_ingestion import DataIngestion
5
+ from vitClassifier import logger
6
+
7
STAGE_NAME = "Data Ingestion stage"

class DataIngestionTrainingPipeline:
    """Driver for the data-ingestion stage of the pipeline."""

    def __init__(self):
        pass

    def main(self):
        """Wire up configuration and run the ingestion component."""
        manager = ConfigurationManager()
        ingestion_cfg = manager.get_data_ingestion_config()
        DataIngestion(config=ingestion_cfg).ingest_data()

# Allow the stage to be executed directly as a script.
if __name__ == '__main__':
    try:
        logger.info(f">>>>>> stage {STAGE_NAME} started <<<<<<")
        DataIngestionTrainingPipeline().main()
        logger.info(f">>>>>> stage {STAGE_NAME} completed <<<<<<\n\nx==========x")
    except Exception as e:
        logger.exception(e)
        raise e
src/vitClassifier/pipeline/stage_02_data_transformation.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from vitClassifier.config.configuration import ConfigurationManager
2
+ from vitClassifier.components.data_transformation import DataTransformation
3
+ from vitClassifier import logger
4
+
5
STAGE_NAME = "Data Transformation stage"

class DataTransformationTrainingPipeline:
    """Driver for the data-transformation stage of the pipeline."""

    def __init__(self):
        pass

    def main(self):
        """Build the transformation component from config/params and run it."""
        manager = ConfigurationManager()
        transformation = DataTransformation(
            config=manager.get_data_transformation_config(),
            random_state=manager.params.RANDOM_STATE,
            # The processor checkpoint name lives in the training section of the config.
            model_name=manager.config.model_training.model_name,
        )
        transformation.transform_data()

# Allow the stage to be executed directly as a script.
if __name__ == '__main__':
    try:
        logger.info(f">>>>>> stage {STAGE_NAME} started <<<<<<")
        DataTransformationTrainingPipeline().main()
        logger.info(f">>>>>> stage {STAGE_NAME} completed <<<<<<\n\nx==========x")
    except Exception as e:
        logger.exception(e)
        raise e
src/vitClassifier/pipeline/stage_03_model_training.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from vitClassifier.config.configuration import ConfigurationManager
2
+ from vitClassifier.components.model_training import ModelTraining
3
+ from vitClassifier import logger
4
+
5
STAGE_NAME = "Model Training stage"

class ModelTrainingPipeline:
    """Driver for the model-training stage of the pipeline."""

    def __init__(self):
        pass

    def main(self):
        """Build the training component from config and run fine-tuning."""
        manager = ConfigurationManager()
        training_cfg = manager.get_training_config()
        ModelTraining(config=training_cfg).train()

# Allow the stage to be executed directly as a script.
if __name__ == '__main__':
    try:
        logger.info(f">>>>>> stage {STAGE_NAME} started <<<<<<")
        ModelTrainingPipeline().main()
        logger.info(f">>>>>> stage {STAGE_NAME} completed <<<<<<\n\nx==========x")
    except Exception as e:
        logger.exception(e)
        raise e
src/vitClassifier/pipeline/stage_04_model_evaluation.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from vitClassifier.config.configuration import ConfigurationManager
2
+ from vitClassifier.components.model_evaluation import ModelEvaluation
3
+ from vitClassifier import logger
4
+ from dotenv import load_dotenv
5
+ load_dotenv()
6
+
7
STAGE_NAME = "Model Evaluation stage"

class ModelEvaluationPipeline:
    """Driver for the model-evaluation stage of the pipeline."""

    def __init__(self):
        pass

    def main(self):
        """Build the evaluation config and score the trained model."""
        config = ConfigurationManager()
        eval_config = config.get_evaluation_config()
        evaluation = ModelEvaluation(config=eval_config)
        evaluation.evaluate()

# Allow the stage to be executed directly as a script.
if __name__ == '__main__':
    try:
        logger.info(f">>>>>> stage {STAGE_NAME} started <<<<<<")
        obj = ModelEvaluationPipeline()
        obj.main()
        # Fixed: this banner previously read ">>>see stage ..." — now it matches
        # the ">>>>>> stage ... <<<<<<" format used by every other pipeline stage.
        logger.info(f">>>>>> stage {STAGE_NAME} completed <<<<<<\n\nx==========x")
    except Exception as e:
        logger.exception(e)
        raise e
src/vitClassifier/utils/__init__.py ADDED
File without changes
src/vitClassifier/utils/common.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import yaml
3
+ import json
4
+ from box import ConfigBox
5
+ from box.exceptions import BoxValueError
6
+ from ensure import ensure_annotations
7
+ from pathlib import Path
8
+ from typing import Any
9
+ from vitClassifier import logger
10
+
11
@ensure_annotations
def read_yaml(path_to_yaml: Path) -> ConfigBox:
    """Load a YAML file and return its contents as a ConfigBox.

    Args:
        path_to_yaml: Path to the YAML file to read.

    Returns:
        ConfigBox wrapping the parsed content (attribute-style access).

    Raises:
        ValueError: If the YAML file is empty.
    """
    try:
        with open(path_to_yaml) as yaml_file:
            content = yaml.safe_load(yaml_file)
            logger.info(f"yaml file: {path_to_yaml} loaded successfully")
            return ConfigBox(content)
    except BoxValueError:
        # ConfigBox raises BoxValueError when handed the None that
        # yaml.safe_load returns for an empty file.
        raise ValueError("yaml file is empty")
    # Fixed: removed the redundant `except Exception as e: raise e` clause —
    # it re-raised unchanged and only muddied the traceback; other exceptions
    # now propagate naturally.
22
+
23
@ensure_annotations
def create_directories(path_to_directories: list, verbose=True):
    """Create each directory in the list; existing directories are left untouched.

    Args:
        path_to_directories: Directory paths to create.
        verbose: When True, log each created directory.
    """
    for directory in path_to_directories:
        os.makedirs(directory, exist_ok=True)
        if verbose:
            logger.info(f"created directory at: {directory}")
template.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Project scaffolding script: creates the skeleton of files and directories
# that every fresh checkout of the project should contain.
import os
from pathlib import Path
import logging

logging.basicConfig(level=logging.INFO, format='[%(asctime)s]: %(message)s:')

project_name = "vitClassifier"

# Every path listed here is created (empty) if it does not already exist.
list_of_files = [
    ".github/workflows/.gitkeep",
    f"src/{project_name}/__init__.py",
    f"src/{project_name}/components/__init__.py",
    f"src/{project_name}/utils/__init__.py",
    f"src/{project_name}/config/__init__.py",
    f"src/{project_name}/config/configuration.py",
    f"src/{project_name}/pipeline/__init__.py",
    f"src/{project_name}/entity/__init__.py",
    f"src/{project_name}/constants/__init__.py",
    "config/config.yaml",
    "dvc.yaml",
    "params.yaml",
    "requirements.txt",
    "setup.py",
    "research/trials.ipynb"
]

for filepath in list_of_files:
    filepath = Path(filepath)
    filedir, filename = os.path.split(filepath)

    if filedir != "":
        os.makedirs(filedir, exist_ok=True)
        # Fixed: this log message had lost its {filename} placeholder.
        logging.info(f"Creating directory: {filedir} for the file: {filename}")

    # Only create the file when it is missing or empty, so existing work
    # is never clobbered by re-running the script.
    if (not os.path.exists(filepath)) or (os.path.getsize(filepath) == 0):
        with open(filepath, "w") as f:
            pass
        logging.info(f"Creating empty file: {filepath}")
    else:
        # Fixed: message previously read "(unknown) is already exists".
        logging.info(f"{filename} already exists")