Upload folder using huggingface_hub
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +9 -0
- .gitignore +208 -0
- .python_version +1 -0
- CHANGELOG.md +789 -0
- LICENSE +201 -0
- README.md +108 -0
- airflow/airflow-webserver.pid +1 -0
- airflow/airflow.cfg +2498 -0
- airflow/airflow.db +3 -0
- airflow/dags/new6.py +147 -0
- airflow/webserver_config.py +132 -0
- analytics/BTCUSDT_report.pdf +0 -0
- ckpts/.gitignore +2 -0
- ckpts/model_2025-10-28-11-33-51-(+07).h5 +3 -0
- ckpts/scaler_2025-10-28-11-33-51-(+07).pkl +3 -0
- components/__init__.py +0 -0
- components/btcusdt_ingest_data.py +157 -0
- components/datalake_cr.py +44 -0
- components/delete_lstm_predict.py +56 -0
- components/delete_lstm_train.py +42 -0
- components/delete_model.py +10 -0
- components/duckdb2csv.py +22 -0
- components/duckdb_api.py +68 -0
- components/model/__init__.py +0 -0
- components/model/data_utils.py +90 -0
- components/model/evaluation.py +239 -0
- components/model/model_utils.py +294 -0
- components/model/old_model_utils.py +189 -0
- components/model/training.py +194 -0
- components/old-process_data.py +111 -0
- components/process_data.py +150 -0
- components/utils/__init__.py +0 -0
- components/utils/file_utils.py +105 -0
- components/utils/utils.py +26 -0
- configs/data_limit.yml +4 -0
- configs/data_sources.yml +2 -0
- configs/delete_lstm_hyperparams.yml +17 -0
- configs/extract_data.yml +4 -0
- configs/model_config.yml +41 -0
- configs/pipeline_config.yml +8 -0
- docs/data_sources.md +26 -0
- docs/dependencies.md +22 -0
- docs/frameworks_installation.md +72 -0
- docs/install_airflow.md +121 -0
- docs/install_minio_server.md +59 -0
- docs/install_spark.md +27 -0
- docs/visualize_data.md +16 -0
- duckdb_databases/financial_data.db +3 -0
- evaluation/.gitignore +1 -0
- logs/.gitkeep +0 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,12 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
airflow/airflow.db filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
duckdb_databases/financial_data.db filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
minio filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
temp/BTCUSDT-1s-2025-08.csv filter=lfs diff=lfs merge=lfs -text
|
| 40 |
+
temp/BTCUSDT-1s-2025-09.csv filter=lfs diff=lfs merge=lfs -text
|
| 41 |
+
temp/temp_parquet_chunks/.part-00000-5d1f072f-086f-4a7a-8c65-5cbc6839e5b5-c000.snappy.parquet.crc filter=lfs diff=lfs merge=lfs -text
|
| 42 |
+
temp/temp_parquet_chunks/.part-00002-5d1f072f-086f-4a7a-8c65-5cbc6839e5b5-c000.snappy.parquet.crc filter=lfs diff=lfs merge=lfs -text
|
| 43 |
+
temp/temp_parquet_chunks/.part-00003-5d1f072f-086f-4a7a-8c65-5cbc6839e5b5-c000.snappy.parquet.crc filter=lfs diff=lfs merge=lfs -text
|
| 44 |
+
zrok filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
|
@@ -0,0 +1,208 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Byte-compiled / optimized / DLL files
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[codz]
|
| 4 |
+
*$py.class
|
| 5 |
+
|
| 6 |
+
# C extensions
|
| 7 |
+
*.so
|
| 8 |
+
|
| 9 |
+
# Distribution / packaging
|
| 10 |
+
.Python
|
| 11 |
+
build/
|
| 12 |
+
develop-eggs/
|
| 13 |
+
dist/
|
| 14 |
+
downloads/
|
| 15 |
+
eggs/
|
| 16 |
+
.eggs/
|
| 17 |
+
lib/
|
| 18 |
+
lib64/
|
| 19 |
+
parts/
|
| 20 |
+
sdist/
|
| 21 |
+
var/
|
| 22 |
+
wheels/
|
| 23 |
+
share/python-wheels/
|
| 24 |
+
*.egg-info/
|
| 25 |
+
.installed.cfg
|
| 26 |
+
*.egg
|
| 27 |
+
MANIFEST
|
| 28 |
+
|
| 29 |
+
# PyInstaller
|
| 30 |
+
# Usually these files are written by a python script from a template
|
| 31 |
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
| 32 |
+
*.manifest
|
| 33 |
+
*.spec
|
| 34 |
+
|
| 35 |
+
# Installer logs
|
| 36 |
+
pip-log.txt
|
| 37 |
+
pip-delete-this-directory.txt
|
| 38 |
+
|
| 39 |
+
# Unit test / coverage reports
|
| 40 |
+
htmlcov/
|
| 41 |
+
.tox/
|
| 42 |
+
.nox/
|
| 43 |
+
.coverage
|
| 44 |
+
.coverage.*
|
| 45 |
+
.cache
|
| 46 |
+
nosetests.xml
|
| 47 |
+
coverage.xml
|
| 48 |
+
*.cover
|
| 49 |
+
*.py.cover
|
| 50 |
+
.hypothesis/
|
| 51 |
+
.pytest_cache/
|
| 52 |
+
cover/
|
| 53 |
+
|
| 54 |
+
# Translations
|
| 55 |
+
*.mo
|
| 56 |
+
*.pot
|
| 57 |
+
|
| 58 |
+
# Django stuff:
|
| 59 |
+
*.log
|
| 60 |
+
local_settings.py
|
| 61 |
+
db.sqlite3
|
| 62 |
+
db.sqlite3-journal
|
| 63 |
+
|
| 64 |
+
# Flask stuff:
|
| 65 |
+
instance/
|
| 66 |
+
.webassets-cache
|
| 67 |
+
|
| 68 |
+
# Scrapy stuff:
|
| 69 |
+
.scrapy
|
| 70 |
+
|
| 71 |
+
# Sphinx documentation
|
| 72 |
+
docs/_build/
|
| 73 |
+
|
| 74 |
+
# PyBuilder
|
| 75 |
+
.pybuilder/
|
| 76 |
+
target/
|
| 77 |
+
|
| 78 |
+
# Jupyter Notebook
|
| 79 |
+
.ipynb_checkpoints
|
| 80 |
+
|
| 81 |
+
# IPython
|
| 82 |
+
profile_default/
|
| 83 |
+
ipython_config.py
|
| 84 |
+
|
| 85 |
+
# pyenv
|
| 86 |
+
# For a library or package, you might want to ignore these files since the code is
|
| 87 |
+
# intended to run in multiple environments; otherwise, check them in:
|
| 88 |
+
# .python-version
|
| 89 |
+
|
| 90 |
+
# pipenv
|
| 91 |
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
| 92 |
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
| 93 |
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
| 94 |
+
# install all needed dependencies.
|
| 95 |
+
#Pipfile.lock
|
| 96 |
+
|
| 97 |
+
# UV
|
| 98 |
+
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
|
| 99 |
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
| 100 |
+
# commonly ignored for libraries.
|
| 101 |
+
#uv.lock
|
| 102 |
+
|
| 103 |
+
# poetry
|
| 104 |
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
| 105 |
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
| 106 |
+
# commonly ignored for libraries.
|
| 107 |
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
| 108 |
+
#poetry.lock
|
| 109 |
+
#poetry.toml
|
| 110 |
+
|
| 111 |
+
# pdm
|
| 112 |
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
| 113 |
+
# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
|
| 114 |
+
# https://pdm-project.org/en/latest/usage/project/#working-with-version-control
|
| 115 |
+
#pdm.lock
|
| 116 |
+
#pdm.toml
|
| 117 |
+
.pdm-python
|
| 118 |
+
.pdm-build/
|
| 119 |
+
|
| 120 |
+
# pixi
|
| 121 |
+
# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
|
| 122 |
+
#pixi.lock
|
| 123 |
+
# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
|
| 124 |
+
# in the .venv directory. It is recommended not to include this directory in version control.
|
| 125 |
+
.pixi
|
| 126 |
+
|
| 127 |
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
| 128 |
+
__pypackages__/
|
| 129 |
+
|
| 130 |
+
# Celery stuff
|
| 131 |
+
celerybeat-schedule
|
| 132 |
+
celerybeat.pid
|
| 133 |
+
|
| 134 |
+
# SageMath parsed files
|
| 135 |
+
*.sage.py
|
| 136 |
+
|
| 137 |
+
# Environments
|
| 138 |
+
.env
|
| 139 |
+
*.env
|
| 140 |
+
.envrc
|
| 141 |
+
.venv
|
| 142 |
+
env/
|
| 143 |
+
venv/
|
| 144 |
+
ENV/
|
| 145 |
+
env.bak/
|
| 146 |
+
venv.bak/
|
| 147 |
+
|
| 148 |
+
# Spyder project settings
|
| 149 |
+
.spyderproject
|
| 150 |
+
.spyproject
|
| 151 |
+
|
| 152 |
+
# Rope project settings
|
| 153 |
+
.ropeproject
|
| 154 |
+
|
| 155 |
+
# mkdocs documentation
|
| 156 |
+
/site
|
| 157 |
+
|
| 158 |
+
# mypy
|
| 159 |
+
.mypy_cache/
|
| 160 |
+
.dmypy.json
|
| 161 |
+
dmypy.json
|
| 162 |
+
|
| 163 |
+
# Pyre type checker
|
| 164 |
+
.pyre/
|
| 165 |
+
|
| 166 |
+
# pytype static type analyzer
|
| 167 |
+
.pytype/
|
| 168 |
+
|
| 169 |
+
# Cython debug symbols
|
| 170 |
+
cython_debug/
|
| 171 |
+
|
| 172 |
+
# PyCharm
|
| 173 |
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
| 174 |
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
| 175 |
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
| 176 |
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
| 177 |
+
#.idea/
|
| 178 |
+
|
| 179 |
+
# Abstra
|
| 180 |
+
# Abstra is an AI-powered process automation framework.
|
| 181 |
+
# Ignore directories containing user credentials, local state, and settings.
|
| 182 |
+
# Learn more at https://abstra.io/docs
|
| 183 |
+
.abstra/
|
| 184 |
+
|
| 185 |
+
# Visual Studio Code
|
| 186 |
+
# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
|
| 187 |
+
# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
|
| 188 |
+
# and can be added to the global gitignore or merged into this file. However, if you prefer,
|
| 189 |
+
# you could uncomment the following to ignore the entire vscode folder
|
| 190 |
+
# .vscode/
|
| 191 |
+
|
| 192 |
+
# Ruff stuff:
|
| 193 |
+
.ruff_cache/
|
| 194 |
+
|
| 195 |
+
# PyPI configuration file
|
| 196 |
+
.pypirc
|
| 197 |
+
|
| 198 |
+
# Cursor
|
| 199 |
+
# Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
|
| 200 |
+
# exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
|
| 201 |
+
# refer to https://docs.cursor.com/context/ignore-files
|
| 202 |
+
.cursorignore
|
| 203 |
+
.cursorindexingignore
|
| 204 |
+
|
| 205 |
+
# Marimo
|
| 206 |
+
marimo/_static/
|
| 207 |
+
marimo/_lsp/
|
| 208 |
+
__marimo__/
|
.python_version
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
3.12
|
CHANGELOG.md
ADDED
|
@@ -0,0 +1,789 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# CHANGELOG
|
| 2 |
+
|
| 3 |
+
## v1.1.9
|
| 4 |
+
|
| 5 |
+
CHANGE: The `publicProxy` now supports "striped session cookies" to support larger authentication payloads when working with OIDC providers that use larger tokens/payloads. (https://github.com/openziti/zrok/issues/1101)
|
| 6 |
+
|
| 7 |
+
FIX: Fix for icon/favicon in HTML for the api console. (https://github.com/openziti/zrok/pull/1094)
|
| 8 |
+
|
| 9 |
+
## v1.1.8
|
| 10 |
+
|
| 11 |
+
CHANGE: The `ContextDialer` in `agent.Controller.NewClient` now utilizes the "two-phase" approach to invoking `Dial` on the remote agent service. First, it attempts to dial the service with the current service list state. If that fails, it will call `RefreshService` to update the service list and does a second `Dial`. If the second `Dial` fails, then the connection fails. This works around service list staleness issues. (https://github.com/openziti/zrok/issues/1090)
|
| 12 |
+
|
| 13 |
+
CHANGE: `github.com/openziti/sdk-golang` updated to `v1.2.8`.
|
| 14 |
+
|
| 15 |
+
CHANGE: CI pull requests now trigger a native-architecture Windows build.
|
| 16 |
+
|
| 17 |
+
## v1.1.7
|
| 18 |
+
|
| 19 |
+
FIX: Missing import for windows-specific build.
|
| 20 |
+
|
| 21 |
+
## v1.1.6
|
| 22 |
+
|
| 23 |
+
FEATURE: The `agent.Agent` now can optionally enforce that agent remoting starts successfully when creating a new instance. The `agent.Config` struct has a new `RequireRemoting` boolean to control this behavior. (https://github.com/openziti/zrok/issues/1085)
|
| 24 |
+
|
| 25 |
+
CHANGE: Additional diagnostic logging in the zrok Agent; details around sub-process executions, etc. (https://github.com/openziti/zrok/issues/1084)
|
| 26 |
+
|
| 27 |
+
## v1.1.5
|
| 28 |
+
|
| 29 |
+
CHANGE: Upgraded go toolchain to `v1.24.6`. (https://github.com/openziti/zrok/issues/1072)
|
| 30 |
+
|
| 31 |
+
## v1.1.4
|
| 32 |
+
|
| 33 |
+
CHANGE: Update `github.com/caddyserver/caddy/v2` to `v2.9.1`; fixes CVE-2024-53259 (would only potentially effect users using the QUIC protocol, very atypical) (https://github.com/openziti/zrok/issues/1047)
|
| 34 |
+
|
| 35 |
+
## v1.1.3
|
| 36 |
+
|
| 37 |
+
FEATURE: A new `compatibility` > `version_patterns` array is included in the controller configuration, allowing for dynamic adjustment of allowed client version strings (https://github.com/openziti/zrok/issues/1030)
|
| 38 |
+
|
| 39 |
+
FEATURE: A new `compatibility` > `log_version` boolean is included in the controller configuration. When this boolean is set to `true`, the controller will log all client versions provided for compatibility checking.
|
| 40 |
+
|
| 41 |
+
CHANGE: Update `github.com/openziti/sdk-golang` to `v1.2.3`
|
| 42 |
+
|
| 43 |
+
CHANGE: Minor vulnerability packages updated in `ui` and `agent/agentUi`
|
| 44 |
+
|
| 45 |
+
FIX: The `scope` field of the metrics returned from `/metrics/environment/...` is now properly set as `environment` and the from `/metrics/share/...` is now properly set as `share` (https://github.com/openziti/zrok/issues/1031)
|
| 46 |
+
|
| 47 |
+
## v1.1.2
|
| 48 |
+
|
| 49 |
+
FIX: A panic happened in the `publicProxy` implementation when no `oauth` config block is present (https://github.com/openziti/zrok/issues/1032)
|
| 50 |
+
|
| 51 |
+
## v1.1.1
|
| 52 |
+
|
| 53 |
+
FIX: Masquerade as `v1.0-v1.1.1 [gitHash]` when performing client version checks. Will be replaced with the usual client identifier in `v1.1.2` when the regular expressions for controlling client compatibility are externalized in the controller config (https://github.com/openziti/zrok/issues/1028)
|
| 54 |
+
|
| 55 |
+
## v1.1.0
|
| 56 |
+
|
| 57 |
+
FEATURE: Rewritten and improved `publicProxy` package (`zrok access public`), with support for extensible OAuth-compliant identity providers. The `publicProxy` configuration now supports any number of configured OAuth-compliant providers (rather than just a single `google` provider and/or a single `github` provider). Also includes a new OIDC-compliant generic IDP provider integration. Improvements to authentication flows and security all around. See the [updated guide](https://docs.zrok.io/docs/guides/self-hosting/oauth/configuring-oauth/) on using OAuth-based identity providers with the zrok public frontend (https://github.com/openziti/zrok/issues/968)
|
| 58 |
+
|
| 59 |
+
FEATURE: Templatized and improved static pages (not found/404, unauthorized/401, health check, etc.) used by the public frontend. Consolidated variable data using golang `text/template` so that static `proxyUi` package can display additional error information and provide extension points for replacing all of the templated content with external files. See the [error pages guide](https://docs.zrok.io/docs/guides/self-hosting/error-pages/) for more information on customizing the built-in template (https://github.com/openziti/zrok/issues/1012)
|
| 60 |
+
|
| 61 |
+
FEATURE: `zrok access private` now includes a `--template-path` allowing the embedded `proxyUi` template to be replaced with an external HTML file (https://github.com/openziti/zrok/issues/1012)
|
| 62 |
+
|
| 63 |
+
FIX: Invoking `/agent/*` endpoints to remotely manage agents with remoting was causing a new API session to be allocated in the ziti controller for each request. A slightly different strategy was employed for embedding the ziti SDK into the zrok controller that should mitigate this (https://github.com/openziti/zrok/issues/1023)
|
| 64 |
+
|
| 65 |
+
## v1.0.8
|
| 66 |
+
|
| 67 |
+
FEATURE: New opt-in configuration item `superNetwork` which enables multiple data plane connections to the OpenZiti underlay, a separate control plane connection, enabling SDK-based flow control. To opt-in use `zrok config set superNetwork true` in each environment, or set the `ZROK_SUPER_NETWORK` environment variable to `true` (https://github.com/openziti/zrok/issues/1010)
|
| 68 |
+
|
| 69 |
+
CHANGE: Updated `github.com/openziti/sdk-golang` to `v1.2.1` (https://github.com/openziti/zrok/issues/1010)
|
| 70 |
+
|
| 71 |
+
## v1.0.7
|
| 72 |
+
|
| 73 |
+
FEATURE: zrok Agent now supports health checks (against the target endpoint) for `proxy` backend shares using the `zrok agent share http-healthcheck` command. The zrok API now includes an `/agent/share/http-healthcheck` endpoint for remotely performing these checks against remoted Agents. See the guide for using the feature at https://docs.zrok.io/guides/agent/http-healthcheck/ (https://github.com/openziti/zrok/issues/1002)
|
| 74 |
+
|
| 75 |
+
FEATURE: `/overview`, `/detail/share`, `/detail/environment`, and `/overview/{organizationToken}/{accountEmail}` all adjusted to include `envZId` in share detail output (https://github.com/openziti/zrok/issues/998)
|
| 76 |
+
|
| 77 |
+
FEATURE: New add and delete API endpoints for frontend grants. New `zrok admin create frontend-grant` and `zrok admin delete frontend-grant` CLI for invoking these API endpoints from the command line (https://github.com/openziti/zrok/issues/992)
|
| 78 |
+
|
| 79 |
+
FEATURE: New admin endpoint for deleting accounts. New `zrok admin delete account` CLI for invoking the API endpoint from the command line (https://github.com/openziti/zrok/issues/993)
|
| 80 |
+
|
| 81 |
+
FEATURE: New admin endpoint for deleting identities. New `zrok admin delete identity` CLI for invoking the API endpoint from the command line (https://github.com/openziti/zrok/issues/800)
|
| 82 |
+
|
| 83 |
+
FEATURE: New API endpoint (`/overview/public-frontends`) that returns the public frontends available to authenticated account. The public frontends include those marked with the `open` permission mode, and those marked `closed` where the user has a frontend grant allowing them to access the frontend. New CLI command `zrok overview public-frontends` to allow end users to list the public frontends their account can use (https://github.com/openziti/zrok/issues/996)
|
| 84 |
+
|
| 85 |
+
CHANGE: Updated `openapi-generator-cli` from `7.12.0` to `7.14.0`
|
| 86 |
+
|
| 87 |
+
## v1.0.6
|
| 88 |
+
|
| 89 |
+
CHANGE: The `/overview` endpoint has been adjusted to include a new `remoteAgent` `boolean` on the `environment` instances, indicating whether or not the environment has an enrolled remote agent (https://github.com/openziti/zrok/issues/977)
|
| 90 |
+
|
| 91 |
+
CHANGE: Adjusted core framework entry points to support changing zrokdir, and host interrogation functions to better support embedded zrok functionality (https://github.com/openziti/zrok/issues/976)
|
| 92 |
+
|
| 93 |
+
## v1.0.5
|
| 94 |
+
|
| 95 |
+
FEATURE: Initial support for zrok Agent remoting; new `zrok agent enroll` and `zrok agent unenroll` commands that establish opt-in remote Agent management facilities on a per-environment basis. The central API has been augmented to allow for remote control (creating shares and private access instances) of these agents; see the [remoting guide](https://docs.zrok.io/docs/guides/agent/remoting) for details (https://github.com/openziti/zrok/issues/967)
|
| 96 |
+
|
| 97 |
+
CHANGE: `zrok share public`, `zrok share private`, and `zrok reserve` all default to the "closed" permission mode (they previously defaulted to the "open" permission mode). The `--closed` flag has been replaced with a new `--open` flag. See the [Permission Modes](https://docs.zrok.io/docs/guides/permission-modes/) docs for details (https://github.com/openziti/zrok/issues/971)
|
| 98 |
+
|
| 99 |
+
FIX: `zrok enable` now handles the case where the user ID does not resolve to a username when generating the default environment description (https://github.com/openziti/zrok/issues/959)
|
| 100 |
+
|
| 101 |
+
FIX: Linux packages were optimized to avoid manage file revision conflicts (https://github.com/openziti/zrok/issues/817)
|
| 102 |
+
|
| 103 |
+
## v1.0.4
|
| 104 |
+
|
| 105 |
+
FIX: `zrok admin bootstrap` and `zrok enable` functionality were broken in `v1.0.3`. A bad combination of dependencies caused issues with marshalling data from the associated controller endpoints
|
| 106 |
+
|
| 107 |
+
CHANGE: `github.com/openziti/sdk-golang` has been updated to `v1.1.0`, `github.com/openziti/ziti` has been updated to `v1.6.0`. Related dependencies and indirects also updated
|
| 108 |
+
|
| 109 |
+
CHANGE: Updated to `golang` `v1.24` as the official build toolchain
|
| 110 |
+
|
| 111 |
+
## v1.0.3
|
| 112 |
+
|
| 113 |
+
FEATURE: `zrok agent console` now outputs the URL it is attempting to open. New `zrok agent console --headless` option to only emit the agent console URL (https://github.com/openziti/zrok/issues/944)
|
| 114 |
+
|
| 115 |
+
FEATURE: New `zrok admin unbootstrap` to remove zrok resources from the underlying OpenZiti instance (https://github.com/openziti/zrok/issues/935)
|
| 116 |
+
|
| 117 |
+
FEATURE: New InfluxDB metrics capture infrastructure for `zrok test canary` framework (https://github.com/openziti/zrok/issues/948)
|
| 118 |
+
|
| 119 |
+
FEATURE: New `zrok test canary enabler` to validate `enable`/`disable` operations and gather performance metrics around how those paths are operating (https://github.com/openziti/zrok/issues/771)
|
| 120 |
+
|
| 121 |
+
FEATURE: New `zrok test canary` infrastructure capable of supporting more complex testing scenarios; now capable of streaming canary metrics into an InfluxDB repository; new programming framework for developing additional types of streaming canary metrics (https://github.com/openziti/zrok/issues/948 https://github.com/openziti/zrok/issues/954)
|
| 122 |
+
|
| 123 |
+
FEATURE: All `zrok test canary` commands that have "min" and "max" values (`--min-pacing` and `--max-pacing` for example) now include a singular version of that flag for setting both "min" and "max" to the same value (`--pacing` for example). The singular version of the flag always overrides any `--min-*` or `--max-*` values that might be set
|
| 124 |
+
|
| 125 |
+
CHANGE: New _guard_ to prevent users from running potentially dangerous `zrok test canary` commands inadvertently without understanding what they do (https://github.com/openziti/zrok/issues/947)
|
| 126 |
+
|
| 127 |
+
CHANGE: Updated `npm` dependencies for `ui`, `agent/agentUi` and `website`. Updated `github.com/openziti/sdk-golang` to `v0.24.0`
|
| 128 |
+
|
| 129 |
+
## v1.0.2
|
| 130 |
+
|
| 131 |
+
FEATURE: "Auto-rebase" for enabled environments where the `apiEndpoint` is set to `https://api.zrok.io`. This will automatically migrate existing environments to the new `apiEndpoint` for the `v1.0.x` series (https://github.com/openziti/zrok/issues/936)
|
| 132 |
+
|
| 133 |
+
FEATURE: New `admin/new_account_link` configuration option to allow the insertion of "how do I register for an account?" links into the login form (https://github.com/openziti/zrok/issues/552)
|
| 134 |
+
|
| 135 |
+
CHANGE: The release environment, share, and access modals in the API console now have a better message letting the user know they will still need to clean up their `zrok` processes (https://github.com/openziti/zrok/issues/910)
|
| 136 |
+
|
| 137 |
+
CHANGE: The openziti/zrok Docker image has been updated to use the latest version of the ziti CLI, 1.4.3 (https://github.com/openziti/zrok/pull/917)
|
| 138 |
+
|
| 139 |
+
## v1.0.1
|
| 140 |
+
|
| 141 |
+
FEATURE: The zrok Agent now persists private accesses and reserved shares between executions. Any `zrok access private` instances or `zrok share reserved` instances created using the agent are now persisted to a registry stored in `${HOME}/.zrok`. When restarting the agent these accesses and reserved shares are re-created from the data in this registry (https://github.com/openziti/zrok/pull/922)
|
| 142 |
+
|
| 143 |
+
FEATURE: zrok-agent Linux package runs the agent as a user service (https://github.com/openziti/zrok/issues/883)
|
| 144 |
+
|
| 145 |
+
CHANGE: Updated the "Getting Started" guide to be slightly more streamlined and reflect the `v1.0` changes (https://github.com/openziti/zrok/issues/877)
|
| 146 |
+
|
| 147 |
+
CHANGE: let the Docker instance set the Caddy HTTPS port (https://github.com/openziti/zrok/pull/920)
|
| 148 |
+
|
| 149 |
+
CHANGE: Add Traefik option for TLS termination in the Docker instance (https://github.com/openziti/zrok/issues/808)
|
| 150 |
+
|
| 151 |
+
## v1.0.0
|
| 152 |
+
|
| 153 |
+
MAJOR RELEASE: zrok reaches version 1.0.0!
|
| 154 |
+
|
| 155 |
+
FEATURE: Completely redesigned web interface ("API Console"). New implementation provides a dual-mode interface supporting an improved visual network navigator and also a "tabular" view, which provides a more traditional "data" view. New stack built using vite, React, and TypeScript (https://github.com/openziti/zrok/issues/724)
|
| 156 |
+
|
| 157 |
+
FEATURE: New "zrok Agent", a background manager process for your zrok environments, which allows you to easily manage and work with multiple `zrok share` and `zrok access` processes. New `--subordinate` flag added to `zrok share [public|private|reserved]` and `zrok access private` to operate in a mode that allows an Agent to manage shares and accesses (https://github.com/openziti/zrok/issues/463)
|
| 158 |
+
|
| 159 |
+
FEATURE: New "zrok Agent UI" a web-based user interface for the zrok Agent, which allows creating and releasing shares and accesses through a web browser. This is just an initial chunk of the new Agent UI, and is considered a "minimum viable" version of this interface (https://github.com/openziti/zrok/issues/221)
|
| 160 |
+
|
| 161 |
+
FEATURE: `zrok share [public|private|reserved]` and `zrok access private` now auto-detect if the zrok Agent is running in an environment and will automatically service share and access requests through the Agent, rather than in-process if the Agent is running. If the Agent is not running, operation remains as it was in `v0.4.x` and the share or access is handled in-process. New `--force-agent` and `--force-local` flags exist to skip Agent detection and manually select an operating mode (https://github.com/openziti/zrok/issues/751)
|
| 162 |
+
|
| 163 |
+
FEATURE: `zrok access private` supports a new `--auto` mode, which can automatically find an available open address/port to bind the frontend listener on. Also includes `--auto-address`, `--auto-start-port`, and `--auto-end-port` features with sensible defaults. Supported by both the agent and local operating modes (https://github.com/openziti/zrok/issues/780)
|
| 164 |
+
|
| 165 |
+
FEATURE: `zrok rebase` commands (`zrok rebase apiEndpoint` and `zrok rebase accountToken`) allows "rebasing" an enabled environment onto a different API endpoint or a different account token. This is useful for migrating already-enabled environments between endpoints supporting different zrok versions, and is also useful when regenerating an account token (https://github.com/openziti/zrok/issues/869, https://github.com/openziti/zrok/issues/897)
|
| 166 |
+
|
| 167 |
+
FEATURE: `zrok test canary` CLI tree replaces the old `zrok test loop` tree; new `zrok test canary public-proxy` and `zrok test canary private-proxy` provide modernized, updated versions of what the `zrok test loop` commands used to do. This new approach will serve as the foundation for all future zrok testing infrastructure (https://github.com/openziti/zrok/issues/771)
|
| 168 |
+
|
| 169 |
+
FEATURE: New `/api/v1/versions` endpoint to return comprehensive, full stack version information about the deployed service instance. Currently only returns a single `controllerVersion` property (https://github.com/openziti/zrok/issues/881)
|
| 170 |
+
|
| 171 |
+
CHANGE: The default API URL for `v1.0.x` zrok clients is now `https://api-v1.zrok.io` (instead of the older `https://api.zrok.io`). The zrok.io deployment will now be maintaining version-specific DNS for versioned API endpoints.
|
| 172 |
+
|
| 173 |
+
CHANGE: Refactored API implementation. Cleanup, lint removal, additional data elements added, unused data removed (https://github.com/openziti/zrok/issues/834)
|
| 174 |
+
|
| 175 |
+
CHANGE: Deprecated the `passwords` configuration stanza. The zrok controller and API console now use a hard-coded set of (what we believe to be) reasonable assumptions about password quality (https://github.com/openziti/zrok/issues/834)
|
| 176 |
+
|
| 177 |
+
CHANGE: The protocol for determining valid client versions has been changed. Previously a zrok client would do a `GET` against the `/api/v1/version` endpoint and do a local version string comparison (as a normal precondition to any API call) to see if the controller version matched. The protocol has been amended so that any out-of-date client using the old protocol will receive a version string indicating that they need to upgrade their client. New clients will do a `POST` against the `/api/v1/clientVersionCheck` endpoint, posting their client version, and the server will check for compatibility. Does not change the security posture in any significant way, but gives more flexibility on the server side for managing client compatibility. Provides a better, clearer out-of-date error message for old clients when accessing `v1.0.0`+ (https://github.com/openziti/zrok/issues/859)
|
| 178 |
+
|
| 179 |
+
CHANGE: The Node.js SDK is now generated by `openapi-generator` using the `typescript-fetch` template. Examples and SDK components updated to use the `v1.0.0` API and generated client (https://github.com/openziti/zrok/issues/893)
|
| 180 |
+
|
| 181 |
+
CHANGE: The Python SDK is now generated by `openapi-generator` and requires a newer `urllib3` version 2.1.0. The published Python module, `zrok`, inherits the dependencies of the generated packages (https://github.com/openziti/zrok/issues/894)
|
| 182 |
+
|
| 183 |
+
## v0.4.49
|
| 184 |
+
|
| 185 |
+
FIX: Release artifacts now include a reproducible source archive. The archive's download URL is now used by the Homebrew formula when building from source instead of the archive generated on-demand by GitHub (https://github.com/openziti/zrok/issues/858).
|
| 186 |
+
|
| 187 |
+
FIX: Pre-releases are no longer uploaded to the stable Linux package repo, and workflows that promote stable release artifacts to downstream distribution channels enforce semver stable release tags, i.e., not having a semver hyphenated prerelease suffix.
|
| 188 |
+
|
| 189 |
+
CHANGE: The release `checksums.txt` has been renamed `checksums.sha256.txt` to reflect the use of a collision-resistant algorithm instead of `shasum`'s default algorithm, SHA-1.
|
| 190 |
+
|
| 191 |
+
CHANGE: The dependency graph is now published as a release artifact named `sbom-{version}.spdx.json` (https://github.com/openziti/zrok/issues/888).
|
| 192 |
+
|
| 193 |
+
CHANGE: Pre-releases are uploaded to the pre-release Linux package repo and Docker Hub for testing. [RELEASING.md](./RELEASING.md) describes releaser steps and the events they trigger.
|
| 194 |
+
|
| 195 |
+
CHANGE: Linux release binaries are now built on the ziti-builder container image based on Ubuntu Focal 20.04 to preserve backward compatibility as the ubuntu-20.04 GitHub runner is end of life.
|
| 196 |
+
|
| 197 |
+
CHANGE: Container images now include SLSA and SBOM attestations, and these are also published to the Docker Hub registry (https://github.com/openziti/zrok/issues/890).
|
| 198 |
+
|
| 199 |
+
CHANGE: Release binary and text artifacts are now accompanied by provenance attestations (https://github.com/openziti/zrok/issues/889).
|
| 200 |
+
|
| 201 |
+
## v0.4.48
|
| 202 |
+
|
| 203 |
+
FEATURE: The controller configuration now supports a `disable_auto_migration` boolean in the `store` stanza. When set to `true`, the controller will not attempt to auto-migrate (or otherwise validate the migration state) of the underlying database. Leaving `disable_auto_migration` out, or setting it to false will retain the default behavior of auto-migrating when starting the zrok controller. The `zrok admin migrate` command will still perform a migration regardless of how this setting is configured in the controller configuration (https://github.com/openziti/zrok/issues/866)
|
| 204 |
+
|
| 205 |
+
FIX: the Python SDK erroneously assumed the enabled zrok environment contained a config.json file, and was changed to only load it if the file was present (https://github.com/openziti/zrok/pull/853/).
|
| 206 |
+
|
| 207 |
+
## v0.4.47
|
| 208 |
+
|
| 209 |
+
CHANGE: the Docker instance will wait for the ziti container healthy status (contribution from Ben Wong @bwong365 - https://github.com/openziti/zrok/pull/790)
|
| 210 |
+
|
| 211 |
+
CHANGE: Document solving the DNS propagation timeout for Docker instances that are using Caddy to manage the wildcard certificate.
|
| 212 |
+
|
| 213 |
+
CHANGE: Add usage hint in `zrok config get --help` to clarify how to list all valid `configName` and their current values by running `zrok status`.
|
| 214 |
+
|
| 215 |
+
CHANGE: The Python SDK's `Overview()` function was refactored as a class method (https://github.com/openziti/zrok/pull/846).
|
| 216 |
+
|
| 217 |
+
FEATURE: The Python SDK now includes a `ProxyShare` class providing an HTTP proxy for public and private shares and a
|
| 218 |
+
Jupyter notebook example (https://github.com/openziti/zrok/pull/847).
|
| 219 |
+
|
| 220 |
+
FIX: PyPi publishing was failing due to a CI issue (https://github.com/openziti/zrok/issues/849)
|
| 221 |
+
|
| 222 |
+
## v0.4.46
|
| 223 |
+
|
| 224 |
+
FEATURE: Linux service template for systemd user units (https://github.com/openziti/zrok/pull/818)
|
| 225 |
+
|
| 226 |
+
FIX: Docker share examples had incorrect default path for zrok environment mountpoint
|
| 227 |
+
|
| 228 |
+
FIX: Clarify how to use DNS providers like Route53 with the zrok Docker instance sample.
|
| 229 |
+
|
| 230 |
+
CHANGE: Use port 80 for the default Ziti API endpoint in the zrok Docker instance sample (https://github.com/openziti/zrok/issues/793).
|
| 231 |
+
|
| 232 |
+
CHANGE: Clarify OS requirements for zrok VPN
|
| 233 |
+
|
| 234 |
+
CHANGE: Set the Windows executable search path in the Windows install guide.
|
| 235 |
+
|
| 236 |
+
CHANGE: bump macOS runner for Python module from macos-12 to macos-13
|
| 237 |
+
|
| 238 |
+
## v0.4.45
|
| 239 |
+
|
| 240 |
+
FEATURE: Minimal support for "organizations". Site admin API endpoints provided to create, list, and delete "organizations". Site admin API endpoints provided to add, list, and remove "organization members" (zrok accounts) with the ability to mark accounts as a "organization admin". API endpoints provided for organization admins to list the members of their organizations, and to also see the overview (environments, shares, and accesses) for any account in their organization. API endpoint for end users to see which organizations their account is a member of (https://github.com/openziti/zrok/issues/537)
|
| 241 |
+
|
| 242 |
+
CHANGE: briefly mention the backend modes that apply to public and private share concepts
|
| 243 |
+
|
| 244 |
+
FIX: Update indirect dependency `github.com/golang-jwt/jwt/v4` to version `v4.5.1` (https://github.com/openziti/zrok/issues/794)
|
| 245 |
+
|
| 246 |
+
FIX: Document unique names
|
| 247 |
+
|
| 248 |
+
FIX: reduce Docker image sizes (https://github.com/openziti/zrok/pull/783)
|
| 249 |
+
|
| 250 |
+
FIX: Docker reserved private share startup error (https://github.com/openziti/zrok/pull/801)
|
| 251 |
+
|
| 252 |
+
FIX: Correct the download URL for the armv7 Linux release (https://github.com/openziti/zrok/issues/782)
|
| 253 |
+
|
| 254 |
+
## v0.4.44
|
| 255 |
+
|
| 256 |
+
FIX: Fix for goreleaser build action to align with changed ARM64 build path.
|
| 257 |
+
|
| 258 |
+
## v0.4.43
|
| 259 |
+
|
| 260 |
+
CHANGE: Update `github.com/openziti/sdk-golang` to version `v0.23.44`. Remove old `github.com/openziti/fabric` dependency, instead pulling in the modern `github.com/openziti/ziti` dependency.
|
| 261 |
+
|
| 262 |
+
FIX: Bypass interstitial page for HTTP `OPTIONS` method (https://github.com/openziti/zrok/issues/777)
|
| 263 |
+
|
| 264 |
+
## v0.4.42
|
| 265 |
+
|
| 266 |
+
CHANGE: Switch all `Dial` operations made into the OpenZiti overlay to use `DialWithOptions(..., &ziti.DialOptions{ConnectTimeout: 30 * time.Second})`, switching to a 30 second timeout from a 5 second default (https://github.com/openziti/zrok/issues/772)
|
| 267 |
+
|
| 268 |
+
FIX: Removed the `--basic-auth` flag from `zrok share private` as this was ignored... even if `zrok access private` honored the `ziti.proxy.v1` config to ask for basic auth, it would still be easy to write a custom SDK client that ignored the basic auth and accessed the share directly; better to remove the option than to allow confusing usage (https://github.com/openziti/zrok/issues/770)
|
| 269 |
+
|
| 270 |
+
FIX: always append common options like `--headless` and conditionally append `--verbose --insecure` if their respective env vars are set, when running in a service manager like systemd or Docker and wrapping the `zrok` command with the `zrok-share.bash` shell script (https://openziti.discourse.group/t/question-about-reserved-public-vs-temp-public-shares/3169)
|
| 271 |
+
|
| 272 |
+
FIX: Correct registration page CSS to ensure that the entire form is visible
|
| 273 |
+
|
| 274 |
+
## v0.4.41
|
| 275 |
+
|
| 276 |
+
FIX: Fixed crash when invoking `zrok share reserved` with no arguments (https://github.com/openziti/zrok/issues/740)
|
| 277 |
+
|
| 278 |
+
FIX: zrok-share.service on Linux failed to start with a private share in closed permission mode
|
| 279 |
+
|
| 280 |
+
FIX: Update `gopkg.in/go-jose/go-jose.v2` to `v2.6.3` to fix vulnerability around compressed data (https://github.com/openziti/zrok/issues/761)
|
| 281 |
+
|
| 282 |
+
## v0.4.40
|
| 283 |
+
|
| 284 |
+
FEATURE: New endpoint for synchronizing grants for an account (https://github.com/openziti/zrok/pull/744). Useful for updating the `zrok.proxy.v1` config objects containing interstitial setting when the `skip_interstitial_grants` table has been updated.
|
| 285 |
+
|
| 286 |
+
FIX: prune incorrect troubleshooting advice about listing Caddy's certificates
|
| 287 |
+
|
| 288 |
+
## v0.4.39
|
| 289 |
+
|
| 290 |
+
FEATURE: New API endpoint allowing direct creation of accounts in the zrok database. Requires an admin token (specified in the controller configuration yaml) for authentication. See the OpenAPI spec for details of the API endpoint. The `zrok admin create account` CLI was also updated to call the API endpoint, rather than directly operating on the underlying database (https://github.com/openziti/zrok/issues/734). The [Docker](https://github.com/openziti/zrok/pull/736) and [Kubernetes](https://github.com/openziti/helm-charts/pull/249) zrok instance deployments were adapted to the new CLI parameter shape.
|
| 291 |
+
|
| 292 |
+
FEATURE: Support `html_path` directive in `interstitial` stanza of public frontend configuration to support using an external HTML file for the interstitial page (https://github.com/openziti/zrok/issues/716)
|
| 293 |
+
|
| 294 |
+
FEATURE: `zrok access private` now includes a `--response-header` flag to add headers to the response for HTTP-based backends. Add flag multiple times to add multiple headers to the response. Expects `key:value` header definitions in this format: `--response-header "Access-Control-Allow-Origin: *"` (https://github.com/openziti/zrok/issues/522)
|
| 295 |
+
|
| 296 |
+
CHANGE: Update `github.com/openziti/sdk-golang` (and related dependencies) to version `v0.23.40`.
|
| 297 |
+
|
| 298 |
+
CHANGE: upgrade to ziti v1.1.7 CLI in zrok container image
|
| 299 |
+
|
| 300 |
+
## v0.4.38
|
| 301 |
+
|
| 302 |
+
FEATURE: Conditionally enable interstitial page based on `User-Agent` prefix list. See the [frontend configuration template](etc/frontend.yml) for details on the new configuration structure (https://github.com/openziti/zrok/issues/715)
|
| 303 |
+
|
| 304 |
+
CHANGE: The interstitial configuration has been modified from a simple `interstitial: <bool>` to a richer structure, but the config version has not been incremented; this feature has not been widely adopted yet. See the [frontend configuration template](etc/frontend.yml) for details on the new structure.
|
| 305 |
+
|
| 306 |
+
CHANGE: The registration page where a new user's password is set now includes a required checkbox, asking them to acknowledge the terms and conditions presented above the checkbox (https://github.com/openziti/zrok/issues/669)
|
| 307 |
+
|
| 308 |
+
FIX: The registration page where a new user's password is set now includes better styling of the error message `<div/>` to prevent the entire page from jumping when the message changes.
|
| 309 |
+
|
| 310 |
+
## v0.4.37
|
| 311 |
+
|
| 312 |
+
FIX: Fix for setting the `zrok_interstitial` cookie on Chrome-based browsers.
|
| 313 |
+
|
| 314 |
+
FIX: Fix for `store.IsAccountGrantedSkipInterstitial` to respect the `deleted` flag.
|
| 315 |
+
|
| 316 |
+
FIX: When an error occurs connecting to the proxied endpoint, the `proxy` backend should return HTTP status `502` (https://github.com/openziti/zrok/issues/703)
|
| 317 |
+
|
| 318 |
+
## v0.4.36
|
| 319 |
+
|
| 320 |
+
FEATURE: New interstitial pages that can be enabled per-frontend, and disabled per-account (https://github.com/openziti/zrok/issues/704)
|
| 321 |
+
|
| 322 |
+
CHANGE: Enable `"declaration": true` in `tsconfig.json` for Node SDK.
|
| 323 |
+
|
| 324 |
+
FIX: build 32bit build for armhf to fix [the FPE issue](https://github.com/openziti/zrok/issues/654) and [the missing link issue](https://github.com/openziti/zrok/issues/642)
|
| 325 |
+
|
| 326 |
+
CHANGE: add [cross-build instructions](./BUILD.md) (includes new snapshot build target `armel`)
|
| 327 |
+
|
| 328 |
+
## v0.4.35
|
| 329 |
+
|
| 330 |
+
FEATURE: Added import for `github.com/greenpau/caddy-security` to include that Caddy plugin to enable authentication, authorization, and credentials extensions for the `caddy` backend (https://github.com/openziti/zrok/issues/506)
|
| 331 |
+
|
| 332 |
+
FEATURE: Closed permission mode for Docker and Linux private shares
|
| 333 |
+
|
| 334 |
+
CHANGE: add example in ./etc/caddy to set X-Real-IP header to public share client IP
|
| 335 |
+
|
| 336 |
+
CHANGE: auto-update the ziti CLI version that is built in to the openziti/zrok container image
|
| 337 |
+
|
| 338 |
+
CHANGE: Docker examples set HOME to enable running CLI commands in the container
|
| 339 |
+
|
| 340 |
+
FIX: Fix for environment count inheritance when using a resource count class to override global environment count (https://github.com/openziti/zrok/issues/695)
|
| 341 |
+
|
| 342 |
+
## v0.4.34
|
| 343 |
+
|
| 344 |
+
FEATURE: Linux service support for all private share modes (contribution from Stefan Adelbert @stefanadelbert)
|
| 345 |
+
|
| 346 |
+
FIX: Fix for mixing limited and unlimited (-1) resource counts in the limits system (https://github.com/openziti/zrok/issues/680)
|
| 347 |
+
|
| 348 |
+
FIX: Fix for sending multiple warning emails when a warning is applied to an account (https://github.com/openziti/zrok/issues/685)
|
| 349 |
+
|
| 350 |
+
CHANGE: add Docker compose example for multiple share containers using the same enabled environment in [compose.override.yml](./docker/compose/zrok-public-reserved/compose.override.yml)
|
| 351 |
+
|
| 352 |
+
CHANGE: bump many GitHub Actions that were using deprecated distributions of Node.js
|
| 353 |
+
|
| 354 |
+
CHANGE: bump macOS runner for Node SDK from macos-11 to macos-12
|
| 355 |
+
|
| 356 |
+
## v0.4.33
|
| 357 |
+
|
| 358 |
+
FIX: Fix for log message in `Agent.CanAccessShare` (`"account '#%d' over frontends per share limit '%d'"`), which was not returning the correct limit value.
|
| 359 |
+
|
| 360 |
+
FIX: Properly set `permission_mode` in `frontends` when creating a private frontend using `zrok access private` (https://github.com/openziti/zrok/issues/677)
|
| 361 |
+
|
| 362 |
+
CHANGE: Updated `react-bootstrap` to version `2.10.2` (web console).
|
| 363 |
+
|
| 364 |
+
CHANGE: Updated `@mui/material` to version `5.15.18` (web console).
|
| 365 |
+
|
| 366 |
+
CHANGE: Updated `react` and `react-dom` to version `18.3.1` (web console).
|
| 367 |
+
|
| 368 |
+
CHANGE: Updated `recharts` to version `2.12.7` (web console).
|
| 369 |
+
|
| 370 |
+
CHANGE: Updated `react-router-dom` to version `6.23.1` (web console).
|
| 371 |
+
|
| 372 |
+
CHANGE: Updated `axios` to version `1.7.2` for (node SDK).
|
| 373 |
+
|
| 374 |
+
CHANGE: Updated `@openziti/ziti-sdk-nodejs` to version `0.17.0` (node SDK).
|
| 375 |
+
|
| 376 |
+
## v0.4.32
|
| 377 |
+
|
| 378 |
+
FEATURE: New permission mode support for public frontends. Open permission mode frontends are available to all users in the service instance. Closed permission mode frontends reference the new `frontend_grants` table that can be used to control which accounts are allowed to create shares using that frontend. `zrok admin create frontend` now supports `--closed` flag to create closed permission mode frontends (https://github.com/openziti/zrok/issues/539)
|
| 379 |
+
|
| 380 |
+
FEATURE: New config `defaultFrontend` that specifies the default frontend to be used for an environment. Provides the default `--frontend` for `zrok share public` and `zrok reserve public` (https://github.com/openziti/zrok/issues/663)
|
| 381 |
+
|
| 382 |
+
FEATURE: Resource count limits now include `share_frontends` to limit the number of frontends that are allowed to make connections to a share (https://github.com/openziti/zrok/issues/650)
|
| 383 |
+
|
| 384 |
+
CHANGE: The frontend selection flag used by `zrok share public` and `zrok reserve public` has been changed from `--frontends` to `--frontend`
|
| 385 |
+
|
| 386 |
+
FIX: use controller config spec v4 in the Docker instance
|
| 387 |
+
|
| 388 |
+
## v0.4.31
|
| 389 |
+
|
| 390 |
+
FEATURE: New "limits classes" limits implementation (https://github.com/openziti/zrok/issues/606). This new feature allows for extensive limits customization on a per-user basis, with fallback to the global defaults in the controller configuration.
|
| 391 |
+
|
| 392 |
+
CHANGE: The controller configuration version has been updated to version `4` (`v: 4`) to support the new limits global configuration changes (https://github.com/openziti/zrok/issues/606).
|
| 393 |
+
|
| 394 |
+
CHANGE: A new `ZROK_CTRL_CONFIG_VERSION` environment variable now exists to temporarily force the controller to assume a specific controller configuration version, regardless of what version exists in the file. This allows two different config versions to potentially be co-mingled in the same controller configuration file. Use with care (https://github.com/openziti/zrok/issues/648)
|
| 395 |
+
|
| 396 |
+
CHANGE: Log messages that said `backend proxy endpoint` were clarified to say `backend target`.
|
| 397 |
+
|
| 398 |
+
FIX: Correct the syntax for the Docker and Linux zrok-share "frontdoor" service that broke OAuth email address pattern matching.
|
| 399 |
+
|
| 400 |
+
## v0.4.30
|
| 401 |
+
|
| 402 |
+
FIX: Fix to the Node.js release process to properly support releasing on a tag.
|
| 403 |
+
|
| 404 |
+
## v0.4.29
|
| 405 |
+
|
| 406 |
+
FIX: Backed out an incorrect change to support a FreeBSD port in progress.
|
| 407 |
+
|
| 408 |
+
## v0.4.28
|
| 409 |
+
|
| 410 |
+
FEATURE: Node.js support for the zrok SDK (https://github.com/openziti/zrok/issues/400)
|
| 411 |
+
|
| 412 |
+
FEATURE: A Docker Compose project for self-hosting a zrok instance and [accompanying Docker guide](https://docs.zrok.io/docs/guides/self-hosting/docker) for more information.
|
| 413 |
+
|
| 414 |
+
CHANGE: the container images run as "ziggy" (UID 2171) instead of the generic restricted user "nobody" (UID 65534). This reduces the risk of unexpected file permissions when binding the Docker host's filesystem to a zrok container.
|
| 415 |
+
|
| 416 |
+
CHANGE: the Docker sharing guides were simplified and expanded
|
| 417 |
+
|
| 418 |
+
## v0.4.27
|
| 419 |
+
|
| 420 |
+
FEATURE: New `vpn` backend mode. Use `sudo zrok share private --backend-mode vpn` on the _VPN server_ host, then `sudo zrok access private <token>` on _VPN client_ machine. Works with reserved shares using `zrok reserve private --backend-mode vpn`. Use `<target>` parameter to override default VPN network settings `zrok share private -b vpn 192.168.255.42/24` -- server IP is `192.168.255.42` and VPN netmask will be `192.168.255.0/24`. Client IPs are assigned automatically from netmask range.
|
| 421 |
+
|
| 422 |
+
CHANGE: Update to OpenZiti SDK (`github.com/openziti/sdk-golang`) at `v0.23.22`.
|
| 423 |
+
|
| 424 |
+
CHANGE: Added indexes to `environments`, `shares`, and `frontends` tables to improve overall query performance on both PostgreSQL and Sqlite.
|
| 425 |
+
|
| 426 |
+
FIX: Also update the Python SDK to include the permission mode and access grants fields on the `ShareRequest` (https://github.com/openziti/zrok/issues/432)
|
| 427 |
+
|
| 428 |
+
FIX: Add a way to find the username on Linux when /etc/passwd and stdlib can't resolve the UID (https://github.com/openziti/zrok/issues/454)
|
| 429 |
+
|
| 430 |
+
## v0.4.26
|
| 431 |
+
|
| 432 |
+
FEATURE: New _permission modes_ available for shares. _Open permission mode_ retains the behavior of previous zrok releases and is the default setting. _Closed permission mode_ (`--closed`) only allows a share to be accessed (`zrok access`) by users who have been granted access with the `--access-grant` flag. See the documentation at (https://docs.zrok.io/docs/guides/permission-modes/) (https://github.com/openziti/zrok/issues/432)
|
| 433 |
+
|
| 434 |
+
CHANGE: The target for a `socks` share is automatically set to `socks` to improve web console display.
|
| 435 |
+
|
| 436 |
+
CHANGE: Enhancements to the look and feel of the account actions tab in the web console. Textual improvements.
|
| 437 |
+
|
| 438 |
+
FIX: The regenerate account token dialog incorrectly specified the path `${HOME}/.zrok/environments.yml`. This was corrected to be `${HOME}/.zrok/environments.json`.
|
| 439 |
+
|
| 440 |
+
FIX: Align zrok frontdoor examples and Linux package (`zrok-share`) with the new OAuth email flag `--oauth-email-address-patterns` introduced in v0.4.25.
|
| 441 |
+
|
| 442 |
+
FIX: Reloading the web console when logged in no longer sends the user back to the login page.
|
| 443 |
+
|
| 444 |
+
## v0.4.25
|
| 445 |
+
|
| 446 |
+
FEATURE: New action in the web console that allows changing the password of the logged-in account (https://github.com/openziti/zrok/issues/148)
|
| 447 |
+
|
| 448 |
+
FEATURE: The web console now supports revoking your current account token and generating a new one (https://github.com/openziti/zrok/issues/191)
|
| 449 |
+
|
| 450 |
+
CHANGE: When specifying OAuth configuration for public shares from the `zrok share public` or `zrok reserve` public commands, the flags and functionality for restricting the allowed email addresses of the authenticating users has changed. The old flag was `--oauth-email-domains`, which took a string value that needed to be contained in the user's email address. The new flag is `--oauth-email-address-patterns`, which accepts a glob-style filter, using https://github.com/gobwas/glob (https://github.com/openziti/zrok/issues/413)
|
| 451 |
+
|
| 452 |
+
CHANGE: Creating a reserved share checks for token collision and returns a more appropriate error message (https://github.com/openziti/zrok/issues/531)
|
| 453 |
+
|
| 454 |
+
CHANGE: Update UI to add a 'true' value on `reserved` boolean (https://github.com/openziti/zrok/issues/443)
|
| 455 |
+
|
| 456 |
+
CHANGE: OpenZiti SDK (github.com/openziti/sdk-golang) updated to version `v0.22.29`, which introduces changes to OpenZiti API session handling
|
| 457 |
+
|
| 458 |
+
FIX: Fixed bug where a second password reset request for any account would fail (https://github.com/openziti/zrok/issues/452)
|
| 459 |
+
|
| 460 |
+
## v0.4.24
|
| 461 |
+
|
| 462 |
+
FEATURE: New `socks` backend mode for use with private sharing. Use `zrok share private --backend-mode socks` and then `zrok access private` that share from somewhere else... very lightweight VPN-like functionality (https://github.com/openziti/zrok/issues/558)
|
| 463 |
+
|
| 464 |
+
FEATURE: New `zrok admin create account` command that allows populating accounts directly into the underlying controller database (https://github.com/openziti/zrok/issues/551)
|
| 465 |
+
|
| 466 |
+
CHANGE: The `zrok test loopback public` utility was updated to report non-`200` errors and also to ensure that the listening side of the test is fully established before starting loopback testing.
|
| 467 |
+
|
| 468 |
+
CHANGE: The OpenZiti SDK for golang (https://github.com/openziti/sdk-golang) has been updated to version `v0.22.28`
|
| 469 |
+
|
| 470 |
+
## v0.4.23
|
| 471 |
+
|
| 472 |
+
FEATURE: New CLI commands have been implemented for working with the `drive` share backend mode (part of the "zrok Drives" functionality). These commands include `zrok cp`, `zrok mkdir` `zrok mv`, `zrok ls`, and `zrok rm`. These are initial, minimal versions of these commands and very likely contain bugs and ergonomic annoyances. There is a guide available at (`docs/guides/drives.mdx`) that explains how to work with these tools in detail (https://github.com/openziti/zrok/issues/438)
|
| 473 |
+
|
| 474 |
+
FEATURE: Python SDK now has a decorator for integrating with various server side frameworks. See the `http-server` example.
|
| 475 |
+
|
| 476 |
+
FEATURE: Python SDK share and access handling now supports context management.
|
| 477 |
+
|
| 478 |
+
FEATURE: TLS for `zrok` controller and frontends. Add the `tls:` stanza to your controller configuration (see `etc/ctrl.yml`) to enable TLS support for the controller API. Add the `tls:` stanza to your frontend configuration (see `etc/frontend.yml`) to enable TLS support for frontends (be sure to check your `public` frontend template) (#24)(https://github.com/openziti/zrok/issues/24)
|
| 479 |
+
|
| 480 |
+
CHANGE: Improved OpenZiti resource cleanup resilience. Previous resource cleanup would stop when an error was encountered at any stage of the cleanup process (serps, sps, config, service). New cleanup implementation logs errors but continues to clean up anything that it can (https://github.com/openziti/zrok/issues/533)
|
| 481 |
+
|
| 482 |
+
CHANGE: Instead of setting the `ListenOptions.MaxConnections` property to `64`, use the default value of `3`. This property actually controls the number of terminators created on the underlying OpenZiti network. This property is actually getting renamed to `ListenOptions.MaxTerminators` in an upcoming release of `github.com/openziti/sdk-golang` (https://github.com/openziti/zrok/issues/535)
|
| 483 |
+
|
| 484 |
+
CHANGE: Versioning for the Python SDK has been updated to use versioneer for management.
|
| 485 |
+
|
| 486 |
+
CHANGE: Python SDK package name has been renamed to `zrok`, dropping the `-sdk` postfix. [pypi](https://pypi.org/project/zrok).
|
| 487 |
+
|
| 488 |
+
## v0.4.22
|
| 489 |
+
|
| 490 |
+
FIX: The goreleaser action was not updated to work with the latest golang build. Modified `go.mod` to comply with what goreleaser expects
|
| 491 |
+
|
| 492 |
+
## v0.4.21
|
| 493 |
+
|
| 494 |
+
FEATURE: The web console now supports deleting `zrok access` frontends (https://github.com/openziti/zrok/issues/504)
|
| 495 |
+
|
| 496 |
+
CHANGE: The web console now displays the frontend token as the label for any `zrok access` frontends throughout the user interface (https://github.com/openziti/zrok/issues/504)
|
| 497 |
+
|
| 498 |
+
CHANGE: Updated `github.com/rubenv/sql-migrate` to `v1.6.0`
|
| 499 |
+
|
| 500 |
+
CHANGE: Updated `github.com/openziti/sdk-golang` to `v0.22.6`
|
| 501 |
+
|
| 502 |
+
FIX: The migration `sqlite3/015_v0_4_19_share_unique_name_constraint.sql` has been adjusted to delete the old `shares_old` table as the last step of the migration process. Not sure exactly why, but SQLite is unhappy otherwise (https://github.com/openziti/zrok/issues/504)
|
| 503 |
+
|
| 504 |
+
FIX: Email addresses have been made case-insensitive. Please note that there is a migration included in this release (`016_v0_4_21_lowercase_email.sql`) which will attempt to ensure that all email addresses in your existing database are stored in lowercase; **if this migration fails you will need to manually remediate the duplicate account entries** (https://github.com/openziti/zrok/issues/517)
|
| 505 |
+
|
| 506 |
+
FIX: Stop sending authentication cookies to non-authenticated shares (https://github.com/openziti/zrok/issues/512)
|
| 507 |
+
|
| 508 |
+
## v0.4.20
|
| 509 |
+
|
| 510 |
+
CHANGE: OpenZiti SDK updated to `v0.21.2`. All `ziti.ListenOptions` listener options configured to use `WaitForNEstablishedListeners: 1`. When a `zrok share` client or an `sdk.Share` client are connected to an OpenZiti router that supports "listener established" events, then listen calls will not return until the listener is fully established on the OpenZiti network. Previously a `zrok share` client could report that it is fully operational and listening before the listener is fully established on the OpenZiti network; in practice this produced a very small window of time when the share would not be ready to accept requests. This change eliminates this window of time (https://github.com/openziti/zrok/issues/490)
|
| 511 |
+
|
| 512 |
+
FIX: Require the JWT in a zrok OAuth cookie to have an audience claim that matches the public share hostname. This prevents a cookie from one share from being use to log in to another share.
|
| 513 |
+
|
| 514 |
+
## v0.4.19
|
| 515 |
+
|
| 516 |
+
FEATURE: Reserved shares now support unique names ("vanity tokens"). This allows for the creation of reserved shares with identifiable names rather than generated share tokens. Includes basic support for profanity checking (https://github.com/openziti/zrok/issues/401)
|
| 517 |
+
|
| 518 |
+
CHANGE: The `publicProxy` endpoint implementation used in the `zrok access public` frontend has been updated to use the new `RefreshService(serviceName)` call instead of `RefreshServices()`. This should greatly improve the performance of requests against missing or non-responsive zrok shares (https://github.com/openziti/zrok/issues/487)
|
| 519 |
+
|
| 520 |
+
CHANGE: The Python SDK has been updated to properly support the "reserved" flag on the `ShareRequest` passed to `CreateShare`
|
| 521 |
+
|
| 522 |
+
CHANGE: Dependency updates; `github.com/openziti/sdk-golang@v0.20.145`; `github.com/caddyserver/caddy/v2@2.7.6`; indirect dependencies
|
| 523 |
+
|
| 524 |
+
## v0.4.18
|
| 525 |
+
|
| 526 |
+
FEATURE: Python SDK added. Can be found on [pypi](https://test.pypi.org/project/zrok-sdk). `pastebin` example illustrates basic SDK usage (see `sdk/python/examples/README.md` for details) (https://github.com/openziti/zrok/issues/401)
|
| 527 |
+
|
| 528 |
+
CHANGE: Moved the golang zrok sdk into `sdk/golang/sdk` to normalize location for future SDK's.
|
| 529 |
+
|
| 530 |
+
CHANGE: add restart policies to docker compose samples used by the guide docs, e.g., reserved public share should auto-start on boot, temp public share should not.
|
| 531 |
+
|
| 532 |
+
## v0.4.17
|
| 533 |
+
|
| 534 |
+
CHANGE: Replaced most in-line shell scripts in Docker Compose projects with installed scripts that are shared between the Docker and Linux service. This normalizes the operational configuration of both Docker shares and Linux service, i.e., to use the same env vars.
|
| 535 |
+
|
| 536 |
+
CHANGE: Upgrade to Docusaurus v3 for documentation.
|
| 537 |
+
|
| 538 |
+
FIX: Some Docker shares had broken env mountpoints
|
| 539 |
+
|
| 540 |
+
## v0.4.16
|
| 541 |
+
|
| 542 |
+
FEATURE: Publish Linux packages for `zrok` CLI and a systemd service for running a reserved public share (`zrok-share`).
|
| 543 |
+
|
| 544 |
+
## v0.4.15
|
| 545 |
+
|
| 546 |
+
CHANGE: Updated the code signing and notarization process for macos binaries. The previous release process used the `gon` utility to handle both code signing and notarization. Apple changed the requirements and the `gon` utility no longer properly functions as of 2023-11-01. The `goreleaser` process has been adjusted to use the `notarytool` utility that ships with XCode to sign and notarize the binary (https://github.com/openziti/zrok/issues/435)
|
| 547 |
+
|
| 548 |
+
## v0.4.14
|
| 549 |
+
|
| 550 |
+
FEATURE: `zrok` Drives "Phase 1" (`p1`) functionality included in this release. This includes new `--backend-mode drive`, which accepts a folder path as a target. A `drive` share can be mounted as a network drive on Windows, macOS, and Linux, allowing full read/write access from all applications on those systems (https://github.com/openziti/zrok/issues/218) Subsequent releases will address CLI use cases and provide further refinements to the overall approach.
|
| 551 |
+
|
| 552 |
+
FEATURE: Docker Compose project for a reserved public share in docker/compose/zrok-public-reserved/compose.yml is described in the [public share guide](https://docs.zrok.io/docs/guides/docker-share/docker_public_share_guide/).
|
| 553 |
+
|
| 554 |
+
## v0.4.13
|
| 555 |
+
|
| 556 |
+
FIX: Update to Homebrew automation to properly integrate with the latest version of the Homebrew release process.
|
| 557 |
+
|
| 558 |
+
## v0.4.12
|
| 559 |
+
|
| 560 |
+
FIX: The `zrok reserve` command was not properly recording the reserved share status of the shares that it created, preventing the `zrok release` command from properly releasing them (https://github.com/openziti/zrok/issues/427) If a user encounters reserved shares that cannot be released with the `zrok release` command, they can be deleted through the web console.
|
| 561 |
+
|
| 562 |
+
## v0.4.11
|
| 563 |
+
|
| 564 |
+
FEATURE: The `zrok reserve` command now incorporates the `--json-output|-j` flag, which outputs the reservation details as JSON, rather than as human-consumable log messages. Other commands will produce similar output in the future (https://github.com/openziti/zrok/issues/422)
|
| 565 |
+
|
| 566 |
+
FIX: Include `--oauth-provider` and associated flags for the `zrok reserve` command, allowing reserved shares to specify OAuth authentication (https://github.com/openziti/zrok/issues/421)
|
| 567 |
+
|
| 568 |
+
## v0.4.10
|
| 569 |
+
|
| 570 |
+
CHANGE: The public frontend configuration has been bumped from `v: 2` to `v: 3`. The `redirect_host`, `redirect_port` and `redirect_http_only` parameters have been removed. These three configuration options have been replaced with `bind_address`, `redirect_url` and `cookie_domain`. See the OAuth configuration guide at `docs/guides/self-hosting/oauth/configuring-oauth.md` for more details (https://github.com/openziti/zrok/issues/411)
|
| 571 |
+
|
| 572 |
+
## v0.4.9
|
| 573 |
+
|
| 574 |
+
FIX: Remove extraneous share token prepended to OAuth frontend redirect.
|
| 575 |
+
|
| 576 |
+
## v0.4.8
|
| 577 |
+
|
| 578 |
+
FEATURE: The `sdk` package now includes a `sdk.Overview` function, which returns a complete description of the account attached to the enabled environment. Useful for inventorying the deployed shares and environments (https://github.com/openziti/zrok/issues/407)
|
| 579 |
+
|
| 580 |
+
CHANGE: The `zrok access public` frontend configuration format has changed and now requires that the configuration document include a `v: 2` declaration. This frontend configuration format is now versioned and when the code updates the configuration structure, you will receive an error message at startup, provoking you to look into updating your configuration (https://github.com/openziti/zrok/issues/406)
|
| 581 |
+
|
| 582 |
+
CHANGE: The title color of the header was changed from white to fluorescent green, to better match the overall branding
|
| 583 |
+
|
| 584 |
+
CHANGE: Tweaks to build and release process for logging and deprecations. Pin golang version at 1.21.3+ and node version at 18.x across all platforms
|
| 585 |
+
|
| 586 |
+
CHANGE: Improvements to email invitation sent in response to `zrok invite` to correct broken links, some minor HTML issues and improve overall deliverability (https://github.com/openziti/zrok/issues/405)
|
| 587 |
+
|
| 588 |
+
CHANGE: Added warning message after `zrok invite` submit directing the user to check their "spam" folder if they do not receive the invite message.
|
| 589 |
+
|
| 590 |
+
## v0.4.7
|
| 591 |
+
|
| 592 |
+
FEATURE: OAuth authentication with the ability to restrict authenticated users to specified domains for `zrok share public`. Supports both Google and GitHub authentication in this version. More authentication providers, and extensibility to come in future `zrok` releases. See the OAuth configuration guide at `docs/guides/self-hosting/oauth/configuring-oauth.md` for details (https://github.com/openziti/zrok/issues/45, https://github.com/openziti/zrok/issues/404)
|
| 593 |
+
|
| 594 |
+
CHANGE: `--basic-auth` realm now presented as the share token rather than as `zrok` in `publicProxy` frontend implementation
|
| 595 |
+
|
| 596 |
+
## v0.4.6
|
| 597 |
+
|
| 598 |
+
FEATURE: New `--backend-mode caddy`, which pre-processes a `Caddyfile` allowing a `bind` statement to work like this: `bind {{ .ZrokBindAddress }}`. Allows development of complicated API gateways and multi-backend shares, while maintaining the simple, ephemeral sharing model provided by `zrok` (https://github.com/openziti/zrok/issues/391)
|
| 599 |
+
|
| 600 |
+
CHANGE: `--backend-mode web` has been refactored to utilize Caddy as the integrated web server. This provides for a much nicer web-based file browsing experience, while maintaining the existing web server facilities (https://github.com/openziti/zrok/issues/392)
|
| 601 |
+
|
| 602 |
+
CHANGE: Updated the golang version for release builds to `1.21.0` and the node version to `18.x`
|
| 603 |
+
|
| 604 |
+
CHANGE: Added `FrontendEndpoints` to `sdk.Share`, returning selected frontend URLs to callers of `sdk.CreateShare`
|
| 605 |
+
|
| 606 |
+
CHANGE: Added a short alias `-b` for `--backend-mode` to improve CLI ergonomics (https://github.com/openziti/zrok/issues/397)
|
| 607 |
+
|
| 608 |
+
## v0.4.5
|
| 609 |
+
|
| 610 |
+
FEATURE: New health check endpoint (`/health`), which verifies that the underlying SQL store and metrics repository (InfluxDB, if configured) are operating correctly (https://github.com/openziti/zrok/issues/372)
|
| 611 |
+
|
| 612 |
+
CHANGE: Updated to golang v1.21.0 and node v18.x
|
| 613 |
+
|
| 614 |
+
FIX: `zrok admin bootstrap` and `zrok enable` both broken with latest OpenZiti releases (tested with `v0.30.0`); updated to latest OpenZiti golang SDK (https://github.com/openziti/zrok/issues/389)
|
| 615 |
+
|
| 616 |
+
## v0.4.4
|
| 617 |
+
|
| 618 |
+
FIX: `zrok status`, `zrok enable`, `zrok config`, etc. were all causing a panic when used on systems that had no previous `~/.zrok` directory (https://github.com/openziti/zrok/issues/383)
|
| 619 |
+
|
| 620 |
+
## v0.4.3
|
| 621 |
+
|
| 622 |
+
FEATURE: New `zrok overview` command, which returns all of the account details as a single JSON structure. See the OpenAPI spec at `specs/zrok.yml` for more details of the `/api/v1/overview` endpoint (https://github.com/openziti/zrok/issues/374)
|
| 623 |
+
|
| 624 |
+
FEATURE: New `zrok` SDK (https://github.com/openziti/zrok/issues/34). `pastebin` example illustrates basic SDK usage (see `sdk/examples/pastebin/README.md` for details) (https://github.com/openziti/zrok/issues/379)
|
| 625 |
+
|
| 626 |
+
## v0.4.2
|
| 627 |
+
|
| 628 |
+
Some days are just like this. `v0.4.2` is a re-do of `v0.4.1`. Trying to get Homebrew working and had a bad release. Hopefully this is the one.
|
| 629 |
+
|
| 630 |
+
## v0.4.1
|
| 631 |
+
|
| 632 |
+
FEATURE: New `zrok console` command to open the currently configured web console in the local web browser (https://github.com/openziti/zrok/issues/170)
|
| 633 |
+
|
| 634 |
+
CHANGE: Further tweaks to the release process to automatically get the latest release into Homebrew (https://github.com/openziti/zrok/issues/264)
|
| 635 |
+
|
| 636 |
+
## v0.4.0
|
| 637 |
+
|
| 638 |
+
FEATURE: New `tcpTunnel` backend mode allowing for private sharing of local TCP sockets with other `zrok` users (https://github.com/openziti/zrok/issues/170)
|
| 639 |
+
|
| 640 |
+
FEATURE: New `udpTunnel` backend mode allowing for private sharing of local UDP sockets with other `zrok` users (https://github.com/openziti/zrok/issues/306)
|
| 641 |
+
|
| 642 |
+
FEATURE: New metrics infrastructure based on OpenZiti usage events (https://github.com/openziti/zrok/issues/128). See the [v0.4 Metrics Guide](docs/guides/metrics-and-limits/configuring-metrics.md) for more information.
|
| 643 |
+
|
| 644 |
+
FEATURE: New limits implementation based on the new metrics infrastructure (https://github.com/openziti/zrok/issues/235). See the [v0.4 Limits Guide](docs/guides/metrics-and-limits/configuring-limits.md) for more information.
|
| 645 |
+
|
| 646 |
+
FEATURE: The invite mechanism has been reworked to improve user experience. The configuration has been updated to include a new `invite` stanza, and now includes a boolean flag indicating whether or not the instance allows new invitations to be created, and also includes contact details for requesting a new invite. These values are used by the `zrok invite` command to provide a smoother end-user invite experience (https://github.com/openziti/zrok/issues/229)
|
| 647 |
+
|
| 648 |
+
FEATURE: New password strength checking rules and configuration. See the example configuration file (`etc/ctrl.yml`) for details about how to configure the strength checking rules (https://github.com/openziti/zrok/issues/167)
|
| 649 |
+
|
| 650 |
+
FEATURE: A new `admin/profile_endpoint` configuration option is available to start a `net/http/pprof` listener. See `etc/ctrl.yml` for details.
|
| 651 |
+
|
| 652 |
+
CHANGE: The controller configuration version bumps from `v: 2` to `v: 3` to support all of the new `v0.4` functionality. See the [example ctrl.yml](etc/ctrl.yml) for details on the new configuration.
|
| 653 |
+
|
| 654 |
+
CHANGE: The underlying database store now utilizes a `deleted` flag on all tables to implement "soft deletes". This was necessary for the new metrics infrastructure, where we need to account for metrics data that arrived after the lifetime of a share or environment; and also we're going to need this for limits, where we need to see historical information about activity in the past (https://github.com/openziti/zrok/issues/262)
|
| 655 |
+
|
| 656 |
+
CHANGE: Updated to latest `github.com/openziti/sdk-golang` (https://github.com/openziti/zrok/issues/335)
|
| 657 |
+
|
| 658 |
+
FIX: `zrok share reserved --override-endpoint` now works correctly; `--override-endpoint` was being incorrectly ignored previously (https://github.com/openziti/zrok/pull/348)
|
| 659 |
+
|
| 660 |
+
## v0.3.7
|
| 661 |
+
|
| 662 |
+
FIX: Improved TUI word-wrapping (https://github.com/openziti/zrok/issues/180)
|
| 663 |
+
|
| 664 |
+
## v0.3.6
|
| 665 |
+
|
| 666 |
+
CHANGE: Additional change to support branch builds (for CI purposes) and additional containerization efforts around k8s.
|
| 667 |
+
|
| 668 |
+
## v0.3.5
|
| 669 |
+
|
| 670 |
+
CHANGE: `zrok config set apiEndpoint` now validates that the new API endpoint correctly starts with `http://` or `https://` (https://github.com/openziti/zrok/issues/258)
|
| 671 |
+
|
| 672 |
+
CHANGE: Additional linting to support homebrew (https://github.com/openziti/zrok/issues/264)
|
| 673 |
+
|
| 674 |
+
## v0.3.4
|
| 675 |
+
|
| 676 |
+
CHANGE: `zrok test endpoint` incorporates `--ziti` mode (and related flags) to allow direct endpoint listening on a Ziti service
|
| 677 |
+
|
| 678 |
+
CHANGE: `zrok test websocket` command to test websockets, whether over TCP or over Ziti
|
| 679 |
+
|
| 680 |
+
FIX: Websocket support now functional
|
| 681 |
+
|
| 682 |
+
## v0.3.3
|
| 683 |
+
|
| 684 |
+
CHANGE: `zrok test loop` has been moved to `zrok test loop public`, making way for additional types of loopback testing tools. The `zrok test endpoint` server now includes an `/echo` endpoint, which provides a simple echo websocket (https://github.com/openziti/zrok/issues/237)
|
| 685 |
+
|
| 686 |
+
## v0.3.2
|
| 687 |
+
|
| 688 |
+
FEATURE: New docker infrastructure, including `docker-compose.yml` examples (and documentation) illustrating how to deploy `zrok` in `docker`-based environments
|
| 689 |
+
|
| 690 |
+
CHANGE: Include missing `--headless` flag for `zrok enable` and `zrok access private` (https://github.com/openziti/zrok/issues/246)
|
| 691 |
+
|
| 692 |
+
CHANGE: Fix for `zrok enable` error path handling (https://github.com/openziti/zrok/issues/244)
|
| 693 |
+
|
| 694 |
+
FEATURE: `zrok controller validate` and `zrok access public validate` will both perform a quick syntax validation on controller and public frontend configuration documents (https://github.com/openziti/zrok/issues/238)
|
| 695 |
+
|
| 696 |
+
$ zrok controller validate etc/dev.yml
|
| 697 |
+
[ERROR]: controller config validation failed (error loading controller config 'etc/dev.yml': field 'maintenance': field 'registration': field 'expiration_timeout': got [bool], expected [time.Duration])
|
| 698 |
+
|
| 699 |
+
CHANGE: `zrok status` no longer shows secrets (secret token, ziti identity) unless the `--secrets` flag is passed (https://github.com/openziti/zrok/issues/243)
|
| 700 |
+
|
| 701 |
+
## v0.3.1
|
| 702 |
+
|
| 703 |
+
CHANGE: Incorporate initial docker image build (https://github.com/openziti/zrok/issues/217)
|
| 704 |
+
|
| 705 |
+
CHANGE: Improve target URL parsing for `zrok share` when using `--backend-mode` proxy (https://github.com/openziti/zrok/issues/211)
|
| 706 |
+
|
| 707 |
+
New and improved URL handling for proxy backends:
|
| 708 |
+
|
| 709 |
+
9090 -> http://127.0.0.1:9090
|
| 710 |
+
localhost:9090 -> http://127.0.0.1:9090
|
| 711 |
+
https://localhost:9090 -> https://localhost:9090
|
| 712 |
+
|
| 713 |
+
CHANGE: Improve usability of `zrok invite` TUI in low-color environments (https://github.com/openziti/zrok/issues/206)
|
| 714 |
+
|
| 715 |
+
CHANGE: Better error responses when `zrok invite` fails due to missing token (https://github.com/openziti/zrok/issues/207)
|
| 716 |
+
|
| 717 |
+
## v0.3.0
|
| 718 |
+
|
| 719 |
+
CHANGE: Removed some minor web console lint and warnings (https://github.com/openziti/zrok/issues/205)
|
| 720 |
+
|
| 721 |
+
## v0.3.0-rc6
|
| 722 |
+
|
| 723 |
+
CHANGE: Better error message when `zrok admin create frontend` runs into a duplicate name collision (https://github.com/openziti/zrok/issues/168)
|
| 724 |
+
|
| 725 |
+
CHANGE: Gentler CLI error messages by default (https://github.com/openziti/zrok/issues/203)
|
| 726 |
+
|
| 727 |
+
CHANGE: Add favicon to web console (https://github.com/openziti/zrok/issues/198)
|
| 728 |
+
|
| 729 |
+
CHANGE: Add configurable "terms of use" link in the controller configuration, and optionally display the link on the login form and registration forms (https://github.com/openziti/zrok/issues/184)
|
| 730 |
+
|
| 731 |
+
CHANGE: Prevent multiple `zrok enable` commands from succeeding (https://github.com/openziti/zrok/issues/190)
|
| 732 |
+
|
| 733 |
+
CHANGE: New `--insecure` flag for `share <public|private|reserved>` commands (https://github.com/openziti/zrok/issues/195)
|
| 734 |
+
|
| 735 |
+
## v0.3.0-rc5
|
| 736 |
+
|
| 737 |
+
CHANGE: Improvements to controller log messages to assist in operations (https://github.com/openziti/zrok/issues/186)
|
| 738 |
+
|
| 739 |
+
CHANGE: `armv7` builds for Linux are now shipped with releases; these builds were tested against a Raspberry Pi 4 (https://github.com/openziti/zrok/issues/93)
|
| 740 |
+
|
| 741 |
+
CHANGE: `zrok config set` now includes a warning when the `apiEndpoint` config is changed and an environment is already enabled; the user will not see the change until `zrok disable` is run. The CLI now includes a `zrok config unset` command (https://github.com/openziti/zrok/issues/188)
|
| 742 |
+
|
| 743 |
+
## v0.3.0-rc4
|
| 744 |
+
|
| 745 |
+
CHANGE: Enable notarization for macos binaries (https://github.com/openziti/zrok/issues/92)
|
| 746 |
+
|
| 747 |
+
## v0.3.0-rc3
|
| 748 |
+
|
| 749 |
+
> This release increments the configuration version from `1` to `2`. See the note below.
|
| 750 |
+
|
| 751 |
+
CHANGE: The email "from" configuration moved from `registration/email_from` to `email/from`. **NOTE: This change increments the configuration `V` from `1` to `2`.**
|
| 752 |
+
|
| 753 |
+
CHANGE: Replaced un-salted sha512 password hashing with salted hashing based on Argon2 **NOTE: This version will _invalidate_ all account passwords, and will require all users to use the 'Forgot Password?' function to reset their password.** (https://github.com/openziti/zrok/issues/156)
|
| 754 |
+
|
| 755 |
+
CHANGE: Switched from `ubuntu-latest` (`22.04`) for the Linux builds to `ubuntu-20.04`. Should improve `glibc` compatibility with older Linux distributions (https://github.com/openziti/zrok/issues/179)
|
| 756 |
+
|
| 757 |
+
CHANGE: `zrok admin generate` now outputs the generated tokens to `stdout` after successfully provisioning the tokens (https://github.com/openziti/zrok/issues/181)
|
| 758 |
+
|
| 759 |
+
FIX: Fixed log message in `resetPasswordRequest.go` (https://github.com/openziti/zrok/issues/175)
|
| 760 |
+
|
| 761 |
+
FIX: Fixed `-v` (verbose mode) on in TUI-based `zrok share` and `zrok access` (https://github.com/openziti/zrok/issues/174)
|
| 762 |
+
|
| 763 |
+
## v0.3.0-rc2
|
| 764 |
+
|
| 765 |
+
FEATURE: Allow users to reset their password (https://github.com/openziti/zrok/issues/65)
|
| 766 |
+
|
| 767 |
+
CHANGE: Improved email styling for new user invite emails (https://github.com/openziti/zrok/issues/157)
|
| 768 |
+
|
| 769 |
+
CHANGE: Migrated from `openziti-test-kitchen` to `openziti` (https://github.com/openziti/zrok/issues/158).
|
| 770 |
+
|
| 771 |
+
CHANGE: Show a hint when `zrok invite` fails, indicating that the user should check to see if they need to be using the `--token` flag and token-based invites (https://github.com/openziti/zrok/issues/172).
|
| 772 |
+
|
| 773 |
+
FIX: Fixed PostgreSQL migration issue where sequences got reset and resulted in primary key collisions on a couple of tables (https://github.com/openziti/zrok/issues/160).
|
| 774 |
+
|
| 775 |
+
FIX: Remove `frontend` instances when `zrok disable`-ing an environment containing them (https://github.com/openziti/zrok/issues/171)
|
| 776 |
+
|
| 777 |
+
## v0.3.x Series
|
| 778 |
+
|
| 779 |
+
The `v0.2` series was a _proof-of-concept_ implementation for the overall `zrok` architecture and the concept.
|
| 780 |
+
|
| 781 |
+
`v0.3` is a massive elaboration of the concept, pivoting it from being a simple ephemeral reverse proxy solution, to being the beginnings of a comprehensive sharing platform, complete with public and private sharing (built on top of OpenZiti).
|
| 782 |
+
|
| 783 |
+
`v0.3.0` includes the minimal functionality required to produce an early, preview version of the elaborated `zrok` concept, suitable for both production use at `zrok.io`, and also suitable for private self-hosting.
|
| 784 |
+
|
| 785 |
+
From `v0.3.0` forward, we will begin tracking notable changes in this document.
|
| 786 |
+
|
| 787 |
+
## v0.2.18
|
| 788 |
+
|
| 789 |
+
* DEFECT: Token generation has been improved to use an alphabet consisting of `[a-zA-Z0-9]`. Service token generation continues to use a case-insensitive alphabet consisting of `[a-z0-9]` to be DNS-safe.
|
LICENSE
ADDED
|
@@ -0,0 +1,201 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Apache License
|
| 2 |
+
Version 2.0, January 2004
|
| 3 |
+
http://www.apache.org/licenses/
|
| 4 |
+
|
| 5 |
+
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
| 6 |
+
|
| 7 |
+
1. Definitions.
|
| 8 |
+
|
| 9 |
+
"License" shall mean the terms and conditions for use, reproduction,
|
| 10 |
+
and distribution as defined by Sections 1 through 9 of this document.
|
| 11 |
+
|
| 12 |
+
"Licensor" shall mean the copyright owner or entity authorized by
|
| 13 |
+
the copyright owner that is granting the License.
|
| 14 |
+
|
| 15 |
+
"Legal Entity" shall mean the union of the acting entity and all
|
| 16 |
+
other entities that control, are controlled by, or are under common
|
| 17 |
+
control with that entity. For the purposes of this definition,
|
| 18 |
+
"control" means (i) the power, direct or indirect, to cause the
|
| 19 |
+
direction or management of such entity, whether by contract or
|
| 20 |
+
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
| 21 |
+
outstanding shares, or (iii) beneficial ownership of such entity.
|
| 22 |
+
|
| 23 |
+
"You" (or "Your") shall mean an individual or Legal Entity
|
| 24 |
+
exercising permissions granted by this License.
|
| 25 |
+
|
| 26 |
+
"Source" form shall mean the preferred form for making modifications,
|
| 27 |
+
including but not limited to software source code, documentation
|
| 28 |
+
source, and configuration files.
|
| 29 |
+
|
| 30 |
+
"Object" form shall mean any form resulting from mechanical
|
| 31 |
+
transformation or translation of a Source form, including but
|
| 32 |
+
not limited to compiled object code, generated documentation,
|
| 33 |
+
and conversions to other media types.
|
| 34 |
+
|
| 35 |
+
"Work" shall mean the work of authorship, whether in Source or
|
| 36 |
+
Object form, made available under the License, as indicated by a
|
| 37 |
+
copyright notice that is included in or attached to the work
|
| 38 |
+
(an example is provided in the Appendix below).
|
| 39 |
+
|
| 40 |
+
"Derivative Works" shall mean any work, whether in Source or Object
|
| 41 |
+
form, that is based on (or derived from) the Work and for which the
|
| 42 |
+
editorial revisions, annotations, elaborations, or other modifications
|
| 43 |
+
represent, as a whole, an original work of authorship. For the purposes
|
| 44 |
+
of this License, Derivative Works shall not include works that remain
|
| 45 |
+
separable from, or merely link (or bind by name) to the interfaces of,
|
| 46 |
+
the Work and Derivative Works thereof.
|
| 47 |
+
|
| 48 |
+
"Contribution" shall mean any work of authorship, including
|
| 49 |
+
the original version of the Work and any modifications or additions
|
| 50 |
+
to that Work or Derivative Works thereof, that is intentionally
|
| 51 |
+
submitted to Licensor for inclusion in the Work by the copyright owner
|
| 52 |
+
or by an individual or Legal Entity authorized to submit on behalf of
|
| 53 |
+
the copyright owner. For the purposes of this definition, "submitted"
|
| 54 |
+
means any form of electronic, verbal, or written communication sent
|
| 55 |
+
to the Licensor or its representatives, including but not limited to
|
| 56 |
+
communication on electronic mailing lists, source code control systems,
|
| 57 |
+
and issue tracking systems that are managed by, or on behalf of, the
|
| 58 |
+
Licensor for the purpose of discussing and improving the Work, but
|
| 59 |
+
excluding communication that is conspicuously marked or otherwise
|
| 60 |
+
designated in writing by the copyright owner as "Not a Contribution."
|
| 61 |
+
|
| 62 |
+
"Contributor" shall mean Licensor and any individual or Legal Entity
|
| 63 |
+
on behalf of whom a Contribution has been received by Licensor and
|
| 64 |
+
subsequently incorporated within the Work.
|
| 65 |
+
|
| 66 |
+
2. Grant of Copyright License. Subject to the terms and conditions of
|
| 67 |
+
this License, each Contributor hereby grants to You a perpetual,
|
| 68 |
+
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
| 69 |
+
copyright license to reproduce, prepare Derivative Works of,
|
| 70 |
+
publicly display, publicly perform, sublicense, and distribute the
|
| 71 |
+
Work and such Derivative Works in Source or Object form.
|
| 72 |
+
|
| 73 |
+
3. Grant of Patent License. Subject to the terms and conditions of
|
| 74 |
+
this License, each Contributor hereby grants to You a perpetual,
|
| 75 |
+
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
| 76 |
+
(except as stated in this section) patent license to make, have made,
|
| 77 |
+
use, offer to sell, sell, import, and otherwise transfer the Work,
|
| 78 |
+
where such license applies only to those patent claims licensable
|
| 79 |
+
by such Contributor that are necessarily infringed by their
|
| 80 |
+
Contribution(s) alone or by combination of their Contribution(s)
|
| 81 |
+
with the Work to which such Contribution(s) was submitted. If You
|
| 82 |
+
institute patent litigation against any entity (including a
|
| 83 |
+
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
| 84 |
+
or a Contribution incorporated within the Work constitutes direct
|
| 85 |
+
or contributory patent infringement, then any patent licenses
|
| 86 |
+
granted to You under this License for that Work shall terminate
|
| 87 |
+
as of the date such litigation is filed.
|
| 88 |
+
|
| 89 |
+
4. Redistribution. You may reproduce and distribute copies of the
|
| 90 |
+
Work or Derivative Works thereof in any medium, with or without
|
| 91 |
+
modifications, and in Source or Object form, provided that You
|
| 92 |
+
meet the following conditions:
|
| 93 |
+
|
| 94 |
+
(a) You must give any other recipients of the Work or
|
| 95 |
+
Derivative Works a copy of this License; and
|
| 96 |
+
|
| 97 |
+
(b) You must cause any modified files to carry prominent notices
|
| 98 |
+
stating that You changed the files; and
|
| 99 |
+
|
| 100 |
+
(c) You must retain, in the Source form of any Derivative Works
|
| 101 |
+
that You distribute, all copyright, patent, trademark, and
|
| 102 |
+
attribution notices from the Source form of the Work,
|
| 103 |
+
excluding those notices that do not pertain to any part of
|
| 104 |
+
the Derivative Works; and
|
| 105 |
+
|
| 106 |
+
(d) If the Work includes a "NOTICE" text file as part of its
|
| 107 |
+
distribution, then any Derivative Works that You distribute must
|
| 108 |
+
include a readable copy of the attribution notices contained
|
| 109 |
+
within such NOTICE file, excluding those notices that do not
|
| 110 |
+
pertain to any part of the Derivative Works, in at least one
|
| 111 |
+
of the following places: within a NOTICE text file distributed
|
| 112 |
+
as part of the Derivative Works; within the Source form or
|
| 113 |
+
documentation, if provided along with the Derivative Works; or,
|
| 114 |
+
within a display generated by the Derivative Works, if and
|
| 115 |
+
wherever such third-party notices normally appear. The contents
|
| 116 |
+
of the NOTICE file are for informational purposes only and
|
| 117 |
+
do not modify the License. You may add Your own attribution
|
| 118 |
+
notices within Derivative Works that You distribute, alongside
|
| 119 |
+
or as an addendum to the NOTICE text from the Work, provided
|
| 120 |
+
that such additional attribution notices cannot be construed
|
| 121 |
+
as modifying the License.
|
| 122 |
+
|
| 123 |
+
You may add Your own copyright statement to Your modifications and
|
| 124 |
+
may provide additional or different license terms and conditions
|
| 125 |
+
for use, reproduction, or distribution of Your modifications, or
|
| 126 |
+
for any such Derivative Works as a whole, provided Your use,
|
| 127 |
+
reproduction, and distribution of the Work otherwise complies with
|
| 128 |
+
the conditions stated in this License.
|
| 129 |
+
|
| 130 |
+
5. Submission of Contributions. Unless You explicitly state otherwise,
|
| 131 |
+
any Contribution intentionally submitted for inclusion in the Work
|
| 132 |
+
by You to the Licensor shall be under the terms and conditions of
|
| 133 |
+
this License, without any additional terms or conditions.
|
| 134 |
+
Notwithstanding the above, nothing herein shall supersede or modify
|
| 135 |
+
the terms of any separate license agreement you may have executed
|
| 136 |
+
with Licensor regarding such Contributions.
|
| 137 |
+
|
| 138 |
+
6. Trademarks. This License does not grant permission to use the trade
|
| 139 |
+
names, trademarks, service marks, or product names of the Licensor,
|
| 140 |
+
except as required for reasonable and customary use in describing the
|
| 141 |
+
origin of the Work and reproducing the content of the NOTICE file.
|
| 142 |
+
|
| 143 |
+
7. Disclaimer of Warranty. Unless required by applicable law or
|
| 144 |
+
agreed to in writing, Licensor provides the Work (and each
|
| 145 |
+
Contributor provides its Contributions) on an "AS IS" BASIS,
|
| 146 |
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
| 147 |
+
implied, including, without limitation, any warranties or conditions
|
| 148 |
+
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
| 149 |
+
PARTICULAR PURPOSE. You are solely responsible for determining the
|
| 150 |
+
appropriateness of using or redistributing the Work and assume any
|
| 151 |
+
risks associated with Your exercise of permissions under this License.
|
| 152 |
+
|
| 153 |
+
8. Limitation of Liability. In no event and under no legal theory,
|
| 154 |
+
whether in tort (including negligence), contract, or otherwise,
|
| 155 |
+
unless required by applicable law (such as deliberate and grossly
|
| 156 |
+
negligent acts) or agreed to in writing, shall any Contributor be
|
| 157 |
+
liable to You for damages, including any direct, indirect, special,
|
| 158 |
+
incidental, or consequential damages of any character arising as a
|
| 159 |
+
result of this License or out of the use or inability to use the
|
| 160 |
+
Work (including but not limited to damages for loss of goodwill,
|
| 161 |
+
work stoppage, computer failure or malfunction, or any and all
|
| 162 |
+
other commercial damages or losses), even if such Contributor
|
| 163 |
+
has been advised of the possibility of such damages.
|
| 164 |
+
|
| 165 |
+
9. Accepting Warranty or Additional Liability. While redistributing
|
| 166 |
+
the Work or Derivative Works thereof, You may choose to offer,
|
| 167 |
+
and charge a fee for, acceptance of support, warranty, indemnity,
|
| 168 |
+
or other liability obligations and/or rights consistent with this
|
| 169 |
+
License. However, in accepting such obligations, You may act only
|
| 170 |
+
on Your own behalf and on Your sole responsibility, not on behalf
|
| 171 |
+
of any other Contributor, and only if You agree to indemnify,
|
| 172 |
+
defend, and hold each Contributor harmless for any liability
|
| 173 |
+
incurred by, or claims asserted against, such Contributor by reason
|
| 174 |
+
of your accepting any such warranty or additional liability.
|
| 175 |
+
|
| 176 |
+
END OF TERMS AND CONDITIONS
|
| 177 |
+
|
| 178 |
+
APPENDIX: How to apply the Apache License to your work.
|
| 179 |
+
|
| 180 |
+
To apply the Apache License to your work, attach the following
|
| 181 |
+
boilerplate notice, with the fields enclosed by brackets "[]"
|
| 182 |
+
replaced with your own identifying information. (Don't include
|
| 183 |
+
the brackets!) The text should be enclosed in the appropriate
|
| 184 |
+
comment syntax for the file format. We also recommend that a
|
| 185 |
+
file or class name and description of purpose be included on the
|
| 186 |
+
same "printed page" as the copyright notice for easier
|
| 187 |
+
identification within third-party archives.
|
| 188 |
+
|
| 189 |
+
Copyright 2019 NetFoundry, Inc.
|
| 190 |
+
|
| 191 |
+
Licensed under the Apache License, Version 2.0 (the "License");
|
| 192 |
+
you may not use this file except in compliance with the License.
|
| 193 |
+
You may obtain a copy of the License at
|
| 194 |
+
|
| 195 |
+
http://www.apache.org/licenses/LICENSE-2.0
|
| 196 |
+
|
| 197 |
+
Unless required by applicable law or agreed to in writing, software
|
| 198 |
+
distributed under the License is distributed on an "AS IS" BASIS,
|
| 199 |
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 200 |
+
See the License for the specific language governing permissions and
|
| 201 |
+
limitations under the License.
|
README.md
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# zrok - Secure Internet Sharing Made Simple
|
| 2 |
+
|
| 3 |
+

|
| 4 |
+
|
| 5 |
+
**Share anything, anywhere, instantly. Enterprise reliability. No firewall changes. No port forwarding. No hassle.**
|
| 6 |
+
|
| 7 |
+
`zrok` lets you securely share web services, files, and network resources with anyone—whether they're across the internet or your private network. Built on zero-trust networking, it works through firewalls and NAT without requiring any network configuration changes.
|
| 8 |
+
|
| 9 |
+
## Quick Start
|
| 10 |
+
|
| 11 |
+
Get sharing in under 2 minutes:
|
| 12 |
+
|
| 13 |
+
1. **[Install zrok](https://docs.zrok.io/docs/guides/install/)** for your platform
|
| 14 |
+
2. **Get an account**: `zrok invite` (use the free [zrok.io service](https://docs.zrok.io/docs/getting-started/))
|
| 15 |
+
3. **Enable sharing**: `zrok enable`
|
| 16 |
+
|
| 17 |
+
That's it! Now you can share anything:
|
| 18 |
+
|
| 19 |
+
```bash
|
| 20 |
+
# Share a web service publicly
|
| 21 |
+
$ zrok share public localhost:8080
|
| 22 |
+
|
| 23 |
+
# Share files as a network drive
|
| 24 |
+
$ zrok share public --backend-mode drive ~/Documents
|
| 25 |
+
|
| 26 |
+
# Share privately with other zrok users
|
| 27 |
+
$ zrok share private localhost:3000
|
| 28 |
+
```
|
| 29 |
+
|
| 30 |
+

|
| 31 |
+
|
| 32 |
+
## What You Can Share
|
| 33 |
+
|
| 34 |
+
### Web Services
|
| 35 |
+
Instantly make local web apps accessible over the internet:
|
| 36 |
+
|
| 37 |
+
```bash
|
| 38 |
+
$ zrok share public localhost:8080
|
| 39 |
+
```
|
| 40 |
+

|
| 41 |
+
|
| 42 |
+
### Files & Directories
|
| 43 |
+
Turn any folder into a shareable network drive:
|
| 44 |
+
|
| 45 |
+
```bash
|
| 46 |
+
$ zrok share public --backend-mode drive ~/Repos/zrok
|
| 47 |
+
```
|
| 48 |
+

|
| 49 |
+

|
| 50 |
+
|
| 51 |
+
### Private Resources
|
| 52 |
+
Share TCP/UDP services securely with other zrok users—no public internet exposure.
|
| 53 |
+
|
| 54 |
+
## Key Features
|
| 55 |
+
|
| 56 |
+
- **Zero Configuration**: Works through firewalls, NAT, and corporate networks
|
| 57 |
+
- **Secure by Default**: End-to-end encryption with zero-trust architecture
|
| 58 |
+
- **Public & Private Sharing**: Share with anyone or just specific users
|
| 59 |
+
- **Multiple Protocols**: HTTP/HTTPS, TCP, UDP, and file sharing
|
| 60 |
+
- **Cross-Platform**: Windows, macOS, Linux, and Raspberry Pi
|
| 61 |
+
- **Self-Hostable**: Run your own zrok service instance
|
| 62 |
+
|
| 63 |
+
## How It Works
|
| 64 |
+
|
| 65 |
+
`zrok` is built on [OpenZiti](https://docs.openziti.io/docs/learn/introduction/), a programmable zero-trust network overlay. This means:
|
| 66 |
+
|
| 67 |
+
- **No inbound connectivity required**: Works from behind firewalls and NAT
|
| 68 |
+
- **End-to-end encryption**: All traffic is encrypted, even from zrok servers
|
| 69 |
+
- **Peer-to-peer connections**: Direct connections between users when possible
|
| 70 |
+
- **Identity-based access**: Share with specific users, not IP addresses
|
| 71 |
+
|
| 72 |
+
## Developer SDK
|
| 73 |
+
|
| 74 |
+
Embed `zrok` sharing into your applications with our Go SDK:
|
| 75 |
+
|
| 76 |
+
```go
|
| 77 |
+
// Create a share
|
| 78 |
+
shr, err := sdk.CreateShare(root, &sdk.ShareRequest{
|
| 79 |
+
BackendMode: sdk.TcpTunnelBackendMode,
|
| 80 |
+
ShareMode: sdk.PrivateShareMode,
|
| 81 |
+
})
|
| 82 |
+
|
| 83 |
+
// Accept connections
|
| 84 |
+
listener, err := sdk.NewListener(shr.Token, root)
|
| 85 |
+
```
|
| 86 |
+
|
| 87 |
+
[Read the SDK guide](https://blog.openziti.io/the-zrok-sdk) for complete examples.
|
| 88 |
+
|
| 89 |
+
## Self-Hosting
|
| 90 |
+
|
| 91 |
+
Run your own `zrok` service—from Raspberry Pi to enterprise scale:
|
| 92 |
+
|
| 93 |
+
- Single binary contains everything you need
|
| 94 |
+
- Scales from small personal instances to large public services
|
| 95 |
+
- Built on the same codebase as the public `zrok.io` service
|
| 96 |
+
|
| 97 |
+
[Self-Hosting Guide](https://docs.zrok.io/docs/guides/self-hosting/self_hosting_guide/)
|
| 98 |
+
|
| 99 |
+
## Resources
|
| 100 |
+
|
| 101 |
+
- **[Documentation](https://docs.zrok.io/)**
|
| 102 |
+
- **[Office Hours Videos](https://www.youtube.com/watch?v=Edqv7yRmXb0&list=PLMUj_5fklasLuM6XiCNqwAFBuZD1t2lO2)**
|
| 103 |
+
- **[Building from Source](./BUILD.md)**
|
| 104 |
+
- **[Contributing](./CONTRIBUTING.md)**
|
| 105 |
+
|
| 106 |
+
---
|
| 107 |
+
|
| 108 |
+
*Ready to start sharing? [Get started with zrok →](https://docs.zrok.io/docs/getting-started)*
|
airflow/airflow-webserver.pid
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
257
|
airflow/airflow.cfg
ADDED
|
@@ -0,0 +1,2498 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[core]
|
| 2 |
+
# The folder where your airflow pipelines live, most likely a
|
| 3 |
+
# subfolder in a code repository. This path must be absolute.
|
| 4 |
+
#
|
| 5 |
+
# Variable: AIRFLOW__CORE__DAGS_FOLDER
|
| 6 |
+
#
|
| 7 |
+
dags_folder = /kaggle/working/BTC-USDT-ETL-Pipeline/airflow/dags
|
| 8 |
+
|
| 9 |
+
# Hostname by providing a path to a callable, which will resolve the hostname.
|
| 10 |
+
# The format is "package.function".
|
| 11 |
+
#
|
| 12 |
+
# For example, default value ``airflow.utils.net.getfqdn`` means that result from patched
|
| 13 |
+
# version of `socket.getfqdn() <https://docs.python.org/3/library/socket.html#socket.getfqdn>`__,
|
| 14 |
+
# see related `CPython Issue <https://github.com/python/cpython/issues/49254>`__.
|
| 15 |
+
#
|
| 16 |
+
# No argument should be required in the function specified.
|
| 17 |
+
# If using IP address as hostname is preferred, use value ``airflow.utils.net.get_host_ip_address``
|
| 18 |
+
#
|
| 19 |
+
# Variable: AIRFLOW__CORE__HOSTNAME_CALLABLE
|
| 20 |
+
#
|
| 21 |
+
hostname_callable = airflow.utils.net.getfqdn
|
| 22 |
+
|
| 23 |
+
# A callable to check if a python file has airflow dags defined or not and should
|
| 24 |
+
# return ``True`` if it has dags otherwise ``False``.
|
| 25 |
+
# If this is not provided, Airflow uses its own heuristic rules.
|
| 26 |
+
#
|
| 27 |
+
# The function should have the following signature
|
| 28 |
+
#
|
| 29 |
+
# .. code-block:: python
|
| 30 |
+
#
|
| 31 |
+
# def func_name(file_path: str, zip_file: zipfile.ZipFile | None = None) -> bool: ...
|
| 32 |
+
#
|
| 33 |
+
# Variable: AIRFLOW__CORE__MIGHT_CONTAIN_DAG_CALLABLE
|
| 34 |
+
#
|
| 35 |
+
might_contain_dag_callable = airflow.utils.file.might_contain_dag_via_default_heuristic
|
| 36 |
+
|
| 37 |
+
# Default timezone in case supplied date times are naive
|
| 38 |
+
# can be `UTC` (default), `system`, or any `IANA <https://www.iana.org/time-zones>`
|
| 39 |
+
# timezone string (e.g. Europe/Amsterdam)
|
| 40 |
+
#
|
| 41 |
+
# Variable: AIRFLOW__CORE__DEFAULT_TIMEZONE
|
| 42 |
+
#
|
| 43 |
+
default_timezone = utc
|
| 44 |
+
|
| 45 |
+
# The executor class that airflow should use. Choices include
|
| 46 |
+
# ``SequentialExecutor``, ``LocalExecutor``, ``CeleryExecutor``,
|
| 47 |
+
# ``KubernetesExecutor``, ``CeleryKubernetesExecutor``, ``LocalKubernetesExecutor`` or the
|
| 48 |
+
# full import path to the class when using a custom executor.
|
| 49 |
+
#
|
| 50 |
+
# Variable: AIRFLOW__CORE__EXECUTOR
|
| 51 |
+
#
|
| 52 |
+
executor = SequentialExecutor
|
| 53 |
+
|
| 54 |
+
# The auth manager class that airflow should use. Full import path to the auth manager class.
|
| 55 |
+
#
|
| 56 |
+
# Variable: AIRFLOW__CORE__AUTH_MANAGER
|
| 57 |
+
#
|
| 58 |
+
auth_manager = airflow.providers.fab.auth_manager.fab_auth_manager.FabAuthManager
|
| 59 |
+
|
| 60 |
+
# This defines the maximum number of task instances that can run concurrently per scheduler in
|
| 61 |
+
# Airflow, regardless of the worker count. Generally this value, multiplied by the number of
|
| 62 |
+
# schedulers in your cluster, is the maximum number of task instances with the running
|
| 63 |
+
# state in the metadata database. Setting this value to zero allows unlimited parallelism.
|
| 64 |
+
#
|
| 65 |
+
# Variable: AIRFLOW__CORE__PARALLELISM
|
| 66 |
+
#
|
| 67 |
+
parallelism = 32
|
| 68 |
+
|
| 69 |
+
# The maximum number of task instances allowed to run concurrently in each DAG. To calculate
|
| 70 |
+
# the number of tasks that is running concurrently for a DAG, add up the number of running
|
| 71 |
+
# tasks for all DAG runs of the DAG. This is configurable at the DAG level with ``max_active_tasks``,
|
| 72 |
+
# which is defaulted as ``[core] max_active_tasks_per_dag``.
|
| 73 |
+
#
|
| 74 |
+
# An example scenario when this would be useful is when you want to stop a new dag with an early
|
| 75 |
+
# start date from stealing all the executor slots in a cluster.
|
| 76 |
+
#
|
| 77 |
+
# Variable: AIRFLOW__CORE__MAX_ACTIVE_TASKS_PER_DAG
|
| 78 |
+
#
|
| 79 |
+
max_active_tasks_per_dag = 16
|
| 80 |
+
|
| 81 |
+
# Are DAGs paused by default at creation
|
| 82 |
+
#
|
| 83 |
+
# Variable: AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION
|
| 84 |
+
#
|
| 85 |
+
dags_are_paused_at_creation = True
|
| 86 |
+
|
| 87 |
+
# The maximum number of active DAG runs per DAG. The scheduler will not create more DAG runs
|
| 88 |
+
# if it reaches the limit. This is configurable at the DAG level with ``max_active_runs``,
|
| 89 |
+
# which is defaulted as ``[core] max_active_runs_per_dag``.
|
| 90 |
+
#
|
| 91 |
+
# Variable: AIRFLOW__CORE__MAX_ACTIVE_RUNS_PER_DAG
|
| 92 |
+
#
|
| 93 |
+
max_active_runs_per_dag = 16
|
| 94 |
+
|
| 95 |
+
# (experimental) The maximum number of consecutive DAG failures before DAG is automatically paused.
|
| 96 |
+
# This is also configurable per DAG level with ``max_consecutive_failed_dag_runs``,
|
| 97 |
+
# which is defaulted as ``[core] max_consecutive_failed_dag_runs_per_dag``.
|
| 98 |
+
# If not specified, then the value is considered as 0,
|
| 99 |
+
# meaning that the dags are never paused out by default.
|
| 100 |
+
#
|
| 101 |
+
# Variable: AIRFLOW__CORE__MAX_CONSECUTIVE_FAILED_DAG_RUNS_PER_DAG
|
| 102 |
+
#
|
| 103 |
+
max_consecutive_failed_dag_runs_per_dag = 0
|
| 104 |
+
|
| 105 |
+
# The name of the method used in order to start Python processes via the multiprocessing module.
|
| 106 |
+
# This corresponds directly with the options available in the Python docs:
|
| 107 |
+
# `multiprocessing.set_start_method
|
| 108 |
+
# <https://docs.python.org/3/library/multiprocessing.html#multiprocessing.set_start_method>`__
|
| 109 |
+
# must be one of the values returned by `multiprocessing.get_all_start_methods()
|
| 110 |
+
# <https://docs.python.org/3/library/multiprocessing.html#multiprocessing.get_all_start_methods>`__.
|
| 111 |
+
#
|
| 112 |
+
# Example: mp_start_method = fork
|
| 113 |
+
#
|
| 114 |
+
# Variable: AIRFLOW__CORE__MP_START_METHOD
|
| 115 |
+
#
|
| 116 |
+
# mp_start_method =
|
| 117 |
+
|
| 118 |
+
# Whether to load the DAG examples that ship with Airflow. It's good to
|
| 119 |
+
# get started, but you probably want to set this to ``False`` in a production
|
| 120 |
+
# environment
|
| 121 |
+
#
|
| 122 |
+
# Variable: AIRFLOW__CORE__LOAD_EXAMPLES
|
| 123 |
+
#
|
| 124 |
+
load_examples = True
|
| 125 |
+
|
| 126 |
+
# Path to the folder containing Airflow plugins
|
| 127 |
+
#
|
| 128 |
+
# Variable: AIRFLOW__CORE__PLUGINS_FOLDER
|
| 129 |
+
#
|
| 130 |
+
plugins_folder = /kaggle/working/BTC-USDT-ETL-Pipeline/airflow/plugins
|
| 131 |
+
|
| 132 |
+
# Should tasks be executed via forking of the parent process
|
| 133 |
+
#
|
| 134 |
+
# * ``False``: Execute via forking of the parent process
|
| 135 |
+
# * ``True``: Spawning a new python process, slower than fork, but means plugin changes picked
|
| 136 |
+
# up by tasks straight away
|
| 137 |
+
#
|
| 138 |
+
# Variable: AIRFLOW__CORE__EXECUTE_TASKS_NEW_PYTHON_INTERPRETER
|
| 139 |
+
#
|
| 140 |
+
execute_tasks_new_python_interpreter = False
|
| 141 |
+
|
| 142 |
+
# Secret key to save connection passwords in the db
|
| 143 |
+
#
|
| 144 |
+
# Variable: AIRFLOW__CORE__FERNET_KEY
|
| 145 |
+
#
|
| 146 |
+
fernet_key =
|
| 147 |
+
|
| 148 |
+
# Whether to disable pickling dags
|
| 149 |
+
#
|
| 150 |
+
# Variable: AIRFLOW__CORE__DONOT_PICKLE
|
| 151 |
+
#
|
| 152 |
+
donot_pickle = True
|
| 153 |
+
|
| 154 |
+
# How long before timing out a python file import
|
| 155 |
+
#
|
| 156 |
+
# Variable: AIRFLOW__CORE__DAGBAG_IMPORT_TIMEOUT
|
| 157 |
+
#
|
| 158 |
+
dagbag_import_timeout = 30.0
|
| 159 |
+
|
| 160 |
+
# Should a traceback be shown in the UI for dagbag import errors,
|
| 161 |
+
# instead of just the exception message
|
| 162 |
+
#
|
| 163 |
+
# Variable: AIRFLOW__CORE__DAGBAG_IMPORT_ERROR_TRACEBACKS
|
| 164 |
+
#
|
| 165 |
+
dagbag_import_error_tracebacks = True
|
| 166 |
+
|
| 167 |
+
# If tracebacks are shown, how many entries from the traceback should be shown
|
| 168 |
+
#
|
| 169 |
+
# Variable: AIRFLOW__CORE__DAGBAG_IMPORT_ERROR_TRACEBACK_DEPTH
|
| 170 |
+
#
|
| 171 |
+
dagbag_import_error_traceback_depth = 2
|
| 172 |
+
|
| 173 |
+
# How long before timing out a DagFileProcessor, which processes a dag file
|
| 174 |
+
#
|
| 175 |
+
# Variable: AIRFLOW__CORE__DAG_FILE_PROCESSOR_TIMEOUT
|
| 176 |
+
#
|
| 177 |
+
dag_file_processor_timeout = 50
|
| 178 |
+
|
| 179 |
+
# The class to use for running task instances in a subprocess.
|
| 180 |
+
# Choices include StandardTaskRunner, CgroupTaskRunner or the full import path to the class
|
| 181 |
+
# when using a custom task runner.
|
| 182 |
+
#
|
| 183 |
+
# Variable: AIRFLOW__CORE__TASK_RUNNER
|
| 184 |
+
#
|
| 185 |
+
task_runner = StandardTaskRunner
|
| 186 |
+
|
| 187 |
+
# If set, tasks without a ``run_as_user`` argument will be run with this user
|
| 188 |
+
# Can be used to de-elevate a sudo user running Airflow when executing tasks
|
| 189 |
+
#
|
| 190 |
+
# Variable: AIRFLOW__CORE__DEFAULT_IMPERSONATION
|
| 191 |
+
#
|
| 192 |
+
default_impersonation =
|
| 193 |
+
|
| 194 |
+
# What security module to use (for example kerberos)
|
| 195 |
+
#
|
| 196 |
+
# Variable: AIRFLOW__CORE__SECURITY
|
| 197 |
+
#
|
| 198 |
+
security =
|
| 199 |
+
|
| 200 |
+
# Turn unit test mode on (overwrites many configuration options with test
|
| 201 |
+
# values at runtime)
|
| 202 |
+
#
|
| 203 |
+
# Variable: AIRFLOW__CORE__UNIT_TEST_MODE
|
| 204 |
+
#
|
| 205 |
+
unit_test_mode = False
|
| 206 |
+
|
| 207 |
+
# Whether to enable pickling for xcom (note that this is insecure and allows for
|
| 208 |
+
# RCE exploits).
|
| 209 |
+
#
|
| 210 |
+
# Variable: AIRFLOW__CORE__ENABLE_XCOM_PICKLING
|
| 211 |
+
#
|
| 212 |
+
enable_xcom_pickling = False
|
| 213 |
+
|
| 214 |
+
# What classes can be imported during deserialization. This is a multi line value.
|
| 215 |
+
# The individual items will be parsed as a pattern to a glob function.
|
| 216 |
+
# Python built-in classes (like dict) are always allowed.
|
| 217 |
+
#
|
| 218 |
+
# Variable: AIRFLOW__CORE__ALLOWED_DESERIALIZATION_CLASSES
|
| 219 |
+
#
|
| 220 |
+
allowed_deserialization_classes = airflow.*
|
| 221 |
+
|
| 222 |
+
# What classes can be imported during deserialization. This is a multi line value.
|
| 223 |
+
# The individual items will be parsed as regexp patterns.
|
| 224 |
+
# This is a secondary option to ``[core] allowed_deserialization_classes``.
|
| 225 |
+
#
|
| 226 |
+
# Variable: AIRFLOW__CORE__ALLOWED_DESERIALIZATION_CLASSES_REGEXP
|
| 227 |
+
#
|
| 228 |
+
allowed_deserialization_classes_regexp =
|
| 229 |
+
|
| 230 |
+
# When a task is killed forcefully, this is the amount of time in seconds that
|
| 231 |
+
# it has to cleanup after it is sent a SIGTERM, before it is SIGKILLED
|
| 232 |
+
#
|
| 233 |
+
# Variable: AIRFLOW__CORE__KILLED_TASK_CLEANUP_TIME
|
| 234 |
+
#
|
| 235 |
+
killed_task_cleanup_time = 60
|
| 236 |
+
|
| 237 |
+
# Whether to override params with dag_run.conf. If you pass some key-value pairs
|
| 238 |
+
# through ``airflow dags backfill -c`` or
|
| 239 |
+
# ``airflow dags trigger -c``, the key-value pairs will override the existing ones in params.
|
| 240 |
+
#
|
| 241 |
+
# Variable: AIRFLOW__CORE__DAG_RUN_CONF_OVERRIDES_PARAMS
|
| 242 |
+
#
|
| 243 |
+
dag_run_conf_overrides_params = True
|
| 244 |
+
|
| 245 |
+
# If enabled, Airflow will only scan files containing both ``DAG`` and ``airflow`` (case-insensitive).
|
| 246 |
+
#
|
| 247 |
+
# Variable: AIRFLOW__CORE__DAG_DISCOVERY_SAFE_MODE
|
| 248 |
+
#
|
| 249 |
+
dag_discovery_safe_mode = True
|
| 250 |
+
|
| 251 |
+
# The pattern syntax used in the
|
| 252 |
+
# `.airflowignore
|
| 253 |
+
# <https://airflow.apache.org/docs/apache-airflow/stable/core-concepts/dags.html#airflowignore>`__
|
| 254 |
+
# files in the DAG directories. Valid values are ``regexp`` or ``glob``.
|
| 255 |
+
#
|
| 256 |
+
# Variable: AIRFLOW__CORE__DAG_IGNORE_FILE_SYNTAX
|
| 257 |
+
#
|
| 258 |
+
dag_ignore_file_syntax = regexp
|
| 259 |
+
|
| 260 |
+
# The number of retries each task is going to have by default. Can be overridden at dag or task level.
|
| 261 |
+
#
|
| 262 |
+
# Variable: AIRFLOW__CORE__DEFAULT_TASK_RETRIES
|
| 263 |
+
#
|
| 264 |
+
default_task_retries = 0
|
| 265 |
+
|
| 266 |
+
# The number of seconds each task is going to wait by default between retries. Can be overridden at
|
| 267 |
+
# dag or task level.
|
| 268 |
+
#
|
| 269 |
+
# Variable: AIRFLOW__CORE__DEFAULT_TASK_RETRY_DELAY
|
| 270 |
+
#
|
| 271 |
+
default_task_retry_delay = 300
|
| 272 |
+
|
| 273 |
+
# The maximum delay (in seconds) each task is going to wait by default between retries.
|
| 274 |
+
# This is a global setting and cannot be overridden at task or DAG level.
|
| 275 |
+
#
|
| 276 |
+
# Variable: AIRFLOW__CORE__MAX_TASK_RETRY_DELAY
|
| 277 |
+
#
|
| 278 |
+
max_task_retry_delay = 86400
|
| 279 |
+
|
| 280 |
+
# The weighting method used for the effective total priority weight of the task
|
| 281 |
+
#
|
| 282 |
+
# Variable: AIRFLOW__CORE__DEFAULT_TASK_WEIGHT_RULE
|
| 283 |
+
#
|
| 284 |
+
default_task_weight_rule = downstream
|
| 285 |
+
|
| 286 |
+
# Maximum possible time (in seconds) that task will have for execution of auxiliary processes
|
| 287 |
+
# (like listeners, mini scheduler...) after task is marked as success.
|
| 288 |
+
#
|
| 289 |
+
# Variable: AIRFLOW__CORE__TASK_SUCCESS_OVERTIME
|
| 290 |
+
#
|
| 291 |
+
task_success_overtime = 20
|
| 292 |
+
|
| 293 |
+
# The default task execution_timeout value for the operators. Expected an integer value to
|
| 294 |
+
# be passed into timedelta as seconds. If not specified, then the value is considered as None,
|
| 295 |
+
# meaning that the operators are never timed out by default.
|
| 296 |
+
#
|
| 297 |
+
# Variable: AIRFLOW__CORE__DEFAULT_TASK_EXECUTION_TIMEOUT
|
| 298 |
+
#
|
| 299 |
+
default_task_execution_timeout =
|
| 300 |
+
|
| 301 |
+
# Updating serialized DAG can not be faster than a minimum interval to reduce database write rate.
|
| 302 |
+
#
|
| 303 |
+
# Variable: AIRFLOW__CORE__MIN_SERIALIZED_DAG_UPDATE_INTERVAL
|
| 304 |
+
#
|
| 305 |
+
min_serialized_dag_update_interval = 30
|
| 306 |
+
|
| 307 |
+
# If ``True``, serialized DAGs are compressed before writing to DB.
|
| 308 |
+
#
|
| 309 |
+
# .. note::
|
| 310 |
+
#
|
| 311 |
+
# This will disable the DAG dependencies view
|
| 312 |
+
#
|
| 313 |
+
# Variable: AIRFLOW__CORE__COMPRESS_SERIALIZED_DAGS
|
| 314 |
+
#
|
| 315 |
+
compress_serialized_dags = False
|
| 316 |
+
|
| 317 |
+
# Fetching serialized DAG can not be faster than a minimum interval to reduce database
|
| 318 |
+
# read rate. This config controls when your DAGs are updated in the Webserver
|
| 319 |
+
#
|
| 320 |
+
# Variable: AIRFLOW__CORE__MIN_SERIALIZED_DAG_FETCH_INTERVAL
|
| 321 |
+
#
|
| 322 |
+
min_serialized_dag_fetch_interval = 10
|
| 323 |
+
|
| 324 |
+
# Maximum number of Rendered Task Instance Fields (Template Fields) per task to store
|
| 325 |
+
# in the Database.
|
| 326 |
+
# All the template_fields for each of Task Instance are stored in the Database.
|
| 327 |
+
# Keeping this number small may cause an error when you try to view ``Rendered`` tab in
|
| 328 |
+
# TaskInstance view for older tasks.
|
| 329 |
+
#
|
| 330 |
+
# Variable: AIRFLOW__CORE__MAX_NUM_RENDERED_TI_FIELDS_PER_TASK
|
| 331 |
+
#
|
| 332 |
+
max_num_rendered_ti_fields_per_task = 30
|
| 333 |
+
|
| 334 |
+
# On each dagrun check against defined SLAs
|
| 335 |
+
#
|
| 336 |
+
# Variable: AIRFLOW__CORE__CHECK_SLAS
|
| 337 |
+
#
|
| 338 |
+
check_slas = True
|
| 339 |
+
|
| 340 |
+
# Path to custom XCom class that will be used to store and resolve operators results
|
| 341 |
+
#
|
| 342 |
+
# Example: xcom_backend = path.to.CustomXCom
|
| 343 |
+
#
|
| 344 |
+
# Variable: AIRFLOW__CORE__XCOM_BACKEND
|
| 345 |
+
#
|
| 346 |
+
xcom_backend = airflow.models.xcom.BaseXCom
|
| 347 |
+
|
| 348 |
+
# By default Airflow plugins are lazily-loaded (only loaded when required). Set it to ``False``,
|
| 349 |
+
# if you want to load plugins whenever 'airflow' is invoked via cli or loaded from module.
|
| 350 |
+
#
|
| 351 |
+
# Variable: AIRFLOW__CORE__LAZY_LOAD_PLUGINS
|
| 352 |
+
#
|
| 353 |
+
lazy_load_plugins = True
|
| 354 |
+
|
| 355 |
+
# By default Airflow providers are lazily-discovered (discovery and imports happen only when required).
|
| 356 |
+
# Set it to ``False``, if you want to discover providers whenever 'airflow' is invoked via cli or
|
| 357 |
+
# loaded from module.
|
| 358 |
+
#
|
| 359 |
+
# Variable: AIRFLOW__CORE__LAZY_DISCOVER_PROVIDERS
|
| 360 |
+
#
|
| 361 |
+
lazy_discover_providers = True
|
| 362 |
+
|
| 363 |
+
# Hide sensitive **Variables** or **Connection extra json keys** from UI
|
| 364 |
+
# and task logs when set to ``True``
|
| 365 |
+
#
|
| 366 |
+
# .. note::
|
| 367 |
+
#
|
| 368 |
+
# Connection passwords are always hidden in logs
|
| 369 |
+
#
|
| 370 |
+
# Variable: AIRFLOW__CORE__HIDE_SENSITIVE_VAR_CONN_FIELDS
|
| 371 |
+
#
|
| 372 |
+
hide_sensitive_var_conn_fields = True
|
| 373 |
+
|
| 374 |
+
# A comma-separated list of extra sensitive keywords to look for in variables names or connection's
|
| 375 |
+
# extra JSON.
|
| 376 |
+
#
|
| 377 |
+
# Variable: AIRFLOW__CORE__SENSITIVE_VAR_CONN_NAMES
|
| 378 |
+
#
|
| 379 |
+
sensitive_var_conn_names =
|
| 380 |
+
|
| 381 |
+
# Task Slot counts for ``default_pool``. This setting would not have any effect in an existing
|
| 382 |
+
# deployment where the ``default_pool`` is already created. For existing deployments, users can
|
| 383 |
+
# change the number of slots using Webserver, API or the CLI
|
| 384 |
+
#
|
| 385 |
+
# Variable: AIRFLOW__CORE__DEFAULT_POOL_TASK_SLOT_COUNT
|
| 386 |
+
#
|
| 387 |
+
default_pool_task_slot_count = 128
|
| 388 |
+
|
| 389 |
+
# The maximum list/dict length an XCom can push to trigger task mapping. If the pushed list/dict has a
|
| 390 |
+
# length exceeding this value, the task pushing the XCom will be failed automatically to prevent the
|
| 391 |
+
# mapped tasks from clogging the scheduler.
|
| 392 |
+
#
|
| 393 |
+
# Variable: AIRFLOW__CORE__MAX_MAP_LENGTH
|
| 394 |
+
#
|
| 395 |
+
max_map_length = 1024
|
| 396 |
+
|
| 397 |
+
# The default umask to use for process when run in daemon mode (scheduler, worker, etc.)
|
| 398 |
+
#
|
| 399 |
+
# This controls the file-creation mode mask which determines the initial value of file permission bits
|
| 400 |
+
# for newly created files.
|
| 401 |
+
#
|
| 402 |
+
# This value is treated as an octal-integer.
|
| 403 |
+
#
|
| 404 |
+
# Variable: AIRFLOW__CORE__DAEMON_UMASK
|
| 405 |
+
#
|
| 406 |
+
daemon_umask = 0o077
|
| 407 |
+
|
| 408 |
+
# Class to use as dataset manager.
|
| 409 |
+
#
|
| 410 |
+
# Example: dataset_manager_class = airflow.datasets.manager.DatasetManager
|
| 411 |
+
#
|
| 412 |
+
# Variable: AIRFLOW__CORE__DATASET_MANAGER_CLASS
|
| 413 |
+
#
|
| 414 |
+
# dataset_manager_class =
|
| 415 |
+
|
| 416 |
+
# Kwargs to supply to dataset manager.
|
| 417 |
+
#
|
| 418 |
+
# Example: dataset_manager_kwargs = {"some_param": "some_value"}
|
| 419 |
+
#
|
| 420 |
+
# Variable: AIRFLOW__CORE__DATASET_MANAGER_KWARGS
|
| 421 |
+
#
|
| 422 |
+
# dataset_manager_kwargs =
|
| 423 |
+
|
| 424 |
+
# Dataset URI validation should raise an exception if it is not compliant with AIP-60.
|
| 425 |
+
# By default this configuration is false, meaning that Airflow 2.x only warns the user.
|
| 426 |
+
# In Airflow 3, this configuration will be enabled by default.
|
| 427 |
+
#
|
| 428 |
+
# Variable: AIRFLOW__CORE__STRICT_DATASET_URI_VALIDATION
|
| 429 |
+
#
|
| 430 |
+
strict_dataset_uri_validation = False
|
| 431 |
+
|
| 432 |
+
# (experimental) Whether components should use Airflow Internal API for DB connectivity.
|
| 433 |
+
#
|
| 434 |
+
# Variable: AIRFLOW__CORE__DATABASE_ACCESS_ISOLATION
|
| 435 |
+
#
|
| 436 |
+
database_access_isolation = False
|
| 437 |
+
|
| 438 |
+
# (experimental) Airflow Internal API url.
|
| 439 |
+
# Only used if ``[core] database_access_isolation`` is ``True``.
|
| 440 |
+
#
|
| 441 |
+
# Example: internal_api_url = http://localhost:8080
|
| 442 |
+
#
|
| 443 |
+
# Variable: AIRFLOW__CORE__INTERNAL_API_URL
|
| 444 |
+
#
|
| 445 |
+
# internal_api_url =
|
| 446 |
+
|
| 447 |
+
# Secret key used to authenticate internal API clients to core. It should be as random as possible.
|
| 448 |
+
# However, when running more than 1 instances of webserver / internal API services, make sure all
|
| 449 |
+
# of them use the same ``secret_key`` otherwise calls will fail on authentication.
|
| 450 |
+
# The authentication token generated using the secret key has a short expiry time though - make
|
| 451 |
+
# sure that time on ALL the machines that you run airflow components on is synchronized
|
| 452 |
+
# (for example using ntpd) otherwise you might get "forbidden" errors when the logs are accessed.
|
| 453 |
+
#
|
| 454 |
+
# Variable: AIRFLOW__CORE__INTERNAL_API_SECRET_KEY
|
| 455 |
+
#
|
| 456 |
+
internal_api_secret_key = JRNP2IC4kIaVxisy9+AW4A==
|
| 457 |
+
|
| 458 |
+
# The ability to allow testing connections across Airflow UI, API and CLI.
|
| 459 |
+
# Supported options: ``Disabled``, ``Enabled``, ``Hidden``. Default: Disabled
|
| 460 |
+
# Disabled - Disables the test connection functionality and disables the Test Connection button in UI.
|
| 461 |
+
# Enabled - Enables the test connection functionality and shows the Test Connection button in UI.
|
| 462 |
+
# Hidden - Disables the test connection functionality and hides the Test Connection button in UI.
|
| 463 |
+
# Before setting this to Enabled, make sure that you review the users who are able to add/edit
|
| 464 |
+
# connections and ensure they are trusted. Connection testing can be done maliciously leading to
|
| 465 |
+
# undesired and insecure outcomes.
|
| 466 |
+
# See `Airflow Security Model: Capabilities of authenticated UI users
|
| 467 |
+
# <https://airflow.apache.org/docs/apache-airflow/stable/security/security_model.html#capabilities-of-authenticated-ui-users>`__
|
| 468 |
+
# for more details.
|
| 469 |
+
#
|
| 470 |
+
# Variable: AIRFLOW__CORE__TEST_CONNECTION
|
| 471 |
+
#
|
| 472 |
+
test_connection = Disabled
|
| 473 |
+
|
| 474 |
+
# The maximum length of the rendered template field. If the value to be stored in the
|
| 475 |
+
# rendered template field exceeds this size, it's redacted.
|
| 476 |
+
#
|
| 477 |
+
# Variable: AIRFLOW__CORE__MAX_TEMPLATED_FIELD_LENGTH
|
| 478 |
+
#
|
| 479 |
+
max_templated_field_length = 4096
|
| 480 |
+
|
| 481 |
+
[database]
|
| 482 |
+
# Path to the ``alembic.ini`` file. You can either provide the file path relative
|
| 483 |
+
# to the Airflow home directory or the absolute path if it is located elsewhere.
|
| 484 |
+
#
|
| 485 |
+
# Variable: AIRFLOW__DATABASE__ALEMBIC_INI_FILE_PATH
|
| 486 |
+
#
|
| 487 |
+
alembic_ini_file_path = alembic.ini
|
| 488 |
+
|
| 489 |
+
# The SQLAlchemy connection string to the metadata database.
|
| 490 |
+
# SQLAlchemy supports many different database engines.
|
| 491 |
+
# See: `Set up a Database Backend: Database URI
|
| 492 |
+
# <https://airflow.apache.org/docs/apache-airflow/stable/howto/set-up-database.html#database-uri>`__
|
| 493 |
+
# for more details.
|
| 494 |
+
#
|
| 495 |
+
# Variable: AIRFLOW__DATABASE__SQL_ALCHEMY_CONN
|
| 496 |
+
#
|
| 497 |
+
sql_alchemy_conn = sqlite:////kaggle/working/BTC-USDT-ETL-Pipeline/airflow/airflow.db
|
| 498 |
+
|
| 499 |
+
# Extra engine specific keyword args passed to SQLAlchemy's create_engine, as a JSON-encoded value
|
| 500 |
+
#
|
| 501 |
+
# Example: sql_alchemy_engine_args = {"arg1": true}
|
| 502 |
+
#
|
| 503 |
+
# Variable: AIRFLOW__DATABASE__SQL_ALCHEMY_ENGINE_ARGS
|
| 504 |
+
#
|
| 505 |
+
# sql_alchemy_engine_args =
|
| 506 |
+
|
| 507 |
+
# The encoding for the databases
|
| 508 |
+
#
|
| 509 |
+
# Variable: AIRFLOW__DATABASE__SQL_ENGINE_ENCODING
|
| 510 |
+
#
|
| 511 |
+
sql_engine_encoding = utf-8
|
| 512 |
+
|
| 513 |
+
# Collation for ``dag_id``, ``task_id``, ``key``, ``external_executor_id`` columns
|
| 514 |
+
# in case they have different encoding.
|
| 515 |
+
# By default this collation is the same as the database collation, however for ``mysql`` and ``mariadb``
|
| 516 |
+
# the default is ``utf8mb3_bin`` so that the index sizes of our index keys will not exceed
|
| 517 |
+
# the maximum size of allowed index when collation is set to ``utf8mb4`` variant, see
|
| 518 |
+
# `GitHub Issue Comment <https://github.com/apache/airflow/pull/17603#issuecomment-901121618>`__
|
| 519 |
+
# for more details.
|
| 520 |
+
#
|
| 521 |
+
# Variable: AIRFLOW__DATABASE__SQL_ENGINE_COLLATION_FOR_IDS
|
| 522 |
+
#
|
| 523 |
+
# sql_engine_collation_for_ids =
|
| 524 |
+
|
| 525 |
+
# If SQLAlchemy should pool database connections.
|
| 526 |
+
#
|
| 527 |
+
# Variable: AIRFLOW__DATABASE__SQL_ALCHEMY_POOL_ENABLED
|
| 528 |
+
#
|
| 529 |
+
sql_alchemy_pool_enabled = True
|
| 530 |
+
|
| 531 |
+
# The SQLAlchemy pool size is the maximum number of database connections
|
| 532 |
+
# in the pool. 0 indicates no limit.
|
| 533 |
+
#
|
| 534 |
+
# Variable: AIRFLOW__DATABASE__SQL_ALCHEMY_POOL_SIZE
|
| 535 |
+
#
|
| 536 |
+
sql_alchemy_pool_size = 5
|
| 537 |
+
|
| 538 |
+
# The maximum overflow size of the pool.
|
| 539 |
+
# When the number of checked-out connections reaches the size set in pool_size,
|
| 540 |
+
# additional connections will be returned up to this limit.
|
| 541 |
+
# When those additional connections are returned to the pool, they are disconnected and discarded.
|
| 542 |
+
# It follows then that the total number of simultaneous connections the pool will allow
|
| 543 |
+
# is **pool_size** + **max_overflow**,
|
| 544 |
+
# and the total number of "sleeping" connections the pool will allow is pool_size.
|
| 545 |
+
# max_overflow can be set to ``-1`` to indicate no overflow limit;
|
| 546 |
+
# no limit will be placed on the total number of concurrent connections. Defaults to ``10``.
|
| 547 |
+
#
|
| 548 |
+
# Variable: AIRFLOW__DATABASE__SQL_ALCHEMY_MAX_OVERFLOW
|
| 549 |
+
#
|
| 550 |
+
sql_alchemy_max_overflow = 10
|
| 551 |
+
|
| 552 |
+
# The SQLAlchemy pool recycle is the number of seconds a connection
|
| 553 |
+
# can be idle in the pool before it is invalidated. This config does
|
| 554 |
+
# not apply to sqlite. If the number of DB connections is ever exceeded,
|
| 555 |
+
# a lower config value will allow the system to recover faster.
|
| 556 |
+
#
|
| 557 |
+
# Variable: AIRFLOW__DATABASE__SQL_ALCHEMY_POOL_RECYCLE
|
| 558 |
+
#
|
| 559 |
+
sql_alchemy_pool_recycle = 1800
|
| 560 |
+
|
| 561 |
+
# Check connection at the start of each connection pool checkout.
|
| 562 |
+
# Typically, this is a simple statement like "SELECT 1".
|
| 563 |
+
# See `SQLAlchemy Pooling: Disconnect Handling - Pessimistic
|
| 564 |
+
# <https://docs.sqlalchemy.org/en/14/core/pooling.html#disconnect-handling-pessimistic>`__
|
| 565 |
+
# for more details.
|
| 566 |
+
#
|
| 567 |
+
# Variable: AIRFLOW__DATABASE__SQL_ALCHEMY_POOL_PRE_PING
|
| 568 |
+
#
|
| 569 |
+
sql_alchemy_pool_pre_ping = True
|
| 570 |
+
|
| 571 |
+
# The schema to use for the metadata database.
|
| 572 |
+
# SQLAlchemy supports databases with the concept of multiple schemas.
|
| 573 |
+
#
|
| 574 |
+
# Variable: AIRFLOW__DATABASE__SQL_ALCHEMY_SCHEMA
|
| 575 |
+
#
|
| 576 |
+
sql_alchemy_schema =
|
| 577 |
+
|
| 578 |
+
# Import path for connect args in SQLAlchemy. Defaults to an empty dict.
|
| 579 |
+
# This is useful when you want to configure db engine args that SQLAlchemy won't parse
|
| 580 |
+
# in connection string. This can be set by passing a dictionary containing the create engine parameters.
|
| 581 |
+
# For more details about passing create engine parameters (keepalives variables, timeout etc)
|
| 582 |
+
# in Postgres DB Backend see `Setting up a PostgreSQL Database
|
| 583 |
+
# <https://airflow.apache.org/docs/apache-airflow/stable/howto/set-up-database.html#setting-up-a-postgresql-database>`__
|
| 584 |
+
# e.g ``connect_args={"timeout":30}`` can be defined in ``airflow_local_settings.py`` and
|
| 585 |
+
# can be imported as shown below
|
| 586 |
+
#
|
| 587 |
+
# Example: sql_alchemy_connect_args = airflow_local_settings.connect_args
|
| 588 |
+
#
|
| 589 |
+
# Variable: AIRFLOW__DATABASE__SQL_ALCHEMY_CONNECT_ARGS
|
| 590 |
+
#
|
| 591 |
+
# sql_alchemy_connect_args =
|
| 592 |
+
|
| 593 |
+
# Important Warning: Use of sql_alchemy_session_maker Highly Discouraged
|
| 594 |
+
# Import path for function which returns 'sqlalchemy.orm.sessionmaker'.
|
| 595 |
+
# Improper configuration of sql_alchemy_session_maker can lead to serious issues,
|
| 596 |
+
# including data corruption and unrecoverable application crashes. Please review the SQLAlchemy
|
| 597 |
+
# documentation for detailed guidance on proper configuration and best practices.
|
| 598 |
+
#
|
| 599 |
+
# Example: sql_alchemy_session_maker = airflow_local_settings._sessionmaker
|
| 600 |
+
#
|
| 601 |
+
# Variable: AIRFLOW__DATABASE__SQL_ALCHEMY_SESSION_MAKER
|
| 602 |
+
#
|
| 603 |
+
# sql_alchemy_session_maker =
|
| 604 |
+
|
| 605 |
+
# Whether to load the default connections that ship with Airflow when ``airflow db init`` is called.
|
| 606 |
+
# It's good to get started, but you probably want to set this to ``False`` in a production environment.
|
| 607 |
+
#
|
| 608 |
+
# Variable: AIRFLOW__DATABASE__LOAD_DEFAULT_CONNECTIONS
|
| 609 |
+
#
|
| 610 |
+
load_default_connections = True
|
| 611 |
+
|
| 612 |
+
# Number of times the code should be retried in case of DB Operational Errors.
|
| 613 |
+
# Not all transactions will be retried as it can cause undesired state.
|
| 614 |
+
# Currently it is only used in ``DagFileProcessor.process_file`` to retry ``dagbag.sync_to_db``.
|
| 615 |
+
#
|
| 616 |
+
# Variable: AIRFLOW__DATABASE__MAX_DB_RETRIES
|
| 617 |
+
#
|
| 618 |
+
max_db_retries = 3
|
| 619 |
+
|
| 620 |
+
# Whether to run alembic migrations during Airflow start up. Sometimes this operation can be expensive,
|
| 621 |
+
# and the users can assert the correct version through other means (e.g. through a Helm chart).
|
| 622 |
+
# Accepts ``True`` or ``False``.
|
| 623 |
+
#
|
| 624 |
+
# Variable: AIRFLOW__DATABASE__CHECK_MIGRATIONS
|
| 625 |
+
#
|
| 626 |
+
check_migrations = True
|
| 627 |
+
|
| 628 |
+
[logging]
|
| 629 |
+
# The folder where airflow should store its log files.
|
| 630 |
+
# This path must be absolute.
|
| 631 |
+
# There are a few existing configurations that assume this is set to the default.
|
| 632 |
+
# If you choose to override this you may need to update the
|
| 633 |
+
# ``[logging] dag_processor_manager_log_location`` and
|
| 634 |
+
# ``[logging] child_process_log_directory settings`` as well.
|
| 635 |
+
#
|
| 636 |
+
# Variable: AIRFLOW__LOGGING__BASE_LOG_FOLDER
|
| 637 |
+
#
|
| 638 |
+
base_log_folder = /kaggle/working/BTC-USDT-ETL-Pipeline/airflow/logs
|
| 639 |
+
|
| 640 |
+
# Airflow can store logs remotely in AWS S3, Google Cloud Storage or Elastic Search.
|
| 641 |
+
# Set this to ``True`` if you want to enable remote logging.
|
| 642 |
+
#
|
| 643 |
+
# Variable: AIRFLOW__LOGGING__REMOTE_LOGGING
|
| 644 |
+
#
|
| 645 |
+
remote_logging = False
|
| 646 |
+
|
| 647 |
+
# Users must supply an Airflow connection id that provides access to the storage
|
| 648 |
+
# location. Depending on your remote logging service, this may only be used for
|
| 649 |
+
# reading logs, not writing them.
|
| 650 |
+
#
|
| 651 |
+
# Variable: AIRFLOW__LOGGING__REMOTE_LOG_CONN_ID
|
| 652 |
+
#
|
| 653 |
+
remote_log_conn_id =
|
| 654 |
+
|
| 655 |
+
# Whether the local log files for GCS, S3, WASB and OSS remote logging should be deleted after
|
| 656 |
+
# they are uploaded to the remote location.
|
| 657 |
+
#
|
| 658 |
+
# Variable: AIRFLOW__LOGGING__DELETE_LOCAL_LOGS
|
| 659 |
+
#
|
| 660 |
+
delete_local_logs = False
|
| 661 |
+
|
| 662 |
+
# Path to Google Credential JSON file. If omitted, authorization based on `the Application Default
|
| 663 |
+
# Credentials
|
| 664 |
+
# <https://cloud.google.com/docs/authentication/application-default-credentials>`__ will
|
| 665 |
+
# be used.
|
| 666 |
+
#
|
| 667 |
+
# Variable: AIRFLOW__LOGGING__GOOGLE_KEY_PATH
|
| 668 |
+
#
|
| 669 |
+
google_key_path =
|
| 670 |
+
|
| 671 |
+
# Storage bucket URL for remote logging
|
| 672 |
+
# S3 buckets should start with **s3://**
|
| 673 |
+
# Cloudwatch log groups should start with **cloudwatch://**
|
| 674 |
+
# GCS buckets should start with **gs://**
|
| 675 |
+
# WASB buckets should start with **wasb** just to help Airflow select correct handler
|
| 676 |
+
# Stackdriver logs should start with **stackdriver://**
|
| 677 |
+
#
|
| 678 |
+
# Variable: AIRFLOW__LOGGING__REMOTE_BASE_LOG_FOLDER
|
| 679 |
+
#
|
| 680 |
+
remote_base_log_folder =
|
| 681 |
+
|
| 682 |
+
# The remote_task_handler_kwargs param is loaded into a dictionary and passed to the ``__init__``
|
| 683 |
+
# of remote task handler and it overrides the values provided by Airflow config. For example if you set
|
| 684 |
+
# ``delete_local_logs=False`` and you provide ``{"delete_local_copy": true}``, then the local
|
| 685 |
+
# log files will be deleted after they are uploaded to remote location.
|
| 686 |
+
#
|
| 687 |
+
# Example: remote_task_handler_kwargs = {"delete_local_copy": true}
|
| 688 |
+
#
|
| 689 |
+
# Variable: AIRFLOW__LOGGING__REMOTE_TASK_HANDLER_KWARGS
|
| 690 |
+
#
|
| 691 |
+
remote_task_handler_kwargs =
|
| 692 |
+
|
| 693 |
+
# Use server-side encryption for logs stored in S3
|
| 694 |
+
#
|
| 695 |
+
# Variable: AIRFLOW__LOGGING__ENCRYPT_S3_LOGS
|
| 696 |
+
#
|
| 697 |
+
encrypt_s3_logs = False
|
| 698 |
+
|
| 699 |
+
# Logging level.
|
| 700 |
+
#
|
| 701 |
+
# Supported values: ``CRITICAL``, ``ERROR``, ``WARNING``, ``INFO``, ``DEBUG``.
|
| 702 |
+
#
|
| 703 |
+
# Variable: AIRFLOW__LOGGING__LOGGING_LEVEL
|
| 704 |
+
#
|
| 705 |
+
logging_level = INFO
|
| 706 |
+
|
| 707 |
+
# Logging level for celery. If not set, it uses the value of logging_level
|
| 708 |
+
#
|
| 709 |
+
# Supported values: ``CRITICAL``, ``ERROR``, ``WARNING``, ``INFO``, ``DEBUG``.
|
| 710 |
+
#
|
| 711 |
+
# Variable: AIRFLOW__LOGGING__CELERY_LOGGING_LEVEL
|
| 712 |
+
#
|
| 713 |
+
celery_logging_level =
|
| 714 |
+
|
| 715 |
+
# Logging level for Flask-appbuilder UI.
|
| 716 |
+
#
|
| 717 |
+
# Supported values: ``CRITICAL``, ``ERROR``, ``WARNING``, ``INFO``, ``DEBUG``.
|
| 718 |
+
#
|
| 719 |
+
# Variable: AIRFLOW__LOGGING__FAB_LOGGING_LEVEL
|
| 720 |
+
#
|
| 721 |
+
fab_logging_level = WARNING
|
| 722 |
+
|
| 723 |
+
# Logging class
|
| 724 |
+
# Specify the class that will specify the logging configuration
|
| 725 |
+
# This class has to be on the python classpath
|
| 726 |
+
#
|
| 727 |
+
# Example: logging_config_class = my.path.default_local_settings.LOGGING_CONFIG
|
| 728 |
+
#
|
| 729 |
+
# Variable: AIRFLOW__LOGGING__LOGGING_CONFIG_CLASS
|
| 730 |
+
#
|
| 731 |
+
logging_config_class =
|
| 732 |
+
|
| 733 |
+
# Flag to enable/disable Colored logs in Console
|
| 734 |
+
# Colour the logs when the controlling terminal is a TTY.
|
| 735 |
+
#
|
| 736 |
+
# Variable: AIRFLOW__LOGGING__COLORED_CONSOLE_LOG
|
| 737 |
+
#
|
| 738 |
+
colored_console_log = True
|
| 739 |
+
|
| 740 |
+
# Log format for when Colored logs is enabled
|
| 741 |
+
#
|
| 742 |
+
# Variable: AIRFLOW__LOGGING__COLORED_LOG_FORMAT
|
| 743 |
+
#
|
| 744 |
+
colored_log_format = [%%(blue)s%%(asctime)s%%(reset)s] {%%(blue)s%%(filename)s:%%(reset)s%%(lineno)d} %%(log_color)s%%(levelname)s%%(reset)s - %%(log_color)s%%(message)s%%(reset)s
|
| 745 |
+
|
| 746 |
+
# Specifies the class utilized by Airflow to implement colored logging
|
| 747 |
+
#
|
| 748 |
+
# Variable: AIRFLOW__LOGGING__COLORED_FORMATTER_CLASS
|
| 749 |
+
#
|
| 750 |
+
colored_formatter_class = airflow.utils.log.colored_log.CustomTTYColoredFormatter
|
| 751 |
+
|
| 752 |
+
# Format of Log line
|
| 753 |
+
#
|
| 754 |
+
# Variable: AIRFLOW__LOGGING__LOG_FORMAT
|
| 755 |
+
#
|
| 756 |
+
log_format = [%%(asctime)s] {%%(filename)s:%%(lineno)d} %%(levelname)s - %%(message)s
|
| 757 |
+
|
| 758 |
+
# Defines the format of log messages for simple logging configuration
|
| 759 |
+
#
|
| 760 |
+
# Variable: AIRFLOW__LOGGING__SIMPLE_LOG_FORMAT
|
| 761 |
+
#
|
| 762 |
+
simple_log_format = %%(asctime)s %%(levelname)s - %%(message)s
|
| 763 |
+
|
| 764 |
+
# Where to send dag parser logs. If "file", logs are sent to log files defined by child_process_log_directory.
|
| 765 |
+
#
|
| 766 |
+
# Variable: AIRFLOW__LOGGING__DAG_PROCESSOR_LOG_TARGET
|
| 767 |
+
#
|
| 768 |
+
dag_processor_log_target = file
|
| 769 |
+
|
| 770 |
+
# Format of Dag Processor Log line
|
| 771 |
+
#
|
| 772 |
+
# Variable: AIRFLOW__LOGGING__DAG_PROCESSOR_LOG_FORMAT
|
| 773 |
+
#
|
| 774 |
+
dag_processor_log_format = [%%(asctime)s] [SOURCE:DAG_PROCESSOR] {%%(filename)s:%%(lineno)d} %%(levelname)s - %%(message)s
|
| 775 |
+
|
| 776 |
+
# Determines the formatter class used by Airflow for structuring its log messages
|
| 777 |
+
# The default formatter class is timezone-aware, which means that timestamps attached to log entries
|
| 778 |
+
# will be adjusted to reflect the local timezone of the Airflow instance
|
| 779 |
+
#
|
| 780 |
+
# Variable: AIRFLOW__LOGGING__LOG_FORMATTER_CLASS
|
| 781 |
+
#
|
| 782 |
+
log_formatter_class = airflow.utils.log.timezone_aware.TimezoneAware
|
| 783 |
+
|
| 784 |
+
# An import path to a function to add adaptations of each secret added with
|
| 785 |
+
# ``airflow.utils.log.secrets_masker.mask_secret`` to be masked in log messages. The given function
|
| 786 |
+
# is expected to require a single parameter: the secret to be adapted. It may return a
|
| 787 |
+
# single adaptation of the secret or an iterable of adaptations to each be masked as secrets.
|
| 788 |
+
# The original secret will be masked as well as any adaptations returned.
|
| 789 |
+
#
|
| 790 |
+
# Example: secret_mask_adapter = urllib.parse.quote
|
| 791 |
+
#
|
| 792 |
+
# Variable: AIRFLOW__LOGGING__SECRET_MASK_ADAPTER
|
| 793 |
+
#
|
| 794 |
+
secret_mask_adapter =
|
| 795 |
+
|
| 796 |
+
# Specify prefix pattern like mentioned below with stream handler ``TaskHandlerWithCustomFormatter``
|
| 797 |
+
#
|
| 798 |
+
# Example: task_log_prefix_template = {{ti.dag_id}}-{{ti.task_id}}-{{execution_date}}-{{ti.try_number}}
|
| 799 |
+
#
|
| 800 |
+
# Variable: AIRFLOW__LOGGING__TASK_LOG_PREFIX_TEMPLATE
|
| 801 |
+
#
|
| 802 |
+
task_log_prefix_template =
|
| 803 |
+
|
| 804 |
+
# Formatting for how airflow generates file names/paths for each task run.
|
| 805 |
+
#
|
| 806 |
+
# Variable: AIRFLOW__LOGGING__LOG_FILENAME_TEMPLATE
|
| 807 |
+
#
|
| 808 |
+
log_filename_template = dag_id={{ ti.dag_id }}/run_id={{ ti.run_id }}/task_id={{ ti.task_id }}/{%% if ti.map_index >= 0 %%}map_index={{ ti.map_index }}/{%% endif %%}attempt={{ try_number }}.log
|
| 809 |
+
|
| 810 |
+
# Formatting for how airflow generates file names for log
|
| 811 |
+
#
|
| 812 |
+
# Variable: AIRFLOW__LOGGING__LOG_PROCESSOR_FILENAME_TEMPLATE
|
| 813 |
+
#
|
| 814 |
+
log_processor_filename_template = {{ filename }}.log
|
| 815 |
+
|
| 816 |
+
# Full path of dag_processor_manager logfile.
|
| 817 |
+
#
|
| 818 |
+
# Variable: AIRFLOW__LOGGING__DAG_PROCESSOR_MANAGER_LOG_LOCATION
|
| 819 |
+
#
|
| 820 |
+
dag_processor_manager_log_location = /kaggle/working/BTC-USDT-ETL-Pipeline/airflow/logs/dag_processor_manager/dag_processor_manager.log
|
| 821 |
+
|
| 822 |
+
# Whether DAG processor manager will write logs to stdout
|
| 823 |
+
#
|
| 824 |
+
# Variable: AIRFLOW__LOGGING__DAG_PROCESSOR_MANAGER_LOG_STDOUT
|
| 825 |
+
#
|
| 826 |
+
dag_processor_manager_log_stdout = False
|
| 827 |
+
|
| 828 |
+
# Name of handler to read task instance logs.
|
| 829 |
+
# Defaults to use ``task`` handler.
|
| 830 |
+
#
|
| 831 |
+
# Variable: AIRFLOW__LOGGING__TASK_LOG_READER
|
| 832 |
+
#
|
| 833 |
+
task_log_reader = task
|
| 834 |
+
|
| 835 |
+
# A comma-separated list of third-party logger names that will be configured to print messages to
|
| 836 |
+
# consoles.
|
| 837 |
+
#
|
| 838 |
+
# Example: extra_logger_names = connexion,sqlalchemy
|
| 839 |
+
#
|
| 840 |
+
# Variable: AIRFLOW__LOGGING__EXTRA_LOGGER_NAMES
|
| 841 |
+
#
|
| 842 |
+
extra_logger_names =
|
| 843 |
+
|
| 844 |
+
# When you start an Airflow worker, Airflow starts a tiny web server
|
| 845 |
+
# subprocess to serve the workers local log files to the airflow main
|
| 846 |
+
# web server, who then builds pages and sends them to users. This defines
|
| 847 |
+
# the port on which the logs are served. It needs to be unused, and open
|
| 848 |
+
# visible from the main web server to connect into the workers.
|
| 849 |
+
#
|
| 850 |
+
# Variable: AIRFLOW__LOGGING__WORKER_LOG_SERVER_PORT
|
| 851 |
+
#
|
| 852 |
+
worker_log_server_port = 8793
|
| 853 |
+
|
| 854 |
+
# Port to serve logs from for triggerer.
|
| 855 |
+
# See ``[logging] worker_log_server_port`` description for more info.
|
| 856 |
+
#
|
| 857 |
+
# Variable: AIRFLOW__LOGGING__TRIGGER_LOG_SERVER_PORT
|
| 858 |
+
#
|
| 859 |
+
trigger_log_server_port = 8794
|
| 860 |
+
|
| 861 |
+
# We must parse timestamps to interleave logs between trigger and task. To do so,
|
| 862 |
+
# we need to parse timestamps in log files. In case your log format is non-standard,
|
| 863 |
+
# you may provide import path to callable which takes a string log line and returns
|
| 864 |
+
# the timestamp (datetime.datetime compatible).
|
| 865 |
+
#
|
| 866 |
+
# Example: interleave_timestamp_parser = path.to.my_func
|
| 867 |
+
#
|
| 868 |
+
# Variable: AIRFLOW__LOGGING__INTERLEAVE_TIMESTAMP_PARSER
|
| 869 |
+
#
|
| 870 |
+
# interleave_timestamp_parser =
|
| 871 |
+
|
| 872 |
+
# Permissions in the form of an octal string as understood by chmod. The permissions are important
|
| 873 |
+
# when you use impersonation, when logs are written by a different user than airflow. The most secure
|
| 874 |
+
# way of configuring it in this case is to add both users to the same group and make it the default
|
| 875 |
+
# group of both users. Group-writeable logs are default in airflow, but you might decide that you are
|
| 876 |
+
# OK with having the logs other-writeable, in which case you should set it to ``0o777``. You might
|
| 877 |
+
# decide to add more security if you do not use impersonation and change it to ``0o755`` to make it
|
| 878 |
+
# only owner-writeable. You can also make it just readable only for owner by changing it to ``0o700``
|
| 879 |
+
# if all the access (read/write) for your logs happens from the same user.
|
| 880 |
+
#
|
| 881 |
+
# Example: file_task_handler_new_folder_permissions = 0o775
|
| 882 |
+
#
|
| 883 |
+
# Variable: AIRFLOW__LOGGING__FILE_TASK_HANDLER_NEW_FOLDER_PERMISSIONS
|
| 884 |
+
#
|
| 885 |
+
file_task_handler_new_folder_permissions = 0o775
|
| 886 |
+
|
| 887 |
+
# Permissions in the form of an octal string as understood by chmod. The permissions are important
|
| 888 |
+
# when you use impersonation, when logs are written by a different user than airflow. The most secure
|
| 889 |
+
# way of configuring it in this case is to add both users to the same group and make it the default
|
| 890 |
+
# group of both users. Group-writeable logs are default in airflow, but you might decide that you are
|
| 891 |
+
# OK with having the logs other-writeable, in which case you should set it to ``0o666``. You might
|
| 892 |
+
# decide to add more security if you do not use impersonation and change it to ``0o644`` to make it
|
| 893 |
+
# only owner-writeable. You can also make it just readable only for owner by changing it to ``0o600``
|
| 894 |
+
# if all the access (read/write) for your logs happens from the same user.
|
| 895 |
+
#
|
| 896 |
+
# Example: file_task_handler_new_file_permissions = 0o664
|
| 897 |
+
#
|
| 898 |
+
# Variable: AIRFLOW__LOGGING__FILE_TASK_HANDLER_NEW_FILE_PERMISSIONS
|
| 899 |
+
#
|
| 900 |
+
file_task_handler_new_file_permissions = 0o664
|
| 901 |
+
|
| 902 |
+
# By default Celery sends all logs into stderr.
|
| 903 |
+
# If enabled any previous logging handlers will get *removed*.
|
| 904 |
+
# With this option Airflow will create new handlers
|
| 905 |
+
# and send low level logs like INFO and WARNING to stdout,
|
| 906 |
+
# while sending higher severity logs to stderr.
|
| 907 |
+
#
|
| 908 |
+
# Variable: AIRFLOW__LOGGING__CELERY_STDOUT_STDERR_SEPARATION
|
| 909 |
+
#
|
| 910 |
+
celery_stdout_stderr_separation = False
|
| 911 |
+
|
| 912 |
+
# If enabled, Airflow may ship messages to task logs from outside the task run context, e.g. from
|
| 913 |
+
# the scheduler, executor, or callback execution context. This can help in circumstances such as
|
| 914 |
+
# when there's something blocking the execution of the task and ordinarily there may be no task
|
| 915 |
+
# logs at all.
|
| 916 |
+
# This is set to ``True`` by default. If you encounter issues with this feature
|
| 917 |
+
# (e.g. scheduler performance issues) it can be disabled.
|
| 918 |
+
#
|
| 919 |
+
# Variable: AIRFLOW__LOGGING__ENABLE_TASK_CONTEXT_LOGGER
|
| 920 |
+
#
|
| 921 |
+
enable_task_context_logger = True
|
| 922 |
+
|
| 923 |
+
# A comma separated list of keywords related to errors whose presence should display the line in red
|
| 924 |
+
# color in UI
|
| 925 |
+
#
|
| 926 |
+
# Variable: AIRFLOW__LOGGING__COLOR_LOG_ERROR_KEYWORDS
|
| 927 |
+
#
|
| 928 |
+
color_log_error_keywords = error,exception
|
| 929 |
+
|
| 930 |
+
# A comma separated list of keywords related to warning whose presence should display the line in yellow
|
| 931 |
+
# color in UI
|
| 932 |
+
#
|
| 933 |
+
# Variable: AIRFLOW__LOGGING__COLOR_LOG_WARNING_KEYWORDS
|
| 934 |
+
#
|
| 935 |
+
color_log_warning_keywords = warn
|
| 936 |
+
|
| 937 |
+
[metrics]
|
| 938 |
+
# `StatsD <https://github.com/statsd/statsd>`__ integration settings.
|
| 939 |
+
|
| 940 |
+
# If true, ``[metrics] metrics_allow_list`` and ``[metrics] metrics_block_list`` will use
|
| 941 |
+
# regex pattern matching anywhere within the metric name instead of only prefix matching
|
| 942 |
+
# at the start of the name.
|
| 943 |
+
#
|
| 944 |
+
# Variable: AIRFLOW__METRICS__METRICS_USE_PATTERN_MATCH
|
| 945 |
+
#
|
| 946 |
+
metrics_use_pattern_match = False
|
| 947 |
+
|
| 948 |
+
# Configure an allow list (comma separated string) to send only certain metrics.
|
| 949 |
+
# If ``[metrics] metrics_use_pattern_match`` is ``false``, match only the exact metric name prefix.
|
| 950 |
+
# If ``[metrics] metrics_use_pattern_match`` is ``true``, provide regex patterns to match.
|
| 951 |
+
#
|
| 952 |
+
# Example: metrics_allow_list = "scheduler,executor,dagrun,pool,triggerer,celery" or "^scheduler,^executor,heartbeat|timeout"
|
| 953 |
+
#
|
| 954 |
+
# Variable: AIRFLOW__METRICS__METRICS_ALLOW_LIST
|
| 955 |
+
#
|
| 956 |
+
metrics_allow_list =
|
| 957 |
+
|
| 958 |
+
# Configure a block list (comma separated string) to block certain metrics from being emitted.
|
| 959 |
+
# If ``[metrics] metrics_allow_list`` and ``[metrics] metrics_block_list`` are both configured,
|
| 960 |
+
# ``[metrics] metrics_block_list`` is ignored.
|
| 961 |
+
#
|
| 962 |
+
# If ``[metrics] metrics_use_pattern_match`` is ``false``, match only the exact metric name prefix.
|
| 963 |
+
#
|
| 964 |
+
# If ``[metrics] metrics_use_pattern_match`` is ``true``, provide regex patterns to match.
|
| 965 |
+
#
|
| 966 |
+
# Example: metrics_block_list = "scheduler,executor,dagrun,pool,triggerer,celery" or "^scheduler,^executor,heartbeat|timeout"
|
| 967 |
+
#
|
| 968 |
+
# Variable: AIRFLOW__METRICS__METRICS_BLOCK_LIST
|
| 969 |
+
#
|
| 970 |
+
metrics_block_list =
|
| 971 |
+
|
| 972 |
+
# Enables sending metrics to StatsD.
|
| 973 |
+
#
|
| 974 |
+
# Variable: AIRFLOW__METRICS__STATSD_ON
|
| 975 |
+
#
|
| 976 |
+
statsd_on = False
|
| 977 |
+
|
| 978 |
+
# Specifies the host address where the StatsD daemon (or server) is running
|
| 979 |
+
#
|
| 980 |
+
# Variable: AIRFLOW__METRICS__STATSD_HOST
|
| 981 |
+
#
|
| 982 |
+
statsd_host = localhost
|
| 983 |
+
|
| 984 |
+
# Specifies the port on which the StatsD daemon (or server) is listening
|
| 985 |
+
#
|
| 986 |
+
# Variable: AIRFLOW__METRICS__STATSD_PORT
|
| 987 |
+
#
|
| 988 |
+
statsd_port = 8125
|
| 989 |
+
|
| 990 |
+
# Defines the namespace for all metrics sent from Airflow to StatsD
|
| 991 |
+
#
|
| 992 |
+
# Variable: AIRFLOW__METRICS__STATSD_PREFIX
|
| 993 |
+
#
|
| 994 |
+
statsd_prefix = airflow
|
| 995 |
+
|
| 996 |
+
# A function that validates the StatsD stat name, applies changes to the stat name if necessary and returns
|
| 997 |
+
# the transformed stat name.
|
| 998 |
+
#
|
| 999 |
+
# The function should have the following signature
|
| 1000 |
+
#
|
| 1001 |
+
# .. code-block:: python
|
| 1002 |
+
#
|
| 1003 |
+
# def func_name(stat_name: str) -> str: ...
|
| 1004 |
+
#
|
| 1005 |
+
# Variable: AIRFLOW__METRICS__STAT_NAME_HANDLER
|
| 1006 |
+
#
|
| 1007 |
+
stat_name_handler =
|
| 1008 |
+
|
| 1009 |
+
# To enable datadog integration to send airflow metrics.
|
| 1010 |
+
#
|
| 1011 |
+
# Variable: AIRFLOW__METRICS__STATSD_DATADOG_ENABLED
|
| 1012 |
+
#
|
| 1013 |
+
statsd_datadog_enabled = False
|
| 1014 |
+
|
| 1015 |
+
# List of datadog tags attached to all metrics(e.g: ``key1:value1,key2:value2``)
|
| 1016 |
+
#
|
| 1017 |
+
# Variable: AIRFLOW__METRICS__STATSD_DATADOG_TAGS
|
| 1018 |
+
#
|
| 1019 |
+
statsd_datadog_tags =
|
| 1020 |
+
|
| 1021 |
+
# Set to ``False`` to disable metadata tags for some of the emitted metrics
|
| 1022 |
+
#
|
| 1023 |
+
# Variable: AIRFLOW__METRICS__STATSD_DATADOG_METRICS_TAGS
|
| 1024 |
+
#
|
| 1025 |
+
statsd_datadog_metrics_tags = True
|
| 1026 |
+
|
| 1027 |
+
# If you want to utilise your own custom StatsD client set the relevant
|
| 1028 |
+
# module path below.
|
| 1029 |
+
# Note: The module path must exist on your
|
| 1030 |
+
# `PYTHONPATH <https://docs.python.org/3/using/cmdline.html#envvar-PYTHONPATH>`
|
| 1031 |
+
# for Airflow to pick it up
|
| 1032 |
+
#
|
| 1033 |
+
# Variable: AIRFLOW__METRICS__STATSD_CUSTOM_CLIENT_PATH
|
| 1034 |
+
#
|
| 1035 |
+
# statsd_custom_client_path =
|
| 1036 |
+
|
| 1037 |
+
# If you want to avoid sending all the available metrics tags to StatsD,
|
| 1038 |
+
# you can configure a block list of prefixes (comma separated) to filter out metric tags
|
| 1039 |
+
# that start with the elements of the list (e.g: ``job_id,run_id``)
|
| 1040 |
+
#
|
| 1041 |
+
# Example: statsd_disabled_tags = job_id,run_id,dag_id,task_id
|
| 1042 |
+
#
|
| 1043 |
+
# Variable: AIRFLOW__METRICS__STATSD_DISABLED_TAGS
|
| 1044 |
+
#
|
| 1045 |
+
statsd_disabled_tags = job_id,run_id
|
| 1046 |
+
|
| 1047 |
+
# To enable sending Airflow metrics with StatsD-Influxdb tagging convention.
|
| 1048 |
+
#
|
| 1049 |
+
# Variable: AIRFLOW__METRICS__STATSD_INFLUXDB_ENABLED
|
| 1050 |
+
#
|
| 1051 |
+
statsd_influxdb_enabled = False
|
| 1052 |
+
|
| 1053 |
+
# Enables sending metrics to OpenTelemetry.
|
| 1054 |
+
#
|
| 1055 |
+
# Variable: AIRFLOW__METRICS__OTEL_ON
|
| 1056 |
+
#
|
| 1057 |
+
otel_on = False
|
| 1058 |
+
|
| 1059 |
+
# Specifies the hostname or IP address of the OpenTelemetry Collector to which Airflow sends
|
| 1060 |
+
# metrics and traces.
|
| 1061 |
+
#
|
| 1062 |
+
# Variable: AIRFLOW__METRICS__OTEL_HOST
|
| 1063 |
+
#
|
| 1064 |
+
otel_host = localhost
|
| 1065 |
+
|
| 1066 |
+
# Specifies the port on which the OpenTelemetry Collector is listening.
|
| 1067 |
+
#
|
| 1068 |
+
# Variable: AIRFLOW__METRICS__OTEL_PORT
|
| 1069 |
+
#
|
| 1070 |
+
otel_port = 8889
|
| 1071 |
+
|
| 1072 |
+
# The prefix for the Airflow metrics.
|
| 1073 |
+
#
|
| 1074 |
+
# Variable: AIRFLOW__METRICS__OTEL_PREFIX
|
| 1075 |
+
#
|
| 1076 |
+
otel_prefix = airflow
|
| 1077 |
+
|
| 1078 |
+
# Defines the interval, in milliseconds, at which Airflow sends batches of metrics and traces
|
| 1079 |
+
# to the configured OpenTelemetry Collector.
|
| 1080 |
+
#
|
| 1081 |
+
# Variable: AIRFLOW__METRICS__OTEL_INTERVAL_MILLISECONDS
|
| 1082 |
+
#
|
| 1083 |
+
otel_interval_milliseconds = 60000
|
| 1084 |
+
|
| 1085 |
+
# If ``True``, all metrics are also emitted to the console. Defaults to ``False``.
|
| 1086 |
+
#
|
| 1087 |
+
# Variable: AIRFLOW__METRICS__OTEL_DEBUGGING_ON
|
| 1088 |
+
#
|
| 1089 |
+
otel_debugging_on = False
|
| 1090 |
+
|
| 1091 |
+
# The default service name of traces.
|
| 1092 |
+
#
|
| 1093 |
+
# Variable: AIRFLOW__METRICS__OTEL_SERVICE
|
| 1094 |
+
#
|
| 1095 |
+
otel_service = Airflow
|
| 1096 |
+
|
| 1097 |
+
# If ``True``, SSL will be enabled. Defaults to ``False``.
|
| 1098 |
+
# To establish an HTTPS connection to the OpenTelemetry collector,
|
| 1099 |
+
# you need to configure the SSL certificate and key within the OpenTelemetry collector's
|
| 1100 |
+
# ``config.yml`` file.
|
| 1101 |
+
#
|
| 1102 |
+
# Variable: AIRFLOW__METRICS__OTEL_SSL_ACTIVE
|
| 1103 |
+
#
|
| 1104 |
+
otel_ssl_active = False
|
| 1105 |
+
|
| 1106 |
+
[traces]
|
| 1107 |
+
# Distributed traces integration settings.
|
| 1108 |
+
|
| 1109 |
+
# Enables sending traces to OpenTelemetry.
|
| 1110 |
+
#
|
| 1111 |
+
# Variable: AIRFLOW__TRACES__OTEL_ON
|
| 1112 |
+
#
|
| 1113 |
+
otel_on = False
|
| 1114 |
+
|
| 1115 |
+
# Specifies the hostname or IP address of the OpenTelemetry Collector to which Airflow sends
|
| 1116 |
+
# traces.
|
| 1117 |
+
#
|
| 1118 |
+
# Variable: AIRFLOW__TRACES__OTEL_HOST
|
| 1119 |
+
#
|
| 1120 |
+
otel_host = localhost
|
| 1121 |
+
|
| 1122 |
+
# Specifies the port on which the OpenTelemetry Collector is listening.
|
| 1123 |
+
#
|
| 1124 |
+
# Variable: AIRFLOW__TRACES__OTEL_PORT
|
| 1125 |
+
#
|
| 1126 |
+
otel_port = 8889
|
| 1127 |
+
|
| 1128 |
+
# The default service name of traces.
|
| 1129 |
+
#
|
| 1130 |
+
# Variable: AIRFLOW__TRACES__OTEL_SERVICE
|
| 1131 |
+
#
|
| 1132 |
+
otel_service = Airflow
|
| 1133 |
+
|
| 1134 |
+
# If True, all traces are also emitted to the console. Defaults to False.
|
| 1135 |
+
#
|
| 1136 |
+
# Variable: AIRFLOW__TRACES__OTEL_DEBUGGING_ON
|
| 1137 |
+
#
|
| 1138 |
+
otel_debugging_on = False
|
| 1139 |
+
|
| 1140 |
+
# If True, SSL will be enabled. Defaults to False.
|
| 1141 |
+
# To establish an HTTPS connection to the OpenTelemetry collector,
|
| 1142 |
+
# you need to configure the SSL certificate and key within the OpenTelemetry collector's
|
| 1143 |
+
# config.yml file.
|
| 1144 |
+
#
|
| 1145 |
+
# Variable: AIRFLOW__TRACES__OTEL_SSL_ACTIVE
|
| 1146 |
+
#
|
| 1147 |
+
otel_ssl_active = False
|
| 1148 |
+
|
| 1149 |
+
# If True, after the task is complete, the full task log messages will be added as the
|
| 1150 |
+
# span events, chunked by 64k size. defaults to False.
|
| 1151 |
+
#
|
| 1152 |
+
# Variable: AIRFLOW__TRACES__OTEL_TASK_LOG_EVENT
|
| 1153 |
+
#
|
| 1154 |
+
otel_task_log_event = False
|
| 1155 |
+
|
| 1156 |
+
[secrets]
|
| 1157 |
+
# Full class name of secrets backend to enable (will precede env vars and metastore in search path)
|
| 1158 |
+
#
|
| 1159 |
+
# Example: backend = airflow.providers.amazon.aws.secrets.systems_manager.SystemsManagerParameterStoreBackend
|
| 1160 |
+
#
|
| 1161 |
+
# Variable: AIRFLOW__SECRETS__BACKEND
|
| 1162 |
+
#
|
| 1163 |
+
backend =
|
| 1164 |
+
|
| 1165 |
+
# The backend_kwargs param is loaded into a dictionary and passed to ``__init__``
|
| 1166 |
+
# of secrets backend class. See documentation for the secrets backend you are using.
|
| 1167 |
+
# JSON is expected.
|
| 1168 |
+
#
|
| 1169 |
+
# Example for AWS Systems Manager ParameterStore:
|
| 1170 |
+
# ``{"connections_prefix": "/airflow/connections", "profile_name": "default"}``
|
| 1171 |
+
#
|
| 1172 |
+
# Variable: AIRFLOW__SECRETS__BACKEND_KWARGS
|
| 1173 |
+
#
|
| 1174 |
+
backend_kwargs =
|
| 1175 |
+
|
| 1176 |
+
# .. note:: |experimental|
|
| 1177 |
+
#
|
| 1178 |
+
# Enables local caching of Variables, when parsing DAGs only.
|
| 1179 |
+
# Using this option can make dag parsing faster if Variables are used in top level code, at the expense
|
| 1180 |
+
# of longer propagation time for changes.
|
| 1181 |
+
# Please note that this cache concerns only the DAG parsing step. There is no caching in place when DAG
|
| 1182 |
+
# tasks are run.
|
| 1183 |
+
#
|
| 1184 |
+
# Variable: AIRFLOW__SECRETS__USE_CACHE
|
| 1185 |
+
#
|
| 1186 |
+
use_cache = False
|
| 1187 |
+
|
| 1188 |
+
# .. note:: |experimental|
|
| 1189 |
+
#
|
| 1190 |
+
# When the cache is enabled, this is the duration for which we consider an entry in the cache to be
|
| 1191 |
+
# valid. Entries are refreshed if they are older than this many seconds.
|
| 1192 |
+
# It means that when the cache is enabled, this is the maximum amount of time you need to wait to see a
|
| 1193 |
+
# Variable change take effect.
|
| 1194 |
+
#
|
| 1195 |
+
# Variable: AIRFLOW__SECRETS__CACHE_TTL_SECONDS
|
| 1196 |
+
#
|
| 1197 |
+
cache_ttl_seconds = 900
|
| 1198 |
+
|
| 1199 |
+
[cli]
|
| 1200 |
+
# In what way should the cli access the API. The LocalClient will use the
|
| 1201 |
+
# database directly, while the json_client will use the api running on the
|
| 1202 |
+
# webserver
|
| 1203 |
+
#
|
| 1204 |
+
# Variable: AIRFLOW__CLI__API_CLIENT
|
| 1205 |
+
#
|
| 1206 |
+
api_client = airflow.api.client.local_client
|
| 1207 |
+
|
| 1208 |
+
# If you set web_server_url_prefix, do NOT forget to append it here, ex:
|
| 1209 |
+
# ``endpoint_url = http://localhost:8080/myroot``
|
| 1210 |
+
# So api will look like: ``http://localhost:8080/myroot/api/experimental/...``
|
| 1211 |
+
#
|
| 1212 |
+
# Variable: AIRFLOW__CLI__ENDPOINT_URL
|
| 1213 |
+
#
|
| 1214 |
+
endpoint_url = http://localhost:8080
|
| 1215 |
+
|
| 1216 |
+
[debug]
|
| 1217 |
+
# Used only with ``DebugExecutor``. If set to ``True`` DAG will fail with first
|
| 1218 |
+
# failed task. Helpful for debugging purposes.
|
| 1219 |
+
#
|
| 1220 |
+
# Variable: AIRFLOW__DEBUG__FAIL_FAST
|
| 1221 |
+
#
|
| 1222 |
+
fail_fast = False
|
| 1223 |
+
|
| 1224 |
+
[api]
|
| 1225 |
+
# Enables the deprecated experimental API. Please note that these API endpoints do not have
|
| 1226 |
+
# access control. An authenticated user has full access.
|
| 1227 |
+
#
|
| 1228 |
+
# .. warning::
|
| 1229 |
+
#
|
| 1230 |
+
# This `Experimental REST API
|
| 1231 |
+
# <https://airflow.apache.org/docs/apache-airflow/stable/deprecated-rest-api-ref.html>`__ is
|
| 1232 |
+
# deprecated since version 2.0. Please consider using
|
| 1233 |
+
# `the Stable REST API
|
| 1234 |
+
# <https://airflow.apache.org/docs/apache-airflow/stable/stable-rest-api-ref.html>`__.
|
| 1235 |
+
# For more information on migration, see
|
| 1236 |
+
# `RELEASE_NOTES.rst <https://github.com/apache/airflow/blob/main/RELEASE_NOTES.rst>`_
|
| 1237 |
+
#
|
| 1238 |
+
# Variable: AIRFLOW__API__ENABLE_EXPERIMENTAL_API
|
| 1239 |
+
#
|
| 1240 |
+
enable_experimental_api = False
|
| 1241 |
+
|
| 1242 |
+
# Comma separated list of auth backends to authenticate users of the API. See
|
| 1243 |
+
# `Security: API
|
| 1244 |
+
# <https://airflow.apache.org/docs/apache-airflow/stable/security/api.html>`__ for possible values.
|
| 1245 |
+
# ("airflow.api.auth.backend.default" allows all requests for historic reasons)
|
| 1246 |
+
#
|
| 1247 |
+
# Variable: AIRFLOW__API__AUTH_BACKENDS
|
| 1248 |
+
#
|
| 1249 |
+
auth_backends = airflow.api.auth.backend.session
|
| 1250 |
+
|
| 1251 |
+
# Used to set the maximum page limit for API requests. If limit passed as param
|
| 1252 |
+
# is greater than maximum page limit, it will be ignored and maximum page limit value
|
| 1253 |
+
# will be set as the limit
|
| 1254 |
+
#
|
| 1255 |
+
# Variable: AIRFLOW__API__MAXIMUM_PAGE_LIMIT
|
| 1256 |
+
#
|
| 1257 |
+
maximum_page_limit = 100
|
| 1258 |
+
|
| 1259 |
+
# Used to set the default page limit when limit param is zero or not provided in API
|
| 1260 |
+
# requests. Otherwise if positive integer is passed in the API requests as limit, the
|
| 1261 |
+
# smallest number of user given limit or maximum page limit is taken as limit.
|
| 1262 |
+
#
|
| 1263 |
+
# Variable: AIRFLOW__API__FALLBACK_PAGE_LIMIT
|
| 1264 |
+
#
|
| 1265 |
+
fallback_page_limit = 100
|
| 1266 |
+
|
| 1267 |
+
# The intended audience for JWT token credentials used for authorization. This value must match on the client and server sides. If empty, audience will not be tested.
|
| 1268 |
+
#
|
| 1269 |
+
# Example: google_oauth2_audience = project-id-random-value.apps.googleusercontent.com
|
| 1270 |
+
#
|
| 1271 |
+
# Variable: AIRFLOW__API__GOOGLE_OAUTH2_AUDIENCE
|
| 1272 |
+
#
|
| 1273 |
+
google_oauth2_audience =
|
| 1274 |
+
|
| 1275 |
+
# Path to Google Cloud Service Account key file (JSON). If omitted, authorization based on
|
| 1276 |
+
# `the Application Default Credentials
|
| 1277 |
+
# <https://cloud.google.com/docs/authentication/production#finding_credentials_automatically>`__ will
|
| 1278 |
+
# be used.
|
| 1279 |
+
#
|
| 1280 |
+
# Example: google_key_path = /files/service-account-json
|
| 1281 |
+
#
|
| 1282 |
+
# Variable: AIRFLOW__API__GOOGLE_KEY_PATH
|
| 1283 |
+
#
|
| 1284 |
+
google_key_path =
|
| 1285 |
+
|
| 1286 |
+
# Used in response to a preflight request to indicate which HTTP
|
| 1287 |
+
# headers can be used when making the actual request. This header is
|
| 1288 |
+
# the server side response to the browser's
|
| 1289 |
+
# Access-Control-Request-Headers header.
|
| 1290 |
+
#
|
| 1291 |
+
# Variable: AIRFLOW__API__ACCESS_CONTROL_ALLOW_HEADERS
|
| 1292 |
+
#
|
| 1293 |
+
access_control_allow_headers =
|
| 1294 |
+
|
| 1295 |
+
# Specifies the method or methods allowed when accessing the resource.
|
| 1296 |
+
#
|
| 1297 |
+
# Variable: AIRFLOW__API__ACCESS_CONTROL_ALLOW_METHODS
|
| 1298 |
+
#
|
| 1299 |
+
access_control_allow_methods =
|
| 1300 |
+
|
| 1301 |
+
# Indicates whether the response can be shared with requesting code from the given origins.
|
| 1302 |
+
# Separate URLs with space.
|
| 1303 |
+
#
|
| 1304 |
+
# Variable: AIRFLOW__API__ACCESS_CONTROL_ALLOW_ORIGINS
|
| 1305 |
+
#
|
| 1306 |
+
access_control_allow_origins =
|
| 1307 |
+
|
| 1308 |
+
# Indicates whether the **xcomEntries** endpoint supports the **deserialize**
|
| 1309 |
+
# flag. If set to ``False``, setting this flag in a request would result in a
|
| 1310 |
+
# 400 Bad Request error.
|
| 1311 |
+
#
|
| 1312 |
+
# Variable: AIRFLOW__API__ENABLE_XCOM_DESERIALIZE_SUPPORT
|
| 1313 |
+
#
|
| 1314 |
+
enable_xcom_deserialize_support = False
|
| 1315 |
+
|
| 1316 |
+
[lineage]
|
| 1317 |
+
# what lineage backend to use
|
| 1318 |
+
#
|
| 1319 |
+
# Variable: AIRFLOW__LINEAGE__BACKEND
|
| 1320 |
+
#
|
| 1321 |
+
backend =
|
| 1322 |
+
|
| 1323 |
+
[operators]
|
| 1324 |
+
# The default owner assigned to each new operator, unless
|
| 1325 |
+
# provided explicitly or passed via ``default_args``
|
| 1326 |
+
#
|
| 1327 |
+
# Variable: AIRFLOW__OPERATORS__DEFAULT_OWNER
|
| 1328 |
+
#
|
| 1329 |
+
default_owner = airflow
|
| 1330 |
+
|
| 1331 |
+
# The default value of attribute "deferrable" in operators and sensors.
|
| 1332 |
+
#
|
| 1333 |
+
# Variable: AIRFLOW__OPERATORS__DEFAULT_DEFERRABLE
|
| 1334 |
+
#
|
| 1335 |
+
default_deferrable = false
|
| 1336 |
+
|
| 1337 |
+
# Indicates the default number of CPU units allocated to each operator when no specific CPU request
|
| 1338 |
+
# is specified in the operator's configuration
|
| 1339 |
+
#
|
| 1340 |
+
# Variable: AIRFLOW__OPERATORS__DEFAULT_CPUS
|
| 1341 |
+
#
|
| 1342 |
+
default_cpus = 1
|
| 1343 |
+
|
| 1344 |
+
# Indicates the default number of RAM allocated to each operator when no specific RAM request
|
| 1345 |
+
# is specified in the operator's configuration
|
| 1346 |
+
#
|
| 1347 |
+
# Variable: AIRFLOW__OPERATORS__DEFAULT_RAM
|
| 1348 |
+
#
|
| 1349 |
+
default_ram = 512
|
| 1350 |
+
|
| 1351 |
+
# Indicates the default number of disk storage allocated to each operator when no specific disk request
|
| 1352 |
+
# is specified in the operator's configuration
|
| 1353 |
+
#
|
| 1354 |
+
# Variable: AIRFLOW__OPERATORS__DEFAULT_DISK
|
| 1355 |
+
#
|
| 1356 |
+
default_disk = 512
|
| 1357 |
+
|
| 1358 |
+
# Indicates the default number of GPUs allocated to each operator when no specific GPUs request
|
| 1359 |
+
# is specified in the operator's configuration
|
| 1360 |
+
#
|
| 1361 |
+
# Variable: AIRFLOW__OPERATORS__DEFAULT_GPUS
|
| 1362 |
+
#
|
| 1363 |
+
default_gpus = 0
|
| 1364 |
+
|
| 1365 |
+
# Default queue that tasks get assigned to and that worker listen on.
|
| 1366 |
+
#
|
| 1367 |
+
# Variable: AIRFLOW__OPERATORS__DEFAULT_QUEUE
|
| 1368 |
+
#
|
| 1369 |
+
default_queue = default
|
| 1370 |
+
|
| 1371 |
+
# Is allowed to pass additional/unused arguments (args, kwargs) to the BaseOperator operator.
|
| 1372 |
+
# If set to ``False``, an exception will be thrown,
|
| 1373 |
+
# otherwise only the console message will be displayed.
|
| 1374 |
+
#
|
| 1375 |
+
# Variable: AIRFLOW__OPERATORS__ALLOW_ILLEGAL_ARGUMENTS
|
| 1376 |
+
#
|
| 1377 |
+
allow_illegal_arguments = False
|
| 1378 |
+
|
| 1379 |
+
[webserver]
|
| 1380 |
+
# The message displayed when a user attempts to execute actions beyond their authorised privileges.
|
| 1381 |
+
#
|
| 1382 |
+
# Variable: AIRFLOW__WEBSERVER__ACCESS_DENIED_MESSAGE
|
| 1383 |
+
#
|
| 1384 |
+
access_denied_message = Access is Denied
|
| 1385 |
+
|
| 1386 |
+
# Path of webserver config file used for configuring the webserver parameters
|
| 1387 |
+
#
|
| 1388 |
+
# Variable: AIRFLOW__WEBSERVER__CONFIG_FILE
|
| 1389 |
+
#
|
| 1390 |
+
config_file = /kaggle/working/BTC-USDT-ETL-Pipeline/airflow/webserver_config.py
|
| 1391 |
+
|
| 1392 |
+
# The base url of your website: Airflow cannot guess what domain or CNAME you are using.
|
| 1393 |
+
# This is used to create links in the Log Url column in the Browse - Task Instances menu,
|
| 1394 |
+
# as well as in any automated emails sent by Airflow that contain links to your webserver.
|
| 1395 |
+
#
|
| 1396 |
+
# Variable: AIRFLOW__WEBSERVER__BASE_URL
|
| 1397 |
+
#
|
| 1398 |
+
base_url = http://localhost:8080
|
| 1399 |
+
|
| 1400 |
+
# Default timezone to display all dates in the UI, can be UTC, system, or
|
| 1401 |
+
# any IANA timezone string (e.g. **Europe/Amsterdam**). If left empty the
|
| 1402 |
+
# default value of core/default_timezone will be used
|
| 1403 |
+
#
|
| 1404 |
+
# Example: default_ui_timezone = America/New_York
|
| 1405 |
+
#
|
| 1406 |
+
# Variable: AIRFLOW__WEBSERVER__DEFAULT_UI_TIMEZONE
|
| 1407 |
+
#
|
| 1408 |
+
default_ui_timezone = UTC
|
| 1409 |
+
|
| 1410 |
+
# The ip specified when starting the web server
|
| 1411 |
+
#
|
| 1412 |
+
# Variable: AIRFLOW__WEBSERVER__WEB_SERVER_HOST
|
| 1413 |
+
#
|
| 1414 |
+
web_server_host = 0.0.0.0
|
| 1415 |
+
|
| 1416 |
+
# The port on which to run the web server
|
| 1417 |
+
#
|
| 1418 |
+
# Variable: AIRFLOW__WEBSERVER__WEB_SERVER_PORT
|
| 1419 |
+
#
|
| 1420 |
+
web_server_port = 8080
|
| 1421 |
+
|
| 1422 |
+
# Paths to the SSL certificate and key for the web server. When both are
|
| 1423 |
+
# provided SSL will be enabled. This does not change the web server port.
|
| 1424 |
+
#
|
| 1425 |
+
# Variable: AIRFLOW__WEBSERVER__WEB_SERVER_SSL_CERT
|
| 1426 |
+
#
|
| 1427 |
+
web_server_ssl_cert =
|
| 1428 |
+
|
| 1429 |
+
# Paths to the SSL certificate and key for the web server. When both are
|
| 1430 |
+
# provided SSL will be enabled. This does not change the web server port.
|
| 1431 |
+
#
|
| 1432 |
+
# Variable: AIRFLOW__WEBSERVER__WEB_SERVER_SSL_KEY
|
| 1433 |
+
#
|
| 1434 |
+
web_server_ssl_key =
|
| 1435 |
+
|
| 1436 |
+
# The type of backend used to store web session data, can be ``database`` or ``securecookie``. For the
|
| 1437 |
+
# ``database`` backend, sessions are store in the database and they can be
|
| 1438 |
+
# managed there (for example when you reset password of the user, all sessions for that user are
|
| 1439 |
+
# deleted). For the ``securecookie`` backend, sessions are stored in encrypted cookies on the client
|
| 1440 |
+
# side. The ``securecookie`` mechanism is 'lighter' than database backend, but sessions are not deleted
|
| 1441 |
+
# when you reset password of the user, which means that other than waiting for expiry time, the only
|
| 1442 |
+
# way to invalidate all sessions for a user is to change secret_key and restart webserver (which
|
| 1443 |
+
# also invalidates and logs out all other user's sessions).
|
| 1444 |
+
#
|
| 1445 |
+
# When you are using ``database`` backend, make sure to keep your database session table small
|
| 1446 |
+
# by periodically running ``airflow db clean --table session`` command, especially if you have
|
| 1447 |
+
# automated API calls that will create a new session for each call rather than reuse the sessions
|
| 1448 |
+
# stored in browser cookies.
|
| 1449 |
+
#
|
| 1450 |
+
# Example: session_backend = securecookie
|
| 1451 |
+
#
|
| 1452 |
+
# Variable: AIRFLOW__WEBSERVER__SESSION_BACKEND
|
| 1453 |
+
#
|
| 1454 |
+
session_backend = database
|
| 1455 |
+
|
| 1456 |
+
# Number of seconds the webserver waits before killing gunicorn master that doesn't respond
|
| 1457 |
+
#
|
| 1458 |
+
# Variable: AIRFLOW__WEBSERVER__WEB_SERVER_MASTER_TIMEOUT
|
| 1459 |
+
#
|
| 1460 |
+
web_server_master_timeout = 120
|
| 1461 |
+
|
| 1462 |
+
# Number of seconds the gunicorn webserver waits before timing out on a worker
|
| 1463 |
+
#
|
| 1464 |
+
# Variable: AIRFLOW__WEBSERVER__WEB_SERVER_WORKER_TIMEOUT
|
| 1465 |
+
#
|
| 1466 |
+
web_server_worker_timeout = 120
|
| 1467 |
+
|
| 1468 |
+
# Number of workers to refresh at a time. When set to 0, worker refresh is
|
| 1469 |
+
# disabled. When nonzero, airflow periodically refreshes webserver workers by
|
| 1470 |
+
# bringing up new ones and killing old ones.
|
| 1471 |
+
#
|
| 1472 |
+
# Variable: AIRFLOW__WEBSERVER__WORKER_REFRESH_BATCH_SIZE
|
| 1473 |
+
#
|
| 1474 |
+
worker_refresh_batch_size = 1
|
| 1475 |
+
|
| 1476 |
+
# Number of seconds to wait before refreshing a batch of workers.
|
| 1477 |
+
#
|
| 1478 |
+
# Variable: AIRFLOW__WEBSERVER__WORKER_REFRESH_INTERVAL
|
| 1479 |
+
#
|
| 1480 |
+
worker_refresh_interval = 6000
|
| 1481 |
+
|
| 1482 |
+
# If set to ``True``, Airflow will track files in plugins_folder directory. When it detects changes,
|
| 1483 |
+
# then reload the gunicorn. If set to ``True``, gunicorn starts without preloading, which is slower,
|
| 1484 |
+
# uses more memory, and may cause race conditions. Avoid setting this to ``True`` in production.
|
| 1485 |
+
#
|
| 1486 |
+
# Variable: AIRFLOW__WEBSERVER__RELOAD_ON_PLUGIN_CHANGE
|
| 1487 |
+
#
|
| 1488 |
+
reload_on_plugin_change = False
|
| 1489 |
+
|
| 1490 |
+
# Secret key used to run your flask app. It should be as random as possible. However, when running
|
| 1491 |
+
# more than 1 instances of webserver, make sure all of them use the same ``secret_key`` otherwise
|
| 1492 |
+
# one of them will error with "CSRF session token is missing".
|
| 1493 |
+
# The webserver key is also used to authorize requests to Celery workers when logs are retrieved.
|
| 1494 |
+
# The token generated using the secret key has a short expiry time though - make sure that time on
|
| 1495 |
+
# ALL the machines that you run airflow components on is synchronized (for example using ntpd)
|
| 1496 |
+
# otherwise you might get "forbidden" errors when the logs are accessed.
|
| 1497 |
+
#
|
| 1498 |
+
# Variable: AIRFLOW__WEBSERVER__SECRET_KEY
|
| 1499 |
+
#
|
| 1500 |
+
secret_key = JRNP2IC4kIaVxisy9+AW4A==
|
| 1501 |
+
|
| 1502 |
+
# Number of workers to run the Gunicorn web server
|
| 1503 |
+
#
|
| 1504 |
+
# Variable: AIRFLOW__WEBSERVER__WORKERS
|
| 1505 |
+
#
|
| 1506 |
+
workers = 4
|
| 1507 |
+
|
| 1508 |
+
# The worker class gunicorn should use. Choices include
|
| 1509 |
+
# ``sync`` (default), ``eventlet``, ``gevent``.
|
| 1510 |
+
#
|
| 1511 |
+
# .. warning::
|
| 1512 |
+
#
|
| 1513 |
+
# When using ``gevent`` you might also want to set the ``_AIRFLOW_PATCH_GEVENT``
|
| 1514 |
+
# environment variable to ``"1"`` to make sure gevent patching is done as early as possible.
|
| 1515 |
+
#
|
| 1516 |
+
# Be careful to set ``_AIRFLOW_PATCH_GEVENT`` only on the web server as gevent patching may
|
| 1517 |
+
# affect the scheduler behavior via the ``multiprocessing`` sockets module and cause crash.
|
| 1518 |
+
#
|
| 1519 |
+
# See related Issues / PRs for more details:
|
| 1520 |
+
#
|
| 1521 |
+
# * https://github.com/benoitc/gunicorn/issues/2796
|
| 1522 |
+
# * https://github.com/apache/airflow/issues/8212
|
| 1523 |
+
# * https://github.com/apache/airflow/pull/28283
|
| 1524 |
+
#
|
| 1525 |
+
# Variable: AIRFLOW__WEBSERVER__WORKER_CLASS
|
| 1526 |
+
#
|
| 1527 |
+
worker_class = sync
|
| 1528 |
+
|
| 1529 |
+
# Log files for the gunicorn webserver. '-' means log to stderr.
|
| 1530 |
+
#
|
| 1531 |
+
# Variable: AIRFLOW__WEBSERVER__ACCESS_LOGFILE
|
| 1532 |
+
#
|
| 1533 |
+
access_logfile = -
|
| 1534 |
+
|
| 1535 |
+
# Log files for the gunicorn webserver. '-' means log to stderr.
|
| 1536 |
+
#
|
| 1537 |
+
# Variable: AIRFLOW__WEBSERVER__ERROR_LOGFILE
|
| 1538 |
+
#
|
| 1539 |
+
error_logfile = -
|
| 1540 |
+
|
| 1541 |
+
# Access log format for gunicorn webserver.
|
| 1542 |
+
# default format is ``%%(h)s %%(l)s %%(u)s %%(t)s "%%(r)s" %%(s)s %%(b)s "%%(f)s" "%%(a)s"``
|
| 1543 |
+
# See `Gunicorn Settings: 'access_log_format' Reference
|
| 1544 |
+
# <https://docs.gunicorn.org/en/stable/settings.html#access-log-format>`__ for more details
|
| 1545 |
+
#
|
| 1546 |
+
# Variable: AIRFLOW__WEBSERVER__ACCESS_LOGFORMAT
|
| 1547 |
+
#
|
| 1548 |
+
access_logformat =
|
| 1549 |
+
|
| 1550 |
+
# Expose the configuration file in the web server. Set to ``non-sensitive-only`` to show all values
|
| 1551 |
+
# except those that have security implications. ``True`` shows all values. ``False`` hides the
|
| 1552 |
+
# configuration completely.
|
| 1553 |
+
#
|
| 1554 |
+
# Variable: AIRFLOW__WEBSERVER__EXPOSE_CONFIG
|
| 1555 |
+
#
|
| 1556 |
+
expose_config = False
|
| 1557 |
+
|
| 1558 |
+
# Expose hostname in the web server
|
| 1559 |
+
#
|
| 1560 |
+
# Variable: AIRFLOW__WEBSERVER__EXPOSE_HOSTNAME
|
| 1561 |
+
#
|
| 1562 |
+
expose_hostname = False
|
| 1563 |
+
|
| 1564 |
+
# Expose stacktrace in the web server
|
| 1565 |
+
#
|
| 1566 |
+
# Variable: AIRFLOW__WEBSERVER__EXPOSE_STACKTRACE
|
| 1567 |
+
#
|
| 1568 |
+
expose_stacktrace = False
|
| 1569 |
+
|
| 1570 |
+
# Default DAG view. Valid values are: ``grid``, ``graph``, ``duration``, ``gantt``, ``landing_times``
|
| 1571 |
+
#
|
| 1572 |
+
# Variable: AIRFLOW__WEBSERVER__DAG_DEFAULT_VIEW
|
| 1573 |
+
#
|
| 1574 |
+
dag_default_view = grid
|
| 1575 |
+
|
| 1576 |
+
# Default DAG orientation. Valid values are:
|
| 1577 |
+
# ``LR`` (Left->Right), ``TB`` (Top->Bottom), ``RL`` (Right->Left), ``BT`` (Bottom->Top)
|
| 1578 |
+
#
|
| 1579 |
+
# Variable: AIRFLOW__WEBSERVER__DAG_ORIENTATION
|
| 1580 |
+
#
|
| 1581 |
+
dag_orientation = LR
|
| 1582 |
+
|
| 1583 |
+
# Sorting order in grid view. Valid values are: ``topological``, ``hierarchical_alphabetical``
|
| 1584 |
+
#
|
| 1585 |
+
# Variable: AIRFLOW__WEBSERVER__GRID_VIEW_SORTING_ORDER
|
| 1586 |
+
#
|
| 1587 |
+
grid_view_sorting_order = topological
|
| 1588 |
+
|
| 1589 |
+
# The amount of time (in secs) webserver will wait for initial handshake
|
| 1590 |
+
# while fetching logs from other worker machine
|
| 1591 |
+
#
|
| 1592 |
+
# Variable: AIRFLOW__WEBSERVER__LOG_FETCH_TIMEOUT_SEC
|
| 1593 |
+
#
|
| 1594 |
+
log_fetch_timeout_sec = 5
|
| 1595 |
+
|
| 1596 |
+
# Time interval (in secs) to wait before next log fetching.
|
| 1597 |
+
#
|
| 1598 |
+
# Variable: AIRFLOW__WEBSERVER__LOG_FETCH_DELAY_SEC
|
| 1599 |
+
#
|
| 1600 |
+
log_fetch_delay_sec = 2
|
| 1601 |
+
|
| 1602 |
+
# Distance away from page bottom to enable auto tailing.
|
| 1603 |
+
#
|
| 1604 |
+
# Variable: AIRFLOW__WEBSERVER__LOG_AUTO_TAILING_OFFSET
|
| 1605 |
+
#
|
| 1606 |
+
log_auto_tailing_offset = 30
|
| 1607 |
+
|
| 1608 |
+
# Animation speed for auto tailing log display.
|
| 1609 |
+
#
|
| 1610 |
+
# Variable: AIRFLOW__WEBSERVER__LOG_ANIMATION_SPEED
|
| 1611 |
+
#
|
| 1612 |
+
log_animation_speed = 1000
|
| 1613 |
+
|
| 1614 |
+
# By default, the webserver shows paused DAGs. Flip this to hide paused
|
| 1615 |
+
# DAGs by default
|
| 1616 |
+
#
|
| 1617 |
+
# Variable: AIRFLOW__WEBSERVER__HIDE_PAUSED_DAGS_BY_DEFAULT
|
| 1618 |
+
#
|
| 1619 |
+
hide_paused_dags_by_default = False
|
| 1620 |
+
|
| 1621 |
+
# Consistent page size across all listing views in the UI
|
| 1622 |
+
#
|
| 1623 |
+
# Variable: AIRFLOW__WEBSERVER__PAGE_SIZE
|
| 1624 |
+
#
|
| 1625 |
+
page_size = 100
|
| 1626 |
+
|
| 1627 |
+
# Define the color of navigation bar
|
| 1628 |
+
#
|
| 1629 |
+
# Variable: AIRFLOW__WEBSERVER__NAVBAR_COLOR
|
| 1630 |
+
#
|
| 1631 |
+
navbar_color = #fff
|
| 1632 |
+
|
| 1633 |
+
# Define the color of text in the navigation bar
|
| 1634 |
+
#
|
| 1635 |
+
# Variable: AIRFLOW__WEBSERVER__NAVBAR_TEXT_COLOR
|
| 1636 |
+
#
|
| 1637 |
+
navbar_text_color = #51504f
|
| 1638 |
+
|
| 1639 |
+
# Define the color of navigation bar links when hovered
|
| 1640 |
+
#
|
| 1641 |
+
# Variable: AIRFLOW__WEBSERVER__NAVBAR_HOVER_COLOR
|
| 1642 |
+
#
|
| 1643 |
+
navbar_hover_color = #eee
|
| 1644 |
+
|
| 1645 |
+
# Define the color of text in the navigation bar when hovered
|
| 1646 |
+
#
|
| 1647 |
+
# Variable: AIRFLOW__WEBSERVER__NAVBAR_TEXT_HOVER_COLOR
|
| 1648 |
+
#
|
| 1649 |
+
navbar_text_hover_color = #51504f
|
| 1650 |
+
|
| 1651 |
+
# Define the color of the logo text
|
| 1652 |
+
#
|
| 1653 |
+
# Variable: AIRFLOW__WEBSERVER__NAVBAR_LOGO_TEXT_COLOR
|
| 1654 |
+
#
|
| 1655 |
+
navbar_logo_text_color = #51504f
|
| 1656 |
+
|
| 1657 |
+
# Default dagrun to show in UI
|
| 1658 |
+
#
|
| 1659 |
+
# Variable: AIRFLOW__WEBSERVER__DEFAULT_DAG_RUN_DISPLAY_NUMBER
|
| 1660 |
+
#
|
| 1661 |
+
default_dag_run_display_number = 25
|
| 1662 |
+
|
| 1663 |
+
# Enable werkzeug ``ProxyFix`` middleware for reverse proxy
|
| 1664 |
+
#
|
| 1665 |
+
# Variable: AIRFLOW__WEBSERVER__ENABLE_PROXY_FIX
|
| 1666 |
+
#
|
| 1667 |
+
enable_proxy_fix = False
|
| 1668 |
+
|
| 1669 |
+
# Number of values to trust for ``X-Forwarded-For``.
|
| 1670 |
+
# See `Werkzeug: X-Forwarded-For Proxy Fix
|
| 1671 |
+
# <https://werkzeug.palletsprojects.com/en/2.3.x/middleware/proxy_fix/>`__ for more details.
|
| 1672 |
+
#
|
| 1673 |
+
# Variable: AIRFLOW__WEBSERVER__PROXY_FIX_X_FOR
|
| 1674 |
+
#
|
| 1675 |
+
proxy_fix_x_for = 1
|
| 1676 |
+
|
| 1677 |
+
# Number of values to trust for ``X-Forwarded-Proto``.
|
| 1678 |
+
# See `Werkzeug: X-Forwarded-For Proxy Fix
|
| 1679 |
+
# <https://werkzeug.palletsprojects.com/en/2.3.x/middleware/proxy_fix/>`__ for more details.
|
| 1680 |
+
#
|
| 1681 |
+
# Variable: AIRFLOW__WEBSERVER__PROXY_FIX_X_PROTO
|
| 1682 |
+
#
|
| 1683 |
+
proxy_fix_x_proto = 1
|
| 1684 |
+
|
| 1685 |
+
# Number of values to trust for ``X-Forwarded-Host``.
|
| 1686 |
+
# See `Werkzeug: X-Forwarded-For Proxy Fix
|
| 1687 |
+
# <https://werkzeug.palletsprojects.com/en/2.3.x/middleware/proxy_fix/>`__ for more details.
|
| 1688 |
+
#
|
| 1689 |
+
# Variable: AIRFLOW__WEBSERVER__PROXY_FIX_X_HOST
|
| 1690 |
+
#
|
| 1691 |
+
proxy_fix_x_host = 1
|
| 1692 |
+
|
| 1693 |
+
# Number of values to trust for ``X-Forwarded-Port``.
|
| 1694 |
+
# See `Werkzeug: X-Forwarded-For Proxy Fix
|
| 1695 |
+
# <https://werkzeug.palletsprojects.com/en/2.3.x/middleware/proxy_fix/>`__ for more details.
|
| 1696 |
+
#
|
| 1697 |
+
# Variable: AIRFLOW__WEBSERVER__PROXY_FIX_X_PORT
|
| 1698 |
+
#
|
| 1699 |
+
proxy_fix_x_port = 1
|
| 1700 |
+
|
| 1701 |
+
# Number of values to trust for ``X-Forwarded-Prefix``.
|
| 1702 |
+
# See `Werkzeug: X-Forwarded-For Proxy Fix
|
| 1703 |
+
# <https://werkzeug.palletsprojects.com/en/2.3.x/middleware/proxy_fix/>`__ for more details.
|
| 1704 |
+
#
|
| 1705 |
+
# Variable: AIRFLOW__WEBSERVER__PROXY_FIX_X_PREFIX
|
| 1706 |
+
#
|
| 1707 |
+
proxy_fix_x_prefix = 1
|
| 1708 |
+
|
| 1709 |
+
# Set secure flag on session cookie
|
| 1710 |
+
#
|
| 1711 |
+
# Variable: AIRFLOW__WEBSERVER__COOKIE_SECURE
|
| 1712 |
+
#
|
| 1713 |
+
cookie_secure = False
|
| 1714 |
+
|
| 1715 |
+
# Set samesite policy on session cookie
|
| 1716 |
+
#
|
| 1717 |
+
# Variable: AIRFLOW__WEBSERVER__COOKIE_SAMESITE
|
| 1718 |
+
#
|
| 1719 |
+
cookie_samesite = Lax
|
| 1720 |
+
|
| 1721 |
+
# Default setting for wrap toggle on DAG code and TI log views.
|
| 1722 |
+
#
|
| 1723 |
+
# Variable: AIRFLOW__WEBSERVER__DEFAULT_WRAP
|
| 1724 |
+
#
|
| 1725 |
+
default_wrap = False
|
| 1726 |
+
|
| 1727 |
+
# Allow the UI to be rendered in a frame
|
| 1728 |
+
#
|
| 1729 |
+
# Variable: AIRFLOW__WEBSERVER__X_FRAME_ENABLED
|
| 1730 |
+
#
|
| 1731 |
+
x_frame_enabled = True
|
| 1732 |
+
|
| 1733 |
+
# Send anonymous user activity to your analytics tool
|
| 1734 |
+
# choose from ``google_analytics``, ``segment``, ``metarouter``, or ``matomo``
|
| 1735 |
+
#
|
| 1736 |
+
# Variable: AIRFLOW__WEBSERVER__ANALYTICS_TOOL
|
| 1737 |
+
#
|
| 1738 |
+
# analytics_tool =
|
| 1739 |
+
|
| 1740 |
+
# Unique ID of your account in the analytics tool
|
| 1741 |
+
#
|
| 1742 |
+
# Variable: AIRFLOW__WEBSERVER__ANALYTICS_ID
|
| 1743 |
+
#
|
| 1744 |
+
# analytics_id =
|
| 1745 |
+
|
| 1746 |
+
# Your instances url, only applicable to Matomo.
|
| 1747 |
+
#
|
| 1748 |
+
# Example: analytics_url = https://your.matomo.instance.com/
|
| 1749 |
+
#
|
| 1750 |
+
# Variable: AIRFLOW__WEBSERVER__ANALYTICS_URL
|
| 1751 |
+
#
|
| 1752 |
+
# analytics_url =
|
| 1753 |
+
|
| 1754 |
+
# 'Recent Tasks' stats will show for old DagRuns if set
|
| 1755 |
+
#
|
| 1756 |
+
# Variable: AIRFLOW__WEBSERVER__SHOW_RECENT_STATS_FOR_COMPLETED_RUNS
|
| 1757 |
+
#
|
| 1758 |
+
show_recent_stats_for_completed_runs = True
|
| 1759 |
+
|
| 1760 |
+
# The UI cookie lifetime in minutes. User will be logged out from UI after
|
| 1761 |
+
# ``[webserver] session_lifetime_minutes`` of non-activity
|
| 1762 |
+
#
|
| 1763 |
+
# Variable: AIRFLOW__WEBSERVER__SESSION_LIFETIME_MINUTES
|
| 1764 |
+
#
|
| 1765 |
+
session_lifetime_minutes = 43200
|
| 1766 |
+
|
| 1767 |
+
# Sets a custom page title for the DAGs overview page and site title for all pages
|
| 1768 |
+
#
|
| 1769 |
+
# Variable: AIRFLOW__WEBSERVER__INSTANCE_NAME
|
| 1770 |
+
#
|
| 1771 |
+
# instance_name =
|
| 1772 |
+
|
| 1773 |
+
# Whether the custom page title for the DAGs overview page contains any Markup language
|
| 1774 |
+
#
|
| 1775 |
+
# Variable: AIRFLOW__WEBSERVER__INSTANCE_NAME_HAS_MARKUP
|
| 1776 |
+
#
|
| 1777 |
+
instance_name_has_markup = False
|
| 1778 |
+
|
| 1779 |
+
# How frequently, in seconds, the DAG data will auto-refresh in graph or grid view
|
| 1780 |
+
# when auto-refresh is turned on
|
| 1781 |
+
#
|
| 1782 |
+
# Variable: AIRFLOW__WEBSERVER__AUTO_REFRESH_INTERVAL
|
| 1783 |
+
#
|
| 1784 |
+
auto_refresh_interval = 3
|
| 1785 |
+
|
| 1786 |
+
# Boolean for displaying warning for publicly viewable deployment
|
| 1787 |
+
#
|
| 1788 |
+
# Variable: AIRFLOW__WEBSERVER__WARN_DEPLOYMENT_EXPOSURE
|
| 1789 |
+
#
|
| 1790 |
+
warn_deployment_exposure = True
|
| 1791 |
+
|
| 1792 |
+
# Comma separated string of view events to exclude from dag audit view.
|
| 1793 |
+
# All other events will be added minus the ones passed here.
|
| 1794 |
+
# The audit logs in the db will not be affected by this parameter.
|
| 1795 |
+
#
|
| 1796 |
+
# Example: audit_view_excluded_events = cli_task_run,running,success
|
| 1797 |
+
#
|
| 1798 |
+
# Variable: AIRFLOW__WEBSERVER__AUDIT_VIEW_EXCLUDED_EVENTS
|
| 1799 |
+
#
|
| 1800 |
+
# audit_view_excluded_events =
|
| 1801 |
+
|
| 1802 |
+
# Comma separated string of view events to include in dag audit view.
|
| 1803 |
+
# If passed, only these events will populate the dag audit view.
|
| 1804 |
+
# The audit logs in the db will not be affected by this parameter.
|
| 1805 |
+
#
|
| 1806 |
+
# Example: audit_view_included_events = dagrun_cleared,failed
|
| 1807 |
+
#
|
| 1808 |
+
# Variable: AIRFLOW__WEBSERVER__AUDIT_VIEW_INCLUDED_EVENTS
|
| 1809 |
+
#
|
| 1810 |
+
# audit_view_included_events =
|
| 1811 |
+
|
| 1812 |
+
# Boolean for running SwaggerUI in the webserver.
|
| 1813 |
+
#
|
| 1814 |
+
# Variable: AIRFLOW__WEBSERVER__ENABLE_SWAGGER_UI
|
| 1815 |
+
#
|
| 1816 |
+
enable_swagger_ui = True
|
| 1817 |
+
|
| 1818 |
+
# Boolean for running Internal API in the webserver.
|
| 1819 |
+
#
|
| 1820 |
+
# Variable: AIRFLOW__WEBSERVER__RUN_INTERNAL_API
|
| 1821 |
+
#
|
| 1822 |
+
run_internal_api = False
|
| 1823 |
+
|
| 1824 |
+
# The caching algorithm used by the webserver. Must be a valid hashlib function name.
|
| 1825 |
+
#
|
| 1826 |
+
# Example: caching_hash_method = sha256
|
| 1827 |
+
#
|
| 1828 |
+
# Variable: AIRFLOW__WEBSERVER__CACHING_HASH_METHOD
|
| 1829 |
+
#
|
| 1830 |
+
caching_hash_method = md5
|
| 1831 |
+
|
| 1832 |
+
# Behavior of the trigger DAG run button for DAGs without params. ``False`` to skip and trigger
|
| 1833 |
+
# without displaying a form to add a **dag_run.conf**, ``True`` to always display the form.
|
| 1834 |
+
# The form is displayed always if parameters are defined.
|
| 1835 |
+
#
|
| 1836 |
+
# Variable: AIRFLOW__WEBSERVER__SHOW_TRIGGER_FORM_IF_NO_PARAMS
|
| 1837 |
+
#
|
| 1838 |
+
show_trigger_form_if_no_params = False
|
| 1839 |
+
|
| 1840 |
+
# Number of recent DAG run configurations in the selector on the trigger web form.
|
| 1841 |
+
#
|
| 1842 |
+
# Example: num_recent_configurations_for_trigger = 10
|
| 1843 |
+
#
|
| 1844 |
+
# Variable: AIRFLOW__WEBSERVER__NUM_RECENT_CONFIGURATIONS_FOR_TRIGGER
|
| 1845 |
+
#
|
| 1846 |
+
num_recent_configurations_for_trigger = 5
|
| 1847 |
+
|
| 1848 |
+
# A DAG author is able to provide any raw HTML into ``doc_md`` or params description in
|
| 1849 |
+
# ``description_md`` for text formatting. This is including potentially unsafe javascript.
|
| 1850 |
+
# Displaying the DAG or trigger form in web UI provides the DAG author the potential to
|
| 1851 |
+
# inject malicious code into clients browsers. To ensure the web UI is safe by default,
|
| 1852 |
+
# raw HTML is disabled by default. If you trust your DAG authors, you can enable HTML
|
| 1853 |
+
# support in markdown by setting this option to ``True``.
|
| 1854 |
+
#
|
| 1855 |
+
# This parameter also enables the deprecated fields ``description_html`` and
|
| 1856 |
+
# ``custom_html_form`` in DAG params until the feature is removed in a future version.
|
| 1857 |
+
#
|
| 1858 |
+
# Example: allow_raw_html_descriptions = False
|
| 1859 |
+
#
|
| 1860 |
+
# Variable: AIRFLOW__WEBSERVER__ALLOW_RAW_HTML_DESCRIPTIONS
|
| 1861 |
+
#
|
| 1862 |
+
allow_raw_html_descriptions = False
|
| 1863 |
+
|
| 1864 |
+
# The maximum size of the request payload (in MB) that can be sent.
|
| 1865 |
+
#
|
| 1866 |
+
# Variable: AIRFLOW__WEBSERVER__ALLOWED_PAYLOAD_SIZE
|
| 1867 |
+
#
|
| 1868 |
+
allowed_payload_size = 1.0
|
| 1869 |
+
|
| 1870 |
+
# Require confirmation when changing a DAG in the web UI. This is to prevent accidental changes
|
| 1871 |
+
# to a DAG that may be running on sensitive environments like production.
|
| 1872 |
+
# When set to ``True``, confirmation dialog will be shown when a user tries to Pause/Unpause,
|
| 1873 |
+
# Trigger a DAG
|
| 1874 |
+
#
|
| 1875 |
+
# Variable: AIRFLOW__WEBSERVER__REQUIRE_CONFIRMATION_DAG_CHANGE
|
| 1876 |
+
#
|
| 1877 |
+
require_confirmation_dag_change = False
|
| 1878 |
+
|
| 1879 |
+
[email]
|
| 1880 |
+
# Configuration email backend and whether to
|
| 1881 |
+
# send email alerts on retry or failure
|
| 1882 |
+
|
| 1883 |
+
# Email backend to use
|
| 1884 |
+
#
|
| 1885 |
+
# Variable: AIRFLOW__EMAIL__EMAIL_BACKEND
|
| 1886 |
+
#
|
| 1887 |
+
email_backend = airflow.utils.email.send_email_smtp
|
| 1888 |
+
|
| 1889 |
+
# Email connection to use
|
| 1890 |
+
#
|
| 1891 |
+
# Variable: AIRFLOW__EMAIL__EMAIL_CONN_ID
|
| 1892 |
+
#
|
| 1893 |
+
email_conn_id = smtp_default
|
| 1894 |
+
|
| 1895 |
+
# Whether email alerts should be sent when a task is retried
|
| 1896 |
+
#
|
| 1897 |
+
# Variable: AIRFLOW__EMAIL__DEFAULT_EMAIL_ON_RETRY
|
| 1898 |
+
#
|
| 1899 |
+
default_email_on_retry = True
|
| 1900 |
+
|
| 1901 |
+
# Whether email alerts should be sent when a task failed
|
| 1902 |
+
#
|
| 1903 |
+
# Variable: AIRFLOW__EMAIL__DEFAULT_EMAIL_ON_FAILURE
|
| 1904 |
+
#
|
| 1905 |
+
default_email_on_failure = True
|
| 1906 |
+
|
| 1907 |
+
# File that will be used as the template for Email subject (which will be rendered using Jinja2).
|
| 1908 |
+
# If not set, Airflow uses a base template.
|
| 1909 |
+
#
|
| 1910 |
+
# Example: subject_template = /path/to/my_subject_template_file
|
| 1911 |
+
#
|
| 1912 |
+
# Variable: AIRFLOW__EMAIL__SUBJECT_TEMPLATE
|
| 1913 |
+
#
|
| 1914 |
+
# subject_template =
|
| 1915 |
+
|
| 1916 |
+
# File that will be used as the template for Email content (which will be rendered using Jinja2).
|
| 1917 |
+
# If not set, Airflow uses a base template.
|
| 1918 |
+
#
|
| 1919 |
+
# Example: html_content_template = /path/to/my_html_content_template_file
|
| 1920 |
+
#
|
| 1921 |
+
# Variable: AIRFLOW__EMAIL__HTML_CONTENT_TEMPLATE
|
| 1922 |
+
#
|
| 1923 |
+
# html_content_template =
|
| 1924 |
+
|
| 1925 |
+
# Email address that will be used as sender address.
|
| 1926 |
+
# It can either be raw email or the complete address in a format ``Sender Name <sender@email.com>``
|
| 1927 |
+
#
|
| 1928 |
+
# Example: from_email = Airflow <airflow@example.com>
|
| 1929 |
+
#
|
| 1930 |
+
# Variable: AIRFLOW__EMAIL__FROM_EMAIL
|
| 1931 |
+
#
|
| 1932 |
+
# from_email =
|
| 1933 |
+
|
| 1934 |
+
# ssl context to use when using SMTP and IMAP SSL connections. By default, the context is "default"
|
| 1935 |
+
# which sets it to ``ssl.create_default_context()`` which provides the right balance between
|
| 1936 |
+
# compatibility and security, it however requires that certificates in your operating system are
|
| 1937 |
+
# updated and that SMTP/IMAP servers of yours have valid certificates that have corresponding public
|
| 1938 |
+
# keys installed on your machines. You can switch it to "none" if you want to disable checking
|
| 1939 |
+
# of the certificates, but it is not recommended as it allows MITM (man-in-the-middle) attacks
|
| 1940 |
+
# if your infrastructure is not sufficiently secured. It should only be set temporarily while you
|
| 1941 |
+
# are fixing your certificate configuration. This can be typically done by upgrading to newer
|
| 1942 |
+
# version of the operating system you run Airflow components on, by upgrading/refreshing proper
|
| 1943 |
+
# certificates in the OS or by updating certificates for your mail servers.
|
| 1944 |
+
#
|
| 1945 |
+
# Example: ssl_context = default
|
| 1946 |
+
#
|
| 1947 |
+
# Variable: AIRFLOW__EMAIL__SSL_CONTEXT
|
| 1948 |
+
#
|
| 1949 |
+
ssl_context = default
|
| 1950 |
+
|
| 1951 |
+
[smtp]
|
| 1952 |
+
# If you want airflow to send emails on retries or failures, and you want to use
|
| 1953 |
+
# the airflow.utils.email.send_email_smtp function, you have to configure an
|
| 1954 |
+
# smtp server here
|
| 1955 |
+
|
| 1956 |
+
# Specifies the host server address used by Airflow when sending out email notifications via SMTP.
|
| 1957 |
+
#
|
| 1958 |
+
# Variable: AIRFLOW__SMTP__SMTP_HOST
|
| 1959 |
+
#
|
| 1960 |
+
smtp_host = localhost
|
| 1961 |
+
|
| 1962 |
+
# Determines whether to use the STARTTLS command when connecting to the SMTP server.
|
| 1963 |
+
#
|
| 1964 |
+
# Variable: AIRFLOW__SMTP__SMTP_STARTTLS
|
| 1965 |
+
#
|
| 1966 |
+
smtp_starttls = True
|
| 1967 |
+
|
| 1968 |
+
# Determines whether to use an SSL connection when talking to the SMTP server.
|
| 1969 |
+
#
|
| 1970 |
+
# Variable: AIRFLOW__SMTP__SMTP_SSL
|
| 1971 |
+
#
|
| 1972 |
+
smtp_ssl = False
|
| 1973 |
+
|
| 1974 |
+
# Username to authenticate when connecting to smtp server.
|
| 1975 |
+
#
|
| 1976 |
+
# Example: smtp_user = airflow
|
| 1977 |
+
#
|
| 1978 |
+
# Variable: AIRFLOW__SMTP__SMTP_USER
|
| 1979 |
+
#
|
| 1980 |
+
# smtp_user =
|
| 1981 |
+
|
| 1982 |
+
# Password to authenticate when connecting to smtp server.
|
| 1983 |
+
#
|
| 1984 |
+
# Example: smtp_password = airflow
|
| 1985 |
+
#
|
| 1986 |
+
# Variable: AIRFLOW__SMTP__SMTP_PASSWORD
|
| 1987 |
+
#
|
| 1988 |
+
# smtp_password =
|
| 1989 |
+
|
| 1990 |
+
# Defines the port number on which Airflow connects to the SMTP server to send email notifications.
|
| 1991 |
+
#
|
| 1992 |
+
# Variable: AIRFLOW__SMTP__SMTP_PORT
|
| 1993 |
+
#
|
| 1994 |
+
smtp_port = 25
|
| 1995 |
+
|
| 1996 |
+
# Specifies the default **from** email address used when Airflow sends email notifications.
|
| 1997 |
+
#
|
| 1998 |
+
# Variable: AIRFLOW__SMTP__SMTP_MAIL_FROM
|
| 1999 |
+
#
|
| 2000 |
+
smtp_mail_from = airflow@example.com
|
| 2001 |
+
|
| 2002 |
+
# Determines the maximum time (in seconds) the Apache Airflow system will wait for a
|
| 2003 |
+
# connection to the SMTP server to be established.
|
| 2004 |
+
#
|
| 2005 |
+
# Variable: AIRFLOW__SMTP__SMTP_TIMEOUT
|
| 2006 |
+
#
|
| 2007 |
+
smtp_timeout = 30
|
| 2008 |
+
|
| 2009 |
+
# Defines the maximum number of times Airflow will attempt to connect to the SMTP server.
|
| 2010 |
+
#
|
| 2011 |
+
# Variable: AIRFLOW__SMTP__SMTP_RETRY_LIMIT
|
| 2012 |
+
#
|
| 2013 |
+
smtp_retry_limit = 5
|
| 2014 |
+
|
| 2015 |
+
[sentry]
|
| 2016 |
+
# `Sentry <https://docs.sentry.io>`__ integration. Here you can supply
|
| 2017 |
+
# additional configuration options based on the Python platform.
|
| 2018 |
+
# See `Python / Configuration / Basic Options
|
| 2019 |
+
# <https://docs.sentry.io/platforms/python/configuration/options/>`__ for more details.
|
| 2020 |
+
# Unsupported options: ``integrations``, ``in_app_include``, ``in_app_exclude``,
|
| 2021 |
+
# ``ignore_errors``, ``before_breadcrumb``, ``transport``.
|
| 2022 |
+
|
| 2023 |
+
# Enable error reporting to Sentry
|
| 2024 |
+
#
|
| 2025 |
+
# Variable: AIRFLOW__SENTRY__SENTRY_ON
|
| 2026 |
+
#
|
| 2027 |
+
sentry_on = false
|
| 2028 |
+
|
| 2029 |
+
#
|
| 2030 |
+
# Variable: AIRFLOW__SENTRY__SENTRY_DSN
|
| 2031 |
+
#
|
| 2032 |
+
sentry_dsn =
|
| 2033 |
+
|
| 2034 |
+
# Dotted path to a before_send function that the sentry SDK should be configured to use.
|
| 2035 |
+
#
|
| 2036 |
+
# Variable: AIRFLOW__SENTRY__BEFORE_SEND
|
| 2037 |
+
#
|
| 2038 |
+
# before_send =
|
| 2039 |
+
|
| 2040 |
+
[scheduler]
|
| 2041 |
+
# Task instances listen for external kill signal (when you clear tasks
|
| 2042 |
+
# from the CLI or the UI), this defines the frequency at which they should
|
| 2043 |
+
# listen (in seconds).
|
| 2044 |
+
#
|
| 2045 |
+
# Variable: AIRFLOW__SCHEDULER__JOB_HEARTBEAT_SEC
|
| 2046 |
+
#
|
| 2047 |
+
job_heartbeat_sec = 5
|
| 2048 |
+
|
| 2049 |
+
# The scheduler constantly tries to trigger new tasks (look at the
|
| 2050 |
+
# scheduler section in the docs for more information). This defines
|
| 2051 |
+
# how often the scheduler should run (in seconds).
|
| 2052 |
+
#
|
| 2053 |
+
# Variable: AIRFLOW__SCHEDULER__SCHEDULER_HEARTBEAT_SEC
|
| 2054 |
+
#
|
| 2055 |
+
scheduler_heartbeat_sec = 5
|
| 2056 |
+
|
| 2057 |
+
# The frequency (in seconds) at which the LocalTaskJob should send heartbeat signals to the
|
| 2058 |
+
# scheduler to notify it's still alive. If this value is set to 0, the heartbeat interval will default
|
| 2059 |
+
# to the value of ``[scheduler] scheduler_zombie_task_threshold``.
|
| 2060 |
+
#
|
| 2061 |
+
# Variable: AIRFLOW__SCHEDULER__LOCAL_TASK_JOB_HEARTBEAT_SEC
|
| 2062 |
+
#
|
| 2063 |
+
local_task_job_heartbeat_sec = 0
|
| 2064 |
+
|
| 2065 |
+
# The number of times to try to schedule each DAG file
|
| 2066 |
+
# -1 indicates unlimited number
|
| 2067 |
+
#
|
| 2068 |
+
# Variable: AIRFLOW__SCHEDULER__NUM_RUNS
|
| 2069 |
+
#
|
| 2070 |
+
num_runs = -1
|
| 2071 |
+
|
| 2072 |
+
# Controls how long the scheduler will sleep between loops, but if there was nothing to do
|
| 2073 |
+
# in the loop. i.e. if it scheduled something then it will start the next loop
|
| 2074 |
+
# iteration straight away.
|
| 2075 |
+
#
|
| 2076 |
+
# Variable: AIRFLOW__SCHEDULER__SCHEDULER_IDLE_SLEEP_TIME
|
| 2077 |
+
#
|
| 2078 |
+
scheduler_idle_sleep_time = 1
|
| 2079 |
+
|
| 2080 |
+
# Number of seconds after which a DAG file is parsed. The DAG file is parsed every
|
| 2081 |
+
# ``[scheduler] min_file_process_interval`` number of seconds. Updates to DAGs are reflected after
|
| 2082 |
+
# this interval. Keeping this number low will increase CPU usage.
|
| 2083 |
+
#
|
| 2084 |
+
# Variable: AIRFLOW__SCHEDULER__MIN_FILE_PROCESS_INTERVAL
|
| 2085 |
+
#
|
| 2086 |
+
min_file_process_interval = 30
|
| 2087 |
+
|
| 2088 |
+
# How often (in seconds) to check for stale DAGs (DAGs which are no longer present in
|
| 2089 |
+
# the expected files) which should be deactivated, as well as datasets that are no longer
|
| 2090 |
+
# referenced and should be marked as orphaned.
|
| 2091 |
+
#
|
| 2092 |
+
# Variable: AIRFLOW__SCHEDULER__PARSING_CLEANUP_INTERVAL
|
| 2093 |
+
#
|
| 2094 |
+
parsing_cleanup_interval = 60
|
| 2095 |
+
|
| 2096 |
+
# How long (in seconds) to wait after we have re-parsed a DAG file before deactivating stale
|
| 2097 |
+
# DAGs (DAGs which are no longer present in the expected files). The reason why we need
|
| 2098 |
+
# this threshold is to account for the time between when the file is parsed and when the
|
| 2099 |
+
# DAG is loaded. The absolute maximum that this could take is ``[core] dag_file_processor_timeout``,
|
| 2100 |
+
# but when you have a long timeout configured, it results in a significant delay in the
|
| 2101 |
+
# deactivation of stale dags.
|
| 2102 |
+
#
|
| 2103 |
+
# Variable: AIRFLOW__SCHEDULER__STALE_DAG_THRESHOLD
|
| 2104 |
+
#
|
| 2105 |
+
stale_dag_threshold = 50
|
| 2106 |
+
|
| 2107 |
+
# How often (in seconds) to scan the DAGs directory for new files. Default to 5 minutes.
|
| 2108 |
+
#
|
| 2109 |
+
# Variable: AIRFLOW__SCHEDULER__DAG_DIR_LIST_INTERVAL
|
| 2110 |
+
#
|
| 2111 |
+
dag_dir_list_interval = 300
|
| 2112 |
+
|
| 2113 |
+
# How often should stats be printed to the logs. Setting to 0 will disable printing stats
|
| 2114 |
+
#
|
| 2115 |
+
# Variable: AIRFLOW__SCHEDULER__PRINT_STATS_INTERVAL
|
| 2116 |
+
#
|
| 2117 |
+
print_stats_interval = 30
|
| 2118 |
+
|
| 2119 |
+
# How often (in seconds) should pool usage stats be sent to StatsD (if statsd_on is enabled)
|
| 2120 |
+
#
|
| 2121 |
+
# Variable: AIRFLOW__SCHEDULER__POOL_METRICS_INTERVAL
|
| 2122 |
+
#
|
| 2123 |
+
pool_metrics_interval = 5.0
|
| 2124 |
+
|
| 2125 |
+
# If the last scheduler heartbeat happened more than ``[scheduler] scheduler_health_check_threshold``
|
| 2126 |
+
# ago (in seconds), scheduler is considered unhealthy.
|
| 2127 |
+
# This is used by the health check in the **/health** endpoint and in ``airflow jobs check`` CLI
|
| 2128 |
+
# for SchedulerJob.
|
| 2129 |
+
#
|
| 2130 |
+
# Variable: AIRFLOW__SCHEDULER__SCHEDULER_HEALTH_CHECK_THRESHOLD
|
| 2131 |
+
#
|
| 2132 |
+
scheduler_health_check_threshold = 30
|
| 2133 |
+
|
| 2134 |
+
# When you start a scheduler, airflow starts a tiny web server
|
| 2135 |
+
# subprocess to serve a health check if this is set to ``True``
|
| 2136 |
+
#
|
| 2137 |
+
# Variable: AIRFLOW__SCHEDULER__ENABLE_HEALTH_CHECK
|
| 2138 |
+
#
|
| 2139 |
+
enable_health_check = False
|
| 2140 |
+
|
| 2141 |
+
# When you start a scheduler, airflow starts a tiny web server
|
| 2142 |
+
# subprocess to serve a health check on this host
|
| 2143 |
+
#
|
| 2144 |
+
# Variable: AIRFLOW__SCHEDULER__SCHEDULER_HEALTH_CHECK_SERVER_HOST
|
| 2145 |
+
#
|
| 2146 |
+
scheduler_health_check_server_host = 0.0.0.0
|
| 2147 |
+
|
| 2148 |
+
# When you start a scheduler, airflow starts a tiny web server
|
| 2149 |
+
# subprocess to serve a health check on this port
|
| 2150 |
+
#
|
| 2151 |
+
# Variable: AIRFLOW__SCHEDULER__SCHEDULER_HEALTH_CHECK_SERVER_PORT
|
| 2152 |
+
#
|
| 2153 |
+
scheduler_health_check_server_port = 8974
|
| 2154 |
+
|
| 2155 |
+
# How often (in seconds) should the scheduler check for orphaned tasks and SchedulerJobs
|
| 2156 |
+
#
|
| 2157 |
+
# Variable: AIRFLOW__SCHEDULER__ORPHANED_TASKS_CHECK_INTERVAL
|
| 2158 |
+
#
|
| 2159 |
+
orphaned_tasks_check_interval = 300.0
|
| 2160 |
+
|
| 2161 |
+
# Determines the directory where logs for the child processes of the scheduler will be stored
|
| 2162 |
+
#
|
| 2163 |
+
# Variable: AIRFLOW__SCHEDULER__CHILD_PROCESS_LOG_DIRECTORY
|
| 2164 |
+
#
|
| 2165 |
+
child_process_log_directory = /kaggle/working/BTC-USDT-ETL-Pipeline/airflow/logs/scheduler
|
| 2166 |
+
|
| 2167 |
+
# Local task jobs periodically heartbeat to the DB. If the job has
|
| 2168 |
+
# not heartbeat in this many seconds, the scheduler will mark the
|
| 2169 |
+
# associated task instance as failed and will re-schedule the task.
|
| 2170 |
+
#
|
| 2171 |
+
# Variable: AIRFLOW__SCHEDULER__SCHEDULER_ZOMBIE_TASK_THRESHOLD
|
| 2172 |
+
#
|
| 2173 |
+
scheduler_zombie_task_threshold = 300
|
| 2174 |
+
|
| 2175 |
+
# How often (in seconds) should the scheduler check for zombie tasks.
|
| 2176 |
+
#
|
| 2177 |
+
# Variable: AIRFLOW__SCHEDULER__ZOMBIE_DETECTION_INTERVAL
|
| 2178 |
+
#
|
| 2179 |
+
zombie_detection_interval = 10.0
|
| 2180 |
+
|
| 2181 |
+
# Turn off scheduler catchup by setting this to ``False``.
|
| 2182 |
+
# Default behavior is unchanged and
|
| 2183 |
+
# Command Line Backfills still work, but the scheduler
|
| 2184 |
+
# will not do scheduler catchup if this is ``False``,
|
| 2185 |
+
# however it can be set on a per DAG basis in the
|
| 2186 |
+
# DAG definition (catchup)
|
| 2187 |
+
#
|
| 2188 |
+
# Variable: AIRFLOW__SCHEDULER__CATCHUP_BY_DEFAULT
|
| 2189 |
+
#
|
| 2190 |
+
catchup_by_default = True
|
| 2191 |
+
|
| 2192 |
+
# Setting this to ``True`` will make first task instance of a task
|
| 2193 |
+
# ignore depends_on_past setting. A task instance will be considered
|
| 2194 |
+
# as the first task instance of a task when there is no task instance
|
| 2195 |
+
# in the DB with an execution_date earlier than it, i.e. no manual marking
|
| 2196 |
+
# success will be needed for a newly added task to be scheduled.
|
| 2197 |
+
#
|
| 2198 |
+
# Variable: AIRFLOW__SCHEDULER__IGNORE_FIRST_DEPENDS_ON_PAST_BY_DEFAULT
|
| 2199 |
+
#
|
| 2200 |
+
ignore_first_depends_on_past_by_default = True
|
| 2201 |
+
|
| 2202 |
+
# This changes the batch size of queries in the scheduling main loop.
|
| 2203 |
+
# This should not be greater than ``[core] parallelism``.
|
| 2204 |
+
# If this is too high, SQL query performance may be impacted by
|
| 2205 |
+
# complexity of query predicate, and/or excessive locking.
|
| 2206 |
+
# Additionally, you may hit the maximum allowable query length for your db.
|
| 2207 |
+
# Set this to 0 to use the value of ``[core] parallelism``
|
| 2208 |
+
#
|
| 2209 |
+
# Variable: AIRFLOW__SCHEDULER__MAX_TIS_PER_QUERY
|
| 2210 |
+
#
|
| 2211 |
+
max_tis_per_query = 16
|
| 2212 |
+
|
| 2213 |
+
# Should the scheduler issue ``SELECT ... FOR UPDATE`` in relevant queries.
|
| 2214 |
+
# If this is set to ``False`` then you should not run more than a single
|
| 2215 |
+
# scheduler at once
|
| 2216 |
+
#
|
| 2217 |
+
# Variable: AIRFLOW__SCHEDULER__USE_ROW_LEVEL_LOCKING
|
| 2218 |
+
#
|
| 2219 |
+
use_row_level_locking = True
|
| 2220 |
+
|
| 2221 |
+
# Max number of DAGs to create DagRuns for per scheduler loop.
|
| 2222 |
+
#
|
| 2223 |
+
# Variable: AIRFLOW__SCHEDULER__MAX_DAGRUNS_TO_CREATE_PER_LOOP
|
| 2224 |
+
#
|
| 2225 |
+
max_dagruns_to_create_per_loop = 10
|
| 2226 |
+
|
| 2227 |
+
# How many DagRuns should a scheduler examine (and lock) when scheduling
|
| 2228 |
+
# and queuing tasks.
|
| 2229 |
+
#
|
| 2230 |
+
# Variable: AIRFLOW__SCHEDULER__MAX_DAGRUNS_PER_LOOP_TO_SCHEDULE
|
| 2231 |
+
#
|
| 2232 |
+
max_dagruns_per_loop_to_schedule = 20
|
| 2233 |
+
|
| 2234 |
+
# Should the Task supervisor process perform a "mini scheduler" to attempt to schedule more tasks of the
|
| 2235 |
+
# same DAG. Leaving this on will mean tasks in the same DAG execute quicker, but might starve out other
|
| 2236 |
+
# dags in some circumstances
|
| 2237 |
+
#
|
| 2238 |
+
# Variable: AIRFLOW__SCHEDULER__SCHEDULE_AFTER_TASK_EXECUTION
|
| 2239 |
+
#
|
| 2240 |
+
schedule_after_task_execution = True
|
| 2241 |
+
|
| 2242 |
+
# The scheduler reads dag files to extract the airflow modules that are going to be used,
|
| 2243 |
+
# and imports them ahead of time to avoid having to re-do it for each parsing process.
|
| 2244 |
+
# This flag can be set to ``False`` to disable this behavior in case an airflow module needs
|
| 2245 |
+
# to be freshly imported each time (at the cost of increased DAG parsing time).
|
| 2246 |
+
#
|
| 2247 |
+
# Variable: AIRFLOW__SCHEDULER__PARSING_PRE_IMPORT_MODULES
|
| 2248 |
+
#
|
| 2249 |
+
parsing_pre_import_modules = True
|
| 2250 |
+
|
| 2251 |
+
# The scheduler can run multiple processes in parallel to parse dags.
|
| 2252 |
+
# This defines how many processes will run.
|
| 2253 |
+
#
|
| 2254 |
+
# Variable: AIRFLOW__SCHEDULER__PARSING_PROCESSES
|
| 2255 |
+
#
|
| 2256 |
+
parsing_processes = 2
|
| 2257 |
+
|
| 2258 |
+
# One of ``modified_time``, ``random_seeded_by_host`` and ``alphabetical``.
|
| 2259 |
+
# The scheduler will list and sort the dag files to decide the parsing order.
|
| 2260 |
+
#
|
| 2261 |
+
# * ``modified_time``: Sort by modified time of the files. This is useful on large scale to parse the
|
| 2262 |
+
# recently modified DAGs first.
|
| 2263 |
+
# * ``random_seeded_by_host``: Sort randomly across multiple Schedulers but with same order on the
|
| 2264 |
+
# same host. This is useful when running with Scheduler in HA mode where each scheduler can
|
| 2265 |
+
# parse different DAG files.
|
| 2266 |
+
# * ``alphabetical``: Sort by filename
|
| 2267 |
+
#
|
| 2268 |
+
# Variable: AIRFLOW__SCHEDULER__FILE_PARSING_SORT_MODE
|
| 2269 |
+
#
|
| 2270 |
+
file_parsing_sort_mode = modified_time
|
| 2271 |
+
|
| 2272 |
+
# Whether the dag processor is running as a standalone process or it is a subprocess of a scheduler
|
| 2273 |
+
# job.
|
| 2274 |
+
#
|
| 2275 |
+
# Variable: AIRFLOW__SCHEDULER__STANDALONE_DAG_PROCESSOR
|
| 2276 |
+
#
|
| 2277 |
+
standalone_dag_processor = False
|
| 2278 |
+
|
| 2279 |
+
# Only applicable if ``[scheduler] standalone_dag_processor`` is true and callbacks are stored
|
| 2280 |
+
# in database. Contains maximum number of callbacks that are fetched during a single loop.
|
| 2281 |
+
#
|
| 2282 |
+
# Variable: AIRFLOW__SCHEDULER__MAX_CALLBACKS_PER_LOOP
|
| 2283 |
+
#
|
| 2284 |
+
max_callbacks_per_loop = 20
|
| 2285 |
+
|
| 2286 |
+
# Only applicable if ``[scheduler] standalone_dag_processor`` is true.
|
| 2287 |
+
# Time in seconds after which dags, which were not updated by Dag Processor are deactivated.
|
| 2288 |
+
#
|
| 2289 |
+
# Variable: AIRFLOW__SCHEDULER__DAG_STALE_NOT_SEEN_DURATION
|
| 2290 |
+
#
|
| 2291 |
+
dag_stale_not_seen_duration = 600
|
| 2292 |
+
|
| 2293 |
+
# Turn off scheduler use of cron intervals by setting this to ``False``.
|
| 2294 |
+
# DAGs submitted manually in the web UI or with trigger_dag will still run.
|
| 2295 |
+
#
|
| 2296 |
+
# Variable: AIRFLOW__SCHEDULER__USE_JOB_SCHEDULE
|
| 2297 |
+
#
|
| 2298 |
+
use_job_schedule = True
|
| 2299 |
+
|
| 2300 |
+
# Allow externally triggered DagRuns for Execution Dates in the future
|
| 2301 |
+
# Only has effect if schedule_interval is set to None in DAG
|
| 2302 |
+
#
|
| 2303 |
+
# Variable: AIRFLOW__SCHEDULER__ALLOW_TRIGGER_IN_FUTURE
|
| 2304 |
+
#
|
| 2305 |
+
allow_trigger_in_future = False
|
| 2306 |
+
|
| 2307 |
+
# How often to check for expired trigger requests that have not run yet.
|
| 2308 |
+
#
|
| 2309 |
+
# Variable: AIRFLOW__SCHEDULER__TRIGGER_TIMEOUT_CHECK_INTERVAL
|
| 2310 |
+
#
|
| 2311 |
+
trigger_timeout_check_interval = 15
|
| 2312 |
+
|
| 2313 |
+
# Amount of time a task can be in the queued state before being retried or set to failed.
|
| 2314 |
+
#
|
| 2315 |
+
# Variable: AIRFLOW__SCHEDULER__TASK_QUEUED_TIMEOUT
|
| 2316 |
+
#
|
| 2317 |
+
task_queued_timeout = 600.0
|
| 2318 |
+
|
| 2319 |
+
# How often to check for tasks that have been in the queued state for
|
| 2320 |
+
# longer than ``[scheduler] task_queued_timeout``.
|
| 2321 |
+
#
|
| 2322 |
+
# Variable: AIRFLOW__SCHEDULER__TASK_QUEUED_TIMEOUT_CHECK_INTERVAL
|
| 2323 |
+
#
|
| 2324 |
+
task_queued_timeout_check_interval = 120.0
|
| 2325 |
+
|
| 2326 |
+
# The run_id pattern used to verify the validity of user input to the run_id parameter when
|
| 2327 |
+
# triggering a DAG. This pattern cannot change the pattern used by scheduler to generate run_id
|
| 2328 |
+
# for scheduled DAG runs or DAG runs triggered without changing the run_id parameter.
|
| 2329 |
+
#
|
| 2330 |
+
# Variable: AIRFLOW__SCHEDULER__ALLOWED_RUN_ID_PATTERN
|
| 2331 |
+
#
|
| 2332 |
+
allowed_run_id_pattern = ^[A-Za-z0-9_.~:+-]+$
|
| 2333 |
+
|
| 2334 |
+
# Whether to create DAG runs that span an interval or one single point in time for cron schedules, when
|
| 2335 |
+
# a cron string is provided to ``schedule`` argument of a DAG.
|
| 2336 |
+
#
|
| 2337 |
+
# * ``True``: **CronDataIntervalTimetable** is used, which is suitable
|
| 2338 |
+
# for DAGs with well-defined data interval. You get contiguous intervals from the end of the previous
|
| 2339 |
+
# interval up to the scheduled datetime.
|
| 2340 |
+
# * ``False``: **CronTriggerTimetable** is used, which is closer to the behavior of cron itself.
|
| 2341 |
+
#
|
| 2342 |
+
# Notably, for **CronTriggerTimetable**, the logical date is the same as the time the DAG Run will
|
| 2343 |
+
# try to schedule, while for **CronDataIntervalTimetable**, the logical date is the beginning of
|
| 2344 |
+
# the data interval, but the DAG Run will try to schedule at the end of the data interval.
|
| 2345 |
+
#
|
| 2346 |
+
# Variable: AIRFLOW__SCHEDULER__CREATE_CRON_DATA_INTERVALS
|
| 2347 |
+
#
|
| 2348 |
+
create_cron_data_intervals = True
|
| 2349 |
+
|
| 2350 |
+
[triggerer]
|
| 2351 |
+
# How many triggers a single Triggerer will run at once, by default.
|
| 2352 |
+
#
|
| 2353 |
+
# Variable: AIRFLOW__TRIGGERER__DEFAULT_CAPACITY
|
| 2354 |
+
#
|
| 2355 |
+
default_capacity = 1000
|
| 2356 |
+
|
| 2357 |
+
# How often to heartbeat the Triggerer job to ensure it hasn't been killed.
|
| 2358 |
+
#
|
| 2359 |
+
# Variable: AIRFLOW__TRIGGERER__JOB_HEARTBEAT_SEC
|
| 2360 |
+
#
|
| 2361 |
+
job_heartbeat_sec = 5
|
| 2362 |
+
|
| 2363 |
+
# If the last triggerer heartbeat happened more than ``[triggerer] triggerer_health_check_threshold``
|
| 2364 |
+
# ago (in seconds), triggerer is considered unhealthy.
|
| 2365 |
+
# This is used by the health check in the **/health** endpoint and in ``airflow jobs check`` CLI
|
| 2366 |
+
# for TriggererJob.
|
| 2367 |
+
#
|
| 2368 |
+
# Variable: AIRFLOW__TRIGGERER__TRIGGERER_HEALTH_CHECK_THRESHOLD
|
| 2369 |
+
#
|
| 2370 |
+
triggerer_health_check_threshold = 30
|
| 2371 |
+
|
| 2372 |
+
[kerberos]
|
| 2373 |
+
# Location of your ccache file once kinit has been performed.
|
| 2374 |
+
#
|
| 2375 |
+
# Variable: AIRFLOW__KERBEROS__CCACHE
|
| 2376 |
+
#
|
| 2377 |
+
ccache = /tmp/airflow_krb5_ccache
|
| 2378 |
+
|
| 2379 |
+
# gets augmented with fqdn
|
| 2380 |
+
#
|
| 2381 |
+
# Variable: AIRFLOW__KERBEROS__PRINCIPAL
|
| 2382 |
+
#
|
| 2383 |
+
principal = airflow
|
| 2384 |
+
|
| 2385 |
+
# Determines the frequency at which initialization or re-initialization processes occur.
|
| 2386 |
+
#
|
| 2387 |
+
# Variable: AIRFLOW__KERBEROS__REINIT_FREQUENCY
|
| 2388 |
+
#
|
| 2389 |
+
reinit_frequency = 3600
|
| 2390 |
+
|
| 2391 |
+
# Path to the kinit executable
|
| 2392 |
+
#
|
| 2393 |
+
# Variable: AIRFLOW__KERBEROS__KINIT_PATH
|
| 2394 |
+
#
|
| 2395 |
+
kinit_path = kinit
|
| 2396 |
+
|
| 2397 |
+
# Designates the path to the Kerberos keytab file for the Airflow user
|
| 2398 |
+
#
|
| 2399 |
+
# Variable: AIRFLOW__KERBEROS__KEYTAB
|
| 2400 |
+
#
|
| 2401 |
+
keytab = airflow.keytab
|
| 2402 |
+
|
| 2403 |
+
# Allow to disable ticket forwardability.
|
| 2404 |
+
#
|
| 2405 |
+
# Variable: AIRFLOW__KERBEROS__FORWARDABLE
|
| 2406 |
+
#
|
| 2407 |
+
forwardable = True
|
| 2408 |
+
|
| 2409 |
+
# Allow to remove source IP from token, useful when using token behind NATted Docker host.
|
| 2410 |
+
#
|
| 2411 |
+
# Variable: AIRFLOW__KERBEROS__INCLUDE_IP
|
| 2412 |
+
#
|
| 2413 |
+
include_ip = True
|
| 2414 |
+
|
| 2415 |
+
[sensors]
|
| 2416 |
+
# Sensor default timeout, 7 days by default (7 * 24 * 60 * 60).
|
| 2417 |
+
#
|
| 2418 |
+
# Variable: AIRFLOW__SENSORS__DEFAULT_TIMEOUT
|
| 2419 |
+
#
|
| 2420 |
+
default_timeout = 604800
|
| 2421 |
+
|
| 2422 |
+
[usage_data_collection]
|
| 2423 |
+
# Airflow integrates `Scarf <https://about.scarf.sh/>`__ to collect basic platform and usage data
|
| 2424 |
+
# during operation. This data assists Airflow maintainers in better understanding how Airflow is used.
|
| 2425 |
+
# Insights gained from this telemetry are critical for prioritizing patches, minor releases, and
|
| 2426 |
+
# security fixes. Additionally, this information supports key decisions related to the development road map.
|
| 2427 |
+
# Check the FAQ doc for more information on what data is collected.
|
| 2428 |
+
#
|
| 2429 |
+
# Deployments can opt-out of analytics by setting the ``enabled`` option
|
| 2430 |
+
# to ``False``, or the ``SCARF_ANALYTICS=false`` environment variable.
|
| 2431 |
+
# Individual users can easily opt-out of analytics in various ways documented in the
|
| 2432 |
+
# `Scarf Do Not Track docs <https://docs.scarf.sh/gateway/#do-not-track>`__.
|
| 2433 |
+
|
| 2434 |
+
# Enable or disable usage data collection and sending.
|
| 2435 |
+
#
|
| 2436 |
+
# Variable: AIRFLOW__USAGE_DATA_COLLECTION__ENABLED
|
| 2437 |
+
#
|
| 2438 |
+
enabled = True
|
| 2439 |
+
|
| 2440 |
+
[common.io]
|
| 2441 |
+
# Common IO configuration section
|
| 2442 |
+
|
| 2443 |
+
# Path to a location on object storage where XComs can be stored in url format.
|
| 2444 |
+
#
|
| 2445 |
+
# Example: xcom_objectstorage_path = s3://conn_id@bucket/path
|
| 2446 |
+
#
|
| 2447 |
+
# Variable: AIRFLOW__COMMON.IO__XCOM_OBJECTSTORAGE_PATH
|
| 2448 |
+
#
|
| 2449 |
+
xcom_objectstorage_path =
|
| 2450 |
+
|
| 2451 |
+
# Threshold in bytes for storing XComs in object storage. -1 means always store in the
|
| 2452 |
+
# database. 0 means always store in object storage. Any positive number means
|
| 2453 |
+
# it will be stored in object storage if the size of the value is greater than the threshold.
|
| 2454 |
+
#
|
| 2455 |
+
# Example: xcom_objectstorage_threshold = 1000000
|
| 2456 |
+
#
|
| 2457 |
+
# Variable: AIRFLOW__COMMON.IO__XCOM_OBJECTSTORAGE_THRESHOLD
|
| 2458 |
+
#
|
| 2459 |
+
xcom_objectstorage_threshold = -1
|
| 2460 |
+
|
| 2461 |
+
# Compression algorithm to use when storing XComs in object storage. Supported algorithms
|
| 2462 |
+
# are a.o.: snappy, zip, gzip, bz2, and lzma. If not specified, no compression will be used.
|
| 2463 |
+
# Note that the compression algorithm must be available in the Python installation (e.g.
|
| 2464 |
+
# python-snappy for snappy). Zip, gz, bz2 are available by default.
|
| 2465 |
+
#
|
| 2466 |
+
# Example: xcom_objectstorage_compression = gz
|
| 2467 |
+
#
|
| 2468 |
+
# Variable: AIRFLOW__COMMON.IO__XCOM_OBJECTSTORAGE_COMPRESSION
|
| 2469 |
+
#
|
| 2470 |
+
xcom_objectstorage_compression =
|
| 2471 |
+
|
| 2472 |
+
[fab]
|
| 2473 |
+
# This section contains configs specific to FAB provider.
|
| 2474 |
+
|
| 2475 |
+
# Boolean for enabling rate limiting on authentication endpoints.
|
| 2476 |
+
#
|
| 2477 |
+
# Variable: AIRFLOW__FAB__AUTH_RATE_LIMITED
|
| 2478 |
+
#
|
| 2479 |
+
auth_rate_limited = True
|
| 2480 |
+
|
| 2481 |
+
# Rate limit for authentication endpoints.
|
| 2482 |
+
#
|
| 2483 |
+
# Variable: AIRFLOW__FAB__AUTH_RATE_LIMIT
|
| 2484 |
+
#
|
| 2485 |
+
auth_rate_limit = 5 per 40 second
|
| 2486 |
+
|
| 2487 |
+
# Update FAB permissions and sync security manager roles
|
| 2488 |
+
# on webserver startup
|
| 2489 |
+
#
|
| 2490 |
+
# Variable: AIRFLOW__FAB__UPDATE_FAB_PERMS
|
| 2491 |
+
#
|
| 2492 |
+
update_fab_perms = True
|
| 2493 |
+
|
| 2494 |
+
[imap]
|
| 2495 |
+
# Options for IMAP provider.
|
| 2496 |
+
|
| 2497 |
+
# ssl_context =
|
| 2498 |
+
|
airflow/airflow.db
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:33e2a2d0de019f573198b2bd423a7d642c3f01f321c8527c3e8f04b9ee73d60a
|
| 3 |
+
size 1282048
|
airflow/dags/new6.py
ADDED
|
@@ -0,0 +1,147 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Airflow DAG definitions for the BTCUSDT data pipeline.

Defines four independent monthly DAGs:
  1. crawl_to_minio       -- download Binance CSV archives, upload to MinIO.
  2. etl_to_duckdb        -- extract from MinIO, transform, load into DuckDB.
  3. lstm_forecast        -- train an LSTM model, then compute metrics/predictions.
  4. duckdb_to_csv_export -- export the DuckDB warehouse to CSV.
"""
from airflow import DAG
from airflow.operators.python import PythonOperator
from datetime import datetime, timedelta, timezone
import os
import sys

# Add project root to path so the `components` package is importable
# when Airflow parses this file from the dags folder.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))

from components.btcusdt_ingest_data import crawl_data_from_sources
from components.datalake_cr import up_to_minio
from components.process_data import extract_from_minio, transform_financial_data
from components.duckdb_api import push_to_duckdb
from components.duckdb2csv import duckdb_to_csv
from components.model.training import train_lstm_model
from components.model.evaluation import metric_and_predict_lstm_model
from components.utils.file_utils import (
    load_extract_config,
    define_server_filenames,
    load_pipeline_config
)

# Arguments shared by every DAG below.
default_args = {
    'owner': 'airflow',
    'start_date': datetime(2025, 10, 7, 20, 0),
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}


def _monthly_dag(dag_id):
    """Build a monthly, single-active-run, no-catchup DAG with shared defaults."""
    return DAG(dag_id, default_args=default_args,
               schedule_interval='@monthly', max_active_runs=1, catchup=False)


dag_1 = _monthly_dag('crawl_to_minio')
dag_2 = _monthly_dag('etl_to_duckdb')
dag_3 = _monthly_dag('lstm_forecast')
dag_4 = _monthly_dag('duckdb_to_csv_export')

# Pipeline-wide settings (paths, bucket name, ...) loaded once at parse time.
pipeline_config = load_pipeline_config()

# ---------------------------------------------------------------------------
# DAG 1: Crawl to MinIO
# ---------------------------------------------------------------------------
download_binance_csv = PythonOperator(
    task_id='download_binance_csv',
    python_callable=crawl_data_from_sources,
    dag=dag_1
)

extract_filenames_task = PythonOperator(
    task_id='extract_filenames',
    python_callable=define_server_filenames,
    dag=dag_1
)

upload_to_minio_storage = PythonOperator(
    task_id='upload_to_minio',
    python_callable=up_to_minio,
    op_kwargs={
        # XCom pulls arrive as stringified lists; up_to_minio parses them.
        'client_files': '{{ ti.xcom_pull(task_ids="download_binance_csv") }}',
        'server_files': '{{ ti.xcom_pull(task_ids="extract_filenames") }}',
        'bucket_name': pipeline_config['minio']['bucket_name']
    },
    dag=dag_1
)

# ---------------------------------------------------------------------------
# DAG 2: MinIO to DuckDB
# ---------------------------------------------------------------------------
extract_data = PythonOperator(
    task_id='extract_data',
    python_callable=extract_from_minio,
    op_kwargs={
        'bucket_name': pipeline_config['minio']['bucket_name'],
        'file_names': load_extract_config("extract_data.yml")["files"]
    },
    dag=dag_2
)

transform_data = PythonOperator(
    task_id='transform_data',
    python_callable=transform_financial_data,
    op_kwargs={
        'parquet_file_paths': '{{ ti.xcom_pull(task_ids="extract_data") }}',
        'temp_parquet_path': pipeline_config['paths']['temp_parquet_path'],
        'output_parquet_path': pipeline_config['paths']['output_parquet_path']
    },
    dag=dag_2
)

push_to_warehouse = PythonOperator(
    task_id='export_duckdb',
    python_callable=push_to_duckdb,
    op_kwargs={
        'duckdb_path': pipeline_config['paths']['duckdb_path'],
        'parquet_path': '{{ ti.xcom_pull(task_ids="transform_data") }}'
    },
    dag=dag_2
)

# ---------------------------------------------------------------------------
# DAG 3: LSTM Forecasting
# ---------------------------------------------------------------------------
train_lstm = PythonOperator(
    task_id='train_lstm_model',
    python_callable=train_lstm_model,
    dag=dag_3
)

# NOTE: `provide_context` is deprecated (a no-op in Airflow 2) and is NOT
# required for Jinja templating in op_kwargs, so it is omitted here.
metric_and_predict_lstm = PythonOperator(
    task_id='metric_and_predict_lstm',
    python_callable=metric_and_predict_lstm_model,
    op_kwargs={
        'train_result': '{{ ti.xcom_pull(task_ids="train_lstm_model") }}'
    },
    dag=dag_3
)

# ---------------------------------------------------------------------------
# DAG 4: DuckDB to CSV
# ---------------------------------------------------------------------------
export_duckdb_to_csv = PythonOperator(
    task_id='export_duckdb_to_csv',
    python_callable=duckdb_to_csv,
    op_kwargs={
        'duckdb_path': pipeline_config['paths']['duckdb_path'],
        'output_csv_path': pipeline_config['paths']['output_csv_path']
    },
    dag=dag_4
)

# Dependencies
download_binance_csv >> extract_filenames_task >> upload_to_minio_storage
extract_data >> transform_data >> push_to_warehouse
train_lstm >> metric_and_predict_lstm
export_duckdb_to_csv
|
airflow/webserver_config.py
ADDED
|
@@ -0,0 +1,132 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#
|
| 2 |
+
# Licensed to the Apache Software Foundation (ASF) under one
|
| 3 |
+
# or more contributor license agreements. See the NOTICE file
|
| 4 |
+
# distributed with this work for additional information
|
| 5 |
+
# regarding copyright ownership. The ASF licenses this file
|
| 6 |
+
# to you under the Apache License, Version 2.0 (the
|
| 7 |
+
# "License"); you may not use this file except in compliance
|
| 8 |
+
# with the License. You may obtain a copy of the License at
|
| 9 |
+
#
|
| 10 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 11 |
+
#
|
| 12 |
+
# Unless required by applicable law or agreed to in writing,
|
| 13 |
+
# software distributed under the License is distributed on an
|
| 14 |
+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
| 15 |
+
# KIND, either express or implied. See the License for the
|
| 16 |
+
# specific language governing permissions and limitations
|
| 17 |
+
# under the License.
|
| 18 |
+
"""Default configuration for the Airflow webserver."""
|
| 19 |
+
|
| 20 |
+
from __future__ import annotations
|
| 21 |
+
|
| 22 |
+
import os
|
| 23 |
+
|
| 24 |
+
from flask_appbuilder.const import AUTH_DB
|
| 25 |
+
|
| 26 |
+
# from airflow.www.fab_security.manager import AUTH_LDAP
|
| 27 |
+
# from airflow.www.fab_security.manager import AUTH_OAUTH
|
| 28 |
+
# from airflow.www.fab_security.manager import AUTH_OID
|
| 29 |
+
# from airflow.www.fab_security.manager import AUTH_REMOTE_USER
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
basedir = os.path.abspath(os.path.dirname(__file__))
|
| 33 |
+
|
| 34 |
+
# Flask-WTF flag for CSRF
|
| 35 |
+
WTF_CSRF_ENABLED = True
|
| 36 |
+
WTF_CSRF_TIME_LIMIT = None
|
| 37 |
+
|
| 38 |
+
# ----------------------------------------------------
|
| 39 |
+
# AUTHENTICATION CONFIG
|
| 40 |
+
# ----------------------------------------------------
|
| 41 |
+
# For details on how to set up each of the following authentication, see
|
| 42 |
+
# http://flask-appbuilder.readthedocs.io/en/latest/security.html# authentication-methods
|
| 43 |
+
# for details.
|
| 44 |
+
|
| 45 |
+
# The authentication type
|
| 46 |
+
# AUTH_OID : Is for OpenID
|
| 47 |
+
# AUTH_DB : Is for database
|
| 48 |
+
# AUTH_LDAP : Is for LDAP
|
| 49 |
+
# AUTH_REMOTE_USER : Is for using REMOTE_USER from web server
|
| 50 |
+
# AUTH_OAUTH : Is for OAuth
|
| 51 |
+
AUTH_TYPE = AUTH_DB
|
| 52 |
+
|
| 53 |
+
# Uncomment to setup Full admin role name
|
| 54 |
+
# AUTH_ROLE_ADMIN = 'Admin'
|
| 55 |
+
|
| 56 |
+
# Uncomment and set to desired role to enable access without authentication
|
| 57 |
+
# AUTH_ROLE_PUBLIC = 'Viewer'
|
| 58 |
+
|
| 59 |
+
# Will allow user self registration
|
| 60 |
+
# AUTH_USER_REGISTRATION = True
|
| 61 |
+
|
| 62 |
+
# The recaptcha it's automatically enabled for user self registration is active and the keys are necessary
|
| 63 |
+
# RECAPTCHA_PRIVATE_KEY = PRIVATE_KEY
|
| 64 |
+
# RECAPTCHA_PUBLIC_KEY = PUBLIC_KEY
|
| 65 |
+
|
| 66 |
+
# Config for Flask-Mail necessary for user self registration
|
| 67 |
+
# MAIL_SERVER = 'smtp.gmail.com'
|
| 68 |
+
# MAIL_USE_TLS = True
|
| 69 |
+
# MAIL_USERNAME = 'yourappemail@gmail.com'
|
| 70 |
+
# MAIL_PASSWORD = 'passwordformail'
|
| 71 |
+
# MAIL_DEFAULT_SENDER = 'sender@gmail.com'
|
| 72 |
+
|
| 73 |
+
# The default user self registration role
|
| 74 |
+
# AUTH_USER_REGISTRATION_ROLE = "Public"
|
| 75 |
+
|
| 76 |
+
# When using OAuth Auth, uncomment to setup provider(s) info
|
| 77 |
+
# Google OAuth example:
|
| 78 |
+
# OAUTH_PROVIDERS = [{
|
| 79 |
+
# 'name':'google',
|
| 80 |
+
# 'token_key':'access_token',
|
| 81 |
+
# 'icon':'fa-google',
|
| 82 |
+
# 'remote_app': {
|
| 83 |
+
# 'api_base_url':'https://www.googleapis.com/oauth2/v2/',
|
| 84 |
+
# 'client_kwargs':{
|
| 85 |
+
# 'scope': 'email profile'
|
| 86 |
+
# },
|
| 87 |
+
# 'access_token_url':'https://accounts.google.com/o/oauth2/token',
|
| 88 |
+
# 'authorize_url':'https://accounts.google.com/o/oauth2/auth',
|
| 89 |
+
# 'request_token_url': None,
|
| 90 |
+
# 'client_id': GOOGLE_KEY,
|
| 91 |
+
# 'client_secret': GOOGLE_SECRET_KEY,
|
| 92 |
+
# }
|
| 93 |
+
# }]
|
| 94 |
+
|
| 95 |
+
# When using LDAP Auth, setup the ldap server
|
| 96 |
+
# AUTH_LDAP_SERVER = "ldap://ldapserver.new"
|
| 97 |
+
|
| 98 |
+
# When using OpenID Auth, uncomment to setup OpenID providers.
|
| 99 |
+
# example for OpenID authentication
|
| 100 |
+
# OPENID_PROVIDERS = [
|
| 101 |
+
# { 'name': 'Yahoo', 'url': 'https://me.yahoo.com' },
|
| 102 |
+
# { 'name': 'AOL', 'url': 'http://openid.aol.com/<username>' },
|
| 103 |
+
# { 'name': 'Flickr', 'url': 'http://www.flickr.com/<username>' },
|
| 104 |
+
# { 'name': 'MyOpenID', 'url': 'https://www.myopenid.com' }]
|
| 105 |
+
|
| 106 |
+
# ----------------------------------------------------
|
| 107 |
+
# Theme CONFIG
|
| 108 |
+
# ----------------------------------------------------
|
| 109 |
+
# Flask App Builder comes up with a number of predefined themes
|
| 110 |
+
# that you can use for Apache Airflow.
|
| 111 |
+
# http://flask-appbuilder.readthedocs.io/en/latest/customizing.html#changing-themes
|
| 112 |
+
# Please make sure to remove "navbar_color" configuration from airflow.cfg
|
| 113 |
+
# in order to fully utilize the theme. (or use that property in conjunction with theme)
|
| 114 |
+
# APP_THEME = "bootstrap-theme.css" # default bootstrap
|
| 115 |
+
# APP_THEME = "amelia.css"
|
| 116 |
+
# APP_THEME = "cerulean.css"
|
| 117 |
+
# APP_THEME = "cosmo.css"
|
| 118 |
+
# APP_THEME = "cyborg.css"
|
| 119 |
+
# APP_THEME = "darkly.css"
|
| 120 |
+
# APP_THEME = "flatly.css"
|
| 121 |
+
# APP_THEME = "journal.css"
|
| 122 |
+
# APP_THEME = "lumen.css"
|
| 123 |
+
# APP_THEME = "paper.css"
|
| 124 |
+
# APP_THEME = "readable.css"
|
| 125 |
+
# APP_THEME = "sandstone.css"
|
| 126 |
+
# APP_THEME = "simplex.css"
|
| 127 |
+
# APP_THEME = "slate.css"
|
| 128 |
+
# APP_THEME = "solar.css"
|
| 129 |
+
# APP_THEME = "spacelab.css"
|
| 130 |
+
# APP_THEME = "superhero.css"
|
| 131 |
+
# APP_THEME = "united.css"
|
| 132 |
+
# APP_THEME = "yeti.css"
|
analytics/BTCUSDT_report.pdf
ADDED
|
Binary file (60.6 kB). View file
|
|
|
ckpts/.gitignore
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.keras
|
| 2 |
+
*.pkl
|
ckpts/model_2025-10-28-11-33-51-(+07).h5
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:86e23253135898780db99f96e8ff9625297d3d3389bd5f928a024f49c59e547c
|
| 3 |
+
size 2626736
|
ckpts/scaler_2025-10-28-11-33-51-(+07).pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3e9ec47a4549e291a30cbac7225656f56d65069b62ead6d139323ec28a3933f3
|
| 3 |
+
size 523
|
components/__init__.py
ADDED
|
File without changes
|
components/btcusdt_ingest_data.py
ADDED
|
@@ -0,0 +1,157 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import requests
|
| 2 |
+
import zipfile
|
| 3 |
+
import io
|
| 4 |
+
import os
|
| 5 |
+
import yaml
|
| 6 |
+
import pandas as pd
|
| 7 |
+
from datetime import datetime
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
from typing import Optional
|
| 10 |
+
|
| 11 |
+
def add_column_headers(csv_data: bytes, expected_columns: list) -> bytes:
    """Attach column headers to headerless CSV bytes.

    Args:
        csv_data: Raw CSV content without a header row.
        expected_columns: Column names to assign, in order.

    Returns:
        CSV bytes with ``expected_columns`` as the header row.

    Raises:
        Exception: If the data cannot be parsed or the column count
            does not match ``expected_columns``.
    """
    try:
        # Read CSV data into a DataFrame without headers
        df = pd.read_csv(io.BytesIO(csv_data), header=None)

        # Validate column count before assigning names
        if len(df.columns) != len(expected_columns):
            raise ValueError(
                f"CSV has {len(df.columns)} columns, expected {len(expected_columns)}"
            )

        # Assign column names
        df.columns = expected_columns

        # Convert DataFrame back to CSV bytes with headers
        csv_buffer = io.StringIO()
        df.to_csv(csv_buffer, index=False)
        return csv_buffer.getvalue().encode('utf-8')

    except pd.errors.ParserError as e:
        # Chain the cause so the original parse error is preserved.
        raise Exception(f"Failed to parse CSV data: {e}") from e
    except Exception as e:
        raise Exception(f"Failed to process CSV with headers: {e}") from e
|
| 35 |
+
|
| 36 |
+
def download_and_extract_binance_data(url: str, output_path: str = "temp/input.csv") -> pd.DataFrame:
    """Download a zipped Binance kline CSV, extract it, and load it.

    Args:
        url: URL of a ZIP archive expected to contain a single CSV member.
        output_path: Local path where the extracted CSV is written.

    Returns:
        The extracted CSV loaded into a DataFrame.

    Raises:
        Exception: On download, extraction, or file-system failure.
    """
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()

        # Ensure the target directory exists; the default "temp/" may not.
        out_dir = os.path.dirname(output_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

        with io.BytesIO(response.content) as zip_file:
            with zipfile.ZipFile(zip_file, 'r') as zip_ref:
                # Binance monthly archives hold exactly one CSV member.
                csv_file_name = zip_ref.namelist()[0]
                csv_data = zip_ref.read(csv_file_name)
                with open(output_path, 'wb') as output_file:
                    output_file.write(csv_data)

        print(f"Successfully downloaded and extracted data to {output_path}")

        # Load the saved CSV back so callers can inspect/validate the data.
        df = pd.read_csv(output_path)

        print("CSV structure validated and headers added successfully")
        return df

    except requests.RequestException as e:
        raise Exception(f"Failed to download file from {url}: {e}") from e
    except zipfile.BadZipFile as e:
        raise Exception(f"Invalid ZIP file: {e}") from e
    except IOError as e:
        raise Exception(f"Failed to write to {output_path}: {e}") from e
    except Exception as e:
        raise Exception(f"Failed during processing: {e}") from e
|
| 86 |
+
|
| 87 |
+
def crawl_data_from_sources():
    """Download every configured Binance dataset for every allowed period.

    Reads ``configs/data_sources.yml`` (list of ``{name, url}`` entries) and
    ``configs/data_limit.yml`` (list of ``{name, limit}`` entries, where
    ``limit`` is a list of ``YYYY-MM`` periods), then downloads each
    ``<name>-<period>.zip`` archive and extracts it under ``temp/``.
    Per-source and per-period failures are logged and skipped.

    Returns:
        list[str]: Paths of CSV files that were downloaded successfully.

    Raises:
        Exception: If either configuration file is missing or invalid.
    """
    try:
        # Load data sources configuration
        sources_path = Path("configs/data_sources.yml")
        with open(sources_path, 'r') as file:
            data_sources = yaml.safe_load(file)
        if not data_sources or not isinstance(data_sources, list):
            raise ValueError("Invalid or empty data_sources configuration file")

        # Load data limit configuration
        limits_path = Path("configs/data_limit.yml")
        with open(limits_path, 'r') as file:
            data_limits = yaml.safe_load(file)
        if not data_limits or not isinstance(data_limits, list):
            raise ValueError("Invalid or empty data_limit configuration file")

        # Map source name -> list of allowed YYYY-MM periods.
        limits_dict = {
            limit['name']: limit['limit']
            for limit in data_limits
            if isinstance(limit, dict) and 'name' in limit and 'limit' in limit
        }

        output_paths = []
        # Process each data source independently; one bad source must not
        # abort the rest of the crawl.
        for data_source in data_sources:
            try:
                if not isinstance(data_source, dict) or 'name' not in data_source or 'url' not in data_source:
                    print(f"Skipping invalid data source: {data_source}")
                    continue

                # Get allowed periods for this data source
                allowed_periods = limits_dict.get(data_source['name'], [])

                for period in allowed_periods:
                    try:
                        # Validate period format (YYYY-MM) before building a URL.
                        try:
                            datetime.strptime(period, '%Y-%m')
                        except ValueError:
                            print(f"Invalid period format for {data_source['name']}: {period}")
                            continue

                        # Construct unique output path and ensure its directory exists.
                        output_path = f"temp/{data_source['name']}-{period}.csv"
                        os.makedirs(os.path.dirname(output_path), exist_ok=True)

                        url = f"{data_source['url']}{data_source['name']}-{period}.zip"
                        download_and_extract_binance_data(url, output_path)

                    except Exception as e:
                        print(f"Failed to process period {period} for {data_source['name']}: {e}")
                        continue
                    # Only record paths whose download fully succeeded.
                    output_paths.append(output_path)

            except Exception as e:
                print(f"Failed to process data source {data_source.get('name', 'unknown')}: {e}")
                continue

        return output_paths

    except (yaml.YAMLError, FileNotFoundError) as e:
        raise Exception(f"Failed to load configuration: {e}") from e
    except Exception as e:
        raise Exception(f"Script execution failed: {e}") from e
|
| 154 |
+
|
| 155 |
+
if __name__ == "__main__":
|
| 156 |
+
out_paths = crawl_data_from_sources()
|
| 157 |
+
print("Downloaded files:", out_paths)
|
components/datalake_cr.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
import sys
import ast

# Add the project root directory to the Python path
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
from minio_api.client import sign_in, upload_file, download_file, create_bucket, list_objects

def _as_list(value, arg_name):
    """Parse a stringified Python list (e.g. from an Airflow XCom) into a real list."""
    if isinstance(value, str):
        try:
            value = ast.literal_eval(value)
        except (ValueError, SyntaxError) as e:
            raise ValueError(f"Failed to parse {arg_name} as a list: {value}, error: {e}")
    return value

def up_to_minio(client_files, server_files, bucket_name="minio-ngrok-bucket"):
    """Upload the local CSV files to MinIO.

    Args:
        client_files: List (or stringified list) of local file paths to upload.
        server_files: List (or stringified list) of object names to store them under.
        bucket_name: Target MinIO bucket; created if it does not already exist.

    Raises:
        ValueError: If an input cannot be parsed as a list, or the lists differ in length.
        FileNotFoundError: If a local file does not exist.
    """
    # Convert stringified lists to actual lists if necessary
    client_files = _as_list(client_files, "client_files")
    server_files = _as_list(server_files, "server_files")

    # Fail loudly on mismatched lists instead of letting zip() silently drop extras.
    if len(client_files) != len(server_files):
        raise ValueError(
            f"client_files ({len(client_files)}) and server_files "
            f"({len(server_files)}) must have the same length"
        )

    # Sign in and ensure the bucket exists once up front —
    # the original re-authenticated and re-created the bucket on every iteration.
    minio_client = sign_in()
    create_bucket(minio_client, bucket_name)

    for client_file, server_file in zip(client_files, server_files):
        # Check if local file exists
        if not os.path.exists(client_file):
            raise FileNotFoundError(f"Local file {client_file} does not exist")

        # Upload file
        print("Uploading file to MinIO:", client_file, "as", server_file)
        upload_file(minio_client, bucket_name, client_file, server_file)

if __name__ == "__main__":
    # Example usage
    try:
        up_to_minio(["temp/BTCUSDT-1s-2025-09.csv"],
                    ["BTCUSDT-1s-2025-09.csv"],
                    "minio-ngrok-bucket")
        print("File uploaded successfully.")
    except Exception as e:
        print(f"Error uploading file: {e}")
|
components/delete_lstm_predict.py
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
import numpy as np
from components.model.model_utils import create_sequences
from model import build_lstm_model
from sklearn.preprocessing import MinMaxScaler
from tensorflow import keras

# Load the raw Binance kline dump (the CSV export has no header row).
df = pd.read_csv('/tmp/BTCUSDT-1s-2024-05.csv', header=None)
df.columns = [
    "open_time", "open", "high", "low", "close", "volume",
    "close_time", "quote_asset_volume", "number_of_trades",
    "taker_buy_base_asset_volume", "taker_buy_quote_asset_volume", "ignore"
]

# Scale close prices into [0, 1]; the scaler must match the one used at train time
# (here it is refit on the same file the training script used).
prices = df['close'].astype(float).values.reshape(-1, 1)
scaler = MinMaxScaler()
prices_scaled = scaler.fit_transform(prices)

# Last 20% (plus seq_length of leading context) is the held-out test split,
# mirroring the split in delete_lstm_train.py.
split_idx = int(len(prices_scaled) * 0.8)
test_data = prices_scaled[split_idx - 60:]

seq_length = 60
X_test, y_test = create_sequences(test_data, seq_length)

# Rebuild the architecture and restore the trained weights from the checkpoint.
model = build_lstm_model(seq_length)
model.load_weights('./ckpts/lstm_checkpoint.keras')

# Evaluate on the held-out split.
loss = model.evaluate(X_test, y_test, verbose=0)
print(f"Test loss: {loss}")

# Predict the next close price from the most recent window.
last_seq = prices_scaled[-seq_length:]
next_pred = model.predict(last_seq.reshape(1, seq_length, 1))
next_price = scaler.inverse_transform(next_pred)
print(f"Predicted next close price: {next_price[0][0]}")

# BUG FIX: the original script went on to call model.fit(X_train, y_train, ...)
# and then repeated the evaluate/predict section, but X_train/y_train are never
# defined in this file — the script crashed with a NameError. Training belongs
# in delete_lstm_train.py; the copy-pasted training block has been removed.
|
components/delete_lstm_train.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
import numpy as np
from components.model.model_utils import create_sequences
from model import build_lstm_model
from sklearn.preprocessing import MinMaxScaler
from tensorflow import keras
import os

# Throwaway training script: fit a small LSTM on one month of BTCUSDT 1-second
# klines and checkpoint the best weights for the companion predict script.

# Load the raw Binance kline dump (the CSV export has no header row).
df = pd.read_csv('/tmp/BTCUSDT-1s-2024-05.csv', header=None)
df.columns = [
    "open_time", "open", "high", "low", "close", "volume",
    "close_time", "quote_asset_volume", "number_of_trades",
    "taker_buy_base_asset_volume", "taker_buy_quote_asset_volume", "ignore"
]

# Scale close prices into [0, 1] for LSTM training.
prices = df['close'].astype(float).values.reshape(-1, 1)
scaler = MinMaxScaler()
prices_scaled = scaler.fit_transform(prices)

# 80/20 chronological split; the test slice starts 60 rows early so the first
# test sequence has a full seq_length window of context.
split_idx = int(len(prices_scaled) * 0.8)
train_data = prices_scaled[:split_idx]
test_data = prices_scaled[split_idx - 60:]

# Build sliding windows: each 60-step sequence predicts the next close value.
seq_length = 60
X_train, y_train = create_sequences(train_data, seq_length)
X_test, y_test = create_sequences(test_data, seq_length)

model = build_lstm_model(seq_length)

# Keep only the best weights (by validation loss) in ./ckpts.
os.makedirs('./ckpts', exist_ok=True)
checkpoint_cb = keras.callbacks.ModelCheckpoint(
    './ckpts/lstm_checkpoint.keras', save_best_only=True, monitor='val_loss'
)

# Train with the test split doubling as validation data.
model.fit(
    X_train, y_train,
    epochs=5,
    batch_size=64,
    validation_data=(X_test, y_test),
    callbacks=[checkpoint_cb],
    verbose=2
)
|
components/delete_model.py
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from tensorflow import keras
|
| 2 |
+
|
| 3 |
+
def build_lstm_model(seq_length):
    """Build and compile a two-layer LSTM regressor.

    Args:
        seq_length: Number of timesteps per input window (one feature per step).

    Returns:
        A compiled ``keras.Sequential`` model that outputs a single value.
    """
    net = keras.Sequential()
    # First recurrent layer returns the full sequence so the second can consume it.
    net.add(keras.layers.LSTM(50, return_sequences=True, input_shape=(seq_length, 1)))
    net.add(keras.layers.LSTM(50))
    net.add(keras.layers.Dense(1))
    net.compile(optimizer='adam', loss='mse')
    return net
|
components/duckdb2csv.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import duckdb
import pandas as pd
import logging

def duckdb_to_csv(duckdb_path, output_csv_path):
    """Export the ``aggregated_financial_data`` table from DuckDB to a CSV file.

    Args:
        duckdb_path: Path to the DuckDB database file.
        output_csv_path: Destination CSV path (written without the index).

    Raises:
        ValueError: If the table contains no rows.
        Exception: Any other failure is logged and re-raised.
    """
    con = None
    try:
        # Connect to DuckDB
        con = duckdb.connect(duckdb_path)
        # Query data
        df = con.execute("SELECT * FROM aggregated_financial_data").fetchdf()
        if df.empty:
            raise ValueError("No data found in table 'aggregated_financial_data'")
        # Save to CSV
        df.to_csv(output_csv_path, index=False)
        logging.info(f"Successfully exported data to {output_csv_path}")
    except Exception as e:
        logging.error(f"Error in duckdb_to_csv: {str(e)}")
        raise
    finally:
        # BUG FIX: the original never closed the connection, leaking the DB
        # handle (and its file lock) on both success and failure paths.
        if con is not None:
            con.close()

if __name__ == "__main__":
    duckdb_to_csv("duckdb_databases/financial_data.db",
                  "analytics/financial_data.csv")
|
components/duckdb_api.py
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
import duckdb
import shutil
import pandas as pd

def push_to_duckdb(duckdb_path, parquet_path, temp_parquet_path="temp/duckdb_temp_parquet"):
    """
    Push the aggregated data from a Parquet directory to DuckDB.

    Args:
        duckdb_path (str): Path to the DuckDB database file
        parquet_path (str): Path to the Parquet directory containing the aggregated data
        temp_parquet_path (str): Temporary path for storing Parquet files

    Raises:
        ValueError: If parquet_path is not a string or is not a directory.
        FileNotFoundError: If parquet_path does not exist.
        RuntimeError: If copying the directory or loading into DuckDB fails.
    """
    # Validate input parquet_path
    if not isinstance(parquet_path, str):
        raise ValueError(f"parquet_path must be a string, got {type(parquet_path)}: {parquet_path}")
    if not os.path.exists(parquet_path):
        raise FileNotFoundError(f"Parquet directory does not exist at {parquet_path}")
    if not os.path.isdir(parquet_path):
        raise ValueError(f"parquet_path must be a directory, got a file at {parquet_path}")

    # Ensure the temporary directory is clean before copying
    if os.path.exists(temp_parquet_path):
        shutil.rmtree(temp_parquet_path)
    os.makedirs(temp_parquet_path, exist_ok=True)

    try:
        # Copy the input Parquet directory to the temporary directory
        try:
            shutil.copytree(parquet_path, temp_parquet_path, dirs_exist_ok=True)
            print(f"Copied Parquet directory from {parquet_path} to {temp_parquet_path}")
        except Exception as e:
            raise RuntimeError(f"Failed to copy Parquet directory from {parquet_path} to {temp_parquet_path}: {e}")

        # Create the parent directory of the database file if needed
        directory = os.path.dirname(duckdb_path)
        if directory and not os.path.exists(directory):
            os.makedirs(directory)

        con = duckdb.connect(duckdb_path)

        # Create or replace the table in DuckDB by reading the Parquet files
        try:
            con.execute(f"""
                CREATE OR REPLACE TABLE aggregated_financial_data AS
                SELECT * FROM parquet_scan('{temp_parquet_path}/*.parquet')
            """)
            print(f"Successfully loaded data into DuckDB table from {temp_parquet_path}")
        except Exception as e:
            raise RuntimeError(f"Failed to load Parquet files into DuckDB: {e}")
        finally:
            con.close()
    finally:
        # BUG FIX: the original only removed the temp directory on the success
        # path; a copy/load failure left stale data behind. Cleanup now runs
        # unconditionally.
        if os.path.exists(temp_parquet_path):
            shutil.rmtree(temp_parquet_path)
            print(f"Cleaned up temporary directory {temp_parquet_path}")

if __name__ == "__main__":
    from process_data import process_financial_data
    duckdb_path = "duckdb_databases/financial_data.db"
    parquet_path = process_financial_data()

    try:
        push_to_duckdb(duckdb_path, parquet_path)
    except Exception as e:
        print(f"Error pushing to DuckDB: {e}")
|
| 68 |
+
|
components/model/__init__.py
ADDED
|
File without changes
|
components/model/data_utils.py
ADDED
|
@@ -0,0 +1,90 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
import logging
import tensorflow as tf
import pandas as pd
import pyarrow.parquet as pq
from sklearn.preprocessing import MinMaxScaler

# Configure logging
logger = logging.getLogger(__name__)

def create_data_loader(parquet_paths: list, scaler: MinMaxScaler, seq_length: int, batch_size: int) -> tf.data.Dataset:
    """Create a tf.data.Dataset from Parquet files for LSTM training or evaluation.

    Args:
        parquet_paths (list): List of paths to Parquet files.
        scaler (MinMaxScaler): Scaler fitted on the data.
        seq_length (int): Length of input sequences.
        batch_size (int): Batch size for the dataset.

    Returns:
        tf.data.Dataset: Dataset yielding (sequence, target) pairs with shapes
        (batch_size, seq_length, 1) and (batch_size, 1).

    Raises:
        ValueError: If inputs are invalid; also raised lazily (while the dataset
        is iterated) if no valid sequences could be generated.
    """
    if not parquet_paths:
        logger.error("No parquet paths provided")
        raise ValueError("parquet_paths cannot be empty")
    if not isinstance(scaler, MinMaxScaler):
        logger.error("Invalid scaler provided")
        raise ValueError("scaler must be an instance of MinMaxScaler")
    if not isinstance(seq_length, int) or seq_length <= 0:
        logger.error(f"Invalid seq_length: {seq_length}")
        raise ValueError("seq_length must be a positive integer")
    if not isinstance(batch_size, int) or batch_size <= 0:
        logger.error(f"Invalid batch_size: {batch_size}")
        raise ValueError("batch_size must be a positive integer")

    def _scaled_generator():
        # BUG FIX: the original incremented a nonlocal sequence counter here but
        # checked it *before* this lazy generator ever ran, so create_data_loader
        # unconditionally raised "No valid sequences generated". The emptiness
        # check now lives at the end of the generator, after real iteration.
        total_sequences = 0
        for path in parquet_paths:
            if not os.path.exists(path):
                logger.warning(f"Parquet file not found, skipping: {path}")
                continue
            try:
                file_size = os.path.getsize(path) / (1024 * 1024)  # Size in MB
                if file_size < 100:  # Load small files into memory
                    df = pd.read_parquet(path, columns=['Close'])
                    logger.debug(f"Loaded {path} into memory, size: {file_size:.2f} MB")
                    if 'Close' not in df.columns or df['Close'].isna().any():
                        logger.warning(f"Invalid or missing 'Close' column in {path}")
                        continue
                    prices = df['Close'].astype('float32').values.reshape(-1, 1)
                    if prices.size <= seq_length:
                        logger.warning(f"File {path} has {prices.size} rows, insufficient for seq_length {seq_length}")
                        continue
                    scaled = scaler.transform(prices)
                    for j in range(len(scaled) - seq_length):
                        total_sequences += 1
                        yield scaled[j:j + seq_length], scaled[j + seq_length]
                else:
                    # Stream large files in record batches to bound memory usage.
                    # NOTE(review): windows are built per 10k-row chunk, so
                    # sequences spanning a chunk boundary are dropped — confirm
                    # this is acceptable for training.
                    parquet_file = pq.ParquetFile(path)
                    for batch in parquet_file.iter_batches(batch_size=10_000, columns=['Close']):
                        chunk = batch.to_pandas()
                        if 'Close' not in chunk.columns or chunk['Close'].isna().any():
                            logger.warning(f"Invalid or missing 'Close' column in {path}")
                            continue
                        prices = chunk['Close'].astype('float32').values.reshape(-1, 1)
                        scaled = scaler.transform(prices)
                        logger.debug(f"Processing batch from {path}, scaled shape: {scaled.shape}")
                        for j in range(len(scaled) - seq_length):
                            total_sequences += 1
                            yield scaled[j:j + seq_length], scaled[j + seq_length]
            except Exception as e:
                logger.error(f"Error processing parquet file {path}: {e}")
                continue
        if total_sequences == 0:
            logger.error("No valid sequences generated from any Parquet file")
            raise ValueError("No valid sequences generated from any Parquet file")

    dataset = tf.data.Dataset.from_generator(
        _scaled_generator,
        output_types=(tf.float32, tf.float32),
        output_shapes=((seq_length, 1), (1,))
    ).batch(batch_size).prefetch(tf.data.AUTOTUNE)

    logger.info(f"Created data loader with seq_length={seq_length}, batch_size={batch_size}")
    return dataset
|
components/model/evaluation.py
ADDED
|
@@ -0,0 +1,239 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import logging
|
| 3 |
+
import pandas as pd
|
| 4 |
+
import numpy as np
|
| 5 |
+
from sklearn.preprocessing import MinMaxScaler
|
| 6 |
+
import pickle
|
| 7 |
+
from sklearn.metrics import (
|
| 8 |
+
mean_squared_error,
|
| 9 |
+
mean_absolute_error,
|
| 10 |
+
mean_absolute_percentage_error
|
| 11 |
+
)
|
| 12 |
+
from typing import Dict, List, Tuple
|
| 13 |
+
from datetime import datetime, timezone
|
| 14 |
+
import tensorflow as tf
|
| 15 |
+
import sys
|
| 16 |
+
import ast
|
| 17 |
+
|
| 18 |
+
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
|
| 19 |
+
from components.utils.file_utils import load_extract_config, get_parquet_file_names
|
| 20 |
+
from components.model.model_utils import build_model_from_config
|
| 21 |
+
from components.model.data_utils import create_data_loader
|
| 22 |
+
from components.utils.utils import parse_timezone
|
| 23 |
+
|
| 24 |
+
logging.basicConfig(
|
| 25 |
+
level=logging.INFO,
|
| 26 |
+
format='%(asctime)s %(levelname)s: %(message)s',
|
| 27 |
+
datefmt='%Y-%m-%d %H:%M:%S %Z'
|
| 28 |
+
)
|
| 29 |
+
logger = logging.getLogger(__name__)
|
| 30 |
+
|
| 31 |
+
# def model_evaluate(model, scaler: MinMaxScaler, ds: tf.data.Dataset) -> Tuple[float, float]:
|
| 32 |
+
# """Evaluate a model on a dataset and return RMSE and MAE.
|
| 33 |
+
|
| 34 |
+
# Args:
|
| 35 |
+
# model: Trained Keras model.
|
| 36 |
+
# scaler (MinMaxScaler): Scaler used for data normalization.
|
| 37 |
+
# ds (tf.data.Dataset): Dataset to evaluate on.
|
| 38 |
+
|
| 39 |
+
# Returns:
|
| 40 |
+
# Tuple[float, float]: RMSE and MAE metrics.
|
| 41 |
+
# """
|
| 42 |
+
# y_true, y_pred = [], []
|
| 43 |
+
# for X, y in ds:
|
| 44 |
+
# pred = model.predict(X, verbose=2)
|
| 45 |
+
# y_true.append(y.numpy())
|
| 46 |
+
# y_pred.append(pred)
|
| 47 |
+
# y_true = np.concatenate(y_true)
|
| 48 |
+
# y_pred = np.concatenate(y_pred)
|
| 49 |
+
# y_true_orig = scaler.inverse_transform(y_true)
|
| 50 |
+
# y_pred_orig = scaler.inverse_transform(y_pred)
|
| 51 |
+
# return (np.sqrt(mean_squared_error(y_true_orig, y_pred_orig)),
|
| 52 |
+
# mean_absolute_error(y_true_orig, y_pred_orig))
|
| 53 |
+
|
| 54 |
+
def model_evaluate(model, scaler: MinMaxScaler, ds: tf.data.Dataset) -> Tuple[float, float, float]:
    """Evaluate a model on a dataset and return RMSE, MAE and MAPE.

    Args:
        model: Trained Keras model.
        scaler (MinMaxScaler): Scaler used for data normalization.
        ds (tf.data.Dataset): Dataset to evaluate on.

    Returns:
        Tuple[float, float, float]: RMSE, MAE and MAPE, computed on the
        inverse-transformed (original-scale) values.
        (BUG FIX: the annotation and docstring previously claimed two values
        while the function returns three.)
    """
    # Collect true labels (y) from dataset
    y_true = []
    for _, y in ds:
        y_true.append(y.numpy())
    y_true = np.concatenate(y_true)

    # Predict the entire dataset
    y_pred = model.predict(ds, verbose=0)  # Silent predictions

    # Inverse transform to original scale
    y_true_orig = scaler.inverse_transform(y_true)
    y_pred_orig = scaler.inverse_transform(y_pred)

    # Calculate metrics
    return (np.sqrt(mean_squared_error(y_true_orig, y_pred_orig)),
            mean_absolute_error(y_true_orig, y_pred_orig),
            mean_absolute_percentage_error(y_true_orig, y_pred_orig))
|
| 82 |
+
|
| 83 |
+
def metric_and_predict_lstm_model(train_result: Dict) -> Dict:
    """Evaluate the trained LSTM model and predict the next price.

    Args:
        train_result (Dict): Training result dictionary from the train_lstm_model
            task (or its string repr when passed through XCom).

    Returns:
        Dict: Paths of the saved metrics CSV and prediction file, plus the
        predicted next close price.

    Raises:
        ValueError: If no training result is given or there is not enough data.
    """
    if not train_result:
        raise ValueError("No training result provided.")

    # Convert string representation to dictionary if necessary.
    # BUG FIX: ast.literal_eval requires a string; calling it on an actual
    # dict (as happens when invoked directly) raised a TypeError.
    if isinstance(train_result, str):
        train_result = ast.literal_eval(train_result)

    cfg = load_extract_config('model_config.yml')
    parquet_folder = load_extract_config('pipeline_config.yml')['paths']['parquet_folder']
    os.makedirs(parquet_folder, exist_ok=True)

    model_cfg = cfg['model']
    data_cfg = cfg['data']
    out_cfg = cfg['output']
    dt_str = train_result['datetime']
    model_filename = train_result['model_filename']
    dataset_merge = train_result['dataset_merge']

    model_path = train_result['model_path']
    scaler_path = train_result['scaler_path']
    seq_length = data_cfg['seq_length']
    batch_size = cfg['evaluation'].get('eval_batch_size', 64)

    # Load scaler and model
    with open(scaler_path, 'rb') as f:
        scaler = pickle.load(f)
    model = build_model_from_config(seq_length, cfg)
    model.load_weights(model_path)

    # Create dataset.
    # BUG FIX: the original comprehension discarded the file name
    # ([parquet_folder for el in ...]), yielding N copies of the folder path;
    # join the folder with each returned file name instead.
    parquet_paths = [os.path.join(parquet_folder, el) for el in get_parquet_file_names()]
    dataset = create_data_loader(parquet_paths, scaler, seq_length, batch_size)

    # Calculate split sizes in batches from the total number of sequences.
    total_seqs = sum(max(0, len(pd.read_parquet(path, columns=['Close'])) - seq_length)
                     for path in parquet_paths if os.path.exists(path))
    if total_seqs == 0:
        raise ValueError("Not enough sequences for evaluation.")

    steps_total = (total_seqs + batch_size - 1) // batch_size
    steps_train = int(steps_total * data_cfg['train_ratio'])
    steps_val = int(steps_total * data_cfg['val_ratio'])

    train_ds = dataset.take(steps_train)
    val_ds = dataset.skip(steps_train).take(steps_val)
    test_ds = dataset.skip(steps_train + steps_val)

    # Evaluate model on each split (RMSE, MAE, MAPE in original price scale).
    train_rmse, train_mae, train_mape = model_evaluate(model, scaler, train_ds)
    val_rmse, val_mae, val_mape = model_evaluate(model, scaler, val_ds)
    test_rmse, test_mae, test_mape = model_evaluate(model, scaler, test_ds)

    # Save metrics in long format (one row per split/metric pair).
    metrics_path = os.path.join(out_cfg['metrics']['metrics_dir'], f"metrics_{dt_str}.csv")
    os.makedirs(out_cfg['metrics']['metrics_dir'], exist_ok=True)

    metrics_data = [
        [model_filename, dataset_merge, "Train", "RMSE", train_rmse],
        [model_filename, dataset_merge, "Train", "MAE", train_mae],
        [model_filename, dataset_merge, "Train", "MAPE", train_mape],
        [model_filename, dataset_merge, "Val", "RMSE", val_rmse],
        [model_filename, dataset_merge, "Val", "MAE", val_mae],
        [model_filename, dataset_merge, "Val", "MAPE", val_mape],
        [model_filename, dataset_merge, "Test", "RMSE", test_rmse],
        [model_filename, dataset_merge, "Test", "MAE", test_mae],
        [model_filename, dataset_merge, "Test", "MAPE", test_mape],
    ]

    metrics_df = pd.DataFrame(
        metrics_data,
        columns=['model_path', 'dataset_merge', 'Split', 'Metric', 'Value']
    )
    metrics_df.to_csv(metrics_path, index=False)

    # Predict next price from the most recent seq_length closes.
    last_chunk = None
    for path in reversed(parquet_paths):
        if os.path.exists(path):
            df_tail = pd.read_parquet(path).tail(seq_length)
            if len(df_tail) >= seq_length:
                last_chunk = df_tail['Close'].values.astype('float32').reshape(-1, 1)
                break
    if last_chunk is None:
        raise ValueError("Not enough recent data for prediction.")

    last_scaled = scaler.transform(last_chunk)
    next_scaled = model.predict(last_scaled.reshape(1, seq_length, 1), verbose=2)
    next_price = scaler.inverse_transform(next_scaled)[0][0]

    # Save prediction
    pred_path = os.path.join(out_cfg['predictions']['pred_dir'], f"prediction_{dt_str}.txt")
    os.makedirs(os.path.dirname(pred_path), exist_ok=True)

    with open(pred_path, 'w') as f:
        f.write(f"Model Run: {dt_str}\n")
        f.write(f"Model File: {model_filename}\n")
        f.write(f"Dataset Merged: {dataset_merge}\n")
        f.write(f"Architecture: {model_cfg['architecture'].upper()}\n")
        f.write(f"Predicted Next Close: {next_price:.6f}\n")
        f.write(f"Based on last {seq_length} timesteps.\n\n")
        f.write("Evaluation Metrics:\n")
        f.write(f"  Train -> RMSE: {train_rmse:8.6f} | MAE: {train_mae:8.6f} | MAPE: {train_mape:8.6f}\n")
        f.write(f"  Val   -> RMSE: {val_rmse:8.6f} | MAE: {val_mae:8.6f} | MAPE: {val_mape:8.6f}\n")
        f.write(f"  Test  -> RMSE: {test_rmse:8.6f} | MAE: {test_mae:8.6f} | MAPE: {test_mape:8.6f}\n")

    logging.info(f"Next price: {next_price:.4f} | Test RMSE: {test_rmse:.6f} | Dataset: {dataset_merge}")

    return {
        'metrics_path': metrics_path,
        'prediction_path': pred_path,
        'next_price': float(next_price)
    }
|
| 207 |
+
|
| 208 |
+
if __name__ == "__main__":
    logger.info("Running standalone evaluation test")
    # Simulate training result for testing
    cfg = load_extract_config('model_config.yml')
    out_cfg = cfg['output']
    data_cfg = cfg['data']

    # Mock training result (adjust paths to match an actual trained model and scaler)
    mock_train_result = {
        'model_path': os.path.join(out_cfg['checkpoints']['model_dir'],
                                   'model_2025-10-24-21-59-42-(+07).h5'),
        'model_filename': 'model_2025-10-24-18-40-00-(+07).h5',
        'scaler_path': os.path.join(out_cfg['checkpoints']['scaler_dir'],
                                    'scaler_2025-10-24-21-59-42-(+07).pkl'),
        'datetime': '2025-10-24-21-59-42-(+07',
        'dataset_merge': 'BTCUSDT-1s-2025-08 + BTCUSDT-1s-2025-09'
    }

    try:
        # BUG FIX: metric_and_predict_lstm_model takes the training-result dict
        # directly; the original passed an unsupported `ti=` keyword (a mock
        # Airflow task instance), which raised TypeError on every run.
        result = metric_and_predict_lstm_model(mock_train_result)
        logger.info("Evaluation completed successfully!")
        logger.info(f"Result: {result}")
    except Exception as e:
        logger.error(f"Evaluation failed: {str(e)}")
    logger.info("Standalone evaluation run completed")
|
components/model/model_utils.py
ADDED
|
@@ -0,0 +1,294 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import logging
|
| 3 |
+
import numpy as np
|
| 4 |
+
import tensorflow as tf
|
| 5 |
+
import pyarrow.parquet as pq
|
| 6 |
+
import pandas as pd
|
| 7 |
+
from sklearn.preprocessing import MinMaxScaler
|
| 8 |
+
from tensorflow import keras
|
| 9 |
+
from typing import Tuple
|
| 10 |
+
from datetime import datetime, timezone, timedelta
|
| 11 |
+
import sys
|
| 12 |
+
|
| 13 |
+
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
|
| 14 |
+
from components.model.data_utils import create_data_loader
|
| 15 |
+
|
| 16 |
+
# Configure logging with +07:00 timezone
|
| 17 |
+
logging.basicConfig(
|
| 18 |
+
level=logging.INFO,
|
| 19 |
+
format='%(asctime)s %(levelname)s: %(message)s',
|
| 20 |
+
datefmt='%Y-%m-%d %H:%M:%S %Z'
|
| 21 |
+
)
|
| 22 |
+
logger = logging.getLogger(__name__)
|
| 23 |
+
|
| 24 |
+
def create_sequences(data: np.ndarray, seq_length: int) -> Tuple[np.ndarray, np.ndarray]:
    """Build (window, next-value) training pairs from a time series.

    Args:
        data (np.ndarray): Scaled time-series values, shape (n_samples, n_features).
        seq_length (int): Number of timesteps per input window.

    Returns:
        Tuple[np.ndarray, np.ndarray]: (X, y) where X has shape
            (n_windows, seq_length, n_features) and y has shape
            (n_windows, n_features); each target is the value immediately
            following its window.

    Raises:
        ValueError: If data is not an array, is empty, seq_length is not a
            positive integer, or the series is too short for one window.
    """
    if not isinstance(data, np.ndarray):
        logger.error("Input data must be a numpy array")
        raise ValueError("Input data must be a numpy array")
    if data.size == 0:
        logger.error("Input data is empty")
        raise ValueError("Input data is empty")
    if not isinstance(seq_length, int) or seq_length <= 0:
        logger.error(f"Invalid seq_length: {seq_length}")
        raise ValueError("seq_length must be a positive integer")
    if len(data) <= seq_length:
        logger.error(f"Data length {len(data)} is insufficient for seq_length {seq_length}")
        raise ValueError(f"Data length {len(data)} is insufficient for seq_length {seq_length}")

    # Sliding windows of length seq_length; the target is the next sample.
    n_windows = len(data) - seq_length
    windows = [data[start:start + seq_length] for start in range(n_windows)]
    targets = [data[start + seq_length] for start in range(n_windows)]

    X = np.array(windows)
    y = np.array(targets)

    # A 1-D input series yields 2-D window stacks; add a trailing feature axis.
    if X.ndim == 2:
        X = X.reshape(X.shape[0], X.shape[1], 1)

    logger.info(f"Created {X.shape[0]} sequences: X shape {X.shape}, y shape {y.shape}")
    return X, y
|
| 66 |
+
|
| 67 |
+
def build_model_from_config(seq_length: int, cfg: dict) -> keras.Model:
    """Build and compile a recurrent forecasting model from configuration.

    All supported architectures share one topology: three stacked recurrent
    layers (units, units // 2, units // 4) with a self-attention residual
    block after the first layer, then a regularized Dense head and a
    single-unit linear output. 'lstm' and 'custom' use LSTM cells, 'gru'
    uses GRU cells, and 'bilstm' wraps LSTM cells in Bidirectional.

    Args:
        seq_length (int): Length of input sequences.
        cfg (dict): Model configuration dictionary with a 'model' key
            containing architecture, units, layers, dropout, activation,
            optimizer, loss and learning_rate settings.

    Returns:
        keras.Model: Compiled Keras model.

    Raises:
        ValueError: If configuration is invalid or architecture is unsupported.
    """
    if not isinstance(cfg, dict) or 'model' not in cfg:
        logger.error("Invalid configuration: 'model' key missing")
        raise ValueError("Configuration must be a dictionary with a 'model' key")

    model_cfg = cfg['model']
    arch = model_cfg.get('architecture')
    units = model_cfg.get('units')
    # NOTE: 'layers' is validated for config hygiene, but the stack depth is
    # fixed at three recurrent layers regardless of its value.
    layers = model_cfg.get('layers', 1)
    dropout = model_cfg.get('dropout', 0.2)
    activation = model_cfg.get('activation', 'tanh')
    learning_rate = model_cfg.get('learning_rate', 0.001)

    if not isinstance(units, int) or units <= 0:
        logger.error(f"Invalid units: {units}")
        raise ValueError("units must be a positive integer")
    if not isinstance(layers, int) or layers <= 0:
        logger.error(f"Invalid layers: {layers}")
        raise ValueError("layers must be a positive integer")
    if not isinstance(dropout, float) or not 0 <= dropout < 1:
        logger.error(f"Invalid dropout: {dropout}")
        raise ValueError("dropout must be a float between 0 and 1")
    if arch not in ['lstm', 'bilstm', 'gru', 'custom']:
        logger.error(f"Unsupported architecture: {arch}")
        raise ValueError(f"Unsupported architecture: {arch}")
    if not isinstance(seq_length, int) or seq_length <= 0:
        logger.error(f"Invalid seq_length: {seq_length}")
        raise ValueError("seq_length must be a positive integer")
    if not isinstance(learning_rate, (int, float)) or learning_rate <= 0:
        logger.error(f"Invalid learning_rate: {learning_rate}")
        raise ValueError("learning_rate must be a positive number")

    # The four architectures previously duplicated ~40 lines each; they differ
    # only in the recurrent cell ('gru' vs LSTM) and whether that cell is
    # wrapped in Bidirectional ('bilstm'). Build the shared stack once.
    cell = keras.layers.GRU if arch == 'gru' else keras.layers.LSTM
    wrap_bidirectional = arch == 'bilstm'

    def _recurrent_layer(n_units: int, return_sequences: bool, regularized: bool):
        """One recurrent layer of the stack; L2-regularized except the last."""
        layer = cell(
            n_units, return_sequences=return_sequences, activation=activation,
            dropout=dropout, recurrent_dropout=0.1,
            kernel_regularizer=tf.keras.regularizers.l2(0.01) if regularized else None
        )
        return keras.layers.Bidirectional(layer) if wrap_bidirectional else layer

    inputs = keras.layers.Input(shape=(seq_length, 1))

    # Recurrent layer 1 (full width, returns sequences for attention).
    x = _recurrent_layer(units, True, True)(inputs)

    # Self-attention with a residual (skip) connection.
    attention = keras.layers.Attention()([x, x])
    x = keras.layers.Add()([x, attention])

    # Recurrent layers 2 and 3 (halving width; last one collapses the sequence).
    x = _recurrent_layer(units // 2, True, True)(x)
    x = _recurrent_layer(units // 4, False, False)(x)

    # Dense head.
    x = keras.layers.Dense(50, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01))(x)
    x = keras.layers.Dropout(dropout)(x)
    x = keras.layers.Dense(25, activation='relu')(x)
    x = keras.layers.Dropout(dropout)(x)

    # Single-unit linear regression output.
    outputs = keras.layers.Dense(1)(x)

    model = keras.Model(inputs, outputs)

    optimizer_name = model_cfg.get('optimizer', 'adam').lower()
    if optimizer_name == 'adam':
        optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    else:
        logger.warning(f"Optimizer {optimizer_name} not explicitly handled, using default parameters")
        optimizer = optimizer_name  # Keras resolves optimizer name strings itself

    model.compile(
        optimizer=optimizer,
        loss=model_cfg.get('loss', 'mse'),
        metrics=['mae']
    )

    logger.info(f"Built model: architecture={arch}, units={units}, layers={layers}, learning_rate={learning_rate}")
    return model
|
| 262 |
+
|
| 263 |
+
if __name__ == "__main__":
    # Manual smoke test: exercises create_sequences, create_data_loader and
    # build_model_from_config. Run from the repo root so the relative config
    # and fixture paths resolve.
    import pandas as pd
    from components.utils.file_utils import load_config

    logger.info("Running standalone tests for model_utils.py")
    # Test create_sequences on a tiny 5-point price series.
    data = np.array([[10000], [10050], [10100], [10150], [10200]])
    seq_length = 3
    X, y = create_sequences(data, seq_length)
    print(f"create_sequences: X shape {X.shape}, y shape {y.shape}")
    print(f"Sample sequence: {X[0]}, target: {y[0]}")

    # Test create_data_loader with a scaler fitted on the same series.
    scaler = MinMaxScaler()
    scaler.fit(data)
    parquet_paths = ['temp/extracted_from_minio/btcusdt_1h.parquet']
    # Create a tiny Parquet fixture on first run so the loader has input.
    if not os.path.exists(parquet_paths[0]):
        os.makedirs(os.path.dirname(parquet_paths[0]), exist_ok=True)
        pd.DataFrame({'Close': [10000, 10050, 10100, 10150, 10200]}).to_parquet(parquet_paths[0])

    dataset = create_data_loader(parquet_paths, scaler, seq_length=3, batch_size=2)
    for x, y in dataset.take(1):
        print(f"create_data_loader: x shape {x.shape}, y shape {y.shape}")

    # Test build_model_from_config for all supported architectures by
    # mutating the loaded config in place.
    config = load_config('configs/model_config.yml')
    for arch in ['lstm', 'gru', 'bilstm', 'custom']:
        config['model']['architecture'] = arch
        model = build_model_from_config(seq_length=3, cfg=config)
        print(f"\nModel summary for {arch}:")
        model.summary()
    logger.info("Standalone tests completed successfully.")
|
components/model/old_model_utils.py
ADDED
|
@@ -0,0 +1,189 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import logging
|
| 3 |
+
import numpy as np
|
| 4 |
+
import tensorflow as tf
|
| 5 |
+
import pyarrow.parquet as pq
|
| 6 |
+
import pandas as pd
|
| 7 |
+
from sklearn.preprocessing import MinMaxScaler
|
| 8 |
+
from tensorflow import keras
|
| 9 |
+
from typing import Tuple
|
| 10 |
+
from datetime import datetime, timezone, timedelta
|
| 11 |
+
import sys
|
| 12 |
+
|
| 13 |
+
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
|
| 14 |
+
from components.model.data_utils import create_data_loader
|
| 15 |
+
|
| 16 |
+
# Configure logging with +07:00 timezone
|
| 17 |
+
logging.basicConfig(
|
| 18 |
+
level=logging.INFO,
|
| 19 |
+
format='%(asctime)s %(levelname)s: %(message)s',
|
| 20 |
+
datefmt='%Y-%m-%d %H:%M:%S %Z'
|
| 21 |
+
)
|
| 22 |
+
logger = logging.getLogger(__name__)
|
| 23 |
+
|
| 24 |
+
def create_sequences(data: np.ndarray, seq_length: int) -> Tuple[np.ndarray, np.ndarray]:
    """Slice a time series into supervised (window, next-value) pairs.

    Args:
        data (np.ndarray): Scaled time-series data, shape (n_samples, n_features).
        seq_length (int): Timesteps per window.

    Returns:
        Tuple[np.ndarray, np.ndarray]: X with shape
            (n_windows, seq_length, n_features) and y with shape
            (n_windows, n_features), where y[i] follows window X[i].

    Raises:
        ValueError: For non-array/empty input, invalid seq_length, or a
            series too short to produce a single window.
    """
    if not isinstance(data, np.ndarray):
        logger.error("Input data must be a numpy array")
        raise ValueError("Input data must be a numpy array")
    if data.size == 0:
        logger.error("Input data is empty")
        raise ValueError("Input data is empty")
    if not isinstance(seq_length, int) or seq_length <= 0:
        logger.error(f"Invalid seq_length: {seq_length}")
        raise ValueError("seq_length must be a positive integer")
    if len(data) <= seq_length:
        logger.error(f"Data length {len(data)} is insufficient for seq_length {seq_length}")
        raise ValueError(f"Data length {len(data)} is insufficient for seq_length {seq_length}")

    # Vectorized windowing: row i of window_idx selects samples i..i+seq_length-1.
    n_windows = len(data) - seq_length
    window_idx = np.arange(seq_length) + np.arange(n_windows)[:, None]
    X = data[window_idx]
    y = np.array(data[seq_length:])

    # A 1-D series gives 2-D windows; append a singleton feature dimension.
    if len(X.shape) == 2:
        X = X.reshape(X.shape[0], X.shape[1], 1)

    logger.info(f"Created {X.shape[0]} sequences: X shape {X.shape}, y shape {y.shape}")
    return X, y
|
| 66 |
+
|
| 67 |
+
def build_model_from_config(seq_length: int, cfg: dict) -> keras.Model:
    """Build and compile a recurrent model based on configuration (legacy).

    Args:
        seq_length (int): Length of input sequences.
        cfg (dict): Configuration dictionary with a 'model' key containing
            architecture, units, layers, dropout, activation, optimizer,
            loss and learning_rate settings.

    Returns:
        keras.Model: Compiled Keras model with a single-unit linear output.

    Raises:
        ValueError: If configuration is invalid or architecture is unsupported.
    """
    if not isinstance(cfg, dict) or 'model' not in cfg:
        logger.error("Invalid configuration: 'model' key missing")
        raise ValueError("Configuration must be a dictionary with a 'model' key")

    model_cfg = cfg['model']
    arch = model_cfg.get('architecture')
    units = model_cfg.get('units')
    layers = model_cfg.get('layers', 1)
    dropout = model_cfg.get('dropout', 0.2)
    activation = model_cfg.get('activation', 'tanh')
    learning_rate = model_cfg.get('learning_rate', 0.001)  # Default learning rate if not specified

    if not isinstance(units, int) or units <= 0:
        logger.error(f"Invalid units: {units}")
        raise ValueError("units must be a positive integer")
    if not isinstance(layers, int) or layers <= 0:
        logger.error(f"Invalid layers: {layers}")
        raise ValueError("layers must be a positive integer")
    if not isinstance(dropout, float) or not 0 <= dropout < 1:
        logger.error(f"Invalid dropout: {dropout}")
        raise ValueError("dropout must be a float between 0 and 1")
    if arch not in ['lstm', 'bilstm', 'gru', 'custom']:
        logger.error(f"Unsupported architecture: {arch}")
        raise ValueError(f"Unsupported architecture: {arch}")
    if not isinstance(seq_length, int) or seq_length <= 0:
        logger.error(f"Invalid seq_length: {seq_length}")
        raise ValueError("seq_length must be a positive integer")
    if not isinstance(learning_rate, (int, float)) or learning_rate <= 0:
        logger.error(f"Invalid learning_rate: {learning_rate}")
        raise ValueError("learning_rate must be a positive number")

    inputs = keras.layers.Input(shape=(seq_length, 1))
    x = inputs

    if arch == 'custom':
        # BUGFIX: the custom stack was previously built inside the per-layer
        # loop, so layers > 1 re-applied an LSTM to the 2-D Dense output and
        # crashed. It is a fixed two-LSTM topology, so build it exactly once.
        x = keras.layers.LSTM(units, return_sequences=True)(x)
        x = keras.layers.LSTM(units // 2, return_sequences=False)(x)
        x = keras.layers.Dense(50, activation='relu')(x)
        x = keras.layers.Dropout(dropout)(x)
    else:
        for i in range(layers):
            return_seq = i < layers - 1  # only intermediate layers emit sequences
            if arch == 'lstm':
                x = keras.layers.LSTM(
                    units, return_sequences=return_seq, activation=activation,
                    dropout=dropout, recurrent_dropout=0.1
                )(x)
            elif arch == 'bilstm':
                x = keras.layers.Bidirectional(
                    keras.layers.LSTM(
                        units, return_sequences=return_seq, activation=activation,
                        dropout=dropout, recurrent_dropout=0.1
                    )
                )(x)
            else:  # 'gru' (guaranteed by validation above)
                x = keras.layers.GRU(
                    units, return_sequences=return_seq, activation=activation,
                    dropout=dropout, recurrent_dropout=0.1
                )(x)

    # BUGFIX: every architecture now ends in a single-unit regression head;
    # previously 'custom' skipped this and emitted a 50-unit output that was
    # trained against a scalar target under 'mse'.
    x = keras.layers.Dense(1)(x)

    model = keras.Model(inputs, x)

    # Configure optimizer with the specified learning rate
    optimizer_name = model_cfg.get('optimizer', 'adam').lower()
    if optimizer_name == 'adam':
        optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    else:
        logger.warning(f"Optimizer {optimizer_name} not explicitly handled, using default parameters")
        optimizer = optimizer_name  # Fallback to string, Keras will handle it

    model.compile(
        optimizer=optimizer,
        loss=model_cfg.get('loss', 'mse'),
        metrics=['mae']
    )

    logger.info(f"Built model: architecture={arch}, units={units}, layers={layers}, learning_rate={learning_rate}")
    return model
|
| 160 |
+
|
| 161 |
+
if __name__ == "__main__":
    # Manual smoke test for the legacy model utilities.
    import pandas as pd
    from components.utils.file_utils import load_config

    logger.info("Running standalone tests for lstm_utils.py")
    # Test create_sequences on a tiny 5-point price series.
    data = np.array([[10000], [10050], [10100], [10150], [10200]])
    seq_length = 3
    X, y = create_sequences(data, seq_length)
    print(f"create_sequences: X shape {X.shape}, y shape {y.shape}")
    print(f"Sample sequence: {X[0]}, target: {y[0]}")

    # Test create_data_loader with a scaler fitted on the same series.
    scaler = MinMaxScaler()
    scaler.fit(data)
    parquet_paths = ['temp/extracted_from_minio/btcusdt_1h.parquet']
    # Create a tiny Parquet fixture on first run so the loader has input.
    if not os.path.exists(parquet_paths[0]):
        os.makedirs(os.path.dirname(parquet_paths[0]), exist_ok=True)
        pd.DataFrame({'Close': [10000, 10050, 10100, 10150, 10200]}).to_parquet(parquet_paths[0])

    dataset = create_data_loader(parquet_paths, scaler, seq_length=3, batch_size=2)
    for x, y in dataset.take(1):
        print(f"create_data_loader: x shape {x.shape}, y shape {y.shape}")

    # Test build_model_from_config
    # NOTE(review): loads 'model_config.yml' from the current directory,
    # unlike the newer module which uses 'configs/model_config.yml' —
    # confirm which path is intended for this legacy script.
    config = load_config('model_config.yml')
    model = build_model_from_config(seq_length=3, cfg=config)
    model.summary()
    logger.info("Standalone tests completed successfully.")
|
components/model/training.py
ADDED
|
@@ -0,0 +1,194 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import logging
|
| 3 |
+
import pandas as pd
|
| 4 |
+
import numpy as np
|
| 5 |
+
from sklearn.preprocessing import MinMaxScaler
|
| 6 |
+
import pickle
|
| 7 |
+
from datetime import datetime, timezone, timedelta
|
| 8 |
+
import tensorflow as tf
|
| 9 |
+
from tensorflow import keras
|
| 10 |
+
from sklearn.metrics import mean_squared_error, mean_absolute_error
|
| 11 |
+
from typing import Dict, List
|
| 12 |
+
import sys
|
| 13 |
+
|
| 14 |
+
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
|
| 15 |
+
from components.utils.file_utils import load_extract_config, get_parquet_file_names
|
| 16 |
+
from components.model.model_utils import build_model_from_config
|
| 17 |
+
from components.model.data_utils import create_data_loader
|
| 18 |
+
from components.utils.utils import parse_timezone
|
| 19 |
+
|
| 20 |
+
# Configure logging with +07:00 timezone
|
| 21 |
+
logging.basicConfig(
|
| 22 |
+
level=logging.INFO,
|
| 23 |
+
format='%(asctime)s %(levelname)s: %(message)s',
|
| 24 |
+
datefmt='%Y-%m-%d %H:%M:%S %Z'
|
| 25 |
+
)
|
| 26 |
+
logger = logging.getLogger(__name__)
|
| 27 |
+
|
| 28 |
+
def train_lstm_model(**kwargs) -> Dict:
    """Train an LSTM model for BTC/USDT forecasting and save model and scaler.

    Loads all configured Parquet files, fits a MinMaxScaler on the merged
    'Close' column, streams batches through `create_data_loader`, trains the
    model built by `build_model_from_config` with checkpointing and early
    stopping, and writes both the best model (.h5) and the fitted scaler
    (.pkl) to the configured, timestamp-versioned checkpoint paths.

    Args:
        kwargs: Airflow task instance arguments (accepted for operator
            compatibility; not used directly).

    Returns:
        Dict: Training metadata — model path/filename, scaler path,
            timestamp string and the merged-dataset description.

    Raises:
        ValueError: If no data could be loaded, or there is not enough data
            to form a single training sequence.
    """
    # Verify GPU availability; enable memory growth so TF does not grab all VRAM.
    gpus = tf.config.list_physical_devices('GPU')
    if not gpus:
        logger.warning("No GPU detected. Training on CPU, which may be slower.")
    else:
        logger.info(f"GPUs detected: {len(gpus)}. Using CUDA for training.")
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)

    cfg = load_extract_config('model_config.yml')
    train_cfg = cfg['training']
    data_cfg = cfg['data']
    out_cfg = cfg['output']
    ver_cfg = cfg['versioning']

    # Version artifacts by the current time in the configured timezone
    # (e.g. '+07:00'); dt_str ends with '-(+07)'.
    tz = parse_timezone(ver_cfg['timezone'])
    dt = datetime.now(tz)
    dt_str = dt.strftime(ver_cfg['datetime_format']) + f"-({dt.strftime('%z')[:3]})"
    model_path = os.path.join(out_cfg['checkpoints']['model_dir'], f"model_{dt_str}.h5")
    scaler_path = os.path.join(out_cfg['checkpoints']['scaler_dir'], f"scaler_{dt_str}.pkl")
    parquet_folder = load_extract_config('pipeline_config.yml')['paths']['parquet_folder']

    # Ensure output directories exist
    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    os.makedirs(os.path.dirname(scaler_path), exist_ok=True)

    # Load data once, remembering per-file row counts so sequence counts can
    # be computed later without re-reading every Parquet file.
    file_names = get_parquet_file_names()
    parquet_paths = [os.path.join(parquet_folder, el) for el in file_names]
    frames = []
    row_counts = {}
    used_files = []

    for path, name in zip(parquet_paths, file_names):
        if os.path.exists(path):
            df = pd.read_parquet(path)
            logger.info(f"Loaded {path} with {len(df)} rows")
            frames.append(df)
            row_counts[path] = len(df)
            clean_name = name.replace(".parquet", "").replace(".csv", "")
            used_files.append(clean_name)
        else:
            logger.warning(f"File not found: {path}")

    # Concatenate once instead of inside the loop: repeated pd.concat copies
    # the accumulated frame each iteration (quadratic in total rows).
    all_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    if all_df.empty:
        logger.error("No data loaded from Parquet files")
        raise ValueError("No data loaded from Parquet files")

    dataset_merge = " + ".join(used_files) if used_files else "none"
    logger.info(f"Dataset merged: {dataset_merge}, total rows: {len(all_df)}")

    # Fit the scaler on the full closing-price series; the scaled batches
    # themselves are produced by create_data_loader.
    scaler = MinMaxScaler()
    prices = all_df['Close'].astype(float).values.reshape(-1, 1)
    seq_length = data_cfg['seq_length']
    if prices.size <= seq_length:
        logger.error(f"Total data size {prices.size} is insufficient for seq_length {seq_length}")
        raise ValueError(f"Total data size {prices.size} is insufficient for seq_length {seq_length}")
    scaler.fit(prices)

    batch_size = train_cfg.get('batch_size', 64)  # Default to 64 if not specified
    if batch_size > 8192:
        logger.warning(f"Batch size {batch_size} is large; reducing to 64 to avoid memory issues")
        batch_size = 64

    dataset = create_data_loader(parquet_paths, scaler, seq_length, batch_size)

    # Exact number of sequences, from the row counts gathered during loading.
    total_seqs = 0
    for path in parquet_paths:
        if path in row_counts:
            n_rows = row_counts[path]
            seqs = max(0, n_rows - seq_length)
            total_seqs += seqs
            logger.info(f"File {path}: {n_rows} rows, {seqs} sequences")

    if total_seqs == 0:
        logger.error("Not enough sequences for training")
        raise ValueError("Not enough sequences for training")

    # Split whole batches into train/val/test portions (ceil division so the
    # final partial batch is counted).
    steps_total = (total_seqs + batch_size - 1) // batch_size
    train_ratio = data_cfg.get('train_ratio', 0.7)
    val_ratio = data_cfg.get('val_ratio', 0.2)
    steps_train = max(1, int(steps_total * train_ratio))
    steps_val = max(1, int(steps_total * val_ratio))
    steps_test = max(1, steps_total - steps_train - steps_val)
    logger.info(f"Dataset splits: total_steps={steps_total}, train={steps_train}, val={steps_val}, test={steps_test}")

    # Persist the fitted scaler alongside the model checkpoint.
    with open(scaler_path, 'wb') as f:
        pickle.dump(scaler, f)

    train_ds = dataset.take(steps_train)
    val_ds = dataset.skip(steps_train).take(steps_val)
    # NOTE(review): test_ds is currently unused — held-out evaluation lives
    # elsewhere in the pipeline; kept so the split stays explicit.
    test_ds = dataset.skip(steps_train + steps_val)

    # Build and train model; checkpoint keeps only the best val_loss weights.
    model = build_model_from_config(seq_length, cfg)
    checkpoint_cb = keras.callbacks.ModelCheckpoint(
        model_path, save_best_only=True, monitor='val_loss', verbose=0
    )
    early_stop = keras.callbacks.EarlyStopping(
        monitor='val_loss', patience=train_cfg['patience'], restore_best_weights=True
    )

    # Log model summary through the module logger.
    model.summary(print_fn=lambda x: logger.info(x))

    # Train with exact steps_per_epoch so the finite dataset is not exhausted.
    model.fit(
        train_ds,
        epochs=train_cfg['epochs'],
        steps_per_epoch=steps_train,
        validation_data=val_ds,
        validation_steps=steps_val,
        callbacks=[checkpoint_cb, early_stop],
        verbose=2
    )

    return {
        'model_path': model_path,
        'model_filename': os.path.basename(model_path),
        'scaler_path': scaler_path,
        'datetime': dt_str,
        'dataset_merge': dataset_merge,
    }
|
| 188 |
+
|
| 189 |
+
if __name__ == "__main__":
    # Manual smoke test: run a full training pass outside Airflow.
    # Requires the configs/ files and Parquet inputs referenced by them.
    logger.info("Running standalone training test")
    result = train_lstm_model()
    print("Training completed successfully!")
    print(result)
    logger.info("Standalone training run completed")
|
components/old-process_data.py
ADDED
|
@@ -0,0 +1,111 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# scripts/process_data.py
|
| 2 |
+
from pyspark.sql import SparkSession
|
| 3 |
+
from pyspark.sql.types import StructType, StructField, LongType, DoubleType, IntegerType
|
| 4 |
+
from pyspark.sql.functions import col, row_number, floor, first, max, min, last, sum
|
| 5 |
+
from pyspark.sql.window import Window
|
| 6 |
+
import os
|
| 7 |
+
import sys
|
| 8 |
+
import shutil
|
| 9 |
+
import pandas as pd
|
| 10 |
+
|
| 11 |
+
# Add the project root directory to the Python path
|
| 12 |
+
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
|
| 13 |
+
from minio_api.client import sign_in
|
| 14 |
+
from minio_api.minio_utils import get_minio_data
|
| 15 |
+
|
| 16 |
+
def initialize_spark_session(app_name="MinIO to Spark DataFrame",
                             driver_memory="4g", executor_memory="4g"):
    """Create (or reuse) a SparkSession with the given memory settings.

    Args:
        app_name (str): Spark application name shown in the Spark UI.
        driver_memory (str): Value for ``spark.driver.memory`` (e.g. '4g').
        executor_memory (str): Value for ``spark.executor.memory``.

    Returns:
        SparkSession: The active session (getOrCreate reuses an existing one,
        in which case the memory configs may not take effect).
    """
    return SparkSession.builder \
        .appName(app_name) \
        .config("spark.driver.memory", driver_memory) \
        .config("spark.executor.memory", executor_memory) \
        .getOrCreate()
|
| 23 |
+
|
| 24 |
+
def create_dataframe_from_csv(spark, csv_lines, temp_parquet_path="temp/temp_parquet_chunks",
                              chunk_size=int(3e+6)):
    """Convert raw Binance kline CSV lines into a Spark DataFrame via Parquet chunks.

    The lines are parallelized in chunks of ``chunk_size`` rows, parsed with a
    fixed 12-column kline schema, appended to ``temp_parquet_path`` as Parquet,
    and the combined Parquet directory is read back as one DataFrame.

    Args:
        spark: Active SparkSession.
        csv_lines (list[str]): Raw CSV lines; an optional "Open time,..." header
            row is stripped.
        temp_parquet_path (str): Scratch directory, wiped before writing.
        chunk_size (int): Rows per parallelize/write chunk (default 3 million).

    Returns:
        DataFrame: All parsed rows, read back from the Parquet chunks.
    """
    os.makedirs(temp_parquet_path, exist_ok=True)
    # Fixed schema matching Binance monthly kline CSV column order.
    schema = StructType([
        StructField("Open time", LongType(), True),
        StructField("Open", DoubleType(), True),
        StructField("High", DoubleType(), True),
        StructField("Low", DoubleType(), True),
        StructField("Close", DoubleType(), True),
        StructField("Volume", DoubleType(), True),
        StructField("Close time", LongType(), True),
        StructField("Quote asset volume", DoubleType(), True),
        StructField("Number of trades", IntegerType(), True),
        StructField("Taker buy base asset volume", DoubleType(), True),
        StructField("Taker buy quote asset volume", DoubleType(), True),
        StructField("Ignore", IntegerType(), True)
    ])

    # Drop the header row if present so it is not parsed as data.
    if csv_lines and csv_lines[0].startswith("Open time,"):
        data_lines = csv_lines[1:]
    else:
        data_lines = csv_lines

    # Wipe any stale chunks (note: the makedirs above is undone here on purpose
    # before the append writes recreate the directory).
    if os.path.exists(temp_parquet_path):
        shutil.rmtree(temp_parquet_path)

    # Process in chunks to bound driver memory when parallelizing the lines.
    for i in range(0, len(data_lines), chunk_size):
        chunk = data_lines[i:i + chunk_size]
        rdd_chunk = spark.sparkContext.parallelize(chunk).repartition(8)
        df_chunk = spark.read.schema(schema).csv(rdd_chunk, header=False)
        df_chunk.write.mode("append").parquet(temp_parquet_path)

    return spark.read.parquet(temp_parquet_path)
|
| 57 |
+
|
| 58 |
+
def resample_dataframe(df, track_each=3600):
    """Downsample row-ordered kline data into OHLC buckets of ``track_each`` rows.

    Rows are ordered by "Open time", assigned a running row number, then
    grouped into consecutive buckets of ``track_each`` rows (3600 1-second
    rows ~= 1 hour). Each bucket keeps first Open time/Open, max High,
    min Low, last Close, and the summed trade count.

    NOTE(review): Window.orderBy without partitioning pulls all rows into a
    single partition — acceptable for monthly files, but a scalability limit.

    Args:
        df: Input DataFrame containing at least the kept kline columns.
        track_each (int): Number of input rows per output bucket.

    Returns:
        DataFrame: One row per bucket with OHLC + "Number of trades" columns.
    """
    keep_cols = ["Open time", "Open", "High", "Low", "Close", "Number of trades"]
    df = df.select(keep_cols)
    window_spec = Window.orderBy("Open time")
    df = df.withColumn("row_number", row_number().over(window_spec))
    # Integer-divide the 0-based row index to form consecutive bucket ids.
    df = df.withColumn("group_id", floor((col("row_number") - 1) / track_each))
    aggregations = [
        first("Open time").alias("Open time"),
        first("Open").alias("Open"),
        max("High").alias("High"),
        min("Low").alias("Low"),
        last("Close").alias("Close"),
        sum("Number of trades").alias("Number of trades")
    ]
    aggregated_df = df.groupBy("group_id").agg(*aggregations)
    return aggregated_df.select("Open time", "Open", "High", "Low", "Close", "Number of trades")
|
| 74 |
+
|
| 75 |
+
def process_financial_data(bucket_name="minio-ngrok-bucket", file_name="BTCUSDT-1s-2025-09.csv",
                           temp_parquet_path="temp/temp_parquet_chunks",
                           output_parquet_path="temp/aggregated_output"):
    """Fetch one kline CSV from MinIO, resample it, and persist the result as Parquet.

    Pipeline: MinIO download -> Spark DataFrame -> OHLC resample ->
    overwrite-write to ``output_parquet_path``.

    Args:
        bucket_name (str): MinIO bucket to read from.
        file_name (str): Object key of the CSV inside the bucket.
        temp_parquet_path (str): Scratch dir used by create_dataframe_from_csv.
        output_parquet_path (str): Destination Parquet directory.

    Returns:
        str: ``output_parquet_path`` on success.

    Raises:
        FileNotFoundError: If the output Parquet directory was not created.
        Exception: Any upstream failure is logged and re-raised.
    """
    minio_client = sign_in()
    spark = initialize_spark_session()

    try:
        csv_lines = get_minio_data(minio_client, bucket_name, file_name)
        print(f"Fetched CSV data from MinIO: {len(csv_lines)} lines")
        df = create_dataframe_from_csv(spark, csv_lines, temp_parquet_path)
        print("Created Spark DataFrame from CSV data.")
        aggregated_df = resample_dataframe(df)
        print("Resampled DataFrame with OHLC aggregations.")

        # Save aggregated DataFrame to a temporary Parquet directory
        os.makedirs(os.path.dirname(output_parquet_path), exist_ok=True)
        aggregated_df.write.mode("overwrite").parquet(output_parquet_path)
        print(f"Saved aggregated DataFrame to {output_parquet_path}")

        # Verify that the Parquet directory exists
        if not os.path.exists(output_parquet_path) or not os.path.isdir(output_parquet_path):
            raise FileNotFoundError(f"Parquet directory {output_parquet_path} was not created or is not a directory.")
        else:
            print(f"Verified: Parquet directory exists at {output_parquet_path}")

        return output_parquet_path

    except Exception as e:
        print(f"Error in process_financial_data: {e}")
        raise
    finally:
        # Always release the Spark session, even on failure.
        spark.stop()
|
| 107 |
+
|
| 108 |
+
if __name__ == "__main__":
    # Example usage: run the full MinIO -> Spark -> Parquet pipeline with the
    # default bucket/file and print where the aggregated output landed.
    output_parquet_path = process_financial_data()
    print(output_parquet_path)
|
components/process_data.py
ADDED
|
@@ -0,0 +1,150 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pyspark.sql import SparkSession
|
| 2 |
+
from pyspark.sql.types import StructType, StructField, LongType, DoubleType, IntegerType
|
| 3 |
+
from pyspark.sql.functions import col, row_number, floor, first, max, min, last, sum
|
| 4 |
+
from pyspark.sql.window import Window
|
| 5 |
+
|
| 6 |
+
import os
|
| 7 |
+
import sys
|
| 8 |
+
import shutil
|
| 9 |
+
import pandas as pd
|
| 10 |
+
import ast
|
| 11 |
+
import io
|
| 12 |
+
|
| 13 |
+
# Add the project root directory to the Python path
|
| 14 |
+
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
|
| 15 |
+
from minio_api.minio_utils import get_minio_data
|
| 16 |
+
from minio_api.client import sign_in
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def initialize_spark_session(app_name="MinIO to Spark DataFrame",
                             driver_memory="4g", executor_memory="4g"):
    """Build (or fetch the already-running) SparkSession.

    Args:
        app_name (str): Application name shown in the Spark UI.
        driver_memory (str): Value for ``spark.driver.memory``.
        executor_memory (str): Value for ``spark.executor.memory``.

    Returns:
        SparkSession: Active session; getOrCreate reuses an existing one.
    """
    builder = SparkSession.builder.appName(app_name)
    builder = builder.config("spark.driver.memory", driver_memory)
    builder = builder.config("spark.executor.memory", executor_memory)
    return builder.getOrCreate()
|
| 26 |
+
|
| 27 |
+
def create_dataframe_from_csv(spark, parquet_file_path, schema, temp_parquet_path="temp/temp_parquet_chunks",
                              chunk_size=int(3e+6)):
    """Load a Parquet file with an explicit schema, round-tripping via a scratch dir.

    NOTE(review): despite the historical name, this version reads *Parquet*
    (produced by extract_from_minio), not CSV, and ``chunk_size`` is unused —
    kept only for signature compatibility with the old CSV-based version.

    Args:
        spark: Active SparkSession.
        parquet_file_path (str): Path to the input Parquet file.
        schema: Spark StructType applied when reading the input.
        temp_parquet_path (str): Scratch directory, wiped then rewritten.
        chunk_size (int): Unused; retained for backward compatibility.

    Returns:
        DataFrame: Data read back from ``temp_parquet_path``.
    """
    os.makedirs(temp_parquet_path, exist_ok=True)

    # Clear the temporary Parquet path if it exists
    if os.path.exists(temp_parquet_path):
        shutil.rmtree(temp_parquet_path)

    # Read Parquet file directly with Spark, applying the schema
    df = spark.read.schema(schema).parquet(parquet_file_path)

    # Write the DataFrame to the temporary Parquet path (for consistency with original logic)
    df.write.mode("append").parquet(temp_parquet_path)

    # Read back the Parquet data from the temporary path
    return spark.read.parquet(temp_parquet_path)
|
| 43 |
+
|
| 44 |
+
def resample_dataframe(df, track_each=3600):
    """Downsample row-ordered kline data into OHLC buckets of ``track_each`` rows.

    Rows are ordered by "Open time", numbered, and grouped into consecutive
    buckets of ``track_each`` rows (3600 1-second rows ~= 1 hour). Each bucket
    keeps first Open time/Open, max High, min Low, last Close, and the summed
    trade count.

    NOTE(review): Window.orderBy without a partition key funnels all rows
    through one partition — fine for monthly files, a limit at larger scale.

    Args:
        df: Input DataFrame containing at least the kept kline columns.
        track_each (int): Number of input rows per output bucket.

    Returns:
        DataFrame: One row per bucket with OHLC + "Number of trades" columns.
    """
    keep_cols = ["Open time", "Open", "High", "Low", "Close", "Number of trades"]
    df = df.select(keep_cols)
    window_spec = Window.orderBy("Open time")
    df = df.withColumn("row_number", row_number().over(window_spec))
    # 0-based row index integer-divided by bucket size -> consecutive bucket ids.
    df = df.withColumn("group_id", floor((col("row_number") - 1) / track_each))
    aggregations = [
        first("Open time").alias("Open time"),
        first("Open").alias("Open"),
        max("High").alias("High"),
        min("Low").alias("Low"),
        last("Close").alias("Close"),
        sum("Number of trades").alias("Number of trades")
    ]
    aggregated_df = df.groupBy("group_id").agg(*aggregations)
    return aggregated_df.select("Open time", "Open", "High", "Low", "Close", "Number of trades")
|
| 60 |
+
|
| 61 |
+
def extract_from_minio(bucket_name="minio-ngrok-bucket",
                       file_names=None):
    """Download kline CSVs from MinIO and stage each one locally as Parquet.

    For every object key in ``file_names``, the CSV lines are fetched from
    MinIO, parsed with the fixed Binance kline header, and written to
    ``temp/extracted_from_minio/<name>.parquet``.

    Args:
        bucket_name (str): MinIO bucket to read from.
        file_names (list[str] | None): Object keys to fetch. Defaults to
            ["BTCUSDT-1s-2025-09.csv"]. (Default is created per call — the
            previous mutable-list default was shared across calls.)

    Returns:
        list[str]: Local paths of the staged Parquet files, in input order.

    Raises:
        ValueError: If MinIO returns no data for a requested file.
    """
    if file_names is None:
        file_names = ["BTCUSDT-1s-2025-09.csv"]

    minio_client = sign_in()
    out_parquet_file_paths = []
    # Binance monthly kline CSVs ship without a header row; supply it here.
    headers = [
        "Open time", "Open", "High", "Low", "Close", "Volume",
        "Close time", "Quote asset volume", "Number of trades",
        "Taker buy base asset volume", "Taker buy quote asset volume", "Ignore"
    ]

    for file_name in file_names:
        csv_lines = get_minio_data(minio_client, bucket_name, file_name)
        if not csv_lines:
            raise ValueError(f"No data retrieved from MinIO for bucket {bucket_name}, file {file_name}")
        temp_parquet_path = f"temp/extracted_from_minio/{os.path.splitext(os.path.basename(file_name))[0]}.parquet"
        os.makedirs(os.path.dirname(temp_parquet_path), exist_ok=True)

        # Convert CSV lines to DataFrame with specified headers
        df = pd.read_csv(io.StringIO('\n'.join(csv_lines)), names=headers)
        df.to_parquet(temp_parquet_path, index=False)

        out_parquet_file_paths.append(temp_parquet_path)

    return out_parquet_file_paths
|
| 85 |
+
|
| 86 |
+
def transform_financial_data(parquet_file_paths,
                             temp_parquet_path="temp/temp_parquet_chunks",
                             output_parquet_path="temp/aggregated_output"):
    """Resample staged kline Parquet files into one aggregated Parquet directory.

    Each input file is loaded with a fixed 12-column kline schema, resampled
    into OHLC buckets by resample_dataframe, and appended into
    ``output_parquet_path``.

    Args:
        parquet_file_paths (list[str] | str): Local Parquet paths, or a string
            repr of such a list (as passed through Airflow templated args).
        temp_parquet_path (str): Scratch dir used by create_dataframe_from_csv.
        output_parquet_path (str): Destination Parquet directory (append mode,
            so results accumulate across files and across runs).

    Returns:
        str: ``output_parquet_path``.

    Raises:
        ValueError: If a string ``parquet_file_paths`` cannot be parsed as a list.
        FileNotFoundError: If the output Parquet directory was not created.
    """
    spark = initialize_spark_session()

    try:
        # Define the schema
        schema = StructType([
            StructField("Open time", LongType(), True),
            StructField("Open", DoubleType(), True),
            StructField("High", DoubleType(), True),
            StructField("Low", DoubleType(), True),
            StructField("Close", DoubleType(), True),
            StructField("Volume", DoubleType(), True),
            StructField("Close time", LongType(), True),
            StructField("Quote asset volume", DoubleType(), True),
            StructField("Number of trades", LongType(), True),
            StructField("Taker buy base asset volume", DoubleType(), True),
            StructField("Taker buy quote asset volume", DoubleType(), True),
            StructField("Ignore", LongType(), True)
        ])

        # output_parquet_paths = []
        # Airflow XCom/templating can deliver the list as its string repr;
        # literal_eval converts it back without running arbitrary code.
        if isinstance(parquet_file_paths, str):
            try:
                parquet_file_paths = ast.literal_eval(parquet_file_paths)
            except (ValueError, SyntaxError) as e:
                raise ValueError(f"Failed to parse server_files as a list: {parquet_file_paths}, error: {e}")

        for parquet_file_path in parquet_file_paths:
            # Create DataFrame using create_dataframe_from_csv
            df = create_dataframe_from_csv(spark, parquet_file_path, schema, temp_parquet_path)
            print("Created Spark DataFrame from CSV file.")
            aggregated_df = resample_dataframe(df)
            print("Resampled DataFrame with OHLC aggregations.")

            # Save aggregated DataFrame to a temporary Parquet directory
            os.makedirs(os.path.dirname(output_parquet_path), exist_ok=True)
            # aggregated_df.write.mode("overwrite").parquet(output_parquet_path)
            aggregated_df.write.mode("append").parquet(output_parquet_path)
            print(f"Saved aggregated DataFrame to {output_parquet_path}")

            # Verify that the Parquet directory exists
            if not os.path.exists(output_parquet_path) or not os.path.isdir(output_parquet_path):
                raise FileNotFoundError(f"Parquet directory {output_parquet_path} was not created or is not a directory.")
            else:
                print(f"Verified: Parquet directory exists at {output_parquet_path}")
            # output_parquet_paths.append(output_parquet_path)

        # name_output_parquet_paths = [os.path.basename(path) for path in output_parquet_paths]

        return output_parquet_path#, name_output_parquet_paths

    except Exception as e:
        print(f"Error in transform_financial_data: {e}")
        raise
    finally:
        # Always release the Spark session, even on failure.
        spark.stop()
|
| 144 |
+
|
| 145 |
+
if __name__ == "__main__":
    # Example usage: stage the default MinIO files locally, then aggregate them.
    extracted_parquet_paths = extract_from_minio()
    # transform_financial_data returns a single output path (the tuple return
    # is commented out in its body); the previous two-target unpacking here
    # raised "cannot unpack non-iterable"/ValueError at runtime.
    output_parquet_path = transform_financial_data(extracted_parquet_paths)
    print(output_parquet_path)
|
components/utils/__init__.py
ADDED
|
File without changes
|
components/utils/file_utils.py
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import yaml
|
| 3 |
+
import logging
|
| 4 |
+
from typing import Dict, List
|
| 5 |
+
|
| 6 |
+
# Configure logging
|
| 7 |
+
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
| 8 |
+
|
| 9 |
+
def load_extract_config(config_name: str) -> Dict:
    """Load and parse a YAML configuration file from the project's configs directory.

    Args:
        config_name (str): Name of the config file (e.g., 'model_config.yml').

    Returns:
        Dict: Parsed configuration dictionary ({} if the file is empty).

    Raises:
        ValueError: If config_name is empty or not a string.
        FileNotFoundError: If the configuration file does not exist.
        yaml.YAMLError: If the file is not valid YAML.
    """
    if not isinstance(config_name, str) or not config_name.strip():
        raise ValueError("config_name must be a non-empty string")

    # configs/ lives two directories above this module (components/utils/).
    base_dir = os.path.dirname(__file__)
    config_path = os.path.abspath(os.path.join(base_dir, '..', '..', 'configs', config_name))
    logging.debug(f"Attempting to load config from: {config_path}")

    if not os.path.exists(config_path):
        logging.error(f"Configuration file not found: {config_path}")
        raise FileNotFoundError(f"Configuration file not found: {config_path}")

    try:
        with open(config_path, 'r') as fh:
            parsed = yaml.safe_load(fh)
    except yaml.YAMLError as e:
        logging.error(f"Failed to parse YAML in {config_name}: {e}")
        raise

    # safe_load returns None for an empty document; normalize to an empty dict.
    if parsed is None:
        logging.warning(f"Configuration file {config_name} is empty")
        return {}
    logging.info(f"Successfully loaded config: {config_name}")
    return parsed
|
| 43 |
+
|
| 44 |
+
def get_parquet_file_names() -> List[str]:
    """Retrieve Parquet file names derived from the extract_data.yml configuration.

    Each configured CSV file name has its ".csv" suffix swapped for ".parquet".
    Names without a ".csv" suffix are passed through unchanged.

    Returns:
        List[str]: List of Parquet file names derived from CSV file names.

    Raises:
        FileNotFoundError: If extract_data.yml is missing.
        ValueError: If no files are specified in the configuration.
    """
    config = load_extract_config('extract_data.yml')
    files = config.get('files', [])
    if not files:
        logging.error("No files specified in extract_data.yml")
        raise ValueError("No files specified in extract_data.yml")

    # Swap only a trailing ".csv" — str.replace would also rewrite ".csv"
    # occurring mid-name (e.g. "data.csv.bak" -> "data.parquet.bak").
    parquet_files = [
        f[:-len(".csv")] + ".parquet" if f.endswith(".csv") else f
        for f in files
    ]
    logging.debug(f"Derived Parquet file names: {parquet_files}")
    return parquet_files
|
| 63 |
+
|
| 64 |
+
def load_pipeline_config() -> Dict:
    """Load the pipeline configuration from pipeline_config.yml.

    Returns:
        Dict: Pipeline configuration dictionary.

    Raises:
        FileNotFoundError: If pipeline_config.yml is missing.
    """
    pipeline_cfg = load_extract_config('pipeline_config.yml')
    logging.debug(f"Pipeline config loaded: {pipeline_cfg}")
    return pipeline_cfg
|
| 76 |
+
|
| 77 |
+
def define_server_filenames(**kwargs) -> List[str]:
    """Extract base filenames from client file paths pulled via Airflow XCom.

    Args:
        kwargs: Airflow task arguments; must contain 'ti' (task instance)
            whose xcom_pull('download_binance_csv') yields the client paths.

    Returns:
        List[str]: Base filenames (no directory components).

    Raises:
        KeyError: If 'ti' is not provided in kwargs.
        ValueError: If no files are retrieved from XCom.
    """
    if 'ti' not in kwargs:
        logging.error("Task instance 'ti' not provided in kwargs")
        raise KeyError("Task instance 'ti' not provided in kwargs")

    pulled = kwargs['ti'].xcom_pull(task_ids='download_binance_csv')
    if pulled is None:
        logging.error("No files retrieved from XCom for task 'download_binance_csv'")
        raise ValueError("No files retrieved from XCom for task 'download_binance_csv'")

    # XCom may hand back a single path or a list of paths; normalize to a list.
    client_files = pulled if isinstance(pulled, list) else [pulled]
    server_files = [os.path.basename(path) for path in client_files]
    logging.debug(f"Extracted server filenames: {server_files}")
    return server_files
|
components/utils/utils.py
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from datetime import timezone, timedelta
|
| 2 |
+
import re
|
| 3 |
+
|
| 4 |
+
def parse_timezone(tz_offset_str):
    """
    Parse a timezone offset string (e.g., '+07:00') and return a timezone object.

    Args:
        tz_offset_str (str): Timezone offset in format '[+-]HH:MM'

    Returns:
        timezone: A datetime.timezone object with the specified offset

    Raises:
        ValueError: If the timezone format is invalid
    """
    # fullmatch (not match) rejects trailing garbage such as '+07:00:00' or
    # '+07:00x', which the previous prefix match silently accepted.
    match = re.fullmatch(r'([+-])(\d{2}):(\d{2})', tz_offset_str)
    if not match:
        raise ValueError(f"Invalid timezone format: {tz_offset_str}")

    sign, hours, minutes = match.groups()
    hours, minutes = int(hours), int(minutes)
    if sign == '-':
        # Negate both components so '-07:30' yields a total offset of -7h30m.
        hours, minutes = -hours, -minutes

    return timezone(timedelta(hours=hours, minutes=minutes))
|
configs/data_limit.yml
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
- name: BTCUSDT-1s
|
| 2 |
+
limit:
|
| 3 |
+
- "2025-08"
|
| 4 |
+
- "2025-09"
|
configs/data_sources.yml
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
- name: BTCUSDT-1s
|
| 2 |
+
url: https://data.binance.vision/data/spot/monthly/klines/BTCUSDT/1s/
|
configs/delete_lstm_hyperparams.yml
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# LSTM / BiLSTM / GRU / Custom Model Hyperparameters
|
| 2 |
+
architecture: bilstm # Options: lstm, bilstm, gru, custom
|
| 3 |
+
seq_length: 60 # Number of time steps to look back
|
| 4 |
+
units: 100
|
| 5 |
+
layers: 2
|
| 6 |
+
dropout: 0.2
|
| 7 |
+
activation: tanh
|
| 8 |
+
|
| 9 |
+
optimizer: adam
|
| 10 |
+
loss: mse
|
| 11 |
+
epochs: 50
|
| 12 |
+
batch_size: 64
|
| 13 |
+
patience: 10
|
| 14 |
+
|
| 15 |
+
train_ratio: 0.8
|
| 16 |
+
val_ratio: 0.1
|
| 17 |
+
test_ratio: 0.1
|
configs/extract_data.yml
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
files:
|
| 2 |
+
- BTCUSDT-1s-2025-08.csv
|
| 3 |
+
- BTCUSDT-1s-2025-09.csv
|
| 4 |
+
storage_folder: temp/extracted_from_minio
|
configs/model_config.yml
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# =============================================================================
|
| 2 |
+
# Unified Model & Output Configuration
|
| 3 |
+
# =============================================================================
|
| 4 |
+
|
| 5 |
+
model:
|
| 6 |
+
architecture: bilstm # lstm | bilstm | gru | custom
|
| 7 |
+
units: 100 #100
|
| 8 |
+
layers: 3 #2
|
| 9 |
+
dropout: 0.2
|
| 10 |
+
activation: tanh
|
| 11 |
+
optimizer: adam
|
| 12 |
+
learning_rate: 0.0005
|
| 13 |
+
loss: mse
|
| 14 |
+
|
| 15 |
+
training:
|
| 16 |
+
epochs: 20
|
| 17 |
+
batch_size: 6144
|
| 18 |
+
patience: 5
|
| 19 |
+
|
| 20 |
+
evaluation:
|
| 21 |
+
eval_batch_size: 6144
|
| 22 |
+
|
| 23 |
+
data:
|
| 24 |
+
seq_length: 60
|
| 25 |
+
train_ratio: 0.8
|
| 26 |
+
val_ratio: 0.1
|
| 27 |
+
|
| 28 |
+
output:
|
| 29 |
+
checkpoints:
|
| 30 |
+
model_dir: ckpts
|
| 31 |
+
scaler_dir: ckpts
|
| 32 |
+
|
| 33 |
+
metrics:
|
| 34 |
+
metrics_dir: evaluation
|
| 35 |
+
|
| 36 |
+
predictions:
|
| 37 |
+
pred_dir: evaluation
|
| 38 |
+
|
| 39 |
+
versioning:
|
| 40 |
+
datetime_format: '%Y-%m-%d-%H-%M-%S'
|
| 41 |
+
timezone: '+07:00'
|
configs/pipeline_config.yml
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
minio:
|
| 2 |
+
bucket_name: minio-ngrok-bucket
|
| 3 |
+
paths:
|
| 4 |
+
temp_parquet_path: temp/temp_parquet_chunks
|
| 5 |
+
output_parquet_path: temp/aggregated_output
|
| 6 |
+
duckdb_path: duckdb_databases/financial_data.db
|
| 7 |
+
output_csv_path: analytics/financial_data.csv
|
| 8 |
+
parquet_folder: temp/extracted_from_minio
|
docs/data_sources.md
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Data Source: Binance BTCUSDT 1s Kline
|
| 2 |
+
|
| 3 |
+
- **Source:** [Binance Data Public](https://data.binance.vision/?prefix=data/spot/monthly/klines/BTCUSDT/1s/)
|
| 4 |
+
- **Example file:** `BTCUSDT-1s-2024-05.csv.gz`
|
| 5 |
+
- **Columns:**
|
| 6 |
+
1. Open time
|
| 7 |
+
2. Open
|
| 8 |
+
3. High
|
| 9 |
+
4. Low
|
| 10 |
+
5. Close
|
| 11 |
+
6. Volume
|
| 12 |
+
7. Close time
|
| 13 |
+
8. Quote asset volume
|
| 14 |
+
9. Number of trades
|
| 15 |
+
10. Taker buy base asset volume
|
| 16 |
+
11. Taker buy quote asset volume
|
| 17 |
+
12. Ignore
|
| 18 |
+
|
| 19 |
+
## Usage
|
| 20 |
+
|
| 21 |
+
- The pipeline downloads and processes this data for deep learning time series forecasting.
|
| 22 |
+
- The DAG can be modified to use a different month by changing the filename in the download step.
|
| 23 |
+
|
| 24 |
+
## Acknowledgment
|
| 25 |
+
You can refer to https://github.com/binance/binance-public-data?tab=readme-ov-file#klines to get more information.
|
| 26 |
+
|
docs/dependencies.md
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Dependencies
|
| 2 |
+
|
| 3 |
+
## System
|
| 4 |
+
|
| 5 |
+
- Hadoop/HDFS
|
| 6 |
+
- Spark
|
| 7 |
+
- Airflow
|
| 8 |
+
- Python 3.8+
|
| 9 |
+
|
| 10 |
+
## Python
|
| 11 |
+
|
| 12 |
+
Install with pip:
|
| 13 |
+
|
| 14 |
+
```bash
|
| 15 |
+
pip install pandas numpy scikit-learn tensorflow
|
| 16 |
+
```
|
| 17 |
+
|
| 18 |
+
## Notes
|
| 19 |
+
|
| 20 |
+
- Ensure Java is installed for Hadoop/Spark.
|
| 21 |
+
- Airflow and Hadoop should be configured and running before triggering the DAG.
|
| 22 |
+
- If using a dev container, dependencies may already be installed.
|
docs/frameworks_installation.md
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
## Spark
|
| 2 |
+
### Install Java 8 (required for Spark)
|
| 3 |
+
!apt-get update -qq
|
| 4 |
+
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
|
| 5 |
+
|
| 6 |
+
### Download and extract Spark (use the latest version; this is 3.5.6 with Hadoop 3)
|
| 7 |
+
!wget -q https://downloads.apache.org/spark/spark-3.5.6/spark-3.5.6-bin-hadoop3.tgz
|
| 8 |
+
!tar xf spark-3.5.6-bin-hadoop3.tgz
|
| 9 |
+
|
| 10 |
+
### Install PySpark and findspark (helps locate Spark)
|
| 11 |
+
!pip install -q pyspark findspark duckdb # duckdb for your script
|
| 12 |
+
|
| 13 |
+
### Set environment variables
|
| 14 |
+
import os
|
| 15 |
+
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
|
| 16 |
+
os.environ["SPARK_HOME"] = "/content/spark-3.5.6-bin-hadoop3"
|
| 17 |
+
|
| 18 |
+
### Initialize findspark
|
| 19 |
+
import findspark
|
| 20 |
+
findspark.init()
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
## Hadoop
|
| 24 |
+
|
| 25 |
+
!wget https://downloads.apache.org/hadoop/common/hadoop-3.4.2/hadoop-3.4.2.tar.gz
|
| 26 |
+
!tar -xzvf hadoop-3.4.2.tar.gz && cp -r hadoop-3.4.2/ /usr/local/
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
JAVA_HOME = !readlink -f /usr/bin/java | sed "s:bin/java::"
|
| 30 |
+
java_home_text = JAVA_HOME[0]
|
| 31 |
+
java_home_text_command = f"$ {JAVA_HOME[0]} "
|
| 32 |
+
!echo export JAVA_HOME=$java_home_text >>/usr/local/hadoop-3.4.2/etc/hadoop/hadoop-env.sh
|
| 33 |
+
|
| 34 |
+
# Set environment variables
|
| 35 |
+
import os
|
| 36 |
+
os.environ['HADOOP_HOME']="/usr/local/hadoop-3.4.2"
|
| 37 |
+
os.environ['JAVA_HOME']=java_home_text
|
| 38 |
+
|
| 39 |
+
!alias hadoop="/usr/local/hadoop-3.4.2/bin/hadoop"
|
| 40 |
+
!alias hdfs="/usr/local/hadoop-3.4.2/bin/hdfs"
|
| 41 |
+
!source ~/.bashrc # or source ~/.zshrc
|
| 42 |
+
!sudo ln -s /usr/local/hadoop-3.4.2/bin/hadoop /usr/local/bin/hadoop
|
| 43 |
+
!sudo ln -s /usr/local/hadoop-3.4.2/bin/hdfs /usr/local/bin/hdfs
|
| 44 |
+
!hadoop
|
| 45 |
+
!hdfs
|
| 46 |
+
## Airflow
|
| 47 |
+
|
| 48 |
+
pip install apache-airflow
|
| 49 |
+
|
| 50 |
+
airflow db init
|
| 51 |
+
|
| 52 |
+
airflow webserver -p 8080 &
|
| 53 |
+
airflow scheduler &
|
| 54 |
+
|
| 55 |
+
## Ngrok
|
| 56 |
+
|
| 57 |
+
## MinIO
|
| 58 |
+
### Client
|
| 59 |
+
```bash
|
| 60 |
+
pip install minio
|
| 61 |
+
```
|
| 62 |
+
### Server
|
| 63 |
+
# Install MinIO binary
|
| 64 |
+
!wget https://dl.min.io/server/minio/release/linux-amd64/minio
|
| 65 |
+
!chmod +x minio
|
| 66 |
+
!mkdir -p ~/minio-data
|
| 67 |
+
|
| 68 |
+
import os
|
| 69 |
+
os.environ['MINIO_ROOT_USER'] = 'username'
|
| 70 |
+
os.environ['MINIO_ROOT_PASSWORD'] = 'username_password'
|
| 71 |
+
|
| 72 |
+
!./minio server ~/minio-data --address ":12390" --console-address ":12391" &
|
docs/install_airflow.md
ADDED
|
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Installing and Setting Up Apache Airflow
|
| 2 |
+
|
| 3 |
+
This guide provides detailed instructions for installing and configuring Apache Airflow with support for asynchronous tasks, Celery, PostgreSQL, and Kubernetes. The steps below ensure a proper setup for running Airflow, initializing its database, creating an admin user, and starting the scheduler and webserver. This setup is suitable for a local development environment or a scalable production setup with the specified backends.
|
| 4 |
+
|
| 5 |
+
## Prerequisites
|
| 6 |
+
Before proceeding, ensure you have the following:
|
| 7 |
+
- **Python 3.12**: Airflow 2.10.3 is compatible with Python 3.12, as specified in the constraint file.
|
| 8 |
+
- **pip**: The Python package manager to install Airflow and its dependencies.
|
| 9 |
+
- **PostgreSQL**: If using PostgreSQL as the metadata database (recommended for production).
|
| 10 |
+
- **Celery**: For distributed task execution (optional, included in the installation).
|
| 11 |
+
- **Kubernetes**: For running Airflow in a Kubernetes cluster (optional, included in the installation).
|
| 12 |
+
- **Sufficient permissions**: To create directories and run background processes.
|
| 13 |
+
- **Virtual environment** (recommended): To isolate dependencies. Create one with:
|
| 14 |
+
```bash
|
| 15 |
+
python -m venv venv
|
| 16 |
+
source venv/bin/activate # On Windows: venv\Scripts\activate
|
| 17 |
+
```
|
| 18 |
+
|
| 19 |
+
## Installation Steps
|
| 20 |
+
|
| 21 |
+
### 1. Install Apache Airflow
|
| 22 |
+
Install Airflow for version 2.10.3.
|
| 23 |
+
|
| 24 |
+
```bash
|
| 25 |
+
pip install apache-airflow==2.10.3
|
| 26 |
+
```
|
| 27 |
+
|
| 28 |
+
### 2. Set Up the Airflow Home Directory
|
| 29 |
+
Airflow requires a home directory to store its configuration, logs, and DAGs. The following Python script sets the `AIRFLOW_HOME` environment variable and creates the directory if it doesn't exist.
|
| 30 |
+
|
| 31 |
+
```python
|
| 32 |
+
import os
|
| 33 |
+
import time
|
| 34 |
+
|
| 35 |
+
# Ensure environment
|
| 36 |
+
os.environ['AIRFLOW_HOME'] = '<your_project_path>/airflow'
|
| 37 |
+
os.makedirs(os.environ['AIRFLOW_HOME'], exist_ok=True)
|
| 38 |
+
```
|
| 39 |
+
|
| 40 |
+
Replace `<your_project_path>` with the absolute path to your project directory (e.g., `/home/user/BTC-USDT-ETL-Pipeline`). For example:
|
| 41 |
+
```python
|
| 42 |
+
os.environ['AIRFLOW_HOME'] = '/home/user/BTC-USDT-ETL-Pipeline/airflow'
|
| 43 |
+
```
|
| 44 |
+
|
| 45 |
+
This script ensures the `airflow` directory is created in your project path to store Airflow's configuration files, logs, and SQLite database (if not using PostgreSQL).
|
| 46 |
+
|
| 47 |
+
### 3. Initialize the Airflow Database
|
| 48 |
+
Initialize the Airflow metadata database, which stores DAG runs, task instances, and other metadata. This step is required before starting the scheduler or webserver.
|
| 49 |
+
|
| 50 |
+
```bash
|
| 51 |
+
# Initialize (or migrate) the metadata database; existing DAG files are untouched
|
| 52 |
+
airflow db init
|
| 53 |
+
```
|
| 54 |
+
|
| 55 |
+
**Note**:
|
| 56 |
+
- This command creates a default `airflow.cfg` configuration file in `AIRFLOW_HOME`.
|
| 57 |
+
- If using PostgreSQL, ensure the database is running and update the `sql_alchemy_conn` in `airflow.cfg` to point to your PostgreSQL instance (e.g., `postgresql+psycopg2://user:password@localhost:5432/airflow`).
|
| 58 |
+
- Running `airflow db init` initializes the metadata database (or migrates an existing one); it does not delete any DAG files in the `dags` folder.
|
| 59 |
+
|
| 60 |
+
### 4. Create an Admin User
|
| 61 |
+
The Airflow webserver requires at least one admin user for login. Create an admin user with the following command:
|
| 62 |
+
|
| 63 |
+
```bash
|
| 64 |
+
# Create admin user (critical—webserver needs this for login)
|
| 65 |
+
airflow users create \
|
| 66 |
+
--username admin \
|
| 67 |
+
--firstname Admin \
|
| 68 |
+
--lastname User \
|
| 69 |
+
--role Admin \
|
| 70 |
+
--email admin@example.com \
|
| 71 |
+
--password admin
|
| 72 |
+
```
|
| 73 |
+
|
| 74 |
+
This command creates a user with:
|
| 75 |
+
- Username: `admin`
|
| 76 |
+
- Password: `admin` (change this in production for security)
|
| 77 |
+
- Role: `Admin` (grants full access to the Airflow UI)
|
| 78 |
+
|
| 79 |
+
To verify the user was created successfully, list all users:
|
| 80 |
+
|
| 81 |
+
```bash
|
| 82 |
+
# Verify user creation
|
| 83 |
+
airflow users list
|
| 84 |
+
```
|
| 85 |
+
|
| 86 |
+
### 5. Start the Airflow Scheduler
|
| 87 |
+
The scheduler is responsible for scheduling and executing DAGs. Start it in the background using `nohup` to ensure it continues running.
|
| 88 |
+
|
| 89 |
+
```bash
|
| 90 |
+
# Start scheduler first (it needs DB)
|
| 91 |
+
nohup airflow scheduler > airflow/scheduler.log 2>&1 &
|
| 92 |
+
```
|
| 93 |
+
|
| 94 |
+
**Notes**:
|
| 95 |
+
- The scheduler requires the database to be initialized first.
|
| 96 |
+
- Logs are redirected to `scheduler.log` in the specified directory.
|
| 97 |
+
- Replace `airflow` with your `AIRFLOW_HOME` path if different.
|
| 98 |
+
|
| 99 |
+
### 6. Start the Airflow Webserver
|
| 100 |
+
The webserver provides the Airflow UI for managing DAGs, viewing task logs, and monitoring runs. Start it on port 8081 (or another port if needed).
|
| 101 |
+
|
| 102 |
+
```bash
|
| 103 |
+
airflow webserver --port 8081 > airflow/airflow.log 2>&1 &
|
| 104 |
+
```
|
| 105 |
+
|
| 106 |
+
**Notes**:
|
| 107 |
+
- The webserver runs on `http://localhost:8081` by default.
|
| 108 |
+
- Logs are redirected to `airflow.log` in the `AIRFLOW_HOME` directory.
|
| 109 |
+
- Access the UI by navigating to `http://localhost:8081` in your browser and logging in with the admin credentials (username: `admin`, password: `admin`).
|
| 110 |
+
|
| 111 |
+
## Additional Notes
|
| 112 |
+
- **Configuration**: After running `airflow db init`, review and modify `airflow.cfg` in the `AIRFLOW_HOME` directory to customize settings (e.g., executor type, database connection, or Celery broker).
|
| 113 |
+
- **Celery Setup**: If using the Celery executor, ensure a message broker (e.g., Redis or RabbitMQ) is running and configured in `airflow.cfg`.
|
| 114 |
+
- **Kubernetes Executor**: For Kubernetes, configure the Kubernetes executor in `airflow.cfg` and ensure your Kubernetes cluster is accessible.
|
| 115 |
+
- **Security**: Change the default admin password and secure the database connection in production environments.
|
| 116 |
+
- **Logs**: Check `scheduler.log` and `airflow.log` for troubleshooting.
|
| 117 |
+
|
| 118 |
+
## Next Steps
|
| 119 |
+
- Place your DAGs in the `AIRFLOW_HOME/dags` folder to start defining workflows.
|
| 120 |
+
- Explore the Airflow UI to monitor and manage your DAGs.
|
| 121 |
+
- Refer to the [Apache Airflow documentation](https://airflow.apache.org/docs/apache-airflow/stable/) for advanced configurations.
|
docs/install_minio_server.md
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# MinIO Server Setup Guide
|
| 2 |
+
|
| 3 |
+
This guide provides step-by-step instructions to set up and run a MinIO server on a Linux system.
|
| 4 |
+
|
| 5 |
+
## Prerequisites
|
| 6 |
+
- Python 3.x installed
|
| 7 |
+
- Required Python packages: `python-dotenv`, `wget` (install using `pip install python-dotenv wget`)
|
| 8 |
+
- A `minio.env` environment file
|
| 9 |
+
- Administrative privileges for file permissions and port usage
|
| 10 |
+
- Free ports for MinIO API (default: 9000) and WebUI (default: 9001)
|
| 11 |
+
|
| 12 |
+
## Setup Instructions
|
| 13 |
+
|
| 14 |
+
### 1. Download and Prepare MinIO Binary
|
| 15 |
+
Run the following commands to download the MinIO server binary and set up the data directory:
|
| 16 |
+
|
| 17 |
+
```bash
|
| 18 |
+
wget https://dl.min.io/server/minio/release/linux-amd64/minio
|
| 19 |
+
chmod +x minio
|
| 20 |
+
mkdir -p ~/minio-data
|
| 21 |
+
mkdir -p ~/minio-logs
|
| 22 |
+
```
|
| 23 |
+
|
| 24 |
+
### 2. Configure Environment Variables
|
| 25 |
+
Create a `minio.env` file in the same directory as the MinIO binary with the following content, replacing placeholders with your desired values:
|
| 26 |
+
|
| 27 |
+
```
|
| 28 |
+
MINIO_ROOT_USER=<your_username>
|
| 29 |
+
MINIO_ROOT_PASSWORD=<your_password>
|
| 30 |
+
MINIO_HOST=localhost:<minio_port>
|
| 31 |
+
MINIO_CONSOLE_ADDRESS=localhost:<minio_web_port>
|
| 32 |
+
```
|
| 33 |
+
|
| 34 |
+
- `<your_username>`: Choose a secure username for the MinIO admin account.
|
| 35 |
+
- `<your_password>`: Use a strong password (at least 8 characters).
|
| 36 |
+
- Ensure ports `<minio_port>` (API) and `<minio_web_port>` (WebUI) are free, or update them to available ports.
|
| 37 |
+
|
| 38 |
+
### 3. Start the MinIO Server
|
| 39 |
+
Run the following command to start the MinIO server in the background, using the environment variables from `minio.env`:
|
| 40 |
+
|
| 41 |
+
```bash
|
| 42 |
+
export MINIO_ROOT_USER=<your_username>
|
| 43 |
+
export MINIO_ROOT_PASSWORD=<your_password>
|
| 44 |
+
MINIO_ROOT_USER=$MINIO_ROOT_USER MINIO_ROOT_PASSWORD=$MINIO_ROOT_PASSWORD \
|
| 45 |
+
./minio server ~/minio-data --address :<minio_port> --console-address :<minio_web_port> > ~/minio-logs/minio_server.log 2>&1 &
|
| 46 |
+
```
|
| 47 |
+
|
| 48 |
+
- This command exports environment variables and starts the MinIO server.
|
| 49 |
+
- Logs are saved to `~/minio-logs/minio_server.log` for troubleshooting.
|
| 50 |
+
|
| 51 |
+
### 4. Access MinIO
|
| 52 |
+
- **API Access**: Connect to `http://localhost:<minio_port>` for programmatic access.
|
| 53 |
+
- **WebUI Access**: Open `http://localhost:<minio_web_port>` in a browser and log in with `<your_username>` and `<your_password>`.
|
| 54 |
+
|
| 55 |
+
## Notes
|
| 56 |
+
- **Stopping the Server**: To stop the MinIO server, find its process ID using `ps aux | grep minio` and terminate it with `kill <pid>`.
|
| 57 |
+
- **Port Conflicts**: If ports `<minio_port>` or `<minio_web_port>` are in use, modify `MINIO_HOST` and `MINIO_CONSOLE_ADDRESS` in `minio.env` to use different ports.
|
| 58 |
+
- **Security**: Store `minio.env` securely and avoid exposing sensitive credentials.
|
| 59 |
+
- **Data Directory**: The `~/minio-data` directory stores MinIO buckets and objects. Ensure it has sufficient disk space.
|
docs/install_spark.md
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Install Apache Spark
|
| 2 |
+
|
| 3 |
+
This guide provides step-by-step instructions to download, install, and configure Apache Spark 3.5.6 with Hadoop 3 support on a Linux-based system. Apache Spark is a powerful open-source data processing engine designed for big data and machine learning workloads. The following commands will help you set up Spark and configure the environment variables to run Spark applications, including PySpark with Python 3.
|
| 4 |
+
|
| 5 |
+
## Prerequisites
|
| 6 |
+
- A Linux-based operating system (e.g., Ubuntu, CentOS).
|
| 7 |
+
- `wget` and `tar` utilities installed.
|
| 8 |
+
- `sudo` privileges for moving files to system directories.
|
| 9 |
+
- Python 3 installed (for PySpark).
|
| 10 |
+
|
| 11 |
+
## Installation Steps
|
| 12 |
+
|
| 13 |
+
```bash
|
| 14 |
+
# Download Apache Spark 3.5.6 with Hadoop 3 support
|
| 15 |
+
wget https://downloads.apache.org/spark/spark-3.5.6/spark-3.5.6-bin-hadoop3.tgz
|
| 16 |
+
|
| 17 |
+
# Extract the downloaded tarball
|
| 18 |
+
tar -xzf spark-3.5.6-bin-hadoop3.tgz
|
| 19 |
+
|
| 20 |
+
# Move the extracted folder to /opt/spark
|
| 21 |
+
sudo mv spark-3.5.6-bin-hadoop3 /opt/spark
|
| 22 |
+
|
| 23 |
+
# Set environment variables for Spark
|
| 24 |
+
export SPARK_HOME=/opt/spark
|
| 25 |
+
export PATH=$SPARK_HOME/bin:$SPARK_HOME/sbin:$PATH
|
| 26 |
+
export PYSPARK_PYTHON=python3
|
| 27 |
+
```
|
docs/visualize_data.md
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Analytics & Visualization
|
| 2 |
+
|
| 3 |
+
## DuckDB
|
| 4 |
+
|
| 5 |
+
Run ad-hoc queries using Python to export the data to `output_csv_path`:
|
| 6 |
+
|
| 7 |
+
```bash
|
| 8 |
+
python components/duckdb2csv.py
|
| 9 |
+
```
|
| 10 |
+
|
| 11 |
+
## Looker Studio
|
| 12 |
+
|
| 13 |
+
Upload `analytics/financial_data.csv` to Google Looker Studio and create a report.
|
| 14 |
+
|
| 15 |
+
## Looker
|
| 16 |
+
You can see report at: https://lookerstudio.google.com/reporting/d12e8138-ffdb-40ac-a7fd-9fa986464f54/page/YtFbF/edit
|
duckdb_databases/financial_data.db
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:20b333763c7dd6bdddd4eb0e88b7eb427faa464a213bc024fcd6cb5001e92bdb
|
| 3 |
+
size 536576
|
evaluation/.gitignore
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
*.csv
|
logs/.gitkeep
ADDED
|
File without changes
|