heboya8 commited on
Commit
2eee82e
·
verified ·
1 Parent(s): 787525e

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +9 -0
  2. .gitignore +208 -0
  3. .python_version +1 -0
  4. CHANGELOG.md +789 -0
  5. LICENSE +201 -0
  6. README.md +108 -0
  7. airflow/airflow-webserver.pid +1 -0
  8. airflow/airflow.cfg +2498 -0
  9. airflow/airflow.db +3 -0
  10. airflow/dags/new6.py +147 -0
  11. airflow/webserver_config.py +132 -0
  12. analytics/BTCUSDT_report.pdf +0 -0
  13. ckpts/.gitignore +2 -0
  14. ckpts/model_2025-10-28-11-33-51-(+07).h5 +3 -0
  15. ckpts/scaler_2025-10-28-11-33-51-(+07).pkl +3 -0
  16. components/__init__.py +0 -0
  17. components/btcusdt_ingest_data.py +157 -0
  18. components/datalake_cr.py +44 -0
  19. components/delete_lstm_predict.py +56 -0
  20. components/delete_lstm_train.py +42 -0
  21. components/delete_model.py +10 -0
  22. components/duckdb2csv.py +22 -0
  23. components/duckdb_api.py +68 -0
  24. components/model/__init__.py +0 -0
  25. components/model/data_utils.py +90 -0
  26. components/model/evaluation.py +239 -0
  27. components/model/model_utils.py +294 -0
  28. components/model/old_model_utils.py +189 -0
  29. components/model/training.py +194 -0
  30. components/old-process_data.py +111 -0
  31. components/process_data.py +150 -0
  32. components/utils/__init__.py +0 -0
  33. components/utils/file_utils.py +105 -0
  34. components/utils/utils.py +26 -0
  35. configs/data_limit.yml +4 -0
  36. configs/data_sources.yml +2 -0
  37. configs/delete_lstm_hyperparams.yml +17 -0
  38. configs/extract_data.yml +4 -0
  39. configs/model_config.yml +41 -0
  40. configs/pipeline_config.yml +8 -0
  41. docs/data_sources.md +26 -0
  42. docs/dependencies.md +22 -0
  43. docs/frameworks_installation.md +72 -0
  44. docs/install_airflow.md +121 -0
  45. docs/install_minio_server.md +59 -0
  46. docs/install_spark.md +27 -0
  47. docs/visualize_data.md +16 -0
  48. duckdb_databases/financial_data.db +3 -0
  49. evaluation/.gitignore +1 -0
  50. logs/.gitkeep +0 -0
.gitattributes CHANGED
@@ -33,3 +33,12 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ airflow/airflow.db filter=lfs diff=lfs merge=lfs -text
37
+ duckdb_databases/financial_data.db filter=lfs diff=lfs merge=lfs -text
38
+ minio filter=lfs diff=lfs merge=lfs -text
39
+ temp/BTCUSDT-1s-2025-08.csv filter=lfs diff=lfs merge=lfs -text
40
+ temp/BTCUSDT-1s-2025-09.csv filter=lfs diff=lfs merge=lfs -text
41
+ temp/temp_parquet_chunks/.part-00000-5d1f072f-086f-4a7a-8c65-5cbc6839e5b5-c000.snappy.parquet.crc filter=lfs diff=lfs merge=lfs -text
42
+ temp/temp_parquet_chunks/.part-00002-5d1f072f-086f-4a7a-8c65-5cbc6839e5b5-c000.snappy.parquet.crc filter=lfs diff=lfs merge=lfs -text
43
+ temp/temp_parquet_chunks/.part-00003-5d1f072f-086f-4a7a-8c65-5cbc6839e5b5-c000.snappy.parquet.crc filter=lfs diff=lfs merge=lfs -text
44
+ zrok filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[codz]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py.cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # UV
98
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ #uv.lock
102
+
103
+ # poetry
104
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
106
+ # commonly ignored for libraries.
107
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108
+ #poetry.lock
109
+ #poetry.toml
110
+
111
+ # pdm
112
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
113
+ # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
114
+ # https://pdm-project.org/en/latest/usage/project/#working-with-version-control
115
+ #pdm.lock
116
+ #pdm.toml
117
+ .pdm-python
118
+ .pdm-build/
119
+
120
+ # pixi
121
+ # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
122
+ #pixi.lock
123
+ # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
124
+ # in the .venv directory. It is recommended not to include this directory in version control.
125
+ .pixi
126
+
127
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
128
+ __pypackages__/
129
+
130
+ # Celery stuff
131
+ celerybeat-schedule
132
+ celerybeat.pid
133
+
134
+ # SageMath parsed files
135
+ *.sage.py
136
+
137
+ # Environments
138
+ .env
139
+ *.env
140
+ .envrc
141
+ .venv
142
+ env/
143
+ venv/
144
+ ENV/
145
+ env.bak/
146
+ venv.bak/
147
+
148
+ # Spyder project settings
149
+ .spyderproject
150
+ .spyproject
151
+
152
+ # Rope project settings
153
+ .ropeproject
154
+
155
+ # mkdocs documentation
156
+ /site
157
+
158
+ # mypy
159
+ .mypy_cache/
160
+ .dmypy.json
161
+ dmypy.json
162
+
163
+ # Pyre type checker
164
+ .pyre/
165
+
166
+ # pytype static type analyzer
167
+ .pytype/
168
+
169
+ # Cython debug symbols
170
+ cython_debug/
171
+
172
+ # PyCharm
173
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
174
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
175
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
176
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
177
+ #.idea/
178
+
179
+ # Abstra
180
+ # Abstra is an AI-powered process automation framework.
181
+ # Ignore directories containing user credentials, local state, and settings.
182
+ # Learn more at https://abstra.io/docs
183
+ .abstra/
184
+
185
+ # Visual Studio Code
186
+ # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
187
+ # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
188
+ # and can be added to the global gitignore or merged into this file. However, if you prefer,
189
+ # you could uncomment the following to ignore the entire vscode folder
190
+ # .vscode/
191
+
192
+ # Ruff stuff:
193
+ .ruff_cache/
194
+
195
+ # PyPI configuration file
196
+ .pypirc
197
+
198
+ # Cursor
199
+ # Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
200
+ # exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
201
+ # refer to https://docs.cursor.com/context/ignore-files
202
+ .cursorignore
203
+ .cursorindexingignore
204
+
205
+ # Marimo
206
+ marimo/_static/
207
+ marimo/_lsp/
208
+ __marimo__/
.python_version ADDED
@@ -0,0 +1 @@
 
 
1
+ 3.12
CHANGELOG.md ADDED
@@ -0,0 +1,789 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # CHANGELOG
2
+
3
+ ## v1.1.9
4
+
5
+ CHANGE: The `publicProxy` now supports "striped session cookies" to support larger authentication payloads when working with OIDC providers that use larger tokens/payloads. (https://github.com/openziti/zrok/issues/1101)
6
+
7
+ FIX: Fix for icon/favicon in HTML for the api console. (https://github.com/openziti/zrok/pull/1094)
8
+
9
+ ## v1.1.8
10
+
11
+ CHANGE: The `ContextDialer` in `agent.Controller.NewClient` now utilizes the "two-phase" approach to invoking `Dial` on the remote agent service. First, it attempts to dial the service with the current service list state. If that fails, it will call `RefreshService` to update the service list and does a second `Dial`. If the second `Dial` fails, then the connection fails. This works around service list staleness issues. (https://github.com/openziti/zrok/issues/1090)
12
+
13
+ CHANGE: `github.com/openziti/sdk-golang` updated to `v1.2.8`.
14
+
15
+ CHANGE: CI pull requests now trigger a native-architecture Windows build.
16
+
17
+ ## v1.1.7
18
+
19
+ FIX: Missing import for windows-specific build.
20
+
21
+ ## v1.1.6
22
+
23
+ FEATURE: The `agent.Agent` now can optionally enforce that agent remoting starts successfully when creating a new instance. The `agent.Config` struct has a new `RequireRemoting` boolean to control this behavior. (https://github.com/openziti/zrok/issues/1085)
24
+
25
+ CHANGE: Additional diagnostic logging in the zrok Agent; details around sub-process executions, etc. (https://github.com/openziti/zrok/issues/1084)
26
+
27
+ ## v1.1.5
28
+
29
+ CHANGE: Upgraded go toolchain to `v1.24.6`. (https://github.com/openziti/zrok/issues/1072)
30
+
31
+ ## v1.1.4
32
+
33
+ CHANGE: Update `github.com/caddyserver/caddy/v2` to `v2.9.1`; fixes CVE-2024-53259 (would only potentially affect users using the QUIC protocol, very atypical) (https://github.com/openziti/zrok/issues/1047)
34
+
35
+ ## v1.1.3
36
+
37
+ FEATURE: A new `compatibility` > `version_patterns` array is included in the controller configuration, allowing for dynamic adjustment of allowed client version strings (https://github.com/openziti/zrok/issues/1030)
38
+
39
+ FEATURE: A new `compatibility` > `log_version` boolean is included in the controller configuration. When this boolean is set to `true`, the controller will log all client versions provided for compatibility checking.
40
+
41
+ CHANGE: Update `github.com/openziti/sdk-golang` to `v1.2.3`
42
+
43
+ CHANGE: Minor vulnerability packages updated in `ui` and `agent/agentUi`
44
+
45
+ FIX: The `scope` field of the metrics returned from `/metrics/environment/...` is now properly set as `environment` and the from `/metrics/share/...` is now properly set as `share` (https://github.com/openziti/zrok/issues/1031)
46
+
47
+ ## v1.1.2
48
+
49
+ FIX: A panic happened in the `publicProxy` implementation when no `oauth` config block is present (https://github.com/openziti/zrok/issues/1032)
50
+
51
+ ## v1.1.1
52
+
53
+ FIX: Masquerade as `v1.0-v1.1.1 [gitHash]` when performing client version checks. Will be replaced with the usual client identifier in `v1.1.2` when the regular expressions for controlling client compatibility are externalized in the controller config (https://github.com/openziti/zrok/issues/1028)
54
+
55
+ ## v1.1.0
56
+
57
+ FEATURE: Rewritten and improved `publicProxy` package (`zrok access public`), with support for extensible OAuth-compliant identity providers. The `publicProxy` configuration now supports any number of configured OAuth-compliant providers (rather than just a single `google` provider and/or a single `github` provider). Also includes a new OIDC-compliant generic IDP provider integration. Improvements to authentication flows and security all around. See the [updated guide](https://docs.zrok.io/docs/guides/self-hosting/oauth/configuring-oauth/) on using OAuth-based identity providers with the zrok public frontend (https://github.com/openziti/zrok/issues/968)
58
+
59
+ FEATURE: Templatized and improved static pages (not found/404, unauthorized/401, health check, etc.) used by the public frontend. Consolidated variable data using golang `text/template` so that static `proxyUi` package can display additional error information and provide extension points for replacing all of the templated content with external files. See the [error pages guide](https://docs.zrok.io/docs/guides/self-hosting/error-pages/) for more information on customizing the built-in template (https://github.com/openziti/zrok/issues/1012)
60
+
61
+ FEATURE: `zrok access private` now includes a `--template-path` allowing the embedded `proxyUi` template to be replaced with an external HTML file (https://github.com/openziti/zrok/issues/1012)
62
+
63
+ FIX: Invoking `/agent/*` endpoints to remotely manage agents with remoting was causing a new API session to be allocated in the ziti controller for each request. A slightly different strategy was employed for embedding the ziti SDK into the zrok controller that should mitigate this (https://github.com/openziti/zrok/issues/1023)
64
+
65
+ ## v1.0.8
66
+
67
+ FEATURE: New opt-in configuration item `superNetwork` which enables multiple data plane connections to the OpenZiti underlay, a separate control plane connection, enabling SDK-based flow control. To opt-in use `zrok config set superNetwork true` in each environment, or set the `ZROK_SUPER_NETWORK` environment variable to `true` (https://github.com/openziti/zrok/issues/1010)
68
+
69
+ CHANGE: Updated `github.com/openziti/sdk-golang` to `v1.2.1` (https://github.com/openziti/zrok/issues/1010)
70
+
71
+ ## v1.0.7
72
+
73
+ FEATURE: zrok Agent now supports health checks (against the target endpoint) for `proxy` backend shares using the `zrok agent share http-healthcheck` command. The zrok API now includes an `/agent/share/http-healthcheck` endpoint for remotely performing these checks against remoted Agents. See the guide for using the feature at https://docs.zrok.io/guides/agent/http-healthcheck/ (https://github.com/openziti/zrok/issues/1002)
74
+
75
+ FEATURE: `/overview`, `/detail/share`, `/detail/environment`, and `/overview/{organizationToken}/{accountEmail}` all adjusted to include `envZId` in share detail output (https://github.com/openziti/zrok/issues/998)
76
+
77
+ FEATURE: New add and delete API endpoints for frontend grants. New `zrok admin create frontend-grant` and `zrok admin delete frontend-grant` CLI for invoking these API endpoints from the command line (https://github.com/openziti/zrok/issues/992)
78
+
79
+ FEATURE: New admin endpoint for deleting accounts. New `zrok admin delete account` CLI for invoking the API endpoint from the command line (https://github.com/openziti/zrok/issues/993)
80
+
81
+ FEATURE: New admin endpoint for deleting identities. New `zrok admin delete identity` CLI for invoking the API endpoint from the command line (https://github.com/openziti/zrok/issues/800)
82
+
83
+ FEATURE: New API endpoint (`/overview/public-frontends`) that returns the public frontends available to the authenticated account. The public frontends include those marked with the `open` permission mode, and those marked `closed` where the user has a frontend grant allowing them to access the frontend. New CLI command `zrok overview public-frontends` to allow end users to list the public frontends their account can use (https://github.com/openziti/zrok/issues/996)
84
+
85
+ CHANGE: Updated `openapi-generator-cli` from `7.12.0` to `7.14.0`
86
+
87
+ ## v1.0.6
88
+
89
+ CHANGE: The `/overview` endpoint has been adjusted to include a new `remoteAgent` `boolean` on the `environment` instances, indicating whether or not the environment has an enrolled remote agent (https://github.com/openziti/zrok/issues/977)
90
+
91
+ CHANGE: Adjusted core framework entry points to support changing zrokdir, and host interrogation functions to better support embedded zrok functionality (https://github.com/openziti/zrok/issues/976)
92
+
93
+ ## v1.0.5
94
+
95
+ FEATURE: Initial support for zrok Agent remoting; new `zrok agent enroll` and `zrok agent unenroll` commands that establish opt-in remote Agent management facilities on a per-environment basis. The central API has been augmented to allow for remote control (creating shares and private access instances) of these agents; see the [remoting guide](https://docs.zrok.io/docs/guides/agent/remoting) for details (https://github.com/openziti/zrok/issues/967)
96
+
97
+ CHANGE: `zrok share public`, `zrok share private`, and `zrok reserve` all default to the "closed" permission mode (they previously defaulted to the "open" permission mode). The `--closed` flag has been replaced with a new `--open` flag. See the [Permission Modes](https://docs.zrok.io/docs/guides/permission-modes/) docs for details (https://github.com/openziti/zrok/issues/971)
98
+
99
+ FIX: `zrok enable` now handles the case where the user ID does not resolve to a username when generating the default environment description (https://github.com/openziti/zrok/issues/959)
100
+
101
+ FIX: Linux packages were optimized to avoid manage file revision conflicts (https://github.com/openziti/zrok/issues/817)
102
+
103
+ ## v1.0.4
104
+
105
+ FIX: `zrok admin bootstrap` and `zrok enable` functionality were broken in `v1.0.3`. A bad combination of dependencies caused issues with marshalling data from the associated controller endpoints
106
+
107
+ CHANGE: `github.com/openziti/sdk-golang` has been updated to `v1.1.0`, `github.com/openziti/ziti` has been updated to `v1.6.0`. Related dependencies and indirects also updated
108
+
109
+ CHANGE: Updated to `golang` `v1.24` as the official build toolchain
110
+
111
+ ## v1.0.3
112
+
113
+ FEATURE: `zrok agent console` now outputs the URL it is attempting to open. New `zrok agent console --headless` option to only emit the agent console URL (https://github.com/openziti/zrok/issues/944)
114
+
115
+ FEATURE: New `zrok admin unbootstrap` to remove zrok resources from the underlying OpenZiti instance (https://github.com/openziti/zrok/issues/935)
116
+
117
+ FEATURE: New InfluxDB metrics capture infrastructure for `zrok test canary` framework (https://github.com/openziti/zrok/issues/948)
118
+
119
+ FEATURE: New `zrok test canary enabler` to validate `enable`/`disable` operations and gather performance metrics around how those paths are operating (https://github.com/openziti/zrok/issues/771)
120
+
121
+ FEATURE: New `zrok test canary` infrastructure capable of supporting more complex testing scenarios; now capable of streaming canary metrics into an InfluxDB repository; new programming framework for developing additional types of streaming canary metrics (https://github.com/openziti/zrok/issues/948 https://github.com/openziti/zrok/issues/954)
122
+
123
+ FEATURE: All `zrok test canary` commands that have "min" and "max" values (`--min-pacing` and `--max-pacing` for example) now include a singular version of that flag for setting both "min" and "max" to the same value (`--pacing` for example). The singular version of the flag always overrides any `--min-*` or `--max-*` values that might be set
124
+
125
+ CHANGE: New _guard_ to prevent users from running potentially dangerous `zrok test canary` commands inadvertently without understanding what they do (https://github.com/openziti/zrok/issues/947)
126
+
127
+ CHANGE: Updated `npm` dependencies for `ui`, `agent/agentUi` and `website`. Updated `github.com/openziti/sdk-golang` to `v0.24.0`
128
+
129
+ ## v1.0.2
130
+
131
+ FEATURE: "Auto-rebase" for enabled environments where the `apiEndpoint` is set to `https://api.zrok.io`. This will automatically migrate existing environments to the new `apiEndpoint` for the `v1.0.x` series (https://github.com/openziti/zrok/issues/936)
132
+
133
+ FEATURE: New `admin/new_account_link` configuration option to allow the insertion of "how do I register for an account?" links into the login form (https://github.com/openziti/zrok/issues/552)
134
+
135
+ CHANGE: The release environment, share, and access modals in the API console now have a better message letting the user know they will still need to clean up their `zrok` processes (https://github.com/openziti/zrok/issues/910)
136
+
137
+ CHANGE: The openziti/zrok Docker image has been updated to use the latest version of the ziti CLI, 1.4.3 (https://github.com/openziti/zrok/pull/917)
138
+
139
+ ## v1.0.1
140
+
141
+ FEATURE: The zrok Agent now persists private accesses and reserved shares between executions. Any `zrok access private` instances or `zrok share reserved` instances created using the agent are now persisted to a registry stored in `${HOME}/.zrok`. When restarting the agent these accesses and reserved shares are re-created from the data in this registry (https://github.com/openziti/zrok/pull/922)
142
+
143
+ FEATURE: zrok-agent Linux package runs the agent as a user service (https://github.com/openziti/zrok/issues/883)
144
+
145
+ CHANGE: Updated the "Getting Started" guide to be slightly more streamlined and reflect the `v1.0` changes (https://github.com/openziti/zrok/issues/877)
146
+
147
+ CHANGE: let the Docker instance set the Caddy HTTPS port (https://github.com/openziti/zrok/pull/920)
148
+
149
+ CHANGE: Add Traefik option for TLS termination in the Docker instance (https://github.com/openziti/zrok/issues/808)
150
+
151
+ ## v1.0.0
152
+
153
+ MAJOR RELEASE: zrok reaches version 1.0.0!
154
+
155
+ FEATURE: Completely redesigned web interface ("API Console"). New implementation provides a dual-mode interface supporting an improved visual network navigator and also a "tabular" view, which provides a more traditional "data" view. New stack built using vite, React, and TypeScript (https://github.com/openziti/zrok/issues/724)
156
+
157
+ FEATURE: New "zrok Agent", a background manager process for your zrok environments, which allows you to easily manage and work with multiple `zrok share` and `zrok access` processes. New `--subordinate` flag added to `zrok share [public|private|reserved]` and `zrok access private` to operate in a mode that allows an Agent to manage shares and accesses (https://github.com/openziti/zrok/issues/463)
158
+
159
+ FEATURE: New "zrok Agent UI" a web-based user interface for the zrok Agent, which allows creating and releasing shares and accesses through a web browser. This is just an initial chunk of the new Agent UI, and is considered a "minimum viable" version of this interface (https://github.com/openziti/zrok/issues/221)
160
+
161
+ FEATURE: `zrok share [public|private|reserved]` and `zrok access private` now auto-detect if the zrok Agent is running in an environment and will automatically service share and access requests through the Agent, rather than in-process if the Agent is running. If the Agent is not running, operation remains as it was in `v0.4.x` and the share or access is handled in-process. New `--force-agent` and `--force-local` flags exist to skip Agent detection and manually select an operating mode (https://github.com/openziti/zrok/issues/751)
162
+
163
+ FEATURE: `zrok access private` supports a new `--auto` mode, which can automatically find an available open address/port to bind the frontend listener on. Also includes `--auto-address`, `--auto-start-port`, and `--auto-end-port` features with sensible defaults. Supported by both the agent and local operating modes (https://github.com/openziti/zrok/issues/780)
164
+
165
+ FEATURE: `zrok rebase` commands (`zrok rebase apiEndpoint` and `zrok rebase accountToken`) allows "rebasing" an enabled environment onto a different API endpoint or a different account token. This is useful for migrating already-enabled environments between endpoints supporting different zrok versions, and is also useful when regenerating an account token (https://github.com/openziti/zrok/issues/869, https://github.com/openziti/zrok/issues/897)
166
+
167
+ FEATURE: `zrok test canary` CLI tree replaces the old `zrok test loop` tree; new `zrok test canary public-proxy` and `zrok test canary private-proxy` provide modernized, updated versions of what the `zrok test loop` commands used to do. This new approach will serve as the foundation for all future zrok testing infrastructure (https://github.com/openziti/zrok/issues/771)
168
+
169
+ FEATURE: New `/api/v1/versions` endpoint to return comprehensive, full stack version information about the deployed service instance. Currently only returns a single `controllerVersion` property (https://github.com/openziti/zrok/issues/881)
170
+
171
+ CHANGE: The default API URL for `v1.0.x` zrok clients is now `https://api-v1.zrok.io` (instead of the older `https://api.zrok.io`). The zrok.io deployment will now be maintaining version-specific DNS for versioned API endpoints.
172
+
173
+ CHANGE: Refactored API implementation. Cleanup, lint removal, additional data elements added, unused data removed (https://github.com/openziti/zrok/issues/834)
174
+
175
+ CHANGE: Deprecated the `passwords` configuration stanza. The zrok controller and API console now use a hard-coded set of (what we believe to be) reasonable assumptions about password quality (https://github.com/openziti/zrok/issues/834)
176
+
177
+ CHANGE: The protocol for determining valid client versions has been changed. Previously a zrok client would do a `GET` against the `/api/v1/version` endpoint and do a local version string comparison (as a normal precondition to any API call) to see if the controller version matched. The protocol has been amended so that any out-of-date client using the old protocol will receive a version string indicating that they need to upgrade their client. New clients will do a `POST` against the `/api/v1/clientVersionCheck` endpoint, posting their client version, and the server will check for compatibility. Does not change the security posture in any significant way, but gives more flexibility on the server side for managing client compatibility. Provides a better, clearer out-of-date error message for old clients when accessing `v1.0.0`+ (https://github.com/openziti/zrok/issues/859)
178
+
179
+ CHANGE: The Node.js SDK is now generated by `openapi-generator` using the `typescript-fetch` template. Examples and SDK components updated to use the `v1.0.0` API and generated client (https://github.com/openziti/zrok/issues/893)
180
+
181
+ CHANGE: The Python SDK is now generated by `openapi-generator` and requires a newer `urllib3` version 2.1.0. The published Python module, `zrok`, inherits the dependencies of the generated packages (https://github.com/openziti/zrok/issues/894)
182
+
183
+ ## v0.4.49
184
+
185
+ FIX: Release artifacts now include a reproducible source archive. The archive's download URL is now used by the Homebrew formula when building from source instead of the archive generated on-demand by GitHub (https://github.com/openziti/zrok/issues/858).
186
+
187
+ FIX: Pre-releases are no longer uploaded to the stable Linux package repo, and workflows that promote stable release artifacts to downstream distribution channels enforce semver stable release tags, i.e., not having a semver hyphenated prerelease suffix.
188
+
189
+ CHANGE: The release `checksums.txt` has been renamed `checksums.sha256.txt` to reflect the use of a collision-resistant algorithm instead of `shasum`'s default algorithm, SHA-1.
190
+
191
+ CHANGE: The dependency graph is now published as a release artifact named `sbom-{version}.spdx.json` (https://github.com/openziti/zrok/issues/888).
192
+
193
+ CHANGE: Pre-releases are uploaded to the pre-release Linux package repo and Docker Hub for testing. [RELEASING.md](./RELEASING.md) describes releaser steps and the events they trigger.
194
+
195
+ CHANGE: Linux release binaries are now built on the ziti-builder container image based on Ubuntu Focal 20.04 to preserve backward compatibility as the ubuntu-20.04 GitHub runner is end of life.
196
+
197
+ CHANGE: Container images now include SLSA and SBOM attestations, and these are also published to the Docker Hub registry (https://github.com/openziti/zrok/issues/890).
198
+
199
+ CHANGE: Release binary and text artifacts are now accompanied by provenance attestations (https://github.com/openziti/zrok/issues/889).
200
+
201
+ ## v0.4.48
202
+
203
+ FEATURE: The controller configuration now supports a `disable_auto_migration` boolean in the `store` stanza. When set to `true`, the controller will not attempt to auto-migrate (or otherwise validate the migration state) of the underlying database. Leaving `disable_auto_migration` out, or setting it to false will retain the default behavior of auto-migrating when starting the zrok controller. The `zrok admin migrate` command will still perform a migration regardless of how this setting is configured in the controller configuration (https://github.com/openziti/zrok/issues/866)
204
+
205
+ FIX: the Python SDK erroneously assumed the enabled zrok environment contained a config.json file, and was changed to only load it if the file was present (https://github.com/openziti/zrok/pull/853/).
206
+
207
+ ## v0.4.47
208
+
209
+ CHANGE: the Docker instance will wait for the ziti container healthy status (contribution from Ben Wong @bwong365 - https://github.com/openziti/zrok/pull/790)
210
+
211
+ CHANGE: Document solving the DNS propagation timeout for Docker instances that are using Caddy to manage the wildcard certificate.
212
+
213
+ CHANGE: Add usage hint in `zrok config get --help` to clarify how to list all valid `configName` and their current values by running `zrok status`.
214
+
215
+ CHANGE: The Python SDK's `Overview()` function was refactored as a class method (https://github.com/openziti/zrok/pull/846).
216
+
217
+ FEATURE: The Python SDK now includes a `ProxyShare` class providing an HTTP proxy for public and private shares and a
218
+ Jupyter notebook example (https://github.com/openziti/zrok/pull/847).
219
+
220
+ FIX: PyPi publishing was failing due to a CI issue (https://github.com/openziti/zrok/issues/849)
221
+
222
+ ## v0.4.46
223
+
224
+ FEATURE: Linux service template for systemd user units (https://github.com/openziti/zrok/pull/818)
225
+
226
+ FIX: Docker share examples had incorrect default path for zrok environment mountpoint
227
+
228
+ FIX: Clarify how to use DNS providers like Route53 with the zrok Docker instance sample.
229
+
230
+ CHANGE: Use port 80 for the default Ziti API endpoint in the zrok Docker instance sample (https://github.com/openziti/zrok/issues/793).
231
+
232
+ CHANGE: Clarify OS requirements for zrok VPN
233
+
234
+ CHANGE: Set the Windows executable search path in the Windows install guide.
235
+
236
+ CHANGE: bump macOS runner for Python module from macos-12 to macos-13
237
+
238
+ ## v0.4.45
239
+
240
+ FEATURE: Minimal support for "organizations". Site admin API endpoints provided to create, list, and delete "organizations". Site admin API endpoints provided to add, list, and remove "organization members" (zrok accounts) with the ability to mark accounts as an "organization admin". API endpoints provided for organization admins to list the members of their organizations, and to also see the overview (environments, shares, and accesses) for any account in their organization. API endpoint for end users to see which organizations their account is a member of (https://github.com/openziti/zrok/issues/537)
241
+
242
+ CHANGE: briefly mention the backend modes that apply to public and private share concepts
243
+
244
+ FIX: Update indirect dependency `github.com/golang-jwt/jwt/v4` to version `v4.5.1` (https://github.com/openziti/zrok/issues/794)
245
+
246
+ FIX: Document unique names
247
+
248
+ FIX: reduce Docker image sizes (https://github.com/openziti/zrok/pull/783)
249
+
250
+ FIX: Docker reserved private share startup error (https://github.com/openziti/zrok/pull/801)
251
+
252
+ FIX: Correct the download URL for the armv7 Linux release (https://github.com/openziti/zrok/issues/782)
253
+
254
+ ## v0.4.44
255
+
256
+ FIX: Fix for goreleaser build action to align with changed ARM64 build path.
257
+
258
+ ## v0.4.43
259
+
260
+ CHANGE: Update `github.com/openziti/sdk-golang` to version `v0.23.44`. Remove old `github.com/openziti/fabric` dependency, instead pulling in the modern `github.com/openziti/ziti` dependency.
261
+
262
+ FIX: Bypass interstitial page for HTTP `OPTIONS` method (https://github.com/openziti/zrok/issues/777)
263
+
264
+ ## v0.4.42
265
+
266
+ CHANGE: Switch all `Dial` operations made into the OpenZiti overlay to use `DialWithOptions(..., &ziti.DialOptions{ConnectTimeout: 30 * time.Second})`, switching to a 30 second timeout from a 5 second default (https://github.com/openziti/zrok/issues/772)
267
+
268
+ FIX: Removed the `--basic-auth` flag from `zrok share private` as this was ignored... even if `zrok access private` honored the `ziti.proxy.v1` config to ask for basic auth, it would still be easy to write a custom SDK client that ignored the basic auth and accessed the share directly; better to remove the option than to allow confusing usage (https://github.com/openziti/zrok/issues/770)
269
+
270
+ FIX: always append common options like `--headless` and conditionally append `--verbose --insecure` if their respective env vars are set, when running in a service manager like systemd or Docker and wrapping the `zrok` command with the `zrok-share.bash` shell script (https://openziti.discourse.group/t/question-about-reserved-public-vs-temp-public-shares/3169)
271
+
272
+ FIX: Correct registration page CSS to ensure that the entire form is visible
273
+
274
+ ## v0.4.41
275
+
276
+ FIX: Fixed crash when invoking `zrok share reserved` with no arguments (https://github.com/openziti/zrok/issues/740)
277
+
278
+ FIX: zrok-share.service on Linux failed to start with a private share in closed permission mode
279
+
280
+ FIX: Update `gopkg.in/go-jose/go-jose.v2` to `v2.6.3` to fix vulnerability around compressed data (https://github.com/openziti/zrok/issues/761)
281
+
282
+ ## v0.4.40
283
+
284
+ FEATURE: New endpoint for synchronizing grants for an account (https://github.com/openziti/zrok/pull/744). Useful for updating the `zrok.proxy.v1` config objects containing interstitial setting when the `skip_interstitial_grants` table has been updated.
285
+
286
+ FIX: prune incorrect troubleshooting advice about listing Caddy's certificates
287
+
288
+ ## v0.4.39
289
+
290
+ FEATURE: New API endpoint allowing direct creation of accounts in the zrok database. Requires an admin token (specified in the controller configuration yaml) for authentication. See the OpenAPI spec for details of the API endpoint. The `zrok admin create account` CLI was also updated to call the API endpoint, rather than directly operating on the underlying database (https://github.com/openziti/zrok/issues/734). The [Docker](https://github.com/openziti/zrok/pull/736) and [Kubernetes](https://github.com/openziti/helm-charts/pull/249) zrok instance deployments were adapted to the new CLI parameter shape.
291
+
292
+ FEATURE: Support `html_path` directive in `interstitial` stanza of public frontend configuration to support using an external HTML file for the interstitial page (https://github.com/openziti/zrok/issues/716)
293
+
294
+ FEATURE: `zrok access private` now includes a `--response-header` flag to add headers to the response for HTTP-based backends. Add flag multiple times to add multiple headers to the response. Expects `key:value` header definitions in this format: `--response-header "Access-Control-Allow-Origin: *"` (https://github.com/openziti/zrok/issues/522)
295
+
296
+ CHANGE: Update `github.com/openziti/sdk-golang` (and related dependencies) to version `v0.23.40`.
297
+
298
+ CHANGE: upgrade to ziti v1.1.7 CLI in zrok container image
299
+
300
+ ## v0.4.38
301
+
302
+ FEATURE: Conditionally enable interstitial page based on `User-Agent` prefix list. See the [frontend configuration template](etc/frontend.yml) for details on the new configuration structure (https://github.com/openziti/zrok/issues/715)
303
+
304
+ CHANGE: The interstitial configuration has been modified from a simple `interstitial: <bool>` to a richer structure, but the config version has not been incremented; this feature has not been widely adopted yet. See the [frontend configuration template](etc/frontend.yml) for details on the new structure.
305
+
306
+ CHANGE: The registration page where a new user's password is set now includes a required checkbox, asking them to acknowledge the terms and conditions presented above the checkbox (https://github.com/openziti/zrok/issues/669)
307
+
308
+ FIX: The registration page where a new user's password is set now includes better styling of the error message `<div/>` to prevent the entire page from jumping when the message changes.
309
+
310
+ ## v0.4.37
311
+
312
+ FIX: Fix for setting the `zrok_interstitial` cookie on Chrome-based browsers.
313
+
314
+ FIX: Fix for `store.IsAccountGrantedSkipInterstitial` to respect the `deleted` flag.
315
+
316
+ FIX: When an error occurs connecting to the proxied endpoint, the `proxy` backend should return HTTP status `502` (https://github.com/openziti/zrok/issues/703)
317
+
318
+ ## v0.4.36
319
+
320
+ FEATURE: New interstitial pages that can be enabled per-frontend, and disabled per-account (https://github.com/openziti/zrok/issues/704)
321
+
322
+ CHANGE: Enable `"declaration": true` in `tsconfig.json` for Node SDK.
323
+
324
+ FIX: build a 32-bit build for armhf to fix [the FPE issue](https://github.com/openziti/zrok/issues/654) and [the missing link issue](https://github.com/openziti/zrok/issues/642)
325
+
326
+ CHANGE: add [cross-build instructions](./BUILD.md) (includes new snapshot build target `armel`)
327
+
328
+ ## v0.4.35
329
+
330
+ FEATURE: Added import for `github.com/greenpau/caddy-security` to include that Caddy plugin to enable authentication, authorization, and credentials extensions for the `caddy` backend (https://github.com/openziti/zrok/issues/506)
331
+
332
+ FEATURE: Closed permission mode for Docker and Linux private shares
333
+
334
+ CHANGE: add example in ./etc/caddy to set X-Real-IP header to public share client IP
335
+
336
+ CHANGE: auto-update the ziti CLI version that is built in to the openziti/zrok container image
337
+
338
+ CHANGE: Docker examples set HOME to enable running CLI commands in the container
339
+
340
+ FIX: Fix for environment count inheritance when using a resource count class to override global environment count (https://github.com/openziti/zrok/issues/695)
341
+
342
+ ## v0.4.34
343
+
344
+ FEATURE: Linux service support for all private share modes (contribution from Stefan Adelbert @stefanadelbert)
345
+
346
+ FIX: Fix for mixing limited and unlimited (-1) resource counts in the limits system (https://github.com/openziti/zrok/issues/680)
347
+
348
+ FIX: Fix for sending multiple warning emails when a warning is applied to an account (https://github.com/openziti/zrok/issues/685)
349
+
350
+ CHANGE: add Docker compose example for multiple share containers using the same enabled environment in [compose.override.yml](./docker/compose/zrok-public-reserved/compose.override.yml)
351
+
352
+ CHANGE: bump many GitHub Actions that were using deprecated distributions of Node.js
353
+
354
+ CHANGE: bump macOS runner for Node SDK from macos-11 to macos-12
355
+
356
+ ## v0.4.33
357
+
358
+ FIX: Fix for log message in `Agent.CanAccessShare` (`"account '#%d' over frontends per share limit '%d'"`), which was not returning the correct limit value.
359
+
360
+ FIX: Properly set `permission_mode` in `frontends` when creating a private frontend using `zrok access private` (https://github.com/openziti/zrok/issues/677)
361
+
362
+ CHANGE: Updated `react-bootstrap` to version `2.10.2` (web console).
363
+
364
+ CHANGE: Updated `@mui/material` to version `5.15.18` (web console).
365
+
366
+ CHANGE: Updated `react` and `react-dom` to version `18.3.1` (web console).
367
+
368
+ CHANGE: Updated `recharts` to version `2.12.7` (web console).
369
+
370
+ CHANGE: Updated `react-router-dom` to version `6.23.1` (web console).
371
+
372
+ CHANGE: Updated `axios` to version `1.7.2` for (node SDK).
373
+
374
+ CHANGE: Updated `@openziti/ziti-sdk-nodejs` to version `0.17.0` (node SDK).
375
+
376
+ ## v0.4.32
377
+
378
+ FEATURE: New permission mode support for public frontends. Open permission mode frontends are available to all users in the service instance. Closed permission mode frontends reference the new `frontend_grants` table that can be used to control which accounts are allowed to create shares using that frontend. `zrok admin create frontend` now supports `--closed` flag to create closed permission mode frontends (https://github.com/openziti/zrok/issues/539)
379
+
380
+ FEATURE: New config `defaultFrontend` that specifies the default frontend to be used for an environment. Provides the default `--frontend` for `zrok share public` and `zrok reserve public` (https://github.com/openziti/zrok/issues/663)
381
+
382
+ FEATURE: Resource count limits now include `share_frontends` to limit the number of frontends that are allowed to make connections to a share (https://github.com/openziti/zrok/issues/650)
383
+
384
+ CHANGE: The frontend selection flag used by `zrok share public` and `zrok reserve public` has been changed from `--frontends` to `--frontend`
385
+
386
+ FIX: use controller config spec v4 in the Docker instance
387
+
388
+ ## v0.4.31
389
+
390
+ FEATURE: New "limits classes" limits implementation (https://github.com/openziti/zrok/issues/606). This new feature allows for extensive limits customization on a per-user basis, with fallback to the global defaults in the controller configuration.
391
+
392
+ CHANGE: The controller configuration version has been updated to version `4` (`v: 4`) to support the new limits global configuration changes (https://github.com/openziti/zrok/issues/606).
393
+
394
+ CHANGE: A new `ZROK_CTRL_CONFIG_VERSION` environment variable now exists to temporarily force the controller to assume a specific controller configuration version, regardless of what version exists in the file. This allows two different config versions to potentially be co-mingled in the same controller configuration file. Use with care (https://github.com/openziti/zrok/issues/648)
395
+
396
+ CHANGE: Log messages that said `backend proxy endpoint` were clarified to say `backend target`.
397
+
398
+ FIX: Correct the syntax for the Docker and Linux zrok-share "frontdoor" service that broke OAuth email address pattern matching.
399
+
400
+ ## v0.4.30
401
+
402
+ FIX: Fix to the Node.js release process to properly support releasing on a tag.
403
+
404
+ ## v0.4.29
405
+
406
+ FIX: Backed out an incorrect change to support a FreeBSD port in progress.
407
+
408
+ ## v0.4.28
409
+
410
+ FEATURE: Node.js support for the zrok SDK (https://github.com/openziti/zrok/issues/400)
411
+
412
+ FEATURE: A Docker Compose project for self-hosting a zrok instance and [accompanying Docker guide](https://docs.zrok.io/docs/guides/self-hosting/docker) for more information.
413
+
414
+ CHANGE: the container images run as "ziggy" (UID 2171) instead of the generic restricted user "nobody" (UID 65534). This reduces the risk of unexpected file permissions when binding the Docker host's filesystem to a zrok container.
415
+
416
+ CHANGE: the Docker sharing guides were simplified and expanded
417
+
418
+ ## v0.4.27
419
+
420
+ FEATURE: New `vpn` backend mode. Use `sudo zrok share private --backend-mode vpn` on the _VPN server_ host, then `sudo zrok access private <token>` on _VPN client_ machine. Works with reserved shares using `zrok reserve private --backend-mode vpn`. Use `<target>` parameter to override default VPN network settings `zrok share private -b vpn 192.168.255.42/24` -- server IP is `192.168.255.42` and VPN netmask will be `192.168.255.0/24`. Client IPs are assigned automatically from netmask range.
421
+
422
+ CHANGE: Update to OpenZiti SDK (`github.com/openziti/sdk-golang`) at `v0.23.22`.
423
+
424
+ CHANGE: Added indexes to `environments`, `shares`, and `frontends` tables to improve overall query performance on both PostgreSQL and Sqlite.
425
+
426
+ FIX: Also update the Python SDK to include the permission mode and access grants fields on the `ShareRequest` (https://github.com/openziti/zrok/issues/432)
427
+
428
+ FIX: Add a way to find the username on Linux when /etc/passwd and stdlib can't resolve the UID (https://github.com/openziti/zrok/issues/454)
429
+
430
+ ## v0.4.26
431
+
432
+ FEATURE: New _permission modes_ available for shares. _Open permission mode_ retains the behavior of previous zrok releases and is the default setting. _Closed permission mode_ (`--closed`) only allows a share to be accessed (`zrok access`) by users who have been granted access with the `--access-grant` flag. See the documentation at (https://docs.zrok.io/docs/guides/permission-modes/) (https://github.com/openziti/zrok/issues/432)
433
+
434
+ CHANGE: The target for a `socks` share is automatically set to `socks` to improve web console display.
435
+
436
+ CHANGE: Enhancements to the look and feel of the account actions tab in the web console. Textual improvements.
437
+
438
+ FIX: The regenerate account token dialog incorrectly specified the path `${HOME}/.zrok/environments.yml`. This was corrected to be `${HOME}/.zrok/environments.json`.
439
+
440
+ FIX: Align zrok frontdoor examples and Linux package (`zrok-share`) with the new OAuth email flag `--oauth-email-address-patterns` introduced in v0.4.25.
441
+
442
+ FIX: Reloading the web console when logged in no longer redirects the user to the login page.
443
+
444
+ ## v0.4.25
445
+
446
+ FEATURE: New action in the web console that allows changing the password of the logged-in account (https://github.com/openziti/zrok/issues/148)
447
+
448
+ FEATURE: The web console now supports revoking your current account token and generating a new one (https://github.com/openziti/zrok/issues/191)
449
+
450
+ CHANGE: When specifying OAuth configuration for public shares from the `zrok share public` or `zrok reserve` public commands, the flags and functionality for restricting the allowed email addresses of the authenticating users has changed. The old flag was `--oauth-email-domains`, which took a string value that needed to be contained in the user's email address. The new flag is `--oauth-email-address-patterns`, which accepts a glob-style filter, using https://github.com/gobwas/glob (https://github.com/openziti/zrok/issues/413)
451
+
452
+ CHANGE: Creating a reserved share checks for token collision and returns a more appropriate error message (https://github.com/openziti/zrok/issues/531)
453
+
454
+ CHANGE: Update UI to add a 'true' value on `reserved` boolean (https://github.com/openziti/zrok/issues/443)
455
+
456
+ CHANGE: OpenZiti SDK (github.com/openziti/sdk-golang) updated to version `v0.22.29`, which introduces changes to OpenZiti API session handling
457
+
458
+ FIX: Fixed bug where a second password reset request for any account would fail (https://github.com/openziti/zrok/issues/452)
459
+
460
+ ## v0.4.24
461
+
462
+ FEATURE: New `socks` backend mode for use with private sharing. Use `zrok share private --backend-mode socks` and then `zrok access private` that share from somewhere else... very lightweight VPN-like functionality (https://github.com/openziti/zrok/issues/558)
463
+
464
+ FEATURE: New `zrok admin create account` command that allows populating accounts directly into the underlying controller database (https://github.com/openziti/zrok/issues/551)
465
+
466
+ CHANGE: The `zrok test loopback public` utility was updated to report non-`200` errors and also ensure that the listening side of the test is fully established before starting loopback testing.
467
+
468
+ CHANGE: The OpenZiti SDK for golang (https://github.com/openziti/sdk-golang) has been updated to version `v0.22.28`
469
+
470
+ ## v0.4.23
471
+
472
+ FEATURE: New CLI commands have been implemented for working with the `drive` share backend mode (part of the "zrok Drives" functionality). These commands include `zrok cp`, `zrok mkdir`, `zrok mv`, `zrok ls`, and `zrok rm`. These are initial, minimal versions of these commands and very likely contain bugs and ergonomic annoyances. There is a guide available at (`docs/guides/drives.mdx`) that explains how to work with these tools in detail (https://github.com/openziti/zrok/issues/438)
473
+
474
+ FEATURE: Python SDK now has a decorator for integrating with various server side frameworks. See the `http-server` example.
475
+
476
+ FEATURE: Python SDK share and access handling now supports context management.
477
+
478
+ FEATURE: TLS for `zrok` controller and frontends. Add the `tls:` stanza to your controller configuration (see `etc/ctrl.yml`) to enable TLS support for the controller API. Add the `tls:` stanza to your frontend configuration (see `etc/frontend.yml`) to enable TLS support for frontends (be sure to check your `public` frontend template) (#24)(https://github.com/openziti/zrok/issues/24)
479
+
480
+ CHANGE: Improved OpenZiti resource cleanup resilience. Previous resource cleanup would stop when an error was encountered at any stage of the cleanup process (serps, sps, config, service). New cleanup implementation logs errors but continues to clean up anything that it can (https://github.com/openziti/zrok/issues/533)
481
+
482
+ CHANGE: Instead of setting the `ListenOptions.MaxConnections` property to `64`, use the default value of `3`. This property actually controls the number of terminators created on the underlying OpenZiti network. This property is actually getting renamed to `ListenOptions.MaxTerminators` in an upcoming release of `github.com/openziti/sdk-golang` (https://github.com/openziti/zrok/issues/535)
483
+
484
+ CHANGE: Versioning for the Python SDK has been updated to use versioneer for management.
485
+
486
+ CHANGE: Python SDK package name has been renamed to `zrok`, dropping the `-sdk` postfix. [pypi](https://pypi.org/project/zrok).
487
+
488
+ ## v0.4.22
489
+
490
+ FIX: The goreleaser action is not updated to work with the latest golang build. Modified `go.mod` to comply with what goreleaser expects
491
+
492
+ ## v0.4.21
493
+
494
+ FEATURE: The web console now supports deleting `zrok access` frontends (https://github.com/openziti/zrok/issues/504)
495
+
496
+ CHANGE: The web console now displays the frontend token as the label for any `zrok access` frontends throughout the user interface (https://github.com/openziti/zrok/issues/504)
497
+
498
+ CHANGE: Updated `github.com/rubenv/sql-migrate` to `v1.6.0`
499
+
500
+ CHANGE: Updated `github.com/openziti/sdk-golang` to `v0.22.6`
501
+
502
+ FIX: The migration `sqlite3/015_v0_4_19_share_unique_name_constraint.sql` has been adjusted to delete the old `shares_old` table as the last step of the migration process. Not sure exactly why, but SQLite is unhappy otherwise (https://github.com/openziti/zrok/issues/504)
503
+
504
+ FIX: Email addresses have been made case-insensitive. Please note that there is a migration included in this release (`016_v0_4_21_lowercase_email.sql`) which will attempt to ensure that all email addresses in your existing database are stored in lowercase; **if this migration fails you will need to manually remediate the duplicate account entries** (https://github.com/openziti/zrok/issues/517)
505
+
506
+ FIX: Stop sending authentication cookies to non-authenticated shares (https://github.com/openziti/zrok/issues/512)
507
+
508
+ ## v0.4.20
509
+
510
+ CHANGE: OpenZiti SDK updated to `v0.21.2`. All `ziti.ListenOptions` listener options configured to use `WaitForNEstablishedListeners: 1`. When a `zrok share` client or an `sdk.Share` client are connected to an OpenZiti router that supports "listener established" events, then listen calls will not return until the listener is fully established on the OpenZiti network. Previously a `zrok share` client could report that it is fully operational and listening before the listener is fully established on the OpenZiti network; in practice this produced a very small window of time when the share would not be ready to accept requests. This change eliminates this window of time (https://github.com/openziti/zrok/issues/490)
511
+
512
+ FIX: Require the JWT in a zrok OAuth cookie to have an audience claim that matches the public share hostname. This prevents a cookie from one share from being used to log in to another share.
513
+
514
+ ## v0.4.19
515
+
516
+ FEATURE: Reserved shares now support unique names ("vanity tokens"). This allows for the creation of reserved shares with identifiable names rather than generated share tokens. Includes basic support for profanity checking (https://github.com/openziti/zrok/issues/401)
517
+
518
+ CHANGE: The `publicProxy` endpoint implementation used in the `zrok access public` frontend has been updated to use the new `RefreshService(serviceName)` call instead of `RefreshServices()`. This should greatly improve the performance of requests against missing or non-responsive zrok shares (https://github.com/openziti/zrok/issues/487)
519
+
520
+ CHANGE: The Python SDK has been updated to properly support the "reserved" flag on the `ShareRequest` passed to `CreateShare`
521
+
522
+ CHANGE: Dependency updates; `github.com/openziti/sdk-golang@v0.20.145`; `github.com/caddyserver/caddy/v2@2.7.6`; indirect dependencies
523
+
524
+ ## v0.4.18
525
+
526
+ FEATURE: Python SDK added. Can be found on [pypi](https://test.pypi.org/project/zrok-sdk). `pastebin` example illustrates basic SDK usage (see `sdk/python/examples/README.md` for details) (https://github.com/openziti/zrok/issues/401)
527
+
528
+ CHANGE: Moved the golang zrok sdk into `sdk/golang/sdk` to normalize location for future SDK's.
529
+
530
+ CHANGE: add restart policies to docker compose samples used by the guide docs, e.g., reserved public share should auto-start on boot, temp public share should not.
531
+
532
+ ## v0.4.17
533
+
534
+ CHANGE: Replaced most in-line shell scripts in Docker Compose projects with installed scripts that are shared between the Docker and Linux service. This normalizes the operational configuration of both Docker shares and Linux service, i.e., to use the same env vars.
535
+
536
+ CHANGE: Upgrade to Docusaurus v3 for documentation.
537
+
538
+ FIX: Some Docker shares had broken env mountpoints
539
+
540
+ ## v0.4.16
541
+
542
+ FEATURE: Publish Linux packages for `zrok` CLI and a systemd service for running a reserved public share (`zrok-share`).
543
+
544
+ ## v0.4.15
545
+
546
+ CHANGE: Updated the code signing and notarization process for macos binaries. The previous release process used the `gon` utility to handle both code signing and notarization. Apple changed the requirements and the `gon` utility no longer properly functions as of 2023-11-01. The `goreleaser` process has been adjusted to use the `notarytool` utility that ships with XCode to sign and notarize the binary (https://github.com/openziti/zrok/issues/435)
547
+
548
+ ## v0.4.14
549
+
550
+ FEATURE: `zrok` Drives "Phase 1" (`p1`) functionality included in this release. This includes new `--backend-mode drive`, which accepts a folder path as a target. A `drive` share can be mounted as a network drive on Windows, macOS, and Linux, allowing full read/write access from all applications on those systems (https://github.com/openziti/zrok/issues/218) Subsequent releases will address CLI use cases and provide further refinements to the overall approach.
551
+
552
+ FEATURE: Docker Compose project for a reserved public share in docker/compose/zrok-public-reserved/compose.yml is described in the [public share guide](https://docs.zrok.io/docs/guides/docker-share/docker_public_share_guide/).
553
+
554
+ ## v0.4.13
555
+
556
+ FIX: Update to Homebrew automation to properly integrate with the latest version of the Homebrew release process.
557
+
558
+ ## v0.4.12
559
+
560
+ FIX: The `zrok reserve` command was not properly recording the reserved share status of the shares that it created, preventing the `zrok release` command from properly releasing them (https://github.com/openziti/zrok/issues/427). If a user encounters reserved shares that cannot be released with the `zrok release` command, they can be deleted through the web console.
561
+
562
+ ## v0.4.11
563
+
564
+ FEATURE: The `zrok reserve` command now incorporates the `--json-output|-j` flag, which outputs the reservation details as JSON, rather than as human-consumable log messages. Other commands will produce similar output in the future (https://github.com/openziti/zrok/issues/422)
565
+
566
+ FIX: Include `--oauth-provider` and associated flags for the `zrok reserve` command, allowing reserved shares to specify OAuth authentication (https://github.com/openziti/zrok/issues/421)
567
+
568
+ ## v0.4.10
569
+
570
+ CHANGE: The public frontend configuration has been bumped from `v: 2` to `v: 3`. The `redirect_host`, `redirect_port` and `redirect_http_only` parameters have been removed. These three configuration options have been replaced with `bind_address`, `redirect_url` and `cookie_domain`. See the OAuth configuration guide at `docs/guides/self-hosting/oauth/configuring-oauth.md` for more details (https://github.com/openziti/zrok/issues/411)
571
+
572
+ ## v0.4.9
573
+
574
+ FIX: Remove extraneous share token prepended to OAuth frontend redirect.
575
+
576
+ ## v0.4.8
577
+
578
+ FEATURE: The `sdk` package now includes a `sdk.Overview` function, which returns a complete description of the account attached to the enabled environment. Useful for inventorying the deployed shares and environments (https://github.com/openziti/zrok/issues/407)
579
+
580
+ CHANGE: The `zrok access public` frontend configuration format has changed and now requires that the configuration document include a `v: 2` declaration. This frontend configuration format is now versioned and when the code updates the configuration structure, you will receive an error message at startup, provoking you to look into updating your configuration (https://github.com/openziti/zrok/issues/406)
581
+
582
+ CHANGE: The title color of the header was changed from white to fluorescent green, to better match the overall branding
583
+
584
+ CHANGE: Tweaks to build and release process for logging and deprecations. Pin golang version at 1.21.3+ and node version at 18.x across all platforms
585
+
586
+ CHANGE: Improvements to email invitation sent in response to `zrok invite` to correct broken links, some minor HTML issues and improve overall deliverability (https://github.com/openziti/zrok/issues/405)
587
+
588
+ CHANGE: Added warning message after `zrok invite` submit directing the user to check their "spam" folder if they do not receive the invite message.
589
+
590
+ ## v0.4.7
591
+
592
+ FEATURE: OAuth authentication with the ability to restrict authenticated users to specified domains for `zrok share public`. Supports both Google and GitHub authentication in this version. More authentication providers, and extensibility to come in future `zrok` releases. See the OAuth configuration guide at `docs/guides/self-hosting/oauth/configuring-oauth.md` for details (https://github.com/openziti/zrok/issues/45, https://github.com/openziti/zrok/issues/404)
593
+
594
+ CHANGE: `--basic-auth` realm now presented as the share token rather than as `zrok` in `publicProxy` frontend implementation
595
+
596
+ ## v0.4.6
597
+
598
+ FEATURE: New `--backend-mode caddy`, which pre-processes a `Caddyfile` allowing a `bind` statement to work like this: `bind {{ .ZrokBindAddress }}`. Allows development of complicated API gateways and multi-backend shares, while maintaining the simple, ephemeral sharing model provided by `zrok` (https://github.com/openziti/zrok/issues/391)
599
+
600
+ CHANGE: `--backend-mode web` has been refactored to utilize Caddy as the integrated web server. This provides for a much nicer web-based file browsing experience, while maintaining the existing web server facilities (https://github.com/openziti/zrok/issues/392)
601
+
602
+ CHANGE: Updated the golang version for release builds to `1.21.0` and the node version to `18.x`
603
+
604
+ CHANGE: Added `FrontendEndpoints` to `sdk.Share`, returning selected frontend URLs to callers of `sdk.CreateShare`
605
+
606
+ CHANGE: Added a short alias `-b` for `--backend-mode` to improve CLI ergonomics (https://github.com/openziti/zrok/issues/397)
607
+
608
+ ## v0.4.5
609
+
610
+ FEATURE: New health check endpoint (`/health`), which verifies that the underlying SQL store and metrics repository (InfluxDB, if configured) are operating correctly (https://github.com/openziti/zrok/issues/372)
611
+
612
+ CHANGE: Updated to golang v1.21.0 and node v18.x
613
+
614
+ FIX: `zrok admin bootstrap` and `zrok enable` both broken with latest OpenZiti releases (tested with `v0.30.0`); updated to latest OpenZiti golang SDK (https://github.com/openziti/zrok/issues/389)
615
+
616
+ ## v0.4.4
617
+
618
+ FIX: `zrok status`, `zrok enable`, `zrok config`, etc. were all causing a panic when used on systems that had no previous `~/.zrok` directory (https://github.com/openziti/zrok/issues/383)
619
+
620
+ ## v0.4.3
621
+
622
+ FEATURE: New `zrok overview` command, which returns all of the account details as a single JSON structure. See the OpenAPI spec at `specs/zrok.yml` for more details of the `/api/v1/overview` endpoint (https://github.com/openziti/zrok/issues/374)
623
+
624
+ FEATURE: New `zrok` SDK (https://github.com/openziti/zrok/issues/34). `pastebin` example illustrates basic SDK usage (see `sdk/examples/pastebin/README.md` for details) (https://github.com/openziti/zrok/issues/379)
625
+
626
+ ## v0.4.2
627
+
628
+ Some days are just like this. `v0.4.2` is a re-do of `v0.4.1`. Trying to get Homebrew working and had a bad release. Hopefully this is the one.
629
+
630
+ ## v0.4.1
631
+
632
+ FEATURE: New `zrok console` command to open the currently configured web console in the local web browser (https://github.com/openziti/zrok/issues/170)
633
+
634
+ CHANGE: Further tweaks to the release process to automatically get the latest release into Homebrew (https://github.com/openziti/zrok/issues/264)
635
+
636
+ ## v0.4.0
637
+
638
+ FEATURE: New `tcpTunnel` backend mode allowing for private sharing of local TCP sockets with other `zrok` users (https://github.com/openziti/zrok/issues/170)
639
+
640
+ FEATURE: New `udpTunnel` backend mode allowing for private sharing of local UDP sockets with other `zrok` users (https://github.com/openziti/zrok/issues/306)
641
+
642
+ FEATURE: New metrics infrastructure based on OpenZiti usage events (https://github.com/openziti/zrok/issues/128). See the [v0.4 Metrics Guide](docs/guides/metrics-and-limits/configuring-metrics.md) for more information.
643
+
644
+ FEATURE: New limits implementation based on the new metrics infrastructure (https://github.com/openziti/zrok/issues/235). See the [v0.4 Limits Guide](docs/guides/metrics-and-limits/configuring-limits.md) for more information.
645
+
646
+ FEATURE: The invite mechanism has been reworked to improve user experience. The configuration has been updated to include a new `invite` stanza, and now includes a boolean flag indicating whether or not the instance allows new invitations to be created, and also includes contact details for requesting a new invite. These values are used by the `zrok invite` command to provide a smoother end-user invite experience (https://github.com/openziti/zrok/issues/229)
647
+
648
+ FEATURE: New password strength checking rules and configuration. See the example configuration file (`etc/ctrl.yml`) for details about how to configure the strength checking rules (https://github.com/openziti/zrok/issues/167)
649
+
650
+ FEATURE: A new `admin/profile_endpoint` configuration option is available to start a `net/http/pprof` listener. See `etc/ctrl.yml` for details.
651
+
652
+ CHANGE: The controller configuration version bumps from `v: 2` to `v: 3` to support all of the new `v0.4` functionality. See the [example ctrl.yml](etc/ctrl.yml) for details on the new configuration.
653
+
654
+ CHANGE: The underlying database store now utilizes a `deleted` flag on all tables to implement "soft deletes". This was necessary for the new metrics infrastructure, where we need to account for metrics data that arrived after the lifetime of a share or environment; and also we're going to need this for limits, where we need to see historical information about activity in the past (https://github.com/openziti/zrok/issues/262)
655
+
656
+ CHANGE: Updated to latest `github.com/openziti/sdk-golang` (https://github.com/openziti/zrok/issues/335)
657
+
658
+ FIX: `zrok share reserved --override-endpoint` now works correctly; `--override-endpoint` was being incorrectly ignored previously (https://github.com/openziti/zrok/pull/348)
659
+
660
+ ## v0.3.7
661
+
662
+ FIX: Improved TUI word-wrapping (https://github.com/openziti/zrok/issues/180)
663
+
664
+ ## v0.3.6
665
+
666
+ CHANGE: Additional change to support branch builds (for CI purposes) and additional containerization efforts around k8s.
667
+
668
+ ## v0.3.5
669
+
670
+ CHANGE: `zrok config set apiEndpoint` now validates that the new API endpoint correctly starts with `http://` or `https://` (https://github.com/openziti/zrok/issues/258)
671
+
672
+ CHANGE: Additional linting to support homebrew (https://github.com/openziti/zrok/issues/264)
673
+
674
+ ## v0.3.4
675
+
676
+ CHANGE: `zrok test endpoint` incorporates `--ziti` mode (and related flags) to allow direct endpoint listening on a Ziti service
677
+
678
+ CHANGE: `zrok test websocket` command to test websockets, whether over TCP or over Ziti
679
+
680
+ FIX: Websocket support now functional
681
+
682
+ ## v0.3.3
683
+
684
+ CHANGE: `zrok test loop` has been moved to `zrok test loop public`, making way for additional types of loopback testing tools. The `zrok test endpoint` server now includes an `/echo` endpoint, which provides a simple echo websocket (https://github.com/openziti/zrok/issues/237)
685
+
686
+ ## v0.3.2
687
+
688
+ FEATURE: New docker infrastructure, including `docker-compose.yml` examples (and documentation) illustrating how to deploy `zrok` in `docker`-based environments
689
+
690
+ CHANGE: Include missing `--headless` flag for `zrok enable` and `zrok access private` (https://github.com/openziti/zrok/issues/246)
691
+
692
+ CHANGE: Fix for `zrok enable` error path handling (https://github.com/openziti/zrok/issues/244)
693
+
694
+ FEATURE: `zrok controller validate` and `zrok access public validate` will both perform a quick syntax validation on controller and public frontend configuration documents (https://github.com/openziti/zrok/issues/238)
695
+
696
+ $ zrok controller validate etc/dev.yml
697
+ [ERROR]: controller config validation failed (error loading controller config 'etc/dev.yml': field 'maintenance': field 'registration': field 'expiration_timeout': got [bool], expected [time.Duration])
698
+
699
+ CHANGE: `zrok status` no longer shows secrets (secret token, ziti identity) unless the `--secrets` flag is passed (https://github.com/openziti/zrok/issues/243)
700
+
701
+ ## v0.3.1
702
+
703
+ CHANGE: Incorporate initial docker image build (https://github.com/openziti/zrok/issues/217)
704
+
705
+ CHANGE: Improve target URL parsing for `zrok share` when using `--backend-mode` proxy (https://github.com/openziti/zrok/issues/211)
706
+
707
+ New and improved URL handling for proxy backends:
708
+
709
+ 9090 -> http://127.0.0.1:9090
710
+ localhost:9090 -> http://127.0.0.1:9090
711
+ https://localhost:9090 -> https://localhost:9090
712
+
713
+ CHANGE: Improve usability of `zrok invite` TUI in low-color environments (https://github.com/openziti/zrok/issues/206)
714
+
715
+ CHANGE: Better error responses when `zrok invite` fails due to missing token (https://github.com/openziti/zrok/issues/207)
716
+
717
+ ## v0.3.0
718
+
719
+ CHANGE: Removed some minor web console lint and warnings (https://github.com/openziti/zrok/issues/205)
720
+
721
+ ## v0.3.0-rc6
722
+
723
+ CHANGE: Better error message when `zrok admin create frontend` runs into a duplicate name collision (https://github.com/openziti/zrok/issues/168)
724
+
725
+ CHANGE: Gentler CLI error messages by default (https://github.com/openziti/zrok/issues/203)
726
+
727
+ CHANGE: Add favicon to web console (https://github.com/openziti/zrok/issues/198)
728
+
729
+ CHANGE: Add configurable "terms of use" link in the controller configuration, and optionally display the link on the login form and registration forms (https://github.com/openziti/zrok/issues/184)
730
+
731
+ CHANGE: Prevent multiple `zrok enable` commands from succeeding (https://github.com/openziti/zrok/issues/190)
732
+
733
+ CHANGE: New `--insecure` flag for `share <public|private|reserved>` commands (https://github.com/openziti/zrok/issues/195)
734
+
735
+ ## v0.3.0-rc5
736
+
737
+ CHANGE: Improvements to controller log messages to assist in operations (https://github.com/openziti/zrok/issues/186)
738
+
739
+ CHANGE: `armv7` builds for Linux are now shipped with releases; these builds were tested against a Raspberry Pi 4 (https://github.com/openziti/zrok/issues/93)
740
+
741
+ CHANGE: `zrok config set` now includes a warning when the `apiEndpoint` config is changed and an environment is already enabled; the user will not see the change until `zrok disable` is run. The CLI now includes a `zrok config unset` command (https://github.com/openziti/zrok/issues/188)
742
+
743
+ ## v0.3.0-rc4
744
+
745
+ CHANGE: Enable notarization for macos binaries (https://github.com/openziti/zrok/issues/92)
746
+
747
+ ## v0.3.0-rc3
748
+
749
+ > This release increments the configuration version from `1` to `2`. See the note below.
750
+
751
+ CHANGE: The email "from" configuration moved from `registration/email_from` to `email/from`. **NOTE: This change increments the configuration `V` from `1` to `2`.**
752
+
753
+ CHANGE: Replaced un-salted sha512 password hashing with salted hashing based on Argon2 **NOTE: This version will _invalidate_ all account passwords, and will require all users to use the 'Forgot Password?' function to reset their password.** (https://github.com/openziti/zrok/issues/156)
754
+
755
+ CHANGE: Switched from `ubuntu-latest` (`22.04`) for the Linux builds to `ubuntu-20.04`. Should improve `glibc` compatibility with older Linux distributions (https://github.com/openziti/zrok/issues/179)
756
+
757
+ CHANGE: `zrok admin generate` now outputs the generated tokens to `stdout` after successfully provisioning the tokens (https://github.com/openziti/zrok/issues/181)
758
+
759
+ FIX: Fixed log message in `resetPasswordRequest.go` (https://github.com/openziti/zrok/issues/175)
760
+
761
+ FIX: Fixed `-v` (verbose mode) on in TUI-based `zrok share` and `zrok access` (https://github.com/openziti/zrok/issues/174)
762
+
763
+ ## v0.3.0-rc2
764
+
765
+ FEATURE: Allow users to reset their password (https://github.com/openziti/zrok/issues/65)
766
+
767
+ CHANGE: Improved email styling for new user invite emails (https://github.com/openziti/zrok/issues/157)
768
+
769
+ CHANGE: Migrated from `openziti-test-kitchen` to `openziti` (https://github.com/openziti/zrok/issues/158).
770
+
771
+ CHANGE: Show a hint when `zrok invite` fails, indicating that the user should check to see if they need to be using the `--token` flag and token-based invites (https://github.com/openziti/zrok/issues/172).
772
+
773
+ FIX: Fixed PostgreSQL migration issue where sequences got reset and resulted in primary key collisions on a couple of tables (https://github.com/openziti/zrok/issues/160).
774
+
775
+ FIX: Remove `frontend` instances when `zrok disable`-ing an environment containing them (https://github.com/openziti/zrok/issues/171)
776
+
777
+ ## v0.3.x Series
778
+
779
+ The `v0.2` series was a _proof-of-concept_ implementation for the overall `zrok` architecture and the concept.
780
+
781
+ `v0.3` is a massive elaboration of the concept, pivoting it from being a simple ephemeral reverse proxy solution, to being the beginnings of a comprehensive sharing platform, complete with public and private sharing (built on top of OpenZiti).
782
+
783
+ `v0.3.0` includes the minimal functionality required to produce an early, preview version of the elaborated `zrok` concept, suitable for both production use at `zrok.io`, and also suitable for private self-hosting.
784
+
785
+ From `v0.3.0` forward, we will begin tracking notable changes in this document.
786
+
787
+ ## v0.2.18
788
+
789
+ * DEFECT: Token generation has been improved to use an alphabet consisting of `[a-zA-Z0-9]`. Service token generation continues to use a case-insensitive alphabet consisting of `[a-z0-9]` to be DNS-safe.
LICENSE ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright 2019 NetFoundry, Inc.
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
README.md ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # zrok - Secure Internet Sharing Made Simple
2
+
3
+ ![zrok logo](docs/images/zrok_cover.png)
4
+
5
+ **Share anything, anywhere, instantly. Enterprise reliability. No firewall changes. No port forwarding. No hassle.**
6
+
7
+ `zrok` lets you securely share web services, files, and network resources with anyone—whether they're across the internet or your private network. Built on zero-trust networking, it works through firewalls and NAT without requiring any network configuration changes.
8
+
9
+ ## Quick Start
10
+
11
+ Get sharing in under 2 minutes:
12
+
13
+ 1. **[Install zrok](https://docs.zrok.io/docs/guides/install/)** for your platform
14
+ 2. **Get an account**: `zrok invite` (use the free [zrok.io service](https://docs.zrok.io/docs/getting-started/))
15
+ 3. **Enable sharing**: `zrok enable`
16
+
17
+ That's it! Now you can share anything:
18
+
19
+ ```bash
20
+ # Share a web service publicly
21
+ $ zrok share public localhost:8080
22
+
23
+ # Share files as a network drive
24
+ $ zrok share public --backend-mode drive ~/Documents
25
+
26
+ # Share privately with other zrok users
27
+ $ zrok share private localhost:3000
28
+ ```
29
+
30
+ ![zrok Web Console](docs/images/zrok_web_console.png)
31
+
32
+ ## What You Can Share
33
+
34
+ ### Web Services
35
+ Instantly make local web apps accessible over the internet:
36
+
37
+ ```bash
38
+ $ zrok share public localhost:8080
39
+ ```
40
+ ![zrok share public](docs/images/zrok_share_public.png)
41
+
42
+ ### Files & Directories
43
+ Turn any folder into a shareable network drive:
44
+
45
+ ```bash
46
+ $ zrok share public --backend-mode drive ~/Repos/zrok
47
+ ```
48
+ ![zrok share public -b drive](docs/images/zrok_share_public_drive.png)
49
+ ![mounted zrok drive](docs/images/zrok_share_public_drive_explorer.png)
50
+
51
+ ### Private Resources
52
+ Share TCP/UDP services securely with other zrok users—no public internet exposure.
53
+
54
+ ## Key Features
55
+
56
+ - **Zero Configuration**: Works through firewalls, NAT, and corporate networks
57
+ - **Secure by Default**: End-to-end encryption with zero-trust architecture
58
+ - **Public & Private Sharing**: Share with anyone or just specific users
59
+ - **Multiple Protocols**: HTTP/HTTPS, TCP, UDP, and file sharing
60
+ - **Cross-Platform**: Windows, macOS, Linux, and Raspberry Pi
61
+ - **Self-Hostable**: Run your own zrok service instance
62
+
63
+ ## How It Works
64
+
65
+ `zrok` is built on [OpenZiti](https://docs.openziti.io/docs/learn/introduction/), a programmable zero-trust network overlay. This means:
66
+
67
+ - **No inbound connectivity required**: Works from behind firewalls and NAT
68
+ - **End-to-end encryption**: All traffic is encrypted, even from zrok servers
69
+ - **Peer-to-peer connections**: Direct connections between users when possible
70
+ - **Identity-based access**: Share with specific users, not IP addresses
71
+
72
+ ## Developer SDK
73
+
74
+ Embed `zrok` sharing into your applications with our Go SDK:
75
+
76
+ ```go
77
+ // Create a share
78
+ shr, err := sdk.CreateShare(root, &sdk.ShareRequest{
79
+ BackendMode: sdk.TcpTunnelBackendMode,
80
+ ShareMode: sdk.PrivateShareMode,
81
+ })
82
+
83
+ // Accept connections
84
+ listener, err := sdk.NewListener(shr.Token, root)
85
+ ```
86
+
87
+ [Read the SDK guide](https://blog.openziti.io/the-zrok-sdk) for complete examples.
88
+
89
+ ## Self-Hosting
90
+
91
+ Run your own `zrok` service—from Raspberry Pi to enterprise scale:
92
+
93
+ - Single binary contains everything you need
94
+ - Scales from small personal instances to large public services
95
+ - Built on the same codebase as the public `zrok.io` service
96
+
97
+ [Self-Hosting Guide](https://docs.zrok.io/docs/guides/self-hosting/self_hosting_guide/)
98
+
99
+ ## Resources
100
+
101
+ - **[Documentation](https://docs.zrok.io/)**
102
+ - **[Office Hours Videos](https://www.youtube.com/watch?v=Edqv7yRmXb0&list=PLMUj_5fklasLuM6XiCNqwAFBuZD1t2lO2)**
103
+ - **[Building from Source](./BUILD.md)**
104
+ - **[Contributing](./CONTRIBUTING.md)**
105
+
106
+ ---
107
+
108
+ *Ready to start sharing? [Get started with zrok →](https://docs.zrok.io/docs/getting-started)*
airflow/airflow-webserver.pid ADDED
@@ -0,0 +1 @@
 
 
1
+ 257
airflow/airflow.cfg ADDED
@@ -0,0 +1,2498 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [core]
2
+ # The folder where your airflow pipelines live, most likely a
3
+ # subfolder in a code repository. This path must be absolute.
4
+ #
5
+ # Variable: AIRFLOW__CORE__DAGS_FOLDER
6
+ #
7
+ dags_folder = /kaggle/working/BTC-USDT-ETL-Pipeline/airflow/dags
8
+
9
+ # Hostname by providing a path to a callable, which will resolve the hostname.
10
+ # The format is "package.function".
11
+ #
12
+ # For example, default value ``airflow.utils.net.getfqdn`` means that result from patched
13
+ # version of `socket.getfqdn() <https://docs.python.org/3/library/socket.html#socket.getfqdn>`__,
14
+ # see related `CPython Issue <https://github.com/python/cpython/issues/49254>`__.
15
+ #
16
+ # No argument should be required in the function specified.
17
+ # If using IP address as hostname is preferred, use value ``airflow.utils.net.get_host_ip_address``
18
+ #
19
+ # Variable: AIRFLOW__CORE__HOSTNAME_CALLABLE
20
+ #
21
+ hostname_callable = airflow.utils.net.getfqdn
22
+
23
+ # A callable to check if a python file has airflow dags defined or not and should
24
+ # return ``True`` if it has dags otherwise ``False``.
25
+ # If this is not provided, Airflow uses its own heuristic rules.
26
+ #
27
+ # The function should have the following signature
28
+ #
29
+ # .. code-block:: python
30
+ #
31
+ # def func_name(file_path: str, zip_file: zipfile.ZipFile | None = None) -> bool: ...
32
+ #
33
+ # Variable: AIRFLOW__CORE__MIGHT_CONTAIN_DAG_CALLABLE
34
+ #
35
+ might_contain_dag_callable = airflow.utils.file.might_contain_dag_via_default_heuristic
36
+
37
+ # Default timezone in case supplied date times are naive
38
+ # can be `UTC` (default), `system`, or any `IANA <https://www.iana.org/time-zones>`
39
+ # timezone string (e.g. Europe/Amsterdam)
40
+ #
41
+ # Variable: AIRFLOW__CORE__DEFAULT_TIMEZONE
42
+ #
43
+ default_timezone = utc
44
+
45
+ # The executor class that airflow should use. Choices include
46
+ # ``SequentialExecutor``, ``LocalExecutor``, ``CeleryExecutor``,
47
+ # ``KubernetesExecutor``, ``CeleryKubernetesExecutor``, ``LocalKubernetesExecutor`` or the
48
+ # full import path to the class when using a custom executor.
49
+ #
50
+ # Variable: AIRFLOW__CORE__EXECUTOR
51
+ #
52
+ executor = SequentialExecutor
53
+
54
+ # The auth manager class that airflow should use. Full import path to the auth manager class.
55
+ #
56
+ # Variable: AIRFLOW__CORE__AUTH_MANAGER
57
+ #
58
+ auth_manager = airflow.providers.fab.auth_manager.fab_auth_manager.FabAuthManager
59
+
60
+ # This defines the maximum number of task instances that can run concurrently per scheduler in
61
+ # Airflow, regardless of the worker count. Generally this value, multiplied by the number of
62
+ # schedulers in your cluster, is the maximum number of task instances with the running
63
+ # state in the metadata database. Setting this value to zero allows unlimited parallelism.
64
+ #
65
+ # Variable: AIRFLOW__CORE__PARALLELISM
66
+ #
67
+ parallelism = 32
68
+
69
+ # The maximum number of task instances allowed to run concurrently in each DAG. To calculate
70
+ # the number of tasks that is running concurrently for a DAG, add up the number of running
71
+ # tasks for all DAG runs of the DAG. This is configurable at the DAG level with ``max_active_tasks``,
72
+ # which is defaulted as ``[core] max_active_tasks_per_dag``.
73
+ #
74
+ # An example scenario when this would be useful is when you want to stop a new dag with an early
75
+ # start date from stealing all the executor slots in a cluster.
76
+ #
77
+ # Variable: AIRFLOW__CORE__MAX_ACTIVE_TASKS_PER_DAG
78
+ #
79
+ max_active_tasks_per_dag = 16
80
+
81
+ # Are DAGs paused by default at creation
82
+ #
83
+ # Variable: AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION
84
+ #
85
+ dags_are_paused_at_creation = True
86
+
87
+ # The maximum number of active DAG runs per DAG. The scheduler will not create more DAG runs
88
+ # if it reaches the limit. This is configurable at the DAG level with ``max_active_runs``,
89
+ # which is defaulted as ``[core] max_active_runs_per_dag``.
90
+ #
91
+ # Variable: AIRFLOW__CORE__MAX_ACTIVE_RUNS_PER_DAG
92
+ #
93
+ max_active_runs_per_dag = 16
94
+
95
+ # (experimental) The maximum number of consecutive DAG failures before DAG is automatically paused.
96
+ # This is also configurable per DAG level with ``max_consecutive_failed_dag_runs``,
97
+ # which is defaulted as ``[core] max_consecutive_failed_dag_runs_per_dag``.
98
+ # If not specified, then the value is considered as 0,
99
+ # meaning that the dags are never paused out by default.
100
+ #
101
+ # Variable: AIRFLOW__CORE__MAX_CONSECUTIVE_FAILED_DAG_RUNS_PER_DAG
102
+ #
103
+ max_consecutive_failed_dag_runs_per_dag = 0
104
+
105
+ # The name of the method used in order to start Python processes via the multiprocessing module.
106
+ # This corresponds directly with the options available in the Python docs:
107
+ # `multiprocessing.set_start_method
108
+ # <https://docs.python.org/3/library/multiprocessing.html#multiprocessing.set_start_method>`__
109
+ # must be one of the values returned by `multiprocessing.get_all_start_methods()
110
+ # <https://docs.python.org/3/library/multiprocessing.html#multiprocessing.get_all_start_methods>`__.
111
+ #
112
+ # Example: mp_start_method = fork
113
+ #
114
+ # Variable: AIRFLOW__CORE__MP_START_METHOD
115
+ #
116
+ # mp_start_method =
117
+
118
+ # Whether to load the DAG examples that ship with Airflow. It's good to
119
+ # get started, but you probably want to set this to ``False`` in a production
120
+ # environment
121
+ #
122
+ # Variable: AIRFLOW__CORE__LOAD_EXAMPLES
123
+ #
124
+ load_examples = True
125
+
126
+ # Path to the folder containing Airflow plugins
127
+ #
128
+ # Variable: AIRFLOW__CORE__PLUGINS_FOLDER
129
+ #
130
+ plugins_folder = /kaggle/working/BTC-USDT-ETL-Pipeline/airflow/plugins
131
+
132
+ # Should tasks be executed via forking of the parent process
133
+ #
134
+ # * ``False``: Execute via forking of the parent process
135
+ # * ``True``: Spawning a new python process, slower than fork, but means plugin changes picked
136
+ # up by tasks straight away
137
+ #
138
+ # Variable: AIRFLOW__CORE__EXECUTE_TASKS_NEW_PYTHON_INTERPRETER
139
+ #
140
+ execute_tasks_new_python_interpreter = False
141
+
142
+ # Secret key to save connection passwords in the db
143
+ #
144
+ # Variable: AIRFLOW__CORE__FERNET_KEY
145
+ #
146
+ fernet_key =
147
+
148
+ # Whether to disable pickling dags
149
+ #
150
+ # Variable: AIRFLOW__CORE__DONOT_PICKLE
151
+ #
152
+ donot_pickle = True
153
+
154
+ # How long before timing out a python file import
155
+ #
156
+ # Variable: AIRFLOW__CORE__DAGBAG_IMPORT_TIMEOUT
157
+ #
158
+ dagbag_import_timeout = 30.0
159
+
160
+ # Should a traceback be shown in the UI for dagbag import errors,
161
+ # instead of just the exception message
162
+ #
163
+ # Variable: AIRFLOW__CORE__DAGBAG_IMPORT_ERROR_TRACEBACKS
164
+ #
165
+ dagbag_import_error_tracebacks = True
166
+
167
+ # If tracebacks are shown, how many entries from the traceback should be shown
168
+ #
169
+ # Variable: AIRFLOW__CORE__DAGBAG_IMPORT_ERROR_TRACEBACK_DEPTH
170
+ #
171
+ dagbag_import_error_traceback_depth = 2
172
+
173
+ # How long before timing out a DagFileProcessor, which processes a dag file
174
+ #
175
+ # Variable: AIRFLOW__CORE__DAG_FILE_PROCESSOR_TIMEOUT
176
+ #
177
+ dag_file_processor_timeout = 50
178
+
179
+ # The class to use for running task instances in a subprocess.
180
+ # Choices include StandardTaskRunner, CgroupTaskRunner or the full import path to the class
181
+ # when using a custom task runner.
182
+ #
183
+ # Variable: AIRFLOW__CORE__TASK_RUNNER
184
+ #
185
+ task_runner = StandardTaskRunner
186
+
187
+ # If set, tasks without a ``run_as_user`` argument will be run with this user
188
+ # Can be used to de-elevate a sudo user running Airflow when executing tasks
189
+ #
190
+ # Variable: AIRFLOW__CORE__DEFAULT_IMPERSONATION
191
+ #
192
+ default_impersonation =
193
+
194
+ # What security module to use (for example kerberos)
195
+ #
196
+ # Variable: AIRFLOW__CORE__SECURITY
197
+ #
198
+ security =
199
+
200
+ # Turn unit test mode on (overwrites many configuration options with test
201
+ # values at runtime)
202
+ #
203
+ # Variable: AIRFLOW__CORE__UNIT_TEST_MODE
204
+ #
205
+ unit_test_mode = False
206
+
207
+ # Whether to enable pickling for xcom (note that this is insecure and allows for
208
+ # RCE exploits).
209
+ #
210
+ # Variable: AIRFLOW__CORE__ENABLE_XCOM_PICKLING
211
+ #
212
+ enable_xcom_pickling = False
213
+
214
+ # What classes can be imported during deserialization. This is a multi line value.
215
+ # The individual items will be parsed as a pattern to a glob function.
216
+ # Python built-in classes (like dict) are always allowed.
217
+ #
218
+ # Variable: AIRFLOW__CORE__ALLOWED_DESERIALIZATION_CLASSES
219
+ #
220
+ allowed_deserialization_classes = airflow.*
221
+
222
+ # What classes can be imported during deserialization. This is a multi line value.
223
+ # The individual items will be parsed as regexp patterns.
224
+ # This is a secondary option to ``[core] allowed_deserialization_classes``.
225
+ #
226
+ # Variable: AIRFLOW__CORE__ALLOWED_DESERIALIZATION_CLASSES_REGEXP
227
+ #
228
+ allowed_deserialization_classes_regexp =
229
+
230
+ # When a task is killed forcefully, this is the amount of time in seconds that
231
+ # it has to cleanup after it is sent a SIGTERM, before it is SIGKILLED
232
+ #
233
+ # Variable: AIRFLOW__CORE__KILLED_TASK_CLEANUP_TIME
234
+ #
235
+ killed_task_cleanup_time = 60
236
+
237
+ # Whether to override params with dag_run.conf. If you pass some key-value pairs
238
+ # through ``airflow dags backfill -c`` or
239
+ # ``airflow dags trigger -c``, the key-value pairs will override the existing ones in params.
240
+ #
241
+ # Variable: AIRFLOW__CORE__DAG_RUN_CONF_OVERRIDES_PARAMS
242
+ #
243
+ dag_run_conf_overrides_params = True
244
+
245
+ # If enabled, Airflow will only scan files containing both ``DAG`` and ``airflow`` (case-insensitive).
246
+ #
247
+ # Variable: AIRFLOW__CORE__DAG_DISCOVERY_SAFE_MODE
248
+ #
249
+ dag_discovery_safe_mode = True
250
+
251
+ # The pattern syntax used in the
252
+ # `.airflowignore
253
+ # <https://airflow.apache.org/docs/apache-airflow/stable/core-concepts/dags.html#airflowignore>`__
254
+ # files in the DAG directories. Valid values are ``regexp`` or ``glob``.
255
+ #
256
+ # Variable: AIRFLOW__CORE__DAG_IGNORE_FILE_SYNTAX
257
+ #
258
+ dag_ignore_file_syntax = regexp
259
+
260
+ # The number of retries each task is going to have by default. Can be overridden at dag or task level.
261
+ #
262
+ # Variable: AIRFLOW__CORE__DEFAULT_TASK_RETRIES
263
+ #
264
+ default_task_retries = 0
265
+
266
+ # The number of seconds each task is going to wait by default between retries. Can be overridden at
267
+ # dag or task level.
268
+ #
269
+ # Variable: AIRFLOW__CORE__DEFAULT_TASK_RETRY_DELAY
270
+ #
271
+ default_task_retry_delay = 300
272
+
273
+ # The maximum delay (in seconds) each task is going to wait by default between retries.
274
+ # This is a global setting and cannot be overridden at task or DAG level.
275
+ #
276
+ # Variable: AIRFLOW__CORE__MAX_TASK_RETRY_DELAY
277
+ #
278
+ max_task_retry_delay = 86400
279
+
280
+ # The weighting method used for the effective total priority weight of the task
281
+ #
282
+ # Variable: AIRFLOW__CORE__DEFAULT_TASK_WEIGHT_RULE
283
+ #
284
+ default_task_weight_rule = downstream
285
+
286
+ # Maximum possible time (in seconds) that task will have for execution of auxiliary processes
287
+ # (like listeners, mini scheduler...) after task is marked as success.
288
+ #
289
+ # Variable: AIRFLOW__CORE__TASK_SUCCESS_OVERTIME
290
+ #
291
+ task_success_overtime = 20
292
+
293
+ # The default task execution_timeout value for the operators. Expected an integer value to
294
+ # be passed into timedelta as seconds. If not specified, then the value is considered as None,
295
+ # meaning that the operators are never timed out by default.
296
+ #
297
+ # Variable: AIRFLOW__CORE__DEFAULT_TASK_EXECUTION_TIMEOUT
298
+ #
299
+ default_task_execution_timeout =
300
+
301
+ # Updating serialized DAG can not be faster than a minimum interval to reduce database write rate.
302
+ #
303
+ # Variable: AIRFLOW__CORE__MIN_SERIALIZED_DAG_UPDATE_INTERVAL
304
+ #
305
+ min_serialized_dag_update_interval = 30
306
+
307
+ # If ``True``, serialized DAGs are compressed before writing to DB.
308
+ #
309
+ # .. note::
310
+ #
311
+ # This will disable the DAG dependencies view
312
+ #
313
+ # Variable: AIRFLOW__CORE__COMPRESS_SERIALIZED_DAGS
314
+ #
315
+ compress_serialized_dags = False
316
+
317
+ # Fetching serialized DAG can not be faster than a minimum interval to reduce database
318
+ # read rate. This config controls when your DAGs are updated in the Webserver
319
+ #
320
+ # Variable: AIRFLOW__CORE__MIN_SERIALIZED_DAG_FETCH_INTERVAL
321
+ #
322
+ min_serialized_dag_fetch_interval = 10
323
+
324
+ # Maximum number of Rendered Task Instance Fields (Template Fields) per task to store
325
+ # in the Database.
326
+ # All the template_fields for each of Task Instance are stored in the Database.
327
+ # Keeping this number small may cause an error when you try to view ``Rendered`` tab in
328
+ # TaskInstance view for older tasks.
329
+ #
330
+ # Variable: AIRFLOW__CORE__MAX_NUM_RENDERED_TI_FIELDS_PER_TASK
331
+ #
332
+ max_num_rendered_ti_fields_per_task = 30
333
+
334
+ # On each dagrun check against defined SLAs
335
+ #
336
+ # Variable: AIRFLOW__CORE__CHECK_SLAS
337
+ #
338
+ check_slas = True
339
+
340
+ # Path to custom XCom class that will be used to store and resolve operators results
341
+ #
342
+ # Example: xcom_backend = path.to.CustomXCom
343
+ #
344
+ # Variable: AIRFLOW__CORE__XCOM_BACKEND
345
+ #
346
+ xcom_backend = airflow.models.xcom.BaseXCom
347
+
348
+ # By default Airflow plugins are lazily-loaded (only loaded when required). Set it to ``False``,
349
+ # if you want to load plugins whenever 'airflow' is invoked via cli or loaded from module.
350
+ #
351
+ # Variable: AIRFLOW__CORE__LAZY_LOAD_PLUGINS
352
+ #
353
+ lazy_load_plugins = True
354
+
355
+ # By default Airflow providers are lazily-discovered (discovery and imports happen only when required).
356
+ # Set it to ``False``, if you want to discover providers whenever 'airflow' is invoked via cli or
357
+ # loaded from module.
358
+ #
359
+ # Variable: AIRFLOW__CORE__LAZY_DISCOVER_PROVIDERS
360
+ #
361
+ lazy_discover_providers = True
362
+
363
+ # Hide sensitive **Variables** or **Connection extra json keys** from UI
364
+ # and task logs when set to ``True``
365
+ #
366
+ # .. note::
367
+ #
368
+ # Connection passwords are always hidden in logs
369
+ #
370
+ # Variable: AIRFLOW__CORE__HIDE_SENSITIVE_VAR_CONN_FIELDS
371
+ #
372
+ hide_sensitive_var_conn_fields = True
373
+
374
+ # A comma-separated list of extra sensitive keywords to look for in variables names or connection's
375
+ # extra JSON.
376
+ #
377
+ # Variable: AIRFLOW__CORE__SENSITIVE_VAR_CONN_NAMES
378
+ #
379
+ sensitive_var_conn_names =
380
+
381
+ # Task Slot counts for ``default_pool``. This setting would not have any effect in an existing
382
+ # deployment where the ``default_pool`` is already created. For existing deployments, users can
383
+ # change the number of slots using Webserver, API or the CLI
384
+ #
385
+ # Variable: AIRFLOW__CORE__DEFAULT_POOL_TASK_SLOT_COUNT
386
+ #
387
+ default_pool_task_slot_count = 128
388
+
389
+ # The maximum list/dict length an XCom can push to trigger task mapping. If the pushed list/dict has a
390
+ # length exceeding this value, the task pushing the XCom will be failed automatically to prevent the
391
+ # mapped tasks from clogging the scheduler.
392
+ #
393
+ # Variable: AIRFLOW__CORE__MAX_MAP_LENGTH
394
+ #
395
+ max_map_length = 1024
396
+
397
+ # The default umask to use for process when run in daemon mode (scheduler, worker, etc.)
398
+ #
399
+ # This controls the file-creation mode mask which determines the initial value of file permission bits
400
+ # for newly created files.
401
+ #
402
+ # This value is treated as an octal-integer.
403
+ #
404
+ # Variable: AIRFLOW__CORE__DAEMON_UMASK
405
+ #
406
+ daemon_umask = 0o077
407
+
408
+ # Class to use as dataset manager.
409
+ #
410
+ # Example: dataset_manager_class = airflow.datasets.manager.DatasetManager
411
+ #
412
+ # Variable: AIRFLOW__CORE__DATASET_MANAGER_CLASS
413
+ #
414
+ # dataset_manager_class =
415
+
416
+ # Kwargs to supply to dataset manager.
417
+ #
418
+ # Example: dataset_manager_kwargs = {"some_param": "some_value"}
419
+ #
420
+ # Variable: AIRFLOW__CORE__DATASET_MANAGER_KWARGS
421
+ #
422
+ # dataset_manager_kwargs =
423
+
424
+ # Dataset URI validation should raise an exception if it is not compliant with AIP-60.
425
+ # By default this configuration is false, meaning that Airflow 2.x only warns the user.
426
+ # In Airflow 3, this configuration will be enabled by default.
427
+ #
428
+ # Variable: AIRFLOW__CORE__STRICT_DATASET_URI_VALIDATION
429
+ #
430
+ strict_dataset_uri_validation = False
431
+
432
+ # (experimental) Whether components should use Airflow Internal API for DB connectivity.
433
+ #
434
+ # Variable: AIRFLOW__CORE__DATABASE_ACCESS_ISOLATION
435
+ #
436
+ database_access_isolation = False
437
+
438
+ # (experimental) Airflow Internal API url.
439
+ # Only used if ``[core] database_access_isolation`` is ``True``.
440
+ #
441
+ # Example: internal_api_url = http://localhost:8080
442
+ #
443
+ # Variable: AIRFLOW__CORE__INTERNAL_API_URL
444
+ #
445
+ # internal_api_url =
446
+
447
+ # Secret key used to authenticate internal API clients to core. It should be as random as possible.
448
+ # However, when running more than 1 instances of webserver / internal API services, make sure all
449
+ # of them use the same ``secret_key`` otherwise calls will fail on authentication.
450
+ # The authentication token generated using the secret key has a short expiry time though - make
451
+ # sure that time on ALL the machines that you run airflow components on is synchronized
452
+ # (for example using ntpd) otherwise you might get "forbidden" errors when the logs are accessed.
453
+ #
454
+ # Variable: AIRFLOW__CORE__INTERNAL_API_SECRET_KEY
455
+ #
456
+ internal_api_secret_key = JRNP2IC4kIaVxisy9+AW4A==
457
+
458
+ # The ability to allow testing connections across Airflow UI, API and CLI.
459
+ # Supported options: ``Disabled``, ``Enabled``, ``Hidden``. Default: Disabled
460
+ # Disabled - Disables the test connection functionality and disables the Test Connection button in UI.
461
+ # Enabled - Enables the test connection functionality and shows the Test Connection button in UI.
462
+ # Hidden - Disables the test connection functionality and hides the Test Connection button in UI.
463
+ # Before setting this to Enabled, make sure that you review the users who are able to add/edit
464
+ # connections and ensure they are trusted. Connection testing can be done maliciously leading to
465
+ # undesired and insecure outcomes.
466
+ # See `Airflow Security Model: Capabilities of authenticated UI users
467
+ # <https://airflow.apache.org/docs/apache-airflow/stable/security/security_model.html#capabilities-of-authenticated-ui-users>`__
468
+ # for more details.
469
+ #
470
+ # Variable: AIRFLOW__CORE__TEST_CONNECTION
471
+ #
472
+ test_connection = Disabled
473
+
474
+ # The maximum length of the rendered template field. If the value to be stored in the
475
+ # rendered template field exceeds this size, it's redacted.
476
+ #
477
+ # Variable: AIRFLOW__CORE__MAX_TEMPLATED_FIELD_LENGTH
478
+ #
479
+ max_templated_field_length = 4096
480
+
481
+ [database]
482
+ # Path to the ``alembic.ini`` file. You can either provide the file path relative
483
+ # to the Airflow home directory or the absolute path if it is located elsewhere.
484
+ #
485
+ # Variable: AIRFLOW__DATABASE__ALEMBIC_INI_FILE_PATH
486
+ #
487
+ alembic_ini_file_path = alembic.ini
488
+
489
+ # The SQLAlchemy connection string to the metadata database.
490
+ # SQLAlchemy supports many different database engines.
491
+ # See: `Set up a Database Backend: Database URI
492
+ # <https://airflow.apache.org/docs/apache-airflow/stable/howto/set-up-database.html#database-uri>`__
493
+ # for more details.
494
+ #
495
+ # Variable: AIRFLOW__DATABASE__SQL_ALCHEMY_CONN
496
+ #
497
+ sql_alchemy_conn = sqlite:////kaggle/working/BTC-USDT-ETL-Pipeline/airflow/airflow.db
498
+
499
+ # Extra engine specific keyword args passed to SQLAlchemy's create_engine, as a JSON-encoded value
500
+ #
501
+ # Example: sql_alchemy_engine_args = {"arg1": true}
502
+ #
503
+ # Variable: AIRFLOW__DATABASE__SQL_ALCHEMY_ENGINE_ARGS
504
+ #
505
+ # sql_alchemy_engine_args =
506
+
507
+ # The encoding for the databases
508
+ #
509
+ # Variable: AIRFLOW__DATABASE__SQL_ENGINE_ENCODING
510
+ #
511
+ sql_engine_encoding = utf-8
512
+
513
+ # Collation for ``dag_id``, ``task_id``, ``key``, ``external_executor_id`` columns
514
+ # in case they have different encoding.
515
+ # By default this collation is the same as the database collation, however for ``mysql`` and ``mariadb``
516
+ # the default is ``utf8mb3_bin`` so that the index sizes of our index keys will not exceed
517
+ # the maximum size of allowed index when collation is set to ``utf8mb4`` variant, see
518
+ # `GitHub Issue Comment <https://github.com/apache/airflow/pull/17603#issuecomment-901121618>`__
519
+ # for more details.
520
+ #
521
+ # Variable: AIRFLOW__DATABASE__SQL_ENGINE_COLLATION_FOR_IDS
522
+ #
523
+ # sql_engine_collation_for_ids =
524
+
525
+ # If SQLAlchemy should pool database connections.
526
+ #
527
+ # Variable: AIRFLOW__DATABASE__SQL_ALCHEMY_POOL_ENABLED
528
+ #
529
+ sql_alchemy_pool_enabled = True
530
+
531
+ # The SQLAlchemy pool size is the maximum number of database connections
532
+ # in the pool. 0 indicates no limit.
533
+ #
534
+ # Variable: AIRFLOW__DATABASE__SQL_ALCHEMY_POOL_SIZE
535
+ #
536
+ sql_alchemy_pool_size = 5
537
+
538
+ # The maximum overflow size of the pool.
539
+ # When the number of checked-out connections reaches the size set in pool_size,
540
+ # additional connections will be returned up to this limit.
541
+ # When those additional connections are returned to the pool, they are disconnected and discarded.
542
+ # It follows then that the total number of simultaneous connections the pool will allow
543
+ # is **pool_size** + **max_overflow**,
544
+ # and the total number of "sleeping" connections the pool will allow is pool_size.
545
+ # max_overflow can be set to ``-1`` to indicate no overflow limit;
546
+ # no limit will be placed on the total number of concurrent connections. Defaults to ``10``.
547
+ #
548
+ # Variable: AIRFLOW__DATABASE__SQL_ALCHEMY_MAX_OVERFLOW
549
+ #
550
+ sql_alchemy_max_overflow = 10
551
+
552
+ # The SQLAlchemy pool recycle is the number of seconds a connection
553
+ # can be idle in the pool before it is invalidated. This config does
554
+ # not apply to sqlite. If the number of DB connections is ever exceeded,
555
+ # a lower config value will allow the system to recover faster.
556
+ #
557
+ # Variable: AIRFLOW__DATABASE__SQL_ALCHEMY_POOL_RECYCLE
558
+ #
559
+ sql_alchemy_pool_recycle = 1800
560
+
561
+ # Check connection at the start of each connection pool checkout.
562
+ # Typically, this is a simple statement like "SELECT 1".
563
+ # See `SQLAlchemy Pooling: Disconnect Handling - Pessimistic
564
+ # <https://docs.sqlalchemy.org/en/14/core/pooling.html#disconnect-handling-pessimistic>`__
565
+ # for more details.
566
+ #
567
+ # Variable: AIRFLOW__DATABASE__SQL_ALCHEMY_POOL_PRE_PING
568
+ #
569
+ sql_alchemy_pool_pre_ping = True
570
+
571
+ # The schema to use for the metadata database.
572
+ # SQLAlchemy supports databases with the concept of multiple schemas.
573
+ #
574
+ # Variable: AIRFLOW__DATABASE__SQL_ALCHEMY_SCHEMA
575
+ #
576
+ sql_alchemy_schema =
577
+
578
+ # Import path for connect args in SQLAlchemy. Defaults to an empty dict.
579
+ # This is useful when you want to configure db engine args that SQLAlchemy won't parse
580
+ # in connection string. This can be set by passing a dictionary containing the create engine parameters.
581
+ # For more details about passing create engine parameters (keepalives variables, timeout etc)
582
+ # in Postgres DB Backend see `Setting up a PostgreSQL Database
583
+ # <https://airflow.apache.org/docs/apache-airflow/stable/howto/set-up-database.html#setting-up-a-postgresql-database>`__
584
+ # e.g. ``connect_args={"timeout":30}`` can be defined in ``airflow_local_settings.py`` and
585
+ # can be imported as shown below
586
+ #
587
+ # Example: sql_alchemy_connect_args = airflow_local_settings.connect_args
588
+ #
589
+ # Variable: AIRFLOW__DATABASE__SQL_ALCHEMY_CONNECT_ARGS
590
+ #
591
+ # sql_alchemy_connect_args =
592
+
593
+ # Important Warning: Use of sql_alchemy_session_maker Highly Discouraged
594
+ # Import path for function which returns 'sqlalchemy.orm.sessionmaker'.
595
+ # Improper configuration of sql_alchemy_session_maker can lead to serious issues,
596
+ # including data corruption, unrecoverable application crashes. Please review the SQLAlchemy
597
+ # documentation for detailed guidance on proper configuration and best practices.
598
+ #
599
+ # Example: sql_alchemy_session_maker = airflow_local_settings._sessionmaker
600
+ #
601
+ # Variable: AIRFLOW__DATABASE__SQL_ALCHEMY_SESSION_MAKER
602
+ #
603
+ # sql_alchemy_session_maker =
604
+
605
+ # Whether to load the default connections that ship with Airflow when ``airflow db init`` is called.
606
+ # It's good to get started, but you probably want to set this to ``False`` in a production environment.
607
+ #
608
+ # Variable: AIRFLOW__DATABASE__LOAD_DEFAULT_CONNECTIONS
609
+ #
610
+ load_default_connections = True
611
+
612
+ # Number of times the code should be retried in case of DB Operational Errors.
613
+ # Not all transactions will be retried as it can cause undesired state.
614
+ # Currently it is only used in ``DagFileProcessor.process_file`` to retry ``dagbag.sync_to_db``.
615
+ #
616
+ # Variable: AIRFLOW__DATABASE__MAX_DB_RETRIES
617
+ #
618
+ max_db_retries = 3
619
+
620
+ # Whether to run alembic migrations during Airflow start up. Sometimes this operation can be expensive,
621
+ # and the users can assert the correct version through other means (e.g. through a Helm chart).
622
+ # Accepts ``True`` or ``False``.
623
+ #
624
+ # Variable: AIRFLOW__DATABASE__CHECK_MIGRATIONS
625
+ #
626
+ check_migrations = True
627
+
628
+ [logging]
629
+ # The folder where airflow should store its log files.
630
+ # This path must be absolute.
631
+ # There are a few existing configurations that assume this is set to the default.
632
+ # If you choose to override this you may need to update the
633
+ # ``[logging] dag_processor_manager_log_location`` and
634
+ # ``[logging] child_process_log_directory`` settings as well.
635
+ #
636
+ # Variable: AIRFLOW__LOGGING__BASE_LOG_FOLDER
637
+ #
638
+ base_log_folder = /kaggle/working/BTC-USDT-ETL-Pipeline/airflow/logs
639
+
640
+ # Airflow can store logs remotely in AWS S3, Google Cloud Storage or Elastic Search.
641
+ # Set this to ``True`` if you want to enable remote logging.
642
+ #
643
+ # Variable: AIRFLOW__LOGGING__REMOTE_LOGGING
644
+ #
645
+ remote_logging = False
646
+
647
+ # Users must supply an Airflow connection id that provides access to the storage
648
+ # location. Depending on your remote logging service, this may only be used for
649
+ # reading logs, not writing them.
650
+ #
651
+ # Variable: AIRFLOW__LOGGING__REMOTE_LOG_CONN_ID
652
+ #
653
+ remote_log_conn_id =
654
+
655
+ # Whether the local log files for GCS, S3, WASB and OSS remote logging should be deleted after
656
+ # they are uploaded to the remote location.
657
+ #
658
+ # Variable: AIRFLOW__LOGGING__DELETE_LOCAL_LOGS
659
+ #
660
+ delete_local_logs = False
661
+
662
+ # Path to Google Credential JSON file. If omitted, authorization based on `the Application Default
663
+ # Credentials
664
+ # <https://cloud.google.com/docs/authentication/application-default-credentials>`__ will
665
+ # be used.
666
+ #
667
+ # Variable: AIRFLOW__LOGGING__GOOGLE_KEY_PATH
668
+ #
669
+ google_key_path =
670
+
671
+ # Storage bucket URL for remote logging
672
+ # S3 buckets should start with **s3://**
673
+ # Cloudwatch log groups should start with **cloudwatch://**
674
+ # GCS buckets should start with **gs://**
675
+ # WASB buckets should start with **wasb** just to help Airflow select correct handler
676
+ # Stackdriver logs should start with **stackdriver://**
677
+ #
678
+ # Variable: AIRFLOW__LOGGING__REMOTE_BASE_LOG_FOLDER
679
+ #
680
+ remote_base_log_folder =
681
+
682
+ # The remote_task_handler_kwargs param is loaded into a dictionary and passed to the ``__init__``
683
+ # of remote task handler and it overrides the values provided by Airflow config. For example if you set
684
+ # ``delete_local_logs=False`` and you provide ``{"delete_local_copy": true}``, then the local
685
+ # log files will be deleted after they are uploaded to remote location.
686
+ #
687
+ # Example: remote_task_handler_kwargs = {"delete_local_copy": true}
688
+ #
689
+ # Variable: AIRFLOW__LOGGING__REMOTE_TASK_HANDLER_KWARGS
690
+ #
691
+ remote_task_handler_kwargs =
692
+
693
+ # Use server-side encryption for logs stored in S3
694
+ #
695
+ # Variable: AIRFLOW__LOGGING__ENCRYPT_S3_LOGS
696
+ #
697
+ encrypt_s3_logs = False
698
+
699
+ # Logging level.
700
+ #
701
+ # Supported values: ``CRITICAL``, ``ERROR``, ``WARNING``, ``INFO``, ``DEBUG``.
702
+ #
703
+ # Variable: AIRFLOW__LOGGING__LOGGING_LEVEL
704
+ #
705
+ logging_level = INFO
706
+
707
+ # Logging level for celery. If not set, it uses the value of logging_level
708
+ #
709
+ # Supported values: ``CRITICAL``, ``ERROR``, ``WARNING``, ``INFO``, ``DEBUG``.
710
+ #
711
+ # Variable: AIRFLOW__LOGGING__CELERY_LOGGING_LEVEL
712
+ #
713
+ celery_logging_level =
714
+
715
+ # Logging level for Flask-appbuilder UI.
716
+ #
717
+ # Supported values: ``CRITICAL``, ``ERROR``, ``WARNING``, ``INFO``, ``DEBUG``.
718
+ #
719
+ # Variable: AIRFLOW__LOGGING__FAB_LOGGING_LEVEL
720
+ #
721
+ fab_logging_level = WARNING
722
+
723
+ # Logging class
724
+ # Specify the class that will specify the logging configuration
725
+ # This class has to be on the python classpath
726
+ #
727
+ # Example: logging_config_class = my.path.default_local_settings.LOGGING_CONFIG
728
+ #
729
+ # Variable: AIRFLOW__LOGGING__LOGGING_CONFIG_CLASS
730
+ #
731
+ logging_config_class =
732
+
733
+ # Flag to enable/disable Colored logs in Console
734
+ # Colour the logs when the controlling terminal is a TTY.
735
+ #
736
+ # Variable: AIRFLOW__LOGGING__COLORED_CONSOLE_LOG
737
+ #
738
+ colored_console_log = True
739
+
740
+ # Log format for when Colored logs is enabled
741
+ #
742
+ # Variable: AIRFLOW__LOGGING__COLORED_LOG_FORMAT
743
+ #
744
+ colored_log_format = [%%(blue)s%%(asctime)s%%(reset)s] {%%(blue)s%%(filename)s:%%(reset)s%%(lineno)d} %%(log_color)s%%(levelname)s%%(reset)s - %%(log_color)s%%(message)s%%(reset)s
745
+
746
+ # Specifies the class utilized by Airflow to implement colored logging
747
+ #
748
+ # Variable: AIRFLOW__LOGGING__COLORED_FORMATTER_CLASS
749
+ #
750
+ colored_formatter_class = airflow.utils.log.colored_log.CustomTTYColoredFormatter
751
+
752
+ # Format of Log line
753
+ #
754
+ # Variable: AIRFLOW__LOGGING__LOG_FORMAT
755
+ #
756
+ log_format = [%%(asctime)s] {%%(filename)s:%%(lineno)d} %%(levelname)s - %%(message)s
757
+
758
+ # Defines the format of log messages for simple logging configuration
759
+ #
760
+ # Variable: AIRFLOW__LOGGING__SIMPLE_LOG_FORMAT
761
+ #
762
+ simple_log_format = %%(asctime)s %%(levelname)s - %%(message)s
763
+
764
+ # Where to send dag parser logs. If "file", logs are sent to log files defined by child_process_log_directory.
765
+ #
766
+ # Variable: AIRFLOW__LOGGING__DAG_PROCESSOR_LOG_TARGET
767
+ #
768
+ dag_processor_log_target = file
769
+
770
+ # Format of Dag Processor Log line
771
+ #
772
+ # Variable: AIRFLOW__LOGGING__DAG_PROCESSOR_LOG_FORMAT
773
+ #
774
+ dag_processor_log_format = [%%(asctime)s] [SOURCE:DAG_PROCESSOR] {%%(filename)s:%%(lineno)d} %%(levelname)s - %%(message)s
775
+
776
+ # Determines the formatter class used by Airflow for structuring its log messages
777
+ # The default formatter class is timezone-aware, which means that timestamps attached to log entries
778
+ # will be adjusted to reflect the local timezone of the Airflow instance
779
+ #
780
+ # Variable: AIRFLOW__LOGGING__LOG_FORMATTER_CLASS
781
+ #
782
+ log_formatter_class = airflow.utils.log.timezone_aware.TimezoneAware
783
+
784
+ # An import path to a function to add adaptations of each secret added with
785
+ # ``airflow.utils.log.secrets_masker.mask_secret`` to be masked in log messages. The given function
786
+ # is expected to require a single parameter: the secret to be adapted. It may return a
787
+ # single adaptation of the secret or an iterable of adaptations to each be masked as secrets.
788
+ # The original secret will be masked as well as any adaptations returned.
789
+ #
790
+ # Example: secret_mask_adapter = urllib.parse.quote
791
+ #
792
+ # Variable: AIRFLOW__LOGGING__SECRET_MASK_ADAPTER
793
+ #
794
+ secret_mask_adapter =
795
+
796
+ # Specify prefix pattern like mentioned below with stream handler ``TaskHandlerWithCustomFormatter``
797
+ #
798
+ # Example: task_log_prefix_template = {{ti.dag_id}}-{{ti.task_id}}-{{execution_date}}-{{ti.try_number}}
799
+ #
800
+ # Variable: AIRFLOW__LOGGING__TASK_LOG_PREFIX_TEMPLATE
801
+ #
802
+ task_log_prefix_template =
803
+
804
+ # Formatting for how airflow generates file names/paths for each task run.
805
+ #
806
+ # Variable: AIRFLOW__LOGGING__LOG_FILENAME_TEMPLATE
807
+ #
808
+ log_filename_template = dag_id={{ ti.dag_id }}/run_id={{ ti.run_id }}/task_id={{ ti.task_id }}/{%% if ti.map_index >= 0 %%}map_index={{ ti.map_index }}/{%% endif %%}attempt={{ try_number }}.log
809
+
810
+ # Formatting for how airflow generates file names for log
811
+ #
812
+ # Variable: AIRFLOW__LOGGING__LOG_PROCESSOR_FILENAME_TEMPLATE
813
+ #
814
+ log_processor_filename_template = {{ filename }}.log
815
+
816
+ # Full path of dag_processor_manager logfile.
817
+ #
818
+ # Variable: AIRFLOW__LOGGING__DAG_PROCESSOR_MANAGER_LOG_LOCATION
819
+ #
820
+ dag_processor_manager_log_location = /kaggle/working/BTC-USDT-ETL-Pipeline/airflow/logs/dag_processor_manager/dag_processor_manager.log
821
+
822
+ # Whether DAG processor manager will write logs to stdout
823
+ #
824
+ # Variable: AIRFLOW__LOGGING__DAG_PROCESSOR_MANAGER_LOG_STDOUT
825
+ #
826
+ dag_processor_manager_log_stdout = False
827
+
828
+ # Name of handler to read task instance logs.
829
+ # Defaults to use ``task`` handler.
830
+ #
831
+ # Variable: AIRFLOW__LOGGING__TASK_LOG_READER
832
+ #
833
+ task_log_reader = task
834
+
835
+ # A comma-separated list of third-party logger names that will be configured to print messages to
836
+ # consoles.
837
+ #
838
+ # Example: extra_logger_names = connexion,sqlalchemy
839
+ #
840
+ # Variable: AIRFLOW__LOGGING__EXTRA_LOGGER_NAMES
841
+ #
842
+ extra_logger_names =
843
+
844
+ # When you start an Airflow worker, Airflow starts a tiny web server
845
+ # subprocess to serve the workers local log files to the airflow main
846
+ # web server, who then builds pages and sends them to users. This defines
847
+ # the port on which the logs are served. It needs to be unused, open, and
848
+ # visible from the main web server to connect into the workers.
849
+ #
850
+ # Variable: AIRFLOW__LOGGING__WORKER_LOG_SERVER_PORT
851
+ #
852
+ worker_log_server_port = 8793
853
+
854
+ # Port to serve logs from for triggerer.
855
+ # See ``[logging] worker_log_server_port`` description for more info.
856
+ #
857
+ # Variable: AIRFLOW__LOGGING__TRIGGER_LOG_SERVER_PORT
858
+ #
859
+ trigger_log_server_port = 8794
860
+
861
+ # We must parse timestamps to interleave logs between trigger and task. To do so,
862
+ # we need to parse timestamps in log files. In case your log format is non-standard,
863
+ # you may provide import path to callable which takes a string log line and returns
864
+ # the timestamp (datetime.datetime compatible).
865
+ #
866
+ # Example: interleave_timestamp_parser = path.to.my_func
867
+ #
868
+ # Variable: AIRFLOW__LOGGING__INTERLEAVE_TIMESTAMP_PARSER
869
+ #
870
+ # interleave_timestamp_parser =
871
+
872
+ # Permissions in the form of an octal string as understood by chmod. The permissions are important
873
+ # when you use impersonation, when logs are written by a different user than airflow. The most secure
874
+ # way of configuring it in this case is to add both users to the same group and make it the default
875
+ # group of both users. Group-writeable logs are default in airflow, but you might decide that you are
876
+ # OK with having the logs other-writeable, in which case you should set it to ``0o777``. You might
877
+ # decide to add more security if you do not use impersonation and change it to ``0o755`` to make it
878
+ # only owner-writeable. You can also make it just readable only for owner by changing it to ``0o700``
879
+ # if all the access (read/write) for your logs happens from the same user.
880
+ #
881
+ # Example: file_task_handler_new_folder_permissions = 0o775
882
+ #
883
+ # Variable: AIRFLOW__LOGGING__FILE_TASK_HANDLER_NEW_FOLDER_PERMISSIONS
884
+ #
885
+ file_task_handler_new_folder_permissions = 0o775
886
+
887
+ # Permissions in the form of an octal string as understood by chmod. The permissions are important
888
+ # when you use impersonation, when logs are written by a different user than airflow. The most secure
889
+ # way of configuring it in this case is to add both users to the same group and make it the default
890
+ # group of both users. Group-writeable logs are default in airflow, but you might decide that you are
891
+ # OK with having the logs other-writeable, in which case you should set it to ``0o666``. You might
892
+ # decide to add more security if you do not use impersonation and change it to ``0o644`` to make it
893
+ # only owner-writeable. You can also make it just readable only for owner by changing it to ``0o600``
894
+ # if all the access (read/write) for your logs happens from the same user.
895
+ #
896
+ # Example: file_task_handler_new_file_permissions = 0o664
897
+ #
898
+ # Variable: AIRFLOW__LOGGING__FILE_TASK_HANDLER_NEW_FILE_PERMISSIONS
899
+ #
900
+ file_task_handler_new_file_permissions = 0o664
901
+
902
+ # By default Celery sends all logs into stderr.
903
+ # If enabled any previous logging handlers will get *removed*.
904
+ # With this option Airflow will create new handlers
905
+ # and send low level logs like INFO and WARNING to stdout,
906
+ # while sending higher severity logs to stderr.
907
+ #
908
+ # Variable: AIRFLOW__LOGGING__CELERY_STDOUT_STDERR_SEPARATION
909
+ #
910
+ celery_stdout_stderr_separation = False
911
+
912
+ # If enabled, Airflow may ship messages to task logs from outside the task run context, e.g. from
913
+ # the scheduler, executor, or callback execution context. This can help in circumstances such as
914
+ # when there's something blocking the execution of the task and ordinarily there may be no task
915
+ # logs at all.
916
+ # This is set to ``True`` by default. If you encounter issues with this feature
917
+ # (e.g. scheduler performance issues) it can be disabled.
918
+ #
919
+ # Variable: AIRFLOW__LOGGING__ENABLE_TASK_CONTEXT_LOGGER
920
+ #
921
+ enable_task_context_logger = True
922
+
923
+ # A comma separated list of keywords related to errors whose presence should display the line in red
924
+ # color in UI
925
+ #
926
+ # Variable: AIRFLOW__LOGGING__COLOR_LOG_ERROR_KEYWORDS
927
+ #
928
+ color_log_error_keywords = error,exception
929
+
930
+ # A comma separated list of keywords related to warning whose presence should display the line in yellow
931
+ # color in UI
932
+ #
933
+ # Variable: AIRFLOW__LOGGING__COLOR_LOG_WARNING_KEYWORDS
934
+ #
935
+ color_log_warning_keywords = warn
936
+
937
+ [metrics]
938
+ # `StatsD <https://github.com/statsd/statsd>`__ integration settings.
939
+
940
+ # If true, ``[metrics] metrics_allow_list`` and ``[metrics] metrics_block_list`` will use
941
+ # regex pattern matching anywhere within the metric name instead of only prefix matching
942
+ # at the start of the name.
943
+ #
944
+ # Variable: AIRFLOW__METRICS__METRICS_USE_PATTERN_MATCH
945
+ #
946
+ metrics_use_pattern_match = False
947
+
948
+ # Configure an allow list (comma separated string) to send only certain metrics.
949
+ # If ``[metrics] metrics_use_pattern_match`` is ``false``, match only the exact metric name prefix.
950
+ # If ``[metrics] metrics_use_pattern_match`` is ``true``, provide regex patterns to match.
951
+ #
952
+ # Example: metrics_allow_list = "scheduler,executor,dagrun,pool,triggerer,celery" or "^scheduler,^executor,heartbeat|timeout"
953
+ #
954
+ # Variable: AIRFLOW__METRICS__METRICS_ALLOW_LIST
955
+ #
956
+ metrics_allow_list =
957
+
958
+ # Configure a block list (comma separated string) to block certain metrics from being emitted.
959
+ # If ``[metrics] metrics_allow_list`` and ``[metrics] metrics_block_list`` are both configured,
960
+ # ``[metrics] metrics_block_list`` is ignored.
961
+ #
962
+ # If ``[metrics] metrics_use_pattern_match`` is ``false``, match only the exact metric name prefix.
963
+ #
964
+ # If ``[metrics] metrics_use_pattern_match`` is ``true``, provide regex patterns to match.
965
+ #
966
+ # Example: metrics_block_list = "scheduler,executor,dagrun,pool,triggerer,celery" or "^scheduler,^executor,heartbeat|timeout"
967
+ #
968
+ # Variable: AIRFLOW__METRICS__METRICS_BLOCK_LIST
969
+ #
970
+ metrics_block_list =
971
+
972
+ # Enables sending metrics to StatsD.
973
+ #
974
+ # Variable: AIRFLOW__METRICS__STATSD_ON
975
+ #
976
+ statsd_on = False
977
+
978
+ # Specifies the host address where the StatsD daemon (or server) is running
979
+ #
980
+ # Variable: AIRFLOW__METRICS__STATSD_HOST
981
+ #
982
+ statsd_host = localhost
983
+
984
+ # Specifies the port on which the StatsD daemon (or server) is listening to
985
+ #
986
+ # Variable: AIRFLOW__METRICS__STATSD_PORT
987
+ #
988
+ statsd_port = 8125
989
+
990
+ # Defines the namespace for all metrics sent from Airflow to StatsD
991
+ #
992
+ # Variable: AIRFLOW__METRICS__STATSD_PREFIX
993
+ #
994
+ statsd_prefix = airflow
995
+
996
+ # A function that validates the StatsD stat name, applies changes to the stat name if necessary and returns
997
+ # the transformed stat name.
998
+ #
999
+ # The function should have the following signature
1000
+ #
1001
+ # .. code-block:: python
1002
+ #
1003
+ # def func_name(stat_name: str) -> str: ...
1004
+ #
1005
+ # Variable: AIRFLOW__METRICS__STAT_NAME_HANDLER
1006
+ #
1007
+ stat_name_handler =
1008
+
1009
+ # To enable datadog integration to send airflow metrics.
1010
+ #
1011
+ # Variable: AIRFLOW__METRICS__STATSD_DATADOG_ENABLED
1012
+ #
1013
+ statsd_datadog_enabled = False
1014
+
1015
+ # List of datadog tags attached to all metrics(e.g: ``key1:value1,key2:value2``)
1016
+ #
1017
+ # Variable: AIRFLOW__METRICS__STATSD_DATADOG_TAGS
1018
+ #
1019
+ statsd_datadog_tags =
1020
+
1021
+ # Set to ``False`` to disable metadata tags for some of the emitted metrics
1022
+ #
1023
+ # Variable: AIRFLOW__METRICS__STATSD_DATADOG_METRICS_TAGS
1024
+ #
1025
+ statsd_datadog_metrics_tags = True
1026
+
1027
+ # If you want to utilise your own custom StatsD client set the relevant
1028
+ # module path below.
1029
+ # Note: The module path must exist on your
1030
+ # `PYTHONPATH <https://docs.python.org/3/using/cmdline.html#envvar-PYTHONPATH>`
1031
+ # for Airflow to pick it up
1032
+ #
1033
+ # Variable: AIRFLOW__METRICS__STATSD_CUSTOM_CLIENT_PATH
1034
+ #
1035
+ # statsd_custom_client_path =
1036
+
1037
+ # If you want to avoid sending all the available metrics tags to StatsD,
1038
+ # you can configure a block list of prefixes (comma separated) to filter out metric tags
1039
+ # that start with the elements of the list (e.g: ``job_id,run_id``)
1040
+ #
1041
+ # Example: statsd_disabled_tags = job_id,run_id,dag_id,task_id
1042
+ #
1043
+ # Variable: AIRFLOW__METRICS__STATSD_DISABLED_TAGS
1044
+ #
1045
+ statsd_disabled_tags = job_id,run_id
1046
+
1047
+ # To enable sending Airflow metrics with StatsD-Influxdb tagging convention.
1048
+ #
1049
+ # Variable: AIRFLOW__METRICS__STATSD_INFLUXDB_ENABLED
1050
+ #
1051
+ statsd_influxdb_enabled = False
1052
+
1053
+ # Enables sending metrics to OpenTelemetry.
1054
+ #
1055
+ # Variable: AIRFLOW__METRICS__OTEL_ON
1056
+ #
1057
+ otel_on = False
1058
+
1059
+ # Specifies the hostname or IP address of the OpenTelemetry Collector to which Airflow sends
1060
+ # metrics and traces.
1061
+ #
1062
+ # Variable: AIRFLOW__METRICS__OTEL_HOST
1063
+ #
1064
+ otel_host = localhost
1065
+
1066
+ # Specifies the port of the OpenTelemetry Collector that is listening to.
1067
+ #
1068
+ # Variable: AIRFLOW__METRICS__OTEL_PORT
1069
+ #
1070
+ otel_port = 8889
1071
+
1072
+ # The prefix for the Airflow metrics.
1073
+ #
1074
+ # Variable: AIRFLOW__METRICS__OTEL_PREFIX
1075
+ #
1076
+ otel_prefix = airflow
1077
+
1078
+ # Defines the interval, in milliseconds, at which Airflow sends batches of metrics and traces
1079
+ # to the configured OpenTelemetry Collector.
1080
+ #
1081
+ # Variable: AIRFLOW__METRICS__OTEL_INTERVAL_MILLISECONDS
1082
+ #
1083
+ otel_interval_milliseconds = 60000
1084
+
1085
+ # If ``True``, all metrics are also emitted to the console. Defaults to ``False``.
1086
+ #
1087
+ # Variable: AIRFLOW__METRICS__OTEL_DEBUGGING_ON
1088
+ #
1089
+ otel_debugging_on = False
1090
+
1091
+ # The default service name of traces.
1092
+ #
1093
+ # Variable: AIRFLOW__METRICS__OTEL_SERVICE
1094
+ #
1095
+ otel_service = Airflow
1096
+
1097
+ # If ``True``, SSL will be enabled. Defaults to ``False``.
1098
+ # To establish an HTTPS connection to the OpenTelemetry collector,
1099
+ # you need to configure the SSL certificate and key within the OpenTelemetry collector's
1100
+ # ``config.yml`` file.
1101
+ #
1102
+ # Variable: AIRFLOW__METRICS__OTEL_SSL_ACTIVE
1103
+ #
1104
+ otel_ssl_active = False
1105
+
1106
+ [traces]
1107
+ # Distributed traces integration settings.
1108
+
1109
+ # Enables sending traces to OpenTelemetry.
1110
+ #
1111
+ # Variable: AIRFLOW__TRACES__OTEL_ON
1112
+ #
1113
+ otel_on = False
1114
+
1115
+ # Specifies the hostname or IP address of the OpenTelemetry Collector to which Airflow sends
1116
+ # traces.
1117
+ #
1118
+ # Variable: AIRFLOW__TRACES__OTEL_HOST
1119
+ #
1120
+ otel_host = localhost
1121
+
1122
+ # Specifies the port of the OpenTelemetry Collector that is listening to.
1123
+ #
1124
+ # Variable: AIRFLOW__TRACES__OTEL_PORT
1125
+ #
1126
+ otel_port = 8889
1127
+
1128
+ # The default service name of traces.
1129
+ #
1130
+ # Variable: AIRFLOW__TRACES__OTEL_SERVICE
1131
+ #
1132
+ otel_service = Airflow
1133
+
1134
+ # If True, all traces are also emitted to the console. Defaults to False.
1135
+ #
1136
+ # Variable: AIRFLOW__TRACES__OTEL_DEBUGGING_ON
1137
+ #
1138
+ otel_debugging_on = False
1139
+
1140
+ # If True, SSL will be enabled. Defaults to False.
1141
+ # To establish an HTTPS connection to the OpenTelemetry collector,
1142
+ # you need to configure the SSL certificate and key within the OpenTelemetry collector's
1143
+ # config.yml file.
1144
+ #
1145
+ # Variable: AIRFLOW__TRACES__OTEL_SSL_ACTIVE
1146
+ #
1147
+ otel_ssl_active = False
1148
+
1149
+ # If True, after the task is complete, the full task log messages will be added as the
1150
+ # span events, chunked by 64k size. defaults to False.
1151
+ #
1152
+ # Variable: AIRFLOW__TRACES__OTEL_TASK_LOG_EVENT
1153
+ #
1154
+ otel_task_log_event = False
1155
+
1156
+ [secrets]
1157
+ # Full class name of secrets backend to enable (will precede env vars and metastore in search path)
1158
+ #
1159
+ # Example: backend = airflow.providers.amazon.aws.secrets.systems_manager.SystemsManagerParameterStoreBackend
1160
+ #
1161
+ # Variable: AIRFLOW__SECRETS__BACKEND
1162
+ #
1163
+ backend =
1164
+
1165
+ # The backend_kwargs param is loaded into a dictionary and passed to ``__init__``
1166
+ # of secrets backend class. See documentation for the secrets backend you are using.
1167
+ # JSON is expected.
1168
+ #
1169
+ # Example for AWS Systems Manager ParameterStore:
1170
+ # ``{"connections_prefix": "/airflow/connections", "profile_name": "default"}``
1171
+ #
1172
+ # Variable: AIRFLOW__SECRETS__BACKEND_KWARGS
1173
+ #
1174
+ backend_kwargs =
1175
+
1176
+ # .. note:: |experimental|
1177
+ #
1178
+ # Enables local caching of Variables, when parsing DAGs only.
1179
+ # Using this option can make dag parsing faster if Variables are used in top level code, at the expense
1180
+ # of longer propagation time for changes.
1181
+ # Please note that this cache concerns only the DAG parsing step. There is no caching in place when DAG
1182
+ # tasks are run.
1183
+ #
1184
+ # Variable: AIRFLOW__SECRETS__USE_CACHE
1185
+ #
1186
+ use_cache = False
1187
+
1188
+ # .. note:: |experimental|
1189
+ #
1190
+ # When the cache is enabled, this is the duration for which we consider an entry in the cache to be
1191
+ # valid. Entries are refreshed if they are older than this many seconds.
1192
+ # It means that when the cache is enabled, this is the maximum amount of time you need to wait to see a
1193
+ # Variable change take effect.
1194
+ #
1195
+ # Variable: AIRFLOW__SECRETS__CACHE_TTL_SECONDS
1196
+ #
1197
+ cache_ttl_seconds = 900
1198
+
1199
+ [cli]
1200
+ # In what way should the cli access the API. The LocalClient will use the
1201
+ # database directly, while the json_client will use the api running on the
1202
+ # webserver
1203
+ #
1204
+ # Variable: AIRFLOW__CLI__API_CLIENT
1205
+ #
1206
+ api_client = airflow.api.client.local_client
1207
+
1208
+ # If you set web_server_url_prefix, do NOT forget to append it here, ex:
1209
+ # ``endpoint_url = http://localhost:8080/myroot``
1210
+ # So api will look like: ``http://localhost:8080/myroot/api/experimental/...``
1211
+ #
1212
+ # Variable: AIRFLOW__CLI__ENDPOINT_URL
1213
+ #
1214
+ endpoint_url = http://localhost:8080
1215
+
1216
+ [debug]
1217
+ # Used only with ``DebugExecutor``. If set to ``True`` DAG will fail with first
1218
+ # failed task. Helpful for debugging purposes.
1219
+ #
1220
+ # Variable: AIRFLOW__DEBUG__FAIL_FAST
1221
+ #
1222
+ fail_fast = False
1223
+
1224
+ [api]
1225
+ # Enables the deprecated experimental API. Please note that these API endpoints do not have
1226
+ # access control. An authenticated user has full access.
1227
+ #
1228
+ # .. warning::
1229
+ #
1230
+ # This `Experimental REST API
1231
+ # <https://airflow.apache.org/docs/apache-airflow/stable/deprecated-rest-api-ref.html>`__ is
1232
+ # deprecated since version 2.0. Please consider using
1233
+ # `the Stable REST API
1234
+ # <https://airflow.apache.org/docs/apache-airflow/stable/stable-rest-api-ref.html>`__.
1235
+ # For more information on migration, see
1236
+ # `RELEASE_NOTES.rst <https://github.com/apache/airflow/blob/main/RELEASE_NOTES.rst>`_
1237
+ #
1238
+ # Variable: AIRFLOW__API__ENABLE_EXPERIMENTAL_API
1239
+ #
1240
+ enable_experimental_api = False
1241
+
1242
+ # Comma separated list of auth backends to authenticate users of the API. See
1243
+ # `Security: API
1244
+ # <https://airflow.apache.org/docs/apache-airflow/stable/security/api.html>`__ for possible values.
1245
+ # ("airflow.api.auth.backend.default" allows all requests for historic reasons)
1246
+ #
1247
+ # Variable: AIRFLOW__API__AUTH_BACKENDS
1248
+ #
1249
+ auth_backends = airflow.api.auth.backend.session
1250
+
1251
+ # Used to set the maximum page limit for API requests. If limit passed as param
1252
+ # is greater than maximum page limit, it will be ignored and maximum page limit value
1253
+ # will be set as the limit
1254
+ #
1255
+ # Variable: AIRFLOW__API__MAXIMUM_PAGE_LIMIT
1256
+ #
1257
+ maximum_page_limit = 100
1258
+
1259
+ # Used to set the default page limit when limit param is zero or not provided in API
1260
+ # requests. Otherwise if positive integer is passed in the API requests as limit, the
1261
+ # smallest number of user given limit or maximum page limit is taken as limit.
1262
+ #
1263
+ # Variable: AIRFLOW__API__FALLBACK_PAGE_LIMIT
1264
+ #
1265
+ fallback_page_limit = 100
1266
+
1267
+ # The intended audience for JWT token credentials used for authorization. This value must match on the client and server sides. If empty, audience will not be tested.
1268
+ #
1269
+ # Example: google_oauth2_audience = project-id-random-value.apps.googleusercontent.com
1270
+ #
1271
+ # Variable: AIRFLOW__API__GOOGLE_OAUTH2_AUDIENCE
1272
+ #
1273
+ google_oauth2_audience =
1274
+
1275
+ # Path to Google Cloud Service Account key file (JSON). If omitted, authorization based on
1276
+ # `the Application Default Credentials
1277
+ # <https://cloud.google.com/docs/authentication/production#finding_credentials_automatically>`__ will
1278
+ # be used.
1279
+ #
1280
+ # Example: google_key_path = /files/service-account-json
1281
+ #
1282
+ # Variable: AIRFLOW__API__GOOGLE_KEY_PATH
1283
+ #
1284
+ google_key_path =
1285
+
1286
+ # Used in response to a preflight request to indicate which HTTP
1287
+ # headers can be used when making the actual request. This header is
1288
+ # the server side response to the browser's
1289
+ # Access-Control-Request-Headers header.
1290
+ #
1291
+ # Variable: AIRFLOW__API__ACCESS_CONTROL_ALLOW_HEADERS
1292
+ #
1293
+ access_control_allow_headers =
1294
+
1295
+ # Specifies the method or methods allowed when accessing the resource.
1296
+ #
1297
+ # Variable: AIRFLOW__API__ACCESS_CONTROL_ALLOW_METHODS
1298
+ #
1299
+ access_control_allow_methods =
1300
+
1301
+ # Indicates whether the response can be shared with requesting code from the given origins.
1302
+ # Separate URLs with space.
1303
+ #
1304
+ # Variable: AIRFLOW__API__ACCESS_CONTROL_ALLOW_ORIGINS
1305
+ #
1306
+ access_control_allow_origins =
1307
+
1308
+ # Indicates whether the **xcomEntries** endpoint supports the **deserialize**
1309
+ # flag. If set to ``False``, setting this flag in a request would result in a
1310
+ # 400 Bad Request error.
1311
+ #
1312
+ # Variable: AIRFLOW__API__ENABLE_XCOM_DESERIALIZE_SUPPORT
1313
+ #
1314
+ enable_xcom_deserialize_support = False
1315
+
1316
+ [lineage]
1317
+ # what lineage backend to use
1318
+ #
1319
+ # Variable: AIRFLOW__LINEAGE__BACKEND
1320
+ #
1321
+ backend =
1322
+
1323
+ [operators]
1324
+ # The default owner assigned to each new operator, unless
1325
+ # provided explicitly or passed via ``default_args``
1326
+ #
1327
+ # Variable: AIRFLOW__OPERATORS__DEFAULT_OWNER
1328
+ #
1329
+ default_owner = airflow
1330
+
1331
+ # The default value of attribute "deferrable" in operators and sensors.
1332
+ #
1333
+ # Variable: AIRFLOW__OPERATORS__DEFAULT_DEFERRABLE
1334
+ #
1335
+ default_deferrable = false
1336
+
1337
+ # Indicates the default number of CPU units allocated to each operator when no specific CPU request
1338
+ # is specified in the operator's configuration
1339
+ #
1340
+ # Variable: AIRFLOW__OPERATORS__DEFAULT_CPUS
1341
+ #
1342
+ default_cpus = 1
1343
+
1344
+ # Indicates the default number of RAM allocated to each operator when no specific RAM request
1345
+ # is specified in the operator's configuration
1346
+ #
1347
+ # Variable: AIRFLOW__OPERATORS__DEFAULT_RAM
1348
+ #
1349
+ default_ram = 512
1350
+
1351
+ # Indicates the default number of disk storage allocated to each operator when no specific disk request
1352
+ # is specified in the operator's configuration
1353
+ #
1354
+ # Variable: AIRFLOW__OPERATORS__DEFAULT_DISK
1355
+ #
1356
+ default_disk = 512
1357
+
1358
+ # Indicates the default number of GPUs allocated to each operator when no specific GPUs request
1359
+ # is specified in the operator's configuration
1360
+ #
1361
+ # Variable: AIRFLOW__OPERATORS__DEFAULT_GPUS
1362
+ #
1363
+ default_gpus = 0
1364
+
1365
+ # Default queue that tasks get assigned to and that worker listen on.
1366
+ #
1367
+ # Variable: AIRFLOW__OPERATORS__DEFAULT_QUEUE
1368
+ #
1369
+ default_queue = default
1370
+
1371
+ # Whether it is allowed to pass additional/unused arguments (args, kwargs) to the BaseOperator operator.
1372
+ # If set to ``False``, an exception will be thrown,
1373
+ # otherwise only the console message will be displayed.
1374
+ #
1375
+ # Variable: AIRFLOW__OPERATORS__ALLOW_ILLEGAL_ARGUMENTS
1376
+ #
1377
+ allow_illegal_arguments = False
1378
+
1379
+ [webserver]
1380
+ # The message displayed when a user attempts to execute actions beyond their authorised privileges.
1381
+ #
1382
+ # Variable: AIRFLOW__WEBSERVER__ACCESS_DENIED_MESSAGE
1383
+ #
1384
+ access_denied_message = Access is Denied
1385
+
1386
+ # Path of webserver config file used for configuring the webserver parameters
1387
+ #
1388
+ # Variable: AIRFLOW__WEBSERVER__CONFIG_FILE
1389
+ #
1390
+ config_file = /kaggle/working/BTC-USDT-ETL-Pipeline/airflow/webserver_config.py
1391
+
1392
+ # The base url of your website: Airflow cannot guess what domain or CNAME you are using.
1393
+ # This is used to create links in the Log Url column in the Browse - Task Instances menu,
1394
+ # as well as in any automated emails sent by Airflow that contain links to your webserver.
1395
+ #
1396
+ # Variable: AIRFLOW__WEBSERVER__BASE_URL
1397
+ #
1398
+ base_url = http://localhost:8080
1399
+
1400
+ # Default timezone to display all dates in the UI, can be UTC, system, or
1401
+ # any IANA timezone string (e.g. **Europe/Amsterdam**). If left empty the
1402
+ # default value of core/default_timezone will be used
1403
+ #
1404
+ # Example: default_ui_timezone = America/New_York
1405
+ #
1406
+ # Variable: AIRFLOW__WEBSERVER__DEFAULT_UI_TIMEZONE
1407
+ #
1408
+ default_ui_timezone = UTC
1409
+
1410
+ # The ip specified when starting the web server
1411
+ #
1412
+ # Variable: AIRFLOW__WEBSERVER__WEB_SERVER_HOST
1413
+ #
1414
+ web_server_host = 0.0.0.0
1415
+
1416
+ # The port on which to run the web server
1417
+ #
1418
+ # Variable: AIRFLOW__WEBSERVER__WEB_SERVER_PORT
1419
+ #
1420
+ web_server_port = 8080
1421
+
1422
+ # Paths to the SSL certificate and key for the web server. When both are
1423
+ # provided SSL will be enabled. This does not change the web server port.
1424
+ #
1425
+ # Variable: AIRFLOW__WEBSERVER__WEB_SERVER_SSL_CERT
1426
+ #
1427
+ web_server_ssl_cert =
1428
+
1429
+ # Paths to the SSL certificate and key for the web server. When both are
1430
+ # provided SSL will be enabled. This does not change the web server port.
1431
+ #
1432
+ # Variable: AIRFLOW__WEBSERVER__WEB_SERVER_SSL_KEY
1433
+ #
1434
+ web_server_ssl_key =
1435
+
1436
+ # The type of backend used to store web session data, can be ``database`` or ``securecookie``. For the
1437
+ # ``database`` backend, sessions are store in the database and they can be
1438
+ # managed there (for example when you reset password of the user, all sessions for that user are
1439
+ # deleted). For the ``securecookie`` backend, sessions are stored in encrypted cookies on the client
1440
+ # side. The ``securecookie`` mechanism is 'lighter' than database backend, but sessions are not deleted
1441
+ # when you reset password of the user, which means that other than waiting for expiry time, the only
1442
+ # way to invalidate all sessions for a user is to change secret_key and restart webserver (which
1443
+ # also invalidates and logs out all other user's sessions).
1444
+ #
1445
+ # When you are using ``database`` backend, make sure to keep your database session table small
1446
+ # by periodically running ``airflow db clean --table session`` command, especially if you have
1447
+ # automated API calls that will create a new session for each call rather than reuse the sessions
1448
+ # stored in browser cookies.
1449
+ #
1450
+ # Example: session_backend = securecookie
1451
+ #
1452
+ # Variable: AIRFLOW__WEBSERVER__SESSION_BACKEND
1453
+ #
1454
+ session_backend = database
1455
+
1456
+ # Number of seconds the webserver waits before killing gunicorn master that doesn't respond
1457
+ #
1458
+ # Variable: AIRFLOW__WEBSERVER__WEB_SERVER_MASTER_TIMEOUT
1459
+ #
1460
+ web_server_master_timeout = 120
1461
+
1462
+ # Number of seconds the gunicorn webserver waits before timing out on a worker
1463
+ #
1464
+ # Variable: AIRFLOW__WEBSERVER__WEB_SERVER_WORKER_TIMEOUT
1465
+ #
1466
+ web_server_worker_timeout = 120
1467
+
1468
+ # Number of workers to refresh at a time. When set to 0, worker refresh is
1469
+ # disabled. When nonzero, airflow periodically refreshes webserver workers by
1470
+ # bringing up new ones and killing old ones.
1471
+ #
1472
+ # Variable: AIRFLOW__WEBSERVER__WORKER_REFRESH_BATCH_SIZE
1473
+ #
1474
+ worker_refresh_batch_size = 1
1475
+
1476
+ # Number of seconds to wait before refreshing a batch of workers.
1477
+ #
1478
+ # Variable: AIRFLOW__WEBSERVER__WORKER_REFRESH_INTERVAL
1479
+ #
1480
+ worker_refresh_interval = 6000
1481
+
1482
+ # If set to ``True``, Airflow will track files in plugins_folder directory. When it detects changes,
1483
+ # then reload the gunicorn. If set to ``True``, gunicorn starts without preloading, which is slower,
1484
+ # uses more memory, and may cause race conditions. Avoid setting this to ``True`` in production.
1485
+ #
1486
+ # Variable: AIRFLOW__WEBSERVER__RELOAD_ON_PLUGIN_CHANGE
1487
+ #
1488
+ reload_on_plugin_change = False
1489
+
1490
+ # Secret key used to run your flask app. It should be as random as possible. However, when running
1491
+ # more than 1 instances of webserver, make sure all of them use the same ``secret_key`` otherwise
1492
+ # one of them will error with "CSRF session token is missing".
1493
+ # The webserver key is also used to authorize requests to Celery workers when logs are retrieved.
1494
+ # The token generated using the secret key has a short expiry time though - make sure that time on
1495
+ # ALL the machines that you run airflow components on is synchronized (for example using ntpd)
1496
+ # otherwise you might get "forbidden" errors when the logs are accessed.
1497
+ #
1498
+ # Variable: AIRFLOW__WEBSERVER__SECRET_KEY
1499
+ #
1500
+ secret_key = JRNP2IC4kIaVxisy9+AW4A==
1501
+
1502
+ # Number of workers to run the Gunicorn web server
1503
+ #
1504
+ # Variable: AIRFLOW__WEBSERVER__WORKERS
1505
+ #
1506
+ workers = 4
1507
+
1508
+ # The worker class gunicorn should use. Choices include
1509
+ # ``sync`` (default), ``eventlet``, ``gevent``.
1510
+ #
1511
+ # .. warning::
1512
+ #
1513
+ # When using ``gevent`` you might also want to set the ``_AIRFLOW_PATCH_GEVENT``
1514
+ # environment variable to ``"1"`` to make sure gevent patching is done as early as possible.
1515
+ #
1516
+ # Be careful to set ``_AIRFLOW_PATCH_GEVENT`` only on the web server as gevent patching may
1517
+ # affect the scheduler behavior via the ``multiprocessing`` sockets module and cause crash.
1518
+ #
1519
+ # See related Issues / PRs for more details:
1520
+ #
1521
+ # * https://github.com/benoitc/gunicorn/issues/2796
1522
+ # * https://github.com/apache/airflow/issues/8212
1523
+ # * https://github.com/apache/airflow/pull/28283
1524
+ #
1525
+ # Variable: AIRFLOW__WEBSERVER__WORKER_CLASS
1526
+ #
1527
+ worker_class = sync
1528
+
1529
+ # Log files for the gunicorn webserver. '-' means log to stderr.
1530
+ #
1531
+ # Variable: AIRFLOW__WEBSERVER__ACCESS_LOGFILE
1532
+ #
1533
+ access_logfile = -
1534
+
1535
+ # Log files for the gunicorn webserver. '-' means log to stderr.
1536
+ #
1537
+ # Variable: AIRFLOW__WEBSERVER__ERROR_LOGFILE
1538
+ #
1539
+ error_logfile = -
1540
+
1541
+ # Access log format for gunicorn webserver.
1542
+ # default format is ``%%(h)s %%(l)s %%(u)s %%(t)s "%%(r)s" %%(s)s %%(b)s "%%(f)s" "%%(a)s"``
1543
+ # See `Gunicorn Settings: 'access_log_format' Reference
1544
+ # <https://docs.gunicorn.org/en/stable/settings.html#access-log-format>`__ for more details
1545
+ #
1546
+ # Variable: AIRFLOW__WEBSERVER__ACCESS_LOGFORMAT
1547
+ #
1548
+ access_logformat =
1549
+
1550
+ # Expose the configuration file in the web server. Set to ``non-sensitive-only`` to show all values
1551
+ # except those that have security implications. ``True`` shows all values. ``False`` hides the
1552
+ # configuration completely.
1553
+ #
1554
+ # Variable: AIRFLOW__WEBSERVER__EXPOSE_CONFIG
1555
+ #
1556
+ expose_config = False
1557
+
1558
+ # Expose hostname in the web server
1559
+ #
1560
+ # Variable: AIRFLOW__WEBSERVER__EXPOSE_HOSTNAME
1561
+ #
1562
+ expose_hostname = False
1563
+
1564
+ # Expose stacktrace in the web server
1565
+ #
1566
+ # Variable: AIRFLOW__WEBSERVER__EXPOSE_STACKTRACE
1567
+ #
1568
+ expose_stacktrace = False
1569
+
1570
+ # Default DAG view. Valid values are: ``grid``, ``graph``, ``duration``, ``gantt``, ``landing_times``
1571
+ #
1572
+ # Variable: AIRFLOW__WEBSERVER__DAG_DEFAULT_VIEW
1573
+ #
1574
+ dag_default_view = grid
1575
+
1576
+ # Default DAG orientation. Valid values are:
1577
+ # ``LR`` (Left->Right), ``TB`` (Top->Bottom), ``RL`` (Right->Left), ``BT`` (Bottom->Top)
1578
+ #
1579
+ # Variable: AIRFLOW__WEBSERVER__DAG_ORIENTATION
1580
+ #
1581
+ dag_orientation = LR
1582
+
1583
+ # Sorting order in grid view. Valid values are: ``topological``, ``hierarchical_alphabetical``
1584
+ #
1585
+ # Variable: AIRFLOW__WEBSERVER__GRID_VIEW_SORTING_ORDER
1586
+ #
1587
+ grid_view_sorting_order = topological
1588
+
1589
+ # The amount of time (in secs) webserver will wait for initial handshake
1590
+ # while fetching logs from other worker machine
1591
+ #
1592
+ # Variable: AIRFLOW__WEBSERVER__LOG_FETCH_TIMEOUT_SEC
1593
+ #
1594
+ log_fetch_timeout_sec = 5
1595
+
1596
+ # Time interval (in secs) to wait before next log fetching.
1597
+ #
1598
+ # Variable: AIRFLOW__WEBSERVER__LOG_FETCH_DELAY_SEC
1599
+ #
1600
+ log_fetch_delay_sec = 2
1601
+
1602
+ # Distance away from page bottom to enable auto tailing.
1603
+ #
1604
+ # Variable: AIRFLOW__WEBSERVER__LOG_AUTO_TAILING_OFFSET
1605
+ #
1606
+ log_auto_tailing_offset = 30
1607
+
1608
+ # Animation speed for auto tailing log display.
1609
+ #
1610
+ # Variable: AIRFLOW__WEBSERVER__LOG_ANIMATION_SPEED
1611
+ #
1612
+ log_animation_speed = 1000
1613
+
1614
+ # By default, the webserver shows paused DAGs. Flip this to hide paused
1615
+ # DAGs by default
1616
+ #
1617
+ # Variable: AIRFLOW__WEBSERVER__HIDE_PAUSED_DAGS_BY_DEFAULT
1618
+ #
1619
+ hide_paused_dags_by_default = False
1620
+
1621
+ # Consistent page size across all listing views in the UI
1622
+ #
1623
+ # Variable: AIRFLOW__WEBSERVER__PAGE_SIZE
1624
+ #
1625
+ page_size = 100
1626
+
1627
+ # Define the color of navigation bar
1628
+ #
1629
+ # Variable: AIRFLOW__WEBSERVER__NAVBAR_COLOR
1630
+ #
1631
+ navbar_color = #fff
1632
+
1633
+ # Define the color of text in the navigation bar
1634
+ #
1635
+ # Variable: AIRFLOW__WEBSERVER__NAVBAR_TEXT_COLOR
1636
+ #
1637
+ navbar_text_color = #51504f
1638
+
1639
+ # Define the color of navigation bar links when hovered
1640
+ #
1641
+ # Variable: AIRFLOW__WEBSERVER__NAVBAR_HOVER_COLOR
1642
+ #
1643
+ navbar_hover_color = #eee
1644
+
1645
+ # Define the color of text in the navigation bar when hovered
1646
+ #
1647
+ # Variable: AIRFLOW__WEBSERVER__NAVBAR_TEXT_HOVER_COLOR
1648
+ #
1649
+ navbar_text_hover_color = #51504f
1650
+
1651
+ # Define the color of the logo text
1652
+ #
1653
+ # Variable: AIRFLOW__WEBSERVER__NAVBAR_LOGO_TEXT_COLOR
1654
+ #
1655
+ navbar_logo_text_color = #51504f
1656
+
1657
+ # Default dagrun to show in UI
1658
+ #
1659
+ # Variable: AIRFLOW__WEBSERVER__DEFAULT_DAG_RUN_DISPLAY_NUMBER
1660
+ #
1661
+ default_dag_run_display_number = 25
1662
+
1663
+ # Enable werkzeug ``ProxyFix`` middleware for reverse proxy
1664
+ #
1665
+ # Variable: AIRFLOW__WEBSERVER__ENABLE_PROXY_FIX
1666
+ #
1667
+ enable_proxy_fix = False
1668
+
1669
+ # Number of values to trust for ``X-Forwarded-For``.
1670
+ # See `Werkzeug: X-Forwarded-For Proxy Fix
1671
+ # <https://werkzeug.palletsprojects.com/en/2.3.x/middleware/proxy_fix/>`__ for more details.
1672
+ #
1673
+ # Variable: AIRFLOW__WEBSERVER__PROXY_FIX_X_FOR
1674
+ #
1675
+ proxy_fix_x_for = 1
1676
+
1677
+ # Number of values to trust for ``X-Forwarded-Proto``.
1678
+ # See `Werkzeug: X-Forwarded-For Proxy Fix
1679
+ # <https://werkzeug.palletsprojects.com/en/2.3.x/middleware/proxy_fix/>`__ for more details.
1680
+ #
1681
+ # Variable: AIRFLOW__WEBSERVER__PROXY_FIX_X_PROTO
1682
+ #
1683
+ proxy_fix_x_proto = 1
1684
+
1685
+ # Number of values to trust for ``X-Forwarded-Host``.
1686
+ # See `Werkzeug: X-Forwarded-For Proxy Fix
1687
+ # <https://werkzeug.palletsprojects.com/en/2.3.x/middleware/proxy_fix/>`__ for more details.
1688
+ #
1689
+ # Variable: AIRFLOW__WEBSERVER__PROXY_FIX_X_HOST
1690
+ #
1691
+ proxy_fix_x_host = 1
1692
+
1693
+ # Number of values to trust for ``X-Forwarded-Port``.
1694
+ # See `Werkzeug: X-Forwarded-For Proxy Fix
1695
+ # <https://werkzeug.palletsprojects.com/en/2.3.x/middleware/proxy_fix/>`__ for more details.
1696
+ #
1697
+ # Variable: AIRFLOW__WEBSERVER__PROXY_FIX_X_PORT
1698
+ #
1699
+ proxy_fix_x_port = 1
1700
+
1701
+ # Number of values to trust for ``X-Forwarded-Prefix``.
1702
+ # See `Werkzeug: X-Forwarded-For Proxy Fix
1703
+ # <https://werkzeug.palletsprojects.com/en/2.3.x/middleware/proxy_fix/>`__ for more details.
1704
+ #
1705
+ # Variable: AIRFLOW__WEBSERVER__PROXY_FIX_X_PREFIX
1706
+ #
1707
+ proxy_fix_x_prefix = 1
1708
+
1709
+ # Set secure flag on session cookie
1710
+ #
1711
+ # Variable: AIRFLOW__WEBSERVER__COOKIE_SECURE
1712
+ #
1713
+ cookie_secure = False
1714
+
1715
+ # Set samesite policy on session cookie
1716
+ #
1717
+ # Variable: AIRFLOW__WEBSERVER__COOKIE_SAMESITE
1718
+ #
1719
+ cookie_samesite = Lax
1720
+
1721
+ # Default setting for wrap toggle on DAG code and TI log views.
1722
+ #
1723
+ # Variable: AIRFLOW__WEBSERVER__DEFAULT_WRAP
1724
+ #
1725
+ default_wrap = False
1726
+
1727
+ # Allow the UI to be rendered in a frame
1728
+ #
1729
+ # Variable: AIRFLOW__WEBSERVER__X_FRAME_ENABLED
1730
+ #
1731
+ x_frame_enabled = True
1732
+
1733
+ # Send anonymous user activity to your analytics tool
1734
+ # choose from ``google_analytics``, ``segment``, ``metarouter``, or ``matomo``
1735
+ #
1736
+ # Variable: AIRFLOW__WEBSERVER__ANALYTICS_TOOL
1737
+ #
1738
+ # analytics_tool =
1739
+
1740
+ # Unique ID of your account in the analytics tool
1741
+ #
1742
+ # Variable: AIRFLOW__WEBSERVER__ANALYTICS_ID
1743
+ #
1744
+ # analytics_id =
1745
+
1746
+ # Your instances url, only applicable to Matomo.
1747
+ #
1748
+ # Example: analytics_url = https://your.matomo.instance.com/
1749
+ #
1750
+ # Variable: AIRFLOW__WEBSERVER__ANALYTICS_URL
1751
+ #
1752
+ # analytics_url =
1753
+
1754
+ # 'Recent Tasks' stats will show for old DagRuns if set
1755
+ #
1756
+ # Variable: AIRFLOW__WEBSERVER__SHOW_RECENT_STATS_FOR_COMPLETED_RUNS
1757
+ #
1758
+ show_recent_stats_for_completed_runs = True
1759
+
1760
+ # The UI cookie lifetime in minutes. User will be logged out from UI after
1761
+ # ``[webserver] session_lifetime_minutes`` of non-activity
1762
+ #
1763
+ # Variable: AIRFLOW__WEBSERVER__SESSION_LIFETIME_MINUTES
1764
+ #
1765
+ session_lifetime_minutes = 43200
1766
+
1767
+ # Sets a custom page title for the DAGs overview page and site title for all pages
1768
+ #
1769
+ # Variable: AIRFLOW__WEBSERVER__INSTANCE_NAME
1770
+ #
1771
+ # instance_name =
1772
+
1773
+ # Whether the custom page title for the DAGs overview page contains any Markup language
1774
+ #
1775
+ # Variable: AIRFLOW__WEBSERVER__INSTANCE_NAME_HAS_MARKUP
1776
+ #
1777
+ instance_name_has_markup = False
1778
+
1779
+ # How frequently, in seconds, the DAG data will auto-refresh in graph or grid view
1780
+ # when auto-refresh is turned on
1781
+ #
1782
+ # Variable: AIRFLOW__WEBSERVER__AUTO_REFRESH_INTERVAL
1783
+ #
1784
+ auto_refresh_interval = 3
1785
+
1786
+ # Boolean for displaying warning for publicly viewable deployment
1787
+ #
1788
+ # Variable: AIRFLOW__WEBSERVER__WARN_DEPLOYMENT_EXPOSURE
1789
+ #
1790
+ warn_deployment_exposure = True
1791
+
1792
+ # Comma separated string of view events to exclude from dag audit view.
1793
+ # All other events will be added minus the ones passed here.
1794
+ # The audit logs in the db will not be affected by this parameter.
1795
+ #
1796
+ # Example: audit_view_excluded_events = cli_task_run,running,success
1797
+ #
1798
+ # Variable: AIRFLOW__WEBSERVER__AUDIT_VIEW_EXCLUDED_EVENTS
1799
+ #
1800
+ # audit_view_excluded_events =
1801
+
1802
+ # Comma separated string of view events to include in dag audit view.
1803
+ # If passed, only these events will populate the dag audit view.
1804
+ # The audit logs in the db will not be affected by this parameter.
1805
+ #
1806
+ # Example: audit_view_included_events = dagrun_cleared,failed
1807
+ #
1808
+ # Variable: AIRFLOW__WEBSERVER__AUDIT_VIEW_INCLUDED_EVENTS
1809
+ #
1810
+ # audit_view_included_events =
1811
+
1812
+ # Boolean for running SwaggerUI in the webserver.
1813
+ #
1814
+ # Variable: AIRFLOW__WEBSERVER__ENABLE_SWAGGER_UI
1815
+ #
1816
+ enable_swagger_ui = True
1817
+
1818
+ # Boolean for running Internal API in the webserver.
1819
+ #
1820
+ # Variable: AIRFLOW__WEBSERVER__RUN_INTERNAL_API
1821
+ #
1822
+ run_internal_api = False
1823
+
1824
+ # The caching algorithm used by the webserver. Must be a valid hashlib function name.
1825
+ #
1826
+ # Example: caching_hash_method = sha256
1827
+ #
1828
+ # Variable: AIRFLOW__WEBSERVER__CACHING_HASH_METHOD
1829
+ #
1830
+ caching_hash_method = md5
1831
+
1832
+ # Behavior of the trigger DAG run button for DAGs without params. ``False`` to skip and trigger
1833
+ # without displaying a form to add a **dag_run.conf**, ``True`` to always display the form.
1834
+ # The form is displayed always if parameters are defined.
1835
+ #
1836
+ # Variable: AIRFLOW__WEBSERVER__SHOW_TRIGGER_FORM_IF_NO_PARAMS
1837
+ #
1838
+ show_trigger_form_if_no_params = False
1839
+
1840
+ # Number of recent DAG run configurations in the selector on the trigger web form.
1841
+ #
1842
+ # Example: num_recent_configurations_for_trigger = 10
1843
+ #
1844
+ # Variable: AIRFLOW__WEBSERVER__NUM_RECENT_CONFIGURATIONS_FOR_TRIGGER
1845
+ #
1846
+ num_recent_configurations_for_trigger = 5
1847
+
1848
+ # A DAG author is able to provide any raw HTML into ``doc_md`` or params description in
1849
+ # ``description_md`` for text formatting. This is including potentially unsafe javascript.
1850
+ # Displaying the DAG or trigger form in web UI provides the DAG author the potential to
1851
+ # inject malicious code into clients browsers. To ensure the web UI is safe by default,
1852
+ # raw HTML is disabled by default. If you trust your DAG authors, you can enable HTML
1853
+ # support in markdown by setting this option to ``True``.
1854
+ #
1855
+ # This parameter also enables the deprecated fields ``description_html`` and
1856
+ # ``custom_html_form`` in DAG params until the feature is removed in a future version.
1857
+ #
1858
+ # Example: allow_raw_html_descriptions = False
1859
+ #
1860
+ # Variable: AIRFLOW__WEBSERVER__ALLOW_RAW_HTML_DESCRIPTIONS
1861
+ #
1862
+ allow_raw_html_descriptions = False
1863
+
1864
+ # The maximum size of the request payload (in MB) that can be sent.
1865
+ #
1866
+ # Variable: AIRFLOW__WEBSERVER__ALLOWED_PAYLOAD_SIZE
1867
+ #
1868
+ allowed_payload_size = 1.0
1869
+
1870
+ # Require confirmation when changing a DAG in the web UI. This is to prevent accidental changes
1871
+ # to a DAG that may be running on sensitive environments like production.
1872
+ # When set to ``True``, confirmation dialog will be shown when a user tries to Pause/Unpause,
1873
+ # Trigger a DAG
1874
+ #
1875
+ # Variable: AIRFLOW__WEBSERVER__REQUIRE_CONFIRMATION_DAG_CHANGE
1876
+ #
1877
+ require_confirmation_dag_change = False
1878
+
1879
+ [email]
1880
+ # Configuration email backend and whether to
1881
+ # send email alerts on retry or failure
1882
+
1883
+ # Email backend to use
1884
+ #
1885
+ # Variable: AIRFLOW__EMAIL__EMAIL_BACKEND
1886
+ #
1887
+ email_backend = airflow.utils.email.send_email_smtp
1888
+
1889
+ # Email connection to use
1890
+ #
1891
+ # Variable: AIRFLOW__EMAIL__EMAIL_CONN_ID
1892
+ #
1893
+ email_conn_id = smtp_default
1894
+
1895
+ # Whether email alerts should be sent when a task is retried
1896
+ #
1897
+ # Variable: AIRFLOW__EMAIL__DEFAULT_EMAIL_ON_RETRY
1898
+ #
1899
+ default_email_on_retry = True
1900
+
1901
+ # Whether email alerts should be sent when a task failed
1902
+ #
1903
+ # Variable: AIRFLOW__EMAIL__DEFAULT_EMAIL_ON_FAILURE
1904
+ #
1905
+ default_email_on_failure = True
1906
+
1907
+ # File that will be used as the template for Email subject (which will be rendered using Jinja2).
1908
+ # If not set, Airflow uses a base template.
1909
+ #
1910
+ # Example: subject_template = /path/to/my_subject_template_file
1911
+ #
1912
+ # Variable: AIRFLOW__EMAIL__SUBJECT_TEMPLATE
1913
+ #
1914
+ # subject_template =
1915
+
1916
+ # File that will be used as the template for Email content (which will be rendered using Jinja2).
1917
+ # If not set, Airflow uses a base template.
1918
+ #
1919
+ # Example: html_content_template = /path/to/my_html_content_template_file
1920
+ #
1921
+ # Variable: AIRFLOW__EMAIL__HTML_CONTENT_TEMPLATE
1922
+ #
1923
+ # html_content_template =
1924
+
1925
+ # Email address that will be used as sender address.
1926
+ # It can either be raw email or the complete address in a format ``Sender Name <sender@email.com>``
1927
+ #
1928
+ # Example: from_email = Airflow <airflow@example.com>
1929
+ #
1930
+ # Variable: AIRFLOW__EMAIL__FROM_EMAIL
1931
+ #
1932
+ # from_email =
1933
+
1934
+ # ssl context to use when using SMTP and IMAP SSL connections. By default, the context is "default"
1935
+ # which sets it to ``ssl.create_default_context()`` which provides the right balance between
1936
+ # compatibility and security, it however requires that certificates in your operating system are
1937
+ # updated and that SMTP/IMAP servers of yours have valid certificates that have corresponding public
1938
+ # keys installed on your machines. You can switch it to "none" if you want to disable checking
1939
+ # of the certificates, but it is not recommended as it allows MITM (man-in-the-middle) attacks
1940
+ # if your infrastructure is not sufficiently secured. It should only be set temporarily while you
1941
+ # are fixing your certificate configuration. This can be typically done by upgrading to newer
1942
+ # version of the operating system you run Airflow components on, by upgrading/refreshing proper
1943
+ # certificates in the OS or by updating certificates for your mail servers.
1944
+ #
1945
+ # Example: ssl_context = default
1946
+ #
1947
+ # Variable: AIRFLOW__EMAIL__SSL_CONTEXT
1948
+ #
1949
+ ssl_context = default
1950
+
1951
+ [smtp]
1952
+ # If you want airflow to send emails on retries, failure, and you want to use
1953
+ # the airflow.utils.email.send_email_smtp function, you have to configure an
1954
+ # smtp server here
1955
+
1956
+ # Specifies the host server address used by Airflow when sending out email notifications via SMTP.
1957
+ #
1958
+ # Variable: AIRFLOW__SMTP__SMTP_HOST
1959
+ #
1960
+ smtp_host = localhost
1961
+
1962
+ # Determines whether to use the STARTTLS command when connecting to the SMTP server.
1963
+ #
1964
+ # Variable: AIRFLOW__SMTP__SMTP_STARTTLS
1965
+ #
1966
+ smtp_starttls = True
1967
+
1968
+ # Determines whether to use an SSL connection when talking to the SMTP server.
1969
+ #
1970
+ # Variable: AIRFLOW__SMTP__SMTP_SSL
1971
+ #
1972
+ smtp_ssl = False
1973
+
1974
+ # Username to authenticate when connecting to smtp server.
1975
+ #
1976
+ # Example: smtp_user = airflow
1977
+ #
1978
+ # Variable: AIRFLOW__SMTP__SMTP_USER
1979
+ #
1980
+ # smtp_user =
1981
+
1982
+ # Password to authenticate when connecting to smtp server.
1983
+ #
1984
+ # Example: smtp_password = airflow
1985
+ #
1986
+ # Variable: AIRFLOW__SMTP__SMTP_PASSWORD
1987
+ #
1988
+ # smtp_password =
1989
+
1990
+ # Defines the port number on which Airflow connects to the SMTP server to send email notifications.
1991
+ #
1992
+ # Variable: AIRFLOW__SMTP__SMTP_PORT
1993
+ #
1994
+ smtp_port = 25
1995
+
1996
+ # Specifies the default **from** email address used when Airflow sends email notifications.
1997
+ #
1998
+ # Variable: AIRFLOW__SMTP__SMTP_MAIL_FROM
1999
+ #
2000
+ smtp_mail_from = airflow@example.com
2001
+
2002
+ # Determines the maximum time (in seconds) the Apache Airflow system will wait for a
2003
+ # connection to the SMTP server to be established.
2004
+ #
2005
+ # Variable: AIRFLOW__SMTP__SMTP_TIMEOUT
2006
+ #
2007
+ smtp_timeout = 30
2008
+
2009
+ # Defines the maximum number of times Airflow will attempt to connect to the SMTP server.
2010
+ #
2011
+ # Variable: AIRFLOW__SMTP__SMTP_RETRY_LIMIT
2012
+ #
2013
+ smtp_retry_limit = 5
2014
+
2015
+ [sentry]
2016
+ # `Sentry <https://docs.sentry.io>`__ integration. Here you can supply
2017
+ # additional configuration options based on the Python platform.
2018
+ # See `Python / Configuration / Basic Options
2019
+ # <https://docs.sentry.io/platforms/python/configuration/options/>`__ for more details.
2020
+ # Unsupported options: ``integrations``, ``in_app_include``, ``in_app_exclude``,
2021
+ # ``ignore_errors``, ``before_breadcrumb``, ``transport``.
2022
+
2023
+ # Enable error reporting to Sentry
2024
+ #
2025
+ # Variable: AIRFLOW__SENTRY__SENTRY_ON
2026
+ #
2027
+ sentry_on = false
2028
+
2029
+ #
2030
+ # Variable: AIRFLOW__SENTRY__SENTRY_DSN
2031
+ #
2032
+ sentry_dsn =
2033
+
2034
+ # Dotted path to a before_send function that the sentry SDK should be configured to use.
2035
+ #
2036
+ # Variable: AIRFLOW__SENTRY__BEFORE_SEND
2037
+ #
2038
+ # before_send =
2039
+
2040
+ [scheduler]
2041
+ # Task instances listen for external kill signal (when you clear tasks
2042
+ # from the CLI or the UI), this defines the frequency at which they should
2043
+ # listen (in seconds).
2044
+ #
2045
+ # Variable: AIRFLOW__SCHEDULER__JOB_HEARTBEAT_SEC
2046
+ #
2047
+ job_heartbeat_sec = 5
2048
+
2049
+ # The scheduler constantly tries to trigger new tasks (look at the
2050
+ # scheduler section in the docs for more information). This defines
2051
+ # how often the scheduler should run (in seconds).
2052
+ #
2053
+ # Variable: AIRFLOW__SCHEDULER__SCHEDULER_HEARTBEAT_SEC
2054
+ #
2055
+ scheduler_heartbeat_sec = 5
2056
+
2057
+ # The frequency (in seconds) at which the LocalTaskJob should send heartbeat signals to the
2058
+ # scheduler to notify it's still alive. If this value is set to 0, the heartbeat interval will default
2059
+ # to the value of ``[scheduler] scheduler_zombie_task_threshold``.
2060
+ #
2061
+ # Variable: AIRFLOW__SCHEDULER__LOCAL_TASK_JOB_HEARTBEAT_SEC
2062
+ #
2063
+ local_task_job_heartbeat_sec = 0
2064
+
2065
+ # The number of times to try to schedule each DAG file
2066
+ # -1 indicates unlimited number
2067
+ #
2068
+ # Variable: AIRFLOW__SCHEDULER__NUM_RUNS
2069
+ #
2070
+ num_runs = -1
2071
+
2072
+ # Controls how long the scheduler will sleep between loops, but if there was nothing to do
2073
+ # in the loop; i.e. if it scheduled something then it will start the next loop
2074
+ # iteration straight away.
2075
+ #
2076
+ # Variable: AIRFLOW__SCHEDULER__SCHEDULER_IDLE_SLEEP_TIME
2077
+ #
2078
+ scheduler_idle_sleep_time = 1
2079
+
2080
+ # Number of seconds after which a DAG file is parsed. The DAG file is parsed every
2081
+ # ``[scheduler] min_file_process_interval`` number of seconds. Updates to DAGs are reflected after
2082
+ # this interval. Keeping this number low will increase CPU usage.
2083
+ #
2084
+ # Variable: AIRFLOW__SCHEDULER__MIN_FILE_PROCESS_INTERVAL
2085
+ #
2086
+ min_file_process_interval = 30
2087
+
2088
+ # How often (in seconds) to check for stale DAGs (DAGs which are no longer present in
2089
+ # the expected files) which should be deactivated, as well as datasets that are no longer
2090
+ # referenced and should be marked as orphaned.
2091
+ #
2092
+ # Variable: AIRFLOW__SCHEDULER__PARSING_CLEANUP_INTERVAL
2093
+ #
2094
+ parsing_cleanup_interval = 60
2095
+
2096
+ # How long (in seconds) to wait after we have re-parsed a DAG file before deactivating stale
2097
+ # DAGs (DAGs which are no longer present in the expected files). The reason why we need
2098
+ # this threshold is to account for the time between when the file is parsed and when the
2099
+ # DAG is loaded. The absolute maximum that this could take is ``[core] dag_file_processor_timeout``,
2100
+ # but when you have a long timeout configured, it results in a significant delay in the
2101
+ # deactivation of stale dags.
2102
+ #
2103
+ # Variable: AIRFLOW__SCHEDULER__STALE_DAG_THRESHOLD
2104
+ #
2105
+ stale_dag_threshold = 50
2106
+
2107
+ # How often (in seconds) to scan the DAGs directory for new files. Default to 5 minutes.
2108
+ #
2109
+ # Variable: AIRFLOW__SCHEDULER__DAG_DIR_LIST_INTERVAL
2110
+ #
2111
+ dag_dir_list_interval = 300
2112
+
2113
+ # How often should stats be printed to the logs. Setting to 0 will disable printing stats
2114
+ #
2115
+ # Variable: AIRFLOW__SCHEDULER__PRINT_STATS_INTERVAL
2116
+ #
2117
+ print_stats_interval = 30
2118
+
2119
+ # How often (in seconds) should pool usage stats be sent to StatsD (if statsd_on is enabled)
2120
+ #
2121
+ # Variable: AIRFLOW__SCHEDULER__POOL_METRICS_INTERVAL
2122
+ #
2123
+ pool_metrics_interval = 5.0
2124
+
2125
+ # If the last scheduler heartbeat happened more than ``[scheduler] scheduler_health_check_threshold``
2126
+ # ago (in seconds), scheduler is considered unhealthy.
2127
+ # This is used by the health check in the **/health** endpoint and in ``airflow jobs check`` CLI
2128
+ # for SchedulerJob.
2129
+ #
2130
+ # Variable: AIRFLOW__SCHEDULER__SCHEDULER_HEALTH_CHECK_THRESHOLD
2131
+ #
2132
+ scheduler_health_check_threshold = 30
2133
+
2134
+ # When you start a scheduler, airflow starts a tiny web server
2135
+ # subprocess to serve a health check if this is set to ``True``
2136
+ #
2137
+ # Variable: AIRFLOW__SCHEDULER__ENABLE_HEALTH_CHECK
2138
+ #
2139
+ enable_health_check = False
2140
+
2141
+ # When you start a scheduler, airflow starts a tiny web server
2142
+ # subprocess to serve a health check on this host
2143
+ #
2144
+ # Variable: AIRFLOW__SCHEDULER__SCHEDULER_HEALTH_CHECK_SERVER_HOST
2145
+ #
2146
+ scheduler_health_check_server_host = 0.0.0.0
2147
+
2148
+ # When you start a scheduler, airflow starts a tiny web server
2149
+ # subprocess to serve a health check on this port
2150
+ #
2151
+ # Variable: AIRFLOW__SCHEDULER__SCHEDULER_HEALTH_CHECK_SERVER_PORT
2152
+ #
2153
+ scheduler_health_check_server_port = 8974
2154
+
2155
+ # How often (in seconds) should the scheduler check for orphaned tasks and SchedulerJobs
2156
+ #
2157
+ # Variable: AIRFLOW__SCHEDULER__ORPHANED_TASKS_CHECK_INTERVAL
2158
+ #
2159
+ orphaned_tasks_check_interval = 300.0
2160
+
2161
+ # Determines the directory where logs for the child processes of the scheduler will be stored
2162
+ #
2163
+ # Variable: AIRFLOW__SCHEDULER__CHILD_PROCESS_LOG_DIRECTORY
2164
+ #
2165
+ child_process_log_directory = /kaggle/working/BTC-USDT-ETL-Pipeline/airflow/logs/scheduler
2166
+
2167
+ # Local task jobs periodically heartbeat to the DB. If the job has
2168
+ # not sent a heartbeat in this many seconds, the scheduler will mark the
2169
+ # associated task instance as failed and will re-schedule the task.
2170
+ #
2171
+ # Variable: AIRFLOW__SCHEDULER__SCHEDULER_ZOMBIE_TASK_THRESHOLD
2172
+ #
2173
+ scheduler_zombie_task_threshold = 300
2174
+
2175
+ # How often (in seconds) should the scheduler check for zombie tasks.
2176
+ #
2177
+ # Variable: AIRFLOW__SCHEDULER__ZOMBIE_DETECTION_INTERVAL
2178
+ #
2179
+ zombie_detection_interval = 10.0
2180
+
2181
+ # Turn off scheduler catchup by setting this to ``False``.
2182
+ # Default behavior is unchanged and
2183
+ # Command Line Backfills still work, but the scheduler
2184
+ # will not do scheduler catchup if this is ``False``,
2185
+ # however it can be set on a per DAG basis in the
2186
+ # DAG definition (catchup)
2187
+ #
2188
+ # Variable: AIRFLOW__SCHEDULER__CATCHUP_BY_DEFAULT
2189
+ #
2190
+ catchup_by_default = True
2191
+
2192
+ # Setting this to ``True`` will make first task instance of a task
2193
+ # ignore depends_on_past setting. A task instance will be considered
2194
+ # as the first task instance of a task when there is no task instance
2195
+ # in the DB with an execution_date earlier than it, i.e. no manual marking
2196
+ # success will be needed for a newly added task to be scheduled.
2197
+ #
2198
+ # Variable: AIRFLOW__SCHEDULER__IGNORE_FIRST_DEPENDS_ON_PAST_BY_DEFAULT
2199
+ #
2200
+ ignore_first_depends_on_past_by_default = True
2201
+
2202
+ # This changes the batch size of queries in the scheduling main loop.
2203
+ # This should not be greater than ``[core] parallelism``.
2204
+ # If this is too high, SQL query performance may be impacted by
2205
+ # complexity of query predicate, and/or excessive locking.
2206
+ # Additionally, you may hit the maximum allowable query length for your db.
2207
+ # Set this to 0 to use the value of ``[core] parallelism``
2208
+ #
2209
+ # Variable: AIRFLOW__SCHEDULER__MAX_TIS_PER_QUERY
2210
+ #
2211
+ max_tis_per_query = 16
2212
+
2213
+ # Should the scheduler issue ``SELECT ... FOR UPDATE`` in relevant queries.
2214
+ # If this is set to ``False`` then you should not run more than a single
2215
+ # scheduler at once
2216
+ #
2217
+ # Variable: AIRFLOW__SCHEDULER__USE_ROW_LEVEL_LOCKING
2218
+ #
2219
+ use_row_level_locking = True
2220
+
2221
+ # Max number of DAGs to create DagRuns for per scheduler loop.
2222
+ #
2223
+ # Variable: AIRFLOW__SCHEDULER__MAX_DAGRUNS_TO_CREATE_PER_LOOP
2224
+ #
2225
+ max_dagruns_to_create_per_loop = 10
2226
+
2227
+ # How many DagRuns should a scheduler examine (and lock) when scheduling
2228
+ # and queuing tasks.
2229
+ #
2230
+ # Variable: AIRFLOW__SCHEDULER__MAX_DAGRUNS_PER_LOOP_TO_SCHEDULE
2231
+ #
2232
+ max_dagruns_per_loop_to_schedule = 20
2233
+
2234
+ # Should the Task supervisor process perform a "mini scheduler" to attempt to schedule more tasks of the
2235
+ # same DAG. Leaving this on will mean tasks in the same DAG execute quicker, but might starve out other
2236
+ # dags in some circumstances
2237
+ #
2238
+ # Variable: AIRFLOW__SCHEDULER__SCHEDULE_AFTER_TASK_EXECUTION
2239
+ #
2240
+ schedule_after_task_execution = True
2241
+
2242
+ # The scheduler reads dag files to extract the airflow modules that are going to be used,
2243
+ # and imports them ahead of time to avoid having to re-do it for each parsing process.
2244
+ # This flag can be set to ``False`` to disable this behavior in case an airflow module needs
2245
+ # to be freshly imported each time (at the cost of increased DAG parsing time).
2246
+ #
2247
+ # Variable: AIRFLOW__SCHEDULER__PARSING_PRE_IMPORT_MODULES
2248
+ #
2249
+ parsing_pre_import_modules = True
2250
+
2251
+ # The scheduler can run multiple processes in parallel to parse dags.
2252
+ # This defines how many processes will run.
2253
+ #
2254
+ # Variable: AIRFLOW__SCHEDULER__PARSING_PROCESSES
2255
+ #
2256
+ parsing_processes = 2
2257
+
2258
+ # One of ``modified_time``, ``random_seeded_by_host`` and ``alphabetical``.
2259
+ # The scheduler will list and sort the dag files to decide the parsing order.
2260
+ #
2261
+ # * ``modified_time``: Sort by modified time of the files. This is useful on large scale to parse the
2262
+ # recently modified DAGs first.
2263
+ # * ``random_seeded_by_host``: Sort randomly across multiple Schedulers but with same order on the
2264
+ # same host. This is useful when running with Scheduler in HA mode where each scheduler can
2265
+ # parse different DAG files.
2266
+ # * ``alphabetical``: Sort by filename
2267
+ #
2268
+ # Variable: AIRFLOW__SCHEDULER__FILE_PARSING_SORT_MODE
2269
+ #
2270
+ file_parsing_sort_mode = modified_time
2271
+
2272
+ # Whether the dag processor is running as a standalone process or it is a subprocess of a scheduler
2273
+ # job.
2274
+ #
2275
+ # Variable: AIRFLOW__SCHEDULER__STANDALONE_DAG_PROCESSOR
2276
+ #
2277
+ standalone_dag_processor = False
2278
+
2279
+ # Only applicable if ``[scheduler] standalone_dag_processor`` is true and callbacks are stored
2280
+ # in database. Contains maximum number of callbacks that are fetched during a single loop.
2281
+ #
2282
+ # Variable: AIRFLOW__SCHEDULER__MAX_CALLBACKS_PER_LOOP
2283
+ #
2284
+ max_callbacks_per_loop = 20
2285
+
2286
+ # Only applicable if ``[scheduler] standalone_dag_processor`` is true.
2287
+ # Time in seconds after which dags, which were not updated by Dag Processor are deactivated.
2288
+ #
2289
+ # Variable: AIRFLOW__SCHEDULER__DAG_STALE_NOT_SEEN_DURATION
2290
+ #
2291
+ dag_stale_not_seen_duration = 600
2292
+
2293
+ # Turn off scheduler use of cron intervals by setting this to ``False``.
2294
+ # DAGs submitted manually in the web UI or with trigger_dag will still run.
2295
+ #
2296
+ # Variable: AIRFLOW__SCHEDULER__USE_JOB_SCHEDULE
2297
+ #
2298
+ use_job_schedule = True
2299
+
2300
+ # Allow externally triggered DagRuns for Execution Dates in the future
2301
+ # Only has effect if schedule_interval is set to None in DAG
2302
+ #
2303
+ # Variable: AIRFLOW__SCHEDULER__ALLOW_TRIGGER_IN_FUTURE
2304
+ #
2305
+ allow_trigger_in_future = False
2306
+
2307
+ # How often to check for expired trigger requests that have not run yet.
2308
+ #
2309
+ # Variable: AIRFLOW__SCHEDULER__TRIGGER_TIMEOUT_CHECK_INTERVAL
2310
+ #
2311
+ trigger_timeout_check_interval = 15
2312
+
2313
+ # Amount of time a task can be in the queued state before being retried or set to failed.
2314
+ #
2315
+ # Variable: AIRFLOW__SCHEDULER__TASK_QUEUED_TIMEOUT
2316
+ #
2317
+ task_queued_timeout = 600.0
2318
+
2319
+ # How often to check for tasks that have been in the queued state for
2320
+ # longer than ``[scheduler] task_queued_timeout``.
2321
+ #
2322
+ # Variable: AIRFLOW__SCHEDULER__TASK_QUEUED_TIMEOUT_CHECK_INTERVAL
2323
+ #
2324
+ task_queued_timeout_check_interval = 120.0
2325
+
2326
+ # The run_id pattern used to verify the validity of user input to the run_id parameter when
2327
+ # triggering a DAG. This pattern cannot change the pattern used by scheduler to generate run_id
2328
+ # for scheduled DAG runs or DAG runs triggered without changing the run_id parameter.
2329
+ #
2330
+ # Variable: AIRFLOW__SCHEDULER__ALLOWED_RUN_ID_PATTERN
2331
+ #
2332
+ allowed_run_id_pattern = ^[A-Za-z0-9_.~:+-]+$
2333
+
2334
+ # Whether to create DAG runs that span an interval or one single point in time for cron schedules, when
2335
+ # a cron string is provided to ``schedule`` argument of a DAG.
2336
+ #
2337
+ # * ``True``: **CronDataIntervalTimetable** is used, which is suitable
2338
+ # for DAGs with well-defined data interval. You get contiguous intervals from the end of the previous
2339
+ # interval up to the scheduled datetime.
2340
+ # * ``False``: **CronTriggerTimetable** is used, which is closer to the behavior of cron itself.
2341
+ #
2342
+ # Notably, for **CronTriggerTimetable**, the logical date is the same as the time the DAG Run will
2343
+ # try to schedule, while for **CronDataIntervalTimetable**, the logical date is the beginning of
2344
+ # the data interval, but the DAG Run will try to schedule at the end of the data interval.
2345
+ #
2346
+ # Variable: AIRFLOW__SCHEDULER__CREATE_CRON_DATA_INTERVALS
2347
+ #
2348
+ create_cron_data_intervals = True
2349
+
2350
+ [triggerer]
2351
+ # How many triggers a single Triggerer will run at once, by default.
2352
+ #
2353
+ # Variable: AIRFLOW__TRIGGERER__DEFAULT_CAPACITY
2354
+ #
2355
+ default_capacity = 1000
2356
+
2357
+ # How often to heartbeat the Triggerer job to ensure it hasn't been killed.
2358
+ #
2359
+ # Variable: AIRFLOW__TRIGGERER__JOB_HEARTBEAT_SEC
2360
+ #
2361
+ job_heartbeat_sec = 5
2362
+
2363
+ # If the last triggerer heartbeat happened more than ``[triggerer] triggerer_health_check_threshold``
2364
+ # ago (in seconds), triggerer is considered unhealthy.
2365
+ # This is used by the health check in the **/health** endpoint and in ``airflow jobs check`` CLI
2366
+ # for TriggererJob.
2367
+ #
2368
+ # Variable: AIRFLOW__TRIGGERER__TRIGGERER_HEALTH_CHECK_THRESHOLD
2369
+ #
2370
+ triggerer_health_check_threshold = 30
2371
+
2372
+ [kerberos]
2373
+ # Location of your ccache file once kinit has been performed.
2374
+ #
2375
+ # Variable: AIRFLOW__KERBEROS__CCACHE
2376
+ #
2377
+ ccache = /tmp/airflow_krb5_ccache
2378
+
2379
+ # gets augmented with fqdn
2380
+ #
2381
+ # Variable: AIRFLOW__KERBEROS__PRINCIPAL
2382
+ #
2383
+ principal = airflow
2384
+
2385
+ # Determines the frequency at which initialization or re-initialization processes occur.
2386
+ #
2387
+ # Variable: AIRFLOW__KERBEROS__REINIT_FREQUENCY
2388
+ #
2389
+ reinit_frequency = 3600
2390
+
2391
+ # Path to the kinit executable
2392
+ #
2393
+ # Variable: AIRFLOW__KERBEROS__KINIT_PATH
2394
+ #
2395
+ kinit_path = kinit
2396
+
2397
+ # Designates the path to the Kerberos keytab file for the Airflow user
2398
+ #
2399
+ # Variable: AIRFLOW__KERBEROS__KEYTAB
2400
+ #
2401
+ keytab = airflow.keytab
2402
+
2403
+ # Allow to disable ticket forwardability.
2404
+ #
2405
+ # Variable: AIRFLOW__KERBEROS__FORWARDABLE
2406
+ #
2407
+ forwardable = True
2408
+
2409
+ # Allow to remove source IP from token, useful when using token behind NATted Docker host.
2410
+ #
2411
+ # Variable: AIRFLOW__KERBEROS__INCLUDE_IP
2412
+ #
2413
+ include_ip = True
2414
+
2415
+ [sensors]
2416
+ # Sensor default timeout, 7 days by default (7 * 24 * 60 * 60).
2417
+ #
2418
+ # Variable: AIRFLOW__SENSORS__DEFAULT_TIMEOUT
2419
+ #
2420
+ default_timeout = 604800
2421
+
2422
+ [usage_data_collection]
2423
+ # Airflow integrates `Scarf <https://about.scarf.sh/>`__ to collect basic platform and usage data
2424
+ # during operation. This data assists Airflow maintainers in better understanding how Airflow is used.
2425
+ # Insights gained from this telemetry are critical for prioritizing patches, minor releases, and
2426
+ # security fixes. Additionally, this information supports key decisions related to the development road map.
2427
+ # Check the FAQ doc for more information on what data is collected.
2428
+ #
2429
+ # Deployments can opt-out of analytics by setting the ``enabled`` option
2430
+ # to ``False``, or the ``SCARF_ANALYTICS=false`` environment variable.
2431
+ # Individual users can easily opt-out of analytics in various ways documented in the
2432
+ # `Scarf Do Not Track docs <https://docs.scarf.sh/gateway/#do-not-track>`__.
2433
+
2434
+ # Enable or disable usage data collection and sending.
2435
+ #
2436
+ # Variable: AIRFLOW__USAGE_DATA_COLLECTION__ENABLED
2437
+ #
2438
+ enabled = True
2439
+
2440
+ [common.io]
2441
+ # Common IO configuration section
2442
+
2443
+ # Path to a location on object storage where XComs can be stored in url format.
2444
+ #
2445
+ # Example: xcom_objectstorage_path = s3://conn_id@bucket/path
2446
+ #
2447
+ # Variable: AIRFLOW__COMMON.IO__XCOM_OBJECTSTORAGE_PATH
2448
+ #
2449
+ xcom_objectstorage_path =
2450
+
2451
+ # Threshold in bytes for storing XComs in object storage. -1 means always store in the
2452
+ # database. 0 means always store in object storage. Any positive number means
2453
+ # it will be stored in object storage if the size of the value is greater than the threshold.
2454
+ #
2455
+ # Example: xcom_objectstorage_threshold = 1000000
2456
+ #
2457
+ # Variable: AIRFLOW__COMMON.IO__XCOM_OBJECTSTORAGE_THRESHOLD
2458
+ #
2459
+ xcom_objectstorage_threshold = -1
2460
+
2461
+ # Compression algorithm to use when storing XComs in object storage. Supported algorithms
2462
+ # are a.o.: snappy, zip, gzip, bz2, and lzma. If not specified, no compression will be used.
2463
+ # Note that the compression algorithm must be available in the Python installation (e.g.
2464
+ # python-snappy for snappy). Zip, gz, bz2 are available by default.
2465
+ #
2466
+ # Example: xcom_objectstorage_compression = gz
2467
+ #
2468
+ # Variable: AIRFLOW__COMMON.IO__XCOM_OBJECTSTORAGE_COMPRESSION
2469
+ #
2470
+ xcom_objectstorage_compression =
2471
+
2472
+ [fab]
2473
+ # This section contains configs specific to FAB provider.
2474
+
2475
+ # Boolean for enabling rate limiting on authentication endpoints.
2476
+ #
2477
+ # Variable: AIRFLOW__FAB__AUTH_RATE_LIMITED
2478
+ #
2479
+ auth_rate_limited = True
2480
+
2481
+ # Rate limit for authentication endpoints.
2482
+ #
2483
+ # Variable: AIRFLOW__FAB__AUTH_RATE_LIMIT
2484
+ #
2485
+ auth_rate_limit = 5 per 40 second
2486
+
2487
+ # Update FAB permissions and sync security manager roles
2488
+ # on webserver startup
2489
+ #
2490
+ # Variable: AIRFLOW__FAB__UPDATE_FAB_PERMS
2491
+ #
2492
+ update_fab_perms = True
2493
+
2494
+ [imap]
2495
+ # Options for IMAP provider.
2496
+
2497
+ # ssl_context =
2498
+
airflow/airflow.db ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:33e2a2d0de019f573198b2bd423a7d642c3f01f321c8527c3e8f04b9ee73d60a
3
+ size 1282048
airflow/dags/new6.py ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from airflow import DAG
from airflow.operators.python import PythonOperator
from datetime import datetime, timedelta, timezone
import os
import sys

# Add project root to path so `components` can be imported from the DAGs folder.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))

from components.btcusdt_ingest_data import crawl_data_from_sources
from components.datalake_cr import up_to_minio
from components.process_data import extract_from_minio, transform_financial_data
from components.duckdb_api import push_to_duckdb
from components.duckdb2csv import duckdb_to_csv
from components.model.training import train_lstm_model
from components.model.evaluation import metric_and_predict_lstm_model
from components.utils.file_utils import (
    load_extract_config,
    define_server_filenames,
    load_pipeline_config
)

# Default arguments shared by all four DAGs below.
default_args = {
    'owner': 'airflow',
    'start_date': datetime(2025, 10, 7, 20, 0),
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

# Define DAGs: crawl -> lake, lake -> warehouse, model training, CSV export.
dag_1 = DAG('crawl_to_minio', default_args=default_args,
            schedule_interval='@monthly', max_active_runs=1, catchup=False)
dag_2 = DAG('etl_to_duckdb', default_args=default_args,
            schedule_interval='@monthly', max_active_runs=1, catchup=False)
dag_3 = DAG('lstm_forecast', default_args=default_args,
            schedule_interval='@monthly', max_active_runs=1, catchup=False)
dag_4 = DAG('duckdb_to_csv_export', default_args=default_args,
            schedule_interval='@monthly', max_active_runs=1, catchup=False)

# Load pipeline configuration (paths, MinIO bucket, ...).
pipeline_config = load_pipeline_config()

# ---------------------------------------------------------------------------
# DAG 1: Crawl Binance data and upload it to MinIO.
# ---------------------------------------------------------------------------
download_binance_csv = PythonOperator(
    task_id='download_binance_csv',
    python_callable=crawl_data_from_sources,
    dag=dag_1
)

extract_filenames_task = PythonOperator(
    task_id='extract_filenames',
    python_callable=define_server_filenames,
    dag=dag_1
)

upload_to_minio_storage = PythonOperator(
    task_id='upload_to_minio',
    python_callable=up_to_minio,
    op_kwargs={
        # XCom pulls are rendered as strings; up_to_minio parses them back
        # into lists.
        'client_files': '{{ ti.xcom_pull(task_ids="download_binance_csv") }}',
        'server_files': '{{ ti.xcom_pull(task_ids="extract_filenames") }}',
        'bucket_name': pipeline_config['minio']['bucket_name']
    },
    dag=dag_1
)

# ---------------------------------------------------------------------------
# DAG 2: Extract from MinIO, transform, and load into DuckDB.
# ---------------------------------------------------------------------------
extract_data = PythonOperator(
    task_id='extract_data',
    python_callable=extract_from_minio,
    op_kwargs={
        'bucket_name': pipeline_config['minio']['bucket_name'],
        'file_names': load_extract_config("extract_data.yml")["files"]
    },
    dag=dag_2
)

transform_data = PythonOperator(
    task_id='transform_data',
    python_callable=transform_financial_data,
    op_kwargs={
        'parquet_file_paths': '{{ ti.xcom_pull(task_ids="extract_data") }}',
        'temp_parquet_path': pipeline_config['paths']['temp_parquet_path'],
        'output_parquet_path': pipeline_config['paths']['output_parquet_path']
    },
    dag=dag_2
)

push_to_warehouse = PythonOperator(
    task_id='export_duckdb',
    python_callable=push_to_duckdb,
    op_kwargs={
        'duckdb_path': pipeline_config['paths']['duckdb_path'],
        'parquet_path': '{{ ti.xcom_pull(task_ids="transform_data") }}'
    },
    dag=dag_2
)

# ---------------------------------------------------------------------------
# DAG 3: LSTM forecasting (train, then evaluate/predict).
# ---------------------------------------------------------------------------
train_lstm = PythonOperator(
    task_id='train_lstm_model',
    python_callable=train_lstm_model,
    dag=dag_3
)

metric_and_predict_lstm = PythonOperator(
    task_id='metric_and_predict_lstm',
    python_callable=metric_and_predict_lstm_model,
    op_kwargs={
        'train_result': '{{ ti.xcom_pull(task_ids="train_lstm_model") }}'
    },
    provide_context=True,  # Still needed for Jinja templating in op_kwargs
    dag=dag_3
)

# ---------------------------------------------------------------------------
# DAG 4: Export the DuckDB warehouse table to CSV for analytics.
# ---------------------------------------------------------------------------
export_duckdb_to_csv = PythonOperator(
    task_id='export_duckdb_to_csv',
    python_callable=duckdb_to_csv,
    op_kwargs={
        'duckdb_path': pipeline_config['paths']['duckdb_path'],
        'output_csv_path': pipeline_config['paths']['output_csv_path']
    },
    dag=dag_4
)

# Task dependencies within each DAG.
download_binance_csv >> extract_filenames_task >> upload_to_minio_storage
extract_data >> transform_data >> push_to_warehouse
train_lstm >> metric_and_predict_lstm
export_duckdb_to_csv
airflow/webserver_config.py ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # Licensed to the Apache Software Foundation (ASF) under one
3
+ # or more contributor license agreements. See the NOTICE file
4
+ # distributed with this work for additional information
5
+ # regarding copyright ownership. The ASF licenses this file
6
+ # to you under the Apache License, Version 2.0 (the
7
+ # "License"); you may not use this file except in compliance
8
+ # with the License. You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing,
13
+ # software distributed under the License is distributed on an
14
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ # KIND, either express or implied. See the License for the
16
+ # specific language governing permissions and limitations
17
+ # under the License.
18
+ """Default configuration for the Airflow webserver."""
19
+
20
+ from __future__ import annotations
21
+
22
+ import os
23
+
24
+ from flask_appbuilder.const import AUTH_DB
25
+
26
+ # from airflow.www.fab_security.manager import AUTH_LDAP
27
+ # from airflow.www.fab_security.manager import AUTH_OAUTH
28
+ # from airflow.www.fab_security.manager import AUTH_OID
29
+ # from airflow.www.fab_security.manager import AUTH_REMOTE_USER
30
+
31
+
32
+ basedir = os.path.abspath(os.path.dirname(__file__))
33
+
34
+ # Flask-WTF flag for CSRF
35
+ WTF_CSRF_ENABLED = True
36
+ WTF_CSRF_TIME_LIMIT = None
37
+
38
+ # ----------------------------------------------------
39
+ # AUTHENTICATION CONFIG
40
+ # ----------------------------------------------------
41
+ # For details on how to set up each of the following authentication, see
42
+ # http://flask-appbuilder.readthedocs.io/en/latest/security.html# authentication-methods
43
+ # for details.
44
+
45
+ # The authentication type
46
+ # AUTH_OID : Is for OpenID
47
+ # AUTH_DB : Is for database
48
+ # AUTH_LDAP : Is for LDAP
49
+ # AUTH_REMOTE_USER : Is for using REMOTE_USER from web server
50
+ # AUTH_OAUTH : Is for OAuth
51
+ AUTH_TYPE = AUTH_DB
52
+
53
+ # Uncomment to setup Full admin role name
54
+ # AUTH_ROLE_ADMIN = 'Admin'
55
+
56
+ # Uncomment and set to desired role to enable access without authentication
57
+ # AUTH_ROLE_PUBLIC = 'Viewer'
58
+
59
+ # Will allow user self registration
60
+ # AUTH_USER_REGISTRATION = True
61
+
62
+ # The recaptcha it's automatically enabled for user self registration is active and the keys are necessary
63
+ # RECAPTCHA_PRIVATE_KEY = PRIVATE_KEY
64
+ # RECAPTCHA_PUBLIC_KEY = PUBLIC_KEY
65
+
66
+ # Config for Flask-Mail necessary for user self registration
67
+ # MAIL_SERVER = 'smtp.gmail.com'
68
+ # MAIL_USE_TLS = True
69
+ # MAIL_USERNAME = 'yourappemail@gmail.com'
70
+ # MAIL_PASSWORD = 'passwordformail'
71
+ # MAIL_DEFAULT_SENDER = 'sender@gmail.com'
72
+
73
+ # The default user self registration role
74
+ # AUTH_USER_REGISTRATION_ROLE = "Public"
75
+
76
+ # When using OAuth Auth, uncomment to setup provider(s) info
77
+ # Google OAuth example:
78
+ # OAUTH_PROVIDERS = [{
79
+ # 'name':'google',
80
+ # 'token_key':'access_token',
81
+ # 'icon':'fa-google',
82
+ # 'remote_app': {
83
+ # 'api_base_url':'https://www.googleapis.com/oauth2/v2/',
84
+ # 'client_kwargs':{
85
+ # 'scope': 'email profile'
86
+ # },
87
+ # 'access_token_url':'https://accounts.google.com/o/oauth2/token',
88
+ # 'authorize_url':'https://accounts.google.com/o/oauth2/auth',
89
+ # 'request_token_url': None,
90
+ # 'client_id': GOOGLE_KEY,
91
+ # 'client_secret': GOOGLE_SECRET_KEY,
92
+ # }
93
+ # }]
94
+
95
+ # When using LDAP Auth, setup the ldap server
96
+ # AUTH_LDAP_SERVER = "ldap://ldapserver.new"
97
+
98
+ # When using OpenID Auth, uncomment to setup OpenID providers.
99
+ # example for OpenID authentication
100
+ # OPENID_PROVIDERS = [
101
+ # { 'name': 'Yahoo', 'url': 'https://me.yahoo.com' },
102
+ # { 'name': 'AOL', 'url': 'http://openid.aol.com/<username>' },
103
+ # { 'name': 'Flickr', 'url': 'http://www.flickr.com/<username>' },
104
+ # { 'name': 'MyOpenID', 'url': 'https://www.myopenid.com' }]
105
+
106
+ # ----------------------------------------------------
107
+ # Theme CONFIG
108
+ # ----------------------------------------------------
109
+ # Flask App Builder comes up with a number of predefined themes
110
+ # that you can use for Apache Airflow.
111
+ # http://flask-appbuilder.readthedocs.io/en/latest/customizing.html#changing-themes
112
+ # Please make sure to remove "navbar_color" configuration from airflow.cfg
113
+ # in order to fully utilize the theme. (or use that property in conjunction with theme)
114
+ # APP_THEME = "bootstrap-theme.css" # default bootstrap
115
+ # APP_THEME = "amelia.css"
116
+ # APP_THEME = "cerulean.css"
117
+ # APP_THEME = "cosmo.css"
118
+ # APP_THEME = "cyborg.css"
119
+ # APP_THEME = "darkly.css"
120
+ # APP_THEME = "flatly.css"
121
+ # APP_THEME = "journal.css"
122
+ # APP_THEME = "lumen.css"
123
+ # APP_THEME = "paper.css"
124
+ # APP_THEME = "readable.css"
125
+ # APP_THEME = "sandstone.css"
126
+ # APP_THEME = "simplex.css"
127
+ # APP_THEME = "slate.css"
128
+ # APP_THEME = "solar.css"
129
+ # APP_THEME = "spacelab.css"
130
+ # APP_THEME = "superhero.css"
131
+ # APP_THEME = "united.css"
132
+ # APP_THEME = "yeti.css"
analytics/BTCUSDT_report.pdf ADDED
Binary file (60.6 kB). View file
 
ckpts/.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ *.keras
2
+ *.pkl
ckpts/model_2025-10-28-11-33-51-(+07).h5 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:86e23253135898780db99f96e8ff9625297d3d3389bd5f928a024f49c59e547c
3
+ size 2626736
ckpts/scaler_2025-10-28-11-33-51-(+07).pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3e9ec47a4549e291a30cbac7225656f56d65069b62ead6d139323ec28a3933f3
3
+ size 523
components/__init__.py ADDED
File without changes
components/btcusdt_ingest_data.py ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ import zipfile
3
+ import io
4
+ import os
5
+ import yaml
6
+ import pandas as pd
7
+ from datetime import datetime
8
+ from pathlib import Path
9
+ from typing import Optional
10
+
11
def add_column_headers(csv_data: bytes, expected_columns: list) -> bytes:
    """Attach column headers to headerless CSV bytes.

    Args:
        csv_data: Raw CSV content without a header row.
        expected_columns: Column names to assign; must match the CSV's
            column count exactly.

    Returns:
        bytes: UTF-8 encoded CSV content with the header row prepended.

    Raises:
        Exception: If the CSV cannot be parsed or the column count does not
            match ``expected_columns`` (the underlying error is chained).
    """
    try:
        # Read CSV data into a DataFrame without headers.
        df = pd.read_csv(io.BytesIO(csv_data), header=None)

        # Validate column count before assigning names.
        if len(df.columns) != len(expected_columns):
            raise ValueError(
                f"CSV has {len(df.columns)} columns, expected {len(expected_columns)}"
            )

        df.columns = expected_columns

        # Serialize back to CSV bytes, now including the header row.
        csv_buffer = io.StringIO()
        df.to_csv(csv_buffer, index=False)
        return csv_buffer.getvalue().encode('utf-8')

    except pd.errors.ParserError as e:
        # Chain the cause so the original parser traceback is preserved.
        raise Exception(f"Failed to parse CSV data: {e}") from e
    except Exception as e:
        raise Exception(f"Failed to process CSV with headers: {e}") from e
35
+
36
def download_and_extract_binance_data(url: str, output_path: str = "temp/input.csv") -> pd.DataFrame:
    """Download a Binance kline ZIP archive, extract its CSV, and load it.

    Args:
        url: URL of the ZIP archive to download.
        output_path: Path where the extracted CSV is written.

    Returns:
        pd.DataFrame: The extracted CSV loaded with pandas (first data row
        is treated as the header, matching the original behavior).

    Raises:
        Exception: If the download, extraction, write, or load fails
            (the underlying error is chained as the cause).
    """
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()

        with io.BytesIO(response.content) as zip_file:
            with zipfile.ZipFile(zip_file, 'r') as zip_ref:
                # Binance monthly archives contain exactly one CSV entry.
                csv_file_name = zip_ref.namelist()[0]
                csv_data = zip_ref.read(csv_file_name)
                with open(output_path, 'wb') as output_file:
                    output_file.write(csv_data)

        print(f"Successfully downloaded and extracted data to {output_path}")

        # Load the saved CSV into a DataFrame for the caller.
        df = pd.read_csv(output_path)
        print(f"CSV loaded from {output_path} successfully")
        return df

    except requests.RequestException as e:
        raise Exception(f"Failed to download file from {url}: {e}") from e
    except zipfile.BadZipFile as e:
        raise Exception(f"Invalid ZIP file: {e}") from e
    except IOError as e:
        raise Exception(f"Failed to write to {output_path}: {e}") from e
    except Exception as e:
        raise Exception(f"Failed during processing: {e}") from e
86
+
87
def crawl_data_from_sources():
    """Download every configured Binance dataset for each allowed period.

    Reads ``configs/data_sources.yml`` (a list of ``{name, url}`` entries)
    and ``configs/data_limit.yml`` (a list of ``{name, limit}`` entries,
    where ``limit`` is a list of ``YYYY-MM`` periods), downloads each
    archive via ``download_and_extract_binance_data``, and collects the
    paths of CSVs written under ``temp/``.

    Returns:
        list: Paths of the successfully downloaded CSV files only;
        failed periods are logged and skipped.

    Raises:
        Exception: If either configuration file cannot be loaded or parsed.
    """
    try:
        # Load data sources configuration.
        sources_path = Path("configs/data_sources.yml")
        with open(sources_path, 'r') as file:
            data_sources = yaml.safe_load(file)
        if not data_sources or not isinstance(data_sources, list):
            raise ValueError("Invalid or empty data_sources configuration file")

        # Load data limit configuration.
        limits_path = Path("configs/data_limit.yml")
        with open(limits_path, 'r') as file:
            data_limits = yaml.safe_load(file)
        if not data_limits or not isinstance(data_limits, list):
            raise ValueError("Invalid or empty data_limit configuration file")

        # Map each source name to its list of allowed periods.
        limits_dict = {
            limit['name']: limit['limit']
            for limit in data_limits
            if isinstance(limit, dict) and 'name' in limit and 'limit' in limit
        }

        output_paths = []
        for data_source in data_sources:
            try:
                if not isinstance(data_source, dict) or 'name' not in data_source or 'url' not in data_source:
                    print(f"Skipping invalid data source: {data_source}")
                    continue

                # Periods allowed for this data source (empty list if none).
                allowed_periods = limits_dict.get(data_source['name'], [])

                for period in allowed_periods:
                    try:
                        # Validate period format (YYYY-MM) before building URLs.
                        try:
                            datetime.strptime(period, '%Y-%m')
                        except ValueError:
                            print(f"Invalid period format for {data_source['name']}: {period}")
                            continue

                        # Unique output path per source/period.
                        output_path = f"temp/{data_source['name']}-{period}.csv"
                        os.makedirs(os.path.dirname(output_path), exist_ok=True)

                        url = f"{data_source['url']}{data_source['name']}-{period}.zip"
                        download_and_extract_binance_data(url, output_path)

                        # Record only after a successful download so failed
                        # periods are never reported as outputs.
                        output_paths.append(output_path)

                    except Exception as e:
                        print(f"Failed to process period {period} for {data_source['name']}: {e}")
                        continue

            except Exception as e:
                print(f"Failed to process data source {data_source.get('name', 'unknown')}: {e}")
                continue

        return output_paths

    except (yaml.YAMLError, FileNotFoundError) as e:
        raise Exception(f"Failed to load configuration: {e}") from e
    except Exception as e:
        raise Exception(f"Script execution failed: {e}") from e

if __name__ == "__main__":
    out_paths = crawl_data_from_sources()
    print("Downloaded files:", out_paths)
components/datalake_cr.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import ast
4
+
5
+ # Add the project root directory to the Python path
6
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
7
+ from minio_api.client import sign_in, upload_file, download_file, create_bucket, list_objects
8
+
9
+ def up_to_minio(client_files, server_files, bucket_name="minio-ngrok-bucket"):
10
+ """Upload the local CSV file to MinIO."""
11
+ # Convert stringified lists to actual lists if necessary
12
+ if isinstance(client_files, str):
13
+ try:
14
+ client_files = ast.literal_eval(client_files)
15
+ except (ValueError, SyntaxError) as e:
16
+ raise ValueError(f"Failed to parse client_files as a list: {client_files}, error: {e}")
17
+ if isinstance(server_files, str):
18
+ try:
19
+ server_files = ast.literal_eval(server_files)
20
+ except (ValueError, SyntaxError) as e:
21
+ raise ValueError(f"Failed to parse server_files as a list: {server_files}, error: {e}")
22
+
23
+ for client_file, server_file in zip(client_files, server_files):
24
+ # Check if local file exists
25
+ if not os.path.exists(client_file):
26
+ raise FileNotFoundError(f"Local file {client_file} does not exist")
27
+
28
+ minio_client = sign_in()
29
+ # Create bucket
30
+ create_bucket(minio_client, bucket_name)
31
+
32
+ # Upload file
33
+ print("Uploading file to MinIO:", client_file, "as", server_file)
34
+ upload_file(minio_client, bucket_name, client_file, server_file)
35
+
36
+ if __name__ == "__main__":
37
+ # Example usage
38
+ try:
39
+ up_to_minio(["temp/BTCUSDT-1s-2025-09.csv"],
40
+ ["BTCUSDT-1s-2025-09.csv"],
41
+ "minio-ngrok-bucket")
42
+ print("File uploaded successfully.")
43
+ except Exception as e:
44
+ print(f"Error uploading file: {e}")
components/delete_lstm_predict.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import pandas as pd
import numpy as np
from components.model.model_utils import create_sequences
from model import build_lstm_model
from sklearn.preprocessing import MinMaxScaler
from tensorflow import keras

# Load raw Binance 1s klines (the source CSV has no header row).
df = pd.read_csv('/tmp/BTCUSDT-1s-2024-05.csv', header=None)
df.columns = [
    "open_time", "open", "high", "low", "close", "volume",
    "close_time", "quote_asset_volume", "number_of_trades",
    "taker_buy_base_asset_volume", "taker_buy_quote_asset_volume", "ignore"
]

# Scale close prices into [0, 1] for the LSTM.
prices = df['close'].astype(float).values.reshape(-1, 1)
scaler = MinMaxScaler()
prices_scaled = scaler.fit_transform(prices)

# Chronological 80/20 split; the test slice starts seq_length steps early
# so its first window has full history.
seq_length = 60
split_idx = int(len(prices_scaled) * 0.8)
train_data = prices_scaled[:split_idx]
test_data = prices_scaled[split_idx - seq_length:]

# FIX: X_train/y_train were previously used by model.fit() below without
# ever being defined, causing a NameError at runtime.
X_train, y_train = create_sequences(train_data, seq_length)
X_test, y_test = create_sequences(test_data, seq_length)

model = build_lstm_model(seq_length)
model.load_weights('./ckpts/lstm_checkpoint.keras')

# Evaluate the checkpointed weights before fine-tuning.
loss = model.evaluate(X_test, y_test, verbose=0)
print(f"Test loss: {loss}")

last_seq = prices_scaled[-seq_length:]
next_pred = model.predict(last_seq.reshape(1, seq_length, 1))
next_price = scaler.inverse_transform(next_pred)
print(f"Predicted next close price: {next_price[0][0]}")

# Keep only the best weights (by validation loss) while fine-tuning.
checkpoint_cb = keras.callbacks.ModelCheckpoint(
    './ckpts/lstm_checkpoint.keras', save_best_only=True, monitor='val_loss'
)

# Train
model.fit(
    X_train, y_train,
    epochs=5,
    batch_size=64,
    validation_data=(X_test, y_test),
    callbacks=[checkpoint_cb],
    verbose=2
)

# Evaluate and predict
loss = model.evaluate(X_test, y_test, verbose=0)
print(f"Test loss: {loss}")

last_seq = prices_scaled[-seq_length:]
next_pred = model.predict(last_seq.reshape(1, seq_length, 1))
next_price = scaler.inverse_transform(next_pred)
print(f"Predicted next close price: {next_price[0][0]}")
components/delete_lstm_train.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import pandas as pd
import numpy as np
from components.model.model_utils import create_sequences
from model import build_lstm_model
from sklearn.preprocessing import MinMaxScaler
from tensorflow import keras
import os

# Raw Binance 1s klines ship without a header row; attach the standard names.
df = pd.read_csv('/tmp/BTCUSDT-1s-2024-05.csv', header=None)
df.columns = [
    "open_time", "open", "high", "low", "close", "volume",
    "close_time", "quote_asset_volume", "number_of_trades",
    "taker_buy_base_asset_volume", "taker_buy_quote_asset_volume", "ignore"
]

# Normalise the close-price series into [0, 1] before sequencing.
prices = df['close'].astype(float).values.reshape(-1, 1)
scaler = MinMaxScaler()
prices_scaled = scaler.fit_transform(prices)

# Chronological 80/20 split; the test slice starts 60 steps early so its
# first window has complete history.
split_idx = int(len(prices_scaled) * 0.8)
train_data = prices_scaled[:split_idx]
test_data = prices_scaled[split_idx - 60:]

# Build sliding windows of 60 observations each.
seq_length = 60
X_train, y_train = create_sequences(train_data, seq_length)
X_test, y_test = create_sequences(test_data, seq_length)

model = build_lstm_model(seq_length)

# Persist only the best weights (by validation loss) during training.
os.makedirs('./ckpts', exist_ok=True)
checkpoint_cb = keras.callbacks.ModelCheckpoint(
    './ckpts/lstm_checkpoint.keras', save_best_only=True, monitor='val_loss'
)

model.fit(
    X_train, y_train,
    epochs=5,
    batch_size=64,
    validation_data=(X_test, y_test),
    callbacks=[checkpoint_cb],
    verbose=2
)
components/delete_model.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
from tensorflow import keras

def build_lstm_model(seq_length):
    """Build and compile a two-layer LSTM regressor.

    Args:
        seq_length: Number of timesteps per input sequence; the model's
            input shape is ``(seq_length, 1)``.

    Returns:
        A compiled ``keras.Sequential`` model (Adam optimizer, MSE loss)
        producing a single scalar output per sequence.
    """
    model = keras.Sequential()
    model.add(keras.layers.LSTM(50, return_sequences=True, input_shape=(seq_length, 1)))
    model.add(keras.layers.LSTM(50))
    model.add(keras.layers.Dense(1))
    model.compile(optimizer='adam', loss='mse')
    return model
components/duckdb2csv.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import duckdb
import pandas as pd
import logging

def duckdb_to_csv(duckdb_path, output_csv_path):
    """Export the ``aggregated_financial_data`` table from DuckDB to CSV.

    Args:
        duckdb_path: Path to the DuckDB database file.
        output_csv_path: Destination CSV file path.

    Raises:
        ValueError: If the table exists but contains no rows.
        Exception: Any DuckDB/IO error is logged and re-raised.
    """
    con = None
    try:
        # Connect to DuckDB
        con = duckdb.connect(duckdb_path)
        # Query data
        df = con.execute("SELECT * FROM aggregated_financial_data").fetchdf()
        if df.empty:
            raise ValueError("No data found in table 'aggregated_financial_data'")
        # Save to CSV
        df.to_csv(output_csv_path, index=False)
        logging.info(f"Successfully exported data to {output_csv_path}")
    except Exception as e:
        logging.error(f"Error in duckdb_to_csv: {str(e)}")
        raise
    finally:
        # FIX: the connection was previously never closed, leaking the
        # database handle on both success and failure paths.
        if con is not None:
            con.close()

if __name__ == "__main__":
    duckdb_to_csv("duckdb_databases/financial_data.db",
                  "analytics/financial_data.csv")
components/duckdb_api.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import duckdb
3
+ import shutil
4
+ import pandas as pd
5
+
6
+ def push_to_duckdb(duckdb_path, parquet_path, temp_parquet_path="temp/duckdb_temp_parquet"):
7
+ """
8
+ Push the aggregated data from a Parquet directory to DuckDB.
9
+
10
+ Args:
11
+ duckdb_path (str): Path to the DuckDB database file
12
+ parquet_path (str): Path to the Parquet directory containing the aggregated data
13
+ temp_parquet_path (str): Temporary path for storing Parquet files
14
+ """
15
+ # Validate input parquet_path
16
+ if not isinstance(parquet_path, str):
17
+ raise ValueError(f"parquet_path must be a string, got {type(parquet_path)}: {parquet_path}")
18
+ if not os.path.exists(parquet_path):
19
+ raise FileNotFoundError(f"Parquet directory does not exist at {parquet_path}")
20
+ if not os.path.isdir(parquet_path):
21
+ raise ValueError(f"parquet_path must be a directory, got a file at {parquet_path}")
22
+
23
+ # Ensure the temporary directory is clean before copying
24
+ if os.path.exists(temp_parquet_path):
25
+ shutil.rmtree(temp_parquet_path)
26
+ os.makedirs(temp_parquet_path, exist_ok=True)
27
+
28
+ # Copy the input Parquet directory to the temporary directory
29
+ try:
30
+ shutil.copytree(parquet_path, temp_parquet_path, dirs_exist_ok=True)
31
+ print(f"Copied Parquet directory from {parquet_path} to {temp_parquet_path}")
32
+ except Exception as e:
33
+ raise RuntimeError(f"Failed to copy Parquet directory from {parquet_path} to {temp_parquet_path}: {e}")
34
+
35
+ # Connect to DuckDB
36
+ directory = os.path.dirname(duckdb_path)
37
+ if directory and not os.path.exists(directory):
38
+ os.makedirs(directory)
39
+
40
+ con = duckdb.connect(duckdb_path)
41
+
42
+ # Create or replace the table in DuckDB by reading the Parquet files
43
+ try:
44
+ con.execute(f"""
45
+ CREATE OR REPLACE TABLE aggregated_financial_data AS
46
+ SELECT * FROM parquet_scan('{temp_parquet_path}/*.parquet')
47
+ """)
48
+ print(f"Successfully loaded data into DuckDB table from {temp_parquet_path}")
49
+ except Exception as e:
50
+ raise RuntimeError(f"Failed to load Parquet files into DuckDB: {e}")
51
+ finally:
52
+ con.close()
53
+
54
+ # Clean up temporary Parquet directory
55
+ if os.path.exists(temp_parquet_path):
56
+ shutil.rmtree(temp_parquet_path)
57
+ print(f"Cleaned up temporary directory {temp_parquet_path}")
58
+
59
+ if __name__ == "__main__":
60
+ from process_data import process_financial_data
61
+ duckdb_path = "duckdb_databases/financial_data.db"
62
+ parquet_path = process_financial_data()
63
+
64
+ try:
65
+ push_to_duckdb(duckdb_path, parquet_path)
66
+ except Exception as e:
67
+ print(f"Error pushing to DuckDB: {e}")
68
+
components/model/__init__.py ADDED
File without changes
components/model/data_utils.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import logging
3
+ import tensorflow as tf
4
+ import pandas as pd
5
+ import pyarrow.parquet as pq
6
+ from sklearn.preprocessing import MinMaxScaler
7
+
8
+ # Configure logging
9
+ logger = logging.getLogger(__name__)
10
+
11
def create_data_loader(parquet_paths: list, scaler: MinMaxScaler, seq_length: int, batch_size: int) -> tf.data.Dataset:
    """Create a tf.data.Dataset from Parquet files for LSTM training or evaluation.

    Args:
        parquet_paths (list): List of paths to Parquet files.
        scaler (MinMaxScaler): Scaler fitted on the data.
        seq_length (int): Length of input sequences.
        batch_size (int): Batch size for the dataset.

    Returns:
        tf.data.Dataset: Dataset yielding (sequence, target) pairs with shapes
            (batch_size, seq_length, 1) and (batch_size, 1).

    Raises:
        ValueError: If inputs are invalid or no file can yield at least one sequence.
    """
    if not parquet_paths:
        logger.error("No parquet paths provided")
        raise ValueError("parquet_paths cannot be empty")
    if not isinstance(scaler, MinMaxScaler):
        logger.error("Invalid scaler provided")
        raise ValueError("scaler must be an instance of MinMaxScaler")
    if not isinstance(seq_length, int) or seq_length <= 0:
        logger.error(f"Invalid seq_length: {seq_length}")
        raise ValueError("seq_length must be a positive integer")
    if not isinstance(batch_size, int) or batch_size <= 0:
        logger.error(f"Invalid batch_size: {batch_size}")
        raise ValueError("batch_size must be a positive integer")

    # BUG FIX: the previous implementation checked a `total_sequences` counter
    # that was only incremented inside the (lazy) generator, so the check ran
    # before any data was read and ALWAYS raised ValueError.  Instead, validate
    # up front using Parquet file metadata (cheap: no column data is read).
    candidate_sequences = 0
    for path in parquet_paths:
        if not os.path.exists(path):
            logger.warning(f"Parquet file not found, skipping: {path}")
            continue
        try:
            num_rows = pq.ParquetFile(path).metadata.num_rows
        except Exception as e:
            logger.warning(f"Could not read Parquet metadata for {path}: {e}")
            continue
        candidate_sequences += max(0, num_rows - seq_length)
    if candidate_sequences == 0:
        logger.error("No valid sequences generated from any Parquet file")
        raise ValueError("No valid sequences generated from any Parquet file")

    def _scaled_generator():
        """Yield (window, next-value) pairs of scaled 'Close' prices."""
        for path in parquet_paths:
            if not os.path.exists(path):
                logger.warning(f"Parquet file not found, skipping: {path}")
                continue
            try:
                file_size = os.path.getsize(path) / (1024 * 1024)  # Size in MB
                if file_size < 100:  # Load small files into memory
                    df = pd.read_parquet(path, columns=['Close'])
                    logger.debug(f"Loaded {path} into memory, size: {file_size:.2f} MB")
                    if 'Close' not in df.columns or df['Close'].isna().any():
                        logger.warning(f"Invalid or missing 'Close' column in {path}")
                        continue
                    prices = df['Close'].astype('float32').values.reshape(-1, 1)
                    if prices.size <= seq_length:
                        logger.warning(f"File {path} has {prices.size} rows, insufficient for seq_length {seq_length}")
                        continue
                    scaled = scaler.transform(prices)
                    for j in range(len(scaled) - seq_length):
                        yield scaled[j:j + seq_length], scaled[j + seq_length]
                else:
                    # NOTE(review): windows are built per 10k-row batch, so
                    # sequences that would span a batch boundary are dropped —
                    # confirm this loss is acceptable for large files.
                    parquet_file = pq.ParquetFile(path)
                    for batch in parquet_file.iter_batches(batch_size=10_000, columns=['Close']):
                        chunk = batch.to_pandas()
                        if 'Close' not in chunk.columns or chunk['Close'].isna().any():
                            logger.warning(f"Invalid or missing 'Close' column in {path}")
                            continue
                        prices = chunk['Close'].astype('float32').values.reshape(-1, 1)
                        scaled = scaler.transform(prices)
                        logger.debug(f"Processing batch from {path}, scaled shape: {scaled.shape}")
                        for j in range(len(scaled) - seq_length):
                            yield scaled[j:j + seq_length], scaled[j + seq_length]
            except Exception as e:
                logger.error(f"Error processing parquet file {path}: {e}")
                continue

    dataset = tf.data.Dataset.from_generator(
        _scaled_generator,
        output_types=(tf.float32, tf.float32),
        output_shapes=((seq_length, 1), (1,))
    ).batch(batch_size).prefetch(tf.data.AUTOTUNE)

    logger.info(f"Created data loader with seq_length={seq_length}, batch_size={batch_size}, candidate_sequences={candidate_sequences}")
    return dataset
components/model/evaluation.py ADDED
@@ -0,0 +1,239 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import logging
3
+ import pandas as pd
4
+ import numpy as np
5
+ from sklearn.preprocessing import MinMaxScaler
6
+ import pickle
7
+ from sklearn.metrics import (
8
+ mean_squared_error,
9
+ mean_absolute_error,
10
+ mean_absolute_percentage_error
11
+ )
12
+ from typing import Dict, List, Tuple
13
+ from datetime import datetime, timezone
14
+ import tensorflow as tf
15
+ import sys
16
+ import ast
17
+
18
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
19
+ from components.utils.file_utils import load_extract_config, get_parquet_file_names
20
+ from components.model.model_utils import build_model_from_config
21
+ from components.model.data_utils import create_data_loader
22
+ from components.utils.utils import parse_timezone
23
+
24
+ logging.basicConfig(
25
+ level=logging.INFO,
26
+ format='%(asctime)s %(levelname)s: %(message)s',
27
+ datefmt='%Y-%m-%d %H:%M:%S %Z'
28
+ )
29
+ logger = logging.getLogger(__name__)
30
+
31
+ # def model_evaluate(model, scaler: MinMaxScaler, ds: tf.data.Dataset) -> Tuple[float, float]:
32
+ # """Evaluate a model on a dataset and return RMSE and MAE.
33
+
34
+ # Args:
35
+ # model: Trained Keras model.
36
+ # scaler (MinMaxScaler): Scaler used for data normalization.
37
+ # ds (tf.data.Dataset): Dataset to evaluate on.
38
+
39
+ # Returns:
40
+ # Tuple[float, float]: RMSE and MAE metrics.
41
+ # """
42
+ # y_true, y_pred = [], []
43
+ # for X, y in ds:
44
+ # pred = model.predict(X, verbose=2)
45
+ # y_true.append(y.numpy())
46
+ # y_pred.append(pred)
47
+ # y_true = np.concatenate(y_true)
48
+ # y_pred = np.concatenate(y_pred)
49
+ # y_true_orig = scaler.inverse_transform(y_true)
50
+ # y_pred_orig = scaler.inverse_transform(y_pred)
51
+ # return (np.sqrt(mean_squared_error(y_true_orig, y_pred_orig)),
52
+ # mean_absolute_error(y_true_orig, y_pred_orig))
53
+
54
def model_evaluate(model, scaler: MinMaxScaler, ds: tf.data.Dataset) -> Tuple[float, float, float]:
    """Evaluate a model on a dataset and return RMSE, MAE and MAPE.

    BUG FIX: the function returns three metrics but was annotated and
    documented as returning only (RMSE, MAE).

    Args:
        model: Trained Keras model.
        scaler (MinMaxScaler): Scaler used for data normalization.
        ds (tf.data.Dataset): Dataset yielding (sequence, target) batches.

    Returns:
        Tuple[float, float, float]: RMSE, MAE and MAPE computed on the
            inverse-transformed (original-scale) values.
    """
    # Collect true labels (y) from the dataset; predictions come from a single
    # model.predict pass over the same dataset.
    y_true = []
    for _, y in ds:
        y_true.append(y.numpy())
    y_true = np.concatenate(y_true)

    # Predict the entire dataset silently.
    y_pred = model.predict(ds, verbose=0)

    # Inverse transform back to the original price scale before scoring.
    y_true_orig = scaler.inverse_transform(y_true)
    y_pred_orig = scaler.inverse_transform(y_pred)

    return (np.sqrt(mean_squared_error(y_true_orig, y_pred_orig)),
            mean_absolute_error(y_true_orig, y_pred_orig),
            mean_absolute_percentage_error(y_true_orig, y_pred_orig))
82
+
83
def metric_and_predict_lstm_model(train_result: Dict) -> Dict:
    """Evaluate the trained LSTM model and predict the next price.

    Args:
        train_result (Dict): Training result from the train_lstm_model task.
            May be a dict, or its string representation as pulled from XCom.

    Returns:
        Dict: 'metrics_path', 'prediction_path' and the predicted 'next_price'.

    Raises:
        ValueError: If no training result is provided, or there is not enough
            data to evaluate or to build the prediction window.
    """
    if not train_result:
        raise ValueError("No training result provided.")

    # BUG FIX: only parse when the payload is a string — ast.literal_eval
    # raises if it is handed an already-deserialized dict.
    if isinstance(train_result, str):
        train_result = ast.literal_eval(train_result)

    cfg = load_extract_config('model_config.yml')
    parquet_folder = load_extract_config('pipeline_config.yml')['paths']['parquet_folder']
    os.makedirs(parquet_folder, exist_ok=True)

    model_cfg = cfg['model']
    data_cfg = cfg['data']
    out_cfg = cfg['output']
    dt_str = train_result['datetime']
    model_filename = train_result['model_filename']
    dataset_merge = train_result['dataset_merge']

    model_path = train_result['model_path']
    scaler_path = train_result['scaler_path']
    seq_length = data_cfg['seq_length']
    batch_size = cfg['evaluation'].get('eval_batch_size', 64)

    # Load scaler; rebuild the architecture from config and restore weights.
    with open(scaler_path, 'rb') as f:
        scaler = pickle.load(f)
    model = build_model_from_config(seq_length, cfg)
    model.load_weights(model_path)

    # BUG FIX: join each file name onto the folder — the old code produced a
    # list that repeated the folder path once per file.
    parquet_paths = [os.path.join(parquet_folder, name) for name in get_parquet_file_names()]
    dataset = create_data_loader(parquet_paths, scaler, seq_length, batch_size)

    # Count total sequences to derive train/val/test step splits.
    total_seqs = sum(max(0, len(pd.read_parquet(path, columns=['Close'])) - seq_length)
                     for path in parquet_paths if os.path.exists(path))
    if total_seqs == 0:
        raise ValueError("Not enough sequences for evaluation.")

    steps_total = (total_seqs + batch_size - 1) // batch_size
    steps_train = int(steps_total * data_cfg['train_ratio'])
    steps_val = int(steps_total * data_cfg['val_ratio'])
    steps_test = steps_total - steps_train - steps_val

    train_ds = dataset.take(steps_train)
    val_ds = dataset.skip(steps_train).take(steps_val)
    test_ds = dataset.skip(steps_train + steps_val)

    # Evaluate on each split (RMSE / MAE / MAPE, original scale).
    train_rmse, train_mae, train_mape = model_evaluate(model, scaler, train_ds)
    val_rmse, val_mae, val_mape = model_evaluate(model, scaler, val_ds)
    test_rmse, test_mae, test_mape = model_evaluate(model, scaler, test_ds)

    # Persist metrics as one row per (split, metric) pair.
    metrics_path = os.path.join(out_cfg['metrics']['metrics_dir'], f"metrics_{dt_str}.csv")
    os.makedirs(out_cfg['metrics']['metrics_dir'], exist_ok=True)

    metrics_data = [
        [model_filename, dataset_merge, "Train", "RMSE", train_rmse],
        [model_filename, dataset_merge, "Train", "MAE", train_mae],
        [model_filename, dataset_merge, "Train", "MAPE", train_mape],
        [model_filename, dataset_merge, "Val", "RMSE", val_rmse],
        [model_filename, dataset_merge, "Val", "MAE", val_mae],
        [model_filename, dataset_merge, "Val", "MAPE", val_mape],
        [model_filename, dataset_merge, "Test", "RMSE", test_rmse],
        [model_filename, dataset_merge, "Test", "MAE", test_mae],
        [model_filename, dataset_merge, "Test", "MAPE", test_mape],
    ]

    metrics_df = pd.DataFrame(
        metrics_data,
        columns=['model_path', 'dataset_merge', 'Split', 'Metric', 'Value']
    )
    metrics_df.to_csv(metrics_path, index=False)

    # Predict the next price from the most recent seq_length closes available.
    last_chunk = None
    for path in reversed(parquet_paths):
        if os.path.exists(path):
            df_tail = pd.read_parquet(path).tail(seq_length)
            if len(df_tail) >= seq_length:
                last_chunk = df_tail['Close'].values.astype('float32').reshape(-1, 1)
                break
    if last_chunk is None:
        raise ValueError("Not enough recent data for prediction.")

    last_scaled = scaler.transform(last_chunk)
    next_scaled = model.predict(last_scaled.reshape(1, seq_length, 1), verbose=2)
    next_price = scaler.inverse_transform(next_scaled)[0][0]

    # Save a human-readable prediction + metrics report.
    pred_path = os.path.join(out_cfg['predictions']['pred_dir'], f"prediction_{dt_str}.txt")
    os.makedirs(os.path.dirname(pred_path), exist_ok=True)

    with open(pred_path, 'w') as f:
        f.write(f"Model Run: {dt_str}\n")
        f.write(f"Model File: {model_filename}\n")
        f.write(f"Dataset Merged: {dataset_merge}\n")
        f.write(f"Architecture: {model_cfg['architecture'].upper()}\n")
        f.write(f"Predicted Next Close: {next_price:.6f}\n")
        f.write(f"Based on last {seq_length} timesteps.\n\n")
        f.write("Evaluation Metrics:\n")
        f.write(f"  Train -> RMSE: {train_rmse:8.6f} | MAE: {train_mae:8.6f} | MAPE: {train_mape:8.6f}\n")
        f.write(f"  Val   -> RMSE: {val_rmse:8.6f} | MAE: {val_mae:8.6f} | MAPE: {val_mape:8.6f}\n")
        f.write(f"  Test  -> RMSE: {test_rmse:8.6f} | MAE: {test_mae:8.6f} | MAPE: {test_mape:8.6f}\n")

    logging.info(f"Next price: {next_price:.4f} | Test RMSE: {test_rmse:.6f} | Dataset: {dataset_merge}")

    return {
        'metrics_path': metrics_path,
        'prediction_path': pred_path,
        'next_price': float(next_price)
    }
207
+
208
if __name__ == "__main__":
    logger.info("Running standalone evaluation test")
    # Simulate a training result for testing.
    cfg = load_extract_config('model_config.yml')
    out_cfg = cfg['output']
    data_cfg = cfg['data']

    # Mock training result (adjust paths to match an actual trained model and scaler).
    mock_train_result = {
        'model_path': os.path.join(out_cfg['checkpoints']['model_dir'],
                                   'model_2025-10-24-21-59-42-(+07).h5'),
        'model_filename': 'model_2025-10-24-18-40-00-(+07).h5',
        'scaler_path': os.path.join(out_cfg['checkpoints']['scaler_dir'],
                                    'scaler_2025-10-24-21-59-42-(+07).pkl'),
        # BUG FIX: the datetime string was missing its closing parenthesis.
        'datetime': '2025-10-24-21-59-42-(+07)',
        'dataset_merge': 'BTCUSDT-1s-2025-08 + BTCUSDT-1s-2025-09'
    }

    try:
        # BUG FIX: the function takes the training result itself (serialized as
        # a string, mirroring how XCom delivers it) — not an Airflow `ti` kwarg,
        # which the signature does not accept.
        result = metric_and_predict_lstm_model(str(mock_train_result))
        logger.info("Evaluation completed successfully!")
        logger.info(f"Result: {result}")
    except Exception as e:
        logger.error(f"Evaluation failed: {str(e)}")
    logger.info("Standalone evaluation run completed")
components/model/model_utils.py ADDED
@@ -0,0 +1,294 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import logging
3
+ import numpy as np
4
+ import tensorflow as tf
5
+ import pyarrow.parquet as pq
6
+ import pandas as pd
7
+ from sklearn.preprocessing import MinMaxScaler
8
+ from tensorflow import keras
9
+ from typing import Tuple
10
+ from datetime import datetime, timezone, timedelta
11
+ import sys
12
+
13
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
14
+ from components.model.data_utils import create_data_loader
15
+
16
+ # Configure logging with +07:00 timezone
17
+ logging.basicConfig(
18
+ level=logging.INFO,
19
+ format='%(asctime)s %(levelname)s: %(message)s',
20
+ datefmt='%Y-%m-%d %H:%M:%S %Z'
21
+ )
22
+ logger = logging.getLogger(__name__)
23
+
24
def create_sequences(data: np.ndarray, seq_length: int) -> Tuple[np.ndarray, np.ndarray]:
    """Create sequences of data for LSTM model training and prediction.

    Args:
        data (np.ndarray): Input time series data (scaled), shape (n_samples, n_features).
        seq_length (int): Length of each sequence.

    Returns:
        Tuple[np.ndarray, np.ndarray]: (X, y) where X is input sequences
            (n_samples, seq_length, n_features) and y is target values
            (n_samples, n_features).

    Raises:
        ValueError: If data is empty, seq_length is invalid, or data has insufficient length.
    """
    if not isinstance(data, np.ndarray):
        logger.error("Input data must be a numpy array")
        raise ValueError("Input data must be a numpy array")
    if data.size == 0:
        logger.error("Input data is empty")
        raise ValueError("Input data is empty")
    if not isinstance(seq_length, int) or seq_length <= 0:
        logger.error(f"Invalid seq_length: {seq_length}")
        raise ValueError("seq_length must be a positive integer")
    if len(data) <= seq_length:
        logger.error(f"Data length {len(data)} is insufficient for seq_length {seq_length}")
        raise ValueError(f"Data length {len(data)} is insufficient for seq_length {seq_length}")

    # Each window of seq_length values predicts the value that follows it.
    n_windows = len(data) - seq_length
    X = np.array([data[start:start + seq_length] for start in range(n_windows)])
    y = np.array([data[start + seq_length] for start in range(n_windows)])

    if X.ndim == 2:
        # 1-D series: add the trailing feature axis the LSTM input expects.
        X = X.reshape(X.shape[0], X.shape[1], 1)

    logger.info(f"Created {X.shape[0]} sequences: X shape {X.shape}, y shape {y.shape}")
    return X, y
66
+
67
def build_model_from_config(seq_length: int, cfg: dict) -> keras.Model:
    """Build an LSTM-based model based on configuration.

    All four architectures share the same topology — three progressively
    narrower recurrent layers with a self-attention residual block after the
    first, followed by a Dense regression head — differing only in the
    recurrent cell used, so the stack is built once via a per-architecture
    layer factory (replacing four copy-pasted branches).

    NOTE(review): 'layers' is validated but not used by this fixed 3-layer
    stack — confirm whether it should drive the depth.

    Args:
        seq_length (int): Length of input sequences.
        cfg (dict): Model configuration dictionary with 'model' key containing
            architecture, units, layers, dropout, activation, optimizer, loss
            and learning_rate.

    Returns:
        keras.Model: Compiled Keras model with a single-value output.

    Raises:
        ValueError: If configuration is invalid or architecture is unsupported.
    """
    if not isinstance(cfg, dict) or 'model' not in cfg:
        logger.error("Invalid configuration: 'model' key missing")
        raise ValueError("Configuration must be a dictionary with a 'model' key")

    model_cfg = cfg['model']
    arch = model_cfg.get('architecture')
    units = model_cfg.get('units')
    layers = model_cfg.get('layers', 1)
    dropout = model_cfg.get('dropout', 0.2)
    activation = model_cfg.get('activation', 'tanh')
    learning_rate = model_cfg.get('learning_rate', 0.001)

    if not isinstance(units, int) or units <= 0:
        logger.error(f"Invalid units: {units}")
        raise ValueError("units must be a positive integer")
    if not isinstance(layers, int) or layers <= 0:
        logger.error(f"Invalid layers: {layers}")
        raise ValueError("layers must be a positive integer")
    if not isinstance(dropout, float) or not 0 <= dropout < 1:
        logger.error(f"Invalid dropout: {dropout}")
        raise ValueError("dropout must be a float between 0 and 1")
    if arch not in ['lstm', 'bilstm', 'gru', 'custom']:
        logger.error(f"Unsupported architecture: {arch}")
        raise ValueError(f"Unsupported architecture: {arch}")
    if not isinstance(seq_length, int) or seq_length <= 0:
        logger.error(f"Invalid seq_length: {seq_length}")
        raise ValueError("seq_length must be a positive integer")
    if not isinstance(learning_rate, (int, float)) or learning_rate <= 0:
        logger.error(f"Invalid learning_rate: {learning_rate}")
        raise ValueError("learning_rate must be a positive number")

    def _recurrent(n_units: int, return_sequences: bool, regularize: bool):
        """Build one recurrent layer of the configured architecture."""
        common = dict(
            return_sequences=return_sequences, activation=activation,
            dropout=dropout, recurrent_dropout=0.1,
        )
        if regularize:
            common['kernel_regularizer'] = tf.keras.regularizers.l2(0.01)
        if arch == 'gru':
            return keras.layers.GRU(n_units, **common)
        if arch == 'bilstm':
            return keras.layers.Bidirectional(keras.layers.LSTM(n_units, **common))
        # 'lstm' and 'custom' both use plain LSTM cells.
        return keras.layers.LSTM(n_units, **common)

    inputs = keras.layers.Input(shape=(seq_length, 1))

    # Layer 1 (full width) + self-attention with a residual connection.
    x = _recurrent(units, True, True)(inputs)
    attention = keras.layers.Attention()([x, x])
    x = keras.layers.Add()([x, attention])

    # Layers 2 and 3 progressively narrow the representation; the last one
    # collapses the time axis (return_sequences=False, no regularizer).
    x = _recurrent(units // 2, True, True)(x)
    x = _recurrent(units // 4, False, False)(x)

    # Dense regression head.
    x = keras.layers.Dense(50, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01))(x)
    x = keras.layers.Dropout(dropout)(x)
    x = keras.layers.Dense(25, activation='relu')(x)
    x = keras.layers.Dropout(dropout)(x)
    outputs = keras.layers.Dense(1)(x)

    model = keras.Model(inputs, outputs)

    optimizer_name = model_cfg.get('optimizer', 'adam').lower()
    if optimizer_name == 'adam':
        optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    else:
        logger.warning(f"Optimizer {optimizer_name} not explicitly handled, using default parameters")
        optimizer = optimizer_name  # Keras resolves the string with defaults

    model.compile(
        optimizer=optimizer,
        loss=model_cfg.get('loss', 'mse'),
        metrics=['mae']
    )

    logger.info(f"Built model: architecture={arch}, units={units}, layers={layers}, learning_rate={learning_rate}")
    return model
262
+
263
if __name__ == "__main__":
    import pandas as pd
    from components.utils.file_utils import load_config

    logger.info("Running standalone tests for model_utils.py")

    # Exercise create_sequences on a tiny synthetic price series.
    sample = np.array([[10000], [10050], [10100], [10150], [10200]])
    window = 3
    X, y = create_sequences(sample, window)
    print(f"create_sequences: X shape {X.shape}, y shape {y.shape}")
    print(f"Sample sequence: {X[0]}, target: {y[0]}")

    # Exercise create_data_loader against a throwaway Parquet file.
    scaler = MinMaxScaler()
    scaler.fit(sample)
    parquet_paths = ['temp/extracted_from_minio/btcusdt_1h.parquet']
    if not os.path.exists(parquet_paths[0]):
        os.makedirs(os.path.dirname(parquet_paths[0]), exist_ok=True)
        pd.DataFrame({'Close': [10000, 10050, 10100, 10150, 10200]}).to_parquet(parquet_paths[0])

    dataset = create_data_loader(parquet_paths, scaler, seq_length=3, batch_size=2)
    for x, y in dataset.take(1):
        print(f"create_data_loader: x shape {x.shape}, y shape {y.shape}")

    # Build each supported architecture once and print its summary.
    config = load_config('configs/model_config.yml')
    for arch in ['lstm', 'gru', 'bilstm', 'custom']:
        config['model']['architecture'] = arch
        model = build_model_from_config(seq_length=3, cfg=config)
        print(f"\nModel summary for {arch}:")
        model.summary()
    logger.info("Standalone tests completed successfully.")
components/model/old_model_utils.py ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import logging
3
+ import numpy as np
4
+ import tensorflow as tf
5
+ import pyarrow.parquet as pq
6
+ import pandas as pd
7
+ from sklearn.preprocessing import MinMaxScaler
8
+ from tensorflow import keras
9
+ from typing import Tuple
10
+ from datetime import datetime, timezone, timedelta
11
+ import sys
12
+
13
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
14
+ from components.model.data_utils import create_data_loader
15
+
16
+ # Configure logging with +07:00 timezone
17
+ logging.basicConfig(
18
+ level=logging.INFO,
19
+ format='%(asctime)s %(levelname)s: %(message)s',
20
+ datefmt='%Y-%m-%d %H:%M:%S %Z'
21
+ )
22
+ logger = logging.getLogger(__name__)
23
+
24
def create_sequences(data: np.ndarray, seq_length: int) -> Tuple[np.ndarray, np.ndarray]:
    """Create sequences of data for LSTM model training and prediction.

    Args:
        data (np.ndarray): Input time series data (scaled), shape (n_samples, n_features).
        seq_length (int): Length of each sequence.

    Returns:
        Tuple[np.ndarray, np.ndarray]: (X, y) where X is input sequences
            (n_samples, seq_length, n_features) and y is target values
            (n_samples, n_features).

    Raises:
        ValueError: If data is empty, seq_length is invalid, or data has insufficient length.
    """
    if not isinstance(data, np.ndarray):
        logger.error("Input data must be a numpy array")
        raise ValueError("Input data must be a numpy array")
    if data.size == 0:
        logger.error("Input data is empty")
        raise ValueError("Input data is empty")
    if not isinstance(seq_length, int) or seq_length <= 0:
        logger.error(f"Invalid seq_length: {seq_length}")
        raise ValueError("seq_length must be a positive integer")
    if len(data) <= seq_length:
        logger.error(f"Data length {len(data)} is insufficient for seq_length {seq_length}")
        raise ValueError(f"Data length {len(data)} is insufficient for seq_length {seq_length}")

    # Slide a window of seq_length values across the series; the element just
    # past each window is its prediction target.
    windows = []
    targets = []
    stop = len(data) - seq_length
    idx = 0
    while idx < stop:
        windows.append(data[idx:idx + seq_length])
        targets.append(data[idx + seq_length])
        idx += 1

    X = np.array(windows)
    y = np.array(targets)

    if len(X.shape) == 2:
        # 1-D series: add the trailing feature axis the LSTM input expects.
        X = X.reshape(X.shape[0], X.shape[1], 1)

    logger.info(f"Created {X.shape[0]} sequences: X shape {X.shape}, y shape {y.shape}")
    return X, y
66
+
67
def build_model_from_config(seq_length: int, cfg: dict) -> keras.Model:
    """Build an LSTM-based model based on configuration.

    Args:
        seq_length (int): Length of input sequences.
        cfg (dict): Model configuration dictionary with 'model' key containing
            architecture, units, layers, dropout, activation, optimizer, loss
            and learning_rate.

    Returns:
        keras.Model: Compiled Keras model with a single-value regression head.

    Raises:
        ValueError: If configuration is invalid or architecture is unsupported.
    """
    if not isinstance(cfg, dict) or 'model' not in cfg:
        logger.error("Invalid configuration: 'model' key missing")
        raise ValueError("Configuration must be a dictionary with a 'model' key")

    model_cfg = cfg['model']
    arch = model_cfg.get('architecture')
    units = model_cfg.get('units')
    layers = model_cfg.get('layers', 1)
    dropout = model_cfg.get('dropout', 0.2)
    activation = model_cfg.get('activation', 'tanh')
    learning_rate = model_cfg.get('learning_rate', 0.001)  # Default learning rate if not specified

    if not isinstance(units, int) or units <= 0:
        logger.error(f"Invalid units: {units}")
        raise ValueError("units must be a positive integer")
    if not isinstance(layers, int) or layers <= 0:
        logger.error(f"Invalid layers: {layers}")
        raise ValueError("layers must be a positive integer")
    if not isinstance(dropout, float) or not 0 <= dropout < 1:
        logger.error(f"Invalid dropout: {dropout}")
        raise ValueError("dropout must be a float between 0 and 1")
    if arch not in ['lstm', 'bilstm', 'gru', 'custom']:
        logger.error(f"Unsupported architecture: {arch}")
        raise ValueError(f"Unsupported architecture: {arch}")
    if not isinstance(seq_length, int) or seq_length <= 0:
        logger.error(f"Invalid seq_length: {seq_length}")
        raise ValueError("seq_length must be a positive integer")
    if not isinstance(learning_rate, (int, float)) or learning_rate <= 0:
        logger.error(f"Invalid learning_rate: {learning_rate}")
        raise ValueError("learning_rate must be a positive number")

    inputs = keras.layers.Input(shape=(seq_length, 1))
    x = inputs

    if arch == 'custom':
        # BUG FIX: this fixed two-layer stack was previously rebuilt on every
        # iteration of the `layers` loop, which breaks for layers > 1 (the
        # second pass feeds a 2-D tensor back into an LSTM). Build it once.
        x = keras.layers.LSTM(units, return_sequences=True)(x)
        x = keras.layers.LSTM(units // 2, return_sequences=False)(x)
        x = keras.layers.Dense(50, activation='relu')(x)
        x = keras.layers.Dropout(dropout)(x)
    else:
        for i in range(layers):
            # Only intermediate layers keep the time axis.
            return_seq = i < layers - 1
            if arch == 'lstm':
                x = keras.layers.LSTM(
                    units, return_sequences=return_seq, activation=activation,
                    dropout=dropout, recurrent_dropout=0.1
                )(x)
            elif arch == 'bilstm':
                x = keras.layers.Bidirectional(
                    keras.layers.LSTM(
                        units, return_sequences=return_seq, activation=activation,
                        dropout=dropout, recurrent_dropout=0.1
                    )
                )(x)
            elif arch == 'gru':
                x = keras.layers.GRU(
                    units, return_sequences=return_seq, activation=activation,
                    dropout=dropout, recurrent_dropout=0.1
                )(x)

    # BUG FIX: the 'custom' branch previously skipped this projection, leaving
    # a 50-dim output that mismatches the 1-dim regression targets.
    x = keras.layers.Dense(1)(x)

    model = keras.Model(inputs, x)

    # Configure optimizer with specified learning rate
    optimizer_name = model_cfg.get('optimizer', 'adam').lower()
    if optimizer_name == 'adam':
        optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    else:
        logger.warning(f"Optimizer {optimizer_name} not explicitly handled, using default parameters")
        optimizer = optimizer_name  # Fallback to string, Keras will handle it

    model.compile(
        optimizer=optimizer,
        loss=model_cfg.get('loss', 'mse'),
        metrics=['mae']
    )

    logger.info(f"Built model: architecture={arch}, units={units}, layers={layers}, learning_rate={learning_rate}")
    return model
160
+
161
if __name__ == "__main__":
    import pandas as pd
    from components.utils.file_utils import load_config

    logger.info("Running standalone tests for lstm_utils.py")

    # Exercise create_sequences on a tiny synthetic price series.
    sample = np.array([[10000], [10050], [10100], [10150], [10200]])
    window = 3
    X, y = create_sequences(sample, window)
    print(f"create_sequences: X shape {X.shape}, y shape {y.shape}")
    print(f"Sample sequence: {X[0]}, target: {y[0]}")

    # Exercise create_data_loader against a throwaway Parquet file.
    scaler = MinMaxScaler()
    scaler.fit(sample)
    parquet_paths = ['temp/extracted_from_minio/btcusdt_1h.parquet']
    if not os.path.exists(parquet_paths[0]):
        os.makedirs(os.path.dirname(parquet_paths[0]), exist_ok=True)
        pd.DataFrame({'Close': [10000, 10050, 10100, 10150, 10200]}).to_parquet(parquet_paths[0])

    dataset = create_data_loader(parquet_paths, scaler, seq_length=3, batch_size=2)
    for x, y in dataset.take(1):
        print(f"create_data_loader: x shape {x.shape}, y shape {y.shape}")

    # Build the configured model once and print its summary.
    config = load_config('model_config.yml')
    model = build_model_from_config(seq_length=3, cfg=config)
    model.summary()
    logger.info("Standalone tests completed successfully.")
components/model/training.py ADDED
@@ -0,0 +1,194 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import logging
3
+ import pandas as pd
4
+ import numpy as np
5
+ from sklearn.preprocessing import MinMaxScaler
6
+ import pickle
7
+ from datetime import datetime, timezone, timedelta
8
+ import tensorflow as tf
9
+ from tensorflow import keras
10
+ from sklearn.metrics import mean_squared_error, mean_absolute_error
11
+ from typing import Dict, List
12
+ import sys
13
+
14
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
15
+ from components.utils.file_utils import load_extract_config, get_parquet_file_names
16
+ from components.model.model_utils import build_model_from_config
17
+ from components.model.data_utils import create_data_loader
18
+ from components.utils.utils import parse_timezone
19
+
20
# Module-level logging setup.
# NOTE(review): despite the original "+07:00 timezone" comment, basicConfig does
# not pin a timezone here — %Z in datefmt renders the process-local zone name.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s %(levelname)s: %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S %Z'
)
logger = logging.getLogger(__name__)
27
+
28
def train_lstm_model(**kwargs) -> Dict:
    """Train an LSTM model for BTC/USDT forecasting and save model and scaler.

    Loads every configured Parquet price file, fits a MinMaxScaler on the
    Close column, builds the model from model_config.yml, trains with
    checkpointing and early stopping, and returns metadata for downstream
    Airflow tasks.

    Args:
        kwargs: Airflow task instance arguments (not used directly here).

    Returns:
        Dict: Training metadata including model path, scaler path, timestamp,
        and the merged-dataset description.

    Raises:
        ValueError: If no Parquet data is found, or there is not enough data
            to form a single training sequence.
    """
    # Verify GPU availability; enable memory growth so TF does not pre-allocate all VRAM.
    gpus = tf.config.list_physical_devices('GPU')
    if not gpus:
        logger.warning("No GPU detected. Training on CPU, which may be slower.")
    else:
        logger.info(f"GPUs detected: {len(gpus)}. Using CUDA for training.")
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)

    # Configuration sections from configs/model_config.yml.
    cfg = load_extract_config('model_config.yml')
    model_cfg = cfg['model']  # NOTE(review): unused below; build_model_from_config reads cfg directly.
    train_cfg = cfg['training']
    data_cfg = cfg['data']
    out_cfg = cfg['output']
    ver_cfg = cfg['versioning']

    # Parse the configured offset (e.g. '+07:00') into a datetime.timezone.
    tz_offset_str = ver_cfg['timezone']  # '+07:00'
    tz = parse_timezone(tz_offset_str)

    # Timestamped artifact names; strftime('%z') yields e.g. '+0700', [:3] keeps '+07'.
    dt = datetime.now(tz)
    dt_str = dt.strftime(ver_cfg['datetime_format']) + f"-({dt.strftime('%z')[:3]})"
    model_path = os.path.join(out_cfg['checkpoints']['model_dir'], f"model_{dt_str}.h5")
    scaler_path = os.path.join(out_cfg['checkpoints']['scaler_dir'], f"scaler_{dt_str}.pkl")
    parquet_folder = load_extract_config('pipeline_config.yml')['paths']['parquet_folder']

    # Ensure output directories exist
    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    os.makedirs(os.path.dirname(scaler_path), exist_ok=True)


    # Load data: concatenate every configured Parquet file that exists on disk.
    file_names = get_parquet_file_names()
    parquet_paths = [os.path.join(parquet_folder, el) for el in file_names]
    all_df = pd.DataFrame()
    used_files = []

    for path, name in zip(parquet_paths, file_names):
        if os.path.exists(path):
            df = pd.read_parquet(path)
            logger.info(f"Loaded {path} with {len(df)} rows")
            all_df = pd.concat([all_df, df], ignore_index=True)
            clean_name = name.replace(".parquet", "").replace(".csv", "")
            used_files.append(clean_name)
        else:
            logger.warning(f"File not found: {path}")

    if all_df.empty:
        logger.error("No data loaded from Parquet files")
        raise ValueError("No data loaded from Parquet files")

    # Human-readable record of which files were merged (returned in the result dict).
    dataset_merge = " + ".join(used_files) if used_files else "none"
    logger.info(f"Dataset merged: {dataset_merge}, total rows: {len(all_df)}")

    # Fit the scaler on all Close prices. The transformed array itself is never
    # used afterwards — fit_transform is called so the scaler learns the global
    # min/max before create_data_loader applies it per file.
    scaler = MinMaxScaler()
    prices = all_df['Close'].astype(float).values.reshape(-1, 1)
    if prices.size <= data_cfg['seq_length']:
        logger.error(f"Total data size {prices.size} is insufficient for seq_length {data_cfg['seq_length']}")
        raise ValueError(f"Total data size {prices.size} is insufficient for seq_length {data_cfg['seq_length']}")
    prices_scaled = scaler.fit_transform(prices)

    # Build the streaming dataset; clamp oversized batch sizes to protect memory.
    seq_length = data_cfg['seq_length']
    batch_size = train_cfg.get('batch_size', 64)  # Default to 64 if not specified
    if batch_size > 8192:
        logger.warning(f"Batch size {batch_size} is large; reducing to 64 to avoid memory issues")
        batch_size = 64

    dataset = create_data_loader(parquet_paths, scaler, seq_length, batch_size)

    # Count sequences per file so steps_per_epoch can be exact.
    total_seqs = 0
    for path in parquet_paths:
        if os.path.exists(path):
            df = pd.read_parquet(path, columns=['Close'])
            seqs = max(0, len(df) - seq_length)
            total_seqs += seqs
            logger.info(f"File {path}: {len(df)} rows, {seqs} sequences")

    if total_seqs == 0:
        logger.error("Not enough sequences for training")
        raise ValueError("Not enough sequences for training")

    # Translate the train/val ratios into whole batch counts (ceil for the total).
    steps_total = (total_seqs + batch_size - 1) // batch_size
    train_ratio = data_cfg.get('train_ratio', 0.7)
    val_ratio = data_cfg.get('val_ratio', 0.2)
    steps_train = max(1, int(steps_total * train_ratio))
    steps_val = max(1, int(steps_total * val_ratio))
    steps_test = max(1, steps_total - steps_train - steps_val)
    logger.info(f"Dataset splits: total_steps={steps_total}, train={steps_train}, val={steps_val}, test={steps_test}")

    # Persist the fitted scaler so inference can invert the normalization.
    with open(scaler_path, 'wb') as f:
        pickle.dump(scaler, f)

    # Sequential batch-level split. NOTE(review): take/skip re-iterate the same
    # pipeline; this assumes create_data_loader yields batches in a deterministic,
    # unshuffled order — confirm, otherwise the splits could overlap.
    train_ds = dataset.take(steps_train)
    val_ds = dataset.skip(steps_train).take(steps_val)
    test_ds = dataset.skip(steps_train + steps_val)

    # Build and train model; checkpoint keeps only the best val_loss weights.
    model = build_model_from_config(seq_length, cfg)
    checkpoint_cb = keras.callbacks.ModelCheckpoint(
        model_path, save_best_only=True, monitor='val_loss', verbose=0
    )
    early_stop = keras.callbacks.EarlyStopping(
        monitor='val_loss', patience=train_cfg['patience'], restore_best_weights=True
    )

    # Log model summary
    model.summary(print_fn=lambda x: logger.info(x))

    # Train with exact steps_per_epoch
    model.fit(
        train_ds,
        epochs=train_cfg['epochs'],
        steps_per_epoch=steps_train,
        validation_data=val_ds,
        validation_steps=steps_val,
        callbacks=[checkpoint_cb, early_stop],
        verbose=2
    )



    # # Test evaluation
    # y_true, y_pred = [], []
    # for X, y in test_ds:
    #     pred = model.predict(X, verbose=0)
    #     y_true.append(y.numpy())
    #     y_pred.append(pred)
    # y_true = np.concatenate(y_true)
    # y_pred = np.concatenate(y_pred)
    # y_true_orig = scaler.inverse_transform(y_true)
    # y_pred_orig = scaler.inverse_transform(y_pred)
    # test_rmse = np.sqrt(mean_squared_error(y_true_orig, y_pred_orig))
    # test_mae = mean_absolute_error(y_true_orig, y_pred_orig)

    # logger.info(f"Test RMSE: {test_rmse:.4f}, MAE: {test_mae:.4f}")

    # Metadata consumed by downstream Airflow tasks via XCom.
    return {
        'model_path': model_path,
        'model_filename': os.path.basename(model_path),
        'scaler_path': scaler_path,
        'datetime': dt_str,
        'dataset_merge': dataset_merge,
        # 'test_rmse': float(test_rmse),
        # 'test_mae': float(test_mae)
    }
188
+
189
if __name__ == "__main__":
    # Manual entry point: run a single full training pass outside Airflow.
    logger.info("Running standalone training test")
    training_meta = train_lstm_model()
    print("Training completed successfully!")
    print(training_meta)
    logger.info("Standalone training run completed")
components/old-process_data.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # scripts/process_data.py
2
+ from pyspark.sql import SparkSession
3
+ from pyspark.sql.types import StructType, StructField, LongType, DoubleType, IntegerType
4
+ from pyspark.sql.functions import col, row_number, floor, first, max, min, last, sum
5
+ from pyspark.sql.window import Window
6
+ import os
7
+ import sys
8
+ import shutil
9
+ import pandas as pd
10
+
11
+ # Add the project root directory to the Python path
12
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
13
+ from minio_api.client import sign_in
14
+ from minio_api.minio_utils import get_minio_data
15
+
16
def initialize_spark_session(app_name="MinIO to Spark DataFrame",
                             driver_memory="4g", executor_memory="4g"):
    """Create (or reuse) a SparkSession with the given app name and memory settings."""
    builder = (
        SparkSession.builder
        .appName(app_name)
        .config("spark.driver.memory", driver_memory)
        .config("spark.executor.memory", executor_memory)
    )
    return builder.getOrCreate()
23
+
24
def create_dataframe_from_csv(spark, csv_lines, temp_parquet_path="temp/temp_parquet_chunks",
                              chunk_size=int(3e+6)):
    """Parse raw Binance kline CSV lines into a Spark DataFrame via chunked Parquet staging.

    Args:
        spark: Active SparkSession.
        csv_lines: List of CSV text lines (optionally with a header row).
        temp_parquet_path (str): Directory used to stage chunked Parquet output.
        chunk_size (int): Number of CSV lines parsed per chunk (default 3M).

    Returns:
        Spark DataFrame read back from the staged Parquet directory.
    """
    # NOTE(review): this directory is created here and then deleted again below
    # before any chunk is written — confirm the makedirs is actually needed.
    os.makedirs(temp_parquet_path, exist_ok=True)
    # Binance 1s kline column layout.
    schema = StructType([
        StructField("Open time", LongType(), True),
        StructField("Open", DoubleType(), True),
        StructField("High", DoubleType(), True),
        StructField("Low", DoubleType(), True),
        StructField("Close", DoubleType(), True),
        StructField("Volume", DoubleType(), True),
        StructField("Close time", LongType(), True),
        StructField("Quote asset volume", DoubleType(), True),
        StructField("Number of trades", IntegerType(), True),
        StructField("Taker buy base asset volume", DoubleType(), True),
        StructField("Taker buy quote asset volume", DoubleType(), True),
        StructField("Ignore", IntegerType(), True)
    ])

    # Drop the header row if the file ships with one.
    if csv_lines and csv_lines[0].startswith("Open time,"):
        data_lines = csv_lines[1:]
    else:
        data_lines = csv_lines

    # Start from a clean staging directory so stale chunks are not appended to.
    if os.path.exists(temp_parquet_path):
        shutil.rmtree(temp_parquet_path)

    # Parse in chunks to bound driver memory: parallelize the raw lines, let
    # Spark's CSV reader apply the schema, and append each chunk as Parquet.
    for i in range(0, len(data_lines), chunk_size):
        chunk = data_lines[i:i + chunk_size]
        rdd_chunk = spark.sparkContext.parallelize(chunk).repartition(8)
        df_chunk = spark.read.schema(schema).csv(rdd_chunk, header=False)
        df_chunk.write.mode("append").parquet(temp_parquet_path)

    # Return one DataFrame spanning all staged chunks.
    return spark.read.parquet(temp_parquet_path)
57
+
58
def resample_dataframe(df, track_each=3600):
    """Aggregate every `track_each` consecutive rows (by "Open time" order) into one OHLC bar."""
    ohlc_cols = ["Open time", "Open", "High", "Low", "Close", "Number of trades"]
    ordered = Window.orderBy("Open time")
    # Assign each row to a fixed-size bucket of `track_each` rows.
    bucketed = (
        df.select(ohlc_cols)
        .withColumn("row_number", row_number().over(ordered))
        .withColumn("group_id", floor((col("row_number") - 1) / track_each))
    )
    # Standard OHLC aggregation per bucket: first open, max high, min low, last close.
    aggregated_df = bucketed.groupBy("group_id").agg(
        first("Open time").alias("Open time"),
        first("Open").alias("Open"),
        max("High").alias("High"),
        min("Low").alias("Low"),
        last("Close").alias("Close"),
        sum("Number of trades").alias("Number of trades"),
    )
    return aggregated_df.select(*ohlc_cols)
74
+
75
def process_financial_data(bucket_name="minio-ngrok-bucket", file_name="BTCUSDT-1s-2025-09.csv",
                           temp_parquet_path="temp/temp_parquet_chunks",
                           output_parquet_path="temp/aggregated_output"):
    """Fetch one kline CSV from MinIO, aggregate it to OHLC bars with Spark, save as Parquet.

    Args:
        bucket_name (str): MinIO bucket containing the CSV object.
        file_name (str): CSV object name to fetch.
        temp_parquet_path (str): Staging directory used while parsing the CSV.
        output_parquet_path (str): Directory where the aggregated Parquet is written.

    Returns:
        str: Path of the aggregated Parquet output directory.

    Raises:
        FileNotFoundError: If the output Parquet directory was not created.
    """
    minio_client = sign_in()
    spark = initialize_spark_session()

    try:
        csv_lines = get_minio_data(minio_client, bucket_name, file_name)
        print(f"Fetched CSV data from MinIO: {len(csv_lines)} lines")
        df = create_dataframe_from_csv(spark, csv_lines, temp_parquet_path)
        print("Created Spark DataFrame from CSV data.")
        aggregated_df = resample_dataframe(df)
        print("Resampled DataFrame with OHLC aggregations.")

        # Save aggregated DataFrame to a temporary Parquet directory
        os.makedirs(os.path.dirname(output_parquet_path), exist_ok=True)
        aggregated_df.write.mode("overwrite").parquet(output_parquet_path)
        print(f"Saved aggregated DataFrame to {output_parquet_path}")

        # Verify that the Parquet directory exists
        if not os.path.exists(output_parquet_path) or not os.path.isdir(output_parquet_path):
            raise FileNotFoundError(f"Parquet directory {output_parquet_path} was not created or is not a directory.")
        else:
            print(f"Verified: Parquet directory exists at {output_parquet_path}")

        return output_parquet_path

    except Exception as e:
        print(f"Error in process_financial_data: {e}")
        raise
    finally:
        # Always release the Spark resources, even on failure.
        spark.stop()
107
+
108
if __name__ == "__main__":
    # Example usage: run the full MinIO -> Spark -> Parquet pipeline once.
    result_path = process_financial_data()
    print(result_path)
components/process_data.py ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pyspark.sql import SparkSession
2
+ from pyspark.sql.types import StructType, StructField, LongType, DoubleType, IntegerType
3
+ from pyspark.sql.functions import col, row_number, floor, first, max, min, last, sum
4
+ from pyspark.sql.window import Window
5
+
6
+ import os
7
+ import sys
8
+ import shutil
9
+ import pandas as pd
10
+ import ast
11
+ import io
12
+
13
+ # Add the project root directory to the Python path
14
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
15
+ from minio_api.minio_utils import get_minio_data
16
+ from minio_api.client import sign_in
17
+
18
+
19
def initialize_spark_session(app_name="MinIO to Spark DataFrame",
                             driver_memory="4g", executor_memory="4g"):
    """Return a SparkSession configured with the requested driver/executor memory."""
    session_builder = (
        SparkSession.builder
        .appName(app_name)
        .config("spark.driver.memory", driver_memory)
        .config("spark.executor.memory", executor_memory)
    )
    return session_builder.getOrCreate()
26
+
27
def create_dataframe_from_csv(spark, parquet_file_path, schema, temp_parquet_path="temp/temp_parquet_chunks",
                              chunk_size=int(3e+6)):
    """Load a Parquet file with an explicit schema, staged through a temp directory.

    NOTE(review): despite the legacy name, this reads Parquet (not CSV); the
    write-then-read round trip through temp_parquet_path only mirrors the old
    CSV-chunking flow, and `chunk_size` is unused here — confirm both are still
    wanted before relying on them.

    Args:
        spark: Active SparkSession.
        parquet_file_path (str): Source Parquet file to load.
        schema: StructType applied while reading.
        temp_parquet_path (str): Staging directory (cleared on every call).
        chunk_size (int): Unused; kept for signature compatibility with the CSV version.

    Returns:
        Spark DataFrame read back from the staging directory.
    """
    os.makedirs(temp_parquet_path, exist_ok=True)

    # Clear the temporary Parquet path if it exists
    if os.path.exists(temp_parquet_path):
        shutil.rmtree(temp_parquet_path)

    # Read Parquet file directly with Spark, applying the schema
    df = spark.read.schema(schema).parquet(parquet_file_path)

    # Write the DataFrame to the temporary Parquet path (for consistency with original logic)
    df.write.mode("append").parquet(temp_parquet_path)

    # Read back the Parquet data from the temporary path
    return spark.read.parquet(temp_parquet_path)
43
+
44
def resample_dataframe(df, track_each=3600):
    """Collapse each run of `track_each` consecutive rows into a single OHLC bar."""
    bar_cols = ["Open time", "Open", "High", "Low", "Close", "Number of trades"]
    time_order = Window.orderBy("Open time")
    # Bucket rows positionally: 3600 one-second rows -> one hourly bar by default.
    grouped = (
        df.select(bar_cols)
        .withColumn("row_number", row_number().over(time_order))
        .withColumn("group_id", floor((col("row_number") - 1) / track_each))
    )
    aggregated_df = grouped.groupBy("group_id").agg(
        first("Open time").alias("Open time"),
        first("Open").alias("Open"),
        max("High").alias("High"),
        min("Low").alias("Low"),
        last("Close").alias("Close"),
        sum("Number of trades").alias("Number of trades"),
    )
    return aggregated_df.select(*bar_cols)
60
+
61
def extract_from_minio(bucket_name="minio-ngrok-bucket",
                       file_names=None):
    """Download CSV kline files from MinIO and persist each as a local Parquet file.

    Args:
        bucket_name (str): MinIO bucket to read from.
        file_names (list[str] | None): CSV object names to fetch. Defaults to
            ["BTCUSDT-1s-2025-09.csv"] when omitted.

    Returns:
        list[str]: Local paths of the Parquet files written under temp/extracted_from_minio.

    Raises:
        ValueError: If MinIO returns no data for a requested file.
    """
    # BUG FIX: the default was a mutable list literal (shared across calls);
    # resolve the default per call instead.
    if file_names is None:
        file_names = ["BTCUSDT-1s-2025-09.csv"]

    minio_client = sign_in()
    out_parquet_file_paths = []
    # Binance kline CSV column layout.
    headers = [
        "Open time", "Open", "High", "Low", "Close", "Volume",
        "Close time", "Quote asset volume", "Number of trades",
        "Taker buy base asset volume", "Taker buy quote asset volume", "Ignore"
    ]

    for file_name in file_names:
        csv_lines = get_minio_data(minio_client, bucket_name, file_name)
        if not csv_lines:
            raise ValueError(f"No data retrieved from MinIO for bucket {bucket_name}, file {file_name}")
        temp_parquet_path = f"temp/extracted_from_minio/{os.path.splitext(os.path.basename(file_name))[0]}.parquet"
        os.makedirs(os.path.dirname(temp_parquet_path), exist_ok=True)

        # Parse the CSV text; `names=` assumes the file has no header row.
        # NOTE(review): a file that ships with a header line would be read as
        # data here — confirm the upstream format.
        df = pd.read_csv(io.StringIO('\n'.join(csv_lines)), names=headers)
        df.to_parquet(temp_parquet_path, index=False)

        out_parquet_file_paths.append(temp_parquet_path)

    return out_parquet_file_paths
85
+
86
def transform_financial_data(parquet_file_paths,
                             temp_parquet_path="temp/temp_parquet_chunks",
                             output_parquet_path="temp/aggregated_output"):
    """Aggregate raw 1s kline Parquet files into OHLC bars and append them to one output directory.

    Args:
        parquet_file_paths: List of local Parquet paths, or a string repr of such
            a list (as delivered through Airflow XCom).
        temp_parquet_path (str): Staging directory used while loading each file.
        output_parquet_path (str): Directory receiving the aggregated Parquet output.

    Returns:
        str: The output Parquet directory path (single value — the tuple return
        seen in older callers was removed).

    Raises:
        ValueError: If a string input cannot be parsed back into a list.
        FileNotFoundError: If the output directory was not created.
    """
    spark = initialize_spark_session()

    try:
        # Define the schema (Binance kline layout; counts widened to LongType).
        schema = StructType([
            StructField("Open time", LongType(), True),
            StructField("Open", DoubleType(), True),
            StructField("High", DoubleType(), True),
            StructField("Low", DoubleType(), True),
            StructField("Close", DoubleType(), True),
            StructField("Volume", DoubleType(), True),
            StructField("Close time", LongType(), True),
            StructField("Quote asset volume", DoubleType(), True),
            StructField("Number of trades", LongType(), True),
            StructField("Taker buy base asset volume", DoubleType(), True),
            StructField("Taker buy quote asset volume", DoubleType(), True),
            StructField("Ignore", LongType(), True)
        ])

        # output_parquet_paths = []
        # Airflow XCom may deliver the list serialized as a string; parse it back.
        if isinstance(parquet_file_paths, str):
            try:
                parquet_file_paths = ast.literal_eval(parquet_file_paths)
            except (ValueError, SyntaxError) as e:
                raise ValueError(f"Failed to parse server_files as a list: {parquet_file_paths}, error: {e}")

        for parquet_file_path in parquet_file_paths:
            # Create DataFrame using create_dataframe_from_csv
            df = create_dataframe_from_csv(spark, parquet_file_path, schema, temp_parquet_path)
            print("Created Spark DataFrame from CSV file.")
            aggregated_df = resample_dataframe(df)
            print("Resampled DataFrame with OHLC aggregations.")

            # Save aggregated DataFrame to a temporary Parquet directory.
            # NOTE(review): append mode means repeated runs accumulate rows in
            # output_parquet_path (it is never cleared here) — confirm intended.
            os.makedirs(os.path.dirname(output_parquet_path), exist_ok=True)
            # aggregated_df.write.mode("overwrite").parquet(output_parquet_path)
            aggregated_df.write.mode("append").parquet(output_parquet_path)
            print(f"Saved aggregated DataFrame to {output_parquet_path}")

            # Verify that the Parquet directory exists
            if not os.path.exists(output_parquet_path) or not os.path.isdir(output_parquet_path):
                raise FileNotFoundError(f"Parquet directory {output_parquet_path} was not created or is not a directory.")
            else:
                print(f"Verified: Parquet directory exists at {output_parquet_path}")
            # output_parquet_paths.append(output_parquet_path)

        # name_output_parquet_paths = [os.path.basename(path) for path in output_parquet_paths]

        return output_parquet_path#, name_output_parquet_paths

    except Exception as e:
        print(f"Error in transform_financial_data: {e}")
        raise
    finally:
        # Always release the Spark resources, even on failure.
        spark.stop()
144
+
145
if __name__ == "__main__":
    # Example usage: pull CSVs from MinIO, then aggregate them with Spark.
    extracted_parquet_paths = extract_from_minio()
    # BUG FIX: transform_financial_data returns a single output directory path
    # (the `#, name_output_parquet_paths` in its return shows the tuple was
    # removed) — unpacking it into two names raised ValueError at runtime.
    output_parquet_path = transform_financial_data(extracted_parquet_paths)
    print(output_parquet_path)
components/utils/__init__.py ADDED
File without changes
components/utils/file_utils.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import yaml
3
+ import logging
4
+ from typing import Dict, List
5
+
6
# Configure module-wide logging once at import time.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
8
+
9
def load_extract_config(config_name: str) -> Dict:
    """Load a YAML configuration file from the configs directory.

    Args:
        config_name (str): Name of the config file (e.g., 'model_config.yml').

    Returns:
        Dict: Parsed configuration dictionary (empty dict if the file is empty).

    Raises:
        FileNotFoundError: If the configuration file does not exist.
        ValueError: If config_name is empty or not a string.
        yaml.YAMLError: If the file content is not valid YAML.
    """
    if not isinstance(config_name, str) or not config_name.strip():
        raise ValueError("config_name must be a non-empty string")

    # configs/ lives two levels above this module (components/utils/ -> project root).
    config_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..', 'configs', config_name))
    logging.debug(f"Attempting to load config from: {config_path}")

    if not os.path.exists(config_path):
        logging.error(f"Configuration file not found: {config_path}")
        raise FileNotFoundError(f"Configuration file not found: {config_path}")

    try:
        # Explicit encoding so YAML decodes identically regardless of platform default.
        with open(config_path, 'r', encoding='utf-8') as f:
            config = yaml.safe_load(f)
        if config is None:
            logging.warning(f"Configuration file {config_name} is empty")
            return {}
        logging.info(f"Successfully loaded config: {config_name}")
        return config
    except yaml.YAMLError as e:
        logging.error(f"Failed to parse YAML in {config_name}: {e}")
        raise
43
+
44
def get_parquet_file_names() -> List[str]:
    """Derive Parquet file names from the CSV entries in extract_data.yml.

    Returns:
        List[str]: Parquet file names, one per configured CSV file.

    Raises:
        FileNotFoundError: If extract_data.yml is missing.
        ValueError: If no files are specified in the configuration.
    """
    csv_files = load_extract_config('extract_data.yml').get('files', [])
    if not csv_files:
        logging.error("No files specified in extract_data.yml")
        raise ValueError("No files specified in extract_data.yml")

    # Same base names, with the extension swapped to .parquet.
    parquet_files = [name.replace(".csv", ".parquet") for name in csv_files]
    logging.debug(f"Derived Parquet file names: {parquet_files}")
    return parquet_files
63
+
64
def load_pipeline_config() -> Dict:
    """Load pipeline configuration from pipeline_config.yml.

    Returns:
        Dict: Pipeline configuration dictionary.

    Raises:
        FileNotFoundError: If pipeline_config.yml is missing.
    """
    pipeline_cfg = load_extract_config('pipeline_config.yml')
    logging.debug(f"Pipeline config loaded: {pipeline_cfg}")
    return pipeline_cfg
76
+
77
def define_server_filenames(**kwargs) -> List[str]:
    """Extract base filenames from client file paths using Airflow XCom.

    Args:
        kwargs: Airflow task instance arguments containing 'ti' for XCom.

    Returns:
        List[str]: Base filenames with directory components stripped.

    Raises:
        KeyError: If 'ti' is not provided in kwargs.
        ValueError: If no files are retrieved from XCom.
    """
    if 'ti' not in kwargs:
        logging.error("Task instance 'ti' not provided in kwargs")
        raise KeyError("Task instance 'ti' not provided in kwargs")

    pulled = kwargs['ti'].xcom_pull(task_ids='download_binance_csv')
    if pulled is None:
        logging.error("No files retrieved from XCom for task 'download_binance_csv'")
        raise ValueError("No files retrieved from XCom for task 'download_binance_csv'")

    # XCom may yield a single path or a list of paths; normalize to a list.
    client_files = pulled if isinstance(pulled, list) else [pulled]

    server_files = [os.path.basename(path) for path in client_files]
    logging.debug(f"Extracted server filenames: {server_files}")
    return server_files
components/utils/utils.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datetime import timezone, timedelta
2
+ import re
3
+
4
def parse_timezone(tz_offset_str):
    """
    Parse a timezone offset string (e.g., '+07:00') and return a timezone object.

    Args:
        tz_offset_str (str): Timezone offset in format '[+-]HH:MM'

    Returns:
        timezone: A datetime.timezone object with the specified offset

    Raises:
        ValueError: If the timezone format is invalid
    """
    # BUG FIX: use fullmatch (not match) so trailing garbage such as
    # '+07:00junk' is rejected instead of silently accepted.
    match = re.fullmatch(r'([+-])(\d{2}):(\d{2})', tz_offset_str)
    if not match:
        raise ValueError(f"Invalid timezone format: {tz_offset_str}")

    sign, hours, minutes = match.groups()
    hours, minutes = int(hours), int(minutes)
    # Negate both components so e.g. '-07:30' becomes -(7h30m), not -7h + 30m.
    if sign == '-':
        hours, minutes = -hours, -minutes

    return timezone(timedelta(hours=hours, minutes=minutes))
configs/data_limit.yml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ - name: BTCUSDT-1s
2
+ limit:
3
+ - "2025-08"
4
+ - "2025-09"
configs/data_sources.yml ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ - name: BTCUSDT-1s
2
+ url: https://data.binance.vision/data/spot/monthly/klines/BTCUSDT/1s/
configs/delete_lstm_hyperparams.yml ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # LSTM / BiLSTM / GRU / Custom Model Hyperparameters
2
+ architecture: bilstm # Options: lstm, bilstm, gru, custom
3
+ seq_length: 60 # Number of time steps to look back
4
+ units: 100
5
+ layers: 2
6
+ dropout: 0.2
7
+ activation: tanh
8
+
9
+ optimizer: adam
10
+ loss: mse
11
+ epochs: 50
12
+ batch_size: 64
13
+ patience: 10
14
+
15
+ train_ratio: 0.8
16
+ val_ratio: 0.1
17
+ test_ratio: 0.1
configs/extract_data.yml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ files:
2
+ - BTCUSDT-1s-2025-08.csv
3
+ - BTCUSDT-1s-2025-09.csv
4
+ storage_folder: temp/extracted_from_minio
configs/model_config.yml ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # =============================================================================
2
+ # Unified Model & Output Configuration
3
+ # =============================================================================
4
+
5
+ model:
6
+ architecture: bilstm # lstm | bilstm | gru | custom
7
+ units: 100 #100
8
+ layers: 3 #2
9
+ dropout: 0.2
10
+ activation: tanh
11
+ optimizer: adam
12
+ learning_rate: 0.0005
13
+ loss: mse
14
+
15
+ training:
16
+ epochs: 20
17
+ batch_size: 6144
18
+ patience: 5
19
+
20
+ evaluation:
21
+ eval_batch_size: 6144
22
+
23
+ data:
24
+ seq_length: 60
25
+ train_ratio: 0.8
26
+ val_ratio: 0.1
27
+
28
+ output:
29
+ checkpoints:
30
+ model_dir: ckpts
31
+ scaler_dir: ckpts
32
+
33
+ metrics:
34
+ metrics_dir: evaluation
35
+
36
+ predictions:
37
+ pred_dir: evaluation
38
+
39
+ versioning:
40
+ datetime_format: '%Y-%m-%d-%H-%M-%S'
41
+ timezone: '+07:00'
configs/pipeline_config.yml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ minio:
2
+ bucket_name: minio-ngrok-bucket
3
+ paths:
4
+ temp_parquet_path: temp/temp_parquet_chunks
5
+ output_parquet_path: temp/aggregated_output
6
+ duckdb_path: duckdb_databases/financial_data.db
7
+ output_csv_path: analytics/financial_data.csv
8
+ parquet_folder: temp/extracted_from_minio
docs/data_sources.md ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Data Source: Binance BTCUSDT 1s Kline
2
+
3
+ - **Source:** [Binance Data Public](https://data.binance.vision/?prefix=data/spot/monthly/klines/BTCUSDT/1s/)
4
+ - **Example file:** `BTCUSDT-1s-2024-05.csv.gz`
5
+ - **Columns:**
6
+ 1. Open time
7
+ 2. Open
8
+ 3. High
9
+ 4. Low
10
+ 5. Close
11
+ 6. Volume
12
+ 7. Close time
13
+ 8. Quote asset volume
14
+ 9. Number of trades
15
+ 10. Taker buy base asset volume
16
+ 11. Taker buy quote asset volume
17
+ 12. Ignore
18
+
19
+ ## Usage
20
+
21
+ - The pipeline downloads and processes this data for deep learning time series forecasting.
22
+ - The DAG can be modified to use a different month by changing the filename in the download step.
23
+
24
+ ## Acknowledgment
25
+ You can refer to https://github.com/binance/binance-public-data?tab=readme-ov-file#klines to get more information
26
+
docs/dependencies.md ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Dependencies
2
+
3
+ ## System
4
+
5
+ - Hadoop/HDFS
6
+ - Spark
7
+ - Airflow
8
+ - Python 3.8+
9
+
10
+ ## Python
11
+
12
+ Install with pip:
13
+
14
+ ```bash
15
+ pip install pandas numpy scikit-learn tensorflow
16
+ ```
17
+
18
+ ## Notes
19
+
20
+ - Ensure Java is installed for Hadoop/Spark.
21
+ - Airflow and Hadoop should be configured and running before triggering the DAG.
22
+ - If using a dev container, dependencies may already be installed.
docs/frameworks_installation.md ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## Spark
2
+ ### Install Java 8 (required for Spark)
3
+ !apt-get update -qq
4
+ !apt-get install openjdk-8-jdk-headless -qq > /dev/null
5
+
6
+ ### Download and extract Spark (use the latest version; this is 3.5.6 with Hadoop 3)
7
+ !wget -q https://downloads.apache.org/spark/spark-3.5.6/spark-3.5.6-bin-hadoop3.tgz
8
+ !tar xf spark-3.5.6-bin-hadoop3.tgz
9
+
10
+ ### Install PySpark and findspark (helps locate Spark)
11
+ !pip install -q pyspark findspark duckdb # duckdb for your script
12
+
13
+ ### Set environment variables
14
+ import os
15
+ os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
16
+ os.environ["SPARK_HOME"] = "/content/spark-3.5.6-bin-hadoop3"
17
+
18
+ ### Initialize findspark
19
+ import findspark
20
+ findspark.init()
21
+
22
+
23
+ ## Hadoop
24
+
25
+ !wget https://downloads.apache.org/hadoop/common/hadoop-3.4.2/hadoop-3.4.2.tar.gz
26
+ !tar -xzvf hadoop-3.4.2.tar.gz && cp -r hadoop-3.4.2/ /usr/local/
27
+
28
+
29
+ JAVA_HOME = !readlink -f /usr/bin/java | sed "s:bin/java::"
30
+ java_home_text = JAVA_HOME[0]
31
+ java_home_text_command = f"$ {JAVA_HOME[0]} "
32
+ !echo export JAVA_HOME=$java_home_text >>/usr/local/hadoop-3.4.2/etc/hadoop/hadoop-env.sh
33
+
34
+ # Set environment variables
35
+ import os
36
+ os.environ['HADOOP_HOME']="/usr/local/hadoop-3.4.2"
37
+ os.environ['JAVA_HOME']=java_home_text
38
+
39
+ !alias hadoop="/usr/local/hadoop-3.4.2/bin/hadoop"
40
+ !alias hdfs="/usr/local/hadoop-3.4.2/bin/hdfs"
41
+ !source ~/.bashrc # or source ~/.zshrc
42
+ !sudo ln -s /usr/local/hadoop-3.4.2/bin/hadoop /usr/local/bin/hadoop
43
+ !sudo ln -s /usr/local/hadoop-3.4.2/bin/hdfs /usr/local/bin/hdfs
44
+ !hadoop
45
+ !hdfs
46
+ ## Airflow
47
+
48
+ pip install apache-airflow
49
+
50
+ airflow db init
51
+
52
+ airflow webserver -p 8080 &
53
+ airflow scheduler &
54
+
55
+ ## Ngrok
56
+
57
+ ## MinIO
58
+ ### Client
59
+ ```bash
60
+ pip install minio
61
+ ```
62
+ ### Server
63
+ # Install MinIO binary
64
+ !wget https://dl.min.io/server/minio/release/linux-amd64/minio
65
+ !chmod +x minio
66
+ !mkdir -p ~/minio-data
67
+
68
+ import os
69
+ os.environ['MINIO_ROOT_USER'] = 'username'
70
+ os.environ['MINIO_ROOT_PASSWORD'] = 'username_password'
71
+
72
+ !./minio server ~/minio-data --address ":12390" --console-address ":12391" &
docs/install_airflow.md ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Installing and Setting Up Apache Airflow
2
+
3
+ This guide provides detailed instructions for installing and configuring Apache Airflow with support for asynchronous tasks, Celery, PostgreSQL, and Kubernetes. The steps below ensure a proper setup for running Airflow, initializing its database, creating an admin user, and starting the scheduler and webserver. This setup is suitable for a local development environment or a scalable production setup with the specified backends.
4
+
5
+ ## Prerequisites
6
+ Before proceeding, ensure you have the following:
7
+ - **Python 3.12**: Airflow 2.10.3 is compatible with Python 3.12, as specified in the constraint file.
8
+ - **pip**: The Python package manager to install Airflow and its dependencies.
9
+ - **PostgreSQL**: If using PostgreSQL as the metadata database (recommended for production).
10
+ - **Celery**: For distributed task execution (optional, included in the installation).
11
+ - **Kubernetes**: For running Airflow in a Kubernetes cluster (optional, included in the installation).
12
+ - **Sufficient permissions**: To create directories and run background processes.
13
+ - **Virtual environment** (recommended): To isolate dependencies. Create one with:
14
+ ```bash
15
+ python -m venv venv
16
+ source venv/bin/activate # On Windows: venv\Scripts\activate
17
+ ```
18
+
19
+ ## Installation Steps
20
+
21
+ ### 1. Install Apache Airflow
22
+ Install Apache Airflow version 2.10.3:
23
+
24
+ ```bash
25
+ pip install apache-airflow==2.10.3
26
+ ```
27
+
28
+ ### 2. Set Up the Airflow Home Directory
29
+ Airflow requires a home directory to store its configuration, logs, and DAGs. The following Python script sets the `AIRFLOW_HOME` environment variable and creates the directory if it doesn't exist.
30
+
31
+ ```python
32
+ import os
33
+ import time
34
+
35
+ # Ensure environment
36
+ os.environ['AIRFLOW_HOME'] = '<your_project_path>/airflow'
37
+ os.makedirs('airflow', exist_ok=True)
38
+ ```
39
+
40
+ Replace `<your_project_path>` with the absolute path to your project directory (e.g., `/home/user/BTC-USDT-ETL-Pipeline`). For example:
41
+ ```python
42
+ os.environ['AIRFLOW_HOME'] = '/home/user/BTC-USDT-ETL-Pipeline/airflow'
43
+ ```
44
+
45
+ This script ensures the `airflow` directory is created in your project path to store Airflow's configuration files, logs, and SQLite database (if not using PostgreSQL).
46
+
47
+ ### 3. Initialize the Airflow Database
48
+ Initialize the Airflow metadata database, which stores DAG runs, task instances, and other metadata. This step is required before starting the scheduler or webserver.
49
+
50
+ ```bash
51
+ # Initialize the metadata database (safe to re-run; upgrades the schema in place)
52
+ airflow db init
53
+ ```
54
+
55
+ **Note**:
56
+ - This command creates a default `airflow.cfg` configuration file in `AIRFLOW_HOME`.
57
+ - If using PostgreSQL, ensure the database is running and update the `sql_alchemy_conn` in `airflow.cfg` to point to your PostgreSQL instance (e.g., `postgresql+psycopg2://user:password@localhost:5432/airflow`).
58
+ - Running `airflow db init` is safe to repeat: it initializes or upgrades the metadata schema and does not touch any DAG files in the `dags` folder. To wipe the metadata database entirely, use `airflow db reset` instead.
59
+
60
+ ### 4. Create an Admin User
61
+ The Airflow webserver requires at least one admin user for login. Create an admin user with the following command:
62
+
63
+ ```bash
64
+ # Create admin user (critical—webserver needs this for login)
65
+ airflow users create \
66
+ --username admin \
67
+ --firstname Admin \
68
+ --lastname User \
69
+ --role Admin \
70
+ --email admin@example.com \
71
+ --password admin
72
+ ```
73
+
74
+ This command creates a user with:
75
+ - Username: `admin`
76
+ - Password: `admin` (change this in production for security)
77
+ - Role: `Admin` (grants full access to the Airflow UI)
78
+
79
+ To verify the user was created successfully, list all users:
80
+
81
+ ```bash
82
+ # Verify user creation
83
+ airflow users list
84
+ ```
85
+
86
+ ### 5. Start the Airflow Scheduler
87
+ The scheduler is responsible for scheduling and executing DAGs. Start it in the background using `nohup` to ensure it continues running.
88
+
89
+ ```bash
90
+ # Start scheduler first (it needs DB)
91
+ nohup airflow scheduler > airflow/scheduler.log 2>&1 &
92
+ ```
93
+
94
+ **Notes**:
95
+ - The scheduler requires the database to be initialized first.
96
+ - Logs are redirected to `scheduler.log` in the specified directory.
97
+ - Replace `airflow` with your `AIRFLOW_HOME` path if different.
98
+
99
+ ### 6. Start the Airflow Webserver
100
+ The webserver provides the Airflow UI for managing DAGs, viewing task logs, and monitoring runs. Start it on port 8081 (or another port if needed).
101
+
102
+ ```bash
103
+ airflow webserver --port 8081 > airflow/airflow.log 2>&1 &
104
+ ```
105
+
106
+ **Notes**:
107
+ - The webserver runs on `http://localhost:8081` by default.
108
+ - Logs are redirected to `airflow.log` in the `AIRFLOW_HOME` directory.
109
+ - Access the UI by navigating to `http://localhost:8081` in your browser and logging in with the admin credentials (username: `admin`, password: `admin`).
110
+
111
+ ## Additional Notes
112
+ - **Configuration**: After running `airflow db init`, review and modify `airflow.cfg` in the `AIRFLOW_HOME` directory to customize settings (e.g., executor type, database connection, or Celery broker).
113
+ - **Celery Setup**: If using the Celery executor, ensure a message broker (e.g., Redis or RabbitMQ) is running and configured in `airflow.cfg`.
114
+ - **Kubernetes Executor**: For Kubernetes, configure the Kubernetes executor in `airflow.cfg` and ensure your Kubernetes cluster is accessible.
115
+ - **Security**: Change the default admin password and secure the database connection in production environments.
116
+ - **Logs**: Check `scheduler.log` and `airflow.log` for troubleshooting.
117
+
118
+ ## Next Steps
119
+ - Place your DAGs in the `AIRFLOW_HOME/dags` folder to start defining workflows.
120
+ - Explore the Airflow UI to monitor and manage your DAGs.
121
+ - Refer to the [Apache Airflow documentation](https://airflow.apache.org/docs/apache-airflow/stable/) for advanced configurations.
docs/install_minio_server.md ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # MinIO Server Setup Guide
2
+
3
+ This guide provides step-by-step instructions to set up and run a MinIO server on a Linux system.
4
+
5
+ ## Prerequisites
6
+ - Python 3.x installed
7
+ - Required Python package: `python-dotenv` (install using `pip install python-dotenv`); the `wget` command-line utility for downloading the MinIO binary
8
+ - A `minio.env` environment file
9
+ - Administrative privileges for file permissions and port usage
10
+ - Free ports for MinIO API (default: 9000) and WebUI (default: 9001)
11
+
12
+ ## Setup Instructions
13
+
14
+ ### 1. Download and Prepare MinIO Binary
15
+ Run the following commands to download the MinIO server binary and set up the data directory:
16
+
17
+ ```bash
18
+ wget https://dl.min.io/server/minio/release/linux-amd64/minio
19
+ chmod +x minio
20
+ mkdir -p ~/minio-data
21
+ mkdir -p ~/minio-logs
22
+ ```
23
+
24
+ ### 2. Configure Environment Variables
25
+ Create a `minio.env` file in the same directory as the MinIO binary with the following content, replacing placeholders with your desired values:
26
+
27
+ ```
28
+ MINIO_ROOT_USER=<your_username>
29
+ MINIO_ROOT_PASSWORD=<your_password>
30
+ MINIO_HOST=localhost:<minio_port>
31
+ MINIO_CONSOLE_ADDRESS=localhost:<minio_web_port>
32
+ ```
33
+
34
+ - `<your_username>`: Choose a secure username for the MinIO admin account.
35
+ - `<your_password>`: Use a strong password (at least 8 characters).
36
+ - Ensure ports `<minio_port>` (API) and `<minio_web_port>` (WebUI) are free, or update them to available ports.
37
+
38
+ ### 3. Start the MinIO Server
39
+ Run the following command to start the MinIO server in the background, using the environment variables from `minio.env`:
40
+
41
+ ```bash
42
+ export MINIO_ROOT_USER=<your_username>
43
+ export MINIO_ROOT_PASSWORD=<your_password>
44
+ MINIO_ROOT_USER=$MINIO_ROOT_USER MINIO_ROOT_PASSWORD=$MINIO_ROOT_PASSWORD \
45
+ ./minio server ~/minio-data --address :<minio_port> --console-address :<minio_web_port> > ~/minio-logs/minio_server.log 2>&1 &
46
+ ```
47
+
48
+ - This command exports environment variables and starts the MinIO server.
49
+ - Logs are saved to `~/minio-logs/minio_server.log` for troubleshooting.
50
+
51
+ ### 4. Access MinIO
52
+ - **API Access**: Connect to `http://localhost:<minio_port>` for programmatic access.
53
+ - **WebUI Access**: Open `http://localhost:<minio_web_port>` in a browser and log in with `<your_username>` and `<your_password>`.
54
+
55
+ ## Notes
56
+ - **Stopping the Server**: To stop the MinIO server, find its process ID using `ps aux | grep minio` and terminate it with `kill <pid>`.
57
+ - **Port Conflicts**: If ports `<minio_port>` or `<minio_web_port>` are in use, modify `MINIO_ADDRESS` and `MINIO_CONSOLE_ADDRESS` in `minio.env` to use different ports.
58
+ - **Security**: Store `minio.env` securely and avoid exposing sensitive credentials.
59
+ - **Data Directory**: The `~/minio-data` directory stores MinIO buckets and objects. Ensure it has sufficient disk space.
docs/install_spark.md ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Install Apache Spark
2
+
3
+ This guide provides step-by-step instructions to download, install, and configure Apache Spark 3.5.6 with Hadoop 3 support on a Linux-based system. Apache Spark is a powerful open-source data processing engine designed for big data and machine learning workloads. The following commands will help you set up Spark and configure the environment variables to run Spark applications, including PySpark with Python 3.
4
+
5
+ ## Prerequisites
6
+ - A Linux-based operating system (e.g., Ubuntu, CentOS).
7
+ - `wget` and `tar` utilities installed.
8
+ - `sudo` privileges for moving files to system directories.
9
+ - Python 3 installed (for PySpark).
10
+
11
+ ## Installation Steps
12
+
13
+ ```bash
14
+ # Download Apache Spark 3.5.6 with Hadoop 3 support
15
+ wget https://downloads.apache.org/spark/spark-3.5.6/spark-3.5.6-bin-hadoop3.tgz
16
+
17
+ # Extract the downloaded tarball
18
+ tar -xzf spark-3.5.6-bin-hadoop3.tgz
19
+
20
+ # Move the extracted folder to /opt/spark
21
+ sudo mv spark-3.5.6-bin-hadoop3 /opt/spark
22
+
23
+ # Set environment variables for Spark
24
+ export SPARK_HOME=/opt/spark
25
+ export PATH=$SPARK_HOME/bin:$SPARK_HOME/sbin:$PATH
26
+ export PYSPARK_PYTHON=python3
27
+ ```
docs/visualize_data.md ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Analytics & Visualization
2
+
3
+ ## DuckDB
4
+
5
+ Run ad-hoc queries using python to get `output_csv_path`:
6
+
7
+ ```bash
8
+ python components/duckdb2csv.py
9
+ ```
10
+
11
+ ## Looker Studio
12
+
13
+ Upload `analytics/financial_data.csv` to Google Looker and create Report.
14
+
15
+ ## Looker
16
+ You can see report at: https://lookerstudio.google.com/reporting/d12e8138-ffdb-40ac-a7fd-9fa986464f54/page/YtFbF/edit
duckdb_databases/financial_data.db ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:20b333763c7dd6bdddd4eb0e88b7eb427faa464a213bc024fcd6cb5001e92bdb
3
+ size 536576
evaluation/.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ *.csv
logs/.gitkeep ADDED
File without changes