init
#2
by SmileXing - opened
- .env.example +0 -10
- .gitignore +0 -216
- CHANGELOG.md +0 -20
- Dockerfile +0 -19
- LICENSE +0 -201
- README.md +7 -257
- app.py +0 -15
- pyproject.toml +0 -46
- requirements.txt +0 -8
- src/leaderboard_analytics/__init__.py +0 -1
- src/leaderboard_analytics/config.py +0 -25
- src/leaderboard_analytics/db.py +0 -24
- src/leaderboard_analytics/geoip_database.py +0 -36
- src/leaderboard_analytics/main.py +0 -49
- src/leaderboard_analytics/repositories.py +0 -463
- src/leaderboard_analytics/schemas.py +0 -27
- src/leaderboard_analytics/services.py +0 -264
- src/leaderboard_analytics/ui.py +0 -481
- tests/test_geoip_database.py +0 -29
- tests/test_repositories.py +0 -95
- tests/test_schemas.py +0 -16
- tests/test_services.py +0 -110
.env.example
DELETED
|
@@ -1,10 +0,0 @@
|
|
| 1 |
-
MONGO_URI=mongodb://localhost:27017
|
| 2 |
-
MONGO_DATABASE=event_logger
|
| 3 |
-
MONGO_COLLECTION=events
|
| 4 |
-
HOST=0.0.0.0
|
| 5 |
-
PORT=7860
|
| 6 |
-
GRADIO_SHARE=false
|
| 7 |
-
GRADIO_SSR_MODE=false
|
| 8 |
-
GEOIP_DATABASE_PATH=GeoLite2-Country.mmdb
|
| 9 |
-
GEOIP_DATABASE_URL=https://cdn.jsdelivr.net/npm/geolite2-country/GeoLite2-Country.mmdb.gz
|
| 10 |
-
GEOIP_AUTO_DOWNLOAD=true
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
.gitignore
DELETED
|
@@ -1,216 +0,0 @@
|
|
| 1 |
-
# Byte-compiled / optimized / DLL files
|
| 2 |
-
__pycache__/
|
| 3 |
-
*.py[codz]
|
| 4 |
-
*$py.class
|
| 5 |
-
|
| 6 |
-
# C extensions
|
| 7 |
-
*.so
|
| 8 |
-
|
| 9 |
-
# Distribution / packaging
|
| 10 |
-
.Python
|
| 11 |
-
build/
|
| 12 |
-
develop-eggs/
|
| 13 |
-
dist/
|
| 14 |
-
downloads/
|
| 15 |
-
eggs/
|
| 16 |
-
.eggs/
|
| 17 |
-
lib/
|
| 18 |
-
lib64/
|
| 19 |
-
parts/
|
| 20 |
-
sdist/
|
| 21 |
-
var/
|
| 22 |
-
wheels/
|
| 23 |
-
share/python-wheels/
|
| 24 |
-
*.egg-info/
|
| 25 |
-
.installed.cfg
|
| 26 |
-
*.egg
|
| 27 |
-
MANIFEST
|
| 28 |
-
|
| 29 |
-
# PyInstaller
|
| 30 |
-
# Usually these files are written by a python script from a template
|
| 31 |
-
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
| 32 |
-
*.manifest
|
| 33 |
-
*.spec
|
| 34 |
-
|
| 35 |
-
# Installer logs
|
| 36 |
-
pip-log.txt
|
| 37 |
-
pip-delete-this-directory.txt
|
| 38 |
-
|
| 39 |
-
# Unit test / coverage reports
|
| 40 |
-
htmlcov/
|
| 41 |
-
.tox/
|
| 42 |
-
.nox/
|
| 43 |
-
.coverage
|
| 44 |
-
.coverage.*
|
| 45 |
-
.cache
|
| 46 |
-
nosetests.xml
|
| 47 |
-
coverage.xml
|
| 48 |
-
*.cover
|
| 49 |
-
*.py.cover
|
| 50 |
-
.hypothesis/
|
| 51 |
-
.pytest_cache/
|
| 52 |
-
.pytest_tmp/
|
| 53 |
-
cover/
|
| 54 |
-
|
| 55 |
-
# Translations
|
| 56 |
-
*.mo
|
| 57 |
-
*.pot
|
| 58 |
-
|
| 59 |
-
# Django stuff:
|
| 60 |
-
*.log
|
| 61 |
-
local_settings.py
|
| 62 |
-
db.sqlite3
|
| 63 |
-
db.sqlite3-journal
|
| 64 |
-
|
| 65 |
-
# Flask stuff:
|
| 66 |
-
instance/
|
| 67 |
-
.webassets-cache
|
| 68 |
-
|
| 69 |
-
# Scrapy stuff:
|
| 70 |
-
.scrapy
|
| 71 |
-
|
| 72 |
-
# Sphinx documentation
|
| 73 |
-
docs/_build/
|
| 74 |
-
|
| 75 |
-
# PyBuilder
|
| 76 |
-
.pybuilder/
|
| 77 |
-
target/
|
| 78 |
-
|
| 79 |
-
# Jupyter Notebook
|
| 80 |
-
.ipynb_checkpoints
|
| 81 |
-
|
| 82 |
-
# IPython
|
| 83 |
-
profile_default/
|
| 84 |
-
ipython_config.py
|
| 85 |
-
|
| 86 |
-
# pyenv
|
| 87 |
-
# For a library or package, you might want to ignore these files since the code is
|
| 88 |
-
# intended to run in multiple environments; otherwise, check them in:
|
| 89 |
-
.python-version
|
| 90 |
-
|
| 91 |
-
# pipenv
|
| 92 |
-
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
| 93 |
-
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
| 94 |
-
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
| 95 |
-
# install all needed dependencies.
|
| 96 |
-
Pipfile.lock
|
| 97 |
-
|
| 98 |
-
# UV
|
| 99 |
-
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
|
| 100 |
-
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
| 101 |
-
# commonly ignored for libraries.
|
| 102 |
-
uv.lock
|
| 103 |
-
|
| 104 |
-
# poetry
|
| 105 |
-
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
| 106 |
-
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
| 107 |
-
# commonly ignored for libraries.
|
| 108 |
-
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
| 109 |
-
poetry.lock
|
| 110 |
-
poetry.toml
|
| 111 |
-
|
| 112 |
-
# pdm
|
| 113 |
-
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
| 114 |
-
# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
|
| 115 |
-
# https://pdm-project.org/en/latest/usage/project/#working-with-version-control
|
| 116 |
-
#pdm.lock
|
| 117 |
-
#pdm.toml
|
| 118 |
-
.pdm-python
|
| 119 |
-
.pdm-build/
|
| 120 |
-
|
| 121 |
-
# pixi
|
| 122 |
-
# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
|
| 123 |
-
#pixi.lock
|
| 124 |
-
# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
|
| 125 |
-
# in the .venv directory. It is recommended not to include this directory in version control.
|
| 126 |
-
.pixi
|
| 127 |
-
|
| 128 |
-
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
| 129 |
-
__pypackages__/
|
| 130 |
-
|
| 131 |
-
# Celery stuff
|
| 132 |
-
celerybeat-schedule
|
| 133 |
-
celerybeat.pid
|
| 134 |
-
|
| 135 |
-
# SageMath parsed files
|
| 136 |
-
*.sage.py
|
| 137 |
-
|
| 138 |
-
# Environments
|
| 139 |
-
.env
|
| 140 |
-
.envrc
|
| 141 |
-
.venv
|
| 142 |
-
env/
|
| 143 |
-
venv/
|
| 144 |
-
ENV/
|
| 145 |
-
env.bak/
|
| 146 |
-
venv.bak/
|
| 147 |
-
|
| 148 |
-
# Local GeoIP databases
|
| 149 |
-
*.mmdb
|
| 150 |
-
*.mmdb.gz
|
| 151 |
-
|
| 152 |
-
# Local analytics exports
|
| 153 |
-
visitor_ips*.csv
|
| 154 |
-
|
| 155 |
-
# Spyder project settings
|
| 156 |
-
.spyderproject
|
| 157 |
-
.spyproject
|
| 158 |
-
|
| 159 |
-
# Rope project settings
|
| 160 |
-
.ropeproject
|
| 161 |
-
|
| 162 |
-
# mkdocs documentation
|
| 163 |
-
/site
|
| 164 |
-
|
| 165 |
-
# mypy
|
| 166 |
-
.mypy_cache/
|
| 167 |
-
.dmypy.json
|
| 168 |
-
dmypy.json
|
| 169 |
-
|
| 170 |
-
# Pyre type checker
|
| 171 |
-
.pyre/
|
| 172 |
-
|
| 173 |
-
# pytype static type analyzer
|
| 174 |
-
.pytype/
|
| 175 |
-
|
| 176 |
-
# Cython debug symbols
|
| 177 |
-
cython_debug/
|
| 178 |
-
|
| 179 |
-
# PyCharm
|
| 180 |
-
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
| 181 |
-
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
| 182 |
-
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
| 183 |
-
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
| 184 |
-
.idea/
|
| 185 |
-
|
| 186 |
-
# Abstra
|
| 187 |
-
# Abstra is an AI-powered process automation framework.
|
| 188 |
-
# Ignore directories containing user credentials, local state, and settings.
|
| 189 |
-
# Learn more at https://abstra.io/docs
|
| 190 |
-
.abstra/
|
| 191 |
-
|
| 192 |
-
# Visual Studio Code
|
| 193 |
-
# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
|
| 194 |
-
# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
|
| 195 |
-
# and can be added to the global gitignore or merged into this file. However, if you prefer,
|
| 196 |
-
# you could uncomment the following to ignore the entire vscode folder
|
| 197 |
-
.vscode/
|
| 198 |
-
|
| 199 |
-
# Ruff stuff:
|
| 200 |
-
.ruff_cache/
|
| 201 |
-
|
| 202 |
-
# PyPI configuration file
|
| 203 |
-
.pypirc
|
| 204 |
-
|
| 205 |
-
# Cursor
|
| 206 |
-
# Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
|
| 207 |
-
# exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
|
| 208 |
-
# refer to https://docs.cursor.com/context/ignore-files
|
| 209 |
-
.cursorignore
|
| 210 |
-
.cursorindexingignore
|
| 211 |
-
.cursor
|
| 212 |
-
|
| 213 |
-
# Marimo
|
| 214 |
-
marimo/_static/
|
| 215 |
-
marimo/_lsp/
|
| 216 |
-
__marimo__/
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
CHANGELOG.md
DELETED
|
@@ -1,20 +0,0 @@
|
|
| 1 |
-
# Changelog
|
| 2 |
-
|
| 3 |
-
All notable changes to this project will be documented in this file.
|
| 4 |
-
|
| 5 |
-
## Unreleased
|
| 6 |
-
|
| 7 |
-
### Added
|
| 8 |
-
|
| 9 |
-
- Added full-range overview totals so UV and Sessions are distinct counts across the selected range.
|
| 10 |
-
- Added ordered funnel logic that counts each step only when it occurs after the previous required step.
|
| 11 |
-
- Added benchmark choices, raw data tables, and CSV export support to the dashboard.
|
| 12 |
-
- Added query validation, MongoDB ping checks, and dashboard-friendly error messages.
|
| 13 |
-
- Added pytest coverage for metric totals, query validation, and MongoDB aggregation pipeline shape.
|
| 14 |
-
- Added CI for formatting, linting, and tests.
|
| 15 |
-
|
| 16 |
-
### Changed
|
| 17 |
-
|
| 18 |
-
- Updated new vs returning visitor logic to compute first-seen dates from the full available page-view history before applying the selected reporting range.
|
| 19 |
-
- Updated MongoDB aggregation pipelines to prefer an indexed `ts` Date field while retaining fallback support for legacy `timestamp` values.
|
| 20 |
-
- Documented recommended MongoDB indexes for production deployments.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Dockerfile
DELETED
|
@@ -1,19 +0,0 @@
|
|
| 1 |
-
FROM python:3.12-slim
|
| 2 |
-
|
| 3 |
-
ENV PYTHONDONTWRITEBYTECODE=1 \
|
| 4 |
-
PYTHONUNBUFFERED=1 \
|
| 5 |
-
PIP_NO_CACHE_DIR=1 \
|
| 6 |
-
HOST=0.0.0.0 \
|
| 7 |
-
PORT=7860 \
|
| 8 |
-
GRADIO_SHARE=false
|
| 9 |
-
|
| 10 |
-
WORKDIR /app
|
| 11 |
-
|
| 12 |
-
# Install project dependencies and package from pyproject.toml
|
| 13 |
-
COPY pyproject.toml README.md ./
|
| 14 |
-
COPY src ./src
|
| 15 |
-
RUN pip install --upgrade pip && pip install .
|
| 16 |
-
|
| 17 |
-
EXPOSE 7860
|
| 18 |
-
|
| 19 |
-
CMD ["leaderboard-analytics"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
LICENSE
DELETED
|
@@ -1,201 +0,0 @@
|
|
| 1 |
-
Apache License
|
| 2 |
-
Version 2.0, January 2004
|
| 3 |
-
http://www.apache.org/licenses/
|
| 4 |
-
|
| 5 |
-
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
| 6 |
-
|
| 7 |
-
1. Definitions.
|
| 8 |
-
|
| 9 |
-
"License" shall mean the terms and conditions for use, reproduction,
|
| 10 |
-
and distribution as defined by Sections 1 through 9 of this document.
|
| 11 |
-
|
| 12 |
-
"Licensor" shall mean the copyright owner or entity authorized by
|
| 13 |
-
the copyright owner that is granting the License.
|
| 14 |
-
|
| 15 |
-
"Legal Entity" shall mean the union of the acting entity and all
|
| 16 |
-
other entities that control, are controlled by, or are under common
|
| 17 |
-
control with that entity. For the purposes of this definition,
|
| 18 |
-
"control" means (i) the power, direct or indirect, to cause the
|
| 19 |
-
direction or management of such entity, whether by contract or
|
| 20 |
-
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
| 21 |
-
outstanding shares, or (iii) beneficial ownership of such entity.
|
| 22 |
-
|
| 23 |
-
"You" (or "Your") shall mean an individual or Legal Entity
|
| 24 |
-
exercising permissions granted by this License.
|
| 25 |
-
|
| 26 |
-
"Source" form shall mean the preferred form for making modifications,
|
| 27 |
-
including but not limited to software source code, documentation
|
| 28 |
-
source, and configuration files.
|
| 29 |
-
|
| 30 |
-
"Object" form shall mean any form resulting from mechanical
|
| 31 |
-
transformation or translation of a Source form, including but
|
| 32 |
-
not limited to compiled object code, generated documentation,
|
| 33 |
-
and conversions to other media types.
|
| 34 |
-
|
| 35 |
-
"Work" shall mean the work of authorship, whether in Source or
|
| 36 |
-
Object form, made available under the License, as indicated by a
|
| 37 |
-
copyright notice that is included in or attached to the work
|
| 38 |
-
(an example is provided in the Appendix below).
|
| 39 |
-
|
| 40 |
-
"Derivative Works" shall mean any work, whether in Source or Object
|
| 41 |
-
form, that is based on (or derived from) the Work and for which the
|
| 42 |
-
editorial revisions, annotations, elaborations, or other modifications
|
| 43 |
-
represent, as a whole, an original work of authorship. For the purposes
|
| 44 |
-
of this License, Derivative Works shall not include works that remain
|
| 45 |
-
separable from, or merely link (or bind by name) to the interfaces of,
|
| 46 |
-
the Work and Derivative Works thereof.
|
| 47 |
-
|
| 48 |
-
"Contribution" shall mean any work of authorship, including
|
| 49 |
-
the original version of the Work and any modifications or additions
|
| 50 |
-
to that Work or Derivative Works thereof, that is intentionally
|
| 51 |
-
submitted to Licensor for inclusion in the Work by the copyright owner
|
| 52 |
-
or by an individual or Legal Entity authorized to submit on behalf of
|
| 53 |
-
the copyright owner. For the purposes of this definition, "submitted"
|
| 54 |
-
means any form of electronic, verbal, or written communication sent
|
| 55 |
-
to the Licensor or its representatives, including but not limited to
|
| 56 |
-
communication on electronic mailing lists, source code control systems,
|
| 57 |
-
and issue tracking systems that are managed by, or on behalf of, the
|
| 58 |
-
Licensor for the purpose of discussing and improving the Work, but
|
| 59 |
-
excluding communication that is conspicuously marked or otherwise
|
| 60 |
-
designated in writing by the copyright owner as "Not a Contribution."
|
| 61 |
-
|
| 62 |
-
"Contributor" shall mean Licensor and any individual or Legal Entity
|
| 63 |
-
on behalf of whom a Contribution has been received by Licensor and
|
| 64 |
-
subsequently incorporated within the Work.
|
| 65 |
-
|
| 66 |
-
2. Grant of Copyright License. Subject to the terms and conditions of
|
| 67 |
-
this License, each Contributor hereby grants to You a perpetual,
|
| 68 |
-
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
| 69 |
-
copyright license to reproduce, prepare Derivative Works of,
|
| 70 |
-
publicly display, publicly perform, sublicense, and distribute the
|
| 71 |
-
Work and such Derivative Works in Source or Object form.
|
| 72 |
-
|
| 73 |
-
3. Grant of Patent License. Subject to the terms and conditions of
|
| 74 |
-
this License, each Contributor hereby grants to You a perpetual,
|
| 75 |
-
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
| 76 |
-
(except as stated in this section) patent license to make, have made,
|
| 77 |
-
use, offer to sell, sell, import, and otherwise transfer the Work,
|
| 78 |
-
where such license applies only to those patent claims licensable
|
| 79 |
-
by such Contributor that are necessarily infringed by their
|
| 80 |
-
Contribution(s) alone or by combination of their Contribution(s)
|
| 81 |
-
with the Work to which such Contribution(s) was submitted. If You
|
| 82 |
-
institute patent litigation against any entity (including a
|
| 83 |
-
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
| 84 |
-
or a Contribution incorporated within the Work constitutes direct
|
| 85 |
-
or contributory patent infringement, then any patent licenses
|
| 86 |
-
granted to You under this License for that Work shall terminate
|
| 87 |
-
as of the date such litigation is filed.
|
| 88 |
-
|
| 89 |
-
4. Redistribution. You may reproduce and distribute copies of the
|
| 90 |
-
Work or Derivative Works thereof in any medium, with or without
|
| 91 |
-
modifications, and in Source or Object form, provided that You
|
| 92 |
-
meet the following conditions:
|
| 93 |
-
|
| 94 |
-
(a) You must give any other recipients of the Work or
|
| 95 |
-
Derivative Works a copy of this License; and
|
| 96 |
-
|
| 97 |
-
(b) You must cause any modified files to carry prominent notices
|
| 98 |
-
stating that You changed the files; and
|
| 99 |
-
|
| 100 |
-
(c) You must retain, in the Source form of any Derivative Works
|
| 101 |
-
that You distribute, all copyright, patent, trademark, and
|
| 102 |
-
attribution notices from the Source form of the Work,
|
| 103 |
-
excluding those notices that do not pertain to any part of
|
| 104 |
-
the Derivative Works; and
|
| 105 |
-
|
| 106 |
-
(d) If the Work includes a "NOTICE" text file as part of its
|
| 107 |
-
distribution, then any Derivative Works that You distribute must
|
| 108 |
-
include a readable copy of the attribution notices contained
|
| 109 |
-
within such NOTICE file, excluding those notices that do not
|
| 110 |
-
pertain to any part of the Derivative Works, in at least one
|
| 111 |
-
of the following places: within a NOTICE text file distributed
|
| 112 |
-
as part of the Derivative Works; within the Source form or
|
| 113 |
-
documentation, if provided along with the Derivative Works; or,
|
| 114 |
-
within a display generated by the Derivative Works, if and
|
| 115 |
-
wherever such third-party notices normally appear. The contents
|
| 116 |
-
of the NOTICE file are for informational purposes only and
|
| 117 |
-
do not modify the License. You may add Your own attribution
|
| 118 |
-
notices within Derivative Works that You distribute, alongside
|
| 119 |
-
or as an addendum to the NOTICE text from the Work, provided
|
| 120 |
-
that such additional attribution notices cannot be construed
|
| 121 |
-
as modifying the License.
|
| 122 |
-
|
| 123 |
-
You may add Your own copyright statement to Your modifications and
|
| 124 |
-
may provide additional or different license terms and conditions
|
| 125 |
-
for use, reproduction, or distribution of Your modifications, or
|
| 126 |
-
for any such Derivative Works as a whole, provided Your use,
|
| 127 |
-
reproduction, and distribution of the Work otherwise complies with
|
| 128 |
-
the conditions stated in this License.
|
| 129 |
-
|
| 130 |
-
5. Submission of Contributions. Unless You explicitly state otherwise,
|
| 131 |
-
any Contribution intentionally submitted for inclusion in the Work
|
| 132 |
-
by You to the Licensor shall be under the terms and conditions of
|
| 133 |
-
this License, without any additional terms or conditions.
|
| 134 |
-
Notwithstanding the above, nothing herein shall supersede or modify
|
| 135 |
-
the terms of any separate license agreement you may have executed
|
| 136 |
-
with Licensor regarding such Contributions.
|
| 137 |
-
|
| 138 |
-
6. Trademarks. This License does not grant permission to use the trade
|
| 139 |
-
names, trademarks, service marks, or product names of the Licensor,
|
| 140 |
-
except as required for reasonable and customary use in describing the
|
| 141 |
-
origin of the Work and reproducing the content of the NOTICE file.
|
| 142 |
-
|
| 143 |
-
7. Disclaimer of Warranty. Unless required by applicable law or
|
| 144 |
-
agreed to in writing, Licensor provides the Work (and each
|
| 145 |
-
Contributor provides its Contributions) on an "AS IS" BASIS,
|
| 146 |
-
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
| 147 |
-
implied, including, without limitation, any warranties or conditions
|
| 148 |
-
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
| 149 |
-
PARTICULAR PURPOSE. You are solely responsible for determining the
|
| 150 |
-
appropriateness of using or redistributing the Work and assume any
|
| 151 |
-
risks associated with Your exercise of permissions under this License.
|
| 152 |
-
|
| 153 |
-
8. Limitation of Liability. In no event and under no legal theory,
|
| 154 |
-
whether in tort (including negligence), contract, or otherwise,
|
| 155 |
-
unless required by applicable law (such as deliberate and grossly
|
| 156 |
-
negligent acts) or agreed to in writing, shall any Contributor be
|
| 157 |
-
liable to You for damages, including any direct, indirect, special,
|
| 158 |
-
incidental, or consequential damages of any character arising as a
|
| 159 |
-
result of this License or out of the use or inability to use the
|
| 160 |
-
Work (including but not limited to damages for loss of goodwill,
|
| 161 |
-
work stoppage, computer failure or malfunction, or any and all
|
| 162 |
-
other commercial damages or losses), even if such Contributor
|
| 163 |
-
has been advised of the possibility of such damages.
|
| 164 |
-
|
| 165 |
-
9. Accepting Warranty or Additional Liability. While redistributing
|
| 166 |
-
the Work or Derivative Works thereof, You may choose to offer,
|
| 167 |
-
and charge a fee for, acceptance of support, warranty, indemnity,
|
| 168 |
-
or other liability obligations and/or rights consistent with this
|
| 169 |
-
License. However, in accepting such obligations, You may act only
|
| 170 |
-
on Your own behalf and on Your sole responsibility, not on behalf
|
| 171 |
-
of any other Contributor, and only if You agree to indemnify,
|
| 172 |
-
defend, and hold each Contributor harmless for any liability
|
| 173 |
-
incurred by, or claims asserted against, such Contributor by reason
|
| 174 |
-
of your accepting any such warranty or additional liability.
|
| 175 |
-
|
| 176 |
-
END OF TERMS AND CONDITIONS
|
| 177 |
-
|
| 178 |
-
APPENDIX: How to apply the Apache License to your work.
|
| 179 |
-
|
| 180 |
-
To apply the Apache License to your work, attach the following
|
| 181 |
-
boilerplate notice, with the fields enclosed by brackets "[]"
|
| 182 |
-
replaced with your own identifying information. (Don't include
|
| 183 |
-
the brackets!) The text should be enclosed in the appropriate
|
| 184 |
-
comment syntax for the file format. We also recommend that a
|
| 185 |
-
file or class name and description of purpose be included on the
|
| 186 |
-
same "printed page" as the copyright notice for easier
|
| 187 |
-
identification within third-party archives.
|
| 188 |
-
|
| 189 |
-
Copyright [yyyy] [name of copyright owner]
|
| 190 |
-
|
| 191 |
-
Licensed under the Apache License, Version 2.0 (the "License");
|
| 192 |
-
you may not use this file except in compliance with the License.
|
| 193 |
-
You may obtain a copy of the License at
|
| 194 |
-
|
| 195 |
-
http://www.apache.org/licenses/LICENSE-2.0
|
| 196 |
-
|
| 197 |
-
Unless required by applicable law or agreed to in writing, software
|
| 198 |
-
distributed under the License is distributed on an "AS IS" BASIS,
|
| 199 |
-
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 200 |
-
See the License for the specific language governing permissions and
|
| 201 |
-
limitations under the License.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
README.md
CHANGED
|
@@ -1,262 +1,12 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
colorTo: green
|
| 6 |
-
sdk:
|
| 7 |
-
sdk_version: "6.0.0"
|
| 8 |
-
python_version: "3.11"
|
| 9 |
-
app_file: app.py
|
| 10 |
pinned: false
|
|
|
|
|
|
|
| 11 |
---
|
| 12 |
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
This project analyzes user behavior on the MTEB leaderboard page from event logs in MongoDB.
|
| 16 |
-
|
| 17 |
-
The primary purpose of this document is to define **what is measured**, **where each metric comes from**, and **how each metric is calculated**.
|
| 18 |
-
|
| 19 |
-
---
|
| 20 |
-
|
| 21 |
-
## Data Contract
|
| 22 |
-
|
| 23 |
-
All analytics are based on the `events` collection and the following stable fields:
|
| 24 |
-
|
| 25 |
-
- Core dimensions: `event_name`, `timestamp`, `session_id`
|
| 26 |
-
- Preferred event time: `ts` as a MongoDB Date
|
| 27 |
-
- Behavior context: `benchmark`, `filters`
|
| 28 |
-
- Visitor identity (approximate): `properties.visitor_id`
|
| 29 |
-
- Visitor IP for country analysis: `properties.ip`
|
| 30 |
-
- Change context: `properties.old_value`, `properties.new_value`, `properties.filter_name`
|
| 31 |
-
|
| 32 |
-
Important event names:
|
| 33 |
-
|
| 34 |
-
- `page_view`
|
| 35 |
-
- `benchmark_change`
|
| 36 |
-
- `filter_change_*` (dynamic names, such as `filter_change_task_type`)
|
| 37 |
-
- `table_download` (may not yet be instrumented in some deployments)
|
| 38 |
-
|
| 39 |
-
---
|
| 40 |
-
|
| 41 |
-
## Metrics Dictionary
|
| 42 |
-
|
| 43 |
-
### 1) PV (Page Views)
|
| 44 |
-
|
| 45 |
-
- **Definition**: Number of page view events.
|
| 46 |
-
- **Source fields**: `event_name`
|
| 47 |
-
- **Calculation**:
|
| 48 |
-
- Filter events where `event_name == "page_view"`
|
| 49 |
-
- PV = count of matched events
|
| 50 |
-
|
| 51 |
-
### 2) Sessions
|
| 52 |
-
|
| 53 |
-
- **Definition**: Number of unique interaction sessions.
|
| 54 |
-
- **Source fields**: `session_id`
|
| 55 |
-
- **Calculation**:
|
| 56 |
-
- Sessions = count of distinct non-empty `session_id` values in the selected time range
|
| 57 |
-
|
| 58 |
-
### 3) UV (Unique Visitors, Approximate)
|
| 59 |
-
|
| 60 |
-
- **Definition**: Number of unique visitors identified by hashed fingerprint.
|
| 61 |
-
- **Source fields**: `properties.visitor_id`
|
| 62 |
-
- **Calculation**:
|
| 63 |
-
- Remove null/empty `properties.visitor_id`
|
| 64 |
-
- UV = count of distinct `properties.visitor_id` values in the selected time range
|
| 65 |
-
|
| 66 |
-
### 4) Sessions Per Visitor
|
| 67 |
-
|
| 68 |
-
- **Definition**: Average number of sessions per visitor.
|
| 69 |
-
- **Source fields**: derived from Sessions and UV
|
| 70 |
-
- **Calculation**:
|
| 71 |
-
- Sessions Per Visitor = `Sessions / UV`
|
| 72 |
-
- If UV is 0, result is 0
|
| 73 |
-
|
| 74 |
-
### 5) Session Depth (Events Per Session)
|
| 75 |
-
|
| 76 |
-
- **Definition**: Average interaction intensity per session.
|
| 77 |
-
- **Source fields**: all events, `session_id`
|
| 78 |
-
- **Calculation**:
|
| 79 |
-
- Total Events = count of all events in range
|
| 80 |
-
- Session Depth = `Total Events / Sessions`
|
| 81 |
-
- If Sessions is 0, result is 0
|
| 82 |
-
|
| 83 |
-
---
|
| 84 |
-
|
| 85 |
-
## Behavior Metrics
|
| 86 |
-
|
| 87 |
-
### 6) Benchmark Popularity
|
| 88 |
-
|
| 89 |
-
- **Definition**: Frequency of selected benchmarks.
|
| 90 |
-
- **Source fields**: `event_name`, `properties.new_value`
|
| 91 |
-
- **Calculation**:
|
| 92 |
-
- Filter `event_name == "benchmark_change"`
|
| 93 |
-
- Group by `properties.new_value`
|
| 94 |
-
- Popularity = event count per benchmark value
|
| 95 |
-
|
| 96 |
-
### 7) Filter Usage Distribution
|
| 97 |
-
|
| 98 |
-
- **Definition**: Usage volume by filter event type.
|
| 99 |
-
- **Source fields**: `event_name`
|
| 100 |
-
- **Calculation**:
|
| 101 |
-
- Filter `event_name` matching regex `^filter_change_`
|
| 102 |
-
- Group by `event_name`
|
| 103 |
-
- Distribution = count per filter event
|
| 104 |
-
|
| 105 |
-
### 8) Filter Session Coverage
|
| 106 |
-
|
| 107 |
-
- **Definition**: Number of sessions that used each filter type.
|
| 108 |
-
- **Source fields**: `event_name`, `session_id`
|
| 109 |
-
- **Calculation**:
|
| 110 |
-
- For each `filter_change_*` event type:
|
| 111 |
-
- collect distinct non-empty `session_id`
|
| 112 |
-
- coverage = distinct session count
|
| 113 |
-
|
| 114 |
-
---
|
| 115 |
-
|
| 116 |
-
## Funnel Metrics
|
| 117 |
-
|
| 118 |
-
Recommended session-level funnel:
|
| 119 |
-
|
| 120 |
-
1. `page_view`
|
| 121 |
-
2. `benchmark_change`
|
| 122 |
-
3. `filter_change_*`
|
| 123 |
-
4. `table_download`
|
| 124 |
-
|
| 125 |
-
### 9) Step Session Count
|
| 126 |
-
|
| 127 |
-
- **Definition**: Number of sessions that reached each ordered funnel step.
|
| 128 |
-
- **Source fields**: `session_id`, `event_name`, `ts` or `timestamp`
|
| 129 |
-
- **Calculation**:
|
| 130 |
-
- Group events by `session_id`
|
| 131 |
-
- Sort events by event time
|
| 132 |
-
- Count each cumulative step only when it occurs after the previous required step
|
| 133 |
-
|
| 134 |
-
### 10) Step Conversion Rate
|
| 135 |
-
|
| 136 |
-
- **Definition**: Conversion from funnel step 1 (`page_view`) to each step.
|
| 137 |
-
- **Source fields**: derived from Step Session Count
|
| 138 |
-
- **Calculation**:
|
| 139 |
-
- Conversion Rate(step N) = `StepN Sessions / Step1 Sessions * 100%`
|
| 140 |
-
- If Step1 Sessions is 0, result is 0%
|
| 141 |
-
|
| 142 |
-
---
|
| 143 |
-
|
| 144 |
-
## Visitor Segmentation Metrics
|
| 145 |
-
|
| 146 |
-
### 11) New Visitors
|
| 147 |
-
|
| 148 |
-
- **Definition**: Visitors whose current period contains their first observed visit date.
|
| 149 |
-
- **Source fields**: `event_name`, `ts` or `timestamp`, `properties.visitor_id`
|
| 150 |
-
- **Calculation**:
|
| 151 |
-
- Use `page_view` events only
|
| 152 |
-
- For each `visitor_id`, find earliest timestamp (`first_seen`) from the full available dataset
|
| 153 |
-
- If event date equals `first_seen` date, classify as `new`
|
| 154 |
-
- Count distinct `visitor_id` by period
|
| 155 |
-
|
| 156 |
-
### 12) Returning Visitors
|
| 157 |
-
|
| 158 |
-
- **Definition**: Visitors seen after their first observed date.
|
| 159 |
-
- **Source fields**: same as New Visitors
|
| 160 |
-
- **Calculation**:
|
| 161 |
-
- Use same first-seen logic
|
| 162 |
-
- If event date is later than first-seen date, classify as `returning`
|
| 163 |
-
- Count distinct `visitor_id` by period
|
| 164 |
-
|
| 165 |
-
### 13) Visitor Locations by Country
|
| 166 |
-
|
| 167 |
-
- **Definition**: Page view volume by visitor IP country/region.
|
| 168 |
-
- **Source fields**: `event_name`, `properties.ip`
|
| 169 |
-
- **Calculation**:
|
| 170 |
-
- Filter `event_name == "page_view"`
|
| 171 |
-
- Remove null/empty `properties.ip`
|
| 172 |
-
- Group page views by IP in MongoDB
|
| 173 |
-
- Resolve each IP to a country using the local MaxMind GeoLite2 Country database
|
| 174 |
-
- Group by `country_code` and `country_name`
|
| 175 |
-
- Map color = page view count (`pv`)
|
| 176 |
-
- Private, invalid, unresolved, or unconfigured IPs are grouped as `Unknown`
|
| 177 |
-
|
| 178 |
-
---
|
| 179 |
-
|
| 180 |
-
## Time Aggregation Rules
|
| 181 |
-
|
| 182 |
-
All trend metrics support these granularities:
|
| 183 |
-
|
| 184 |
-
- `day` -> `%Y-%m-%d`
|
| 185 |
-
- `week` -> `%G-W%V` (ISO week)
|
| 186 |
-
- `month` -> `%Y-%m`
|
| 187 |
-
|
| 188 |
-
Time filtering rules:
|
| 189 |
-
|
| 190 |
-
- Prefer the indexed MongoDB Date field `ts`
|
| 191 |
-
- Fall back to converting legacy `timestamp` values when `ts` is not present
|
| 192 |
-
- Keep records where `start_time <= event time <= end_time`
|
| 193 |
-
|
| 194 |
-
Optional benchmark filtering:
|
| 195 |
-
|
| 196 |
-
- If benchmark filter is provided, add `benchmark == <value>` to match conditions
|
| 197 |
-
|
| 198 |
-
---
|
| 199 |
-
|
| 200 |
-
## Data Quality Notes
|
| 201 |
-
|
| 202 |
-
1. `visitor_id` is an approximate identifier, not a strict user identity.
|
| 203 |
-
2. For `filter_change_*` events, `properties.new_value` may not always represent the actual final filter value; prefer the `filters` snapshot for behavioral context.
|
| 204 |
-
3. If `table_download` is not instrumented, funnel step 4 will under-report by design.
|
| 205 |
-
4. Total UV and Sessions are distinct counts across the full selected time range. They are not calculated by summing per-period trend values.
|
| 206 |
-
5. Funnel steps are ordered by event time. A session only reaches a later step when that step happens after the previous required step.
|
| 207 |
-
|
| 208 |
-
---
|
| 209 |
-
|
| 210 |
-
## MongoDB Performance Notes
|
| 211 |
-
|
| 212 |
-
For production deployments, store event time as a MongoDB Date field named `ts`. Keeping only string timestamps forces aggregation pipelines to convert time values at query time and can reduce index usage.
|
| 213 |
-
|
| 214 |
-
Recommended indexes:
|
| 215 |
-
|
| 216 |
-
```javascript
|
| 217 |
-
db.events.createIndex({ ts: 1 })
|
| 218 |
-
db.events.createIndex({ ts: 1, benchmark: 1 })
|
| 219 |
-
db.events.createIndex({ event_name: 1, ts: 1 })
|
| 220 |
-
db.events.createIndex({ session_id: 1, ts: 1 })
|
| 221 |
-
db.events.createIndex({ "properties.visitor_id": 1, ts: 1 })
|
| 222 |
-
db.events.createIndex({ event_name: 1, ts: 1, "properties.ip": 1 })
|
| 223 |
-
```
|
| 224 |
-
|
| 225 |
-
Legacy events with only `timestamp` remain supported, but backfilling `ts` is recommended before running this dashboard against large collections.
|
| 226 |
-
|
| 227 |
-
---
|
| 228 |
-
|
| 229 |
-
## Minimal Runtime Notes
|
| 230 |
-
|
| 231 |
-
Only required runtime inputs:
|
| 232 |
-
|
| 233 |
-
- MongoDB connection URI (`MONGO_URI`)
|
| 234 |
-
- Mongo database/collection names (defaults supported)
|
| 235 |
-
|
| 236 |
-
Optional visitor location input:
|
| 237 |
-
|
| 238 |
-
- `GEOIP_DATABASE_PATH`: path to a local MaxMind `GeoLite2-Country.mmdb` file
|
| 239 |
-
- `GEOIP_DATABASE_URL`: URL for a gzipped GeoLite2 Country MMDB download
|
| 240 |
-
- `GEOIP_AUTO_DOWNLOAD`: whether to download and decompress the MMDB when missing
|
| 241 |
-
|
| 242 |
-
The dashboard does not call an external IP lookup API for visitor lookups. By default,
|
| 243 |
-
startup downloads `https://cdn.jsdelivr.net/npm/geolite2-country/GeoLite2-Country.mmdb.gz`
|
| 244 |
-
when `GEOIP_DATABASE_PATH` is missing, decompresses it, and uses the resulting MMDB file
|
| 245 |
-
locally. Set `GEOIP_AUTO_DOWNLOAD=false` if the runtime cannot access the network or if
|
| 246 |
-
you prefer to mount the MMDB yourself. If the database is unavailable, visitor location
|
| 247 |
-
rows are grouped as `Unknown`.
|
| 248 |
-
|
| 249 |
-
Local commands:
|
| 250 |
-
|
| 251 |
-
```bash
|
| 252 |
-
uv sync
|
| 253 |
-
uv run leaderboard-analytics
|
| 254 |
-
```
|
| 255 |
-
|
| 256 |
-
Run quality checks:
|
| 257 |
-
|
| 258 |
-
```bash
|
| 259 |
-
uv run ruff format --check .
|
| 260 |
-
uv run ruff check .
|
| 261 |
-
uv run pytest
|
| 262 |
-
```
|
|
|
|
| 1 |
---
|
| 2 |
+
title: Leaderboard Analytics Service
|
| 3 |
+
emoji: 🏃
|
| 4 |
+
colorFrom: green
|
| 5 |
colorTo: green
|
| 6 |
+
sdk: docker
|
|
|
|
|
|
|
|
|
|
| 7 |
pinned: false
|
| 8 |
+
license: apache-2.0
|
| 9 |
+
short_description: A backend analytics service for the MTEB Leaderboard
|
| 10 |
---
|
| 11 |
|
| 12 |
+
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
app.py
DELETED
|
@@ -1,15 +0,0 @@
|
|
| 1 |
-
import sys
|
| 2 |
-
from pathlib import Path
|
| 3 |
-
|
| 4 |
-
# Ensure src-layout package is importable in Hugging Face Spaces runtime.
|
| 5 |
-
ROOT_DIR = Path(__file__).resolve().parent
|
| 6 |
-
SRC_DIR = ROOT_DIR / "src"
|
| 7 |
-
if str(SRC_DIR) not in sys.path:
|
| 8 |
-
sys.path.insert(0, str(SRC_DIR))
|
| 9 |
-
|
| 10 |
-
from leaderboard_analytics.main import create_demo, launch_demo # noqa: E402
|
| 11 |
-
|
| 12 |
-
demo = create_demo()
|
| 13 |
-
|
| 14 |
-
if __name__ == "__main__":
|
| 15 |
-
launch_demo(demo)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
pyproject.toml
DELETED
|
@@ -1,46 +0,0 @@
|
|
| 1 |
-
[project]
|
| 2 |
-
name = "leaderboard-analytics-service"
|
| 3 |
-
version = "0.1.0"
|
| 4 |
-
description = "Analytics dashboard for MTEB leaderboard event logs"
|
| 5 |
-
readme = "README.md"
|
| 6 |
-
requires-python = ">=3.11"
|
| 7 |
-
dependencies = [
|
| 8 |
-
"gradio>=6.0.0",
|
| 9 |
-
"pymongo>=4.10.0",
|
| 10 |
-
"pydantic>=2.9.0",
|
| 11 |
-
"pydantic-settings>=2.6.0",
|
| 12 |
-
"python-dotenv>=1.0.1",
|
| 13 |
-
"pandas>=2.2.3",
|
| 14 |
-
"plotly>=5.24.1",
|
| 15 |
-
"geoip2>=4.8.0",
|
| 16 |
-
]
|
| 17 |
-
|
| 18 |
-
[project.optional-dependencies]
|
| 19 |
-
dev = [
|
| 20 |
-
"pytest>=8.3.0",
|
| 21 |
-
"ruff>=0.8.0",
|
| 22 |
-
]
|
| 23 |
-
|
| 24 |
-
[tool.ruff]
|
| 25 |
-
line-length = 100
|
| 26 |
-
target-version = "py311"
|
| 27 |
-
|
| 28 |
-
[tool.ruff.lint]
|
| 29 |
-
select = ["E", "F", "I", "B", "UP", "C4"]
|
| 30 |
-
|
| 31 |
-
[tool.ruff.format]
|
| 32 |
-
quote-style = "double"
|
| 33 |
-
indent-style = "space"
|
| 34 |
-
|
| 35 |
-
[project.scripts]
|
| 36 |
-
leaderboard-analytics = "leaderboard_analytics.main:run"
|
| 37 |
-
|
| 38 |
-
[build-system]
|
| 39 |
-
requires = ["hatchling"]
|
| 40 |
-
build-backend = "hatchling.build"
|
| 41 |
-
|
| 42 |
-
[tool.hatch.build.targets.wheel]
|
| 43 |
-
packages = ["src/leaderboard_analytics"]
|
| 44 |
-
|
| 45 |
-
[tool.pytest.ini_options]
|
| 46 |
-
pythonpath = ["src"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
requirements.txt
DELETED
|
@@ -1,8 +0,0 @@
|
|
| 1 |
-
gradio>=6.0.0
|
| 2 |
-
pymongo>=4.10.0
|
| 3 |
-
pydantic>=2.9.0
|
| 4 |
-
pydantic-settings>=2.6.0
|
| 5 |
-
python-dotenv>=1.0.1
|
| 6 |
-
pandas>=2.2.3
|
| 7 |
-
plotly>=5.24.1
|
| 8 |
-
geoip2>=4.8.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/leaderboard_analytics/__init__.py
DELETED
|
@@ -1 +0,0 @@
|
|
| 1 |
-
"""Leaderboard analytics package."""
|
|
|
|
|
|
src/leaderboard_analytics/config.py
DELETED
|
@@ -1,25 +0,0 @@
|
|
| 1 |
-
from functools import lru_cache
|
| 2 |
-
|
| 3 |
-
from pydantic_settings import BaseSettings, SettingsConfigDict
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
class Settings(BaseSettings):
|
| 7 |
-
model_config = SettingsConfigDict(env_file=".env", env_file_encoding="utf-8", extra="ignore")
|
| 8 |
-
|
| 9 |
-
mongo_uri: str = ""
|
| 10 |
-
mongo_database: str = "event_logger"
|
| 11 |
-
mongo_collection: str = "events"
|
| 12 |
-
host: str = "0.0.0.0"
|
| 13 |
-
port: int = 7860
|
| 14 |
-
gradio_share: bool = False
|
| 15 |
-
gradio_ssr_mode: bool = False
|
| 16 |
-
geoip_database_path: str = "GeoLite2-Country.mmdb"
|
| 17 |
-
geoip_database_url: str = (
|
| 18 |
-
"https://cdn.jsdelivr.net/npm/geolite2-country/GeoLite2-Country.mmdb.gz"
|
| 19 |
-
)
|
| 20 |
-
geoip_auto_download: bool = True
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
@lru_cache(maxsize=1)
|
| 24 |
-
def get_settings() -> Settings:
|
| 25 |
-
return Settings()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/leaderboard_analytics/db.py
DELETED
|
@@ -1,24 +0,0 @@
|
|
| 1 |
-
from pymongo import MongoClient
|
| 2 |
-
from pymongo.collection import Collection
|
| 3 |
-
from pymongo.database import Database
|
| 4 |
-
|
| 5 |
-
from leaderboard_analytics.config import get_settings
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
def get_mongo_client() -> MongoClient:
|
| 9 |
-
settings = get_settings()
|
| 10 |
-
if not settings.mongo_uri:
|
| 11 |
-
raise ValueError("MONGO_URI is not configured. Please set MONGO_URI in .env file.")
|
| 12 |
-
client = MongoClient(settings.mongo_uri, serverSelectionTimeoutMS=5000)
|
| 13 |
-
client.admin.command("ping")
|
| 14 |
-
return client
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
def get_database(client: MongoClient) -> Database:
|
| 18 |
-
settings = get_settings()
|
| 19 |
-
return client[settings.mongo_database]
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
def get_events_collection(db: Database) -> Collection:
|
| 23 |
-
settings = get_settings()
|
| 24 |
-
return db[settings.mongo_collection]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/leaderboard_analytics/geoip_database.py
DELETED
|
@@ -1,36 +0,0 @@
|
|
| 1 |
-
import gzip
|
| 2 |
-
import shutil
|
| 3 |
-
import tempfile
|
| 4 |
-
from pathlib import Path
|
| 5 |
-
from urllib.request import urlopen
|
| 6 |
-
|
| 7 |
-
DEFAULT_GEOIP_DATABASE_URL = (
|
| 8 |
-
"https://cdn.jsdelivr.net/npm/geolite2-country/GeoLite2-Country.mmdb.gz"
|
| 9 |
-
)
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
def ensure_geoip_database(
|
| 13 |
-
database_path: str | Path,
|
| 14 |
-
source_url: str = DEFAULT_GEOIP_DATABASE_URL,
|
| 15 |
-
*,
|
| 16 |
-
auto_download: bool = True,
|
| 17 |
-
timeout: float = 30.0,
|
| 18 |
-
) -> Path:
|
| 19 |
-
target_path = Path(database_path)
|
| 20 |
-
if target_path.exists() or not auto_download:
|
| 21 |
-
return target_path
|
| 22 |
-
|
| 23 |
-
target_path.parent.mkdir(parents=True, exist_ok=True)
|
| 24 |
-
with tempfile.NamedTemporaryFile(
|
| 25 |
-
prefix=f"{target_path.name}.",
|
| 26 |
-
suffix=".tmp",
|
| 27 |
-
dir=target_path.parent,
|
| 28 |
-
delete=False,
|
| 29 |
-
) as temp_file:
|
| 30 |
-
temp_path = Path(temp_file.name)
|
| 31 |
-
with urlopen(source_url, timeout=timeout) as response:
|
| 32 |
-
with gzip.GzipFile(fileobj=response) as gzip_file:
|
| 33 |
-
shutil.copyfileobj(gzip_file, temp_file)
|
| 34 |
-
|
| 35 |
-
temp_path.replace(target_path)
|
| 36 |
-
return target_path
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/leaderboard_analytics/main.py
DELETED
|
@@ -1,49 +0,0 @@
|
|
| 1 |
-
from leaderboard_analytics.config import get_settings
|
| 2 |
-
from leaderboard_analytics.db import get_database, get_events_collection, get_mongo_client
|
| 3 |
-
from leaderboard_analytics.geoip_database import ensure_geoip_database
|
| 4 |
-
from leaderboard_analytics.repositories import AnalyticsRepository
|
| 5 |
-
from leaderboard_analytics.services import AnalyticsService
|
| 6 |
-
from leaderboard_analytics.ui import build_dashboard
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
def create_demo():
|
| 10 |
-
settings = get_settings()
|
| 11 |
-
client = get_mongo_client()
|
| 12 |
-
db = get_database(client)
|
| 13 |
-
events_collection = get_events_collection(db)
|
| 14 |
-
geoip_database_path = settings.geoip_database_path
|
| 15 |
-
try:
|
| 16 |
-
geoip_database_path = str(
|
| 17 |
-
ensure_geoip_database(
|
| 18 |
-
settings.geoip_database_path,
|
| 19 |
-
settings.geoip_database_url,
|
| 20 |
-
auto_download=settings.geoip_auto_download,
|
| 21 |
-
)
|
| 22 |
-
)
|
| 23 |
-
except Exception as exc:
|
| 24 |
-
print(f"GeoIP database download failed: {exc}")
|
| 25 |
-
|
| 26 |
-
repository = AnalyticsRepository(events_collection=events_collection)
|
| 27 |
-
service = AnalyticsService(
|
| 28 |
-
repository=repository,
|
| 29 |
-
geoip_database_path=geoip_database_path,
|
| 30 |
-
)
|
| 31 |
-
return build_dashboard(service=service)
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
def launch_demo(demo) -> None:
|
| 35 |
-
settings = get_settings()
|
| 36 |
-
demo.launch(
|
| 37 |
-
server_name=settings.host,
|
| 38 |
-
server_port=settings.port,
|
| 39 |
-
share=settings.gradio_share,
|
| 40 |
-
ssr_mode=settings.gradio_ssr_mode,
|
| 41 |
-
)
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
def run() -> None:
|
| 45 |
-
launch_demo(create_demo())
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
if __name__ == "__main__":
|
| 49 |
-
run()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/leaderboard_analytics/repositories.py
DELETED
|
@@ -1,463 +0,0 @@
|
|
| 1 |
-
from collections.abc import Iterable
|
| 2 |
-
|
| 3 |
-
from pymongo.collection import Collection
|
| 4 |
-
|
| 5 |
-
from leaderboard_analytics.schemas import Granularity, QueryFilters
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
def _period_expression(granularity: Granularity) -> dict:
|
| 9 |
-
format_map = {
|
| 10 |
-
Granularity.DAY: "%Y-%m-%d",
|
| 11 |
-
Granularity.WEEK: "%G-W%V",
|
| 12 |
-
Granularity.MONTH: "%Y-%m",
|
| 13 |
-
}
|
| 14 |
-
return {"$dateToString": {"format": format_map[granularity], "date": "$event_ts"}}
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
def _with_normalized_time() -> dict:
|
| 18 |
-
return {
|
| 19 |
-
"$addFields": {
|
| 20 |
-
"event_ts": {"$ifNull": ["$ts", {"$toDate": "$timestamp"}]},
|
| 21 |
-
"visitor_id": "$properties.visitor_id",
|
| 22 |
-
}
|
| 23 |
-
}
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
def _indexed_time_prefilter(filters: QueryFilters) -> dict:
|
| 27 |
-
matcher: dict = {
|
| 28 |
-
"$or": [
|
| 29 |
-
{"ts": {"$gte": filters.start_time, "$lte": filters.end_time}},
|
| 30 |
-
{"ts": None},
|
| 31 |
-
{"ts": {"$exists": False}},
|
| 32 |
-
]
|
| 33 |
-
}
|
| 34 |
-
if filters.benchmark:
|
| 35 |
-
matcher["benchmark"] = filters.benchmark
|
| 36 |
-
return matcher
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
def _with_time_and_optional_benchmark(filters: QueryFilters) -> dict:
|
| 40 |
-
matcher: dict = {
|
| 41 |
-
"event_ts": {
|
| 42 |
-
"$gte": filters.start_time,
|
| 43 |
-
"$lte": filters.end_time,
|
| 44 |
-
}
|
| 45 |
-
}
|
| 46 |
-
if filters.benchmark:
|
| 47 |
-
matcher["benchmark"] = filters.benchmark
|
| 48 |
-
return matcher
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
def _non_empty_set_size(field_name: str, variable_name: str) -> dict:
|
| 52 |
-
return {
|
| 53 |
-
"$size": {
|
| 54 |
-
"$filter": {
|
| 55 |
-
"input": f"${field_name}",
|
| 56 |
-
"as": variable_name,
|
| 57 |
-
"cond": {
|
| 58 |
-
"$and": [
|
| 59 |
-
{"$ne": [f"$${variable_name}", None]},
|
| 60 |
-
{"$ne": [f"$${variable_name}", ""]},
|
| 61 |
-
]
|
| 62 |
-
},
|
| 63 |
-
}
|
| 64 |
-
}
|
| 65 |
-
}
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
class AnalyticsRepository:
|
| 69 |
-
def __init__(self, events_collection: Collection) -> None:
|
| 70 |
-
self.events_collection = events_collection
|
| 71 |
-
|
| 72 |
-
def overview_timeseries(self, filters: QueryFilters) -> list[dict]:
|
| 73 |
-
period_expr = _period_expression(filters.granularity)
|
| 74 |
-
pipeline: list[dict] = [
|
| 75 |
-
{"$match": _indexed_time_prefilter(filters)},
|
| 76 |
-
_with_normalized_time(),
|
| 77 |
-
{"$match": _with_time_and_optional_benchmark(filters)},
|
| 78 |
-
{
|
| 79 |
-
"$group": {
|
| 80 |
-
"_id": {"period": period_expr},
|
| 81 |
-
"pv": {"$sum": {"$cond": [{"$eq": ["$event_name", "page_view"]}, 1, 0]}},
|
| 82 |
-
"event_count": {"$sum": 1},
|
| 83 |
-
"sessions": {"$addToSet": "$session_id"},
|
| 84 |
-
"visitors": {"$addToSet": "$visitor_id"},
|
| 85 |
-
}
|
| 86 |
-
},
|
| 87 |
-
{
|
| 88 |
-
"$project": {
|
| 89 |
-
"_id": 0,
|
| 90 |
-
"period": "$_id.period",
|
| 91 |
-
"pv": 1,
|
| 92 |
-
"event_count": 1,
|
| 93 |
-
"session_count": _non_empty_set_size("sessions", "s"),
|
| 94 |
-
"uv": _non_empty_set_size("visitors", "v"),
|
| 95 |
-
}
|
| 96 |
-
},
|
| 97 |
-
{"$sort": {"period": 1}},
|
| 98 |
-
]
|
| 99 |
-
return list(self.events_collection.aggregate(pipeline))
|
| 100 |
-
|
| 101 |
-
def overview_totals(self, filters: QueryFilters) -> dict:
|
| 102 |
-
pipeline: list[dict] = [
|
| 103 |
-
{"$match": _indexed_time_prefilter(filters)},
|
| 104 |
-
_with_normalized_time(),
|
| 105 |
-
{"$match": _with_time_and_optional_benchmark(filters)},
|
| 106 |
-
{
|
| 107 |
-
"$group": {
|
| 108 |
-
"_id": None,
|
| 109 |
-
"pv": {"$sum": {"$cond": [{"$eq": ["$event_name", "page_view"]}, 1, 0]}},
|
| 110 |
-
"events": {"$sum": 1},
|
| 111 |
-
"sessions": {"$addToSet": "$session_id"},
|
| 112 |
-
"visitors": {"$addToSet": "$visitor_id"},
|
| 113 |
-
}
|
| 114 |
-
},
|
| 115 |
-
{
|
| 116 |
-
"$project": {
|
| 117 |
-
"_id": 0,
|
| 118 |
-
"pv": 1,
|
| 119 |
-
"events": 1,
|
| 120 |
-
"sessions": _non_empty_set_size("sessions", "s"),
|
| 121 |
-
"uv": _non_empty_set_size("visitors", "v"),
|
| 122 |
-
}
|
| 123 |
-
},
|
| 124 |
-
]
|
| 125 |
-
return self.safe_first(self.events_collection.aggregate(pipeline))
|
| 126 |
-
|
| 127 |
-
def benchmark_top(self, filters: QueryFilters, limit: int = 20) -> list[dict]:
|
| 128 |
-
pipeline: list[dict] = [
|
| 129 |
-
{"$match": _indexed_time_prefilter(filters)},
|
| 130 |
-
_with_normalized_time(),
|
| 131 |
-
{
|
| 132 |
-
"$match": {
|
| 133 |
-
**_with_time_and_optional_benchmark(filters),
|
| 134 |
-
"event_name": "benchmark_change",
|
| 135 |
-
}
|
| 136 |
-
},
|
| 137 |
-
{"$group": {"_id": "$properties.new_value", "count": {"$sum": 1}}},
|
| 138 |
-
{"$match": {"_id": {"$nin": [None, ""]}}},
|
| 139 |
-
{"$project": {"_id": 0, "benchmark": "$_id", "count": 1}},
|
| 140 |
-
{"$sort": {"count": -1}},
|
| 141 |
-
{"$limit": limit},
|
| 142 |
-
]
|
| 143 |
-
return list(self.events_collection.aggregate(pipeline))
|
| 144 |
-
|
| 145 |
-
def filter_distribution(self, filters: QueryFilters) -> list[dict]:
|
| 146 |
-
pipeline: list[dict] = [
|
| 147 |
-
{"$match": _indexed_time_prefilter(filters)},
|
| 148 |
-
_with_normalized_time(),
|
| 149 |
-
{
|
| 150 |
-
"$match": {
|
| 151 |
-
**_with_time_and_optional_benchmark(filters),
|
| 152 |
-
"event_name": {"$regex": "^filter_change_"},
|
| 153 |
-
}
|
| 154 |
-
},
|
| 155 |
-
{
|
| 156 |
-
"$group": {
|
| 157 |
-
"_id": "$event_name",
|
| 158 |
-
"count": {"$sum": 1},
|
| 159 |
-
"sessions": {"$addToSet": "$session_id"},
|
| 160 |
-
}
|
| 161 |
-
},
|
| 162 |
-
{
|
| 163 |
-
"$project": {
|
| 164 |
-
"_id": 0,
|
| 165 |
-
"event_name": "$_id",
|
| 166 |
-
"count": 1,
|
| 167 |
-
"session_coverage": _non_empty_set_size("sessions", "s"),
|
| 168 |
-
}
|
| 169 |
-
},
|
| 170 |
-
{"$sort": {"count": -1}},
|
| 171 |
-
]
|
| 172 |
-
return list(self.events_collection.aggregate(pipeline))
|
| 173 |
-
|
| 174 |
-
def funnel(self, filters: QueryFilters) -> list[dict]:
|
| 175 |
-
pipeline: list[dict] = [
|
| 176 |
-
{"$match": _indexed_time_prefilter(filters)},
|
| 177 |
-
_with_normalized_time(),
|
| 178 |
-
{"$match": _with_time_and_optional_benchmark(filters)},
|
| 179 |
-
{"$sort": {"session_id": 1, "event_ts": 1}},
|
| 180 |
-
{
|
| 181 |
-
"$group": {
|
| 182 |
-
"_id": "$session_id",
|
| 183 |
-
"events": {"$push": {"name": "$event_name", "ts": "$event_ts"}},
|
| 184 |
-
}
|
| 185 |
-
},
|
| 186 |
-
{"$match": {"_id": {"$nin": [None, ""]}}},
|
| 187 |
-
{
|
| 188 |
-
"$project": {
|
| 189 |
-
"events": 1,
|
| 190 |
-
"page_view_at": {
|
| 191 |
-
"$arrayElemAt": [
|
| 192 |
-
{
|
| 193 |
-
"$map": {
|
| 194 |
-
"input": {
|
| 195 |
-
"$filter": {
|
| 196 |
-
"input": "$events",
|
| 197 |
-
"as": "event",
|
| 198 |
-
"cond": {"$eq": ["$$event.name", "page_view"]},
|
| 199 |
-
}
|
| 200 |
-
},
|
| 201 |
-
"as": "event",
|
| 202 |
-
"in": "$$event.ts",
|
| 203 |
-
}
|
| 204 |
-
},
|
| 205 |
-
0,
|
| 206 |
-
]
|
| 207 |
-
},
|
| 208 |
-
}
|
| 209 |
-
},
|
| 210 |
-
{
|
| 211 |
-
"$project": {
|
| 212 |
-
"events": 1,
|
| 213 |
-
"page_view_at": 1,
|
| 214 |
-
"benchmark_change_at": {
|
| 215 |
-
"$arrayElemAt": [
|
| 216 |
-
{
|
| 217 |
-
"$map": {
|
| 218 |
-
"input": {
|
| 219 |
-
"$filter": {
|
| 220 |
-
"input": "$events",
|
| 221 |
-
"as": "event",
|
| 222 |
-
"cond": {
|
| 223 |
-
"$and": [
|
| 224 |
-
{"$eq": ["$$event.name", "benchmark_change"]},
|
| 225 |
-
{"$gte": ["$$event.ts", "$page_view_at"]},
|
| 226 |
-
]
|
| 227 |
-
},
|
| 228 |
-
}
|
| 229 |
-
},
|
| 230 |
-
"as": "event",
|
| 231 |
-
"in": "$$event.ts",
|
| 232 |
-
}
|
| 233 |
-
},
|
| 234 |
-
0,
|
| 235 |
-
]
|
| 236 |
-
},
|
| 237 |
-
}
|
| 238 |
-
},
|
| 239 |
-
{
|
| 240 |
-
"$project": {
|
| 241 |
-
"events": 1,
|
| 242 |
-
"page_view_at": 1,
|
| 243 |
-
"benchmark_change_at": 1,
|
| 244 |
-
"filter_change_at": {
|
| 245 |
-
"$arrayElemAt": [
|
| 246 |
-
{
|
| 247 |
-
"$map": {
|
| 248 |
-
"input": {
|
| 249 |
-
"$filter": {
|
| 250 |
-
"input": "$events",
|
| 251 |
-
"as": "event",
|
| 252 |
-
"cond": {
|
| 253 |
-
"$and": [
|
| 254 |
-
{
|
| 255 |
-
"$regexMatch": {
|
| 256 |
-
"input": "$$event.name",
|
| 257 |
-
"regex": "^filter_change_",
|
| 258 |
-
}
|
| 259 |
-
},
|
| 260 |
-
{
|
| 261 |
-
"$gte": [
|
| 262 |
-
"$$event.ts",
|
| 263 |
-
"$benchmark_change_at",
|
| 264 |
-
]
|
| 265 |
-
},
|
| 266 |
-
]
|
| 267 |
-
},
|
| 268 |
-
}
|
| 269 |
-
},
|
| 270 |
-
"as": "event",
|
| 271 |
-
"in": "$$event.ts",
|
| 272 |
-
}
|
| 273 |
-
},
|
| 274 |
-
0,
|
| 275 |
-
]
|
| 276 |
-
},
|
| 277 |
-
}
|
| 278 |
-
},
|
| 279 |
-
{
|
| 280 |
-
"$project": {
|
| 281 |
-
"page_view_at": 1,
|
| 282 |
-
"benchmark_change_at": 1,
|
| 283 |
-
"filter_change_at": 1,
|
| 284 |
-
"table_download_at": {
|
| 285 |
-
"$arrayElemAt": [
|
| 286 |
-
{
|
| 287 |
-
"$map": {
|
| 288 |
-
"input": {
|
| 289 |
-
"$filter": {
|
| 290 |
-
"input": "$events",
|
| 291 |
-
"as": "event",
|
| 292 |
-
"cond": {
|
| 293 |
-
"$and": [
|
| 294 |
-
{"$eq": ["$$event.name", "table_download"]},
|
| 295 |
-
{"$gte": ["$$event.ts", "$filter_change_at"]},
|
| 296 |
-
]
|
| 297 |
-
},
|
| 298 |
-
}
|
| 299 |
-
},
|
| 300 |
-
"as": "event",
|
| 301 |
-
"in": "$$event.ts",
|
| 302 |
-
}
|
| 303 |
-
},
|
| 304 |
-
0,
|
| 305 |
-
]
|
| 306 |
-
},
|
| 307 |
-
}
|
| 308 |
-
},
|
| 309 |
-
{
|
| 310 |
-
"$group": {
|
| 311 |
-
"_id": None,
|
| 312 |
-
"step1_page_view": {
|
| 313 |
-
"$sum": {"$cond": [{"$ne": ["$page_view_at", None]}, 1, 0]}
|
| 314 |
-
},
|
| 315 |
-
"step2_benchmark_change": {
|
| 316 |
-
"$sum": {
|
| 317 |
-
"$cond": [
|
| 318 |
-
{
|
| 319 |
-
"$and": [
|
| 320 |
-
{"$ne": ["$page_view_at", None]},
|
| 321 |
-
{"$gte": ["$benchmark_change_at", "$page_view_at"]},
|
| 322 |
-
]
|
| 323 |
-
},
|
| 324 |
-
1,
|
| 325 |
-
0,
|
| 326 |
-
]
|
| 327 |
-
}
|
| 328 |
-
},
|
| 329 |
-
"step3_filter_change": {
|
| 330 |
-
"$sum": {
|
| 331 |
-
"$cond": [
|
| 332 |
-
{
|
| 333 |
-
"$and": [
|
| 334 |
-
{"$ne": ["$page_view_at", None]},
|
| 335 |
-
{"$gte": ["$benchmark_change_at", "$page_view_at"]},
|
| 336 |
-
{"$gte": ["$filter_change_at", "$benchmark_change_at"]},
|
| 337 |
-
]
|
| 338 |
-
},
|
| 339 |
-
1,
|
| 340 |
-
0,
|
| 341 |
-
]
|
| 342 |
-
}
|
| 343 |
-
},
|
| 344 |
-
"step4_table_download": {
|
| 345 |
-
"$sum": {
|
| 346 |
-
"$cond": [
|
| 347 |
-
{
|
| 348 |
-
"$and": [
|
| 349 |
-
{"$ne": ["$page_view_at", None]},
|
| 350 |
-
{"$gte": ["$benchmark_change_at", "$page_view_at"]},
|
| 351 |
-
{"$gte": ["$filter_change_at", "$benchmark_change_at"]},
|
| 352 |
-
{"$gte": ["$table_download_at", "$filter_change_at"]},
|
| 353 |
-
]
|
| 354 |
-
},
|
| 355 |
-
1,
|
| 356 |
-
0,
|
| 357 |
-
]
|
| 358 |
-
}
|
| 359 |
-
},
|
| 360 |
-
}
|
| 361 |
-
},
|
| 362 |
-
{
|
| 363 |
-
"$project": {
|
| 364 |
-
"_id": 0,
|
| 365 |
-
"step1_page_view": 1,
|
| 366 |
-
"step2_benchmark_change": 1,
|
| 367 |
-
"step3_filter_change": 1,
|
| 368 |
-
"step4_table_download": 1,
|
| 369 |
-
}
|
| 370 |
-
},
|
| 371 |
-
]
|
| 372 |
-
return list(self.events_collection.aggregate(pipeline))
|
| 373 |
-
|
| 374 |
-
def visitors_new_vs_returning(self, filters: QueryFilters) -> list[dict]:
    """Count distinct page-view visitors per period, split into new vs returning.

    A page view is flagged ``is_new`` when its ``event_ts`` formats to the
    same ``%Y-%m-%d`` day as the visitor's ``first_seen`` timestamp.
    ``first_seen`` is produced by a window over each visitor's page views
    sorted by ``event_ts`` and is computed *before* the time/benchmark
    filter is applied; no time prefilter precedes the window stage here.

    Args:
        filters: Query window, optional benchmark and period granularity.

    Returns:
        One dict per ``(period, is_new)`` pair with keys ``period``,
        ``is_new`` and ``visitor_count``, sorted by period ascending and
        ``is_new`` descending.
    """

    def day_of(field: str) -> dict:
        # Day-resolution string used to compare two timestamps by calendar day.
        return {"$dateToString": {"format": "%Y-%m-%d", "date": field}}

    bucket = _period_expression(filters.granularity)

    # NOTE(review): ``event_ts`` is presumably added by _with_normalized_time() — confirm.
    normalize_stage = _with_normalized_time()
    identified_page_views = {
        "$match": {
            "event_name": "page_view",
            "visitor_id": {"$nin": [None, ""]},
        }
    }
    first_seen_window = {
        "$setWindowFields": {
            "partitionBy": "$visitor_id",
            "sortBy": {"event_ts": 1},
            "output": {"first_seen": {"$first": "$event_ts"}},
        }
    }
    in_scope = {"$match": _with_time_and_optional_benchmark(filters)}
    classify = {
        "$project": {
            "period": bucket,
            "is_new": {"$eq": [day_of("$event_ts"), day_of("$first_seen")]},
            "visitor_id": 1,
        }
    }
    collect_visitors = {
        "$group": {
            "_id": {"period": "$period", "is_new": "$is_new"},
            "visitors": {"$addToSet": "$visitor_id"},
        }
    }
    flatten = {
        "$project": {
            "_id": 0,
            "period": "$_id.period",
            "is_new": "$_id.is_new",
            "visitor_count": _non_empty_set_size("visitors", "v"),
        }
    }
    ordering = {"$sort": {"period": 1, "is_new": -1}}

    stages: list[dict] = [
        normalize_stage,
        identified_page_views,
        first_seen_window,
        in_scope,
        classify,
        collect_visitors,
        flatten,
        ordering,
    ]
    cursor = self.events_collection.aggregate(stages)
    return list(cursor)
|
| 421 |
-
|
| 422 |
-
def visitor_ip_counts(self, filters: QueryFilters) -> list[dict]:
    """Return page-view counts grouped by ``properties.ip``.

    Only ``page_view`` events whose ``properties.ip`` is neither ``None``
    nor the empty string are counted, on top of the time/benchmark
    criteria derived from ``filters``.

    Args:
        filters: Query window and optional benchmark.

    Returns:
        Dicts of the form ``{"ip": ..., "pv": ...}`` sorted by ``pv``
        descending.
    """
    prefilter_stage = {"$match": _indexed_time_prefilter(filters)}
    normalize_stage = _with_normalized_time()

    # Start from the shared time/benchmark criteria, then narrow to
    # page views that actually carry an IP address.
    criteria = dict(_with_time_and_optional_benchmark(filters))
    criteria["event_name"] = "page_view"
    criteria["properties.ip"] = {"$nin": [None, ""]}

    stages: list[dict] = [
        prefilter_stage,
        normalize_stage,
        {"$match": criteria},
        {"$group": {"_id": "$properties.ip", "pv": {"$sum": 1}}},
        {"$project": {"_id": 0, "ip": "$_id", "pv": 1}},
        {"$sort": {"pv": -1}},
    ]
    cursor = self.events_collection.aggregate(stages)
    return list(cursor)
|
| 438 |
-
|
| 439 |
-
def available_benchmarks(
    self, filters: QueryFilters | None = None, limit: int = 100
) -> list[str]:
    """List distinct non-empty ``benchmark`` values, sorted ascending.

    Args:
        filters: When given, only events matching the time/benchmark
            criteria derived from it are considered; when ``None`` the
            whole collection is scanned.
        limit: Maximum number of benchmark names returned.

    Returns:
        Benchmark names in ascending order, at most ``limit`` entries.
    """
    if filters is None:
        scope: list[dict] = []
    else:
        scope = [
            {"$match": _indexed_time_prefilter(filters)},
            _with_normalized_time(),
            {"$match": _with_time_and_optional_benchmark(filters)},
        ]

    distinct_names: list[dict] = [
        {"$match": {"benchmark": {"$nin": [None, ""]}}},
        {"$group": {"_id": "$benchmark"}},
        {"$sort": {"_id": 1}},
        {"$limit": limit},
    ]

    cursor = self.events_collection.aggregate([*scope, *distinct_names])
    return [document["_id"] for document in cursor]
|
| 460 |
-
|
| 461 |
-
@staticmethod
|
| 462 |
-
def safe_first(items: Iterable[dict]) -> dict:
|
| 463 |
-
return next(iter(items), {})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/leaderboard_analytics/schemas.py
DELETED
|
@@ -1,27 +0,0 @@
|
|
| 1 |
-
from datetime import UTC, datetime
|
| 2 |
-
from enum import StrEnum
|
| 3 |
-
|
| 4 |
-
from pydantic import BaseModel, Field, model_validator
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
class Granularity(StrEnum):
|
| 8 |
-
DAY = "day"
|
| 9 |
-
WEEK = "week"
|
| 10 |
-
MONTH = "month"
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
def _utc_now() -> datetime:
    """Return the current timezone-aware UTC time."""
    return datetime.now(tz=UTC)


def _utc_start_of_today() -> datetime:
    """Return midnight (00:00:00.000000) of the current UTC day."""
    return _utc_now().replace(hour=0, minute=0, second=0, microsecond=0)


class QueryFilters(BaseModel):
    """Filter set shared by the analytics queries.

    Raises:
        ValueError: From validation when ``start_time`` is later than
            ``end_time``.
    """

    # Defaults to the start of the current UTC day.
    start_time: datetime = Field(default_factory=_utc_start_of_today)
    # Defaults to "now" in UTC at construction time.
    end_time: datetime = Field(default_factory=_utc_now)
    # Optional benchmark name; None means no benchmark restriction.
    benchmark: str | None = None
    granularity: Granularity = Granularity.DAY

    @model_validator(mode="after")
    def validate_time_range(self) -> "QueryFilters":
        """Reject ranges whose end precedes their start."""
        if self.end_time < self.start_time:
            raise ValueError("start_time must be earlier than or equal to end_time")
        return self
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/leaderboard_analytics/services.py
DELETED
|
@@ -1,264 +0,0 @@
|
|
| 1 |
-
import ipaddress
|
| 2 |
-
from pathlib import Path
|
| 3 |
-
from typing import Any, Protocol
|
| 4 |
-
|
| 5 |
-
import pandas as pd
|
| 6 |
-
|
| 7 |
-
from leaderboard_analytics.repositories import AnalyticsRepository
|
| 8 |
-
from leaderboard_analytics.schemas import QueryFilters
|
| 9 |
-
|
| 10 |
-
UNKNOWN_COUNTRY_CODE = "Unknown"
|
| 11 |
-
UNKNOWN_COUNTRY_NAME = "Unknown"
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
def _empty_ip_debug() -> dict[str, object]:
|
| 15 |
-
return {
|
| 16 |
-
"total_unique_ips": 0,
|
| 17 |
-
"total_ip_pv": 0,
|
| 18 |
-
"global_ips": 0,
|
| 19 |
-
"global_ip_pv": 0,
|
| 20 |
-
"private_ips": 0,
|
| 21 |
-
"private_ip_pv": 0,
|
| 22 |
-
"loopback_ips": 0,
|
| 23 |
-
"loopback_ip_pv": 0,
|
| 24 |
-
"reserved_ips": 0,
|
| 25 |
-
"reserved_ip_pv": 0,
|
| 26 |
-
"link_local_ips": 0,
|
| 27 |
-
"link_local_ip_pv": 0,
|
| 28 |
-
"multicast_ips": 0,
|
| 29 |
-
"multicast_ip_pv": 0,
|
| 30 |
-
"unspecified_ips": 0,
|
| 31 |
-
"unspecified_ip_pv": 0,
|
| 32 |
-
"invalid_ips": 0,
|
| 33 |
-
"invalid_ip_pv": 0,
|
| 34 |
-
"top_ip_pv_buckets": {
|
| 35 |
-
"1": 0,
|
| 36 |
-
"2-10": 0,
|
| 37 |
-
"11-100": 0,
|
| 38 |
-
"101-1000": 0,
|
| 39 |
-
">1000": 0,
|
| 40 |
-
},
|
| 41 |
-
}
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
def _ip_debug_category(ip_address: str) -> str:
|
| 45 |
-
try:
|
| 46 |
-
parsed_ip = ipaddress.ip_address(ip_address.strip())
|
| 47 |
-
except ValueError:
|
| 48 |
-
return "invalid"
|
| 49 |
-
|
| 50 |
-
if parsed_ip.is_global:
|
| 51 |
-
return "global"
|
| 52 |
-
if parsed_ip.is_loopback:
|
| 53 |
-
return "loopback"
|
| 54 |
-
if parsed_ip.is_private:
|
| 55 |
-
return "private"
|
| 56 |
-
if parsed_ip.is_reserved:
|
| 57 |
-
return "reserved"
|
| 58 |
-
if parsed_ip.is_link_local:
|
| 59 |
-
return "link_local"
|
| 60 |
-
if parsed_ip.is_multicast:
|
| 61 |
-
return "multicast"
|
| 62 |
-
if parsed_ip.is_unspecified:
|
| 63 |
-
return "unspecified"
|
| 64 |
-
return "reserved"
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
def _ip_pv_bucket(pv: int) -> str:
|
| 68 |
-
if pv <= 1:
|
| 69 |
-
return "1"
|
| 70 |
-
if pv <= 10:
|
| 71 |
-
return "2-10"
|
| 72 |
-
if pv <= 100:
|
| 73 |
-
return "11-100"
|
| 74 |
-
if pv <= 1000:
|
| 75 |
-
return "101-1000"
|
| 76 |
-
return ">1000"
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
class GeoIpCountryReader(Protocol):
    """Structural type for objects that can look up a country record by IP."""

    def country(self, ip_address: str) -> Any:
        """Return the country lookup result for ``ip_address``."""
        ...
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
class GeoIpResolver:
    """Resolve IP addresses to ``(country_code, country_name)`` pairs.

    A reader can be injected directly; otherwise one is opened lazily from
    ``database_path`` on the first lookup. Loading is attempted at most
    once, and every failure path yields the "unknown" pair rather than
    raising.
    """

    def __init__(
        self,
        database_path: str | Path | None = None,
        reader: GeoIpCountryReader | None = None,
    ) -> None:
        """Store the database location and optional pre-built reader.

        Args:
            database_path: Location of the country database; a falsy value
                means "not configured".
            reader: Ready-to-use reader. When given, no load from disk is
                attempted.
        """
        self.database_path = None if not database_path else Path(database_path)
        self._reader = reader
        # An injected reader counts as an already completed load attempt.
        self._load_attempted = reader is not None

    def resolve_country(self, ip_address: str) -> tuple[str, str]:
        """Return the country code and name for ``ip_address``.

        The unknown pair is returned for unparsable or non-global
        addresses, when no reader is available, when the lookup raises,
        or when the record carries no ISO code.
        """
        unknown = (UNKNOWN_COUNTRY_CODE, UNKNOWN_COUNTRY_NAME)

        try:
            address = ipaddress.ip_address(ip_address.strip())
        except ValueError:
            return unknown
        if not address.is_global:
            return unknown

        reader = self._get_reader()
        if reader is None:
            return unknown

        try:
            record = reader.country(str(address))
        except Exception:
            # Best effort: any lookup failure is reported as unknown.
            return unknown

        # Prefer the located country; fall back to the registered one when
        # the former has no ISO code.
        candidate = record.country
        if not getattr(candidate, "iso_code", None):
            candidate = record.registered_country

        iso_code = getattr(candidate, "iso_code", None)
        if not iso_code:
            return unknown
        display_name = getattr(candidate, "name", None) or iso_code
        return iso_code, display_name

    def debug_status(self) -> dict[str, object]:
        """Report the configuration and load state of the GeoIP database."""
        path = self.database_path
        return {
            "database_path": str(path) if path else "",
            "database_configured": path is not None,
            "database_exists": path.exists() if path else False,
            "load_attempted": self._load_attempted,
            "reader_loaded": self._reader is not None,
        }

    def _get_reader(self) -> GeoIpCountryReader | None:
        """Return the reader, opening the database on first use.

        Returns ``None`` when no database is configured, the file is
        missing, or opening it fails; later calls do not retry.
        """
        if self._reader is not None or self._load_attempted:
            # Either already loaded, or a previous attempt left it as None.
            return self._reader

        self._load_attempted = True
        path = self.database_path
        if path is None or not path.exists():
            return None

        try:
            # Imported lazily so the dependency is only needed when a
            # database is actually present.
            import geoip2.database

            self._reader = geoip2.database.Reader(str(path))
        except Exception:
            return None
        return self._reader
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
class AnalyticsService:
    """Turn repository query results into DataFrames and summary dicts."""

    def __init__(
        self,
        repository: AnalyticsRepository,
        geoip_database_path: str | Path | None = None,
        geoip_resolver: GeoIpResolver | None = None,
    ) -> None:
        """Bind the repository and a GeoIP resolver.

        Args:
            repository: Source of aggregated analytics rows.
            geoip_database_path: Used to build a ``GeoIpResolver`` when
                ``geoip_resolver`` is not supplied.
            geoip_resolver: Pre-built resolver; takes precedence when truthy.
        """
        self.repository = repository
        self.geoip_resolver = geoip_resolver or GeoIpResolver(geoip_database_path)

    def get_overview(self, filters: QueryFilters) -> tuple[pd.DataFrame, dict]:
        """Return the overview time series and headline totals.

        The totals dict holds integer ``pv``, ``uv``, ``sessions`` and
        ``events`` plus the ratios ``events_per_session`` and
        ``sessions_per_visitor`` (rounded to 2 decimals, ``0.0`` when the
        denominator is zero).
        """
        frame = pd.DataFrame(self.repository.overview_timeseries(filters))
        raw_totals = self.repository.overview_totals(filters)

        totals = {
            metric: int(raw_totals.get(metric, 0))
            for metric in ("pv", "uv", "sessions", "events")
        }

        sessions = totals["sessions"]
        visitors = totals["uv"]
        totals["events_per_session"] = (
            round(totals["events"] / sessions, 2) if sessions else 0.0
        )
        totals["sessions_per_visitor"] = (
            round(sessions / visitors, 2) if visitors else 0.0
        )
        return frame, totals

    def get_benchmark_top(self, filters: QueryFilters) -> pd.DataFrame:
        """Return the repository's ``benchmark_top`` rows as a DataFrame."""
        rows = self.repository.benchmark_top(filters)
        return pd.DataFrame(rows)

    def get_filter_distribution(self, filters: QueryFilters) -> pd.DataFrame:
        """Return the repository's ``filter_distribution`` rows as a DataFrame."""
        rows = self.repository.filter_distribution(filters)
        return pd.DataFrame(rows)

    def get_funnel(self, filters: QueryFilters) -> pd.DataFrame:
        """Return the four-step funnel with per-step conversion rates.

        ``conversion_rate`` is each step's session count as a percentage
        of the first step, rounded to 2 decimals (``0.0`` when the first
        step is zero).
        """
        raw = self.repository.safe_first(self.repository.funnel(filters))

        # (display label, key in the raw funnel document)
        steps = (
            ("page_view", "step1_page_view"),
            ("benchmark_change", "step2_benchmark_change"),
            ("filter_change_*", "step3_filter_change"),
            ("table_download", "step4_table_download"),
        )
        frame = pd.DataFrame(
            [{"step": label, "sessions": raw.get(key, 0)} for label, key in steps]
        )
        step1 = int(frame.iloc[0]["sessions"]) if not frame.empty else 0

        def as_percentage(count):
            return round((count / step1) * 100, 2) if step1 else 0.0

        frame["conversion_rate"] = frame["sessions"].apply(as_percentage)
        return frame

    def get_new_vs_returning(self, filters: QueryFilters) -> pd.DataFrame:
        """Return new/returning visitor counts with a ``visitor_type`` label."""
        frame = pd.DataFrame(self.repository.visitors_new_vs_returning(filters))
        if not frame.empty:
            labels = {True: "new", False: "returning"}
            frame["visitor_type"] = frame["is_new"].map(labels)
        return frame

    def get_visitor_locations(self, filters: QueryFilters) -> pd.DataFrame:
        """Return only the per-country frame from ``get_visitor_location_details``."""
        return self.get_visitor_location_details(filters)[0]

    def get_visitor_location_details(self, filters: QueryFilters) -> tuple[pd.DataFrame, dict]:
        """Aggregate per-IP page views by country and collect IP diagnostics.

        Returns:
            A frame with columns ``country_code``, ``country_name``, ``pv``
            and ``ip_count`` sorted by ``pv`` then ``ip_count`` descending,
            and a debug dict of per-category IP counters.
        """
        locations: dict[tuple[str, str], dict[str, int | str]] = {}
        ip_debug = _empty_ip_debug()

        for row in self.repository.visitor_ip_counts(filters):
            ip = str(row.get("ip", "")).strip()
            if not ip:
                continue
            pv = int(row.get("pv", 0))

            # Diagnostics: overall totals plus the counters for this IP's category.
            category = _ip_debug_category(ip)
            increments = (
                ("total_unique_ips", 1),
                ("total_ip_pv", pv),
                (f"{category}_ips", 1),
                (f"{category}_ip_pv", pv),
            )
            for counter, amount in increments:
                ip_debug[counter] = int(ip_debug[counter]) + amount
            ip_debug["top_ip_pv_buckets"][_ip_pv_bucket(pv)] += 1  # type: ignore[index]

            # Per-country rollup.
            code, name = self.geoip_resolver.resolve_country(ip)
            bucket = locations.setdefault(
                (code, name),
                {"country_code": code, "country_name": name, "pv": 0, "ip_count": 0},
            )
            bucket["pv"] = int(bucket["pv"]) + pv
            bucket["ip_count"] = int(bucket["ip_count"]) + 1

        frame = pd.DataFrame(
            locations.values(),
            columns=["country_code", "country_name", "pv", "ip_count"],
        )
        if not frame.empty:
            ordered = frame.sort_values(["pv", "ip_count"], ascending=[False, False])
            frame = ordered.reset_index(drop=True)
        return frame, ip_debug

    def get_geoip_debug_info(self) -> dict[str, object]:
        """Return the resolver's debug status, or an all-negative default.

        The default is used when the resolver exposes no ``debug_status``
        attribute.
        """
        debug_status = getattr(self.geoip_resolver, "debug_status", None)
        if debug_status is not None:
            return debug_status()
        return {
            "database_path": "",
            "database_configured": False,
            "database_exists": False,
            "load_attempted": False,
            "reader_loaded": False,
        }

    def get_available_benchmarks(self, filters: QueryFilters | None = None) -> list[str]:
        """Return the benchmark names available for ``filters``."""
        return self.repository.available_benchmarks(filters)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/leaderboard_analytics/ui.py
DELETED
|
@@ -1,481 +0,0 @@
|
|
| 1 |
-
import math
|
| 2 |
-
import tempfile
|
| 3 |
-
import zipfile
|
| 4 |
-
from datetime import UTC, datetime, timedelta
|
| 5 |
-
from pathlib import Path
|
| 6 |
-
from typing import Any
|
| 7 |
-
|
| 8 |
-
import gradio as gr
|
| 9 |
-
import pandas as pd
|
| 10 |
-
import plotly.express as px
|
| 11 |
-
import plotly.graph_objects as go
|
| 12 |
-
|
| 13 |
-
from leaderboard_analytics.schemas import Granularity, QueryFilters
|
| 14 |
-
from leaderboard_analytics.services import AnalyticsService
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
def _to_utc_datetime(value: Any, fallback: datetime) -> datetime:
|
| 18 |
-
if value is None or value == "":
|
| 19 |
-
return fallback
|
| 20 |
-
|
| 21 |
-
if isinstance(value, datetime):
|
| 22 |
-
dt = value
|
| 23 |
-
elif isinstance(value, (int, float)):
|
| 24 |
-
if isinstance(value, float) and math.isnan(value):
|
| 25 |
-
return fallback
|
| 26 |
-
# Gradio DateTime may return Unix timestamps as numbers.
|
| 27 |
-
dt = datetime.fromtimestamp(value, tz=UTC)
|
| 28 |
-
elif isinstance(value, str):
|
| 29 |
-
dt = datetime.fromisoformat(value)
|
| 30 |
-
else:
|
| 31 |
-
raise ValueError(f"Unsupported datetime value type: {type(value)!r}")
|
| 32 |
-
|
| 33 |
-
# Gradio DateTime may return naive datetime values in local time.
|
| 34 |
-
if dt.tzinfo is None:
|
| 35 |
-
dt = dt.replace(tzinfo=UTC)
|
| 36 |
-
return dt.astimezone(UTC)
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
def _empty_plot(title: str):
    """Return a data-less line figure that only carries ``title``."""
    placeholder = px.line(title=title)
    return placeholder
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
def _empty_map(title: str):
    """Return a trace-less figure styled like the visitor-location map."""
    blank = go.Figure()
    _style_visitor_location_map(blank, title)
    return blank
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
def _query_range_text(filters: QueryFilters) -> str:
    """Format the filter window as ``"<start ISO> to <end ISO>"``."""
    start_iso = filters.start_time.isoformat()
    end_iso = filters.end_time.isoformat()
    return f"{start_iso} to {end_iso}"
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
def _write_csv_archive(tables: dict[str, pd.DataFrame]) -> str | None:
|
| 54 |
-
if all(table.empty for table in tables.values()):
|
| 55 |
-
return None
|
| 56 |
-
|
| 57 |
-
archive = tempfile.NamedTemporaryFile(
|
| 58 |
-
prefix="leaderboard-analytics-", suffix=".zip", delete=False
|
| 59 |
-
)
|
| 60 |
-
archive.close()
|
| 61 |
-
with zipfile.ZipFile(archive.name, "w", compression=zipfile.ZIP_DEFLATED) as zip_file:
|
| 62 |
-
for name, table in tables.items():
|
| 63 |
-
zip_file.writestr(f"{name}.csv", table.to_csv(index=False))
|
| 64 |
-
return archive.name
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
def _visitor_location_top_table(visitor_locations: pd.DataFrame) -> pd.DataFrame:
|
| 68 |
-
if visitor_locations.empty:
|
| 69 |
-
return pd.DataFrame(columns=["Region", "Users"])
|
| 70 |
-
|
| 71 |
-
return (
|
| 72 |
-
visitor_locations.sort_values(["ip_count", "pv"], ascending=[False, False])
|
| 73 |
-
.head(10)
|
| 74 |
-
.rename(columns={"country_name": "Region", "ip_count": "Users"})[["Region", "Users"]]
|
| 75 |
-
.reset_index(drop=True)
|
| 76 |
-
)
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
def _visitor_location_debug_text(
|
| 80 |
-
visitor_locations: pd.DataFrame,
|
| 81 |
-
geoip_debug: dict[str, object],
|
| 82 |
-
ip_debug: dict[str, object] | None = None,
|
| 83 |
-
) -> str:
|
| 84 |
-
if visitor_locations.empty:
|
| 85 |
-
total_pv = 0
|
| 86 |
-
total_users = 0
|
| 87 |
-
mapped_regions = 0
|
| 88 |
-
unknown_pv = 0
|
| 89 |
-
unknown_users = 0
|
| 90 |
-
else:
|
| 91 |
-
unknown_rows = visitor_locations[visitor_locations["country_code"] == "Unknown"]
|
| 92 |
-
mapped_rows = visitor_locations[visitor_locations["country_code"] != "Unknown"]
|
| 93 |
-
total_pv = int(visitor_locations["pv"].sum())
|
| 94 |
-
total_users = int(visitor_locations["ip_count"].sum())
|
| 95 |
-
mapped_regions = len(mapped_rows)
|
| 96 |
-
unknown_pv = int(unknown_rows["pv"].sum()) if not unknown_rows.empty else 0
|
| 97 |
-
unknown_users = int(unknown_rows["ip_count"].sum()) if not unknown_rows.empty else 0
|
| 98 |
-
|
| 99 |
-
configured = "yes" if geoip_debug.get("database_configured") else "no"
|
| 100 |
-
exists = "yes" if geoip_debug.get("database_exists") else "no"
|
| 101 |
-
loaded = "yes" if geoip_debug.get("reader_loaded") else "no"
|
| 102 |
-
attempted = "yes" if geoip_debug.get("load_attempted") else "no"
|
| 103 |
-
path = geoip_debug.get("database_path") or "(not configured)"
|
| 104 |
-
ip_debug = ip_debug or {}
|
| 105 |
-
global_ips = int(ip_debug.get("global_ips", 0))
|
| 106 |
-
global_pv = int(ip_debug.get("global_ip_pv", 0))
|
| 107 |
-
private_ips = int(ip_debug.get("private_ips", 0))
|
| 108 |
-
private_pv = int(ip_debug.get("private_ip_pv", 0))
|
| 109 |
-
loopback_ips = int(ip_debug.get("loopback_ips", 0))
|
| 110 |
-
loopback_pv = int(ip_debug.get("loopback_ip_pv", 0))
|
| 111 |
-
invalid_ips = int(ip_debug.get("invalid_ips", 0))
|
| 112 |
-
invalid_pv = int(ip_debug.get("invalid_ip_pv", 0))
|
| 113 |
-
buckets = ip_debug.get("top_ip_pv_buckets", {})
|
| 114 |
-
|
| 115 |
-
return (
|
| 116 |
-
f"GeoIP DB: configured={configured}, exists={exists}, loaded={loaded}, "
|
| 117 |
-
f"load_attempted={attempted} \n"
|
| 118 |
-
f"GeoIP path: `{path}` \n"
|
| 119 |
-
f"Total location PV: {total_pv} | Users/IPs: {total_users} | "
|
| 120 |
-
f"Mapped regions: {mapped_regions} \n"
|
| 121 |
-
f"Unknown PV: {unknown_pv} | Unknown users/IPs: {unknown_users} \n"
|
| 122 |
-
f"Public IPs: {global_ips} ({global_pv} PV) | Private IPs: {private_ips} "
|
| 123 |
-
f"({private_pv} PV) \n"
|
| 124 |
-
f"Loopback IPs: {loopback_ips} ({loopback_pv} PV) | Invalid IPs: {invalid_ips} "
|
| 125 |
-
f"({invalid_pv} PV) \n"
|
| 126 |
-
f"PV/IP buckets: {buckets}"
|
| 127 |
-
)
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
def _style_visitor_location_map(figure: go.Figure, title: str) -> None:
    """Apply the shared geo and layout styling to ``figure`` in place.

    Args:
        figure: Figure to restyle; modified in place.
        title: Text shown as the left-aligned figure title.
    """
    outline_color = "#cfd6df"
    water_color = "#f8fafc"
    white = "#ffffff"

    geo_style = {
        "projection_type": "mercator",
        "showframe": False,
        "showcoastlines": True,
        "coastlinecolor": outline_color,
        "coastlinewidth": 0.6,
        "showcountries": True,
        "countrycolor": outline_color,
        "countrywidth": 0.7,
        "showland": True,
        "landcolor": "#eef2f7",
        "showocean": True,
        "oceancolor": water_color,
        "showlakes": True,
        "lakecolor": water_color,
        "bgcolor": white,
        "lataxis_range": [-55, 75],
        "lonaxis_range": [-180, 180],
    }
    layout_style = {
        "title": {"text": title, "x": 0.02, "xanchor": "left"},
        "height": 560,
        "paper_bgcolor": white,
        "plot_bgcolor": white,
        "font": {"color": "#1f2937"},
        "margin": {"l": 0, "r": 0, "t": 52, "b": 0},
        "showlegend": False,
        "hoverlabel": {
            "bgcolor": white,
            "bordercolor": "#3b82f6",
            "font_color": "#111827",
        },
    }

    figure.update_geos(**geo_style)
    figure.update_layout(**layout_style)
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
def _visitor_location_map(visitor_locations: pd.DataFrame, range_text: str) -> go.Figure:
    """Build a bubble map of page views per country.

    Rows whose ``country_code`` is ``"Unknown"`` are left off the map. When
    nothing remains, an empty styled map is returned whose title mentions
    ``range_text``.

    Args:
        visitor_locations: Frame with ``country_code``, ``country_name``,
            ``pv`` and ``ip_count`` columns; may be empty.
        range_text: Human-readable query range, used only in the
            "no mapped data" title.

    Returns:
        The styled figure, with a summary annotation when data is present.
    """
    if visitor_locations.empty:
        mapped = visitor_locations.copy()
    else:
        mapped = visitor_locations[visitor_locations["country_code"] != "Unknown"].copy()

    if mapped.empty:
        return _empty_map(f"Visitor locations by country (no mapped data for {range_text})")

    peak_pv = max(int(mapped["pv"].max()), 1)
    # NOTE(review): 52 is presumably the target maximum marker diameter for
    # area-scaled bubbles — confirm.
    bubble_style = {
        "size": mapped["pv"],
        "sizemode": "area",
        "sizeref": 2.0 * peak_pv / (52**2),
        "sizemin": 8,
        "color": "rgba(59, 130, 246, 0.55)",
        "line": {"color": "rgba(37, 99, 235, 0.92)", "width": 1.2},
    }
    hover_text = (
        "<b>%{text}</b><br>"
        "Country code: %{customdata[0]}<br>"
        "PV: %{customdata[1]:,}<br>"
        "Users/IPs: %{customdata[2]:,}<extra></extra>"
    )
    bubbles = go.Scattergeo(
        locationmode="country names",
        locations=mapped["country_name"],
        mode="markers",
        text=mapped["country_name"],
        customdata=mapped[["country_code", "pv", "ip_count"]],
        hovertemplate=hover_text,
        marker=bubble_style,
    )

    figure = go.Figure(bubbles)
    _style_visitor_location_map(figure, "Visitor locations by country")

    summary = (
        f"Mapped regions: {len(mapped)}<br>"
        f"Mapped PV: {int(mapped['pv'].sum()):,}<br>"
        f"Users/IPs: {int(mapped['ip_count'].sum()):,}"
    )
    figure.add_annotation(
        x=0.02,
        y=0.08,
        xref="paper",
        yref="paper",
        text=summary,
        showarrow=False,
        align="left",
        bgcolor="rgba(255, 255, 255, 0.88)",
        bordercolor="rgba(148, 163, 184, 0.55)",
        borderwidth=1,
        font={"color": "#1f2937", "size": 12},
    )
    return figure
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
def build_dashboard(service: AnalyticsService) -> gr.Blocks:
    """Assemble the Gradio analytics dashboard around ``service``.

    The returned ``gr.Blocks`` app contains the query controls (time range,
    benchmark, granularity), six plots, a location debug panel with a
    top-regions table, and a collapsed "Raw data" accordion holding the
    underlying tables and a CSV export. The ``query`` callback runs both when
    the Refresh button is clicked and once on page load.

    Args:
        service: Analytics service queried for every chart and table.

    Returns:
        The configured ``gr.Blocks`` instance (not yet launched).
    """
    # Default window: the seven days ending at the moment the dashboard is built.
    # NOTE(review): these values are captured once here and reused as fallbacks
    # in every later query() call — confirm that is intended for long-running apps.
    default_end = datetime.now(tz=UTC)
    default_start = (default_end - timedelta(days=7)).replace(microsecond=0)

    def load_benchmarks() -> object:
        """Return a dropdown update listing "" plus the available benchmarks."""
        try:
            benchmarks = service.get_available_benchmarks()
        except Exception:
            # Best effort: any failure leaves the dropdown with only the "" entry.
            benchmarks = []
        return gr.update(choices=[""] + benchmarks, value="")

    def query(
        start_time: datetime | str | None,
        end_time: datetime | str | None,
        benchmark: str,
        granularity: str,
    ) -> tuple[
        object,
        object,
        object,
        object,
        object,
        object,
        object,
        object,
        object,
        object,
        object,
        object,
        object,
        object,
        object,
        object,
    ]:
        """Run every analytics query and return values for the 16 outputs.

        The returned tuple is positional: it must stay in the same order as the
        ``outputs`` list defined below. Any exception raised while querying or
        plotting is converted into a "Query failed" message with empty
        placeholders, so the callback itself never raises.
        """
        try:
            filters = QueryFilters(
                start_time=_to_utc_datetime(start_time, default_start),
                end_time=_to_utc_datetime(end_time, default_end),
                # An empty dropdown selection is passed on as None.
                benchmark=benchmark or None,
                granularity=Granularity(granularity),
            )
            overview_df, totals = service.get_overview(filters)
            benchmark_df = service.get_benchmark_top(filters)
            filter_df = service.get_filter_distribution(filters)
            funnel_df = service.get_funnel(filters)
            visitors_df = service.get_new_vs_returning(filters)
            visitor_locations_df, ip_debug = service.get_visitor_location_details(filters)
            visitor_locations_top_df = _visitor_location_top_table(visitor_locations_df)
            visitor_locations_debug = _visitor_location_debug_text(
                visitor_locations_df,
                service.get_geoip_debug_info(),
                ip_debug,
            )

            range_text = _query_range_text(filters)
            # funnel_df is not part of this emptiness check.
            if (
                overview_df.empty
                and benchmark_df.empty
                and filter_df.empty
                and visitors_df.empty
                and visitor_locations_df.empty
            ):
                metrics = f"No data for {range_text}."
            else:
                metrics = (
                    f"Range: {range_text} \n"
                    f"PV: {totals['pv']} | UV: {totals['uv']} | Sessions: {totals['sessions']} | "
                    f"Events/Session: {totals['events_per_session']} | "
                    f"Sessions/Visitor: {totals['sessions_per_visitor']}"
                )

            # Each chart falls back to a titled empty figure when its frame is empty.
            overview_plot = (
                px.line(
                    overview_df,
                    x="period",
                    y=["pv", "uv", "session_count"],
                    title="Traffic overview",
                )
                if not overview_df.empty
                else _empty_plot(f"Traffic overview (no data for {range_text})")
            )
            benchmark_plot = (
                px.bar(benchmark_df, x="benchmark", y="count", title="Benchmark Top")
                if not benchmark_df.empty
                else px.bar(title=f"Benchmark Top (no data for {range_text})")
            )
            filter_plot = (
                px.bar(filter_df, x="event_name", y="count", title="Filter usage")
                if not filter_df.empty
                else px.bar(title=f"Filter usage (no data for {range_text})")
            )
            # NOTE(review): unlike the other charts, the funnel has no empty-frame
            # fallback — confirm get_funnel always returns "step"/"sessions" columns.
            funnel_plot = px.funnel(funnel_df, x="sessions", y="step", title="Session funnel")
            visitor_plot = (
                px.bar(
                    visitors_df,
                    x="period",
                    y="visitor_count",
                    color="visitor_type",
                    barmode="group",
                    title="New vs returning visitors",
                )
                if not visitors_df.empty
                else px.bar(title=f"New vs returning visitors (no data for {range_text})")
            )
            visitor_locations_plot = _visitor_location_map(visitor_locations_df, range_text)
            csv_archive = _write_csv_archive(
                {
                    "overview": overview_df,
                    "benchmarks": benchmark_df,
                    "filters": filter_df,
                    "funnel": funnel_df,
                    "visitors": visitors_df,
                    "visitor_locations": visitor_locations_df,
                }
            )

            # Order mirrors the `outputs` list: text, six plots, debug text,
            # top-regions table, six raw tables, CSV file.
            return (
                metrics,
                overview_plot,
                benchmark_plot,
                filter_plot,
                funnel_plot,
                visitor_plot,
                visitor_locations_plot,
                visitor_locations_debug,
                visitor_locations_top_df,
                overview_df,
                benchmark_df,
                filter_df,
                funnel_df,
                visitors_df,
                visitor_locations_df,
                csv_archive,
            )
        except Exception as exc:
            # Surface the failure in the UI: the message becomes the metrics text,
            # the debug text, and the title of every placeholder figure.
            message = f"Query failed: {exc}"
            empty = pd.DataFrame()
            empty_top = pd.DataFrame(columns=["Region", "Users"])
            return (
                message,
                _empty_plot(message),
                px.bar(title=message),
                px.bar(title=message),
                px.funnel(
                    pd.DataFrame({"step": [], "sessions": []}),
                    x="sessions",
                    y="step",
                    title=message,
                ),
                px.bar(title=message),
                _empty_map(message),
                message,
                empty_top,
                empty,
                empty,
                empty,
                empty,
                empty,
                empty,
                None,
            )

    # NOTE(review): the nesting of the layout below was reconstructed from
    # blank-line grouping — confirm against the original file.
    with gr.Blocks() as demo:
        gr.Markdown("# Leaderboard Analytics Dashboard")
        gr.Markdown(
            "Analyze MTEB leaderboard behavior from MongoDB event logs. "
            "All metrics follow event-log-spec definitions."
        )

        # Query controls.
        with gr.Row():
            start_time = gr.DateTime(
                label="Start time",
                value=default_start,
                timezone="UTC",
            )
            end_time = gr.DateTime(
                label="End time",
                value=default_end,
                timezone="UTC",
            )
            benchmark = gr.Dropdown(
                label="Benchmark",
                choices=[""],
                value="",
                allow_custom_value=True,
            )
            granularity = gr.Dropdown(
                label="Granularity",
                choices=[Granularity.DAY.value, Granularity.WEEK.value, Granularity.MONTH.value],
                value=Granularity.DAY.value,
            )
            refresh = gr.Button("Refresh", variant="primary")

        # Placeholder summary shown until the first query result arrives.
        metrics_text = gr.Markdown(
            "PV: 0 | UV: 0 | Sessions: 0 | Events/Session: 0 | Sessions/Visitor: 0"
        )

        with gr.Row():
            overview_plot = gr.Plot(label="Traffic Overview")
            benchmark_plot = gr.Plot(label="Benchmark Analysis")
        with gr.Row():
            filter_plot = gr.Plot(label="Filter Behavior")
            funnel_plot = gr.Plot(label="Funnel")
            visitor_plot = gr.Plot(label="Visitor Segmentation")
        with gr.Row():
            with gr.Column(scale=2):
                visitor_locations_plot = gr.Plot(label="Visitor Locations")
            with gr.Column(scale=1):
                visitor_locations_debug = gr.Markdown(
                    "GeoIP DB: not checked \n"
                    "Total location PV: 0 | Users/IPs: 0 | Mapped regions: 0"
                )
                visitor_locations_top_table = gr.DataFrame(
                    label="Top 10 Regions",
                    interactive=False,
                    wrap=True,
                )

        with gr.Accordion("Raw data", open=False):
            csv_file = gr.File(label="CSV export")
            overview_table = gr.DataFrame(label="Traffic Overview")
            benchmark_table = gr.DataFrame(label="Benchmark Analysis")
            filter_table = gr.DataFrame(label="Filter Behavior")
            funnel_table = gr.DataFrame(label="Funnel")
            visitor_table = gr.DataFrame(label="Visitor Segmentation")
            visitor_locations_table = gr.DataFrame(label="Visitor Locations")

        # Must stay position-aligned with both tuples returned by query().
        outputs = [
            metrics_text,
            overview_plot,
            benchmark_plot,
            filter_plot,
            funnel_plot,
            visitor_plot,
            visitor_locations_plot,
            visitor_locations_debug,
            visitor_locations_top_table,
            overview_table,
            benchmark_table,
            filter_table,
            funnel_table,
            visitor_table,
            visitor_locations_table,
            csv_file,
        ]

        refresh.click(
            fn=query,
            inputs=[start_time, end_time, benchmark, granularity],
            outputs=outputs,
        )

        # On page load: populate the benchmark choices and run an initial query.
        demo.load(fn=load_benchmarks, outputs=benchmark)
        demo.load(
            fn=query,
            inputs=[start_time, end_time, benchmark, granularity],
            outputs=outputs,
        )

    # Make sure the system temp directory exists.
    # NOTE(review): presumably needed by _write_csv_archive — confirm.
    Path(tempfile.gettempdir()).mkdir(parents=True, exist_ok=True)
    return demo
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
tests/test_geoip_database.py
DELETED
|
@@ -1,29 +0,0 @@
|
|
| 1 |
-
import gzip
|
| 2 |
-
|
| 3 |
-
from leaderboard_analytics.geoip_database import ensure_geoip_database
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
def test_ensure_geoip_database_downloads_and_decompresses_gzip(tmp_path) -> None:
    """A gzip archive served from a file URI ends up decompressed at the target.

    The call is expected to return the target path, and the target file must
    contain exactly the bytes that were compressed into the archive.
    """
    payload = b"fake-mmdb-bytes"
    archive_path = tmp_path / "GeoLite2-Country.mmdb.gz"
    database_path = tmp_path / "GeoLite2-Country.mmdb"

    # Write the compressed fixture that stands in for the remote download.
    with gzip.open(archive_path, "wb") as archive:
        archive.write(payload)

    returned_path = ensure_geoip_database(database_path, archive_path.as_uri())

    assert returned_path == database_path
    assert database_path.read_bytes() == payload
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
def test_ensure_geoip_database_keeps_existing_file(tmp_path) -> None:
    """An already-present database file is returned with its contents untouched.

    The source URI points at an archive that was never created, so the call
    can only succeed by leaving the existing target as it is.
    """
    original_contents = b"existing-mmdb-bytes"
    database_path = tmp_path / "GeoLite2-Country.mmdb"
    database_path.write_bytes(original_contents)
    absent_archive = tmp_path / "missing.mmdb.gz"

    returned_path = ensure_geoip_database(database_path, absent_archive.as_uri())

    assert returned_path == database_path
    assert database_path.read_bytes() == original_contents
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
tests/test_repositories.py
DELETED
|
@@ -1,95 +0,0 @@
|
|
| 1 |
-
from datetime import UTC, datetime
|
| 2 |
-
|
| 3 |
-
from leaderboard_analytics.repositories import AnalyticsRepository
|
| 4 |
-
from leaderboard_analytics.schemas import QueryFilters
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
class CapturingCollection:
|
| 8 |
-
def __init__(self, rows: list[dict] | None = None) -> None:
|
| 9 |
-
self.rows = rows or []
|
| 10 |
-
self.pipeline: list[dict] | None = None
|
| 11 |
-
|
| 12 |
-
def aggregate(self, pipeline: list[dict]):
|
| 13 |
-
self.pipeline = pipeline
|
| 14 |
-
return iter(self.rows)
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
def _filters() -> QueryFilters:
    """Return filters covering 2026-01-01 through 2026-01-31 (UTC)."""
    window_start = datetime(2026, 1, 1, tzinfo=UTC)
    window_end = datetime(2026, 1, 31, tzinfo=UTC)
    return QueryFilters(start_time=window_start, end_time=window_end)
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
def test_funnel_pipeline_preserves_ordered_step_logic() -> None:
    """The funnel pipeline sorts events per session and keeps them in order.

    Checks that the captured pipeline sorts by session and timestamp, collects
    events with ``$push`` (never ``$addToSet``), and has a stage that mentions
    both ``table_download_at`` and ``$filter_change_at``.
    """
    fake_collection = CapturingCollection()
    AnalyticsRepository(fake_collection).funnel(_filters())  # type: ignore[arg-type]

    stages = fake_collection.pipeline
    assert stages is not None
    assert {"$sort": {"session_id": 1, "event_ts": 1}} in stages
    assert any("$push" in stage.get("$group", {}).get("events", {}) for stage in stages)

    rendered_stages = [str(stage) for stage in stages]
    # No stage may mention both "$addToSet" and "events".
    assert all(
        "$addToSet" not in text or "events" not in text for text in rendered_stages
    )
    assert any(
        "table_download_at" in text and "$filter_change_at" in text
        for text in rendered_stages
    )
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
def test_new_vs_returning_pipeline_computes_first_seen_before_range_match() -> None:
    """The ``$setWindowFields`` stage comes before the ``event_ts`` range match."""
    fake_collection = CapturingCollection()
    AnalyticsRepository(fake_collection).visitors_new_vs_returning(  # type: ignore[arg-type]
        _filters()
    )

    stages = fake_collection.pipeline
    assert stages is not None

    def first_index(matches) -> int:
        # Position of the first stage satisfying `matches`.
        return next(position for position, stage in enumerate(stages) if matches(stage))

    window_at = first_index(lambda stage: "$setWindowFields" in stage)
    range_match_at = first_index(
        lambda stage: stage.get("$match", {}).get("event_ts") is not None
    )
    assert window_at < range_match_at
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
def test_overview_totals_filters_empty_identifiers() -> None:
    """Totals are returned as given by the collection, and the pipeline text
    references the ``$sessions``/``$visitors`` fields and the ``$$s``/``$$v``
    variables."""
    expected_totals = {"pv": 1, "uv": 1, "sessions": 1, "events": 2}
    # Hand the collection its own copy so the expectation cannot be mutated.
    fake_collection = CapturingCollection([dict(expected_totals)])
    repo = AnalyticsRepository(fake_collection)  # type: ignore[arg-type]

    assert repo.overview_totals(_filters()) == expected_totals

    assert fake_collection.pipeline is not None
    rendered = str(fake_collection.pipeline)
    for field in ("$sessions", "$visitors"):
        # Accept either quoting style in the rendered pipeline.
        assert f'"{field}"' in rendered or f"'{field}'" in rendered
    for variable in ("$$s", "$$v"):
        assert variable in rendered
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
def test_visitor_ip_counts_groups_page_view_ips_with_existing_filters() -> None:
    """Rows pass through unchanged and the pipeline mentions the expected terms:
    the IP property, the ``page_view`` event, the benchmark filter value and a
    ``$nin`` operator."""
    canned_row = {"ip": "8.8.8.8", "pv": 3}
    # Hand the collection its own copy so the expectation cannot be mutated.
    fake_collection = CapturingCollection([dict(canned_row)])
    repo = AnalyticsRepository(fake_collection)  # type: ignore[arg-type]
    benchmark_filters = QueryFilters(
        start_time=datetime(2026, 1, 1, tzinfo=UTC),
        end_time=datetime(2026, 1, 31, tzinfo=UTC),
        benchmark="MTEB",
    )

    assert repo.visitor_ip_counts(benchmark_filters) == [canned_row]

    assert fake_collection.pipeline is not None
    rendered = str(fake_collection.pipeline)
    for fragment in ("properties.ip", "page_view", "MTEB", "$nin", "$properties.ip"):
        assert fragment in rendered
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
tests/test_schemas.py
DELETED
|
@@ -1,16 +0,0 @@
|
|
| 1 |
-
from datetime import UTC, datetime
|
| 2 |
-
|
| 3 |
-
import pytest
|
| 4 |
-
from pydantic import ValidationError
|
| 5 |
-
|
| 6 |
-
from leaderboard_analytics.schemas import QueryFilters
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
def test_query_filters_rejects_invalid_time_range() -> None:
    """Building filters whose start is after the end raises ``ValidationError``."""
    later = datetime(2026, 1, 2, tzinfo=UTC)
    earlier = datetime(2026, 1, 1, tzinfo=UTC)
    expected_message = "start_time must be earlier than or equal to end_time"

    with pytest.raises(ValidationError, match=expected_message):
        QueryFilters(start_time=later, end_time=earlier)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
tests/test_services.py
DELETED
|
@@ -1,110 +0,0 @@
|
|
| 1 |
-
from datetime import UTC, datetime
|
| 2 |
-
from pathlib import Path
|
| 3 |
-
|
| 4 |
-
from leaderboard_analytics.schemas import QueryFilters
|
| 5 |
-
from leaderboard_analytics.services import AnalyticsService
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
class FakeRepository:
    """Repository stub returning fixed overview data regardless of the filters."""

    def overview_timeseries(self, filters: QueryFilters) -> list[dict]:
        """Return two fixed daily rows (2026-01-01 and 2026-01-02)."""
        columns = ("period", "pv", "uv", "session_count", "event_count")
        daily_values = (
            ("2026-01-01", 2, 1, 1, 3),
            ("2026-01-02", 1, 1, 1, 2),
        )
        return [dict(zip(columns, values)) for values in daily_values]

    def overview_totals(self, filters: QueryFilters) -> dict:
        """Return fixed totals for the whole range."""
        return dict(pv=3, uv=1, sessions=1, events=5)
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
class LocationRepository:
    """Repository stub that serves a preset list of per-IP page-view rows."""

    def __init__(self, rows: list[dict]) -> None:
        """Keep ``rows`` to hand back from ``visitor_ip_counts``."""
        self.rows = rows

    def visitor_ip_counts(self, filters: QueryFilters) -> list[dict]:
        """Return the preset rows; ``filters`` is ignored."""
        preset_rows = self.rows
        return preset_rows
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
class FakeGeoIpResolver:
    """Resolver stub backed by an in-memory IP -> (code, name) mapping."""

    def __init__(self, countries: dict[str, tuple[str, str]]) -> None:
        """Keep the lookup table used by ``resolve_country``."""
        self.countries = countries

    def resolve_country(self, ip_address: str) -> tuple[str, str]:
        """Return the mapped country pair; an unmapped IP raises ``KeyError``."""
        lookup = self.countries
        return lookup[ip_address]
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
def test_overview_uses_full_range_distinct_totals() -> None:
    """Overview totals come from the repository's range totals plus two ratios.

    The frame keeps both daily periods, while the totals match the stub's
    ``overview_totals`` values extended with ``events_per_session`` and
    ``sessions_per_visitor``.
    """
    analytics = AnalyticsService(FakeRepository())  # type: ignore[arg-type]
    two_day_window = QueryFilters(
        start_time=datetime(2026, 1, 1, tzinfo=UTC),
        end_time=datetime(2026, 1, 2, tzinfo=UTC),
    )

    overview_frame, overview_totals = analytics.get_overview(two_day_window)

    expected_totals = {
        "pv": 3,
        "uv": 1,
        "sessions": 1,
        "events": 5,
        "events_per_session": 5.0,
        "sessions_per_visitor": 1.0,
    }
    assert list(overview_frame["period"]) == ["2026-01-01", "2026-01-02"]
    assert overview_totals == expected_totals
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
def test_visitor_locations_groups_pv_and_ip_count_by_country() -> None:
    """Per-IP rows are rolled up per country with summed PV and an IP count.

    The two US addresses (3 + 2 PV) are expected as one row listed before the
    single Australian address (4 PV).
    """
    united_states = ("US", "United States")
    australia = ("AU", "Australia")
    ip_rows = [
        {"ip": "8.8.8.8", "pv": 3},
        {"ip": "8.8.4.4", "pv": 2},
        {"ip": "1.1.1.1", "pv": 4},
    ]
    country_by_ip = {
        "8.8.8.8": united_states,
        "8.8.4.4": united_states,
        "1.1.1.1": australia,
    }
    analytics = AnalyticsService(
        LocationRepository(ip_rows),  # type: ignore[arg-type]
        geoip_resolver=FakeGeoIpResolver(country_by_ip),  # type: ignore[arg-type]
    )
    two_day_window = QueryFilters(
        start_time=datetime(2026, 1, 1, tzinfo=UTC),
        end_time=datetime(2026, 1, 2, tzinfo=UTC),
    )

    location_frame = analytics.get_visitor_locations(two_day_window)

    expected_records = [
        {"country_code": "US", "country_name": "United States", "pv": 5, "ip_count": 2},
        {"country_code": "AU", "country_name": "Australia", "pv": 4, "ip_count": 1},
    ]
    assert location_frame.to_dict("records") == expected_records
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
def test_visitor_locations_groups_unresolved_ips_as_unknown() -> None:
    """IPs that cannot be resolved are merged into a single "Unknown" row.

    The service is pointed at ``missing-geolite2-country.mmdb`` (presumably
    absent on disk), and all three addresses (2 + 1 + 3 PV) are expected to be
    reported together as Unknown.
    """
    ip_rows = [
        {"ip": "10.0.0.1", "pv": 2},
        {"ip": "not-an-ip", "pv": 1},
        {"ip": "8.8.8.8", "pv": 3},
    ]
    analytics = AnalyticsService(
        LocationRepository(ip_rows),  # type: ignore[arg-type]
        geoip_database_path=Path("missing-geolite2-country.mmdb"),
    )
    two_day_window = QueryFilters(
        start_time=datetime(2026, 1, 1, tzinfo=UTC),
        end_time=datetime(2026, 1, 2, tzinfo=UTC),
    )

    location_frame = analytics.get_visitor_locations(two_day_window)

    unknown_row = {"country_code": "Unknown", "country_name": "Unknown", "pv": 6, "ip_count": 3}
    assert location_frame.to_dict("records") == [unknown_row]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|