.env.example DELETED
@@ -1,10 +0,0 @@
1
- MONGO_URI=mongodb://localhost:27017
2
- MONGO_DATABASE=event_logger
3
- MONGO_COLLECTION=events
4
- HOST=0.0.0.0
5
- PORT=7860
6
- GRADIO_SHARE=false
7
- GRADIO_SSR_MODE=false
8
- GEOIP_DATABASE_PATH=GeoLite2-Country.mmdb
9
- GEOIP_DATABASE_URL=https://cdn.jsdelivr.net/npm/geolite2-country/GeoLite2-Country.mmdb.gz
10
- GEOIP_AUTO_DOWNLOAD=true
 
 
 
 
 
 
 
 
 
 
 
.gitignore DELETED
@@ -1,216 +0,0 @@
1
- # Byte-compiled / optimized / DLL files
2
- __pycache__/
3
- *.py[codz]
4
- *$py.class
5
-
6
- # C extensions
7
- *.so
8
-
9
- # Distribution / packaging
10
- .Python
11
- build/
12
- develop-eggs/
13
- dist/
14
- downloads/
15
- eggs/
16
- .eggs/
17
- lib/
18
- lib64/
19
- parts/
20
- sdist/
21
- var/
22
- wheels/
23
- share/python-wheels/
24
- *.egg-info/
25
- .installed.cfg
26
- *.egg
27
- MANIFEST
28
-
29
- # PyInstaller
30
- # Usually these files are written by a python script from a template
31
- # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
- *.manifest
33
- *.spec
34
-
35
- # Installer logs
36
- pip-log.txt
37
- pip-delete-this-directory.txt
38
-
39
- # Unit test / coverage reports
40
- htmlcov/
41
- .tox/
42
- .nox/
43
- .coverage
44
- .coverage.*
45
- .cache
46
- nosetests.xml
47
- coverage.xml
48
- *.cover
49
- *.py.cover
50
- .hypothesis/
51
- .pytest_cache/
52
- .pytest_tmp/
53
- cover/
54
-
55
- # Translations
56
- *.mo
57
- *.pot
58
-
59
- # Django stuff:
60
- *.log
61
- local_settings.py
62
- db.sqlite3
63
- db.sqlite3-journal
64
-
65
- # Flask stuff:
66
- instance/
67
- .webassets-cache
68
-
69
- # Scrapy stuff:
70
- .scrapy
71
-
72
- # Sphinx documentation
73
- docs/_build/
74
-
75
- # PyBuilder
76
- .pybuilder/
77
- target/
78
-
79
- # Jupyter Notebook
80
- .ipynb_checkpoints
81
-
82
- # IPython
83
- profile_default/
84
- ipython_config.py
85
-
86
- # pyenv
87
- # For a library or package, you might want to ignore these files since the code is
88
- # intended to run in multiple environments; otherwise, check them in:
89
- .python-version
90
-
91
- # pipenv
92
- # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
93
- # However, in case of collaboration, if having platform-specific dependencies or dependencies
94
- # having no cross-platform support, pipenv may install dependencies that don't work, or not
95
- # install all needed dependencies.
96
- Pipfile.lock
97
-
98
- # UV
99
- # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
100
- # This is especially recommended for binary packages to ensure reproducibility, and is more
101
- # commonly ignored for libraries.
102
- uv.lock
103
-
104
- # poetry
105
- # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
106
- # This is especially recommended for binary packages to ensure reproducibility, and is more
107
- # commonly ignored for libraries.
108
- # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
109
- poetry.lock
110
- poetry.toml
111
-
112
- # pdm
113
- # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
114
- # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
115
- # https://pdm-project.org/en/latest/usage/project/#working-with-version-control
116
- #pdm.lock
117
- #pdm.toml
118
- .pdm-python
119
- .pdm-build/
120
-
121
- # pixi
122
- # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
123
- #pixi.lock
124
- # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
125
- # in the .venv directory. It is recommended not to include this directory in version control.
126
- .pixi
127
-
128
- # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
129
- __pypackages__/
130
-
131
- # Celery stuff
132
- celerybeat-schedule
133
- celerybeat.pid
134
-
135
- # SageMath parsed files
136
- *.sage.py
137
-
138
- # Environments
139
- .env
140
- .envrc
141
- .venv
142
- env/
143
- venv/
144
- ENV/
145
- env.bak/
146
- venv.bak/
147
-
148
- # Local GeoIP databases
149
- *.mmdb
150
- *.mmdb.gz
151
-
152
- # Local analytics exports
153
- visitor_ips*.csv
154
-
155
- # Spyder project settings
156
- .spyderproject
157
- .spyproject
158
-
159
- # Rope project settings
160
- .ropeproject
161
-
162
- # mkdocs documentation
163
- /site
164
-
165
- # mypy
166
- .mypy_cache/
167
- .dmypy.json
168
- dmypy.json
169
-
170
- # Pyre type checker
171
- .pyre/
172
-
173
- # pytype static type analyzer
174
- .pytype/
175
-
176
- # Cython debug symbols
177
- cython_debug/
178
-
179
- # PyCharm
180
- # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
181
- # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
182
- # and can be added to the global gitignore or merged into this file. For a more nuclear
183
- # option (not recommended) you can uncomment the following to ignore the entire idea folder.
184
- .idea/
185
-
186
- # Abstra
187
- # Abstra is an AI-powered process automation framework.
188
- # Ignore directories containing user credentials, local state, and settings.
189
- # Learn more at https://abstra.io/docs
190
- .abstra/
191
-
192
- # Visual Studio Code
193
- # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
194
- # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
195
- # and can be added to the global gitignore or merged into this file. However, if you prefer,
196
- # you could uncomment the following to ignore the entire vscode folder
197
- .vscode/
198
-
199
- # Ruff stuff:
200
- .ruff_cache/
201
-
202
- # PyPI configuration file
203
- .pypirc
204
-
205
- # Cursor
206
- # Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
207
- # exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
208
- # refer to https://docs.cursor.com/context/ignore-files
209
- .cursorignore
210
- .cursorindexingignore
211
- .cursor
212
-
213
- # Marimo
214
- marimo/_static/
215
- marimo/_lsp/
216
- __marimo__/
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
CHANGELOG.md DELETED
@@ -1,20 +0,0 @@
1
- # Changelog
2
-
3
- All notable changes to this project will be documented in this file.
4
-
5
- ## Unreleased
6
-
7
- ### Added
8
-
9
- - Added full-range overview totals so UV and Sessions are distinct counts across the selected range.
10
- - Added ordered funnel logic that counts each step only when it occurs after the previous required step.
11
- - Added benchmark choices, raw data tables, and CSV export support to the dashboard.
12
- - Added query validation, MongoDB ping checks, and dashboard-friendly error messages.
13
- - Added pytest coverage for metric totals, query validation, and MongoDB aggregation pipeline shape.
14
- - Added CI for formatting, linting, and tests.
15
-
16
- ### Changed
17
-
18
- - Updated new vs returning visitor logic to compute first-seen dates from the full available page-view history before applying the selected reporting range.
19
- - Updated MongoDB aggregation pipelines to prefer an indexed `ts` Date field while retaining fallback support for legacy `timestamp` values.
20
- - Documented recommended MongoDB indexes for production deployments.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Dockerfile DELETED
@@ -1,19 +0,0 @@
1
- FROM python:3.12-slim
2
-
3
- ENV PYTHONDONTWRITEBYTECODE=1 \
4
- PYTHONUNBUFFERED=1 \
5
- PIP_NO_CACHE_DIR=1 \
6
- HOST=0.0.0.0 \
7
- PORT=7860 \
8
- GRADIO_SHARE=false
9
-
10
- WORKDIR /app
11
-
12
- # Install project dependencies and package from pyproject.toml
13
- COPY pyproject.toml README.md ./
14
- COPY src ./src
15
- RUN pip install --upgrade pip && pip install .
16
-
17
- EXPOSE 7860
18
-
19
- CMD ["leaderboard-analytics"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
LICENSE DELETED
@@ -1,201 +0,0 @@
1
- Apache License
2
- Version 2.0, January 2004
3
- http://www.apache.org/licenses/
4
-
5
- TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
-
7
- 1. Definitions.
8
-
9
- "License" shall mean the terms and conditions for use, reproduction,
10
- and distribution as defined by Sections 1 through 9 of this document.
11
-
12
- "Licensor" shall mean the copyright owner or entity authorized by
13
- the copyright owner that is granting the License.
14
-
15
- "Legal Entity" shall mean the union of the acting entity and all
16
- other entities that control, are controlled by, or are under common
17
- control with that entity. For the purposes of this definition,
18
- "control" means (i) the power, direct or indirect, to cause the
19
- direction or management of such entity, whether by contract or
20
- otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
- outstanding shares, or (iii) beneficial ownership of such entity.
22
-
23
- "You" (or "Your") shall mean an individual or Legal Entity
24
- exercising permissions granted by this License.
25
-
26
- "Source" form shall mean the preferred form for making modifications,
27
- including but not limited to software source code, documentation
28
- source, and configuration files.
29
-
30
- "Object" form shall mean any form resulting from mechanical
31
- transformation or translation of a Source form, including but
32
- not limited to compiled object code, generated documentation,
33
- and conversions to other media types.
34
-
35
- "Work" shall mean the work of authorship, whether in Source or
36
- Object form, made available under the License, as indicated by a
37
- copyright notice that is included in or attached to the work
38
- (an example is provided in the Appendix below).
39
-
40
- "Derivative Works" shall mean any work, whether in Source or Object
41
- form, that is based on (or derived from) the Work and for which the
42
- editorial revisions, annotations, elaborations, or other modifications
43
- represent, as a whole, an original work of authorship. For the purposes
44
- of this License, Derivative Works shall not include works that remain
45
- separable from, or merely link (or bind by name) to the interfaces of,
46
- the Work and Derivative Works thereof.
47
-
48
- "Contribution" shall mean any work of authorship, including
49
- the original version of the Work and any modifications or additions
50
- to that Work or Derivative Works thereof, that is intentionally
51
- submitted to Licensor for inclusion in the Work by the copyright owner
52
- or by an individual or Legal Entity authorized to submit on behalf of
53
- the copyright owner. For the purposes of this definition, "submitted"
54
- means any form of electronic, verbal, or written communication sent
55
- to the Licensor or its representatives, including but not limited to
56
- communication on electronic mailing lists, source code control systems,
57
- and issue tracking systems that are managed by, or on behalf of, the
58
- Licensor for the purpose of discussing and improving the Work, but
59
- excluding communication that is conspicuously marked or otherwise
60
- designated in writing by the copyright owner as "Not a Contribution."
61
-
62
- "Contributor" shall mean Licensor and any individual or Legal Entity
63
- on behalf of whom a Contribution has been received by Licensor and
64
- subsequently incorporated within the Work.
65
-
66
- 2. Grant of Copyright License. Subject to the terms and conditions of
67
- this License, each Contributor hereby grants to You a perpetual,
68
- worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
- copyright license to reproduce, prepare Derivative Works of,
70
- publicly display, publicly perform, sublicense, and distribute the
71
- Work and such Derivative Works in Source or Object form.
72
-
73
- 3. Grant of Patent License. Subject to the terms and conditions of
74
- this License, each Contributor hereby grants to You a perpetual,
75
- worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
- (except as stated in this section) patent license to make, have made,
77
- use, offer to sell, sell, import, and otherwise transfer the Work,
78
- where such license applies only to those patent claims licensable
79
- by such Contributor that are necessarily infringed by their
80
- Contribution(s) alone or by combination of their Contribution(s)
81
- with the Work to which such Contribution(s) was submitted. If You
82
- institute patent litigation against any entity (including a
83
- cross-claim or counterclaim in a lawsuit) alleging that the Work
84
- or a Contribution incorporated within the Work constitutes direct
85
- or contributory patent infringement, then any patent licenses
86
- granted to You under this License for that Work shall terminate
87
- as of the date such litigation is filed.
88
-
89
- 4. Redistribution. You may reproduce and distribute copies of the
90
- Work or Derivative Works thereof in any medium, with or without
91
- modifications, and in Source or Object form, provided that You
92
- meet the following conditions:
93
-
94
- (a) You must give any other recipients of the Work or
95
- Derivative Works a copy of this License; and
96
-
97
- (b) You must cause any modified files to carry prominent notices
98
- stating that You changed the files; and
99
-
100
- (c) You must retain, in the Source form of any Derivative Works
101
- that You distribute, all copyright, patent, trademark, and
102
- attribution notices from the Source form of the Work,
103
- excluding those notices that do not pertain to any part of
104
- the Derivative Works; and
105
-
106
- (d) If the Work includes a "NOTICE" text file as part of its
107
- distribution, then any Derivative Works that You distribute must
108
- include a readable copy of the attribution notices contained
109
- within such NOTICE file, excluding those notices that do not
110
- pertain to any part of the Derivative Works, in at least one
111
- of the following places: within a NOTICE text file distributed
112
- as part of the Derivative Works; within the Source form or
113
- documentation, if provided along with the Derivative Works; or,
114
- within a display generated by the Derivative Works, if and
115
- wherever such third-party notices normally appear. The contents
116
- of the NOTICE file are for informational purposes only and
117
- do not modify the License. You may add Your own attribution
118
- notices within Derivative Works that You distribute, alongside
119
- or as an addendum to the NOTICE text from the Work, provided
120
- that such additional attribution notices cannot be construed
121
- as modifying the License.
122
-
123
- You may add Your own copyright statement to Your modifications and
124
- may provide additional or different license terms and conditions
125
- for use, reproduction, or distribution of Your modifications, or
126
- for any such Derivative Works as a whole, provided Your use,
127
- reproduction, and distribution of the Work otherwise complies with
128
- the conditions stated in this License.
129
-
130
- 5. Submission of Contributions. Unless You explicitly state otherwise,
131
- any Contribution intentionally submitted for inclusion in the Work
132
- by You to the Licensor shall be under the terms and conditions of
133
- this License, without any additional terms or conditions.
134
- Notwithstanding the above, nothing herein shall supersede or modify
135
- the terms of any separate license agreement you may have executed
136
- with Licensor regarding such Contributions.
137
-
138
- 6. Trademarks. This License does not grant permission to use the trade
139
- names, trademarks, service marks, or product names of the Licensor,
140
- except as required for reasonable and customary use in describing the
141
- origin of the Work and reproducing the content of the NOTICE file.
142
-
143
- 7. Disclaimer of Warranty. Unless required by applicable law or
144
- agreed to in writing, Licensor provides the Work (and each
145
- Contributor provides its Contributions) on an "AS IS" BASIS,
146
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
- implied, including, without limitation, any warranties or conditions
148
- of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
- PARTICULAR PURPOSE. You are solely responsible for determining the
150
- appropriateness of using or redistributing the Work and assume any
151
- risks associated with Your exercise of permissions under this License.
152
-
153
- 8. Limitation of Liability. In no event and under no legal theory,
154
- whether in tort (including negligence), contract, or otherwise,
155
- unless required by applicable law (such as deliberate and grossly
156
- negligent acts) or agreed to in writing, shall any Contributor be
157
- liable to You for damages, including any direct, indirect, special,
158
- incidental, or consequential damages of any character arising as a
159
- result of this License or out of the use or inability to use the
160
- Work (including but not limited to damages for loss of goodwill,
161
- work stoppage, computer failure or malfunction, or any and all
162
- other commercial damages or losses), even if such Contributor
163
- has been advised of the possibility of such damages.
164
-
165
- 9. Accepting Warranty or Additional Liability. While redistributing
166
- the Work or Derivative Works thereof, You may choose to offer,
167
- and charge a fee for, acceptance of support, warranty, indemnity,
168
- or other liability obligations and/or rights consistent with this
169
- License. However, in accepting such obligations, You may act only
170
- on Your own behalf and on Your sole responsibility, not on behalf
171
- of any other Contributor, and only if You agree to indemnify,
172
- defend, and hold each Contributor harmless for any liability
173
- incurred by, or claims asserted against, such Contributor by reason
174
- of your accepting any such warranty or additional liability.
175
-
176
- END OF TERMS AND CONDITIONS
177
-
178
- APPENDIX: How to apply the Apache License to your work.
179
-
180
- To apply the Apache License to your work, attach the following
181
- boilerplate notice, with the fields enclosed by brackets "[]"
182
- replaced with your own identifying information. (Don't include
183
- the brackets!) The text should be enclosed in the appropriate
184
- comment syntax for the file format. We also recommend that a
185
- file or class name and description of purpose be included on the
186
- same "printed page" as the copyright notice for easier
187
- identification within third-party archives.
188
-
189
- Copyright [yyyy] [name of copyright owner]
190
-
191
- Licensed under the Apache License, Version 2.0 (the "License");
192
- you may not use this file except in compliance with the License.
193
- You may obtain a copy of the License at
194
-
195
- http://www.apache.org/licenses/LICENSE-2.0
196
-
197
- Unless required by applicable law or agreed to in writing, software
198
- distributed under the License is distributed on an "AS IS" BASIS,
199
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
- See the License for the specific language governing permissions and
201
- limitations under the License.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
README.md CHANGED
@@ -1,262 +1,12 @@
1
  ---
2
- title: leaderboard-analytics-service
3
- emoji: 📊
4
- colorFrom: blue
5
  colorTo: green
6
- sdk: gradio
7
- sdk_version: "6.0.0"
8
- python_version: "3.11"
9
- app_file: app.py
10
  pinned: false
 
 
11
  ---
12
 
13
- # Leaderboard Analytics Metrics Spec
14
-
15
- This project analyzes user behavior on the MTEB leaderboard page from event logs in MongoDB.
16
-
17
- The primary purpose of this document is to define **what is measured**, **where each metric comes from**, and **how each metric is calculated**.
18
-
19
- ---
20
-
21
- ## Data Contract
22
-
23
- All analytics are based on the `events` collection and the following stable fields:
24
-
25
- - Core dimensions: `event_name`, `timestamp`, `session_id`
26
- - Preferred event time: `ts` as a MongoDB Date
27
- - Behavior context: `benchmark`, `filters`
28
- - Visitor identity (approximate): `properties.visitor_id`
29
- - Visitor IP for country analysis: `properties.ip`
30
- - Change context: `properties.old_value`, `properties.new_value`, `properties.filter_name`
31
-
32
- Important event names:
33
-
34
- - `page_view`
35
- - `benchmark_change`
36
- - `filter_change_*` (dynamic names, such as `filter_change_task_type`)
37
- - `table_download` (may currently be missing in some deployments)
38
-
39
- ---
40
-
41
- ## Metrics Dictionary
42
-
43
- ### 1) PV (Page Views)
44
-
45
- - **Definition**: Number of page view events.
46
- - **Source fields**: `event_name`
47
- - **Calculation**:
48
- - Filter events where `event_name == "page_view"`
49
- - PV = count of matched events
50
-
51
- ### 2) Sessions
52
-
53
- - **Definition**: Number of unique interaction sessions.
54
- - **Source fields**: `session_id`
55
- - **Calculation**:
56
- - Sessions = count of distinct non-empty `session_id` values in the selected time range
57
-
58
- ### 3) UV (Unique Visitors, Approximate)
59
-
60
- - **Definition**: Number of unique visitors identified by hashed fingerprint.
61
- - **Source fields**: `properties.visitor_id`
62
- - **Calculation**:
63
- - Remove null/empty `properties.visitor_id`
64
- - UV = count of distinct `properties.visitor_id` values in the selected time range
65
-
66
- ### 4) Sessions Per Visitor
67
-
68
- - **Definition**: Average number of sessions per visitor.
69
- - **Source fields**: derived from Sessions and UV
70
- - **Calculation**:
71
- - Sessions Per Visitor = `Sessions / UV`
72
- - If UV is 0, result is 0
73
-
74
- ### 5) Session Depth (Events Per Session)
75
-
76
- - **Definition**: Average interaction intensity per session.
77
- - **Source fields**: all events, `session_id`
78
- - **Calculation**:
79
- - Total Events = count of all events in range
80
- - Session Depth = `Total Events / Sessions`
81
- - If Sessions is 0, result is 0
82
-
83
- ---
84
-
85
- ## Behavior Metrics
86
-
87
- ### 6) Benchmark Popularity
88
-
89
- - **Definition**: Frequency of selected benchmarks.
90
- - **Source fields**: `event_name`, `properties.new_value`
91
- - **Calculation**:
92
- - Filter `event_name == "benchmark_change"`
93
- - Group by `properties.new_value`
94
- - Popularity = event count per benchmark value
95
-
96
- ### 7) Filter Usage Distribution
97
-
98
- - **Definition**: Usage volume by filter event type.
99
- - **Source fields**: `event_name`
100
- - **Calculation**:
101
- - Filter `event_name` matching regex `^filter_change_`
102
- - Group by `event_name`
103
- - Distribution = count per filter event
104
-
105
- ### 8) Filter Session Coverage
106
-
107
- - **Definition**: Number of sessions that used each filter type.
108
- - **Source fields**: `event_name`, `session_id`
109
- - **Calculation**:
110
- - For each `filter_change_*` event type:
111
- - collect distinct non-empty `session_id`
112
- - coverage = distinct session count
113
-
114
- ---
115
-
116
- ## Funnel Metrics
117
-
118
- Recommended session-level funnel:
119
-
120
- 1. `page_view`
121
- 2. `benchmark_change`
122
- 3. `filter_change_*`
123
- 4. `table_download`
124
-
125
- ### 9) Step Session Count
126
-
127
- - **Definition**: Number of sessions that reached each ordered funnel step.
128
- - **Source fields**: `session_id`, `event_name`, `ts` or `timestamp`
129
- - **Calculation**:
130
- - Group events by `session_id`
131
- - Sort events by event time
132
- - Count each cumulative step only when it occurs after the previous required step
133
-
134
- ### 10) Step Conversion Rate
135
-
136
- - **Definition**: Conversion from funnel step 1 (`page_view`) to each step.
137
- - **Source fields**: derived from Step Session Count
138
- - **Calculation**:
139
- - Conversion Rate(step N) = `StepN Sessions / Step1 Sessions * 100%`
140
- - If Step1 Sessions is 0, result is 0%
141
-
142
- ---
143
-
144
- ## Visitor Segmentation Metrics
145
-
146
- ### 11) New Visitors
147
-
148
- - **Definition**: Visitors whose first observed visit date falls within the current period.
149
- - **Source fields**: `event_name`, `ts` or `timestamp`, `properties.visitor_id`
150
- - **Calculation**:
151
- - Use `page_view` events only
152
- - For each `visitor_id`, find earliest timestamp (`first_seen`) from the full available dataset
153
- - If event date equals `first_seen` date, classify as `new`
154
- - Count distinct `visitor_id` by period
155
-
156
- ### 12) Returning Visitors
157
-
158
- - **Definition**: Visitors seen after their first observed date.
159
- - **Source fields**: same as New Visitors
160
- - **Calculation**:
161
- - Use same first-seen logic
162
- - If event date is later than first-seen date, classify as `returning`
163
- - Count distinct `visitor_id` by period
164
-
165
- ### 13) Visitor Locations by Country
166
-
167
- - **Definition**: Page view volume by visitor IP country/region.
168
- - **Source fields**: `event_name`, `properties.ip`
169
- - **Calculation**:
170
- - Filter `event_name == "page_view"`
171
- - Remove null/empty `properties.ip`
172
- - Group page views by IP in MongoDB
173
- - Resolve each IP to a country using the local MaxMind GeoLite2 Country database
174
- - Group by `country_code` and `country_name`
175
- - Map color = page view count (`pv`)
176
- - Private, invalid, unresolved, or unconfigured IPs are grouped as `Unknown`
177
-
178
- ---
179
-
180
- ## Time Aggregation Rules
181
-
182
- All trend metrics support these granularities:
183
-
184
- - `day` -> `%Y-%m-%d`
185
- - `week` -> `%G-W%V` (ISO week)
186
- - `month` -> `%Y-%m`
187
-
188
- Time filtering rules:
189
-
190
- - Prefer the indexed MongoDB Date field `ts`
191
- - Fall back to converting legacy `timestamp` values when `ts` is not present
192
- - Keep records where `start_time <= event time <= end_time`
193
-
194
- Optional benchmark filtering:
195
-
196
- - If benchmark filter is provided, add `benchmark == <value>` to match conditions
197
-
198
- ---
199
-
200
- ## Data Quality Notes
201
-
202
- 1. `visitor_id` is an approximate identifier, not a strict user identity.
203
- 2. For `filter_change_*` events, `properties.new_value` may not always represent the actual final filter value; prefer the `filters` snapshot for behavioral context.
204
- 3. If `table_download` is not instrumented, funnel step 4 will under-report by design.
205
- 4. Total UV and Sessions are distinct counts across the full selected time range. They are not calculated by summing per-period trend values.
206
- 5. Funnel steps are ordered by event time. A session only reaches a later step when that step happens after the previous required step.
207
-
208
- ---
209
-
210
- ## MongoDB Performance Notes
211
-
212
- For production deployments, store event time as a MongoDB Date field named `ts`. Keeping only string timestamps forces aggregation pipelines to convert time values at query time and can reduce index usage.
213
-
214
- Recommended indexes:
215
-
216
- ```javascript
217
- db.events.createIndex({ ts: 1 })
218
- db.events.createIndex({ ts: 1, benchmark: 1 })
219
- db.events.createIndex({ event_name: 1, ts: 1 })
220
- db.events.createIndex({ session_id: 1, ts: 1 })
221
- db.events.createIndex({ "properties.visitor_id": 1, ts: 1 })
222
- db.events.createIndex({ event_name: 1, ts: 1, "properties.ip": 1 })
223
- ```
224
-
225
- Legacy events with only `timestamp` remain supported, but backfilling `ts` is recommended before running this dashboard against large collections.
226
-
227
- ---
228
-
229
- ## Minimal Runtime Notes
230
-
231
- Only required runtime inputs:
232
-
233
- - MongoDB connection URI (`MONGO_URI`)
234
- - Mongo database/collection names (defaults supported)
235
-
236
- Optional visitor location input:
237
-
238
- - `GEOIP_DATABASE_PATH`: path to a local MaxMind `GeoLite2-Country.mmdb` file
239
- - `GEOIP_DATABASE_URL`: URL for a gzipped GeoLite2 Country MMDB download
240
- - `GEOIP_AUTO_DOWNLOAD`: whether to download and decompress the MMDB when missing
241
-
242
- The dashboard does not call an external IP lookup API for visitor lookups. By default,
243
- startup downloads `https://cdn.jsdelivr.net/npm/geolite2-country/GeoLite2-Country.mmdb.gz`
244
- when `GEOIP_DATABASE_PATH` is missing, decompresses it, and uses the resulting MMDB file
245
- locally. Set `GEOIP_AUTO_DOWNLOAD=false` if the runtime cannot access the network or if
246
- you prefer to mount the MMDB yourself. If the database is unavailable, visitor location
247
- rows are grouped as `Unknown`.
248
-
249
- Local commands:
250
-
251
- ```bash
252
- uv sync
253
- uv run leaderboard-analytics
254
- ```
255
-
256
- Run quality checks:
257
-
258
- ```bash
259
- uv run ruff format --check .
260
- uv run ruff check .
261
- uv run pytest
262
- ```
 
1
  ---
2
+ title: Leaderboard Analytics Service
3
+ emoji: 🏃
4
+ colorFrom: green
5
  colorTo: green
6
+ sdk: docker
 
 
 
7
  pinned: false
8
+ license: apache-2.0
9
+ short_description: A backend analytics service for the MTEB Leaderboard
10
  ---
11
 
12
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
app.py DELETED
@@ -1,15 +0,0 @@
1
- import sys
2
- from pathlib import Path
3
-
4
- # Ensure src-layout package is importable in Hugging Face Spaces runtime.
5
- ROOT_DIR = Path(__file__).resolve().parent
6
- SRC_DIR = ROOT_DIR / "src"
7
- if str(SRC_DIR) not in sys.path:
8
- sys.path.insert(0, str(SRC_DIR))
9
-
10
- from leaderboard_analytics.main import create_demo, launch_demo # noqa: E402
11
-
12
- demo = create_demo()
13
-
14
- if __name__ == "__main__":
15
- launch_demo(demo)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
pyproject.toml DELETED
@@ -1,46 +0,0 @@
1
- [project]
2
- name = "leaderboard-analytics-service"
3
- version = "0.1.0"
4
- description = "Analytics dashboard for MTEB leaderboard event logs"
5
- readme = "README.md"
6
- requires-python = ">=3.11"
7
- dependencies = [
8
- "gradio>=6.0.0",
9
- "pymongo>=4.10.0",
10
- "pydantic>=2.9.0",
11
- "pydantic-settings>=2.6.0",
12
- "python-dotenv>=1.0.1",
13
- "pandas>=2.2.3",
14
- "plotly>=5.24.1",
15
- "geoip2>=4.8.0",
16
- ]
17
-
18
- [project.optional-dependencies]
19
- dev = [
20
- "pytest>=8.3.0",
21
- "ruff>=0.8.0",
22
- ]
23
-
24
- [tool.ruff]
25
- line-length = 100
26
- target-version = "py311"
27
-
28
- [tool.ruff.lint]
29
- select = ["E", "F", "I", "B", "UP", "C4"]
30
-
31
- [tool.ruff.format]
32
- quote-style = "double"
33
- indent-style = "space"
34
-
35
- [project.scripts]
36
- leaderboard-analytics = "leaderboard_analytics.main:run"
37
-
38
- [build-system]
39
- requires = ["hatchling"]
40
- build-backend = "hatchling.build"
41
-
42
- [tool.hatch.build.targets.wheel]
43
- packages = ["src/leaderboard_analytics"]
44
-
45
- [tool.pytest.ini_options]
46
- pythonpath = ["src"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
requirements.txt DELETED
@@ -1,8 +0,0 @@
1
- gradio>=6.0.0
2
- pymongo>=4.10.0
3
- pydantic>=2.9.0
4
- pydantic-settings>=2.6.0
5
- python-dotenv>=1.0.1
6
- pandas>=2.2.3
7
- plotly>=5.24.1
8
- geoip2>=4.8.0
 
 
 
 
 
 
 
 
 
src/leaderboard_analytics/__init__.py DELETED
@@ -1 +0,0 @@
1
- """Leaderboard analytics package."""
 
 
src/leaderboard_analytics/config.py DELETED
@@ -1,25 +0,0 @@
1
- from functools import lru_cache
2
-
3
- from pydantic_settings import BaseSettings, SettingsConfigDict
4
-
5
-
6
- class Settings(BaseSettings):
7
- model_config = SettingsConfigDict(env_file=".env", env_file_encoding="utf-8", extra="ignore")
8
-
9
- mongo_uri: str = ""
10
- mongo_database: str = "event_logger"
11
- mongo_collection: str = "events"
12
- host: str = "0.0.0.0"
13
- port: int = 7860
14
- gradio_share: bool = False
15
- gradio_ssr_mode: bool = False
16
- geoip_database_path: str = "GeoLite2-Country.mmdb"
17
- geoip_database_url: str = (
18
- "https://cdn.jsdelivr.net/npm/geolite2-country/GeoLite2-Country.mmdb.gz"
19
- )
20
- geoip_auto_download: bool = True
21
-
22
-
23
- @lru_cache(maxsize=1)
24
- def get_settings() -> Settings:
25
- return Settings()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/leaderboard_analytics/db.py DELETED
@@ -1,24 +0,0 @@
1
- from pymongo import MongoClient
2
- from pymongo.collection import Collection
3
- from pymongo.database import Database
4
-
5
- from leaderboard_analytics.config import get_settings
6
-
7
-
8
- def get_mongo_client() -> MongoClient:
9
- settings = get_settings()
10
- if not settings.mongo_uri:
11
- raise ValueError("MONGO_URI is not configured. Please set MONGO_URI in .env file.")
12
- client = MongoClient(settings.mongo_uri, serverSelectionTimeoutMS=5000)
13
- client.admin.command("ping")
14
- return client
15
-
16
-
17
- def get_database(client: MongoClient) -> Database:
18
- settings = get_settings()
19
- return client[settings.mongo_database]
20
-
21
-
22
- def get_events_collection(db: Database) -> Collection:
23
- settings = get_settings()
24
- return db[settings.mongo_collection]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/leaderboard_analytics/geoip_database.py DELETED
@@ -1,36 +0,0 @@
1
- import gzip
2
- import shutil
3
- import tempfile
4
- from pathlib import Path
5
- from urllib.request import urlopen
6
-
7
- DEFAULT_GEOIP_DATABASE_URL = (
8
- "https://cdn.jsdelivr.net/npm/geolite2-country/GeoLite2-Country.mmdb.gz"
9
- )
10
-
11
-
12
- def ensure_geoip_database(
13
- database_path: str | Path,
14
- source_url: str = DEFAULT_GEOIP_DATABASE_URL,
15
- *,
16
- auto_download: bool = True,
17
- timeout: float = 30.0,
18
- ) -> Path:
19
- target_path = Path(database_path)
20
- if target_path.exists() or not auto_download:
21
- return target_path
22
-
23
- target_path.parent.mkdir(parents=True, exist_ok=True)
24
- with tempfile.NamedTemporaryFile(
25
- prefix=f"{target_path.name}.",
26
- suffix=".tmp",
27
- dir=target_path.parent,
28
- delete=False,
29
- ) as temp_file:
30
- temp_path = Path(temp_file.name)
31
- with urlopen(source_url, timeout=timeout) as response:
32
- with gzip.GzipFile(fileobj=response) as gzip_file:
33
- shutil.copyfileobj(gzip_file, temp_file)
34
-
35
- temp_path.replace(target_path)
36
- return target_path
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/leaderboard_analytics/main.py DELETED
@@ -1,49 +0,0 @@
1
- from leaderboard_analytics.config import get_settings
2
- from leaderboard_analytics.db import get_database, get_events_collection, get_mongo_client
3
- from leaderboard_analytics.geoip_database import ensure_geoip_database
4
- from leaderboard_analytics.repositories import AnalyticsRepository
5
- from leaderboard_analytics.services import AnalyticsService
6
- from leaderboard_analytics.ui import build_dashboard
7
-
8
-
9
- def create_demo():
10
- settings = get_settings()
11
- client = get_mongo_client()
12
- db = get_database(client)
13
- events_collection = get_events_collection(db)
14
- geoip_database_path = settings.geoip_database_path
15
- try:
16
- geoip_database_path = str(
17
- ensure_geoip_database(
18
- settings.geoip_database_path,
19
- settings.geoip_database_url,
20
- auto_download=settings.geoip_auto_download,
21
- )
22
- )
23
- except Exception as exc:
24
- print(f"GeoIP database download failed: {exc}")
25
-
26
- repository = AnalyticsRepository(events_collection=events_collection)
27
- service = AnalyticsService(
28
- repository=repository,
29
- geoip_database_path=geoip_database_path,
30
- )
31
- return build_dashboard(service=service)
32
-
33
-
34
- def launch_demo(demo) -> None:
35
- settings = get_settings()
36
- demo.launch(
37
- server_name=settings.host,
38
- server_port=settings.port,
39
- share=settings.gradio_share,
40
- ssr_mode=settings.gradio_ssr_mode,
41
- )
42
-
43
-
44
- def run() -> None:
45
- launch_demo(create_demo())
46
-
47
-
48
- if __name__ == "__main__":
49
- run()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/leaderboard_analytics/repositories.py DELETED
@@ -1,463 +0,0 @@
1
- from collections.abc import Iterable
2
-
3
- from pymongo.collection import Collection
4
-
5
- from leaderboard_analytics.schemas import Granularity, QueryFilters
6
-
7
-
8
- def _period_expression(granularity: Granularity) -> dict:
9
- format_map = {
10
- Granularity.DAY: "%Y-%m-%d",
11
- Granularity.WEEK: "%G-W%V",
12
- Granularity.MONTH: "%Y-%m",
13
- }
14
- return {"$dateToString": {"format": format_map[granularity], "date": "$event_ts"}}
15
-
16
-
17
- def _with_normalized_time() -> dict:
18
- return {
19
- "$addFields": {
20
- "event_ts": {"$ifNull": ["$ts", {"$toDate": "$timestamp"}]},
21
- "visitor_id": "$properties.visitor_id",
22
- }
23
- }
24
-
25
-
26
- def _indexed_time_prefilter(filters: QueryFilters) -> dict:
27
- matcher: dict = {
28
- "$or": [
29
- {"ts": {"$gte": filters.start_time, "$lte": filters.end_time}},
30
- {"ts": None},
31
- {"ts": {"$exists": False}},
32
- ]
33
- }
34
- if filters.benchmark:
35
- matcher["benchmark"] = filters.benchmark
36
- return matcher
37
-
38
-
39
- def _with_time_and_optional_benchmark(filters: QueryFilters) -> dict:
40
- matcher: dict = {
41
- "event_ts": {
42
- "$gte": filters.start_time,
43
- "$lte": filters.end_time,
44
- }
45
- }
46
- if filters.benchmark:
47
- matcher["benchmark"] = filters.benchmark
48
- return matcher
49
-
50
-
51
- def _non_empty_set_size(field_name: str, variable_name: str) -> dict:
52
- return {
53
- "$size": {
54
- "$filter": {
55
- "input": f"${field_name}",
56
- "as": variable_name,
57
- "cond": {
58
- "$and": [
59
- {"$ne": [f"$${variable_name}", None]},
60
- {"$ne": [f"$${variable_name}", ""]},
61
- ]
62
- },
63
- }
64
- }
65
- }
66
-
67
-
68
- class AnalyticsRepository:
69
- def __init__(self, events_collection: Collection) -> None:
70
- self.events_collection = events_collection
71
-
72
- def overview_timeseries(self, filters: QueryFilters) -> list[dict]:
73
- period_expr = _period_expression(filters.granularity)
74
- pipeline: list[dict] = [
75
- {"$match": _indexed_time_prefilter(filters)},
76
- _with_normalized_time(),
77
- {"$match": _with_time_and_optional_benchmark(filters)},
78
- {
79
- "$group": {
80
- "_id": {"period": period_expr},
81
- "pv": {"$sum": {"$cond": [{"$eq": ["$event_name", "page_view"]}, 1, 0]}},
82
- "event_count": {"$sum": 1},
83
- "sessions": {"$addToSet": "$session_id"},
84
- "visitors": {"$addToSet": "$visitor_id"},
85
- }
86
- },
87
- {
88
- "$project": {
89
- "_id": 0,
90
- "period": "$_id.period",
91
- "pv": 1,
92
- "event_count": 1,
93
- "session_count": _non_empty_set_size("sessions", "s"),
94
- "uv": _non_empty_set_size("visitors", "v"),
95
- }
96
- },
97
- {"$sort": {"period": 1}},
98
- ]
99
- return list(self.events_collection.aggregate(pipeline))
100
-
101
- def overview_totals(self, filters: QueryFilters) -> dict:
102
- pipeline: list[dict] = [
103
- {"$match": _indexed_time_prefilter(filters)},
104
- _with_normalized_time(),
105
- {"$match": _with_time_and_optional_benchmark(filters)},
106
- {
107
- "$group": {
108
- "_id": None,
109
- "pv": {"$sum": {"$cond": [{"$eq": ["$event_name", "page_view"]}, 1, 0]}},
110
- "events": {"$sum": 1},
111
- "sessions": {"$addToSet": "$session_id"},
112
- "visitors": {"$addToSet": "$visitor_id"},
113
- }
114
- },
115
- {
116
- "$project": {
117
- "_id": 0,
118
- "pv": 1,
119
- "events": 1,
120
- "sessions": _non_empty_set_size("sessions", "s"),
121
- "uv": _non_empty_set_size("visitors", "v"),
122
- }
123
- },
124
- ]
125
- return self.safe_first(self.events_collection.aggregate(pipeline))
126
-
127
- def benchmark_top(self, filters: QueryFilters, limit: int = 20) -> list[dict]:
128
- pipeline: list[dict] = [
129
- {"$match": _indexed_time_prefilter(filters)},
130
- _with_normalized_time(),
131
- {
132
- "$match": {
133
- **_with_time_and_optional_benchmark(filters),
134
- "event_name": "benchmark_change",
135
- }
136
- },
137
- {"$group": {"_id": "$properties.new_value", "count": {"$sum": 1}}},
138
- {"$match": {"_id": {"$nin": [None, ""]}}},
139
- {"$project": {"_id": 0, "benchmark": "$_id", "count": 1}},
140
- {"$sort": {"count": -1}},
141
- {"$limit": limit},
142
- ]
143
- return list(self.events_collection.aggregate(pipeline))
144
-
145
- def filter_distribution(self, filters: QueryFilters) -> list[dict]:
146
- pipeline: list[dict] = [
147
- {"$match": _indexed_time_prefilter(filters)},
148
- _with_normalized_time(),
149
- {
150
- "$match": {
151
- **_with_time_and_optional_benchmark(filters),
152
- "event_name": {"$regex": "^filter_change_"},
153
- }
154
- },
155
- {
156
- "$group": {
157
- "_id": "$event_name",
158
- "count": {"$sum": 1},
159
- "sessions": {"$addToSet": "$session_id"},
160
- }
161
- },
162
- {
163
- "$project": {
164
- "_id": 0,
165
- "event_name": "$_id",
166
- "count": 1,
167
- "session_coverage": _non_empty_set_size("sessions", "s"),
168
- }
169
- },
170
- {"$sort": {"count": -1}},
171
- ]
172
- return list(self.events_collection.aggregate(pipeline))
173
-
174
- def funnel(self, filters: QueryFilters) -> list[dict]:
175
- pipeline: list[dict] = [
176
- {"$match": _indexed_time_prefilter(filters)},
177
- _with_normalized_time(),
178
- {"$match": _with_time_and_optional_benchmark(filters)},
179
- {"$sort": {"session_id": 1, "event_ts": 1}},
180
- {
181
- "$group": {
182
- "_id": "$session_id",
183
- "events": {"$push": {"name": "$event_name", "ts": "$event_ts"}},
184
- }
185
- },
186
- {"$match": {"_id": {"$nin": [None, ""]}}},
187
- {
188
- "$project": {
189
- "events": 1,
190
- "page_view_at": {
191
- "$arrayElemAt": [
192
- {
193
- "$map": {
194
- "input": {
195
- "$filter": {
196
- "input": "$events",
197
- "as": "event",
198
- "cond": {"$eq": ["$$event.name", "page_view"]},
199
- }
200
- },
201
- "as": "event",
202
- "in": "$$event.ts",
203
- }
204
- },
205
- 0,
206
- ]
207
- },
208
- }
209
- },
210
- {
211
- "$project": {
212
- "events": 1,
213
- "page_view_at": 1,
214
- "benchmark_change_at": {
215
- "$arrayElemAt": [
216
- {
217
- "$map": {
218
- "input": {
219
- "$filter": {
220
- "input": "$events",
221
- "as": "event",
222
- "cond": {
223
- "$and": [
224
- {"$eq": ["$$event.name", "benchmark_change"]},
225
- {"$gte": ["$$event.ts", "$page_view_at"]},
226
- ]
227
- },
228
- }
229
- },
230
- "as": "event",
231
- "in": "$$event.ts",
232
- }
233
- },
234
- 0,
235
- ]
236
- },
237
- }
238
- },
239
- {
240
- "$project": {
241
- "events": 1,
242
- "page_view_at": 1,
243
- "benchmark_change_at": 1,
244
- "filter_change_at": {
245
- "$arrayElemAt": [
246
- {
247
- "$map": {
248
- "input": {
249
- "$filter": {
250
- "input": "$events",
251
- "as": "event",
252
- "cond": {
253
- "$and": [
254
- {
255
- "$regexMatch": {
256
- "input": "$$event.name",
257
- "regex": "^filter_change_",
258
- }
259
- },
260
- {
261
- "$gte": [
262
- "$$event.ts",
263
- "$benchmark_change_at",
264
- ]
265
- },
266
- ]
267
- },
268
- }
269
- },
270
- "as": "event",
271
- "in": "$$event.ts",
272
- }
273
- },
274
- 0,
275
- ]
276
- },
277
- }
278
- },
279
- {
280
- "$project": {
281
- "page_view_at": 1,
282
- "benchmark_change_at": 1,
283
- "filter_change_at": 1,
284
- "table_download_at": {
285
- "$arrayElemAt": [
286
- {
287
- "$map": {
288
- "input": {
289
- "$filter": {
290
- "input": "$events",
291
- "as": "event",
292
- "cond": {
293
- "$and": [
294
- {"$eq": ["$$event.name", "table_download"]},
295
- {"$gte": ["$$event.ts", "$filter_change_at"]},
296
- ]
297
- },
298
- }
299
- },
300
- "as": "event",
301
- "in": "$$event.ts",
302
- }
303
- },
304
- 0,
305
- ]
306
- },
307
- }
308
- },
309
- {
310
- "$group": {
311
- "_id": None,
312
- "step1_page_view": {
313
- "$sum": {"$cond": [{"$ne": ["$page_view_at", None]}, 1, 0]}
314
- },
315
- "step2_benchmark_change": {
316
- "$sum": {
317
- "$cond": [
318
- {
319
- "$and": [
320
- {"$ne": ["$page_view_at", None]},
321
- {"$gte": ["$benchmark_change_at", "$page_view_at"]},
322
- ]
323
- },
324
- 1,
325
- 0,
326
- ]
327
- }
328
- },
329
- "step3_filter_change": {
330
- "$sum": {
331
- "$cond": [
332
- {
333
- "$and": [
334
- {"$ne": ["$page_view_at", None]},
335
- {"$gte": ["$benchmark_change_at", "$page_view_at"]},
336
- {"$gte": ["$filter_change_at", "$benchmark_change_at"]},
337
- ]
338
- },
339
- 1,
340
- 0,
341
- ]
342
- }
343
- },
344
- "step4_table_download": {
345
- "$sum": {
346
- "$cond": [
347
- {
348
- "$and": [
349
- {"$ne": ["$page_view_at", None]},
350
- {"$gte": ["$benchmark_change_at", "$page_view_at"]},
351
- {"$gte": ["$filter_change_at", "$benchmark_change_at"]},
352
- {"$gte": ["$table_download_at", "$filter_change_at"]},
353
- ]
354
- },
355
- 1,
356
- 0,
357
- ]
358
- }
359
- },
360
- }
361
- },
362
- {
363
- "$project": {
364
- "_id": 0,
365
- "step1_page_view": 1,
366
- "step2_benchmark_change": 1,
367
- "step3_filter_change": 1,
368
- "step4_table_download": 1,
369
- }
370
- },
371
- ]
372
- return list(self.events_collection.aggregate(pipeline))
373
-
374
- def visitors_new_vs_returning(self, filters: QueryFilters) -> list[dict]:
375
- period_expr = _period_expression(filters.granularity)
376
- pipeline: list[dict] = [
377
- _with_normalized_time(),
378
- {
379
- "$match": {
380
- "event_name": "page_view",
381
- "visitor_id": {"$nin": [None, ""]},
382
- }
383
- },
384
- {
385
- "$setWindowFields": {
386
- "partitionBy": "$visitor_id",
387
- "sortBy": {"event_ts": 1},
388
- "output": {"first_seen": {"$first": "$event_ts"}},
389
- }
390
- },
391
- {"$match": _with_time_and_optional_benchmark(filters)},
392
- {
393
- "$project": {
394
- "period": period_expr,
395
- "is_new": {
396
- "$eq": [
397
- {"$dateToString": {"format": "%Y-%m-%d", "date": "$event_ts"}},
398
- {"$dateToString": {"format": "%Y-%m-%d", "date": "$first_seen"}},
399
- ]
400
- },
401
- "visitor_id": 1,
402
- }
403
- },
404
- {
405
- "$group": {
406
- "_id": {"period": "$period", "is_new": "$is_new"},
407
- "visitors": {"$addToSet": "$visitor_id"},
408
- }
409
- },
410
- {
411
- "$project": {
412
- "_id": 0,
413
- "period": "$_id.period",
414
- "is_new": "$_id.is_new",
415
- "visitor_count": _non_empty_set_size("visitors", "v"),
416
- }
417
- },
418
- {"$sort": {"period": 1, "is_new": -1}},
419
- ]
420
- return list(self.events_collection.aggregate(pipeline))
421
-
422
- def visitor_ip_counts(self, filters: QueryFilters) -> list[dict]:
423
- pipeline: list[dict] = [
424
- {"$match": _indexed_time_prefilter(filters)},
425
- _with_normalized_time(),
426
- {
427
- "$match": {
428
- **_with_time_and_optional_benchmark(filters),
429
- "event_name": "page_view",
430
- "properties.ip": {"$nin": [None, ""]},
431
- }
432
- },
433
- {"$group": {"_id": "$properties.ip", "pv": {"$sum": 1}}},
434
- {"$project": {"_id": 0, "ip": "$_id", "pv": 1}},
435
- {"$sort": {"pv": -1}},
436
- ]
437
- return list(self.events_collection.aggregate(pipeline))
438
-
439
- def available_benchmarks(
440
- self, filters: QueryFilters | None = None, limit: int = 100
441
- ) -> list[str]:
442
- pipeline: list[dict] = []
443
- if filters is not None:
444
- pipeline.extend(
445
- [
446
- {"$match": _indexed_time_prefilter(filters)},
447
- _with_normalized_time(),
448
- {"$match": _with_time_and_optional_benchmark(filters)},
449
- ]
450
- )
451
- pipeline.extend(
452
- [
453
- {"$match": {"benchmark": {"$nin": [None, ""]}}},
454
- {"$group": {"_id": "$benchmark"}},
455
- {"$sort": {"_id": 1}},
456
- {"$limit": limit},
457
- ]
458
- )
459
- return [row["_id"] for row in self.events_collection.aggregate(pipeline)]
460
-
461
- @staticmethod
462
- def safe_first(items: Iterable[dict]) -> dict:
463
- return next(iter(items), {})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/leaderboard_analytics/schemas.py DELETED
@@ -1,27 +0,0 @@
1
- from datetime import UTC, datetime
2
- from enum import StrEnum
3
-
4
- from pydantic import BaseModel, Field, model_validator
5
-
6
-
7
- class Granularity(StrEnum):
8
- DAY = "day"
9
- WEEK = "week"
10
- MONTH = "month"
11
-
12
-
13
- class QueryFilters(BaseModel):
14
- start_time: datetime = Field(
15
- default_factory=lambda: datetime.now(tz=UTC).replace(
16
- hour=0, minute=0, second=0, microsecond=0
17
- )
18
- )
19
- end_time: datetime = Field(default_factory=lambda: datetime.now(tz=UTC))
20
- benchmark: str | None = None
21
- granularity: Granularity = Granularity.DAY
22
-
23
- @model_validator(mode="after")
24
- def validate_time_range(self) -> "QueryFilters":
25
- if self.start_time > self.end_time:
26
- raise ValueError("start_time must be earlier than or equal to end_time")
27
- return self
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/leaderboard_analytics/services.py DELETED
@@ -1,264 +0,0 @@
1
- import ipaddress
2
- from pathlib import Path
3
- from typing import Any, Protocol
4
-
5
- import pandas as pd
6
-
7
- from leaderboard_analytics.repositories import AnalyticsRepository
8
- from leaderboard_analytics.schemas import QueryFilters
9
-
10
- UNKNOWN_COUNTRY_CODE = "Unknown"
11
- UNKNOWN_COUNTRY_NAME = "Unknown"
12
-
13
-
14
- def _empty_ip_debug() -> dict[str, object]:
15
- return {
16
- "total_unique_ips": 0,
17
- "total_ip_pv": 0,
18
- "global_ips": 0,
19
- "global_ip_pv": 0,
20
- "private_ips": 0,
21
- "private_ip_pv": 0,
22
- "loopback_ips": 0,
23
- "loopback_ip_pv": 0,
24
- "reserved_ips": 0,
25
- "reserved_ip_pv": 0,
26
- "link_local_ips": 0,
27
- "link_local_ip_pv": 0,
28
- "multicast_ips": 0,
29
- "multicast_ip_pv": 0,
30
- "unspecified_ips": 0,
31
- "unspecified_ip_pv": 0,
32
- "invalid_ips": 0,
33
- "invalid_ip_pv": 0,
34
- "top_ip_pv_buckets": {
35
- "1": 0,
36
- "2-10": 0,
37
- "11-100": 0,
38
- "101-1000": 0,
39
- ">1000": 0,
40
- },
41
- }
42
-
43
-
44
- def _ip_debug_category(ip_address: str) -> str:
45
- try:
46
- parsed_ip = ipaddress.ip_address(ip_address.strip())
47
- except ValueError:
48
- return "invalid"
49
-
50
- if parsed_ip.is_global:
51
- return "global"
52
- if parsed_ip.is_loopback:
53
- return "loopback"
54
- if parsed_ip.is_private:
55
- return "private"
56
- if parsed_ip.is_reserved:
57
- return "reserved"
58
- if parsed_ip.is_link_local:
59
- return "link_local"
60
- if parsed_ip.is_multicast:
61
- return "multicast"
62
- if parsed_ip.is_unspecified:
63
- return "unspecified"
64
- return "reserved"
65
-
66
-
67
- def _ip_pv_bucket(pv: int) -> str:
68
- if pv <= 1:
69
- return "1"
70
- if pv <= 10:
71
- return "2-10"
72
- if pv <= 100:
73
- return "11-100"
74
- if pv <= 1000:
75
- return "101-1000"
76
- return ">1000"
77
-
78
-
79
- class GeoIpCountryReader(Protocol):
80
- def country(self, ip_address: str) -> Any: ...
81
-
82
-
83
- class GeoIpResolver:
84
- def __init__(
85
- self,
86
- database_path: str | Path | None = None,
87
- reader: GeoIpCountryReader | None = None,
88
- ) -> None:
89
- self.database_path = Path(database_path) if database_path else None
90
- self._reader = reader
91
- self._load_attempted = reader is not None
92
-
93
- def resolve_country(self, ip_address: str) -> tuple[str, str]:
94
- try:
95
- parsed_ip = ipaddress.ip_address(ip_address.strip())
96
- except ValueError:
97
- return UNKNOWN_COUNTRY_CODE, UNKNOWN_COUNTRY_NAME
98
-
99
- if not parsed_ip.is_global:
100
- return UNKNOWN_COUNTRY_CODE, UNKNOWN_COUNTRY_NAME
101
-
102
- reader = self._get_reader()
103
- if reader is None:
104
- return UNKNOWN_COUNTRY_CODE, UNKNOWN_COUNTRY_NAME
105
-
106
- try:
107
- response = reader.country(str(parsed_ip))
108
- except Exception:
109
- return UNKNOWN_COUNTRY_CODE, UNKNOWN_COUNTRY_NAME
110
-
111
- country = response.country
112
- if not getattr(country, "iso_code", None):
113
- country = response.registered_country
114
-
115
- code = getattr(country, "iso_code", None)
116
- if not code:
117
- return UNKNOWN_COUNTRY_CODE, UNKNOWN_COUNTRY_NAME
118
-
119
- return code, getattr(country, "name", None) or code
120
-
121
- def debug_status(self) -> dict[str, object]:
122
- return {
123
- "database_path": str(self.database_path) if self.database_path else "",
124
- "database_configured": self.database_path is not None,
125
- "database_exists": self.database_path.exists() if self.database_path else False,
126
- "load_attempted": self._load_attempted,
127
- "reader_loaded": self._reader is not None,
128
- }
129
-
130
- def _get_reader(self) -> GeoIpCountryReader | None:
131
- if self._reader is not None:
132
- return self._reader
133
-
134
- if self._load_attempted:
135
- return None
136
-
137
- self._load_attempted = True
138
- if self.database_path is None or not self.database_path.exists():
139
- return None
140
-
141
- try:
142
- import geoip2.database
143
-
144
- self._reader = geoip2.database.Reader(str(self.database_path))
145
- except Exception:
146
- return None
147
-
148
- return self._reader
149
-
150
-
151
- class AnalyticsService:
152
- def __init__(
153
- self,
154
- repository: AnalyticsRepository,
155
- geoip_database_path: str | Path | None = None,
156
- geoip_resolver: GeoIpResolver | None = None,
157
- ) -> None:
158
- self.repository = repository
159
- self.geoip_resolver = geoip_resolver or GeoIpResolver(geoip_database_path)
160
-
161
- def get_overview(self, filters: QueryFilters) -> tuple[pd.DataFrame, dict]:
162
- rows = self.repository.overview_timeseries(filters)
163
- frame = pd.DataFrame(rows)
164
- raw_totals = self.repository.overview_totals(filters)
165
- totals = {
166
- "pv": int(raw_totals.get("pv", 0)),
167
- "uv": int(raw_totals.get("uv", 0)),
168
- "sessions": int(raw_totals.get("sessions", 0)),
169
- "events": int(raw_totals.get("events", 0)),
170
- }
171
- totals["events_per_session"] = (
172
- round(totals["events"] / totals["sessions"], 2) if totals["sessions"] else 0.0
173
- )
174
- totals["sessions_per_visitor"] = (
175
- round(totals["sessions"] / totals["uv"], 2) if totals["uv"] else 0.0
176
- )
177
- return frame, totals
178
-
179
- def get_benchmark_top(self, filters: QueryFilters) -> pd.DataFrame:
180
- return pd.DataFrame(self.repository.benchmark_top(filters))
181
-
182
- def get_filter_distribution(self, filters: QueryFilters) -> pd.DataFrame:
183
- return pd.DataFrame(self.repository.filter_distribution(filters))
184
-
185
- def get_funnel(self, filters: QueryFilters) -> pd.DataFrame:
186
- raw = self.repository.safe_first(self.repository.funnel(filters))
187
- rows = [
188
- {"step": "page_view", "sessions": raw.get("step1_page_view", 0)},
189
- {"step": "benchmark_change", "sessions": raw.get("step2_benchmark_change", 0)},
190
- {"step": "filter_change_*", "sessions": raw.get("step3_filter_change", 0)},
191
- {"step": "table_download", "sessions": raw.get("step4_table_download", 0)},
192
- ]
193
- frame = pd.DataFrame(rows)
194
- step1 = int(frame.iloc[0]["sessions"]) if not frame.empty else 0
195
- frame["conversion_rate"] = frame["sessions"].apply(
196
- lambda x: round((x / step1) * 100, 2) if step1 else 0.0
197
- )
198
- return frame
199
-
200
- def get_new_vs_returning(self, filters: QueryFilters) -> pd.DataFrame:
201
- frame = pd.DataFrame(self.repository.visitors_new_vs_returning(filters))
202
- if frame.empty:
203
- return frame
204
- frame["visitor_type"] = frame["is_new"].map({True: "new", False: "returning"})
205
- return frame
206
-
207
- def get_visitor_locations(self, filters: QueryFilters) -> pd.DataFrame:
208
- frame, _debug = self.get_visitor_location_details(filters)
209
- return frame
210
-
211
- def get_visitor_location_details(self, filters: QueryFilters) -> tuple[pd.DataFrame, dict]:
212
- locations: dict[tuple[str, str], dict[str, int | str]] = {}
213
- ip_debug = _empty_ip_debug()
214
- for row in self.repository.visitor_ip_counts(filters):
215
- ip = str(row.get("ip", "")).strip()
216
- if not ip:
217
- continue
218
-
219
- pv = int(row.get("pv", 0))
220
- category = _ip_debug_category(ip)
221
- ip_debug["total_unique_ips"] = int(ip_debug["total_unique_ips"]) + 1
222
- ip_debug["total_ip_pv"] = int(ip_debug["total_ip_pv"]) + pv
223
- ip_debug[f"{category}_ips"] = int(ip_debug[f"{category}_ips"]) + 1
224
- ip_debug[f"{category}_ip_pv"] = int(ip_debug[f"{category}_ip_pv"]) + pv
225
- ip_debug["top_ip_pv_buckets"][_ip_pv_bucket(pv)] += 1 # type: ignore[index]
226
-
227
- code, name = self.geoip_resolver.resolve_country(ip)
228
- key = (code, name)
229
- if key not in locations:
230
- locations[key] = {
231
- "country_code": code,
232
- "country_name": name,
233
- "pv": 0,
234
- "ip_count": 0,
235
- }
236
-
237
- locations[key]["pv"] = int(locations[key]["pv"]) + pv
238
- locations[key]["ip_count"] = int(locations[key]["ip_count"]) + 1
239
-
240
- frame = pd.DataFrame(
241
- locations.values(),
242
- columns=["country_code", "country_name", "pv", "ip_count"],
243
- )
244
- if frame.empty:
245
- return frame, ip_debug
246
- frame = frame.sort_values(["pv", "ip_count"], ascending=[False, False]).reset_index(
247
- drop=True
248
- )
249
- return frame, ip_debug
250
-
251
- def get_geoip_debug_info(self) -> dict[str, object]:
252
- debug_status = getattr(self.geoip_resolver, "debug_status", None)
253
- if debug_status is None:
254
- return {
255
- "database_path": "",
256
- "database_configured": False,
257
- "database_exists": False,
258
- "load_attempted": False,
259
- "reader_loaded": False,
260
- }
261
- return debug_status()
262
-
263
- def get_available_benchmarks(self, filters: QueryFilters | None = None) -> list[str]:
264
- return self.repository.available_benchmarks(filters)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/leaderboard_analytics/ui.py DELETED
@@ -1,481 +0,0 @@
1
- import math
2
- import tempfile
3
- import zipfile
4
- from datetime import UTC, datetime, timedelta
5
- from pathlib import Path
6
- from typing import Any
7
-
8
- import gradio as gr
9
- import pandas as pd
10
- import plotly.express as px
11
- import plotly.graph_objects as go
12
-
13
- from leaderboard_analytics.schemas import Granularity, QueryFilters
14
- from leaderboard_analytics.services import AnalyticsService
15
-
16
-
17
- def _to_utc_datetime(value: Any, fallback: datetime) -> datetime:
18
- if value is None or value == "":
19
- return fallback
20
-
21
- if isinstance(value, datetime):
22
- dt = value
23
- elif isinstance(value, (int, float)):
24
- if isinstance(value, float) and math.isnan(value):
25
- return fallback
26
- # Gradio DateTime may return Unix timestamps as numbers.
27
- dt = datetime.fromtimestamp(value, tz=UTC)
28
- elif isinstance(value, str):
29
- dt = datetime.fromisoformat(value)
30
- else:
31
- raise ValueError(f"Unsupported datetime value type: {type(value)!r}")
32
-
33
- # Gradio DateTime may return naive datetime values in local time.
34
- if dt.tzinfo is None:
35
- dt = dt.replace(tzinfo=UTC)
36
- return dt.astimezone(UTC)
37
-
38
-
39
- def _empty_plot(title: str):
40
- return px.line(title=title)
41
-
42
-
43
- def _empty_map(title: str):
44
- figure = go.Figure()
45
- _style_visitor_location_map(figure, title)
46
- return figure
47
-
48
-
49
- def _query_range_text(filters: QueryFilters) -> str:
50
- return f"{filters.start_time.isoformat()} to {filters.end_time.isoformat()}"
51
-
52
-
53
- def _write_csv_archive(tables: dict[str, pd.DataFrame]) -> str | None:
54
- if all(table.empty for table in tables.values()):
55
- return None
56
-
57
- archive = tempfile.NamedTemporaryFile(
58
- prefix="leaderboard-analytics-", suffix=".zip", delete=False
59
- )
60
- archive.close()
61
- with zipfile.ZipFile(archive.name, "w", compression=zipfile.ZIP_DEFLATED) as zip_file:
62
- for name, table in tables.items():
63
- zip_file.writestr(f"{name}.csv", table.to_csv(index=False))
64
- return archive.name
65
-
66
-
67
- def _visitor_location_top_table(visitor_locations: pd.DataFrame) -> pd.DataFrame:
68
- if visitor_locations.empty:
69
- return pd.DataFrame(columns=["Region", "Users"])
70
-
71
- return (
72
- visitor_locations.sort_values(["ip_count", "pv"], ascending=[False, False])
73
- .head(10)
74
- .rename(columns={"country_name": "Region", "ip_count": "Users"})[["Region", "Users"]]
75
- .reset_index(drop=True)
76
- )
77
-
78
-
79
- def _visitor_location_debug_text(
80
- visitor_locations: pd.DataFrame,
81
- geoip_debug: dict[str, object],
82
- ip_debug: dict[str, object] | None = None,
83
- ) -> str:
84
- if visitor_locations.empty:
85
- total_pv = 0
86
- total_users = 0
87
- mapped_regions = 0
88
- unknown_pv = 0
89
- unknown_users = 0
90
- else:
91
- unknown_rows = visitor_locations[visitor_locations["country_code"] == "Unknown"]
92
- mapped_rows = visitor_locations[visitor_locations["country_code"] != "Unknown"]
93
- total_pv = int(visitor_locations["pv"].sum())
94
- total_users = int(visitor_locations["ip_count"].sum())
95
- mapped_regions = len(mapped_rows)
96
- unknown_pv = int(unknown_rows["pv"].sum()) if not unknown_rows.empty else 0
97
- unknown_users = int(unknown_rows["ip_count"].sum()) if not unknown_rows.empty else 0
98
-
99
- configured = "yes" if geoip_debug.get("database_configured") else "no"
100
- exists = "yes" if geoip_debug.get("database_exists") else "no"
101
- loaded = "yes" if geoip_debug.get("reader_loaded") else "no"
102
- attempted = "yes" if geoip_debug.get("load_attempted") else "no"
103
- path = geoip_debug.get("database_path") or "(not configured)"
104
- ip_debug = ip_debug or {}
105
- global_ips = int(ip_debug.get("global_ips", 0))
106
- global_pv = int(ip_debug.get("global_ip_pv", 0))
107
- private_ips = int(ip_debug.get("private_ips", 0))
108
- private_pv = int(ip_debug.get("private_ip_pv", 0))
109
- loopback_ips = int(ip_debug.get("loopback_ips", 0))
110
- loopback_pv = int(ip_debug.get("loopback_ip_pv", 0))
111
- invalid_ips = int(ip_debug.get("invalid_ips", 0))
112
- invalid_pv = int(ip_debug.get("invalid_ip_pv", 0))
113
- buckets = ip_debug.get("top_ip_pv_buckets", {})
114
-
115
- return (
116
- f"GeoIP DB: configured={configured}, exists={exists}, loaded={loaded}, "
117
- f"load_attempted={attempted} \n"
118
- f"GeoIP path: `{path}` \n"
119
- f"Total location PV: {total_pv} | Users/IPs: {total_users} | "
120
- f"Mapped regions: {mapped_regions} \n"
121
- f"Unknown PV: {unknown_pv} | Unknown users/IPs: {unknown_users} \n"
122
- f"Public IPs: {global_ips} ({global_pv} PV) | Private IPs: {private_ips} "
123
- f"({private_pv} PV) \n"
124
- f"Loopback IPs: {loopback_ips} ({loopback_pv} PV) | Invalid IPs: {invalid_ips} "
125
- f"({invalid_pv} PV) \n"
126
- f"PV/IP buckets: {buckets}"
127
- )
128
-
129
-
130
- def _style_visitor_location_map(figure: go.Figure, title: str) -> None:
131
- figure.update_geos(
132
- projection_type="mercator",
133
- showframe=False,
134
- showcoastlines=True,
135
- coastlinecolor="#cfd6df",
136
- coastlinewidth=0.6,
137
- showcountries=True,
138
- countrycolor="#cfd6df",
139
- countrywidth=0.7,
140
- showland=True,
141
- landcolor="#eef2f7",
142
- showocean=True,
143
- oceancolor="#f8fafc",
144
- showlakes=True,
145
- lakecolor="#f8fafc",
146
- bgcolor="#ffffff",
147
- lataxis_range=[-55, 75],
148
- lonaxis_range=[-180, 180],
149
- )
150
- figure.update_layout(
151
- title={"text": title, "x": 0.02, "xanchor": "left"},
152
- height=560,
153
- paper_bgcolor="#ffffff",
154
- plot_bgcolor="#ffffff",
155
- font={"color": "#1f2937"},
156
- margin={"l": 0, "r": 0, "t": 52, "b": 0},
157
- showlegend=False,
158
- hoverlabel={
159
- "bgcolor": "#ffffff",
160
- "bordercolor": "#3b82f6",
161
- "font_color": "#111827",
162
- },
163
- )
164
-
165
-
166
- def _visitor_location_map(visitor_locations: pd.DataFrame, range_text: str) -> go.Figure:
167
- map_df = (
168
- visitor_locations[visitor_locations["country_code"] != "Unknown"].copy()
169
- if not visitor_locations.empty
170
- else visitor_locations.copy()
171
- )
172
- if map_df.empty:
173
- return _empty_map(f"Visitor locations by country (no mapped data for {range_text})")
174
-
175
- max_pv = max(int(map_df["pv"].max()), 1)
176
- size_ref = 2.0 * max_pv / (52**2)
177
- figure = go.Figure(
178
- go.Scattergeo(
179
- locationmode="country names",
180
- locations=map_df["country_name"],
181
- mode="markers",
182
- text=map_df["country_name"],
183
- customdata=map_df[["country_code", "pv", "ip_count"]],
184
- hovertemplate=(
185
- "<b>%{text}</b><br>"
186
- "Country code: %{customdata[0]}<br>"
187
- "PV: %{customdata[1]:,}<br>"
188
- "Users/IPs: %{customdata[2]:,}<extra></extra>"
189
- ),
190
- marker={
191
- "size": map_df["pv"],
192
- "sizemode": "area",
193
- "sizeref": size_ref,
194
- "sizemin": 8,
195
- "color": "rgba(59, 130, 246, 0.55)",
196
- "line": {"color": "rgba(37, 99, 235, 0.92)", "width": 1.2},
197
- },
198
- )
199
- )
200
- _style_visitor_location_map(figure, "Visitor locations by country")
201
- figure.add_annotation(
202
- x=0.02,
203
- y=0.08,
204
- xref="paper",
205
- yref="paper",
206
- text=(
207
- f"Mapped regions: {len(map_df)}<br>"
208
- f"Mapped PV: {int(map_df['pv'].sum()):,}<br>"
209
- f"Users/IPs: {int(map_df['ip_count'].sum()):,}"
210
- ),
211
- showarrow=False,
212
- align="left",
213
- bgcolor="rgba(255, 255, 255, 0.88)",
214
- bordercolor="rgba(148, 163, 184, 0.55)",
215
- borderwidth=1,
216
- font={"color": "#1f2937", "size": 12},
217
- )
218
- return figure
219
-
220
-
221
- def build_dashboard(service: AnalyticsService) -> gr.Blocks:
222
- default_end = datetime.now(tz=UTC)
223
- default_start = (default_end - timedelta(days=7)).replace(microsecond=0)
224
-
225
- def load_benchmarks() -> object:
226
- try:
227
- benchmarks = service.get_available_benchmarks()
228
- except Exception:
229
- benchmarks = []
230
- return gr.update(choices=[""] + benchmarks, value="")
231
-
232
- def query(
233
- start_time: datetime | str | None,
234
- end_time: datetime | str | None,
235
- benchmark: str,
236
- granularity: str,
237
- ) -> tuple[
238
- object,
239
- object,
240
- object,
241
- object,
242
- object,
243
- object,
244
- object,
245
- object,
246
- object,
247
- object,
248
- object,
249
- object,
250
- object,
251
- object,
252
- object,
253
- object,
254
- ]:
255
- try:
256
- filters = QueryFilters(
257
- start_time=_to_utc_datetime(start_time, default_start),
258
- end_time=_to_utc_datetime(end_time, default_end),
259
- benchmark=benchmark or None,
260
- granularity=Granularity(granularity),
261
- )
262
- overview_df, totals = service.get_overview(filters)
263
- benchmark_df = service.get_benchmark_top(filters)
264
- filter_df = service.get_filter_distribution(filters)
265
- funnel_df = service.get_funnel(filters)
266
- visitors_df = service.get_new_vs_returning(filters)
267
- visitor_locations_df, ip_debug = service.get_visitor_location_details(filters)
268
- visitor_locations_top_df = _visitor_location_top_table(visitor_locations_df)
269
- visitor_locations_debug = _visitor_location_debug_text(
270
- visitor_locations_df,
271
- service.get_geoip_debug_info(),
272
- ip_debug,
273
- )
274
-
275
- range_text = _query_range_text(filters)
276
- if (
277
- overview_df.empty
278
- and benchmark_df.empty
279
- and filter_df.empty
280
- and visitors_df.empty
281
- and visitor_locations_df.empty
282
- ):
283
- metrics = f"No data for {range_text}."
284
- else:
285
- metrics = (
286
- f"Range: {range_text} \n"
287
- f"PV: {totals['pv']} | UV: {totals['uv']} | Sessions: {totals['sessions']} | "
288
- f"Events/Session: {totals['events_per_session']} | "
289
- f"Sessions/Visitor: {totals['sessions_per_visitor']}"
290
- )
291
-
292
- overview_plot = (
293
- px.line(
294
- overview_df,
295
- x="period",
296
- y=["pv", "uv", "session_count"],
297
- title="Traffic overview",
298
- )
299
- if not overview_df.empty
300
- else _empty_plot(f"Traffic overview (no data for {range_text})")
301
- )
302
- benchmark_plot = (
303
- px.bar(benchmark_df, x="benchmark", y="count", title="Benchmark Top")
304
- if not benchmark_df.empty
305
- else px.bar(title=f"Benchmark Top (no data for {range_text})")
306
- )
307
- filter_plot = (
308
- px.bar(filter_df, x="event_name", y="count", title="Filter usage")
309
- if not filter_df.empty
310
- else px.bar(title=f"Filter usage (no data for {range_text})")
311
- )
312
- funnel_plot = px.funnel(funnel_df, x="sessions", y="step", title="Session funnel")
313
- visitor_plot = (
314
- px.bar(
315
- visitors_df,
316
- x="period",
317
- y="visitor_count",
318
- color="visitor_type",
319
- barmode="group",
320
- title="New vs returning visitors",
321
- )
322
- if not visitors_df.empty
323
- else px.bar(title=f"New vs returning visitors (no data for {range_text})")
324
- )
325
- visitor_locations_plot = _visitor_location_map(visitor_locations_df, range_text)
326
- csv_archive = _write_csv_archive(
327
- {
328
- "overview": overview_df,
329
- "benchmarks": benchmark_df,
330
- "filters": filter_df,
331
- "funnel": funnel_df,
332
- "visitors": visitors_df,
333
- "visitor_locations": visitor_locations_df,
334
- }
335
- )
336
-
337
- return (
338
- metrics,
339
- overview_plot,
340
- benchmark_plot,
341
- filter_plot,
342
- funnel_plot,
343
- visitor_plot,
344
- visitor_locations_plot,
345
- visitor_locations_debug,
346
- visitor_locations_top_df,
347
- overview_df,
348
- benchmark_df,
349
- filter_df,
350
- funnel_df,
351
- visitors_df,
352
- visitor_locations_df,
353
- csv_archive,
354
- )
355
- except Exception as exc:
356
- message = f"Query failed: {exc}"
357
- empty = pd.DataFrame()
358
- empty_top = pd.DataFrame(columns=["Region", "Users"])
359
- return (
360
- message,
361
- _empty_plot(message),
362
- px.bar(title=message),
363
- px.bar(title=message),
364
- px.funnel(
365
- pd.DataFrame({"step": [], "sessions": []}),
366
- x="sessions",
367
- y="step",
368
- title=message,
369
- ),
370
- px.bar(title=message),
371
- _empty_map(message),
372
- message,
373
- empty_top,
374
- empty,
375
- empty,
376
- empty,
377
- empty,
378
- empty,
379
- empty,
380
- None,
381
- )
382
-
383
- with gr.Blocks() as demo:
384
- gr.Markdown("# Leaderboard Analytics Dashboard")
385
- gr.Markdown(
386
- "Analyze MTEB leaderboard behavior from MongoDB event logs. "
387
- "All metrics follow event-log-spec definitions."
388
- )
389
-
390
- with gr.Row():
391
- start_time = gr.DateTime(
392
- label="Start time",
393
- value=default_start,
394
- timezone="UTC",
395
- )
396
- end_time = gr.DateTime(
397
- label="End time",
398
- value=default_end,
399
- timezone="UTC",
400
- )
401
- benchmark = gr.Dropdown(
402
- label="Benchmark",
403
- choices=[""],
404
- value="",
405
- allow_custom_value=True,
406
- )
407
- granularity = gr.Dropdown(
408
- label="Granularity",
409
- choices=[Granularity.DAY.value, Granularity.WEEK.value, Granularity.MONTH.value],
410
- value=Granularity.DAY.value,
411
- )
412
- refresh = gr.Button("Refresh", variant="primary")
413
-
414
- metrics_text = gr.Markdown(
415
- "PV: 0 | UV: 0 | Sessions: 0 | Events/Session: 0 | Sessions/Visitor: 0"
416
- )
417
-
418
- with gr.Row():
419
- overview_plot = gr.Plot(label="Traffic Overview")
420
- benchmark_plot = gr.Plot(label="Benchmark Analysis")
421
- with gr.Row():
422
- filter_plot = gr.Plot(label="Filter Behavior")
423
- funnel_plot = gr.Plot(label="Funnel")
424
- visitor_plot = gr.Plot(label="Visitor Segmentation")
425
- with gr.Row():
426
- with gr.Column(scale=2):
427
- visitor_locations_plot = gr.Plot(label="Visitor Locations")
428
- with gr.Column(scale=1):
429
- visitor_locations_debug = gr.Markdown(
430
- "GeoIP DB: not checked \n"
431
- "Total location PV: 0 | Users/IPs: 0 | Mapped regions: 0"
432
- )
433
- visitor_locations_top_table = gr.DataFrame(
434
- label="Top 10 Regions",
435
- interactive=False,
436
- wrap=True,
437
- )
438
-
439
- with gr.Accordion("Raw data", open=False):
440
- csv_file = gr.File(label="CSV export")
441
- overview_table = gr.DataFrame(label="Traffic Overview")
442
- benchmark_table = gr.DataFrame(label="Benchmark Analysis")
443
- filter_table = gr.DataFrame(label="Filter Behavior")
444
- funnel_table = gr.DataFrame(label="Funnel")
445
- visitor_table = gr.DataFrame(label="Visitor Segmentation")
446
- visitor_locations_table = gr.DataFrame(label="Visitor Locations")
447
-
448
- outputs = [
449
- metrics_text,
450
- overview_plot,
451
- benchmark_plot,
452
- filter_plot,
453
- funnel_plot,
454
- visitor_plot,
455
- visitor_locations_plot,
456
- visitor_locations_debug,
457
- visitor_locations_top_table,
458
- overview_table,
459
- benchmark_table,
460
- filter_table,
461
- funnel_table,
462
- visitor_table,
463
- visitor_locations_table,
464
- csv_file,
465
- ]
466
-
467
- refresh.click(
468
- fn=query,
469
- inputs=[start_time, end_time, benchmark, granularity],
470
- outputs=outputs,
471
- )
472
-
473
- demo.load(fn=load_benchmarks, outputs=benchmark)
474
- demo.load(
475
- fn=query,
476
- inputs=[start_time, end_time, benchmark, granularity],
477
- outputs=outputs,
478
- )
479
-
480
- Path(tempfile.gettempdir()).mkdir(parents=True, exist_ok=True)
481
- return demo
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
tests/test_geoip_database.py DELETED
@@ -1,29 +0,0 @@
1
- import gzip
2
-
3
- from leaderboard_analytics.geoip_database import ensure_geoip_database
4
-
5
-
6
- def test_ensure_geoip_database_downloads_and_decompresses_gzip(tmp_path) -> None:
7
- source = tmp_path / "GeoLite2-Country.mmdb.gz"
8
- target = tmp_path / "GeoLite2-Country.mmdb"
9
- expected_bytes = b"fake-mmdb-bytes"
10
-
11
- with gzip.open(source, "wb") as gzip_file:
12
- gzip_file.write(expected_bytes)
13
-
14
- result = ensure_geoip_database(target, source.as_uri())
15
-
16
- assert result == target
17
- assert target.read_bytes() == expected_bytes
18
-
19
-
20
- def test_ensure_geoip_database_keeps_existing_file(tmp_path) -> None:
21
- source = tmp_path / "missing.mmdb.gz"
22
- target = tmp_path / "GeoLite2-Country.mmdb"
23
- expected_bytes = b"existing-mmdb-bytes"
24
- target.write_bytes(expected_bytes)
25
-
26
- result = ensure_geoip_database(target, source.as_uri())
27
-
28
- assert result == target
29
- assert target.read_bytes() == expected_bytes
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
tests/test_repositories.py DELETED
@@ -1,95 +0,0 @@
1
- from datetime import UTC, datetime
2
-
3
- from leaderboard_analytics.repositories import AnalyticsRepository
4
- from leaderboard_analytics.schemas import QueryFilters
5
-
6
-
7
- class CapturingCollection:
8
- def __init__(self, rows: list[dict] | None = None) -> None:
9
- self.rows = rows or []
10
- self.pipeline: list[dict] | None = None
11
-
12
- def aggregate(self, pipeline: list[dict]):
13
- self.pipeline = pipeline
14
- return iter(self.rows)
15
-
16
-
17
- def _filters() -> QueryFilters:
18
- return QueryFilters(
19
- start_time=datetime(2026, 1, 1, tzinfo=UTC),
20
- end_time=datetime(2026, 1, 31, tzinfo=UTC),
21
- )
22
-
23
-
24
- def test_funnel_pipeline_preserves_ordered_step_logic() -> None:
25
- collection = CapturingCollection()
26
- repository = AnalyticsRepository(collection) # type: ignore[arg-type]
27
-
28
- repository.funnel(_filters())
29
-
30
- assert collection.pipeline is not None
31
- assert {"$sort": {"session_id": 1, "event_ts": 1}} in collection.pipeline
32
- assert any(
33
- "$push" in stage.get("$group", {}).get("events", {}) for stage in collection.pipeline
34
- )
35
- assert not any(
36
- "$addToSet" in str(stage) and "events" in str(stage) for stage in collection.pipeline
37
- )
38
- assert any(
39
- "table_download_at" in str(stage) and "$filter_change_at" in str(stage)
40
- for stage in collection.pipeline
41
- )
42
-
43
-
44
- def test_new_vs_returning_pipeline_computes_first_seen_before_range_match() -> None:
45
- collection = CapturingCollection()
46
- repository = AnalyticsRepository(collection) # type: ignore[arg-type]
47
-
48
- repository.visitors_new_vs_returning(_filters())
49
-
50
- assert collection.pipeline is not None
51
- window_index = next(
52
- i for i, stage in enumerate(collection.pipeline) if "$setWindowFields" in stage
53
- )
54
- range_match_index = next(
55
- i
56
- for i, stage in enumerate(collection.pipeline)
57
- if stage.get("$match", {}).get("event_ts") is not None
58
- )
59
- assert window_index < range_match_index
60
-
61
-
62
- def test_overview_totals_filters_empty_identifiers() -> None:
63
- collection = CapturingCollection([{"pv": 1, "uv": 1, "sessions": 1, "events": 2}])
64
- repository = AnalyticsRepository(collection) # type: ignore[arg-type]
65
-
66
- totals = repository.overview_totals(_filters())
67
-
68
- assert totals == {"pv": 1, "uv": 1, "sessions": 1, "events": 2}
69
- assert collection.pipeline is not None
70
- pipeline_text = str(collection.pipeline)
71
- assert '"$sessions"' in pipeline_text or "'$sessions'" in pipeline_text
72
- assert '"$visitors"' in pipeline_text or "'$visitors'" in pipeline_text
73
- assert "$$s" in pipeline_text
74
- assert "$$v" in pipeline_text
75
-
76
-
77
- def test_visitor_ip_counts_groups_page_view_ips_with_existing_filters() -> None:
78
- collection = CapturingCollection([{"ip": "8.8.8.8", "pv": 3}])
79
- repository = AnalyticsRepository(collection) # type: ignore[arg-type]
80
- filters = QueryFilters(
81
- start_time=datetime(2026, 1, 1, tzinfo=UTC),
82
- end_time=datetime(2026, 1, 31, tzinfo=UTC),
83
- benchmark="MTEB",
84
- )
85
-
86
- rows = repository.visitor_ip_counts(filters)
87
-
88
- assert rows == [{"ip": "8.8.8.8", "pv": 3}]
89
- assert collection.pipeline is not None
90
- pipeline_text = str(collection.pipeline)
91
- assert "properties.ip" in pipeline_text
92
- assert "page_view" in pipeline_text
93
- assert "MTEB" in pipeline_text
94
- assert "$nin" in pipeline_text
95
- assert "$properties.ip" in pipeline_text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
tests/test_schemas.py DELETED
@@ -1,16 +0,0 @@
1
- from datetime import UTC, datetime
2
-
3
- import pytest
4
- from pydantic import ValidationError
5
-
6
- from leaderboard_analytics.schemas import QueryFilters
7
-
8
-
9
- def test_query_filters_rejects_invalid_time_range() -> None:
10
- with pytest.raises(
11
- ValidationError, match="start_time must be earlier than or equal to end_time"
12
- ):
13
- QueryFilters(
14
- start_time=datetime(2026, 1, 2, tzinfo=UTC),
15
- end_time=datetime(2026, 1, 1, tzinfo=UTC),
16
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
tests/test_services.py DELETED
@@ -1,110 +0,0 @@
1
- from datetime import UTC, datetime
2
- from pathlib import Path
3
-
4
- from leaderboard_analytics.schemas import QueryFilters
5
- from leaderboard_analytics.services import AnalyticsService
6
-
7
-
8
- class FakeRepository:
9
- def overview_timeseries(self, filters: QueryFilters) -> list[dict]:
10
- return [
11
- {"period": "2026-01-01", "pv": 2, "uv": 1, "session_count": 1, "event_count": 3},
12
- {"period": "2026-01-02", "pv": 1, "uv": 1, "session_count": 1, "event_count": 2},
13
- ]
14
-
15
- def overview_totals(self, filters: QueryFilters) -> dict:
16
- return {"pv": 3, "uv": 1, "sessions": 1, "events": 5}
17
-
18
-
19
- class LocationRepository:
20
- def __init__(self, rows: list[dict]) -> None:
21
- self.rows = rows
22
-
23
- def visitor_ip_counts(self, filters: QueryFilters) -> list[dict]:
24
- return self.rows
25
-
26
-
27
- class FakeGeoIpResolver:
28
- def __init__(self, countries: dict[str, tuple[str, str]]) -> None:
29
- self.countries = countries
30
-
31
- def resolve_country(self, ip_address: str) -> tuple[str, str]:
32
- return self.countries[ip_address]
33
-
34
-
35
- def test_overview_uses_full_range_distinct_totals() -> None:
36
- service = AnalyticsService(FakeRepository()) # type: ignore[arg-type]
37
- filters = QueryFilters(
38
- start_time=datetime(2026, 1, 1, tzinfo=UTC),
39
- end_time=datetime(2026, 1, 2, tzinfo=UTC),
40
- )
41
-
42
- frame, totals = service.get_overview(filters)
43
-
44
- assert list(frame["period"]) == ["2026-01-01", "2026-01-02"]
45
- assert totals == {
46
- "pv": 3,
47
- "uv": 1,
48
- "sessions": 1,
49
- "events": 5,
50
- "events_per_session": 5.0,
51
- "sessions_per_visitor": 1.0,
52
- }
53
-
54
-
55
- def test_visitor_locations_groups_pv_and_ip_count_by_country() -> None:
56
- repository = LocationRepository(
57
- [
58
- {"ip": "8.8.8.8", "pv": 3},
59
- {"ip": "8.8.4.4", "pv": 2},
60
- {"ip": "1.1.1.1", "pv": 4},
61
- ]
62
- )
63
- resolver = FakeGeoIpResolver(
64
- {
65
- "8.8.8.8": ("US", "United States"),
66
- "8.8.4.4": ("US", "United States"),
67
- "1.1.1.1": ("AU", "Australia"),
68
- }
69
- )
70
- service = AnalyticsService(
71
- repository, # type: ignore[arg-type]
72
- geoip_resolver=resolver, # type: ignore[arg-type]
73
- )
74
-
75
- frame = service.get_visitor_locations(
76
- QueryFilters(
77
- start_time=datetime(2026, 1, 1, tzinfo=UTC),
78
- end_time=datetime(2026, 1, 2, tzinfo=UTC),
79
- )
80
- )
81
-
82
- assert frame.to_dict("records") == [
83
- {"country_code": "US", "country_name": "United States", "pv": 5, "ip_count": 2},
84
- {"country_code": "AU", "country_name": "Australia", "pv": 4, "ip_count": 1},
85
- ]
86
-
87
-
88
- def test_visitor_locations_groups_unresolved_ips_as_unknown() -> None:
89
- repository = LocationRepository(
90
- [
91
- {"ip": "10.0.0.1", "pv": 2},
92
- {"ip": "not-an-ip", "pv": 1},
93
- {"ip": "8.8.8.8", "pv": 3},
94
- ]
95
- )
96
- service = AnalyticsService(
97
- repository, # type: ignore[arg-type]
98
- geoip_database_path=Path("missing-geolite2-country.mmdb"),
99
- )
100
-
101
- frame = service.get_visitor_locations(
102
- QueryFilters(
103
- start_time=datetime(2026, 1, 1, tzinfo=UTC),
104
- end_time=datetime(2026, 1, 2, tzinfo=UTC),
105
- )
106
- )
107
-
108
- assert frame.to_dict("records") == [
109
- {"country_code": "Unknown", "country_name": "Unknown", "pv": 6, "ip_count": 3}
110
- ]