Spaces:

SmileXing
/

leaderboard-analytics-service

Running

App Files Files Community

init

by SmileXing - opened Apr 16

base: refs/heads/main

←

from: refs/pr/2

Discussion Files changed

-2412

This PR is in draft mode

Files changed (22) hide show

.env.example +0 -10
.gitignore +0 -216
CHANGELOG.md +0 -20
Dockerfile +0 -19
LICENSE +0 -201
README.md +7 -257
app.py +0 -15
pyproject.toml +0 -46
requirements.txt +0 -8
src/leaderboard_analytics/__init__.py +0 -1
src/leaderboard_analytics/config.py +0 -25
src/leaderboard_analytics/db.py +0 -24
src/leaderboard_analytics/geoip_database.py +0 -36
src/leaderboard_analytics/main.py +0 -49
src/leaderboard_analytics/repositories.py +0 -463
src/leaderboard_analytics/schemas.py +0 -27
src/leaderboard_analytics/services.py +0 -264
src/leaderboard_analytics/ui.py +0 -481
tests/test_geoip_database.py +0 -29
tests/test_repositories.py +0 -95
tests/test_schemas.py +0 -16
tests/test_services.py +0 -110

.env.example DELETED Viewed

@@ -1,10 +0,0 @@
-MONGO_URI=mongodb://localhost:27017
-MONGO_DATABASE=event_logger
-MONGO_COLLECTION=events
-HOST=0.0.0.0
-PORT=7860
-GRADIO_SHARE=false
-GRADIO_SSR_MODE=false
-GEOIP_DATABASE_PATH=GeoLite2-Country.mmdb
-GEOIP_DATABASE_URL=https://cdn.jsdelivr.net/npm/geolite2-country/GeoLite2-Country.mmdb.gz
-GEOIP_AUTO_DOWNLOAD=true

.gitignore DELETED Viewed

@@ -1,216 +0,0 @@
-# Byte-compiled / optimized / DLL files
-__pycache__/
-*.py[codz]
-*$py.class
-# C extensions
-*.so
-# Distribution / packaging
-.Python
-build/
-develop-eggs/
-dist/
-downloads/
-eggs/
-.eggs/
-lib/
-lib64/
-parts/
-sdist/
-var/
-wheels/
-share/python-wheels/
-*.egg-info/
-.installed.cfg
-*.egg
-MANIFEST
-# PyInstaller
-#  Usually these files are written by a python script from a template
-#  before PyInstaller builds the exe, so as to inject date/other infos into it.
-*.manifest
-*.spec
-# Installer logs
-pip-log.txt
-pip-delete-this-directory.txt
-# Unit test / coverage reports
-htmlcov/
-.tox/
-.nox/
-.coverage
-.coverage.*
-.cache
-nosetests.xml
-coverage.xml
-*.cover
-*.py.cover
-.hypothesis/
-.pytest_cache/
-.pytest_tmp/
-cover/
-# Translations
-*.mo
-*.pot
-# Django stuff:
-*.log
-local_settings.py
-db.sqlite3
-db.sqlite3-journal
-# Flask stuff:
-instance/
-.webassets-cache
-# Scrapy stuff:
-.scrapy
-# Sphinx documentation
-docs/_build/
-# PyBuilder
-.pybuilder/
-target/
-# Jupyter Notebook
-.ipynb_checkpoints
-# IPython
-profile_default/
-ipython_config.py
-# pyenv
-#   For a library or package, you might want to ignore these files since the code is
-#   intended to run in multiple environments; otherwise, check them in:
-.python-version
-# pipenv
-#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
-#   However, in case of collaboration, if having platform-specific dependencies or dependencies
-#   having no cross-platform support, pipenv may install dependencies that don't work, or not
-#   install all needed dependencies.
-Pipfile.lock
-# UV
-#   Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
-#   This is especially recommended for binary packages to ensure reproducibility, and is more
-#   commonly ignored for libraries.
-uv.lock
-# poetry
-#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
-#   This is especially recommended for binary packages to ensure reproducibility, and is more
-#   commonly ignored for libraries.
-#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
-poetry.lock
-poetry.toml
-# pdm
-#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
-#   pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
-#   https://pdm-project.org/en/latest/usage/project/#working-with-version-control
-#pdm.lock
-#pdm.toml
-.pdm-python
-.pdm-build/
-# pixi
-#   Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
-#pixi.lock
-#   Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
-#   in the .venv directory. It is recommended not to include this directory in version control.
-.pixi
-# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
-__pypackages__/
-# Celery stuff
-celerybeat-schedule
-celerybeat.pid
-# SageMath parsed files
-*.sage.py
-# Environments
-.env
-.envrc
-.venv
-env/
-venv/
-ENV/
-env.bak/
-venv.bak/
-# Local GeoIP databases
-*.mmdb
-*.mmdb.gz
-# Local analytics exports
-visitor_ips*.csv
-# Spyder project settings
-.spyderproject
-.spyproject
-# Rope project settings
-.ropeproject
-# mkdocs documentation
-/site
-# mypy
-.mypy_cache/
-.dmypy.json
-dmypy.json
-# Pyre type checker
-.pyre/
-# pytype static type analyzer
-.pytype/
-# Cython debug symbols
-cython_debug/
-# PyCharm
-#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
-#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
-#  and can be added to the global gitignore or merged into this file.  For a more nuclear
-#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
-.idea/
-# Abstra
-# Abstra is an AI-powered process automation framework.
-# Ignore directories containing user credentials, local state, and settings.
-# Learn more at https://abstra.io/docs
-.abstra/
-# Visual Studio Code
-#  Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
-#  that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
-#  and can be added to the global gitignore or merged into this file. However, if you prefer,
-#  you could uncomment the following to ignore the entire vscode folder
-.vscode/
-# Ruff stuff:
-.ruff_cache/
-# PyPI configuration file
-.pypirc
-# Cursor
-#  Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
-#  exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
-#  refer to https://docs.cursor.com/context/ignore-files
-.cursorignore
-.cursorindexingignore
-.cursor
-# Marimo
-marimo/_static/
-marimo/_lsp/
-__marimo__/

CHANGELOG.md DELETED Viewed

@@ -1,20 +0,0 @@
-# Changelog
-All notable changes to this project will be documented in this file.
-## Unreleased
-### Added
-- Added full-range overview totals so UV and Sessions are distinct counts across the selected range.
-- Added ordered funnel logic that counts each step only when it occurs after the previous required step.
-- Added benchmark choices, raw data tables, and CSV export support to the dashboard.
-- Added query validation, MongoDB ping checks, and dashboard-friendly error messages.
-- Added pytest coverage for metric totals, query validation, and MongoDB aggregation pipeline shape.
-- Added CI for formatting, linting, and tests.
-### Changed
-- Updated new vs returning visitor logic to compute first-seen dates from the full available page-view history before applying the selected reporting range.
-- Updated MongoDB aggregation pipelines to prefer an indexed `ts` Date field while retaining fallback support for legacy `timestamp` values.
-- Documented recommended MongoDB indexes for production deployments.

Dockerfile DELETED Viewed

@@ -1,19 +0,0 @@
-FROM python:3.12-slim
-ENV PYTHONDONTWRITEBYTECODE=1 \
-    PYTHONUNBUFFERED=1 \
-    PIP_NO_CACHE_DIR=1 \
-    HOST=0.0.0.0 \
-    PORT=7860 \
-    GRADIO_SHARE=false
-WORKDIR /app
-# Install project dependencies and package from pyproject.toml
-COPY pyproject.toml README.md ./
-COPY src ./src
-RUN pip install --upgrade pip && pip install .
-EXPOSE 7860
-CMD ["leaderboard-analytics"]

LICENSE DELETED Viewed

@@ -1,201 +0,0 @@
-                                 Apache License
-                           Version 2.0, January 2004
-                        http://www.apache.org/licenses/
-   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
-   1. Definitions.
-      "License" shall mean the terms and conditions for use, reproduction,
-      and distribution as defined by Sections 1 through 9 of this document.
-      "Licensor" shall mean the copyright owner or entity authorized by
-      the copyright owner that is granting the License.
-      "Legal Entity" shall mean the union of the acting entity and all
-      other entities that control, are controlled by, or are under common
-      control with that entity. For the purposes of this definition,
-      "control" means (i) the power, direct or indirect, to cause the
-      direction or management of such entity, whether by contract or
-      otherwise, or (ii) ownership of fifty percent (50%) or more of the
-      outstanding shares, or (iii) beneficial ownership of such entity.
-      "You" (or "Your") shall mean an individual or Legal Entity
-      exercising permissions granted by this License.
-      "Source" form shall mean the preferred form for making modifications,
-      including but not limited to software source code, documentation
-      source, and configuration files.
-      "Object" form shall mean any form resulting from mechanical
-      transformation or translation of a Source form, including but
-      not limited to compiled object code, generated documentation,
-      and conversions to other media types.
-      "Work" shall mean the work of authorship, whether in Source or
-      Object form, made available under the License, as indicated by a
-      copyright notice that is included in or attached to the work
-      (an example is provided in the Appendix below).
-      "Derivative Works" shall mean any work, whether in Source or Object
-      form, that is based on (or derived from) the Work and for which the
-      editorial revisions, annotations, elaborations, or other modifications
-      represent, as a whole, an original work of authorship. For the purposes
-      of this License, Derivative Works shall not include works that remain
-      separable from, or merely link (or bind by name) to the interfaces of,
-      the Work and Derivative Works thereof.
-      "Contribution" shall mean any work of authorship, including
-      the original version of the Work and any modifications or additions
-      to that Work or Derivative Works thereof, that is intentionally
-      submitted to Licensor for inclusion in the Work by the copyright owner
-      or by an individual or Legal Entity authorized to submit on behalf of
-      the copyright owner. For the purposes of this definition, "submitted"
-      means any form of electronic, verbal, or written communication sent
-      to the Licensor or its representatives, including but not limited to
-      communication on electronic mailing lists, source code control systems,
-      and issue tracking systems that are managed by, or on behalf of, the
-      Licensor for the purpose of discussing and improving the Work, but
-      excluding communication that is conspicuously marked or otherwise
-      designated in writing by the copyright owner as "Not a Contribution."
-      "Contributor" shall mean Licensor and any individual or Legal Entity
-      on behalf of whom a Contribution has been received by Licensor and
-      subsequently incorporated within the Work.
-   2. Grant of Copyright License. Subject to the terms and conditions of
-      this License, each Contributor hereby grants to You a perpetual,
-      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
-      copyright license to reproduce, prepare Derivative Works of,
-      publicly display, publicly perform, sublicense, and distribute the
-      Work and such Derivative Works in Source or Object form.
-   3. Grant of Patent License. Subject to the terms and conditions of
-      this License, each Contributor hereby grants to You a perpetual,
-      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
-      (except as stated in this section) patent license to make, have made,
-      use, offer to sell, sell, import, and otherwise transfer the Work,
-      where such license applies only to those patent claims licensable
-      by such Contributor that are necessarily infringed by their
-      Contribution(s) alone or by combination of their Contribution(s)
-      with the Work to which such Contribution(s) was submitted. If You
-      institute patent litigation against any entity (including a
-      cross-claim or counterclaim in a lawsuit) alleging that the Work
-      or a Contribution incorporated within the Work constitutes direct
-      or contributory patent infringement, then any patent licenses
-      granted to You under this License for that Work shall terminate
-      as of the date such litigation is filed.
-   4. Redistribution. You may reproduce and distribute copies of the
-      Work or Derivative Works thereof in any medium, with or without
-      modifications, and in Source or Object form, provided that You
-      meet the following conditions:
-      (a) You must give any other recipients of the Work or
-          Derivative Works a copy of this License; and
-      (b) You must cause any modified files to carry prominent notices
-          stating that You changed the files; and
-      (c) You must retain, in the Source form of any Derivative Works
-          that You distribute, all copyright, patent, trademark, and
-          attribution notices from the Source form of the Work,
-          excluding those notices that do not pertain to any part of
-          the Derivative Works; and
-      (d) If the Work includes a "NOTICE" text file as part of its
-          distribution, then any Derivative Works that You distribute must
-          include a readable copy of the attribution notices contained
-          within such NOTICE file, excluding those notices that do not
-          pertain to any part of the Derivative Works, in at least one
-          of the following places: within a NOTICE text file distributed
-          as part of the Derivative Works; within the Source form or
-          documentation, if provided along with the Derivative Works; or,
-          within a display generated by the Derivative Works, if and
-          wherever such third-party notices normally appear. The contents
-          of the NOTICE file are for informational purposes only and
-          do not modify the License. You may add Your own attribution
-          notices within Derivative Works that You distribute, alongside
-          or as an addendum to the NOTICE text from the Work, provided
-          that such additional attribution notices cannot be construed
-          as modifying the License.
-      You may add Your own copyright statement to Your modifications and
-      may provide additional or different license terms and conditions
-      for use, reproduction, or distribution of Your modifications, or
-      for any such Derivative Works as a whole, provided Your use,
-      reproduction, and distribution of the Work otherwise complies with
-      the conditions stated in this License.
-   5. Submission of Contributions. Unless You explicitly state otherwise,
-      any Contribution intentionally submitted for inclusion in the Work
-      by You to the Licensor shall be under the terms and conditions of
-      this License, without any additional terms or conditions.
-      Notwithstanding the above, nothing herein shall supersede or modify
-      the terms of any separate license agreement you may have executed
-      with Licensor regarding such Contributions.
-   6. Trademarks. This License does not grant permission to use the trade
-      names, trademarks, service marks, or product names of the Licensor,
-      except as required for reasonable and customary use in describing the
-      origin of the Work and reproducing the content of the NOTICE file.
-   7. Disclaimer of Warranty. Unless required by applicable law or
-      agreed to in writing, Licensor provides the Work (and each
-      Contributor provides its Contributions) on an "AS IS" BASIS,
-      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
-      implied, including, without limitation, any warranties or conditions
-      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
-      PARTICULAR PURPOSE. You are solely responsible for determining the
-      appropriateness of using or redistributing the Work and assume any
-      risks associated with Your exercise of permissions under this License.
-   8. Limitation of Liability. In no event and under no legal theory,
-      whether in tort (including negligence), contract, or otherwise,
-      unless required by applicable law (such as deliberate and grossly
-      negligent acts) or agreed to in writing, shall any Contributor be
-      liable to You for damages, including any direct, indirect, special,
-      incidental, or consequential damages of any character arising as a
-      result of this License or out of the use or inability to use the
-      Work (including but not limited to damages for loss of goodwill,
-      work stoppage, computer failure or malfunction, or any and all
-      other commercial damages or losses), even if such Contributor
-      has been advised of the possibility of such damages.
-   9. Accepting Warranty or Additional Liability. While redistributing
-      the Work or Derivative Works thereof, You may choose to offer,
-      and charge a fee for, acceptance of support, warranty, indemnity,
-      or other liability obligations and/or rights consistent with this
-      License. However, in accepting such obligations, You may act only
-      on Your own behalf and on Your sole responsibility, not on behalf
-      of any other Contributor, and only if You agree to indemnify,
-      defend, and hold each Contributor harmless for any liability
-      incurred by, or claims asserted against, such Contributor by reason
-      of your accepting any such warranty or additional liability.
-   END OF TERMS AND CONDITIONS
-   APPENDIX: How to apply the Apache License to your work.
-      To apply the Apache License to your work, attach the following
-      boilerplate notice, with the fields enclosed by brackets "[]"
-      replaced with your own identifying information. (Don't include
-      the brackets!)  The text should be enclosed in the appropriate
-      comment syntax for the file format. We also recommend that a
-      file or class name and description of purpose be included on the
-      same "printed page" as the copyright notice for easier
-      identification within third-party archives.
-   Copyright [yyyy] [name of copyright owner]
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-       http://www.apache.org/licenses/LICENSE-2.0
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.

README.md CHANGED Viewed

@@ -1,262 +1,12 @@
 ---
-title: leaderboard-analytics-service
-emoji: 📊
-colorFrom: blue
 colorTo: green
-sdk: gradio
-sdk_version: "6.0.0"
-python_version: "3.11"
-app_file: app.py
 pinned: false
 ---
-# Leaderboard Analytics Metrics Spec
-This project analyzes user behavior on the MTEB leaderboard page from event logs in MongoDB.
-The primary purpose of this document is to define **what is measured**, **where each metric comes from**, and **how each metric is calculated**.
----
-## Data Contract
-All analytics are based on the `events` collection and the following stable fields:
-- Core dimensions: `event_name`, `timestamp`, `session_id`
-- Preferred event time: `ts` as a MongoDB Date
-- Behavior context: `benchmark`, `filters`
-- Visitor identity (approximate): `properties.visitor_id`
-- Visitor IP for country analysis: `properties.ip`
-- Change context: `properties.old_value`, `properties.new_value`, `properties.filter_name`
-Important event names:
-- `page_view`
-- `benchmark_change`
-- `filter_change_*` (dynamic names, such as `filter_change_task_type`)
-- `table_download` (currently may be missing in some deployments)
----
-## Metrics Dictionary
-### 1) PV (Page Views)
-- **Definition**: Number of page view events.
-- **Source fields**: `event_name`
-- **Calculation**:
-  - Filter events where `event_name == "page_view"`
-  - PV = count of matched events
-### 2) Sessions
-- **Definition**: Number of unique interaction sessions.
-- **Source fields**: `session_id`
-- **Calculation**:
-  - Sessions = count of distinct non-empty `session_id` values in the selected time range
-### 3) UV (Unique Visitors, Approximate)
-- **Definition**: Number of unique visitors identified by hashed fingerprint.
-- **Source fields**: `properties.visitor_id`
-- **Calculation**:
-  - Remove null/empty `properties.visitor_id`
-  - UV = count of distinct `properties.visitor_id` values in the selected time range
-### 4) Sessions Per Visitor
-- **Definition**: Average number of sessions per visitor.
-- **Source fields**: derived from Sessions and UV
-- **Calculation**:
-  - Sessions Per Visitor = `Sessions / UV`
-  - If UV is 0, result is 0
-### 5) Session Depth (Events Per Session)
-- **Definition**: Average interaction intensity per session.
-- **Source fields**: all events, `session_id`
-- **Calculation**:
-  - Total Events = count of all events in range
-  - Session Depth = `Total Events / Sessions`
-  - If Sessions is 0, result is 0
----
-## Behavior Metrics
-### 6) Benchmark Popularity
-- **Definition**: Frequency of selected benchmarks.
-- **Source fields**: `event_name`, `properties.new_value`
-- **Calculation**:
-  - Filter `event_name == "benchmark_change"`
-  - Group by `properties.new_value`
-  - Popularity = event count per benchmark value
-### 7) Filter Usage Distribution
-- **Definition**: Usage volume by filter event type.
-- **Source fields**: `event_name`
-- **Calculation**:
-  - Filter `event_name` matching regex `^filter_change_`
-  - Group by `event_name`
-  - Distribution = count per filter event
-### 8) Filter Session Coverage
-- **Definition**: Number of sessions that used each filter type.
-- **Source fields**: `event_name`, `session_id`
-- **Calculation**:
-  - For each `filter_change_*` event type:
-    - collect distinct non-empty `session_id`
-    - coverage = distinct session count
----
-## Funnel Metrics
-Recommended session-level funnel:
-1. `page_view`
-2. `benchmark_change`
-3. `filter_change_*`
-4. `table_download`
-### 9) Step Session Count
-- **Definition**: Number of sessions that reached each ordered funnel step.
-- **Source fields**: `session_id`, `event_name`, `ts` or `timestamp`
-- **Calculation**:
-  - Group events by `session_id`
-  - Sort events by event time
-  - Count each cumulative step only when it occurs after the previous required step
-### 10) Step Conversion Rate
-- **Definition**: Conversion from funnel step 1 (`page_view`) to each step.
-- **Source fields**: derived from Step Session Count
-- **Calculation**:
-  - Conversion Rate(step N) = `StepN Sessions / Step1 Sessions * 100%`
-  - If Step1 Sessions is 0, result is 0%
----
-## Visitor Segmentation Metrics
-### 11) New Visitors
-- **Definition**: Visitors whose current period contains their first observed visit date.
-- **Source fields**: `event_name`, `ts` or `timestamp`, `properties.visitor_id`
-- **Calculation**:
-  - Use `page_view` events only
-  - For each `visitor_id`, find earliest timestamp (`first_seen`) from the full available dataset
-  - If event date equals `first_seen` date, classify as `new`
-  - Count distinct `visitor_id` by period
-### 12) Returning Visitors
-- **Definition**: Visitors seen after their first observed date.
-- **Source fields**: same as New Visitors
-- **Calculation**:
-  - Use same first-seen logic
-  - If event date is later than first-seen date, classify as `returning`
-  - Count distinct `visitor_id` by period
-### 13) Visitor Locations by Country
-- **Definition**: Page view volume by visitor IP country/region.
-- **Source fields**: `event_name`, `properties.ip`
-- **Calculation**:
-  - Filter `event_name == "page_view"`
-  - Remove null/empty `properties.ip`
-  - Group page views by IP in MongoDB
-  - Resolve each IP to a country using the local MaxMind GeoLite2 Country database
-  - Group by `country_code` and `country_name`
-  - Map color = page view count (`pv`)
-  - Private, invalid, unresolved, or unconfigured IPs are grouped as `Unknown`
----
-## Time Aggregation Rules
-All trend metrics support these granularities:
-- `day` -> `%Y-%m-%d`
-- `week` -> `%G-W%V` (ISO week)
-- `month` -> `%Y-%m`
-Time filtering rules:
-- Prefer the indexed MongoDB Date field `ts`
-- Fall back to converting legacy `timestamp` values when `ts` is not present
-- Keep records where `start_time <= event time <= end_time`
-Optional benchmark filtering:
-- If benchmark filter is provided, add `benchmark == <value>` to match conditions
----
-## Data Quality Notes
-1. `visitor_id` is an approximate identifier, not a strict user identity.
-2. For `filter_change_*`, `properties.new_value` may not always represent the actual final filter value; prefer `filters` snapshot for behavioral context.
-3. If `table_download` is not instrumented, funnel step 4 will under-report by design.
-4. Total UV and Sessions are distinct counts across the full selected time range. They are not calculated by summing per-period trend values.
-5. Funnel steps are ordered by event time. A session only reaches a later step when that step happens after the previous required step.
----
-## MongoDB Performance Notes
-For production deployments, store event time as a MongoDB Date field named `ts`. Keeping only string timestamps forces aggregation pipelines to convert time values at query time and can reduce index usage.
-Recommended indexes:
-```javascript
-db.events.createIndex({ ts: 1 })
-db.events.createIndex({ ts: 1, benchmark: 1 })
-db.events.createIndex({ event_name: 1, ts: 1 })
-db.events.createIndex({ session_id: 1, ts: 1 })
-db.events.createIndex({ "properties.visitor_id": 1, ts: 1 })
-db.events.createIndex({ event_name: 1, ts: 1, "properties.ip": 1 })
-```
-Legacy events with only `timestamp` remain supported, but backfilling `ts` is recommended before running this dashboard against large collections.
----
-## Minimal Runtime Notes
-Only required runtime inputs:
-- MongoDB connection URI (`MONGO_URI`)
-- Mongo database/collection names (defaults supported)
-Optional visitor location input:
-- `GEOIP_DATABASE_PATH`: path to a local MaxMind `GeoLite2-Country.mmdb` file
-- `GEOIP_DATABASE_URL`: URL for a gzipped GeoLite2 Country MMDB download
-- `GEOIP_AUTO_DOWNLOAD`: whether to download and decompress the MMDB when missing
-The dashboard does not call an external IP lookup API for visitor lookups. By default,
-startup downloads `https://cdn.jsdelivr.net/npm/geolite2-country/GeoLite2-Country.mmdb.gz`
-when `GEOIP_DATABASE_PATH` is missing, decompresses it, and uses the resulting MMDB file
-locally. Set `GEOIP_AUTO_DOWNLOAD=false` if the runtime cannot access the network or if
-you prefer to mount the MMDB yourself. If the database is unavailable, visitor location
-rows are grouped as `Unknown`.
-Local commands:
-```bash
-uv sync
-uv run leaderboard-analytics
-```
-Run quality checks:
-```bash
-uv run ruff format --check .
-uv run ruff check .
-uv run pytest
-```

 ---
+title: Leaderboard Analytics Service
+emoji: 🏃
+colorFrom: green
 colorTo: green
+sdk: docker
 pinned: false
+license: apache-2.0
+short_description: A backend analytics service for the MTEB Leaderboard
 ---
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py DELETED Viewed

@@ -1,15 +0,0 @@
-import sys
-from pathlib import Path
-# Ensure src-layout package is importable in Hugging Face Spaces runtime.
-ROOT_DIR = Path(__file__).resolve().parent
-SRC_DIR = ROOT_DIR / "src"
-if str(SRC_DIR) not in sys.path:
-    sys.path.insert(0, str(SRC_DIR))
-from leaderboard_analytics.main import create_demo, launch_demo  # noqa: E402
-demo = create_demo()
-if __name__ == "__main__":
-    launch_demo(demo)

pyproject.toml DELETED Viewed

@@ -1,46 +0,0 @@
-[project]
-name = "leaderboard-analytics-service"
-version = "0.1.0"
-description = "Analytics dashboard for MTEB leaderboard event logs"
-readme = "README.md"
-requires-python = ">=3.11"
-dependencies = [
-  "gradio>=6.0.0",
-  "pymongo>=4.10.0",
-  "pydantic>=2.9.0",
-  "pydantic-settings>=2.6.0",
-  "python-dotenv>=1.0.1",
-  "pandas>=2.2.3",
-  "plotly>=5.24.1",
-  "geoip2>=4.8.0",
-]
-[project.optional-dependencies]
-dev = [
-  "pytest>=8.3.0",
-  "ruff>=0.8.0",
-]
-[tool.ruff]
-line-length = 100
-target-version = "py311"
-[tool.ruff.lint]
-select = ["E", "F", "I", "B", "UP", "C4"]
-[tool.ruff.format]
-quote-style = "double"
-indent-style = "space"
-[project.scripts]
-leaderboard-analytics = "leaderboard_analytics.main:run"
-[build-system]
-requires = ["hatchling"]
-build-backend = "hatchling.build"
-[tool.hatch.build.targets.wheel]
-packages = ["src/leaderboard_analytics"]
-[tool.pytest.ini_options]
-pythonpath = ["src"]

requirements.txt DELETED Viewed

@@ -1,8 +0,0 @@
-gradio>=6.0.0
-pymongo>=4.10.0
-pydantic>=2.9.0
-pydantic-settings>=2.6.0
-python-dotenv>=1.0.1
-pandas>=2.2.3
-plotly>=5.24.1
-geoip2>=4.8.0

src/leaderboard_analytics/__init__.py DELETED Viewed

@@ -1 +0,0 @@
-"""Leaderboard analytics package."""

src/leaderboard_analytics/config.py DELETED Viewed

@@ -1,25 +0,0 @@
-from functools import lru_cache
-from pydantic_settings import BaseSettings, SettingsConfigDict
-class Settings(BaseSettings):
-    model_config = SettingsConfigDict(env_file=".env", env_file_encoding="utf-8", extra="ignore")
-    mongo_uri: str = ""
-    mongo_database: str = "event_logger"
-    mongo_collection: str = "events"
-    host: str = "0.0.0.0"
-    port: int = 7860
-    gradio_share: bool = False
-    gradio_ssr_mode: bool = False
-    geoip_database_path: str = "GeoLite2-Country.mmdb"
-    geoip_database_url: str = (
-        "https://cdn.jsdelivr.net/npm/geolite2-country/GeoLite2-Country.mmdb.gz"
-    )
-    geoip_auto_download: bool = True
-@lru_cache(maxsize=1)
-def get_settings() -> Settings:
-    return Settings()

src/leaderboard_analytics/db.py DELETED Viewed

@@ -1,24 +0,0 @@
-from pymongo import MongoClient
-from pymongo.collection import Collection
-from pymongo.database import Database
-from leaderboard_analytics.config import get_settings
-def get_mongo_client() -> MongoClient:
-    settings = get_settings()
-    if not settings.mongo_uri:
-        raise ValueError("MONGO_URI is not configured. Please set MONGO_URI in .env file.")
-    client = MongoClient(settings.mongo_uri, serverSelectionTimeoutMS=5000)
-    client.admin.command("ping")
-    return client
-def get_database(client: MongoClient) -> Database:
-    settings = get_settings()
-    return client[settings.mongo_database]
-def get_events_collection(db: Database) -> Collection:
-    settings = get_settings()
-    return db[settings.mongo_collection]

src/leaderboard_analytics/geoip_database.py DELETED Viewed

@@ -1,36 +0,0 @@
-import gzip
-import shutil
-import tempfile
-from pathlib import Path
-from urllib.request import urlopen
-DEFAULT_GEOIP_DATABASE_URL = (
-    "https://cdn.jsdelivr.net/npm/geolite2-country/GeoLite2-Country.mmdb.gz"
-)
-def ensure_geoip_database(
-    database_path: str | Path,
-    source_url: str = DEFAULT_GEOIP_DATABASE_URL,
-    *,
-    auto_download: bool = True,
-    timeout: float = 30.0,
-) -> Path:
-    target_path = Path(database_path)
-    if target_path.exists() or not auto_download:
-        return target_path
-    target_path.parent.mkdir(parents=True, exist_ok=True)
-    with tempfile.NamedTemporaryFile(
-        prefix=f"{target_path.name}.",
-        suffix=".tmp",
-        dir=target_path.parent,
-        delete=False,
-    ) as temp_file:
-        temp_path = Path(temp_file.name)
-        with urlopen(source_url, timeout=timeout) as response:
-            with gzip.GzipFile(fileobj=response) as gzip_file:
-                shutil.copyfileobj(gzip_file, temp_file)
-    temp_path.replace(target_path)
-    return target_path

src/leaderboard_analytics/main.py DELETED Viewed

@@ -1,49 +0,0 @@
-from leaderboard_analytics.config import get_settings
-from leaderboard_analytics.db import get_database, get_events_collection, get_mongo_client
-from leaderboard_analytics.geoip_database import ensure_geoip_database
-from leaderboard_analytics.repositories import AnalyticsRepository
-from leaderboard_analytics.services import AnalyticsService
-from leaderboard_analytics.ui import build_dashboard
def create_demo():
    """Wire settings, MongoDB, GeoIP and the service together into a dashboard.

    A failure while ensuring the GeoIP database is reported with ``print`` and
    otherwise ignored; the configured path is then passed on unchanged.

    Returns:
        Whatever ``build_dashboard`` returns for the assembled service.
    """
    settings = get_settings()
    events_collection = get_events_collection(get_database(get_mongo_client()))
    resolved_geoip_path = settings.geoip_database_path
    try:
        ensured_path = ensure_geoip_database(
            settings.geoip_database_path,
            settings.geoip_database_url,
            auto_download=settings.geoip_auto_download,
        )
    except Exception as exc:
        # Best effort: the dashboard still starts without a GeoIP database.
        print(f"GeoIP database download failed: {exc}")
    else:
        resolved_geoip_path = str(ensured_path)
    service = AnalyticsService(
        repository=AnalyticsRepository(events_collection=events_collection),
        geoip_database_path=resolved_geoip_path,
    )
    return build_dashboard(service=service)
def launch_demo(demo) -> None:
    """Call ``demo.launch`` with host, port, share and SSR mode from the settings."""
    settings = get_settings()
    launch_options = {
        "server_name": settings.host,
        "server_port": settings.port,
        "share": settings.gradio_share,
        "ssr_mode": settings.gradio_ssr_mode,
    }
    demo.launch(**launch_options)
def run() -> None:
    """Build the dashboard and launch it."""
    demo = create_demo()
    launch_demo(demo)
if __name__ == "__main__":
    run()

src/leaderboard_analytics/repositories.py DELETED Viewed

@@ -1,463 +0,0 @@
-from collections.abc import Iterable
-from pymongo.collection import Collection
-from leaderboard_analytics.schemas import Granularity, QueryFilters
def _period_expression(granularity: Granularity) -> dict:
    """Build the ``$dateToString`` expression that buckets ``event_ts``.

    Days render as ``%Y-%m-%d``, weeks as ``%G-W%V`` and months as ``%Y-%m``.
    An unknown granularity raises ``KeyError``.
    """
    date_format = {
        Granularity.DAY: "%Y-%m-%d",
        Granularity.WEEK: "%G-W%V",
        Granularity.MONTH: "%Y-%m",
    }[granularity]
    return {"$dateToString": {"format": date_format, "date": "$event_ts"}}
-def _with_normalized_time() -> dict:
-    return {
-        "$addFields": {
-            "event_ts": {"$ifNull": ["$ts", {"$toDate": "$timestamp"}]},
-            "visitor_id": "$properties.visitor_id",
-        }
-    }
def _indexed_time_prefilter(filters: QueryFilters) -> dict:
    """Build a ``$match`` body on the raw ``ts`` field.

    Documents pass when ``ts`` lies inside the requested range, or when
    ``ts`` is null or absent. A truthy ``filters.benchmark`` adds an
    equality condition on ``benchmark``.
    """
    ts_in_range = {"ts": {"$gte": filters.start_time, "$lte": filters.end_time}}
    prefilter: dict = {
        "$or": [ts_in_range, {"ts": None}, {"ts": {"$exists": False}}],
    }
    if filters.benchmark:
        prefilter["benchmark"] = filters.benchmark
    return prefilter
def _with_time_and_optional_benchmark(filters: QueryFilters) -> dict:
    """Build a ``$match`` body on the derived ``event_ts`` field.

    The range is inclusive on both ends. A truthy ``filters.benchmark`` adds
    an equality condition on ``benchmark``.
    """
    time_window = {"$gte": filters.start_time, "$lte": filters.end_time}
    conditions: dict = {"event_ts": time_window}
    if filters.benchmark:
        conditions["benchmark"] = filters.benchmark
    return conditions
-def _non_empty_set_size(field_name: str, variable_name: str) -> dict:
-    return {
-        "$size": {
-            "$filter": {
-                "input": f"${field_name}",
-                "as": variable_name,
-                "cond": {
-                    "$and": [
-                        {"$ne": [f"$${variable_name}", None]},
-                        {"$ne": [f"$${variable_name}", ""]},
-                    ]
-                },
-            }
-        }
-    }
class AnalyticsRepository:
    """Aggregation queries over the analytics events collection.

    Each public query method builds a MongoDB aggregation pipeline and runs it
    through ``events_collection.aggregate``.
    """

    def __init__(self, events_collection: Collection) -> None:
        """Keep a reference to the collection every pipeline is run against."""
        self.events_collection = events_collection

    @staticmethod
    def _scoped_stages(filters: QueryFilters, extra_match: dict | None = None) -> list[dict]:
        """Return the three leading stages shared by the filtered pipelines.

        The stages are: a ``$match`` on the raw ``ts`` field, the
        ``$addFields`` stage deriving ``event_ts``/``visitor_id``, and a
        ``$match`` on the derived ``event_ts`` (plus benchmark). Keys in
        ``extra_match`` are merged into that last ``$match``.
        """
        matcher = _with_time_and_optional_benchmark(filters)
        if extra_match:
            matcher = {**matcher, **extra_match}
        return [
            {"$match": _indexed_time_prefilter(filters)},
            _with_normalized_time(),
            {"$match": matcher},
        ]

    @staticmethod
    def _first_event_ts(condition: dict) -> dict:
        """Build an expression for the ``ts`` of the first ``events`` entry matching ``condition``.

        ``condition`` is evaluated with ``$$event`` bound to each entry of the
        document's ``events`` array. When nothing matches, ``$arrayElemAt`` on
        the empty array produces no value, so the projected field is absent.
        """
        return {
            "$arrayElemAt": [
                {
                    "$map": {
                        "input": {
                            "$filter": {
                                "input": "$events",
                                "as": "event",
                                "cond": condition,
                            }
                        },
                        "as": "event",
                        "in": "$$event.ts",
                    }
                },
                0,
            ]
        }

    def overview_timeseries(self, filters: QueryFilters) -> list[dict]:
        """Return per-period rows with ``pv``, ``event_count``, ``session_count`` and ``uv``.

        Rows are sorted by ``period`` ascending; the period format follows
        ``filters.granularity``.
        """
        period_expr = _period_expression(filters.granularity)
        pipeline: list[dict] = [
            *self._scoped_stages(filters),
            {
                "$group": {
                    "_id": {"period": period_expr},
                    "pv": {"$sum": {"$cond": [{"$eq": ["$event_name", "page_view"]}, 1, 0]}},
                    "event_count": {"$sum": 1},
                    "sessions": {"$addToSet": "$session_id"},
                    "visitors": {"$addToSet": "$visitor_id"},
                }
            },
            {
                "$project": {
                    "_id": 0,
                    "period": "$_id.period",
                    "pv": 1,
                    "event_count": 1,
                    "session_count": _non_empty_set_size("sessions", "s"),
                    "uv": _non_empty_set_size("visitors", "v"),
                }
            },
            {"$sort": {"period": 1}},
        ]
        return list(self.events_collection.aggregate(pipeline))

    def overview_totals(self, filters: QueryFilters) -> dict:
        """Return one dict with ``pv``, ``events``, ``sessions`` and ``uv`` for the whole range.

        An empty dict is returned when no event matches.
        """
        pipeline: list[dict] = [
            *self._scoped_stages(filters),
            {
                "$group": {
                    "_id": None,
                    "pv": {"$sum": {"$cond": [{"$eq": ["$event_name", "page_view"]}, 1, 0]}},
                    "events": {"$sum": 1},
                    "sessions": {"$addToSet": "$session_id"},
                    "visitors": {"$addToSet": "$visitor_id"},
                }
            },
            {
                "$project": {
                    "_id": 0,
                    "pv": 1,
                    "events": 1,
                    "sessions": _non_empty_set_size("sessions", "s"),
                    "uv": _non_empty_set_size("visitors", "v"),
                }
            },
        ]
        return self.safe_first(self.events_collection.aggregate(pipeline))

    def benchmark_top(self, filters: QueryFilters, limit: int = 20) -> list[dict]:
        """Return the ``limit`` most frequent ``properties.new_value`` of ``benchmark_change`` events.

        Null and empty values are dropped; rows are ``{"benchmark", "count"}``
        sorted by ``count`` descending.
        """
        pipeline: list[dict] = [
            *self._scoped_stages(filters, {"event_name": "benchmark_change"}),
            {"$group": {"_id": "$properties.new_value", "count": {"$sum": 1}}},
            {"$match": {"_id": {"$nin": [None, ""]}}},
            {"$project": {"_id": 0, "benchmark": "$_id", "count": 1}},
            {"$sort": {"count": -1}},
            {"$limit": limit},
        ]
        return list(self.events_collection.aggregate(pipeline))

    def filter_distribution(self, filters: QueryFilters) -> list[dict]:
        """Return counts and session coverage for every ``filter_change_*`` event name.

        Rows are ``{"event_name", "count", "session_coverage"}`` sorted by
        ``count`` descending.
        """
        pipeline: list[dict] = [
            *self._scoped_stages(filters, {"event_name": {"$regex": "^filter_change_"}}),
            {
                "$group": {
                    "_id": "$event_name",
                    "count": {"$sum": 1},
                    "sessions": {"$addToSet": "$session_id"},
                }
            },
            {
                "$project": {
                    "_id": 0,
                    "event_name": "$_id",
                    "count": 1,
                    "session_coverage": _non_empty_set_size("sessions", "s"),
                }
            },
            {"$sort": {"count": -1}},
        ]
        return list(self.events_collection.aggregate(pipeline))

    def funnel(self, filters: QueryFilters) -> list[dict]:
        """Count sessions reaching each step of the ordered four-step funnel.

        The steps are ``page_view`` -> ``benchmark_change`` ->
        ``filter_change_*`` -> ``table_download``. For every session the
        timestamp of the first event of each step that is not earlier than
        the previous step's timestamp is projected, and a session counts for
        a step only when all earlier steps were reached as well.

        Returns:
            The aggregation result as a list (at most one row, produced by the
            final ``$group`` on ``_id: None``) with the keys
            ``step1_page_view`` ... ``step4_table_download``.
        """
        first = self._first_event_ts

        def named(event_name: str) -> dict:
            return {"$eq": ["$$event.name", event_name]}

        def not_before(field_name: str) -> dict:
            return {"$gte": ["$$event.ts", f"${field_name}"]}

        def count_when(*conditions: dict) -> dict:
            return {"$sum": {"$cond": [{"$and": list(conditions)}, 1, 0]}}

        is_filter_change = {
            "$regexMatch": {
                "input": "$$event.name",
                "regex": "^filter_change_",
            }
        }
        # A session without any page_view has no ``page_view_at`` field at all
        # (``$arrayElemAt`` on an empty array yields nothing). In aggregation
        # expressions a missing field is not equal to null, so ``$ne`` against
        # null would count those sessions. ``$gt`` null is true only for a
        # real timestamp because missing and null both sort at or below null.
        has_page_view = {"$gt": ["$page_view_at", None]}
        benchmark_after_view = {"$gte": ["$benchmark_change_at", "$page_view_at"]}
        filter_after_benchmark = {"$gte": ["$filter_change_at", "$benchmark_change_at"]}
        download_after_filter = {"$gte": ["$table_download_at", "$filter_change_at"]}

        pipeline: list[dict] = [
            *self._scoped_stages(filters),
            {"$sort": {"session_id": 1, "event_ts": 1}},
            {
                "$group": {
                    "_id": "$session_id",
                    "events": {"$push": {"name": "$event_name", "ts": "$event_ts"}},
                }
            },
            {"$match": {"_id": {"$nin": [None, ""]}}},
            {
                "$project": {
                    "events": 1,
                    "page_view_at": first(named("page_view")),
                }
            },
            {
                "$project": {
                    "events": 1,
                    "page_view_at": 1,
                    "benchmark_change_at": first(
                        {"$and": [named("benchmark_change"), not_before("page_view_at")]}
                    ),
                }
            },
            {
                "$project": {
                    "events": 1,
                    "page_view_at": 1,
                    "benchmark_change_at": 1,
                    "filter_change_at": first(
                        {"$and": [is_filter_change, not_before("benchmark_change_at")]}
                    ),
                }
            },
            {
                "$project": {
                    "page_view_at": 1,
                    "benchmark_change_at": 1,
                    "filter_change_at": 1,
                    "table_download_at": first(
                        {"$and": [named("table_download"), not_before("filter_change_at")]}
                    ),
                }
            },
            {
                "$group": {
                    "_id": None,
                    "step1_page_view": {"$sum": {"$cond": [has_page_view, 1, 0]}},
                    "step2_benchmark_change": count_when(has_page_view, benchmark_after_view),
                    "step3_filter_change": count_when(
                        has_page_view, benchmark_after_view, filter_after_benchmark
                    ),
                    "step4_table_download": count_when(
                        has_page_view,
                        benchmark_after_view,
                        filter_after_benchmark,
                        download_after_filter,
                    ),
                }
            },
            {
                "$project": {
                    "_id": 0,
                    "step1_page_view": 1,
                    "step2_benchmark_change": 1,
                    "step3_filter_change": 1,
                    "step4_table_download": 1,
                }
            },
        ]
        return list(self.events_collection.aggregate(pipeline))

    def visitors_new_vs_returning(self, filters: QueryFilters) -> list[dict]:
        """Return per-period counts of new and returning visitors.

        A ``page_view`` is "new" when it falls on the same calendar day
        (``%Y-%m-%d``) as the visitor's first recorded ``page_view``. Rows are
        ``{"period", "is_new", "visitor_count"}`` sorted by period, new first.
        """
        period_expr = _period_expression(filters.granularity)
        pipeline: list[dict] = [
            # No time prefilter here: ``first_seen`` is computed over every
            # page_view of a visitor, and the range is applied only afterwards.
            _with_normalized_time(),
            {
                "$match": {
                    "event_name": "page_view",
                    "visitor_id": {"$nin": [None, ""]},
                }
            },
            {
                "$setWindowFields": {
                    "partitionBy": "$visitor_id",
                    "sortBy": {"event_ts": 1},
                    "output": {"first_seen": {"$first": "$event_ts"}},
                }
            },
            {"$match": _with_time_and_optional_benchmark(filters)},
            {
                "$project": {
                    "period": period_expr,
                    "is_new": {
                        "$eq": [
                            {"$dateToString": {"format": "%Y-%m-%d", "date": "$event_ts"}},
                            {"$dateToString": {"format": "%Y-%m-%d", "date": "$first_seen"}},
                        ]
                    },
                    "visitor_id": 1,
                }
            },
            {
                "$group": {
                    "_id": {"period": "$period", "is_new": "$is_new"},
                    "visitors": {"$addToSet": "$visitor_id"},
                }
            },
            {
                "$project": {
                    "_id": 0,
                    "period": "$_id.period",
                    "is_new": "$_id.is_new",
                    "visitor_count": _non_empty_set_size("visitors", "v"),
                }
            },
            {"$sort": {"period": 1, "is_new": -1}},
        ]
        return list(self.events_collection.aggregate(pipeline))

    def visitor_ip_counts(self, filters: QueryFilters) -> list[dict]:
        """Return ``{"ip", "pv"}`` rows counting ``page_view`` events per ``properties.ip``.

        Null and empty IPs are excluded; rows are sorted by ``pv`` descending.
        """
        pipeline: list[dict] = [
            *self._scoped_stages(
                filters,
                {
                    "event_name": "page_view",
                    "properties.ip": {"$nin": [None, ""]},
                },
            ),
            {"$group": {"_id": "$properties.ip", "pv": {"$sum": 1}}},
            {"$project": {"_id": 0, "ip": "$_id", "pv": 1}},
            {"$sort": {"pv": -1}},
        ]
        return list(self.events_collection.aggregate(pipeline))

    def available_benchmarks(
        self, filters: QueryFilters | None = None, limit: int = 100
    ) -> list[str]:
        """Return up to ``limit`` distinct non-empty ``benchmark`` values, sorted ascending.

        When ``filters`` is given, only events inside its time range (and
        benchmark, if set) are considered; otherwise the whole collection is.
        """
        pipeline: list[dict] = []
        if filters is not None:
            pipeline.extend(self._scoped_stages(filters))
        pipeline.extend(
            [
                {"$match": {"benchmark": {"$nin": [None, ""]}}},
                {"$group": {"_id": "$benchmark"}},
                {"$sort": {"_id": 1}},
                {"$limit": limit},
            ]
        )
        return [row["_id"] for row in self.events_collection.aggregate(pipeline)]

    @staticmethod
    def safe_first(items: Iterable[dict]) -> dict:
        """Return the first element of ``items``, or an empty dict when there is none."""
        return next(iter(items), {})

src/leaderboard_analytics/schemas.py DELETED Viewed

@@ -1,27 +0,0 @@
-from datetime import UTC, datetime
-from enum import StrEnum
-from pydantic import BaseModel, Field, model_validator
-class Granularity(StrEnum):
-    DAY = "day"
-    WEEK = "week"
-    MONTH = "month"
def _utc_now() -> datetime:
    """Return the current time as an aware UTC datetime."""
    return datetime.now(tz=UTC)
def _utc_start_of_today() -> datetime:
    """Return today's midnight in UTC."""
    return datetime.now(tz=UTC).replace(hour=0, minute=0, second=0, microsecond=0)
class QueryFilters(BaseModel):
    """Time range, optional benchmark and bucket size shared by all queries.

    By default the range runs from today's UTC midnight until now.
    """

    start_time: datetime = Field(default_factory=_utc_start_of_today)
    end_time: datetime = Field(default_factory=_utc_now)
    benchmark: str | None = None
    granularity: Granularity = Granularity.DAY

    @model_validator(mode="after")
    def validate_time_range(self) -> "QueryFilters":
        """Reject ranges whose start lies after their end."""
        if self.start_time > self.end_time:
            raise ValueError("start_time must be earlier than or equal to end_time")
        return self

src/leaderboard_analytics/services.py DELETED Viewed

@@ -1,264 +0,0 @@
-import ipaddress
-from pathlib import Path
-from typing import Any, Protocol
-import pandas as pd
-from leaderboard_analytics.repositories import AnalyticsRepository
-from leaderboard_analytics.schemas import QueryFilters
-UNKNOWN_COUNTRY_CODE = "Unknown"
-UNKNOWN_COUNTRY_NAME = "Unknown"
-def _empty_ip_debug() -> dict[str, object]:
-    return {
-        "total_unique_ips": 0,
-        "total_ip_pv": 0,
-        "global_ips": 0,
-        "global_ip_pv": 0,
-        "private_ips": 0,
-        "private_ip_pv": 0,
-        "loopback_ips": 0,
-        "loopback_ip_pv": 0,
-        "reserved_ips": 0,
-        "reserved_ip_pv": 0,
-        "link_local_ips": 0,
-        "link_local_ip_pv": 0,
-        "multicast_ips": 0,
-        "multicast_ip_pv": 0,
-        "unspecified_ips": 0,
-        "unspecified_ip_pv": 0,
-        "invalid_ips": 0,
-        "invalid_ip_pv": 0,
-        "top_ip_pv_buckets": {
-            "1": 0,
-            "2-10": 0,
-            "11-100": 0,
-            "101-1000": 0,
-            ">1000": 0,
-        },
-    }
-def _ip_debug_category(ip_address: str) -> str:
-    try:
-        parsed_ip = ipaddress.ip_address(ip_address.strip())
-    except ValueError:
-        return "invalid"
-    if parsed_ip.is_global:
-        return "global"
-    if parsed_ip.is_loopback:
-        return "loopback"
-    if parsed_ip.is_private:
-        return "private"
-    if parsed_ip.is_reserved:
-        return "reserved"
-    if parsed_ip.is_link_local:
-        return "link_local"
-    if parsed_ip.is_multicast:
-        return "multicast"
-    if parsed_ip.is_unspecified:
-        return "unspecified"
-    return "reserved"
-def _ip_pv_bucket(pv: int) -> str:
-    if pv <= 1:
-        return "1"
-    if pv <= 10:
-        return "2-10"
-    if pv <= 100:
-        return "11-100"
-    if pv <= 1000:
-        return "101-1000"
-    return ">1000"
class GeoIpCountryReader(Protocol):
    """Structural type for anything offering a ``country(ip_address)`` lookup."""

    def country(self, ip_address: str) -> Any:
        """Look up the country record for ``ip_address``."""
        ...
class GeoIpResolver:
    """Resolve IP addresses to ``(country_code, country_name)`` pairs.

    The reader is either injected or opened from ``database_path`` on first
    use; opening is attempted at most once.
    """

    def __init__(
        self,
        database_path: str | Path | None = None,
        reader: GeoIpCountryReader | None = None,
    ) -> None:
        """Remember the database location and/or an already opened reader."""
        self.database_path = Path(database_path) if database_path else None
        self._reader = reader
        # An injected reader counts as a completed load attempt.
        self._load_attempted = reader is not None

    def resolve_country(self, ip_address: str) -> tuple[str, str]:
        """Return the country code and name for ``ip_address``.

        The ``Unknown`` pair is returned for unparsable or non-global
        addresses, when no reader is available, when the lookup raises, or
        when the record has no ISO code. ``registered_country`` is consulted
        when ``country`` has no ISO code, and the code doubles as the name
        when the record has no name.
        """
        unknown = (UNKNOWN_COUNTRY_CODE, UNKNOWN_COUNTRY_NAME)
        try:
            parsed_ip = ipaddress.ip_address(ip_address.strip())
        except ValueError:
            return unknown
        if not parsed_ip.is_global:
            return unknown
        reader = self._get_reader()
        if reader is None:
            return unknown
        try:
            response = reader.country(str(parsed_ip))
        except Exception:
            return unknown
        record = response.country
        if not getattr(record, "iso_code", None):
            record = response.registered_country
        iso_code = getattr(record, "iso_code", None)
        if not iso_code:
            return unknown
        display_name = getattr(record, "name", None) or iso_code
        return iso_code, display_name

    def debug_status(self) -> dict[str, object]:
        """Describe the database path and whether a reader has been loaded."""
        path = self.database_path
        return {
            "database_path": str(path) if path else "",
            "database_configured": path is not None,
            "database_exists": path.exists() if path else False,
            "load_attempted": self._load_attempted,
            "reader_loaded": self._reader is not None,
        }

    def _get_reader(self) -> GeoIpCountryReader | None:
        """Return the reader, opening the database on the first call only."""
        if self._reader is not None:
            return self._reader
        if self._load_attempted:
            return None
        self._load_attempted = True
        path = self.database_path
        if path is None or not path.exists():
            return None
        try:
            # Imported lazily, inside the try, so an import failure also
            # results in no reader.
            import geoip2.database
            self._reader = geoip2.database.Reader(str(path))
        except Exception:
            return None
        return self._reader
class AnalyticsService:
    """Turn repository aggregation rows into data frames and summary dicts."""

    def __init__(
        self,
        repository: AnalyticsRepository,
        geoip_database_path: str | Path | None = None,
        geoip_resolver: GeoIpResolver | None = None,
    ) -> None:
        """Store the repository and use the given resolver or build one from the path."""
        self.repository = repository
        self.geoip_resolver = (
            geoip_resolver if geoip_resolver else GeoIpResolver(geoip_database_path)
        )

    def get_overview(self, filters: QueryFilters) -> tuple[pd.DataFrame, dict]:
        """Return the overview time series and the range totals.

        The totals hold integer ``pv``, ``uv``, ``sessions`` and ``events``
        plus the ratios ``events_per_session`` and ``sessions_per_visitor``
        rounded to two decimals (``0.0`` when the denominator is zero).
        """
        frame = pd.DataFrame(self.repository.overview_timeseries(filters))
        raw_totals = self.repository.overview_totals(filters)
        totals = {
            metric: int(raw_totals.get(metric, 0))
            for metric in ("pv", "uv", "sessions", "events")
        }
        if totals["sessions"]:
            totals["events_per_session"] = round(totals["events"] / totals["sessions"], 2)
        else:
            totals["events_per_session"] = 0.0
        if totals["uv"]:
            totals["sessions_per_visitor"] = round(totals["sessions"] / totals["uv"], 2)
        else:
            totals["sessions_per_visitor"] = 0.0
        return frame, totals

    def get_benchmark_top(self, filters: QueryFilters) -> pd.DataFrame:
        """Return the repository's benchmark ranking as a data frame."""
        return pd.DataFrame(self.repository.benchmark_top(filters))

    def get_filter_distribution(self, filters: QueryFilters) -> pd.DataFrame:
        """Return the repository's filter-change distribution as a data frame."""
        return pd.DataFrame(self.repository.filter_distribution(filters))

    def get_funnel(self, filters: QueryFilters) -> pd.DataFrame:
        """Return the four funnel steps with session counts and conversion rates.

        ``conversion_rate`` is each step's share of the first step in
        percent, rounded to two decimals, or ``0.0`` when the first step is 0.
        """
        raw = self.repository.safe_first(self.repository.funnel(filters))
        step_sources = (
            ("page_view", "step1_page_view"),
            ("benchmark_change", "step2_benchmark_change"),
            ("filter_change_*", "step3_filter_change"),
            ("table_download", "step4_table_download"),
        )
        frame = pd.DataFrame(
            [{"step": step, "sessions": raw.get(key, 0)} for step, key in step_sources]
        )
        step1 = int(frame.iloc[0]["sessions"]) if not frame.empty else 0
        frame["conversion_rate"] = frame["sessions"].apply(
            lambda sessions: round((sessions / step1) * 100, 2) if step1 else 0.0
        )
        return frame

    def get_new_vs_returning(self, filters: QueryFilters) -> pd.DataFrame:
        """Return new/returning visitor counts with a ``visitor_type`` label column."""
        frame = pd.DataFrame(self.repository.visitors_new_vs_returning(filters))
        if not frame.empty:
            frame["visitor_type"] = frame["is_new"].map({True: "new", False: "returning"})
        return frame

    def get_visitor_locations(self, filters: QueryFilters) -> pd.DataFrame:
        """Return only the per-country frame of ``get_visitor_location_details``."""
        return self.get_visitor_location_details(filters)[0]

    def get_visitor_location_details(self, filters: QueryFilters) -> tuple[pd.DataFrame, dict]:
        """Aggregate page views per country and collect IP debug counters.

        Returns:
            A frame with ``country_code``, ``country_name``, ``pv`` and
            ``ip_count`` sorted by ``pv`` then ``ip_count`` descending, and
            the debug summary filled from every non-blank IP row.
        """
        locations: dict[tuple[str, str], dict[str, int | str]] = {}
        ip_debug = _empty_ip_debug()
        for row in self.repository.visitor_ip_counts(filters):
            ip = str(row.get("ip", "")).strip()
            if not ip:
                continue
            pv = int(row.get("pv", 0))
            category = _ip_debug_category(ip)
            ip_debug["total_unique_ips"] = int(ip_debug["total_unique_ips"]) + 1
            ip_debug["total_ip_pv"] = int(ip_debug["total_ip_pv"]) + pv
            ip_debug[f"{category}_ips"] = int(ip_debug[f"{category}_ips"]) + 1
            ip_debug[f"{category}_ip_pv"] = int(ip_debug[f"{category}_ip_pv"]) + pv
            ip_debug["top_ip_pv_buckets"][_ip_pv_bucket(pv)] += 1  # type: ignore[index]
            code, name = self.geoip_resolver.resolve_country(ip)
            entry = locations.setdefault(
                (code, name),
                {"country_code": code, "country_name": name, "pv": 0, "ip_count": 0},
            )
            entry["pv"] = int(entry["pv"]) + pv
            entry["ip_count"] = int(entry["ip_count"]) + 1
        frame = pd.DataFrame(
            locations.values(),
            columns=["country_code", "country_name", "pv", "ip_count"],
        )
        if not frame.empty:
            frame = frame.sort_values(
                ["pv", "ip_count"], ascending=[False, False]
            ).reset_index(drop=True)
        return frame, ip_debug

    def get_geoip_debug_info(self) -> dict[str, object]:
        """Return the resolver's ``debug_status()`` or an all-empty status.

        The fallback is used for resolvers that have no ``debug_status``
        attribute.
        """
        debug_status = getattr(self.geoip_resolver, "debug_status", None)
        if debug_status is not None:
            return debug_status()
        return {
            "database_path": "",
            "database_configured": False,
            "database_exists": False,
            "load_attempted": False,
            "reader_loaded": False,
        }

    def get_available_benchmarks(self, filters: QueryFilters | None = None) -> list[str]:
        """Return the repository's list of benchmark names for ``filters``."""
        return self.repository.available_benchmarks(filters)

src/leaderboard_analytics/ui.py DELETED Viewed

@@ -1,481 +0,0 @@
-import math
-import tempfile
-import zipfile
-from datetime import UTC, datetime, timedelta
-from pathlib import Path
-from typing import Any
-import gradio as gr
-import pandas as pd
-import plotly.express as px
-import plotly.graph_objects as go
-from leaderboard_analytics.schemas import Granularity, QueryFilters
-from leaderboard_analytics.services import AnalyticsService
-def _to_utc_datetime(value: Any, fallback: datetime) -> datetime:
-    """Coerce a Gradio DateTime payload into a timezone-aware UTC datetime.
-
-    Args:
-        value: ``None``, a blank string, a NaN float, a ``datetime``, a Unix
-            timestamp (``int``/``float``) or an ISO-8601 string.
-        fallback: Returned as-is when ``value`` carries no usable timestamp.
-
-    Returns:
-        ``fallback`` for missing input, otherwise the parsed value in UTC.
-
-    Raises:
-        ValueError: If ``value`` has an unsupported type or is a string that
-            ``datetime.fromisoformat`` cannot parse.
-    """
-    if value is None:
-        return fallback
-    if isinstance(value, str):
-        # Surrounding whitespace would make fromisoformat() raise, and a
-        # whitespace-only string means "no value" just like "".
-        text = value.strip()
-        if not text:
-            return fallback
-        dt = datetime.fromisoformat(text)
-    elif isinstance(value, datetime):
-        dt = value
-    elif isinstance(value, (int, float)):
-        if isinstance(value, float) and math.isnan(value):
-            return fallback
-        # Gradio DateTime may return Unix timestamps as numbers.
-        dt = datetime.fromtimestamp(value, tz=UTC)
-    else:
-        raise ValueError(f"Unsupported datetime value type: {type(value)!r}")
-    # Naive values are stamped as UTC (not converted from local time).
-    if dt.tzinfo is None:
-        dt = dt.replace(tzinfo=UTC)
-    return dt.astimezone(UTC)
-def _empty_plot(title: str):
-    """Build a Plotly Express line figure that is given only ``title`` and no data."""
-    placeholder = px.line(title=title)
-    return placeholder
-def _empty_map(title: str):
-    """Return a trace-less figure carrying the shared visitor-map styling and ``title``."""
-    blank = go.Figure()
-    _style_visitor_location_map(blank, title)
-    return blank
-def _query_range_text(filters: QueryFilters) -> str:
-    """Render the filter window as ``<start ISO> to <end ISO>``."""
-    start_iso = filters.start_time.isoformat()
-    end_iso = filters.end_time.isoformat()
-    return f"{start_iso} to {end_iso}"
-def _write_csv_archive(tables: dict[str, pd.DataFrame]) -> str | None:
-    """Bundle every table into a temporary zip archive of CSV files.
-
-    Returns the archive path, or ``None`` when every table is empty. The
-    temporary file is created with ``delete=False`` and is not removed here.
-    """
-    has_data = any(not frame.empty for frame in tables.values())
-    if not has_data:
-        return None
-    # Reserve a unique path first; the zip writer reopens it by name below.
-    with tempfile.NamedTemporaryFile(
-        prefix="leaderboard-analytics-", suffix=".zip", delete=False
-    ) as handle:
-        archive_path = handle.name
-    with zipfile.ZipFile(archive_path, "w", compression=zipfile.ZIP_DEFLATED) as bundle:
-        for stem, frame in tables.items():
-            csv_text = frame.to_csv(index=False)
-            bundle.writestr(f"{stem}.csv", csv_text)
-    return archive_path
-def _visitor_location_top_table(visitor_locations: pd.DataFrame) -> pd.DataFrame:
-    """Return the ten leading rows by ``ip_count`` (ties broken by ``pv``).
-
-    The result has exactly two columns, ``Region`` and ``Users``; an empty
-    input yields an empty frame with those columns.
-    """
-    wanted = ["Region", "Users"]
-    if visitor_locations.empty:
-        return pd.DataFrame(columns=wanted)
-    ranked = visitor_locations.sort_values(["ip_count", "pv"], ascending=[False, False])
-    leaders = ranked.head(10)
-    relabelled = leaders.rename(columns={"country_name": "Region", "ip_count": "Users"})
-    return relabelled[wanted].reset_index(drop=True)
-def _visitor_location_debug_text(
-    visitor_locations: pd.DataFrame,
-    geoip_debug: dict[str, object],
-    ip_debug: dict[str, object] | None = None,
-) -> str:
-    """Format the Markdown diagnostics shown next to the visitor map.
-
-    Summarises GeoIP database flags, PV/IP totals, the share attributed to
-    the ``"Unknown"`` country code, and per-category IP counters. Missing
-    ``ip_debug`` keys count as zero.
-    """
-    total_pv = total_users = mapped_regions = unknown_pv = unknown_users = 0
-    if not visitor_locations.empty:
-        codes = visitor_locations["country_code"]
-        unknown_rows = visitor_locations[codes == "Unknown"]
-        mapped_regions = len(visitor_locations[codes != "Unknown"])
-        total_pv = int(visitor_locations["pv"].sum())
-        total_users = int(visitor_locations["ip_count"].sum())
-        if not unknown_rows.empty:
-            unknown_pv = int(unknown_rows["pv"].sum())
-            unknown_users = int(unknown_rows["ip_count"].sum())
-
-    def yes_no(flag_name: str) -> str:
-        return "yes" if geoip_debug.get(flag_name) else "no"
-
-    db_path = geoip_debug.get("database_path") or "(not configured)"
-    ip_stats = ip_debug or {}
-    counter_names = (
-        "global_ips",
-        "global_ip_pv",
-        "private_ips",
-        "private_ip_pv",
-        "loopback_ips",
-        "loopback_ip_pv",
-        "invalid_ips",
-        "invalid_ip_pv",
-    )
-    n = {name: int(ip_stats.get(name, 0)) for name in counter_names}
-    buckets = ip_stats.get("top_ip_pv_buckets", {})
-    lines = [
-        f"GeoIP DB: configured={yes_no('database_configured')}, "
-        f"exists={yes_no('database_exists')}, loaded={yes_no('reader_loaded')}, "
-        f"load_attempted={yes_no('load_attempted')}",
-        f"GeoIP path: `{db_path}`",
-        f"Total location PV: {total_pv} | Users/IPs: {total_users} | "
-        f"Mapped regions: {mapped_regions}",
-        f"Unknown PV: {unknown_pv} | Unknown users/IPs: {unknown_users}",
-        f"Public IPs: {n['global_ips']} ({n['global_ip_pv']} PV) | "
-        f"Private IPs: {n['private_ips']} ({n['private_ip_pv']} PV)",
-        f"Loopback IPs: {n['loopback_ips']} ({n['loopback_ip_pv']} PV) | "
-        f"Invalid IPs: {n['invalid_ips']} ({n['invalid_ip_pv']} PV)",
-        f"PV/IP buckets: {buckets}",
-    ]
-    # Two trailing spaces before the newline form a Markdown hard line break.
-    return "  \n".join(lines)
-def _style_visitor_location_map(figure: go.Figure, title: str) -> None:
-    """Apply the shared geo and layout styling to ``figure`` in place."""
-    border_color = "#cfd6df"
-    water_color = "#f8fafc"
-    white = "#ffffff"
-    geo_options = {
-        "projection_type": "mercator",
-        "showframe": False,
-        "showcoastlines": True,
-        "coastlinecolor": border_color,
-        "coastlinewidth": 0.6,
-        "showcountries": True,
-        "countrycolor": border_color,
-        "countrywidth": 0.7,
-        "showland": True,
-        "landcolor": "#eef2f7",
-        "showocean": True,
-        "oceancolor": water_color,
-        "showlakes": True,
-        "lakecolor": water_color,
-        "bgcolor": white,
-        "lataxis_range": [-55, 75],
-        "lonaxis_range": [-180, 180],
-    }
-    figure.update_geos(**geo_options)
-    layout_options = {
-        "title": {"text": title, "x": 0.02, "xanchor": "left"},
-        "height": 560,
-        "paper_bgcolor": white,
-        "plot_bgcolor": white,
-        "font": {"color": "#1f2937"},
-        "margin": {"l": 0, "r": 0, "t": 52, "b": 0},
-        "showlegend": False,
-        "hoverlabel": {
-            "bgcolor": white,
-            "bordercolor": "#3b82f6",
-            "font_color": "#111827",
-        },
-    }
-    figure.update_layout(**layout_options)
-def _visitor_location_map(visitor_locations: pd.DataFrame, range_text: str) -> go.Figure:
-    """Draw one bubble per mapped country, with bubble area driven by ``pv``.
-
-    Rows whose ``country_code`` is ``"Unknown"`` are left off the map. When
-    nothing remains, a styled empty map titled with ``range_text`` is returned.
-    """
-    if visitor_locations.empty:
-        map_df = visitor_locations.copy()
-    else:
-        map_df = visitor_locations[visitor_locations["country_code"] != "Unknown"].copy()
-    if map_df.empty:
-        return _empty_map(f"Visitor locations by country (no mapped data for {range_text})")
-    peak_pv = max(int(map_df["pv"].max()), 1)
-    # sizeref is derived from the largest PV and a fixed constant of 52.
-    size_ref = 2.0 * peak_pv / (52**2)
-    hover_template = (
-        "<b>%{text}</b><br>"
-        "Country code: %{customdata[0]}<br>"
-        "PV: %{customdata[1]:,}<br>"
-        "Users/IPs: %{customdata[2]:,}<extra></extra>"
-    )
-    bubble_style = {
-        "size": map_df["pv"],
-        "sizemode": "area",
-        "sizeref": size_ref,
-        "sizemin": 8,
-        "color": "rgba(59, 130, 246, 0.55)",
-        "line": {"color": "rgba(37, 99, 235, 0.92)", "width": 1.2},
-    }
-    bubbles = go.Scattergeo(
-        locationmode="country names",
-        locations=map_df["country_name"],
-        mode="markers",
-        text=map_df["country_name"],
-        customdata=map_df[["country_code", "pv", "ip_count"]],
-        hovertemplate=hover_template,
-        marker=bubble_style,
-    )
-    figure = go.Figure(bubbles)
-    _style_visitor_location_map(figure, "Visitor locations by country")
-    summary = (
-        f"Mapped regions: {len(map_df)}<br>"
-        f"Mapped PV: {int(map_df['pv'].sum()):,}<br>"
-        f"Users/IPs: {int(map_df['ip_count'].sum()):,}"
-    )
-    figure.add_annotation(
-        x=0.02,
-        y=0.08,
-        xref="paper",
-        yref="paper",
-        text=summary,
-        showarrow=False,
-        align="left",
-        bgcolor="rgba(255, 255, 255, 0.88)",
-        bordercolor="rgba(148, 163, 184, 0.55)",
-        borderwidth=1,
-        font={"color": "#1f2937", "size": 12},
-    )
-    return figure
-def build_dashboard(service: AnalyticsService) -> gr.Blocks:
-    """Assemble the Gradio Blocks dashboard backed by ``service``.
-
-    Defines the two event callbacks (``load_benchmarks`` and ``query``), lays
-    out the inputs, charts and raw-data tables, and wires ``query`` to both
-    the Refresh button and the page-load event.
-    """
-    # The default window (last 7 days) is computed once, when the dashboard is
-    # built; the same two values are reused as fallbacks on every query.
-    default_end = datetime.now(tz=UTC)
-    default_start = (default_end - timedelta(days=7)).replace(microsecond=0)
-    def load_benchmarks() -> object:
-        """Populate the benchmark dropdown; any service error yields no choices."""
-        try:
-            benchmarks = service.get_available_benchmarks()
-        except Exception:
-            benchmarks = []
-        # The leading "" entry is the "no benchmark filter" choice.
-        return gr.update(choices=[""] + benchmarks, value="")
-    def query(
-        start_time: datetime | str | None,
-        end_time: datetime | str | None,
-        benchmark: str,
-        granularity: str,
-    ) -> tuple[
-        object,
-        object,
-        object,
-        object,
-        object,
-        object,
-        object,
-        object,
-        object,
-        object,
-        object,
-        object,
-        object,
-        object,
-        object,
-        object,
-    ]:
-        """Run every analytics query and return one value per output component.
-
-        The 16-element tuple is positional: it must stay in the same order as
-        the ``outputs`` list defined further down. Any exception is caught and
-        turned into a tuple of placeholder values carrying the error message.
-        """
-        try:
-            filters = QueryFilters(
-                start_time=_to_utc_datetime(start_time, default_start),
-                end_time=_to_utc_datetime(end_time, default_end),
-                benchmark=benchmark or None,
-                granularity=Granularity(granularity),
-            )
-            overview_df, totals = service.get_overview(filters)
-            benchmark_df = service.get_benchmark_top(filters)
-            filter_df = service.get_filter_distribution(filters)
-            funnel_df = service.get_funnel(filters)
-            visitors_df = service.get_new_vs_returning(filters)
-            visitor_locations_df, ip_debug = service.get_visitor_location_details(filters)
-            visitor_locations_top_df = _visitor_location_top_table(visitor_locations_df)
-            visitor_locations_debug = _visitor_location_debug_text(
-                visitor_locations_df,
-                service.get_geoip_debug_info(),
-                ip_debug,
-            )
-            range_text = _query_range_text(filters)
-            # funnel_df is not part of this emptiness check.
-            if (
-                overview_df.empty
-                and benchmark_df.empty
-                and filter_df.empty
-                and visitors_df.empty
-                and visitor_locations_df.empty
-            ):
-                metrics = f"No data for {range_text}."
-            else:
-                metrics = (
-                    f"Range: {range_text}  \n"
-                    f"PV: {totals['pv']} | UV: {totals['uv']} | Sessions: {totals['sessions']} | "
-                    f"Events/Session: {totals['events_per_session']} | "
-                    f"Sessions/Visitor: {totals['sessions_per_visitor']}"
-                )
-            # Each chart falls back to a titled, data-less figure when its frame is empty.
-            overview_plot = (
-                px.line(
-                    overview_df,
-                    x="period",
-                    y=["pv", "uv", "session_count"],
-                    title="Traffic overview",
-                )
-                if not overview_df.empty
-                else _empty_plot(f"Traffic overview (no data for {range_text})")
-            )
-            benchmark_plot = (
-                px.bar(benchmark_df, x="benchmark", y="count", title="Benchmark Top")
-                if not benchmark_df.empty
-                else px.bar(title=f"Benchmark Top (no data for {range_text})")
-            )
-            filter_plot = (
-                px.bar(filter_df, x="event_name", y="count", title="Filter usage")
-                if not filter_df.empty
-                else px.bar(title=f"Filter usage (no data for {range_text})")
-            )
-            # NOTE(review): unlike the other charts, the funnel is built without an
-            # empty-frame guard; presumably get_funnel always returns "sessions" and
-            # "step" columns -- confirm against the service.
-            funnel_plot = px.funnel(funnel_df, x="sessions", y="step", title="Session funnel")
-            visitor_plot = (
-                px.bar(
-                    visitors_df,
-                    x="period",
-                    y="visitor_count",
-                    color="visitor_type",
-                    barmode="group",
-                    title="New vs returning visitors",
-                )
-                if not visitors_df.empty
-                else px.bar(title=f"New vs returning visitors (no data for {range_text})")
-            )
-            visitor_locations_plot = _visitor_location_map(visitor_locations_df, range_text)
-            # Path of a zip with one CSV per table, or None when all tables are empty.
-            csv_archive = _write_csv_archive(
-                {
-                    "overview": overview_df,
-                    "benchmarks": benchmark_df,
-                    "filters": filter_df,
-                    "funnel": funnel_df,
-                    "visitors": visitors_df,
-                    "visitor_locations": visitor_locations_df,
-                }
-            )
-            return (
-                metrics,
-                overview_plot,
-                benchmark_plot,
-                filter_plot,
-                funnel_plot,
-                visitor_plot,
-                visitor_locations_plot,
-                visitor_locations_debug,
-                visitor_locations_top_df,
-                overview_df,
-                benchmark_df,
-                filter_df,
-                funnel_df,
-                visitors_df,
-                visitor_locations_df,
-                csv_archive,
-            )
-        except Exception as exc:
-            # Failure path: same 16 slots, filled with the error message and
-            # empty figures/frames so every component still receives a value.
-            message = f"Query failed: {exc}"
-            empty = pd.DataFrame()
-            empty_top = pd.DataFrame(columns=["Region", "Users"])
-            return (
-                message,
-                _empty_plot(message),
-                px.bar(title=message),
-                px.bar(title=message),
-                px.funnel(
-                    pd.DataFrame({"step": [], "sessions": []}),
-                    x="sessions",
-                    y="step",
-                    title=message,
-                ),
-                px.bar(title=message),
-                _empty_map(message),
-                message,
-                empty_top,
-                empty,
-                empty,
-                empty,
-                empty,
-                empty,
-                empty,
-                None,
-            )
-    with gr.Blocks() as demo:
-        gr.Markdown("# Leaderboard Analytics Dashboard")
-        gr.Markdown(
-            "Analyze MTEB leaderboard behavior from MongoDB event logs. "
-            "All metrics follow event-log-spec definitions."
-        )
-        # Filter controls.
-        with gr.Row():
-            start_time = gr.DateTime(
-                label="Start time",
-                value=default_start,
-                timezone="UTC",
-            )
-            end_time = gr.DateTime(
-                label="End time",
-                value=default_end,
-                timezone="UTC",
-            )
-            # Starts with only the blank choice; load_benchmarks fills it on page load.
-            benchmark = gr.Dropdown(
-                label="Benchmark",
-                choices=[""],
-                value="",
-                allow_custom_value=True,
-            )
-            granularity = gr.Dropdown(
-                label="Granularity",
-                choices=[Granularity.DAY.value, Granularity.WEEK.value, Granularity.MONTH.value],
-                value=Granularity.DAY.value,
-            )
-            refresh = gr.Button("Refresh", variant="primary")
-        # Placeholder text shown until the first query result arrives.
-        metrics_text = gr.Markdown(
-            "PV: 0 | UV: 0 | Sessions: 0 | Events/Session: 0 | Sessions/Visitor: 0"
-        )
-        # Charts.
-        with gr.Row():
-            overview_plot = gr.Plot(label="Traffic Overview")
-            benchmark_plot = gr.Plot(label="Benchmark Analysis")
-        with gr.Row():
-            filter_plot = gr.Plot(label="Filter Behavior")
-            funnel_plot = gr.Plot(label="Funnel")
-        visitor_plot = gr.Plot(label="Visitor Segmentation")
-        # Visitor map (2/3 width) beside its diagnostics and top-10 table (1/3 width).
-        with gr.Row():
-            with gr.Column(scale=2):
-                visitor_locations_plot = gr.Plot(label="Visitor Locations")
-            with gr.Column(scale=1):
-                visitor_locations_debug = gr.Markdown(
-                    "GeoIP DB: not checked  \n"
-                    "Total location PV: 0 | Users/IPs: 0 | Mapped regions: 0"
-                )
-                visitor_locations_top_table = gr.DataFrame(
-                    label="Top 10 Regions",
-                    interactive=False,
-                    wrap=True,
-                )
-        # Collapsed section with the CSV download and the raw frames behind each chart.
-        with gr.Accordion("Raw data", open=False):
-            csv_file = gr.File(label="CSV export")
-            overview_table = gr.DataFrame(label="Traffic Overview")
-            benchmark_table = gr.DataFrame(label="Benchmark Analysis")
-            filter_table = gr.DataFrame(label="Filter Behavior")
-            funnel_table = gr.DataFrame(label="Funnel")
-            visitor_table = gr.DataFrame(label="Visitor Segmentation")
-            visitor_locations_table = gr.DataFrame(label="Visitor Locations")
-        # Order matters: it must mirror the tuples returned by query().
-        outputs = [
-            metrics_text,
-            overview_plot,
-            benchmark_plot,
-            filter_plot,
-            funnel_plot,
-            visitor_plot,
-            visitor_locations_plot,
-            visitor_locations_debug,
-            visitor_locations_top_table,
-            overview_table,
-            benchmark_table,
-            filter_table,
-            funnel_table,
-            visitor_table,
-            visitor_locations_table,
-            csv_file,
-        ]
-        refresh.click(
-            fn=query,
-            inputs=[start_time, end_time, benchmark, granularity],
-            outputs=outputs,
-        )
-        # On page load: fill the benchmark dropdown and run an initial query.
-        demo.load(fn=load_benchmarks, outputs=benchmark)
-        demo.load(
-            fn=query,
-            inputs=[start_time, end_time, benchmark, granularity],
-            outputs=outputs,
-        )
-    # Create the system temp directory if it does not exist yet.
-    Path(tempfile.gettempdir()).mkdir(parents=True, exist_ok=True)
-    return demo

tests/test_geoip_database.py DELETED Viewed

@@ -1,29 +0,0 @@
-import gzip
-from leaderboard_analytics.geoip_database import ensure_geoip_database
-def test_ensure_geoip_database_downloads_and_decompresses_gzip(tmp_path) -> None:
-    """A missing target is fetched from a ``file://`` gzip URL and stored decompressed."""
-    payload = b"fake-mmdb-bytes"
-    archive_path = tmp_path / "GeoLite2-Country.mmdb.gz"
-    database_path = tmp_path / "GeoLite2-Country.mmdb"
-    with gzip.open(archive_path, "wb") as compressed:
-        compressed.write(payload)
-    returned_path = ensure_geoip_database(database_path, archive_path.as_uri())
-    assert returned_path == database_path
-    assert database_path.read_bytes() == payload
-def test_ensure_geoip_database_keeps_existing_file(tmp_path) -> None:
-    """An already-present target is returned untouched, even if the source URL is missing."""
-    original_bytes = b"existing-mmdb-bytes"
-    database_path = tmp_path / "GeoLite2-Country.mmdb"
-    database_path.write_bytes(original_bytes)
-    absent_source = tmp_path / "missing.mmdb.gz"
-    returned_path = ensure_geoip_database(database_path, absent_source.as_uri())
-    assert returned_path == database_path
-    assert database_path.read_bytes() == original_bytes

tests/test_repositories.py DELETED Viewed

@@ -1,95 +0,0 @@
-from datetime import UTC, datetime
-from leaderboard_analytics.repositories import AnalyticsRepository
-from leaderboard_analytics.schemas import QueryFilters
-class CapturingCollection:
-    """Collection stand-in that records the last aggregation pipeline it was given."""
-    def __init__(self, rows: list[dict] | None = None) -> None:
-        # Canned documents handed back by aggregate(); falsy input becomes a new list.
-        self.rows = rows if rows else []
-        # Stays None until aggregate() has been called.
-        self.pipeline: list[dict] | None = None
-    def aggregate(self, pipeline: list[dict]):
-        """Remember ``pipeline`` and return an iterator over the canned rows."""
-        self.pipeline = pipeline
-        canned = iter(self.rows)
-        return canned
-def _filters() -> QueryFilters:
-    """Build the January 2026 UTC query window shared by the repository tests."""
-    window_start = datetime(2026, 1, 1, tzinfo=UTC)
-    window_end = datetime(2026, 1, 31, tzinfo=UTC)
-    return QueryFilters(start_time=window_start, end_time=window_end)
-def test_funnel_pipeline_preserves_ordered_step_logic() -> None:
-    """The funnel pipeline sorts per session, pushes events in order, and compares step times."""
-    recorder = CapturingCollection()
-    AnalyticsRepository(recorder).funnel(_filters())  # type: ignore[arg-type]
-    stages = recorder.pipeline
-    assert stages is not None
-    assert {"$sort": {"session_id": 1, "event_ts": 1}} in stages
-    pushes_events = ["$push" in stage.get("$group", {}).get("events", {}) for stage in stages]
-    assert any(pushes_events)
-    stage_texts = [str(stage) for stage in stages]
-    # An $addToSet over events would discard ordering, so it must not appear.
-    assert all("$addToSet" not in text or "events" not in text for text in stage_texts)
-    assert any(
-        "table_download_at" in text and "$filter_change_at" in text for text in stage_texts
-    )
-def test_new_vs_returning_pipeline_computes_first_seen_before_range_match() -> None:
-    """The ``$setWindowFields`` stage must precede the ``event_ts`` range ``$match``."""
-    recorder = CapturingCollection()
-    AnalyticsRepository(recorder).visitors_new_vs_returning(_filters())  # type: ignore[arg-type]
-    stages = recorder.pipeline
-    assert stages is not None
-    window_positions = (pos for pos, stage in enumerate(stages) if "$setWindowFields" in stage)
-    range_positions = (
-        pos
-        for pos, stage in enumerate(stages)
-        if stage.get("$match", {}).get("event_ts") is not None
-    )
-    first_window = next(window_positions)
-    first_range_match = next(range_positions)
-    assert first_window < first_range_match
-def test_overview_totals_filters_empty_identifiers() -> None:
-    """Totals pass through unchanged and the pipeline references the id sets and their loop vars."""
-    canned_totals = {"pv": 1, "uv": 1, "sessions": 1, "events": 2}
-    recorder = CapturingCollection([dict(canned_totals)])
-    repository = AnalyticsRepository(recorder)  # type: ignore[arg-type]
-    assert repository.overview_totals(_filters()) == canned_totals
-    assert recorder.pipeline is not None
-    rendered = str(recorder.pipeline)
-    for field_ref in ("$sessions", "$visitors"):
-        assert f'"{field_ref}"' in rendered or f"'{field_ref}'" in rendered
-    for loop_var in ("$$s", "$$v"):
-        assert loop_var in rendered
-def test_visitor_ip_counts_groups_page_view_ips_with_existing_filters() -> None:
-    """Rows are returned as-is and the pipeline mentions the IP field, event name and benchmark."""
-    canned_rows = [{"ip": "8.8.8.8", "pv": 3}]
-    recorder = CapturingCollection(canned_rows)
-    repository = AnalyticsRepository(recorder)  # type: ignore[arg-type]
-    benchmark_filters = QueryFilters(
-        start_time=datetime(2026, 1, 1, tzinfo=UTC),
-        end_time=datetime(2026, 1, 31, tzinfo=UTC),
-        benchmark="MTEB",
-    )
-    assert repository.visitor_ip_counts(benchmark_filters) == [{"ip": "8.8.8.8", "pv": 3}]
-    assert recorder.pipeline is not None
-    rendered = str(recorder.pipeline)
-    for fragment in ("properties.ip", "page_view", "MTEB", "$nin", "$properties.ip"):
-        assert fragment in rendered

tests/test_schemas.py DELETED Viewed

@@ -1,16 +0,0 @@
-from datetime import UTC, datetime
-import pytest
-from pydantic import ValidationError
-from leaderboard_analytics.schemas import QueryFilters
-def test_query_filters_rejects_invalid_time_range() -> None:
-    """A start time later than the end time fails validation with the expected message."""
-    later = datetime(2026, 1, 2, tzinfo=UTC)
-    earlier = datetime(2026, 1, 1, tzinfo=UTC)
-    expected_message = "start_time must be earlier than or equal to end_time"
-    with pytest.raises(ValidationError, match=expected_message):
-        QueryFilters(start_time=later, end_time=earlier)

tests/test_services.py DELETED Viewed

@@ -1,110 +0,0 @@
-from datetime import UTC, datetime
-from pathlib import Path
-from leaderboard_analytics.schemas import QueryFilters
-from leaderboard_analytics.services import AnalyticsService
-class FakeRepository:
-    """Repository stub returning fixed overview rows and totals, ignoring ``filters``."""
-    def overview_timeseries(self, filters: QueryFilters) -> list[dict]:
-        """Return two fixed daily rows for 2026-01-01 and 2026-01-02."""
-        day_one = {"period": "2026-01-01", "pv": 2, "uv": 1, "session_count": 1, "event_count": 3}
-        day_two = {"period": "2026-01-02", "pv": 1, "uv": 1, "session_count": 1, "event_count": 2}
-        return [day_one, day_two]
-    def overview_totals(self, filters: QueryFilters) -> dict:
-        """Return fixed whole-range totals."""
-        return dict(pv=3, uv=1, sessions=1, events=5)
-class LocationRepository:
-    """Repository stub that serves a preset list of per-IP page-view rows."""
-    def __init__(self, rows: list[dict]) -> None:
-        # Stored by reference; handed back unchanged by visitor_ip_counts().
-        self.rows = rows
-    def visitor_ip_counts(self, filters: QueryFilters) -> list[dict]:
-        """Return the preset rows, ignoring ``filters``."""
-        preset = self.rows
-        return preset
-class FakeGeoIpResolver:
-    """Resolver stub backed by an in-memory mapping from IP to ``(code, name)``."""
-    def __init__(self, countries: dict[str, tuple[str, str]]) -> None:
-        # Lookup table consulted by resolve_country().
-        self.countries = countries
-    def resolve_country(self, ip_address: str) -> tuple[str, str]:
-        """Look up ``ip_address``; an unmapped address raises ``KeyError``."""
-        country = self.countries[ip_address]
-        return country
-def test_overview_uses_full_range_distinct_totals() -> None:
-    """Totals come from the repository's full-range figures plus the two derived ratios."""
-    two_day_window = QueryFilters(
-        start_time=datetime(2026, 1, 1, tzinfo=UTC),
-        end_time=datetime(2026, 1, 2, tzinfo=UTC),
-    )
-    service = AnalyticsService(FakeRepository())  # type: ignore[arg-type]
-    frame, totals = service.get_overview(two_day_window)
-    assert list(frame["period"]) == ["2026-01-01", "2026-01-02"]
-    expected_totals = {
-        "pv": 3,
-        "uv": 1,
-        "sessions": 1,
-        "events": 5,
-        "events_per_session": 5.0,
-        "sessions_per_visitor": 1.0,
-    }
-    assert totals == expected_totals
-def test_visitor_locations_groups_pv_and_ip_count_by_country() -> None:
-    """IPs resolving to the same country are merged; rows come back ordered by PV descending."""
-    ip_rows = [
-        {"ip": "8.8.8.8", "pv": 3},
-        {"ip": "8.8.4.4", "pv": 2},
-        {"ip": "1.1.1.1", "pv": 4},
-    ]
-    country_by_ip = {
-        "8.8.8.8": ("US", "United States"),
-        "8.8.4.4": ("US", "United States"),
-        "1.1.1.1": ("AU", "Australia"),
-    }
-    service = AnalyticsService(
-        LocationRepository(ip_rows),  # type: ignore[arg-type]
-        geoip_resolver=FakeGeoIpResolver(country_by_ip),  # type: ignore[arg-type]
-    )
-    two_day_window = QueryFilters(
-        start_time=datetime(2026, 1, 1, tzinfo=UTC),
-        end_time=datetime(2026, 1, 2, tzinfo=UTC),
-    )
-    records = service.get_visitor_locations(two_day_window).to_dict("records")
-    assert records == [
-        {"country_code": "US", "country_name": "United States", "pv": 5, "ip_count": 2},
-        {"country_code": "AU", "country_name": "Australia", "pv": 4, "ip_count": 1},
-    ]
-def test_visitor_locations_groups_unresolved_ips_as_unknown() -> None:
-    """With a nonexistent GeoIP database path, every IP lands in one ``Unknown`` row."""
-    ip_rows = [
-        {"ip": "10.0.0.1", "pv": 2},
-        {"ip": "not-an-ip", "pv": 1},
-        {"ip": "8.8.8.8", "pv": 3},
-    ]
-    service = AnalyticsService(
-        LocationRepository(ip_rows),  # type: ignore[arg-type]
-        geoip_database_path=Path("missing-geolite2-country.mmdb"),
-    )
-    two_day_window = QueryFilters(
-        start_time=datetime(2026, 1, 1, tzinfo=UTC),
-        end_time=datetime(2026, 1, 2, tzinfo=UTC),
-    )
-    records = service.get_visitor_locations(two_day_window).to_dict("records")
-    assert records == [
-        {"country_code": "Unknown", "country_name": "Unknown", "pv": 6, "ip_count": 3}
-    ]