Spaces:
Runtime error
Runtime error
Commit
·
3ce0948
0
Parent(s):
Duplicate from autoevaluate/model-evaluator
Browse filesCo-authored-by: Lewis Tunstall <lewtun@users.noreply.huggingface.co>
- .env.template +4 -0
- .github/workflows/check_filesize.yml +16 -0
- .github/workflows/quality.yml +29 -0
- .github/workflows/run_evaluation_jobs.yml +30 -0
- .github/workflows/sync_with_spaces.yml +20 -0
- .gitignore +134 -0
- LICENSE +201 -0
- Makefile +8 -0
- README.md +114 -0
- app.py +693 -0
- evaluation.py +57 -0
- images/autotrain_job.png +0 -0
- images/autotrain_projects.png +0 -0
- notebooks/flush-prediction-repos.ipynb +177 -0
- pyproject.toml +2 -0
- requirements.txt +12 -0
- run_evaluation_jobs.py +64 -0
- utils.py +215 -0
.env.template
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
AUTOTRAIN_USERNAME=autoevaluator # The bot or user that authors evaluation jobs
|
| 2 |
+
HF_TOKEN=hf_xxx # An API token of the `autoevaluator` user
|
| 3 |
+
AUTOTRAIN_BACKEND_API=https://api-staging.autotrain.huggingface.co # The AutoTrain backend to send jobs to. Use https://api.autotrain.huggingface.co for prod or http://localhost:8000 for local development
|
| 4 |
+
DATASETS_PREVIEW_API=https://datasets-server.huggingface.co # The API to grab dataset information from
|
.github/workflows/check_filesize.yml
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: Check file size
|
| 2 |
+
on: # or directly `on: [push]` to run the action on every push on any branch
|
| 3 |
+
pull_request:
|
| 4 |
+
branches: [main]
|
| 5 |
+
|
| 6 |
+
# to run this workflow manually from the Actions tab
|
| 7 |
+
workflow_dispatch:
|
| 8 |
+
|
| 9 |
+
jobs:
|
| 10 |
+
sync-to-hub:
|
| 11 |
+
runs-on: ubuntu-latest
|
| 12 |
+
steps:
|
| 13 |
+
- name: Check large files
|
| 14 |
+
uses: ActionsDesk/lfs-warning@v2.0
|
| 15 |
+
with:
|
| 16 |
+
filesizelimit: 10485760 # this is 10MB so we can sync to HF Spaces
|
.github/workflows/quality.yml
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: Code quality
|
| 2 |
+
|
| 3 |
+
on:
|
| 4 |
+
push:
|
| 5 |
+
branches:
|
| 6 |
+
- main
|
| 7 |
+
pull_request:
|
| 8 |
+
branches:
|
| 9 |
+
- main
|
| 10 |
+
|
| 11 |
+
jobs:
|
| 12 |
+
|
| 13 |
+
check_code_quality:
|
| 14 |
+
name: Check code quality
|
| 15 |
+
runs-on: ubuntu-latest
|
| 16 |
+
steps:
|
| 17 |
+
- name: Checkout code
|
| 18 |
+
uses: actions/checkout@v2
|
| 19 |
+
- name: Setup Python environment
|
| 20 |
+
uses: actions/setup-python@v2
|
| 21 |
+
with:
|
| 22 |
+
python-version: 3.9
|
| 23 |
+
- name: Install dependencies
|
| 24 |
+
run: |
|
| 25 |
+
python -m pip install --upgrade pip
|
| 26 |
+
python -m pip install black isort flake8
|
| 27 |
+
- name: Code quality
|
| 28 |
+
run: |
|
| 29 |
+
make quality
|
.github/workflows/run_evaluation_jobs.yml
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: Start evaluation jobs
|
| 2 |
+
|
| 3 |
+
on:
|
| 4 |
+
schedule:
|
| 5 |
+
- cron: '*/15 * * * *' # Start evaluations every 15th minute
|
| 6 |
+
|
| 7 |
+
jobs:
|
| 8 |
+
|
| 9 |
+
build:
|
| 10 |
+
runs-on: ubuntu-latest
|
| 11 |
+
|
| 12 |
+
steps:
|
| 13 |
+
- name: Checkout code
|
| 14 |
+
uses: actions/checkout@v2
|
| 15 |
+
|
| 16 |
+
- name: Setup Python Environment
|
| 17 |
+
uses: actions/setup-python@v2
|
| 18 |
+
with:
|
| 19 |
+
python-version: 3.8
|
| 20 |
+
|
| 21 |
+
- name: Install requirements
|
| 22 |
+
run: pip install -r requirements.txt
|
| 23 |
+
|
| 24 |
+
- name: Execute scoring script
|
| 25 |
+
env:
|
| 26 |
+
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
| 27 |
+
AUTOTRAIN_USERNAME: ${{ secrets.AUTOTRAIN_USERNAME }}
|
| 28 |
+
AUTOTRAIN_BACKEND_API: ${{ secrets.AUTOTRAIN_BACKEND_API }}
|
| 29 |
+
run: |
|
| 30 |
+
HF_TOKEN=$HF_TOKEN AUTOTRAIN_USERNAME=$AUTOTRAIN_USERNAME AUTOTRAIN_BACKEND_API=$AUTOTRAIN_BACKEND_API python run_evaluation_jobs.py
|
.github/workflows/sync_with_spaces.yml
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: Sync to Hugging Face hub
|
| 2 |
+
on:
|
| 3 |
+
push:
|
| 4 |
+
branches: [main]
|
| 5 |
+
|
| 6 |
+
# to run this workflow manually from the Actions tab
|
| 7 |
+
workflow_dispatch:
|
| 8 |
+
|
| 9 |
+
jobs:
|
| 10 |
+
sync-to-hub:
|
| 11 |
+
runs-on: ubuntu-latest
|
| 12 |
+
steps:
|
| 13 |
+
- uses: actions/checkout@v2
|
| 14 |
+
with:
|
| 15 |
+
fetch-depth: 0
|
| 16 |
+
- name: Push to hub
|
| 17 |
+
env:
|
| 18 |
+
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
| 19 |
+
run: |
|
| 20 |
+
git push https://lewtun:$HF_TOKEN@huggingface.co/spaces/autoevaluate/model-evaluator main
|
.gitignore
ADDED
|
@@ -0,0 +1,134 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Byte-compiled / optimized / DLL files
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*$py.class
|
| 5 |
+
|
| 6 |
+
# C extensions
|
| 7 |
+
*.so
|
| 8 |
+
|
| 9 |
+
# Distribution / packaging
|
| 10 |
+
.Python
|
| 11 |
+
build/
|
| 12 |
+
develop-eggs/
|
| 13 |
+
dist/
|
| 14 |
+
downloads/
|
| 15 |
+
eggs/
|
| 16 |
+
.eggs/
|
| 17 |
+
lib/
|
| 18 |
+
lib64/
|
| 19 |
+
parts/
|
| 20 |
+
sdist/
|
| 21 |
+
var/
|
| 22 |
+
wheels/
|
| 23 |
+
pip-wheel-metadata/
|
| 24 |
+
share/python-wheels/
|
| 25 |
+
*.egg-info/
|
| 26 |
+
.installed.cfg
|
| 27 |
+
*.egg
|
| 28 |
+
MANIFEST
|
| 29 |
+
|
| 30 |
+
# PyInstaller
|
| 31 |
+
# Usually these files are written by a python script from a template
|
| 32 |
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
| 33 |
+
*.manifest
|
| 34 |
+
*.spec
|
| 35 |
+
|
| 36 |
+
# Installer logs
|
| 37 |
+
pip-log.txt
|
| 38 |
+
pip-delete-this-directory.txt
|
| 39 |
+
|
| 40 |
+
# Unit test / coverage reports
|
| 41 |
+
htmlcov/
|
| 42 |
+
.tox/
|
| 43 |
+
.nox/
|
| 44 |
+
.coverage
|
| 45 |
+
.coverage.*
|
| 46 |
+
.cache
|
| 47 |
+
nosetests.xml
|
| 48 |
+
coverage.xml
|
| 49 |
+
*.cover
|
| 50 |
+
*.py,cover
|
| 51 |
+
.hypothesis/
|
| 52 |
+
.pytest_cache/
|
| 53 |
+
|
| 54 |
+
# Translations
|
| 55 |
+
*.mo
|
| 56 |
+
*.pot
|
| 57 |
+
|
| 58 |
+
# Django stuff:
|
| 59 |
+
*.log
|
| 60 |
+
local_settings.py
|
| 61 |
+
db.sqlite3
|
| 62 |
+
db.sqlite3-journal
|
| 63 |
+
|
| 64 |
+
# Flask stuff:
|
| 65 |
+
instance/
|
| 66 |
+
.webassets-cache
|
| 67 |
+
|
| 68 |
+
# Scrapy stuff:
|
| 69 |
+
.scrapy
|
| 70 |
+
|
| 71 |
+
# Sphinx documentation
|
| 72 |
+
docs/_build/
|
| 73 |
+
|
| 74 |
+
# PyBuilder
|
| 75 |
+
target/
|
| 76 |
+
|
| 77 |
+
# Jupyter Notebook
|
| 78 |
+
.ipynb_checkpoints
|
| 79 |
+
|
| 80 |
+
# IPython
|
| 81 |
+
profile_default/
|
| 82 |
+
ipython_config.py
|
| 83 |
+
|
| 84 |
+
# pyenv
|
| 85 |
+
.python-version
|
| 86 |
+
|
| 87 |
+
# pipenv
|
| 88 |
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
| 89 |
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
| 90 |
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
| 91 |
+
# install all needed dependencies.
|
| 92 |
+
#Pipfile.lock
|
| 93 |
+
|
| 94 |
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
|
| 95 |
+
__pypackages__/
|
| 96 |
+
|
| 97 |
+
# Celery stuff
|
| 98 |
+
celerybeat-schedule
|
| 99 |
+
celerybeat.pid
|
| 100 |
+
|
| 101 |
+
# SageMath parsed files
|
| 102 |
+
*.sage.py
|
| 103 |
+
|
| 104 |
+
# Environments
|
| 105 |
+
.env
|
| 106 |
+
.venv
|
| 107 |
+
env/
|
| 108 |
+
venv/
|
| 109 |
+
ENV/
|
| 110 |
+
env.bak/
|
| 111 |
+
venv.bak/
|
| 112 |
+
|
| 113 |
+
# Spyder project settings
|
| 114 |
+
.spyderproject
|
| 115 |
+
.spyproject
|
| 116 |
+
|
| 117 |
+
# Rope project settings
|
| 118 |
+
.ropeproject
|
| 119 |
+
|
| 120 |
+
# mkdocs documentation
|
| 121 |
+
/site
|
| 122 |
+
|
| 123 |
+
# mypy
|
| 124 |
+
.mypy_cache/
|
| 125 |
+
.dmypy.json
|
| 126 |
+
dmypy.json
|
| 127 |
+
|
| 128 |
+
# Pyre type checker
|
| 129 |
+
.pyre/
|
| 130 |
+
|
| 131 |
+
scratch/
|
| 132 |
+
|
| 133 |
+
# Evaluation job logs
|
| 134 |
+
evaluation-job-logs/
|
LICENSE
ADDED
|
@@ -0,0 +1,201 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Apache License
|
| 2 |
+
Version 2.0, January 2004
|
| 3 |
+
http://www.apache.org/licenses/
|
| 4 |
+
|
| 5 |
+
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
| 6 |
+
|
| 7 |
+
1. Definitions.
|
| 8 |
+
|
| 9 |
+
"License" shall mean the terms and conditions for use, reproduction,
|
| 10 |
+
and distribution as defined by Sections 1 through 9 of this document.
|
| 11 |
+
|
| 12 |
+
"Licensor" shall mean the copyright owner or entity authorized by
|
| 13 |
+
the copyright owner that is granting the License.
|
| 14 |
+
|
| 15 |
+
"Legal Entity" shall mean the union of the acting entity and all
|
| 16 |
+
other entities that control, are controlled by, or are under common
|
| 17 |
+
control with that entity. For the purposes of this definition,
|
| 18 |
+
"control" means (i) the power, direct or indirect, to cause the
|
| 19 |
+
direction or management of such entity, whether by contract or
|
| 20 |
+
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
| 21 |
+
outstanding shares, or (iii) beneficial ownership of such entity.
|
| 22 |
+
|
| 23 |
+
"You" (or "Your") shall mean an individual or Legal Entity
|
| 24 |
+
exercising permissions granted by this License.
|
| 25 |
+
|
| 26 |
+
"Source" form shall mean the preferred form for making modifications,
|
| 27 |
+
including but not limited to software source code, documentation
|
| 28 |
+
source, and configuration files.
|
| 29 |
+
|
| 30 |
+
"Object" form shall mean any form resulting from mechanical
|
| 31 |
+
transformation or translation of a Source form, including but
|
| 32 |
+
not limited to compiled object code, generated documentation,
|
| 33 |
+
and conversions to other media types.
|
| 34 |
+
|
| 35 |
+
"Work" shall mean the work of authorship, whether in Source or
|
| 36 |
+
Object form, made available under the License, as indicated by a
|
| 37 |
+
copyright notice that is included in or attached to the work
|
| 38 |
+
(an example is provided in the Appendix below).
|
| 39 |
+
|
| 40 |
+
"Derivative Works" shall mean any work, whether in Source or Object
|
| 41 |
+
form, that is based on (or derived from) the Work and for which the
|
| 42 |
+
editorial revisions, annotations, elaborations, or other modifications
|
| 43 |
+
represent, as a whole, an original work of authorship. For the purposes
|
| 44 |
+
of this License, Derivative Works shall not include works that remain
|
| 45 |
+
separable from, or merely link (or bind by name) to the interfaces of,
|
| 46 |
+
the Work and Derivative Works thereof.
|
| 47 |
+
|
| 48 |
+
"Contribution" shall mean any work of authorship, including
|
| 49 |
+
the original version of the Work and any modifications or additions
|
| 50 |
+
to that Work or Derivative Works thereof, that is intentionally
|
| 51 |
+
submitted to Licensor for inclusion in the Work by the copyright owner
|
| 52 |
+
or by an individual or Legal Entity authorized to submit on behalf of
|
| 53 |
+
the copyright owner. For the purposes of this definition, "submitted"
|
| 54 |
+
means any form of electronic, verbal, or written communication sent
|
| 55 |
+
to the Licensor or its representatives, including but not limited to
|
| 56 |
+
communication on electronic mailing lists, source code control systems,
|
| 57 |
+
and issue tracking systems that are managed by, or on behalf of, the
|
| 58 |
+
Licensor for the purpose of discussing and improving the Work, but
|
| 59 |
+
excluding communication that is conspicuously marked or otherwise
|
| 60 |
+
designated in writing by the copyright owner as "Not a Contribution."
|
| 61 |
+
|
| 62 |
+
"Contributor" shall mean Licensor and any individual or Legal Entity
|
| 63 |
+
on behalf of whom a Contribution has been received by Licensor and
|
| 64 |
+
subsequently incorporated within the Work.
|
| 65 |
+
|
| 66 |
+
2. Grant of Copyright License. Subject to the terms and conditions of
|
| 67 |
+
this License, each Contributor hereby grants to You a perpetual,
|
| 68 |
+
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
| 69 |
+
copyright license to reproduce, prepare Derivative Works of,
|
| 70 |
+
publicly display, publicly perform, sublicense, and distribute the
|
| 71 |
+
Work and such Derivative Works in Source or Object form.
|
| 72 |
+
|
| 73 |
+
3. Grant of Patent License. Subject to the terms and conditions of
|
| 74 |
+
this License, each Contributor hereby grants to You a perpetual,
|
| 75 |
+
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
| 76 |
+
(except as stated in this section) patent license to make, have made,
|
| 77 |
+
use, offer to sell, sell, import, and otherwise transfer the Work,
|
| 78 |
+
where such license applies only to those patent claims licensable
|
| 79 |
+
by such Contributor that are necessarily infringed by their
|
| 80 |
+
Contribution(s) alone or by combination of their Contribution(s)
|
| 81 |
+
with the Work to which such Contribution(s) was submitted. If You
|
| 82 |
+
institute patent litigation against any entity (including a
|
| 83 |
+
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
| 84 |
+
or a Contribution incorporated within the Work constitutes direct
|
| 85 |
+
or contributory patent infringement, then any patent licenses
|
| 86 |
+
granted to You under this License for that Work shall terminate
|
| 87 |
+
as of the date such litigation is filed.
|
| 88 |
+
|
| 89 |
+
4. Redistribution. You may reproduce and distribute copies of the
|
| 90 |
+
Work or Derivative Works thereof in any medium, with or without
|
| 91 |
+
modifications, and in Source or Object form, provided that You
|
| 92 |
+
meet the following conditions:
|
| 93 |
+
|
| 94 |
+
(a) You must give any other recipients of the Work or
|
| 95 |
+
Derivative Works a copy of this License; and
|
| 96 |
+
|
| 97 |
+
(b) You must cause any modified files to carry prominent notices
|
| 98 |
+
stating that You changed the files; and
|
| 99 |
+
|
| 100 |
+
(c) You must retain, in the Source form of any Derivative Works
|
| 101 |
+
that You distribute, all copyright, patent, trademark, and
|
| 102 |
+
attribution notices from the Source form of the Work,
|
| 103 |
+
excluding those notices that do not pertain to any part of
|
| 104 |
+
the Derivative Works; and
|
| 105 |
+
|
| 106 |
+
(d) If the Work includes a "NOTICE" text file as part of its
|
| 107 |
+
distribution, then any Derivative Works that You distribute must
|
| 108 |
+
include a readable copy of the attribution notices contained
|
| 109 |
+
within such NOTICE file, excluding those notices that do not
|
| 110 |
+
pertain to any part of the Derivative Works, in at least one
|
| 111 |
+
of the following places: within a NOTICE text file distributed
|
| 112 |
+
as part of the Derivative Works; within the Source form or
|
| 113 |
+
documentation, if provided along with the Derivative Works; or,
|
| 114 |
+
within a display generated by the Derivative Works, if and
|
| 115 |
+
wherever such third-party notices normally appear. The contents
|
| 116 |
+
of the NOTICE file are for informational purposes only and
|
| 117 |
+
do not modify the License. You may add Your own attribution
|
| 118 |
+
notices within Derivative Works that You distribute, alongside
|
| 119 |
+
or as an addendum to the NOTICE text from the Work, provided
|
| 120 |
+
that such additional attribution notices cannot be construed
|
| 121 |
+
as modifying the License.
|
| 122 |
+
|
| 123 |
+
You may add Your own copyright statement to Your modifications and
|
| 124 |
+
may provide additional or different license terms and conditions
|
| 125 |
+
for use, reproduction, or distribution of Your modifications, or
|
| 126 |
+
for any such Derivative Works as a whole, provided Your use,
|
| 127 |
+
reproduction, and distribution of the Work otherwise complies with
|
| 128 |
+
the conditions stated in this License.
|
| 129 |
+
|
| 130 |
+
5. Submission of Contributions. Unless You explicitly state otherwise,
|
| 131 |
+
any Contribution intentionally submitted for inclusion in the Work
|
| 132 |
+
by You to the Licensor shall be under the terms and conditions of
|
| 133 |
+
this License, without any additional terms or conditions.
|
| 134 |
+
Notwithstanding the above, nothing herein shall supersede or modify
|
| 135 |
+
the terms of any separate license agreement you may have executed
|
| 136 |
+
with Licensor regarding such Contributions.
|
| 137 |
+
|
| 138 |
+
6. Trademarks. This License does not grant permission to use the trade
|
| 139 |
+
names, trademarks, service marks, or product names of the Licensor,
|
| 140 |
+
except as required for reasonable and customary use in describing the
|
| 141 |
+
origin of the Work and reproducing the content of the NOTICE file.
|
| 142 |
+
|
| 143 |
+
7. Disclaimer of Warranty. Unless required by applicable law or
|
| 144 |
+
agreed to in writing, Licensor provides the Work (and each
|
| 145 |
+
Contributor provides its Contributions) on an "AS IS" BASIS,
|
| 146 |
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
| 147 |
+
implied, including, without limitation, any warranties or conditions
|
| 148 |
+
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
| 149 |
+
PARTICULAR PURPOSE. You are solely responsible for determining the
|
| 150 |
+
appropriateness of using or redistributing the Work and assume any
|
| 151 |
+
risks associated with Your exercise of permissions under this License.
|
| 152 |
+
|
| 153 |
+
8. Limitation of Liability. In no event and under no legal theory,
|
| 154 |
+
whether in tort (including negligence), contract, or otherwise,
|
| 155 |
+
unless required by applicable law (such as deliberate and grossly
|
| 156 |
+
negligent acts) or agreed to in writing, shall any Contributor be
|
| 157 |
+
liable to You for damages, including any direct, indirect, special,
|
| 158 |
+
incidental, or consequential damages of any character arising as a
|
| 159 |
+
result of this License or out of the use or inability to use the
|
| 160 |
+
Work (including but not limited to damages for loss of goodwill,
|
| 161 |
+
work stoppage, computer failure or malfunction, or any and all
|
| 162 |
+
other commercial damages or losses), even if such Contributor
|
| 163 |
+
has been advised of the possibility of such damages.
|
| 164 |
+
|
| 165 |
+
9. Accepting Warranty or Additional Liability. While redistributing
|
| 166 |
+
the Work or Derivative Works thereof, You may choose to offer,
|
| 167 |
+
and charge a fee for, acceptance of support, warranty, indemnity,
|
| 168 |
+
or other liability obligations and/or rights consistent with this
|
| 169 |
+
License. However, in accepting such obligations, You may act only
|
| 170 |
+
on Your own behalf and on Your sole responsibility, not on behalf
|
| 171 |
+
of any other Contributor, and only if You agree to indemnify,
|
| 172 |
+
defend, and hold each Contributor harmless for any liability
|
| 173 |
+
incurred by, or claims asserted against, such Contributor by reason
|
| 174 |
+
of your accepting any such warranty or additional liability.
|
| 175 |
+
|
| 176 |
+
END OF TERMS AND CONDITIONS
|
| 177 |
+
|
| 178 |
+
APPENDIX: How to apply the Apache License to your work.
|
| 179 |
+
|
| 180 |
+
To apply the Apache License to your work, attach the following
|
| 181 |
+
boilerplate notice, with the fields enclosed by brackets "[]"
|
| 182 |
+
replaced with your own identifying information. (Don't include
|
| 183 |
+
the brackets!) The text should be enclosed in the appropriate
|
| 184 |
+
comment syntax for the file format. We also recommend that a
|
| 185 |
+
file or class name and description of purpose be included on the
|
| 186 |
+
same "printed page" as the copyright notice for easier
|
| 187 |
+
identification within third-party archives.
|
| 188 |
+
|
| 189 |
+
Copyright [yyyy] [name of copyright owner]
|
| 190 |
+
|
| 191 |
+
Licensed under the Apache License, Version 2.0 (the "License");
|
| 192 |
+
you may not use this file except in compliance with the License.
|
| 193 |
+
You may obtain a copy of the License at
|
| 194 |
+
|
| 195 |
+
http://www.apache.org/licenses/LICENSE-2.0
|
| 196 |
+
|
| 197 |
+
Unless required by applicable law or agreed to in writing, software
|
| 198 |
+
distributed under the License is distributed on an "AS IS" BASIS,
|
| 199 |
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 200 |
+
See the License for the specific language governing permissions and
|
| 201 |
+
limitations under the License.
|
Makefile
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
style:
|
| 2 |
+
python -m black --line-length 119 --target-version py39 .
|
| 3 |
+
python -m isort .
|
| 4 |
+
|
| 5 |
+
quality:
|
| 6 |
+
python -m black --check --line-length 119 --target-version py39 .
|
| 7 |
+
python -m isort --check-only .
|
| 8 |
+
python -m flake8 --max-line-length 119
|
README.md
ADDED
|
@@ -0,0 +1,114 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: Model Evaluator
|
| 3 |
+
emoji: 📊
|
| 4 |
+
colorFrom: red
|
| 5 |
+
colorTo: red
|
| 6 |
+
sdk: streamlit
|
| 7 |
+
sdk_version: 1.10.0
|
| 8 |
+
app_file: app.py
|
| 9 |
+
duplicated_from: autoevaluate/model-evaluator
|
| 10 |
+
---
|
| 11 |
+
|
| 12 |
+
# Model Evaluator
|
| 13 |
+
|
| 14 |
+
> Submit evaluation jobs to AutoTrain from the Hugging Face Hub
|
| 15 |
+
|
| 16 |
+
## Supported tasks
|
| 17 |
+
|
| 18 |
+
The table below shows which tasks are currently supported for evaluation in the AutoTrain backend:
|
| 19 |
+
|
| 20 |
+
| Task | Supported |
|
| 21 |
+
|:-----------------------------------|:---------:|
|
| 22 |
+
| `binary_classification` | ✅ |
|
| 23 |
+
| `multi_class_classification` | ✅ |
|
| 24 |
+
| `multi_label_classification` | ❌ |
|
| 25 |
+
| `entity_extraction` | ✅ |
|
| 26 |
+
| `extractive_question_answering` | ✅ |
|
| 27 |
+
| `translation` | ✅ |
|
| 28 |
+
| `summarization` | ✅ |
|
| 29 |
+
| `image_binary_classification` | ✅ |
|
| 30 |
+
| `image_multi_class_classification` | ✅ |
|
| 31 |
+
| `text_zero_shot_evaluation` | ✅ |
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
## Installation
|
| 35 |
+
|
| 36 |
+
To run the application locally, first clone this repository and install the dependencies as follows:
|
| 37 |
+
|
| 38 |
+
```
|
| 39 |
+
pip install -r requirements.txt
|
| 40 |
+
```
|
| 41 |
+
|
| 42 |
+
Next, copy the example file of environment variables:
|
| 43 |
+
|
| 44 |
+
```
|
| 45 |
+
cp .env.template .env
|
| 46 |
+
```
|
| 47 |
+
|
| 48 |
+
and set the `HF_TOKEN` variable with a valid API token from the [`autoevaluator`](https://huggingface.co/autoevaluator) bot user. Finally, spin up the application by running:
|
| 49 |
+
|
| 50 |
+
```
|
| 51 |
+
streamlit run app.py
|
| 52 |
+
```
|
| 53 |
+
|
| 54 |
+
## Usage
|
| 55 |
+
|
| 56 |
+
Evaluation on the Hub involves two main steps:
|
| 57 |
+
|
| 58 |
+
1. Submitting an evaluation job via the UI. This creates an AutoTrain project with `N` models for evaluation. At this stage, the dataset is also processed and prepared for evaluation.
|
| 59 |
+
2. Triggering the evaluation itself once the dataset is processed.
|
| 60 |
+
|
| 61 |
+
From the user perspective, only step (1) is needed since step (2) is handled by a cron job on GitHub Actions that executes the `run_evaluation_jobs.py` script every 15 minutes.
|
| 62 |
+
|
| 63 |
+
See below for details on manually triggering evaluation jobs.
|
| 64 |
+
|
| 65 |
+
### Triggering an evaluation
|
| 66 |
+
|
| 67 |
+
To evaluate the models in an AutoTrain project, run:
|
| 68 |
+
|
| 69 |
+
```
|
| 70 |
+
python run_evaluation_jobs.py
|
| 71 |
+
```
|
| 72 |
+
|
| 73 |
+
This will download the [`autoevaluate/evaluation-job-logs`](https://huggingface.co/datasets/autoevaluate/evaluation-job-logs) dataset from the Hub and check which evaluation projects are ready for evaluation (i.e. those whose dataset has been processed).
|
| 74 |
+
|
| 75 |
+
## AutoTrain configuration details
|
| 76 |
+
|
| 77 |
+
Models are evaluated by the [`autoevaluator`](https://huggingface.co/autoevaluator) bot user in AutoTrain, with the payload sent to the `AUTOTRAIN_BACKEND_API` environment variable. Evaluation projects are created and run on either the `prod` or `staging` environments. You can view the status of projects in the AutoTrain UI by navigating to one of the links below (ask internally for access to the staging UI):
|
| 78 |
+
|
| 79 |
+
| AutoTrain environment | AutoTrain UI URL | `AUTOTRAIN_BACKEND_API` |
|
| 80 |
+
|:---------------------:|:--------------------------------------------------------------------------------------------------------------:|:--------------------------------------------:|
|
| 81 |
+
| `prod` | [`https://ui.autotrain.huggingface.co/projects`](https://ui.autotrain.huggingface.co/projects) | https://api.autotrain.huggingface.co |
|
| 82 |
+
| `staging` | [`https://ui-staging.autotrain.huggingface.co/projects`](https://ui-staging.autotrain.huggingface.co/projects) | https://api-staging.autotrain.huggingface.co |
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
The current configuration for evaluation jobs running on [Spaces](https://huggingface.co/spaces/autoevaluate/model-evaluator) is:
|
| 86 |
+
|
| 87 |
+
```
|
| 88 |
+
AUTOTRAIN_BACKEND_API=https://api.autotrain.huggingface.co
|
| 89 |
+
```
|
| 90 |
+
|
| 91 |
+
To evaluate models with a _local_ instance of AutoTrain, change the environment to:
|
| 92 |
+
|
| 93 |
+
```
|
| 94 |
+
AUTOTRAIN_BACKEND_API=http://localhost:8000
|
| 95 |
+
```
|
| 96 |
+
|
| 97 |
+
### Migrating from staging to production (and vice versa)
|
| 98 |
+
|
| 99 |
+
In general, evaluation jobs should run in AutoTrain's `prod` environment, which is defined by the following environment variable:
|
| 100 |
+
|
| 101 |
+
```
|
| 102 |
+
AUTOTRAIN_BACKEND_API=https://api.autotrain.huggingface.co
|
| 103 |
+
```
|
| 104 |
+
|
| 105 |
+
However, there are times when it is necessary to run evaluation jobs in AutoTrain's `staging` environment (e.g. because a new evaluation pipeline is being deployed). In these cases the corresponding environement variable is:
|
| 106 |
+
|
| 107 |
+
```
|
| 108 |
+
AUTOTRAIN_BACKEND_API=https://api-staging.autotrain.huggingface.co
|
| 109 |
+
```
|
| 110 |
+
|
| 111 |
+
To migrate between these two environments, update the `AUTOTRAIN_BACKEND_API` in two places:
|
| 112 |
+
|
| 113 |
+
* In the [repo secrets](https://huggingface.co/spaces/autoevaluate/model-evaluator/settings) associated with the `model-evaluator` Space. This will ensure evaluation projects are created in the desired environment.
|
| 114 |
+
* In the [GitHub Actions secrets](https://github.com/huggingface/model-evaluator/settings/secrets/actions) associated with this repo. This will ensure that the correct evaluation jobs are approved and launched via the `run_evaluation_jobs.py` script.
|
app.py
ADDED
|
@@ -0,0 +1,693 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import time
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
|
| 5 |
+
import pandas as pd
|
| 6 |
+
import streamlit as st
|
| 7 |
+
import yaml
|
| 8 |
+
from datasets import get_dataset_config_names
|
| 9 |
+
from dotenv import load_dotenv
|
| 10 |
+
from huggingface_hub import list_datasets
|
| 11 |
+
|
| 12 |
+
from evaluation import filter_evaluated_models
|
| 13 |
+
from utils import (
|
| 14 |
+
AUTOTRAIN_TASK_TO_HUB_TASK,
|
| 15 |
+
commit_evaluation_log,
|
| 16 |
+
create_autotrain_project_name,
|
| 17 |
+
format_col_mapping,
|
| 18 |
+
get_compatible_models,
|
| 19 |
+
get_config_metadata,
|
| 20 |
+
get_dataset_card_url,
|
| 21 |
+
get_key,
|
| 22 |
+
get_metadata,
|
| 23 |
+
http_get,
|
| 24 |
+
http_post,
|
| 25 |
+
)
|
| 26 |
+
|
| 27 |
+
# Load local environment variables (see .env.template) when developing outside Spaces.
if Path(".env").is_file():
    load_dotenv(".env")

HF_TOKEN = os.getenv("HF_TOKEN")  # API token of the `autoevaluator` bot user
AUTOTRAIN_USERNAME = os.getenv("AUTOTRAIN_USERNAME")  # bot/user that authors evaluation jobs
AUTOTRAIN_BACKEND_API = os.getenv("AUTOTRAIN_BACKEND_API")  # AutoTrain backend jobs are sent to
DATASETS_PREVIEW_API = os.getenv("DATASETS_PREVIEW_API")  # API used for dataset splits / first rows

# Put image tasks on top
# Maps each supported task name to its AutoTrain numeric task ID.
TASK_TO_ID = {
    "image_binary_classification": 17,
    "image_multi_class_classification": 18,
    "binary_classification": 1,
    "multi_class_classification": 2,
    "natural_language_inference": 22,
    "entity_extraction": 4,
    "extractive_question_answering": 5,
    "translation": 6,
    "summarization": 8,
    "text_zero_shot_classification": 23,
}

# Metrics always computed for a task (shown to the user as non-optional).
TASK_TO_DEFAULT_METRICS = {
    "binary_classification": ["f1", "precision", "recall", "auc", "accuracy"],
    "multi_class_classification": [
        "f1",
        "precision",
        "recall",
        "accuracy",
    ],
    "natural_language_inference": ["f1", "precision", "recall", "auc", "accuracy"],
    "entity_extraction": ["precision", "recall", "f1", "accuracy"],
    "extractive_question_answering": ["f1", "exact_match"],
    "translation": ["sacrebleu"],
    "summarization": ["rouge1", "rouge2", "rougeL", "rougeLsum"],
    "image_binary_classification": ["f1", "precision", "recall", "auc", "accuracy"],
    "image_multi_class_classification": [
        "f1",
        "precision",
        "recall",
        "accuracy",
    ],
    "text_zero_shot_classification": ["accuracy", "loss"],
}

# Language code sent to AutoTrain for tasks where "en" is not appropriate.
# Tasks absent from this mapping default to "en" at submission time.
AUTOTRAIN_TASK_TO_LANG = {
    "translation": "en2de",
    "image_binary_classification": "unk",
    "image_multi_class_classification": "unk",
}

# Tasks that need a special (SageMaker) instance type; others run on the default machine.
AUTOTRAIN_MACHINE = {"text_zero_shot_classification": "r5.16x"}


SUPPORTED_TASKS = list(TASK_TO_ID.keys())

# Extracted from utils.get_supported_metrics
# Hardcoded for now due to speed / caching constraints
SUPPORTED_METRICS = [
    "accuracy",
    "bertscore",
    "bleu",
    "cer",
    "chrf",
    "code_eval",
    "comet",
    "competition_math",
    "coval",
    "cuad",
    "exact_match",
    "f1",
    "frugalscore",
    "google_bleu",
    "mae",
    "mahalanobis",
    "matthews_correlation",
    "mean_iou",
    "meteor",
    "mse",
    "pearsonr",
    "perplexity",
    "precision",
    "recall",
    "roc_auc",
    "rouge",
    "sacrebleu",
    "sari",
    "seqeval",
    "spearmanr",
    "squad",
    "squad_v2",
    "ter",
    "trec_eval",
    "wer",
    "wiki_split",
    "xnli",
    # Community metrics hosted as Hub spaces (namespaced IDs).
    "angelina-wang/directional_bias_amplification",
    "jordyvl/ece",
    "lvwerra/ai4code",
    "lvwerra/amex",
]
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
#######
# APP #
#######
st.title("Evaluation on the Hub")
# Welcome banner. NOTE: fixed a user-facing typo here ("check out out our" -> "check out our").
st.markdown(
    """
    Welcome to Hugging Face's automatic model evaluator 👋!

    This application allows you to evaluate 🤗 Transformers
    [models](https://huggingface.co/models?library=transformers&sort=downloads)
    across a wide variety of [datasets](https://huggingface.co/datasets) on the
    Hub. Please select the dataset and configuration below. The results of your
    evaluation will be displayed on the [public
    leaderboards](https://huggingface.co/spaces/autoevaluate/leaderboards). For
    more details, check out our [blog
    post](https://huggingface.co/blog/eval-on-the-hub).
    """
)

# All dataset IDs on the Hub; the first one is the fallback default selection.
all_datasets = [d.id for d in list_datasets()]
# Capture the query params from the *first* page load only, so that Streamlit
# reruns do not clobber a deep-linked `?dataset=...` selection.
query_params = st.experimental_get_query_params()
if "first_query_params" not in st.session_state:
    st.session_state.first_query_params = query_params
first_query_params = st.session_state.first_query_params
default_dataset = all_datasets[0]
if "dataset" in first_query_params:
    if len(first_query_params["dataset"]) > 0 and first_query_params["dataset"][0] in all_datasets:
        default_dataset = first_query_params["dataset"][0]

selected_dataset = st.selectbox(
    "Select a dataset",
    all_datasets,
    index=all_datasets.index(default_dataset),
    help="""Datasets with metadata can be evaluated with 1-click. Configure an evaluation job to add \
        new metadata to a dataset card.""",
)
# Keep the URL shareable: reflect the current selection back into the query string.
st.experimental_set_query_params(**{"dataset": [selected_dataset]})

# Check if selected dataset can be streamed
is_valid_dataset = http_get(
    path="/is-valid",
    domain=DATASETS_PREVIEW_API,
    params={"dataset": selected_dataset},
).json()
if is_valid_dataset["valid"] is False:
    st.error(
        """The dataset you selected is not currently supported. Open a \
        [discussion](https://huggingface.co/spaces/autoevaluate/model-evaluator/discussions) for support."""
    )

# Evaluation metadata (train-eval-index) from the dataset card; None if the card has none.
metadata = get_metadata(selected_dataset, token=HF_TOKEN)
print(f"INFO -- Dataset metadata: {metadata}")
if metadata is None:
    st.warning("No evaluation metadata found. Please configure the evaluation job below.")
| 184 |
+
|
| 185 |
+
# Advanced configuration: task, config, split selection and column mapping.
with st.expander("Advanced configuration"):
    # Select task
    selected_task = st.selectbox(
        "Select a task",
        SUPPORTED_TASKS,
        index=SUPPORTED_TASKS.index(metadata[0]["task_id"]) if metadata is not None else 0,
        help="""Don't see your favourite task here? Open a \
        [discussion](https://huggingface.co/spaces/autoevaluate/model-evaluator/discussions) to request it!""",
    )
    # Select config
    configs = get_dataset_config_names(selected_dataset)
    selected_config = st.selectbox(
        "Select a config",
        configs,
        help="""Some datasets contain several sub-datasets, known as _configurations_. \
            Select one to evaluate your models on. \
            See the [docs](https://huggingface.co/docs/datasets/master/en/load_hub#configurations) for more details.
            """,
    )
    # Some datasets have multiple metadata (one per config), so we grab the one associated with the selected config
    config_metadata = get_config_metadata(selected_config, metadata)
    print(f"INFO -- Config metadata: {config_metadata}")

    # Select splits
    splits_resp = http_get(
        path="/splits",
        domain=DATASETS_PREVIEW_API,
        params={"dataset": selected_dataset},
    )
    # NOTE(review): if /splits does not return 200, `split_names` and `selected_split`
    # are never defined, and the later /first-rows call + form submission will raise
    # a NameError -- confirm whether that failure mode is acceptable here.
    if splits_resp.status_code == 200:
        split_names = []
        all_splits = splits_resp.json()
        for split in all_splits["splits"]:
            if split["config"] == selected_config:
                split_names.append(split["split"])

        # Pre-select the eval split declared in the card metadata, if any.
        if config_metadata is not None:
            eval_split = config_metadata["splits"].get("eval_split", None)
        else:
            eval_split = None
        selected_split = st.selectbox(
            "Select a split",
            split_names,
            index=split_names.index(eval_split) if eval_split is not None else 0,
            help="Be wary when evaluating models on the `train` split.",
        )

    # Select columns
    rows_resp = http_get(
        path="/first-rows",
        domain=DATASETS_PREVIEW_API,
        params={
            "dataset": selected_dataset,
            "config": selected_config,
            "split": selected_split,
        },
    ).json()
    # Flatten nested row fields (e.g. answers.text) into dotted column names.
    col_names = list(pd.json_normalize(rows_resp["rows"][0]["row"]).columns)

    st.markdown("**Map your dataset columns**")
    st.markdown(
        """The model evaluator uses a standardised set of column names for the input examples and labels. \
        Please define the mapping between your dataset columns (right) and the standardised column names (left)."""
    )
    col1, col2 = st.columns(2)

    # TODO: find a better way to layout these items
    # TODO: need graceful way of handling dataset <--> task mismatch for datasets with metadata
    # col_mapping maps the *dataset* column name -> standardised column name.
    col_mapping = {}
    if selected_task in ["binary_classification", "multi_class_classification"]:
        with col1:
            st.markdown("`text` column")
            st.text("")
            st.text("")
            st.text("")
            st.text("")
            st.markdown("`target` column")
        with col2:
            text_col = st.selectbox(
                "This column should contain the text to be classified",
                col_names,
                index=col_names.index(get_key(config_metadata["col_mapping"], "text"))
                if config_metadata is not None
                else 0,
            )
            target_col = st.selectbox(
                "This column should contain the labels associated with the text",
                col_names,
                index=col_names.index(get_key(config_metadata["col_mapping"], "target"))
                if config_metadata is not None
                else 0,
            )
        col_mapping[text_col] = "text"
        col_mapping[target_col] = "target"

    elif selected_task == "text_zero_shot_classification":
        with col1:
            st.markdown("`text` column")
            st.text("")
            st.text("")
            st.text("")
            st.text("")
            st.markdown("`classes` column")
            st.text("")
            st.text("")
            st.text("")
            st.text("")
            st.markdown("`target` column")
        with col2:
            text_col = st.selectbox(
                "This column should contain the text to be classified",
                col_names,
                index=col_names.index(get_key(config_metadata["col_mapping"], "text"))
                if config_metadata is not None
                else 0,
            )
            classes_col = st.selectbox(
                "This column should contain the classes associated with the text",
                col_names,
                index=col_names.index(get_key(config_metadata["col_mapping"], "classes"))
                if config_metadata is not None
                else 0,
            )
            target_col = st.selectbox(
                "This column should contain the index of the correct class",
                col_names,
                index=col_names.index(get_key(config_metadata["col_mapping"], "target"))
                if config_metadata is not None
                else 0,
            )
        col_mapping[text_col] = "text"
        col_mapping[classes_col] = "classes"
        col_mapping[target_col] = "target"

    # NOTE(review): this `if` (rather than `elif`) starts a new conditional chain,
    # so the branches below attach to it instead of the chain above. Behaviour is
    # unchanged because the task values are mutually exclusive, but it looks
    # unintentional -- confirm and consider making it `elif`.
    if selected_task in ["natural_language_inference"]:
        # NOTE(review): redundant -- config_metadata was already computed above.
        config_metadata = get_config_metadata(selected_config, metadata)
        with col1:
            st.markdown("`text1` column")
            st.text("")
            st.text("")
            st.text("")
            st.text("")
            st.text("")
            st.markdown("`text2` column")
            st.text("")
            st.text("")
            st.text("")
            st.text("")
            st.text("")
            st.markdown("`target` column")
        with col2:
            text1_col = st.selectbox(
                "This column should contain the first text passage to be classified",
                col_names,
                index=col_names.index(get_key(config_metadata["col_mapping"], "text1"))
                if config_metadata is not None
                else 0,
            )
            text2_col = st.selectbox(
                "This column should contain the second text passage to be classified",
                col_names,
                index=col_names.index(get_key(config_metadata["col_mapping"], "text2"))
                if config_metadata is not None
                else 0,
            )
            target_col = st.selectbox(
                "This column should contain the labels associated with the text",
                col_names,
                index=col_names.index(get_key(config_metadata["col_mapping"], "target"))
                if config_metadata is not None
                else 0,
            )
        col_mapping[text1_col] = "text1"
        col_mapping[text2_col] = "text2"
        col_mapping[target_col] = "target"

    elif selected_task == "entity_extraction":
        with col1:
            st.markdown("`tokens` column")
            st.text("")
            st.text("")
            st.text("")
            st.text("")
            st.markdown("`tags` column")
        with col2:
            tokens_col = st.selectbox(
                "This column should contain the array of tokens to be classified",
                col_names,
                index=col_names.index(get_key(config_metadata["col_mapping"], "tokens"))
                if config_metadata is not None
                else 0,
            )
            tags_col = st.selectbox(
                "This column should contain the labels associated with each part of the text",
                col_names,
                index=col_names.index(get_key(config_metadata["col_mapping"], "tags"))
                if config_metadata is not None
                else 0,
            )
        col_mapping[tokens_col] = "tokens"
        col_mapping[tags_col] = "tags"

    elif selected_task == "translation":
        with col1:
            st.markdown("`source` column")
            st.text("")
            st.text("")
            st.text("")
            st.text("")
            st.markdown("`target` column")
        with col2:
            text_col = st.selectbox(
                "This column should contain the text to be translated",
                col_names,
                index=col_names.index(get_key(config_metadata["col_mapping"], "source"))
                if config_metadata is not None
                else 0,
            )
            target_col = st.selectbox(
                "This column should contain the target translation",
                col_names,
                index=col_names.index(get_key(config_metadata["col_mapping"], "target"))
                if config_metadata is not None
                else 0,
            )
        col_mapping[text_col] = "source"
        col_mapping[target_col] = "target"

    elif selected_task == "summarization":
        with col1:
            st.markdown("`text` column")
            st.text("")
            st.text("")
            st.text("")
            st.text("")
            st.markdown("`target` column")
        with col2:
            text_col = st.selectbox(
                "This column should contain the text to be summarized",
                col_names,
                index=col_names.index(get_key(config_metadata["col_mapping"], "text"))
                if config_metadata is not None
                else 0,
            )
            target_col = st.selectbox(
                "This column should contain the target summary",
                col_names,
                index=col_names.index(get_key(config_metadata["col_mapping"], "target"))
                if config_metadata is not None
                else 0,
            )
        col_mapping[text_col] = "text"
        col_mapping[target_col] = "target"

    elif selected_task == "extractive_question_answering":
        if config_metadata is not None:
            col_mapping = config_metadata["col_mapping"]
            # Hub YAML parser converts periods to hyphens, so we remap them here
            col_mapping = format_col_mapping(col_mapping)
        with col1:
            st.markdown("`context` column")
            st.text("")
            st.text("")
            st.text("")
            st.text("")
            st.markdown("`question` column")
            st.text("")
            st.text("")
            st.text("")
            st.text("")
            st.markdown("`answers.text` column")
            st.text("")
            st.text("")
            st.text("")
            st.text("")
            st.markdown("`answers.answer_start` column")
        with col2:
            context_col = st.selectbox(
                "This column should contain the question's context",
                col_names,
                index=col_names.index(get_key(col_mapping, "context")) if config_metadata is not None else 0,
            )
            question_col = st.selectbox(
                "This column should contain the question to be answered, given the context",
                col_names,
                index=col_names.index(get_key(col_mapping, "question")) if config_metadata is not None else 0,
            )
            answers_text_col = st.selectbox(
                "This column should contain example answers to the question, extracted from the context",
                col_names,
                index=col_names.index(get_key(col_mapping, "answers.text")) if config_metadata is not None else 0,
            )
            answers_start_col = st.selectbox(
                "This column should contain the indices in the context of the first character of each `answers.text`",
                col_names,
                index=col_names.index(get_key(col_mapping, "answers.answer_start"))
                if config_metadata is not None
                else 0,
            )
        col_mapping[context_col] = "context"
        col_mapping[question_col] = "question"
        col_mapping[answers_text_col] = "answers.text"
        col_mapping[answers_start_col] = "answers.answer_start"
    elif selected_task in ["image_binary_classification", "image_multi_class_classification"]:
        with col1:
            st.markdown("`image` column")
            st.text("")
            st.text("")
            st.text("")
            st.text("")
            st.markdown("`target` column")
        with col2:
            image_col = st.selectbox(
                "This column should contain the images to be classified",
                col_names,
                index=col_names.index(get_key(config_metadata["col_mapping"], "image"))
                if config_metadata is not None
                else 0,
            )
            target_col = st.selectbox(
                "This column should contain the labels associated with the images",
                col_names,
                index=col_names.index(get_key(config_metadata["col_mapping"], "target"))
                if config_metadata is not None
                else 0,
            )
        col_mapping[image_col] = "image"
        col_mapping[target_col] = "target"

    # Select metrics
    st.markdown("**Select metrics**")
    st.markdown("The following metrics will be computed")
    # Render the default metrics as inline grey "pill" badges.
    html_string = " ".join(
        [
            '<div style="padding-right:5px;padding-left:5px;padding-top:5px;padding-bottom:5px;float:left">'
            + '<div style="background-color:#D3D3D3;border-radius:5px;display:inline-block;padding-right:5px;'
            + 'padding-left:5px;color:white">'
            + metric
            + "</div></div>"
            for metric in TASK_TO_DEFAULT_METRICS[selected_task]
        ]
    )
    st.markdown(html_string, unsafe_allow_html=True)
    selected_metrics = st.multiselect(
        "(Optional) Select additional metrics",
        sorted(list(set(SUPPORTED_METRICS) - set(TASK_TO_DEFAULT_METRICS[selected_task]))),
        help="""User-selected metrics will be computed with their default arguments. \
        For example, `f1` will report results for binary labels. \
        Check out the [available metrics](https://huggingface.co/metrics) for more details.""",
    )
|
| 535 |
+
|
| 536 |
+
# Submission form: pick models, validate input, then create and launch an
# AutoTrain evaluation project.
with st.form(key="form"):
    compatible_models = get_compatible_models(selected_task, [selected_dataset])
    selected_models = st.multiselect(
        "Select the models you wish to evaluate",
        compatible_models,
        help="""Don't see your favourite model in this list? Add the dataset and task it was trained on to the \
            [model card metadata.](https://huggingface.co/docs/hub/models-cards#model-card-metadata)""",
    )
    print("INFO -- Selected models before filter:", selected_models)

    hf_username = st.text_input("Enter your 🤗 Hub username to be notified when the evaluation is finished")

    submit_button = st.form_submit_button("Evaluate models 🚀")

    if submit_button:
        # Input validation: require a username and 1-10 models.
        if len(hf_username) == 0:
            st.warning("No 🤗 Hub username provided! Please enter your username and try again.")
        elif len(selected_models) == 0:
            st.warning("⚠️ No models were selected for evaluation! Please select at least one model and try again.")
        elif len(selected_models) > 10:
            st.warning("Only 10 models can be evaluated at once. Please select fewer models and try again.")
        else:
            # Filter out previously evaluated models
            selected_models = filter_evaluated_models(
                selected_models,
                selected_task,
                selected_dataset,
                selected_config,
                selected_split,
                selected_metrics,
            )
            print("INFO -- Selected models after filter:", selected_models)
            if len(selected_models) > 0:
                # Payload for the AutoTrain "create project" endpoint.
                project_payload = {
                    "username": AUTOTRAIN_USERNAME,
                    "proj_name": create_autotrain_project_name(selected_dataset, selected_config),
                    "task": TASK_TO_ID[selected_task],
                    "config": {
                        "language": AUTOTRAIN_TASK_TO_LANG[selected_task]
                        if selected_task in AUTOTRAIN_TASK_TO_LANG
                        else "en",
                        "max_models": 5,
                        "instance": {
                            "provider": "sagemaker" if selected_task in AUTOTRAIN_MACHINE.keys() else "ovh",
                            "instance_type": AUTOTRAIN_MACHINE[selected_task]
                            if selected_task in AUTOTRAIN_MACHINE.keys()
                            else "p3",
                            "max_runtime_seconds": 172800,
                            "num_instances": 1,
                            "disk_size_gb": 200,
                        },
                        "evaluation": {
                            "metrics": selected_metrics,
                            "models": selected_models,
                            "hf_username": hf_username,
                        },
                    },
                }
                print(f"INFO -- Payload: {project_payload}")
                project_json_resp = http_post(
                    path="/projects/create",
                    payload=project_payload,
                    token=HF_TOKEN,
                    domain=AUTOTRAIN_BACKEND_API,
                ).json()
                print(f"INFO -- Project creation response: {project_json_resp}")

                if project_json_resp["created"]:
                    # Attach the selected dataset/config/split to the new project.
                    data_payload = {
                        "split": 4,  # use "auto" split choice in AutoTrain
                        "col_mapping": col_mapping,
                        "load_config": {"max_size_bytes": 0, "shuffle": False},
                        "dataset_id": selected_dataset,
                        "dataset_config": selected_config,
                        "dataset_split": selected_split,
                    }
                    data_json_resp = http_post(
                        path=f"/projects/{project_json_resp['id']}/data/dataset",
                        payload=data_payload,
                        token=HF_TOKEN,
                        domain=AUTOTRAIN_BACKEND_API,
                    ).json()
                    print(f"INFO -- Dataset creation response: {data_json_resp}")
                    # NOTE(review): when download_status != 1 nothing happens -- the user
                    # gets no feedback that the submission stalled. Confirm intent.
                    if data_json_resp["download_status"] == 1:
                        train_json_resp = http_post(
                            path=f"/projects/{project_json_resp['id']}/data/start_processing",
                            token=HF_TOKEN,
                            domain=AUTOTRAIN_BACKEND_API,
                        ).json()
                        # For local development we process and approve projects on-the-fly
                        if "localhost" in AUTOTRAIN_BACKEND_API:
                            with st.spinner("⏳ Waiting for data processing to complete ..."):
                                is_data_processing_success = False
                                # Poll until project status == 3 (processed); 10s between polls.
                                while is_data_processing_success is not True:
                                    project_status = http_get(
                                        path=f"/projects/{project_json_resp['id']}",
                                        token=HF_TOKEN,
                                        domain=AUTOTRAIN_BACKEND_API,
                                    ).json()
                                    if project_status["status"] == 3:
                                        is_data_processing_success = True
                                    time.sleep(10)

                                # Approve training job
                                # NOTE(review): train_job_resp is never used afterwards.
                                train_job_resp = http_post(
                                    path=f"/projects/{project_json_resp['id']}/start_training",
                                    token=HF_TOKEN,
                                    domain=AUTOTRAIN_BACKEND_API,
                                ).json()
                                st.success("✅ Data processing and project approval complete - go forth and evaluate!")
                        else:
                            # Prod/staging submissions are evaluated in a cron job via run_evaluation_jobs.py
                            print(f"INFO -- AutoTrain job response: {train_json_resp}")
                            if train_json_resp["success"]:
                                # YAML snippet users can paste into the dataset card for
                                # 1-click evaluations next time.
                                train_eval_index = {
                                    "train-eval-index": [
                                        {
                                            "config": selected_config,
                                            "task": AUTOTRAIN_TASK_TO_HUB_TASK[selected_task],
                                            "task_id": selected_task,
                                            "splits": {"eval_split": selected_split},
                                            "col_mapping": col_mapping,
                                        }
                                    ]
                                }
                                selected_metadata = yaml.dump(train_eval_index, sort_keys=False)
                                dataset_card_url = get_dataset_card_url(selected_dataset)
                                st.success("✅ Successfully submitted evaluation job!")
                                st.markdown(
                                    f"""
                                    Evaluation can take up to 1 hour to complete, so grab a ☕️ or 🍵 while you wait:

                                    * 🔔 A [Hub pull request](https://huggingface.co/docs/hub/repositories-pull-requests-discussions) with the evaluation results will be opened for each model you selected. Check your email for notifications.
                                    * 📊 Click [here](https://hf.co/spaces/autoevaluate/leaderboards?dataset={selected_dataset}) to view the results from your submission once the Hub pull request is merged.
                                    * 🥱 Tired of configuring evaluations? Add the following metadata to the [dataset card]({dataset_card_url}) to enable 1-click evaluations:
                                    """  # noqa
                                )
                                st.markdown(
                                    f"""
                                    ```yaml
                                    {selected_metadata}
                                    """
                                )
                                # Persist the full request/response trail for debugging and auditing.
                                print("INFO -- Pushing evaluation job logs to the Hub")
                                evaluation_log = {}
                                evaluation_log["project_id"] = project_json_resp["id"]
                                evaluation_log["autotrain_env"] = (
                                    "staging" if "staging" in AUTOTRAIN_BACKEND_API else "prod"
                                )
                                evaluation_log["payload"] = project_payload
                                evaluation_log["project_creation_response"] = project_json_resp
                                evaluation_log["dataset_creation_response"] = data_json_resp
                                evaluation_log["autotrain_job_response"] = train_json_resp
                                commit_evaluation_log(evaluation_log, hf_access_token=HF_TOKEN)
                            else:
                                st.error("🙈 Oh no, there was an error submitting your evaluation job!")
            else:
                st.warning("⚠️ No models left to evaluate! Please select other models and try again.")
|
evaluation.py
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import copy
|
| 2 |
+
from dataclasses import dataclass
|
| 3 |
+
|
| 4 |
+
import streamlit as st
|
| 5 |
+
from huggingface_hub import DatasetFilter, HfApi
|
| 6 |
+
from huggingface_hub.hf_api import DatasetInfo
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
@dataclass(frozen=True, eq=True)
class EvaluationInfo:
    """Hashable identity of a single evaluation job.

    Frozen and eq so instances can be compared and hashed, which is how
    `filter_evaluated_models` de-duplicates already-evaluated models.
    """

    # AutoTrain task name, e.g. "binary_classification".
    task: str
    # Hub model ID that was (or will be) evaluated.
    model: str
    # Hub dataset ID the model is evaluated on.
    dataset_name: str
    # Dataset configuration (sub-dataset) name.
    dataset_config: str
    # Dataset split used for evaluation, e.g. "test".
    dataset_split: str
    # Metric names; stored as a frozenset so the instance stays hashable
    # (see create_evaluation_info / filter_evaluated_models).
    metrics: frozenset
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def create_evaluation_info(dataset_info: DatasetInfo) -> "EvaluationInfo":
    """Build an `EvaluationInfo` from a prediction repo's card metadata.

    Args:
        dataset_info: Hub metadata object for an `autoevaluate/*` repo.

    Returns:
        An `EvaluationInfo` built from the repo's `eval_info` card metadata.
        (Fixed: the original signature wrongly declared `-> int`.)

    Raises:
        KeyError: if the card data has no `eval_info` section.

    NOTE(review): implicitly returns None when `cardData` is None — callers
    appear to assume card data is always present; confirm upstream.
    """
    if dataset_info.cardData is not None:
        metadata = dataset_info.cardData["eval_info"]
        # The column mapping is irrelevant for identifying a configuration.
        metadata.pop("col_mapping", None)
        # TODO(lewtun): populate dataset cards with metric info
        if "metrics" not in metadata:
            metadata["metrics"] = frozenset()
        else:
            # frozenset keeps EvaluationInfo hashable.
            metadata["metrics"] = frozenset(metadata["metrics"])
        return EvaluationInfo(**metadata)
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def get_evaluation_infos():
    """Collect an `EvaluationInfo` for every dataset under the `autoevaluate` org."""
    org_filter = DatasetFilter(author="autoevaluate")
    org_datasets = HfApi().list_datasets(filter=org_filter, full=True)
    return [create_evaluation_info(ds) for ds in org_datasets]
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def filter_evaluated_models(models, task, dataset_name, dataset_config, dataset_split, metrics):
    """Remove models that were already evaluated on this exact configuration.

    Mutates `models` in place and returns it; shows a Streamlit notice for
    every model that gets excluded.
    """
    existing_evaluations = get_evaluation_infos()
    frozen_metrics = frozenset(metrics)

    # Iterate over a shallow copy so removing entries from `models` is safe.
    for model in copy.copy(models):
        candidate = EvaluationInfo(
            task=task,
            model=model,
            dataset_name=dataset_name,
            dataset_config=dataset_config,
            dataset_split=dataset_split,
            metrics=frozen_metrics,
        )
        if candidate not in existing_evaluations:
            continue
        st.info(
            f"Model [`{model}`](https://huggingface.co/{model}) has already been evaluated on this configuration. \
This model will be excluded from the evaluation job..."
        )
        models.remove(model)

    return models
|
images/autotrain_job.png
ADDED
|
images/autotrain_projects.png
ADDED
|
notebooks/flush-prediction-repos.ipynb
ADDED
|
@@ -0,0 +1,177 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "markdown",
|
| 5 |
+
"id": "c8093b9e-ca6a-423d-96c3-5fe21f7109a1",
|
| 6 |
+
"metadata": {},
|
| 7 |
+
"source": [
|
| 8 |
+
"## Imports"
|
| 9 |
+
]
|
| 10 |
+
},
|
| 11 |
+
{
|
| 12 |
+
"cell_type": "code",
|
| 13 |
+
"execution_count": 1,
|
| 14 |
+
"id": "efe8cda7-a687-4867-b1f0-8efbcd428681",
|
| 15 |
+
"metadata": {},
|
| 16 |
+
"outputs": [],
|
| 17 |
+
"source": [
|
| 18 |
+
"import os\n",
|
| 19 |
+
"from pathlib import Path\n",
|
| 20 |
+
"\n",
|
| 21 |
+
"from dotenv import load_dotenv\n",
|
| 22 |
+
"from huggingface_hub import DatasetFilter, delete_repo, list_datasets\n",
|
| 23 |
+
"from tqdm.auto import tqdm\n",
|
| 24 |
+
"\n",
|
| 25 |
+
"if Path(\".env\").is_file():\n",
|
| 26 |
+
" load_dotenv(\".env\")\n",
|
| 27 |
+
"\n",
|
| 28 |
+
"HF_TOKEN = os.getenv(\"HF_TOKEN\")"
|
| 29 |
+
]
|
| 30 |
+
},
|
| 31 |
+
{
|
| 32 |
+
"cell_type": "markdown",
|
| 33 |
+
"id": "8f6e01f0-b658-451f-999c-e08d9f4bbbd3",
|
| 34 |
+
"metadata": {},
|
| 35 |
+
"source": [
|
| 36 |
+
"## Get all prediction repos from autoevaluate org"
|
| 37 |
+
]
|
| 38 |
+
},
|
| 39 |
+
{
|
| 40 |
+
"cell_type": "code",
|
| 41 |
+
"execution_count": 2,
|
| 42 |
+
"id": "2e369478-66d3-498d-a8fd-95bc9180f362",
|
| 43 |
+
"metadata": {},
|
| 44 |
+
"outputs": [],
|
| 45 |
+
"source": [
|
| 46 |
+
"def get_prediction_repos():\n",
|
| 47 |
+
" all_repos = list_datasets(author=\"autoevaluate\")\n",
|
| 48 |
+
" prediction_repos = [\n",
|
| 49 |
+
" repo for repo in all_repos if repo.id.split(\"/\")[1].startswith(\"autoeval-\")\n",
|
| 50 |
+
" ]\n",
|
| 51 |
+
" return prediction_repos"
|
| 52 |
+
]
|
| 53 |
+
},
|
| 54 |
+
{
|
| 55 |
+
"cell_type": "code",
|
| 56 |
+
"execution_count": 3,
|
| 57 |
+
"id": "542db019-d01f-42f5-bef4-888dae8eeadb",
|
| 58 |
+
"metadata": {},
|
| 59 |
+
"outputs": [
|
| 60 |
+
{
|
| 61 |
+
"data": {
|
| 62 |
+
"text/plain": [
|
| 63 |
+
"66"
|
| 64 |
+
]
|
| 65 |
+
},
|
| 66 |
+
"execution_count": 3,
|
| 67 |
+
"metadata": {},
|
| 68 |
+
"output_type": "execute_result"
|
| 69 |
+
}
|
| 70 |
+
],
|
| 71 |
+
"source": [
|
| 72 |
+
"prediction_repos = get_prediction_repos()\n",
|
| 73 |
+
"len(prediction_repos)"
|
| 74 |
+
]
|
| 75 |
+
},
|
| 76 |
+
{
|
| 77 |
+
"cell_type": "code",
|
| 78 |
+
"execution_count": 4,
|
| 79 |
+
"id": "331cfabf-4b73-490f-8d6a-86b5bc162666",
|
| 80 |
+
"metadata": {},
|
| 81 |
+
"outputs": [
|
| 82 |
+
{
|
| 83 |
+
"data": {
|
| 84 |
+
"text/plain": [
|
| 85 |
+
"DatasetInfo: {\n",
|
| 86 |
+
"\tid: autoevaluate/autoeval-staging-eval-project-9dcc51b5-6464670\n",
|
| 87 |
+
"\tsha: d3bb02be592d167f7a217ac9341d187142d9a90a\n",
|
| 88 |
+
"\tlastModified: 2022-06-13T14:54:34.000Z\n",
|
| 89 |
+
"\ttags: ['type:predictions', 'tags:autotrain', 'tags:evaluation', 'datasets:glue']\n",
|
| 90 |
+
"\tprivate: False\n",
|
| 91 |
+
"\tauthor: autoevaluate\n",
|
| 92 |
+
"\tdescription: None\n",
|
| 93 |
+
"\tcitation: None\n",
|
| 94 |
+
"\tcardData: None\n",
|
| 95 |
+
"\tsiblings: None\n",
|
| 96 |
+
"\tgated: False\n",
|
| 97 |
+
"\tdownloads: 12\n",
|
| 98 |
+
"}"
|
| 99 |
+
]
|
| 100 |
+
},
|
| 101 |
+
"execution_count": 4,
|
| 102 |
+
"metadata": {},
|
| 103 |
+
"output_type": "execute_result"
|
| 104 |
+
}
|
| 105 |
+
],
|
| 106 |
+
"source": [
|
| 107 |
+
"prediction_repos[0]"
|
| 108 |
+
]
|
| 109 |
+
},
|
| 110 |
+
{
|
| 111 |
+
"cell_type": "markdown",
|
| 112 |
+
"id": "57a86b69-ffe8-4035-8f3d-5c917d8ce7bf",
|
| 113 |
+
"metadata": {},
|
| 114 |
+
"source": [
|
| 115 |
+
"## Delete all prediction repos"
|
| 116 |
+
]
|
| 117 |
+
},
|
| 118 |
+
{
|
| 119 |
+
"cell_type": "code",
|
| 120 |
+
"execution_count": 5,
|
| 121 |
+
"id": "6c8e23e7-2a6d-437b-9742-17f37684d9eb",
|
| 122 |
+
"metadata": {},
|
| 123 |
+
"outputs": [
|
| 124 |
+
{
|
| 125 |
+
"data": {
|
| 126 |
+
"application/vnd.jupyter.widget-view+json": {
|
| 127 |
+
"model_id": "06fa304dcc6d44e39205b20a5e488052",
|
| 128 |
+
"version_major": 2,
|
| 129 |
+
"version_minor": 0
|
| 130 |
+
},
|
| 131 |
+
"text/plain": [
|
| 132 |
+
" 0%| | 0/66 [00:00<?, ?it/s]"
|
| 133 |
+
]
|
| 134 |
+
},
|
| 135 |
+
"metadata": {},
|
| 136 |
+
"output_type": "display_data"
|
| 137 |
+
}
|
| 138 |
+
],
|
| 139 |
+
"source": [
|
| 140 |
+
"for repo in tqdm(prediction_repos):\n",
|
| 141 |
+
" delete_repo(\n",
|
| 142 |
+
" repo_id=repo.id,\n",
|
| 143 |
+
" repo_type=\"dataset\",\n",
|
| 144 |
+
" )"
|
| 145 |
+
]
|
| 146 |
+
},
|
| 147 |
+
{
|
| 148 |
+
"cell_type": "code",
|
| 149 |
+
"execution_count": null,
|
| 150 |
+
"id": "7d64b0aa-d05f-4497-9bd2-eb2fc0d8bd7a",
|
| 151 |
+
"metadata": {},
|
| 152 |
+
"outputs": [],
|
| 153 |
+
"source": []
|
| 154 |
+
}
|
| 155 |
+
],
|
| 156 |
+
"metadata": {
|
| 157 |
+
"kernelspec": {
|
| 158 |
+
"display_name": "autoevaluate",
|
| 159 |
+
"language": "python",
|
| 160 |
+
"name": "autoevaluate"
|
| 161 |
+
},
|
| 162 |
+
"language_info": {
|
| 163 |
+
"codemirror_mode": {
|
| 164 |
+
"name": "ipython",
|
| 165 |
+
"version": 3
|
| 166 |
+
},
|
| 167 |
+
"file_extension": ".py",
|
| 168 |
+
"mimetype": "text/x-python",
|
| 169 |
+
"name": "python",
|
| 170 |
+
"nbconvert_exporter": "python",
|
| 171 |
+
"pygments_lexer": "ipython3",
|
| 172 |
+
"version": "3.8.13"
|
| 173 |
+
}
|
| 174 |
+
},
|
| 175 |
+
"nbformat": 4,
|
| 176 |
+
"nbformat_minor": 5
|
| 177 |
+
}
|
pyproject.toml
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[tool.isort]
|
| 2 |
+
profile = "black"
|
requirements.txt
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
huggingface-hub<0.8
|
| 2 |
+
python-dotenv
|
| 3 |
+
streamlit==1.10.0
|
| 4 |
+
datasets<2.3
|
| 5 |
+
evaluate<0.2
|
| 6 |
+
jsonlines
|
| 7 |
+
typer
|
| 8 |
+
# Dataset specific deps
|
| 9 |
+
py7zr<0.19
|
| 10 |
+
openpyxl<3.1
|
| 11 |
+
# Dirty bug from Google
|
| 12 |
+
protobuf<=3.20.1
|
run_evaluation_jobs.py
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
|
| 4 |
+
import typer
|
| 5 |
+
from datasets import load_dataset
|
| 6 |
+
from dotenv import load_dotenv
|
| 7 |
+
|
| 8 |
+
from utils import http_get, http_post
|
| 9 |
+
|
| 10 |
+
# Load local overrides for environment variables when developing locally.
if Path(".env").is_file():
    load_dotenv(".env")

HF_TOKEN = os.getenv("HF_TOKEN")
AUTOTRAIN_USERNAME = os.getenv("AUTOTRAIN_USERNAME")
AUTOTRAIN_BACKEND_API = os.getenv("AUTOTRAIN_BACKEND_API")

# Derive the AutoTrain environment name from the backend URL.
AUTOTRAIN_ENV = "staging" if "staging" in AUTOTRAIN_BACKEND_API else "prod"
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def main():
    """Approve and launch all pending evaluation jobs logged on the Hub.

    Loads the `evaluation-job-logs` dataset, keeps project IDs matching the
    current AutoTrain environment, and starts training for every project whose
    data processing has completed (status == 3) but whose evaluation has not
    yet started. Failures are collected and reported at the end rather than
    aborting the batch.
    """
    print(f"💡 Starting jobs on {AUTOTRAIN_ENV} environment")
    logs_df = load_dataset("autoevaluate/evaluation-job-logs", use_auth_token=HF_TOKEN, split="train").to_pandas()
    # Filter out legacy AutoTrain submissions prior to project approvals requirement
    projects_df = logs_df.copy()[(~logs_df["project_id"].isnull())]
    # Filter IDs for appropriate AutoTrain env (staging vs prod)
    projects_df = projects_df.copy().query(f"autotrain_env == '{AUTOTRAIN_ENV}'")
    projects_to_approve = projects_df["project_id"].astype(int).tolist()
    failed_approvals = []
    print(f"🚀 Found {len(projects_to_approve)} evaluation projects to approve!")

    for project_id in projects_to_approve:
        print(f"Attempting to evaluate project ID {project_id} ...")
        try:
            project_info = http_get(
                path=f"/projects/{project_id}",
                token=HF_TOKEN,
                domain=AUTOTRAIN_BACKEND_API,
            ).json()
            print(project_info)
            # Only start evaluation for projects with completed data processing (status=3)
            if project_info["status"] == 3 and project_info["training_status"] == "not_started":
                train_job_resp = http_post(
                    path=f"/projects/{project_id}/start_training",
                    token=HF_TOKEN,
                    domain=AUTOTRAIN_BACKEND_API,
                ).json()
                print(f"🤖 Project {project_id} approval response: {train_job_resp}")
            else:
                print(f"💪 Project {project_id} either not ready or has already been evaluated. Skipping ...")
        except Exception as e:
            # Record the failure and continue so one bad project doesn't abort the batch.
            # (Removed a redundant `pass` that followed the append.)
            print(f"There was a problem obtaining the project info for project ID {project_id}")
            print(f"Error message: {e}")
            failed_approvals.append(project_id)

    if len(failed_approvals) > 0:
        print(f"🚨 Failed to approve {len(failed_approvals)} projects: {failed_approvals}")
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
if __name__ == "__main__":
|
| 64 |
+
typer.run(main)
|
utils.py
ADDED
|
@@ -0,0 +1,215 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import inspect
|
| 2 |
+
import uuid
|
| 3 |
+
from typing import Dict, List, Union
|
| 4 |
+
|
| 5 |
+
import jsonlines
|
| 6 |
+
import requests
|
| 7 |
+
import streamlit as st
|
| 8 |
+
from evaluate import load
|
| 9 |
+
from huggingface_hub import HfApi, ModelFilter, Repository, dataset_info, list_metrics
|
| 10 |
+
from tqdm import tqdm
|
| 11 |
+
|
| 12 |
+
# Maps each AutoTrain task identifier to the corresponding Hub task tag.
AUTOTRAIN_TASK_TO_HUB_TASK = {
    "binary_classification": "text-classification",
    "multi_class_classification": "text-classification",
    "natural_language_inference": "text-classification",
    "entity_extraction": "token-classification",
    "extractive_question_answering": "question-answering",
    "translation": "translation",
    "summarization": "summarization",
    "image_binary_classification": "image-classification",
    "image_multi_class_classification": "image-classification",
    "text_zero_shot_classification": "text-generation",
}


# Reverse lookup from Hub task tag back to an AutoTrain task.
# NOTE(review): several AutoTrain tasks share one Hub task (e.g. the three
# classification variants all map to "text-classification"), so this inversion
# keeps only the LAST entry for each Hub task — confirm that is intended.
HUB_TASK_TO_AUTOTRAIN_TASK = {v: k for k, v in AUTOTRAIN_TASK_TO_HUB_TASK.items()}
# Name of the private Hub dataset repo where evaluation job logs are stored.
LOGS_REPO = "evaluation-job-logs"
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def get_auth_headers(token: str, prefix: str = "Bearer"):
    """Build the HTTP `Authorization` header used for Hub/AutoTrain requests."""
    header_value = " ".join([prefix, token])
    return {"Authorization": header_value}
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def http_post(path: str, token: str, payload=None, domain: str = None, params=None) -> requests.Response:
    """HTTP POST request to the AutoNLP API.

    Args:
        path: API path, appended to `domain`.
        token: bearer token for the Authorization header.
        payload: JSON body to send.
        domain: base URL of the API.
        params: optional query parameters.

    Returns:
        The `requests.Response`; raises `requests.HTTPError` on 4xx/5xx.
    """
    try:
        response = requests.post(
            url=domain + path,
            json=payload,
            headers=get_auth_headers(token=token),
            allow_redirects=True,
            params=params,
        )
    except requests.exceptions.ConnectionError:
        print("❌ Failed to reach AutoNLP API, check your internet connection")
        # BUGFIX: `response` is unbound when the connection fails, so the
        # fall-through `response.raise_for_status()` raised UnboundLocalError.
        # Re-raise the original ConnectionError instead.
        raise
    response.raise_for_status()
    return response
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def http_get(path: str, domain: str, token: str = None, params: dict = None) -> requests.Response:
    """HTTP GET request to `path` on `domain`.

    (Fixed docstring: the original said "POST".)

    Args:
        path: API path, appended to `domain`.
        domain: base URL of the API.
        token: optional bearer token for the Authorization header.
        params: optional query parameters.

    Returns:
        The `requests.Response`; raises `requests.HTTPError` on 4xx/5xx.
    """
    try:
        response = requests.get(
            url=domain + path,
            headers=get_auth_headers(token=token),
            allow_redirects=True,
            params=params,
        )
    except requests.exceptions.ConnectionError:
        print(f"❌ Failed to reach {path}, check your internet connection")
        # BUGFIX: `response` is unbound when the connection fails, so the
        # fall-through `response.raise_for_status()` raised UnboundLocalError.
        # Re-raise the original ConnectionError instead.
        raise
    response.raise_for_status()
    return response
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
def get_metadata(dataset_name: str, token: str) -> Union[Dict, None]:
    """Return a dataset's `train-eval-index` card metadata, or None if absent."""
    data = dataset_info(dataset_name, token=token)
    card_data = data.cardData
    if card_data is None or "train-eval-index" not in card_data.keys():
        return None
    return card_data["train-eval-index"]
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
def get_compatible_models(task: str, dataset_ids: List[str]) -> List[str]:
    """
    Returns all model IDs that are compatible with the given task and datasets.

    Args:
        task (`str`): The AutoTrain task to search for.
        dataset_ids (`List[str]`): A list of dataset IDs to search for.
            (Fixed docstring: previously documented as `dataset_names`.)

    Returns:
        A list of unique model IDs, sorted alphabetically.
    """
    compatible_models = []
    # Allow any summarization model to be used for summarization tasks
    # and allow any text-generation model to be used for text_zero_shot_classification
    if task in ("summarization", "text_zero_shot_classification"):
        model_filter = ModelFilter(
            task=AUTOTRAIN_TASK_TO_HUB_TASK[task],
            library=["transformers", "pytorch"],
        )
        compatible_models.extend(HfApi().list_models(filter=model_filter))
    # Work on a copy so the caller's list is not mutated (BUGFIX: the original
    # extended `dataset_ids` in place for question-answering tasks).
    dataset_ids = list(dataset_ids)
    # Include models trained on SQuAD datasets, since these can be evaluated on
    # other SQuAD-like datasets
    if task == "extractive_question_answering":
        dataset_ids.extend(["squad", "squad_v2"])

    # TODO: relax filter on PyTorch models if TensorFlow supported in AutoTrain
    for dataset_id in dataset_ids:
        model_filter = ModelFilter(
            task=AUTOTRAIN_TASK_TO_HUB_TASK[task],
            trained_dataset=dataset_id,
            library=["transformers", "pytorch"],
        )
        compatible_models.extend(HfApi().list_models(filter=model_filter))
    return sorted(set([model.modelId for model in compatible_models]))
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
def get_key(col_mapping, val):
    """Return the first key in `col_mapping` whose value equals `val`.

    Falls back to the sentinel string "key doesn't exist" when nothing matches.
    """
    matching_keys = (key for key, value in col_mapping.items() if value == val)
    return next(matching_keys, "key doesn't exist")
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
def format_col_mapping(col_mapping: dict) -> dict:
    """Flatten the nested `answers` entry of a column mapping in place.

    e.g. {"answers": {"text": "t"}} becomes {"answers.text": "answers.t"}.
    Mutates and returns `col_mapping`.
    """
    flattened = {f"answers.{k}": f"answers.{v}" for k, v in col_mapping["answers"].items()}
    col_mapping.update(flattened)
    del col_mapping["answers"]
    return col_mapping
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
def commit_evaluation_log(evaluation_log, hf_access_token=None):
    """Append one evaluation log entry to the private logs dataset on the Hub.

    Clones/pulls the logs repo, rewrites `logs.jsonl` with the new entry
    appended, and pushes the result with a descriptive commit message.
    """
    logs_repo_url = f"https://huggingface.co/datasets/autoevaluate/{LOGS_REPO}"
    logs_repo = Repository(
        local_dir=LOGS_REPO,
        clone_from=logs_repo_url,
        repo_type="dataset",
        private=True,
        use_auth_token=hf_access_token,
    )
    # Sync first so we append to the latest version of the log file.
    logs_repo.git_pull()
    with jsonlines.open(f"{LOGS_REPO}/logs.jsonl") as reader:
        entries = list(reader)

    entries.append(evaluation_log)
    with jsonlines.open(f"{LOGS_REPO}/logs.jsonl", mode="w") as writer:
        for entry in entries:
            writer.write(entry)
    logs_repo.push_to_hub(
        commit_message=f"Evaluation submitted with project name {evaluation_log['payload']['proj_name']}"
    )
    print("INFO -- Pushed evaluation logs to the Hub")
|
| 147 |
+
|
| 148 |
+
|
| 149 |
+
@st.experimental_memo
def get_supported_metrics():
    """Helper function to get all metrics compatible with evaluation service.

    Requires all metric dependencies installed in the same environment, so wait until
    https://github.com/huggingface/evaluate/issues/138 is resolved before using this.

    Returns:
        A list of metric IDs whose `compute` signature takes keyword-only
        `references` and `predictions` arguments.
    """
    metrics = [metric.id for metric in list_metrics()]
    supported_metrics = []
    for metric in tqdm(metrics):
        # TODO: this currently requires all metric dependencies to be installed
        # in the same environment. Refactor to avoid needing to actually load
        # the metric.
        try:
            print(f"INFO -- Attempting to load metric: {metric}")
            metric_func = load(metric)
        except Exception as e:
            # Metrics with missing dependencies simply get skipped.
            print(e)
            print("WARNING -- Skipping the following metric, which cannot load:", metric)
            continue

        # Inspect the metric's `compute` signature to decide compatibility.
        argspec = inspect.getfullargspec(metric_func.compute)
        if "references" in argspec.kwonlyargs and "predictions" in argspec.kwonlyargs:
            # We require that "references" and "predictions" are arguments
            # to the metric function. We also require that the other arguments
            # besides "references" and "predictions" have defaults and so do not
            # need to be specified explicitly.
            # NOTE(review): `argspec.kwonlydefaults` is None when no keyword-only
            # argument has a default (this would raise AttributeError here), and
            # a default of None is treated as "no usable default" — confirm both
            # behaviours are intended.
            defaults = True
            for key, value in argspec.kwonlydefaults.items():
                if key not in ("references", "predictions"):
                    if value is None:
                        defaults = False
                        break

            if defaults:
                supported_metrics.append(metric)
    return supported_metrics
|
| 186 |
+
|
| 187 |
+
|
| 188 |
+
def get_dataset_card_url(dataset_id: str) -> str:
    """Gets the URL to edit the dataset card for the given dataset ID."""
    is_community_dataset = "/" in dataset_id
    if is_community_dataset:
        # Community datasets live on the Hub under <namespace>/<name>.
        return f"https://huggingface.co/datasets/{dataset_id}/edit/main/README.md"
    # Canonical datasets have their cards in the datasets GitHub repo.
    return f"https://github.com/huggingface/datasets/edit/master/datasets/{dataset_id}/README.md"
|
| 194 |
+
|
| 195 |
+
|
| 196 |
+
def create_autotrain_project_name(dataset_id: str, dataset_config: str) -> str:
    """Creates an AutoTrain project name for the given dataset ID."""
    # Project names cannot have "/", so we need to format community datasets accordingly
    safe_dataset_id = dataset_id.replace("/", "__")
    safe_config = dataset_config.replace("--", "__")
    # Project names need to be unique, so we append a random string to guarantee this while adhering to naming rules
    basename = f"eval-{safe_dataset_id}-{safe_config}"[:60]  # Hub naming limitation
    unique_suffix = str(uuid.uuid4())[:6]
    return f"{basename}-{unique_suffix}"
|
| 205 |
+
|
| 206 |
+
|
| 207 |
+
def get_config_metadata(config: str, metadata: List[Dict] = None) -> Union[Dict, None]:
    """Gets the dataset card metadata for the given config.

    Returns the first matching entry, or None when `metadata` is absent or
    contains no entry for `config`.
    """
    if metadata is None:
        return None
    matches = (entry for entry in metadata if entry["config"] == config)
    return next(matches, None)
|