Initial: HumanSignal gliner example patched for HF Spaces
Browse files- Dockerfile +48 -0
- README.md +123 -5
- _wsgi.py +122 -0
- model.py +261 -0
- requirements-base.txt +2 -0
- requirements-test.txt +2 -0
- requirements.txt +5 -0
- test_api.py +68 -0
Dockerfile
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# syntax=docker/dockerfile:1
|
| 2 |
+
ARG PYTHON_VERSION=3.11
|
| 3 |
+
|
| 4 |
+
FROM python:${PYTHON_VERSION}-slim AS python-base
|
| 5 |
+
ARG TEST_ENV
|
| 6 |
+
|
| 7 |
+
WORKDIR /app
|
| 8 |
+
|
| 9 |
+
ENV PYTHONUNBUFFERED=1 \
|
| 10 |
+
PYTHONDONTWRITEBYTECODE=1 \
|
| 11 |
+
PORT=${PORT:-9090} \
|
| 12 |
+
PIP_CACHE_DIR=/.cache \
|
| 13 |
+
WORKERS=1 \
|
| 14 |
+
THREADS=8
|
| 15 |
+
|
| 16 |
+
# Update the base OS
|
| 17 |
+
RUN --mount=type=cache,target="/var/cache/apt",sharing=locked \
|
| 18 |
+
--mount=type=cache,target="/var/lib/apt/lists",sharing=locked \
|
| 19 |
+
set -eux; \
|
| 20 |
+
apt-get update; \
|
| 21 |
+
apt-get upgrade -y; \
|
| 22 |
+
apt install --no-install-recommends -y \
|
| 23 |
+
git; \
|
| 24 |
+
apt-get autoremove -y
|
| 25 |
+
|
| 26 |
+
# install base requirements
|
| 27 |
+
COPY requirements-base.txt .
|
| 28 |
+
RUN --mount=type=cache,target=${PIP_CACHE_DIR},sharing=locked \
|
| 29 |
+
pip install -r requirements-base.txt
|
| 30 |
+
|
| 31 |
+
# install custom requirements
|
| 32 |
+
COPY requirements.txt .
|
| 33 |
+
RUN --mount=type=cache,target=${PIP_CACHE_DIR},sharing=locked \
|
| 34 |
+
pip install -r requirements.txt
|
| 35 |
+
|
| 36 |
+
# install test requirements if needed
|
| 37 |
+
COPY requirements-test.txt .
|
| 38 |
+
# build only when TEST_ENV="true"
|
| 39 |
+
RUN --mount=type=cache,target=${PIP_CACHE_DIR},sharing=locked \
|
| 40 |
+
if [ "$TEST_ENV" = "true" ]; then \
|
| 41 |
+
pip install -r requirements-test.txt; \
|
| 42 |
+
fi
|
| 43 |
+
|
| 44 |
+
COPY . .
|
| 45 |
+
|
| 46 |
+
EXPOSE 9090
|
| 47 |
+
|
| 48 |
+
CMD gunicorn --preload --bind :$PORT --workers $WORKERS --threads $THREADS --timeout 0 _wsgi:app
|
README.md
CHANGED
|
@@ -1,10 +1,128 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: docker
|
|
|
|
| 7 |
pinned: false
|
|
|
|
|
|
|
| 8 |
---
|
| 9 |
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: LS GLiNER Backend
|
| 3 |
+
emoji: 🪄
|
| 4 |
+
colorFrom: pink
|
| 5 |
+
colorTo: purple
|
| 6 |
sdk: docker
|
| 7 |
+
app_port: 9090
|
| 8 |
pinned: false
|
| 9 |
+
license: apache-2.0
|
| 10 |
+
short_description: GLiNER zero-shot NER as a Label Studio ML backend
|
| 11 |
---
|
| 12 |
|
| 13 |
+
# LS GLiNER Backend (Hugging Face Spaces)
|
| 14 |
+
|
| 15 |
+
This Space wraps HumanSignal's [`gliner` ML backend example](https://github.com/HumanSignal/label-studio-ml-backend/tree/master/label_studio_ml/examples/gliner) as a Hugging Face Space. GLiNER is a zero-shot NER model — it accepts arbitrary user-defined labels at inference time, so it can predict any label your LS project's config defines without retraining.
|
| 16 |
+
|
| 17 |
+
Default model: `urchade/gliner_medium-v2.1` (~750MB). Override via `GLINER_MODEL_NAME` env var.
|
| 18 |
+
|
| 19 |
+
**Patches from the upstream example (minimal):**
|
| 20 |
+
- Added Spaces SDK frontmatter at the top of this README.
|
| 21 |
+
- Removed `docker-compose.yml` (not used on Spaces).
|
| 22 |
+
|
| 23 |
+
Connect from Label Studio: set the ML backend URL to `https://davanstrien-ls-gliner-backend.hf.space`.
|
| 24 |
+
|
| 25 |
+
---
|
| 26 |
+
|
| 27 |
+
<!-- Original upstream README below -->
|
| 28 |
+
|
| 29 |
+
<!--
|
| 30 |
+
---
|
| 31 |
+
title: Use GLiNER for NER annotation
|
| 32 |
+
type: guide
|
| 33 |
+
tier: all
|
| 34 |
+
order: 37
|
| 35 |
+
hide_menu: true
|
| 36 |
+
hide_frontmatter_title: true
|
| 37 |
+
meta_title: Use GLiNER for NER annotation
|
| 38 |
+
meta_description: Tutorial on how to use GLiNER with your Label Studio project to complete NER tasks
|
| 39 |
+
categories:
|
| 40 |
+
- Natural Language Processing
|
| 41 |
+
- Named Entity Recognition
|
| 42 |
+
- GLiNER
|
| 43 |
+
- BERT
|
| 44 |
+
- Hugging Face
|
| 45 |
+
image: "/guide/ml_tutorials/gliner.png"
|
| 46 |
+
---
|
| 47 |
+
-->
|
| 48 |
+
|
| 49 |
+
# Use GLiNER for NER annotation
|
| 50 |
+
|
| 51 |
+
The GLiNER model is a BERT family model for generalist NER. We download the model from HuggingFace, but the original
|
| 52 |
+
model is
|
| 53 |
+
available on [GitHub](https://github.com/urchade/GLiNER).
|
| 54 |
+
|
| 55 |
+
## Before you begin
|
| 56 |
+
|
| 57 |
+
Before you begin, you must install the [Label Studio ML backend](https://github.com/HumanSignal/label-studio-ml-backend?tab=readme-ov-file#quickstart).
|
| 58 |
+
|
| 59 |
+
This tutorial uses the [`gliner` example](https://github.com/HumanSignal/label-studio-ml-backend/tree/master/label_studio_ml/examples/gliner).
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
## Running with Docker (recommended)
|
| 63 |
+
|
| 64 |
+
1. Start Machine Learning backend on `http://localhost:9090` with prebuilt image:
|
| 65 |
+
|
| 66 |
+
```bash
|
| 67 |
+
docker-compose up
|
| 68 |
+
```
|
| 69 |
+
|
| 70 |
+
2. Validate that backend is running
|
| 71 |
+
|
| 72 |
+
```bash
|
| 73 |
+
$ curl http://localhost:9090/
|
| 74 |
+
{"status":"UP"}
|
| 75 |
+
```
|
| 76 |
+
|
| 77 |
+
3. Create a project in Label Studio. Then from the **Model** page in the project settings, [connect the model](https://labelstud.io/guide/ml#Connect-the-model-to-Label-Studio). The default URL is `http://localhost:9090`.
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
## Building from source (advanced)
|
| 81 |
+
|
| 82 |
+
To build the ML backend from source, you have to clone the repository and build the Docker image:
|
| 83 |
+
|
| 84 |
+
```bash
|
| 85 |
+
docker-compose build
|
| 86 |
+
```
|
| 87 |
+
|
| 88 |
+
## Running without Docker (advanced)
|
| 89 |
+
|
| 90 |
+
To run the ML backend without Docker, you have to clone the repository and install all dependencies using pip:
|
| 91 |
+
|
| 92 |
+
```bash
|
| 93 |
+
python -m venv ml-backend
|
| 94 |
+
source ml-backend/bin/activate
|
| 95 |
+
pip install -r requirements.txt
|
| 96 |
+
```
|
| 97 |
+
|
| 98 |
+
Then you can start the ML backend:
|
| 99 |
+
|
| 100 |
+
```bash
|
| 101 |
+
label-studio-ml start ./dir_with_your_model
|
| 102 |
+
```
|
| 103 |
+
|
| 104 |
+
## Configuration
|
| 105 |
+
|
| 106 |
+
Parameters can be set in `docker-compose.yml` before running the container.
|
| 107 |
+
|
| 108 |
+
The following common parameters are available:
|
| 109 |
+
- `BASIC_AUTH_USER` - Specify the basic auth user for the model server.
|
| 110 |
+
- `BASIC_AUTH_PASS` - Specify the basic auth password for the model server.
|
| 111 |
+
- `LOG_LEVEL` - Set the log level for the model server.
|
| 112 |
+
- `WORKERS` - Specify the number of workers for the model server.
|
| 113 |
+
- `THREADS` - Specify the number of threads for the model server.
|
| 114 |
+
- `LABEL_STUDIO_URL` - Specify the URL of your Label Studio instance. Note that this might need to be `http://host.docker.internal:8080` if you are running Label Studio on another Docker container.
|
| 115 |
+
- `LABEL_STUDIO_API_KEY` - Specify the API key for authenticating your Label Studio instance. You can find this by logging into Label Studio and [going to the **Account & Settings** page](https://labelstud.io/guide/user_account#Access-token).
|
| 116 |
+
|
| 117 |
+
## A Note on Model Training
|
| 118 |
+
|
| 119 |
+
If you plan to use a webhook to train this model on "Start Training", note that you do
|
| 120 |
+
not need to configure a separate webhook. Instead, go to the three dots next to your model
|
| 121 |
+
on the Model tab in your project settings and click "start training".
|
| 122 |
+
|
| 123 |
+
Additionally, note that this container has been set for a **VERY SMALL** demo set, with only 1
|
| 124 |
+
non-eval sample (we expect the first 10 data samples to be for evaluation.)
|
| 125 |
+
|
| 126 |
+
If you're working with a larger dataset, be sure to:
|
| 127 |
+
1. update num_steps and batch size to the number of training steps you want and the batch size that works for your dataset.
|
| 128 |
+
2. change the uploaded model after training (line 239 of `model.py`) to the highest checkpoint that you have.
|
_wsgi.py
ADDED
|
@@ -0,0 +1,122 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import argparse
|
| 3 |
+
import json
|
| 4 |
+
import logging
|
| 5 |
+
import logging.config
|
| 6 |
+
|
| 7 |
+
logging.config.dictConfig({
|
| 8 |
+
"version": 1,
|
| 9 |
+
"disable_existing_loggers": False,
|
| 10 |
+
"formatters": {
|
| 11 |
+
"standard": {
|
| 12 |
+
"format": "[%(asctime)s] [%(levelname)s] [%(name)s::%(funcName)s::%(lineno)d] %(message)s"
|
| 13 |
+
}
|
| 14 |
+
},
|
| 15 |
+
"handlers": {
|
| 16 |
+
"console": {
|
| 17 |
+
"class": "logging.StreamHandler",
|
| 18 |
+
"level": os.getenv('LOG_LEVEL'),
|
| 19 |
+
"stream": "ext://sys.stdout",
|
| 20 |
+
"formatter": "standard"
|
| 21 |
+
}
|
| 22 |
+
},
|
| 23 |
+
"root": {
|
| 24 |
+
"level": os.getenv('LOG_LEVEL'),
|
| 25 |
+
"handlers": [
|
| 26 |
+
"console"
|
| 27 |
+
],
|
| 28 |
+
"propagate": True
|
| 29 |
+
}
|
| 30 |
+
})
|
| 31 |
+
|
| 32 |
+
from label_studio_ml.api import init_app
|
| 33 |
+
from model import GLiNERModel
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
_DEFAULT_CONFIG_PATH = os.path.join(os.path.dirname(__file__), 'config.json')
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def get_kwargs_from_config(config_path=_DEFAULT_CONFIG_PATH):
    """Load model-initialisation kwargs from a JSON config file.

    :param config_path: path of the JSON file to read
    :return: the parsed JSON object, or an empty dict when the file does not exist
    :raises AssertionError: if the top-level JSON value is not an object
    """
    if os.path.exists(config_path):
        with open(config_path) as config_file:
            loaded = json.load(config_file)
        assert isinstance(loaded, dict)
        return loaded
    # No config file present: there are no extra kwargs to pass on.
    return dict()
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
if __name__ == "__main__":
    # Script entry point: parse command-line options, optionally validate the model,
    # then build the app and serve it via ``app.run``.

    def key_value(arg):
        """Split one ``KEY=VAL`` command-line token into a ``[key, value]`` pair."""
        # Partition on the first '=' only, so values may themselves contain '='.
        key, sep, value = arg.partition('=')
        if not sep or not key:
            # Let argparse report a usage error instead of failing later on unpacking.
            raise argparse.ArgumentTypeError(f'expected KEY=VAL, got {arg!r}')
        return [key, value]

    parser = argparse.ArgumentParser(description='Label studio')
    parser.add_argument(
        '-p', '--port', dest='port', type=int, default=9090,
        help='Server port')
    parser.add_argument(
        '--host', dest='host', type=str, default='0.0.0.0',
        help='Server host')
    parser.add_argument(
        '--kwargs', '--with', dest='kwargs', metavar='KEY=VAL', nargs='+', type=key_value,
        help='Additional LabelStudioMLBase model initialization kwargs')
    parser.add_argument(
        '-d', '--debug', dest='debug', action='store_true',
        help='Switch debug mode')
    parser.add_argument(
        '--log-level', dest='log_level', choices=['DEBUG', 'INFO', 'WARNING', 'ERROR'], default=None,
        help='Logging level')
    parser.add_argument(
        '--model-dir', dest='model_dir', default=os.path.dirname(__file__),
        help='Directory where models are stored (relative to the project directory)')
    parser.add_argument(
        '--check', dest='check', action='store_true',
        help='Validate model instance before launching server')
    parser.add_argument('--basic-auth-user',
                        default=os.environ.get('ML_SERVER_BASIC_AUTH_USER', None),
                        help='Basic auth user')

    parser.add_argument('--basic-auth-pass',
                        default=os.environ.get('ML_SERVER_BASIC_AUTH_PASS', None),
                        help='Basic auth pass')

    args = parser.parse_args()

    # setup logging level
    if args.log_level:
        logging.root.setLevel(args.log_level)

    def isfloat(value):
        """Return True when ``value`` can be converted with ``float()``."""
        try:
            float(value)
            return True
        except ValueError:
            return False

    def parse_kwargs():
        """Convert the ``--kwargs`` pairs to a dict, coercing ints, booleans and floats."""
        param = dict()
        for k, v in args.kwargs:
            if v.isdigit():
                param[k] = int(v)
            elif v == 'True' or v == 'true':
                param[k] = True
            elif v == 'False' or v == 'false':
                param[k] = False
            elif isfloat(v):
                param[k] = float(v)
            else:
                # Anything else is passed through as a plain string.
                param[k] = v
        return param

    # Values from the config file come first; command-line kwargs override them.
    kwargs = get_kwargs_from_config()

    if args.kwargs:
        kwargs.update(parse_kwargs())

    if args.check:
        print('Check "' + GLiNERModel.__name__ + '" instance creation..')
        model = GLiNERModel(**kwargs)

    app = init_app(model_class=GLiNERModel, basic_auth_user=args.basic_auth_user, basic_auth_pass=args.basic_auth_pass)

    app.run(host=args.host, port=args.port, debug=args.debug)

else:
    # Imported as a module (e.g. ``_wsgi:app`` by a WSGI server): just expose ``app``.
    app = init_app(model_class=GLiNERModel)
|
model.py
ADDED
|
@@ -0,0 +1,261 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
import os
|
| 3 |
+
from math import floor
|
| 4 |
+
from typing import List, Dict, Optional
|
| 5 |
+
import pathlib
|
| 6 |
+
|
| 7 |
+
import label_studio_sdk
|
| 8 |
+
from gliner import GLiNER
|
| 9 |
+
from gliner.data_processing.collator import DataCollator
|
| 10 |
+
from gliner.training import Trainer, TrainingArguments
|
| 11 |
+
from label_studio_sdk.label_interface.objects import PredictionValue
|
| 12 |
+
|
| 13 |
+
from label_studio_ml.model import LabelStudioMLBase
|
| 14 |
+
from label_studio_ml.response import ModelResponse
|
| 15 |
+
|
| 16 |
+
logger = logging.getLogger(__name__)
|
| 17 |
+
|
| 18 |
+
GLINER_MODEL_NAME = os.getenv("GLINER_MODEL_NAME", "urchade/gliner_medium-v2.1")
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
class GLiNERModel(LabelStudioMLBase):
|
| 22 |
+
"""
|
| 23 |
+
Custom ML Backend for GLiNER model
|
| 24 |
+
"""
|
| 25 |
+
|
| 26 |
+
def setup(self):
|
| 27 |
+
"""Configure any parameters of your model here
|
| 28 |
+
"""
|
| 29 |
+
self.LABEL_STUDIO_HOST = os.getenv('LABEL_STUDIO_URL', 'http://localhost:8080')
|
| 30 |
+
self.LABEL_STUDIO_API_KEY = os.getenv('LABEL_STUDIO_API_KEY')
|
| 31 |
+
self.MODEL_DIR = os.getenv("MODEL_DIR", "/data/models")
|
| 32 |
+
self.finetuned_model_path = os.getenv("FINETUNED_MODEL_PATH", f"models/checkpoint-10")
|
| 33 |
+
self.threshold = float(os.getenv('THRESHOLD', 0.5))
|
| 34 |
+
self.model = None
|
| 35 |
+
|
| 36 |
+
def lazy_init(self):
    """Load the GLiNER model the first time it is needed.

    Does nothing when ``self.model`` is already set. Otherwise it first tries the
    fine-tuned checkpoint at ``MODEL_DIR/finetuned_model_path`` (local files only)
    and records model version ``v0.0.2``. If that fails for any reason, it falls
    back to ``GLINER_MODEL_NAME`` and records model version ``v0.0.1``.
    """
    if self.model:
        return

    try:
        logger.info(f"Loading Pretrained Model from {self.finetuned_model_path}")
        self.model = GLiNER.from_pretrained(str(pathlib.Path(self.MODEL_DIR, self.finetuned_model_path)), local_files_only=True)
        self.set("model_version", f'{self.__class__.__name__}-v0.0.2')
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that KeyboardInterrupt and
        # SystemExit are not swallowed while a model is loading.
        # Keep the reason available for debugging instead of discarding it.
        logger.debug("Could not load the fine-tuned checkpoint", exc_info=True)
        # If no finetuned model, use default
        logger.info(f"No Pretrained Model Found. Loading GLINER model {GLINER_MODEL_NAME}")
        self.model = GLiNER.from_pretrained(GLINER_MODEL_NAME)
        self.set("model_version", f'{self.__class__.__name__}-v0.0.1')
|
| 48 |
+
|
| 49 |
+
def convert_to_ls_annotation(self, prediction, from_name, to_name):
    """
    Convert GLiNER output for one text into the Label Studio prediction format.

    :param prediction: iterable of entity dicts with 'label', 'score', 'start', 'end' and 'text' keys
    :param from_name: name of the labeling tag the regions are attached to
    :param to_name: name of the object tag the regions refer to
    :return: a list holding a single PredictionValue with one region per entity
    """

    def as_region(ent):
        # Label Studio expects the label wrapped in a list.
        labels = [ent['label']]
        confidence = ent['score']
        return {
            'from_name': from_name,
            'to_name': to_name,
            'type': 'labels',
            "value": {
                "start": ent['start'],
                "end": ent['end'],
                "text": ent['text'],
                "labels": labels
            },
            "score": round(confidence, 4)
        }

    regions = [as_region(ent) for ent in prediction]

    # The overall score is the lowest entity score (for active learning use);
    # 2.0 is used when no entity was found.
    if regions:
        overall = min([region['score'] for region in regions])
    else:
        overall = 2.0

    return [
        PredictionValue(
            result=regions,
            score=overall,
            model_version=self.get('model_version')
        )
    ]
|
| 84 |
+
|
| 85 |
+
def convert_char_to_token_span(self, text: List, start: int, end: int):
    """
    A helper function to convert character spans to token spans.

    Character offsets are computed as if the tokens were joined by single spaces.

    :param text: a list of the tokenized text
    :param start: the character offset where the span starts
    :param end: the character offset where the span ends
    :return: ``(start_token, end_token)``: the index of the first token whose offset is
        at or after ``start`` (``None`` if there is none) and the index of the first
        token whose offset is at or after ``end`` (``len(text)`` if there is none)
    """
    start_token = None
    end_token = None
    total_char = 0
    for i, word in enumerate(text):
        # Compare against None explicitly: token index 0 is a valid result and must
        # not be treated as "not found yet".
        if start_token is None and total_char >= start:
            start_token = i
        if end_token is None and total_char >= end:
            end_token = i
        if start_token is not None and end_token is not None:
            break
        # +1 accounts for the space separating this token from the next one
        total_char += (len(word) + 1)
    if end_token is None:
        end_token = len(text)
    return start_token, end_token
|
| 105 |
+
|
| 106 |
+
def predict(self, tasks: List[Dict], context: Optional[Dict] = None, **kwargs) -> ModelResponse:
    """ inference logic
        :param tasks: [Label Studio tasks in JSON format](https://labelstud.io/guide/task_format.html)
        :param context: [Label Studio context in JSON format](https://labelstud.io/guide/ml_create#Implement-prediction-logic)
        :return model_response
            ModelResponse(predictions=predictions) with
            predictions: [Predictions array in JSON format](https://labelstud.io/guide/export.html#Label-Studio-JSON-format-of-annotated-tasks)
    """
    # Log through the module logger instead of printing, so that full task payloads
    # are only emitted when debug logging is enabled.
    logger.debug(
        "Run prediction on %s\n"
        "Received context: %s\n"
        "Project ID: %s\n"
        "Label config: %s\n"
        "Parsed JSON Label config: %s\n"
        "Extra params: %s",
        tasks, context, self.project_id, self.label_config,
        self.parsed_label_config, self.extra_params,
    )

    # TODO: this may result in single-time timeout for large models - consider adjusting the timeout on Label Studio side
    self.lazy_init()
    # make predictions with currently set model
    from_name, to_name, value = self.label_interface.get_first_tag_occurence('Labels', 'Text')

    # get labels from the labeling configuration
    labels = sorted(self.label_interface.get_tag(from_name).labels)

    # Collect every text up front, so a task missing the text field fails before any inference runs
    texts = [task['data'][value] for task in tasks]
    predictions = []
    for text in texts:
        entities = self.model.predict_entities(text, labels, threshold=self.threshold)
        pred = self.convert_to_ls_annotation(entities, from_name, to_name)
        predictions.extend(pred)

    return ModelResponse(predictions=predictions)
|
| 138 |
+
|
| 139 |
+
def process_training_data(self, task):
    """
    Extract the training information from one exported Label Studio task.

    :param task: the task as output by Label Studio; its data must contain a 'tokens' list
    :return: ``(tokens, ner)`` where ``ner`` is a list of ``[start_token, end_token, label]``
        entries, one per annotated region, with the character offsets converted to token indices
    """
    # The token list comes from the original data sample that was uploaded
    tokens = task['data']['tokens']

    # Walk every region of every annotation, in order
    regions = (
        result['value']
        for annotation in task['annotations']
        for result in annotation['result']
    )

    spans = []
    for region in regions:
        char_start = region['start']
        char_end = region['end']
        first_token, last_token = self.convert_char_to_token_span(tokens, char_start, char_end)
        # Only the first label of a region is used
        spans.append([first_token, last_token, region['labels'][0]])

    return tokens, spans
|
| 157 |
+
|
| 158 |
+
def train(self, model, training_args, train_data, eval_data=None):
    """
    Retrain the GLiNER model and save the result. Code adapted from the GLiNER finetuning notebook.

    :param model: the model to train
    :param training_args: the training arguments object; its ``use_cpu`` flag selects the device
    :param train_data: the training data, as a list of dictionaries
    :param eval_data: the eval data
    """
    # TODO: this may result in single-time timeout for large models - consider adjusting the timeout on Label Studio side
    self.lazy_init()
    logger.info("Training Model")
    # Move the model to the device selected by the training arguments
    if training_args.use_cpu == True:
        model = model.to('cpu')
    else:
        # NOTE(review): assumes a CUDA device is available whenever use_cpu is not set - confirm
        model = model.to("cuda")

    data_collator = DataCollator(model.config, data_processor=model.data_processor, prepare_labels=True)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_data,
        eval_dataset=eval_data,
        tokenizer=model.data_processor.transformer_tokenizer,
        data_collator=data_collator,
    )

    trainer.train()

    # Save model to the same path that lazy_init tries to load the fine-tuned model from
    ckpt = str(pathlib.Path(self.MODEL_DIR, self.finetuned_model_path))
    logger.info(f"Model Trained, saving to {ckpt} ")
    trainer.save_model(ckpt)
|
| 191 |
+
|
| 192 |
+
|
| 193 |
+
def fit(self, event, data, **kwargs):
    """
    Fine-tune the model on the project's labeled tasks when training is explicitly requested.

    Only the "START_TRAINING" event triggers training; any other event is logged and ignored.
    The labeled tasks are downloaded from Label Studio. The first 10 of them are held out as
    evaluation samples and the rest are used for training. When no task remains for
    training, a warning is logged and nothing is trained.

    It is not recommended to perform long-running operations here, as it will block the main thread
    Instead, consider running a separate process or a thread (like RQ worker) to perform the training

    :param event: the event name received from Label Studio
    :param data: the payload received from the event (check [Webhook event reference](https://labelstud.io/guide/webhook_reference.html)); not used by this method
    """
    self.lazy_init()
    # we only train the model if the "start training" button is pressed from settings.
    if event != "START_TRAINING":
        logger.info("Model training not triggered")
        return

    logger.info("Fitting model")

    # download annotated tasks from Label Studio
    ls = label_studio_sdk.Client(self.LABEL_STUDIO_HOST, self.LABEL_STUDIO_API_KEY)
    project = ls.get_project(id=self.project_id)
    tasks = project.get_labeled_tasks()

    logger.info(f"Downloaded {len(tasks)} labeled tasks from Label Studio")

    training_data = []
    for task in tasks:
        tokens, ner = self.process_training_data(task)
        training_data.append({"tokenized_text": tokens, "ner": ner})

    from_name, to_name, value = self.label_interface.get_first_tag_occurence('Labels', 'Text')
    eval_data = {
        "entity_types": sorted(self.label_interface.get_tag(from_name).labels),
        "samples": training_data[:10]
    }

    training_data = training_data[10:]
    logger.debug(training_data)

    if not training_data:
        # Without this guard the epoch computation below divides by zero.
        logger.warning(
            "Not enough labeled tasks to train: the first 10 are reserved for evaluation "
            "and none remain for training"
        )
        return

    # Define the hyperparameters in a config variable
    # This comes from the pretraining example in the GLiNER repo
    num_steps = 10
    batch_size = 1
    data_size = len(training_data)
    num_batches = data_size // batch_size
    # Run enough epochs to reach roughly num_steps optimisation steps, at least one
    num_epochs = max(1, num_steps // num_batches)

    training_args = TrainingArguments(
        output_dir="models/training_output",

        learning_rate=5e-6,
        weight_decay=0.01,
        others_lr=1e-5,
        others_weight_decay=0.01,
        lr_scheduler_type="linear",  # cosine
        warmup_ratio=0.1,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        focal_loss_alpha=0.75,
        focal_loss_gamma=2,
        num_train_epochs=num_epochs,
        evaluation_strategy="steps",
        save_steps=100,
        save_total_limit=10,
        dataloader_num_workers=0,
        use_cpu=True,
        report_to="none",
    )

    self.train(self.model, training_args, training_data, eval_data)
|
requirements-base.txt
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
gunicorn==23.0.0
|
| 2 |
+
label-studio-ml @ git+https://github.com/HumanSignal/label-studio-ml-backend.git
|
requirements-test.txt
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
pytest
|
| 2 |
+
pytest-cov
|
requirements.txt
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
gliner==0.2.16
|
| 2 |
+
torch==2.7.1
|
| 3 |
+
accelerate>=0.26.0
|
| 4 |
+
transformers==4.38.2
|
| 5 |
+
huggingface-hub==0.21.4
|
test_api.py
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
This file contains tests for the API of your model. You can run these tests by installing test requirements:
|
| 3 |
+
|
| 4 |
+
```bash
|
| 5 |
+
pip install -r requirements-test.txt
|
| 6 |
+
```
|
| 7 |
+
Then execute `pytest` in the directory of this file.
|
| 8 |
+
|
| 9 |
+
- Change `NewModel` to the name of the class in your model.py file.
|
| 10 |
+
- Change the `request` and `expected_response` variables to match the input and output of your model.
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
import pytest
|
| 14 |
+
import json
|
| 15 |
+
from model import GLiNERModel
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
@pytest.fixture
|
| 19 |
+
def client():
|
| 20 |
+
from _wsgi import init_app
|
| 21 |
+
app = init_app(model_class=GLiNERModel)
|
| 22 |
+
app.config['TESTING'] = True
|
| 23 |
+
with app.test_client() as client:
|
| 24 |
+
yield client
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def test_predict(client):
|
| 28 |
+
request = {
|
| 29 |
+
'tasks': [{'id': 6,
|
| 30 |
+
'data': {'id': '5316', 'sample_id': '83dd3f62-4dd5-45eb-8626-ee8539963194',
|
| 31 |
+
'tokens': ['atomoxetine', '[', 'oral', 'suspension', ']', 'norepinephrine', 'reuptake',
|
| 32 |
+
'inhibitor'],
|
| 33 |
+
'ner_tags': ['B-Medication/Vaccine', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],
|
| 34 |
+
'ner_tags_index': [63, 0, 0, 0, 0, 0, 0, 0],
|
| 35 |
+
'text': 'atomoxetine [ oral suspension ] norepinephrine reuptake inhibitor'},
|
| 36 |
+
'meta': {},
|
| 37 |
+
'created_at': '2024-04-13T19:22:37.153686Z',
|
| 38 |
+
'updated_at': '2024-05-03T00:03:22.356871Z',
|
| 39 |
+
'is_labeled': False,
|
| 40 |
+
'overlap': 1,
|
| 41 |
+
'inner_id': 6,
|
| 42 |
+
'total_annotations': 1,
|
| 43 |
+
'cancelled_annotations': 0,
|
| 44 |
+
'total_predictions': 0,
|
| 45 |
+
'comment_count': 0,
|
| 46 |
+
'unresolved_comment_count': 0,
|
| 47 |
+
'last_comment_updated_at': None,
|
| 48 |
+
'project': 2,
|
| 49 |
+
'updated_by': 1,
|
| 50 |
+
'file_upload': None,
|
| 51 |
+
'comment_authors': [],
|
| 52 |
+
'predictions': [],
|
| 53 |
+
}],
|
| 54 |
+
# Your labeling configuration here
|
| 55 |
+
'label_config': '<View> \\n <Labels name="label" toName="text">\\n<Label value="Medication/Vaccine" background="red"/>\\n<Label value="MedicalProcedure" background="blue"/>\\n<Label value="AnatomicalStructure" background="orange"/>\\n<Label value="Symptom" background="green"/>\\n<Label value="Disease" background="purple"/>\\n</Labels>\\n<Text name="text" value="$text"/>\\n</View>'
|
| 56 |
+
}
|
| 57 |
+
|
| 58 |
+
expected_response = {"results": [{"model_version": "GLiNERModel-v0.0.1", "result": [
|
| 59 |
+
{"from_name": "label", "score": 0.922, "to_name": "text", "type": "labels",
|
| 60 |
+
"value": {"end": 11, "labels": ["Medication/Vaccine"], "start": 0, "text": "atomoxetine"}},
|
| 61 |
+
{"from_name": "label", "score": 0.7053, "to_name": "text", "type": "labels",
|
| 62 |
+
"value": {"end": 65, "labels": ["Medication/Vaccine"], "start": 32,
|
| 63 |
+
"text": "norepinephrine reuptake inhibitor"}}], "score": 0.7053}]}
|
| 64 |
+
|
| 65 |
+
response = client.post('/predict', data=json.dumps(request), content_type='application/json')
|
| 66 |
+
assert response.status_code == 200
|
| 67 |
+
response = json.loads(response.data)
|
| 68 |
+
assert expected_response == response
|