ryleytraverse urchade committed on
Commit
34f1545
·
0 Parent(s):

Duplicate from fastino/gliner2-base-v1

Browse files

Co-authored-by: Urchade Zaratiana <urchade@users.noreply.huggingface.co>

.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,161 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: gliner2
3
+ ---
4
+ ## Model Description
5
+
6
+ GLiNER2 extends the original GLiNER architecture to support multi-task information extraction with a schema-driven interface. This base model provides efficient CPU-based inference while maintaining high accuracy across diverse extraction tasks.
7
+
8
+ **Key Features:**
9
+ - Multi-task capability: NER, classification, and structured extraction
10
+ - Schema-driven interface with field types and constraints
11
+ - CPU-first design for fast inference without GPU requirements
12
+ - 100% local processing with zero external dependencies
13
+
14
+ ## Installation
15
+
16
+ ```bash
17
+ pip install gliner2
18
+ ```
19
+
20
+ ## Usage
21
+
22
+ ### Entity Extraction
23
+
24
+ ```python
25
+ from gliner2 import GLiNER2
26
+
27
+ # Load the model
28
+ extractor = GLiNER2.from_pretrained("fastino/gliner2-base-v1")
29
+
30
+ # Extract entities
31
+ text = "Apple CEO Tim Cook announced iPhone 15 in Cupertino yesterday."
32
+ result = extractor.extract_entities(text, ["company", "person", "product", "location"])
33
+
34
+ print(result)
35
+ # Output: {'entities': {'company': ['Apple'], 'person': ['Tim Cook'], 'product': ['iPhone 15'], 'location': ['Cupertino']}}
36
+ ```
37
+
38
+ ### Text Classification
39
+
40
+ ```python
41
+ # Single-label classification
42
+ result = extractor.classify_text(
43
+ "This laptop has amazing performance but terrible battery life!",
44
+ {"sentiment": ["positive", "negative", "neutral"]}
45
+ )
46
+ print(result)
47
+ # Output: {'sentiment': 'negative'}
48
+
49
+ # Multi-label classification
50
+ result = extractor.classify_text(
51
+ "Great camera quality, decent performance, but poor battery life.",
52
+ {
53
+ "aspects": {
54
+ "labels": ["camera", "performance", "battery", "display", "price"],
55
+ "multi_label": True,
56
+ "cls_threshold": 0.4
57
+ }
58
+ }
59
+ )
60
+ print(result)
61
+ # Output: {'aspects': ['camera', 'performance', 'battery']}
62
+ ```
63
+
64
+ ### Structured Data Extraction
65
+
66
+ ```python
67
+ text = "iPhone 15 Pro Max with 256GB storage, A17 Pro chip, priced at $1199."
68
+
69
+ result = extractor.extract_json(
70
+ text,
71
+ {
72
+ "product": [
73
+ "name::str::Full product name and model",
74
+ "storage::str::Storage capacity",
75
+ "processor::str::Chip or processor information",
76
+ "price::str::Product price with currency"
77
+ ]
78
+ }
79
+ )
80
+
81
+ print(result)
82
+ # Output: {
83
+ # 'product': [{
84
+ # 'name': 'iPhone 15 Pro Max',
85
+ # 'storage': '256GB',
86
+ # 'processor': 'A17 Pro chip',
87
+ # 'price': '$1199'
88
+ # }]
89
+ # }
90
+ ```
91
+
92
+ ### Multi-Task Schema Composition
93
+
94
+ ```python
95
+ # Combine all extraction types
96
+ schema = (extractor.create_schema()
97
+ .entities({
98
+ "person": "Names of people or individuals",
99
+ "company": "Organization or business names",
100
+ "product": "Products or services mentioned"
101
+ })
102
+ .classification("sentiment", ["positive", "negative", "neutral"])
103
+ .structure("product_info")
104
+ .field("name", dtype="str")
105
+ .field("price", dtype="str")
106
+ .field("features", dtype="list")
107
+ )
108
+
109
+ text = "Apple CEO Tim Cook unveiled the iPhone 15 Pro for $999."
110
+ results = extractor.extract(text, schema)
111
+
112
+ print(results)
113
+ # Output: {
114
+ # 'entities': {'person': ['Tim Cook'], 'company': ['Apple'], 'product': ['iPhone 15 Pro']},
115
+ # 'sentiment': 'positive',
116
+ # 'product_info': [{'name': 'iPhone 15 Pro', 'price': '$999', 'features': [...]}]
117
+ # }
118
+ ```
119
+
120
+ ## Model Details
121
+
122
+ - **Model Type:** Bidirectional Transformer Encoder (DeBERTa-v3-base backbone)
123
+ - **Parameters:** 205M
124
+ - **Input:** Text sequences
125
+ - **Output:** Entities, classifications, and structured data
126
+ - **Architecture:** Based on GLiNER with multi-task extensions
127
+ - **Training Data:** Multi-domain datasets for NER, classification, and structured extraction
128
+
129
+ ## Performance
130
+
131
+ This model is optimized for:
132
+ - Fast CPU inference (no GPU required)
133
+ - Low latency applications
134
+ - Resource-constrained environments
135
+ - Multi-task extraction scenarios
136
+
137
+ ## Citation
138
+
139
+ If you use this model in your research, please cite:
140
+
141
+ ```bibtex
142
+ @misc{zaratiana2025gliner2efficientmultitaskinformation,
143
+ title={GLiNER2: An Efficient Multi-Task Information Extraction System with Schema-Driven Interface},
144
+ author={Urchade Zaratiana and Gil Pasternak and Oliver Boyd and George Hurn-Maloney and Ash Lewis},
145
+ year={2025},
146
+ eprint={2507.18546},
147
+ archivePrefix={arXiv},
148
+ primaryClass={cs.CL},
149
+ url={https://arxiv.org/abs/2507.18546},
150
+ }
151
+ ```
152
+
153
+ ## License
154
+
155
+ This project is licensed under the Apache License 2.0.
156
+
157
+ ## Links
158
+
159
+ - **Repository:** https://github.com/fastino-ai/GLiNER2
160
+ - **Paper:** https://arxiv.org/abs/2507.18546
161
+ - **Organization:** [Fastino AI](https://fastino.ai)
added_tokens.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "[C]": 128004,
3
+ "[DESCRIPTION]": 128010,
4
+ "[EXAMPLE]": 128008,
5
+ "[E]": 128005,
6
+ "[L]": 128007,
7
+ "[MASK]": 128000,
8
+ "[OUTPUT]": 128009,
9
+ "[P]": 128003,
10
+ "[R]": 128006,
11
+ "[SEP_STRUCT]": 128001,
12
+ "[SEP_TEXT]": 128002
13
+ }
config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_attn_implementation_autoset": true,
3
+ "counting_layer": "count_lstm_v2",
4
+ "max_width": 8,
5
+ "model_name": "microsoft/deberta-v3-base",
6
+ "model_type": "extractor",
7
+ "token_pooling": "first",
8
+ "transformers_version": "4.51.0"
9
+ }
encoder_config/config.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_attn_implementation_autoset": true,
3
+ "attention_probs_dropout_prob": 0.1,
4
+ "hidden_act": "gelu",
5
+ "hidden_dropout_prob": 0.1,
6
+ "hidden_size": 768,
7
+ "initializer_range": 0.02,
8
+ "intermediate_size": 3072,
9
+ "layer_norm_eps": 1e-07,
10
+ "legacy": true,
11
+ "max_position_embeddings": 512,
12
+ "max_relative_positions": -1,
13
+ "model_type": "deberta-v2",
14
+ "norm_rel_ebd": "layer_norm",
15
+ "num_attention_heads": 12,
16
+ "num_hidden_layers": 12,
17
+ "pad_token_id": 0,
18
+ "pooler_dropout": 0,
19
+ "pooler_hidden_act": "gelu",
20
+ "pooler_hidden_size": 768,
21
+ "pos_att_type": [
22
+ "p2c",
23
+ "c2p"
24
+ ],
25
+ "position_biased_input": false,
26
+ "position_buckets": 256,
27
+ "relative_attention": true,
28
+ "share_att_key": true,
29
+ "torch_dtype": "float32",
30
+ "transformers_version": "4.51.0",
31
+ "type_vocab_size": 0,
32
+ "vocab_size": 128011
33
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:845fc4bd93c525b86124c58ab4f56c9eacf8587953086b14c501fab25957c007
3
+ size 833938108
special_tokens_map.json ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ {
4
+ "content": "[SEP_STRUCT]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ },
10
+ {
11
+ "content": "[SEP_TEXT]",
12
+ "lstrip": false,
13
+ "normalized": false,
14
+ "rstrip": false,
15
+ "single_word": false
16
+ },
17
+ {
18
+ "content": "[P]",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ {
25
+ "content": "[C]",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ },
31
+ {
32
+ "content": "[E]",
33
+ "lstrip": false,
34
+ "normalized": false,
35
+ "rstrip": false,
36
+ "single_word": false
37
+ },
38
+ {
39
+ "content": "[R]",
40
+ "lstrip": false,
41
+ "normalized": false,
42
+ "rstrip": false,
43
+ "single_word": false
44
+ },
45
+ {
46
+ "content": "[L]",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false
51
+ },
52
+ {
53
+ "content": "[EXAMPLE]",
54
+ "lstrip": false,
55
+ "normalized": false,
56
+ "rstrip": false,
57
+ "single_word": false
58
+ },
59
+ {
60
+ "content": "[OUTPUT]",
61
+ "lstrip": false,
62
+ "normalized": false,
63
+ "rstrip": false,
64
+ "single_word": false
65
+ },
66
+ {
67
+ "content": "[DESCRIPTION]",
68
+ "lstrip": false,
69
+ "normalized": false,
70
+ "rstrip": false,
71
+ "single_word": false
72
+ }
73
+ ],
74
+ "bos_token": {
75
+ "content": "[CLS]",
76
+ "lstrip": false,
77
+ "normalized": false,
78
+ "rstrip": false,
79
+ "single_word": false
80
+ },
81
+ "cls_token": {
82
+ "content": "[CLS]",
83
+ "lstrip": false,
84
+ "normalized": false,
85
+ "rstrip": false,
86
+ "single_word": false
87
+ },
88
+ "eos_token": {
89
+ "content": "[SEP]",
90
+ "lstrip": false,
91
+ "normalized": false,
92
+ "rstrip": false,
93
+ "single_word": false
94
+ },
95
+ "mask_token": {
96
+ "content": "[MASK]",
97
+ "lstrip": false,
98
+ "normalized": false,
99
+ "rstrip": false,
100
+ "single_word": false
101
+ },
102
+ "pad_token": {
103
+ "content": "[PAD]",
104
+ "lstrip": false,
105
+ "normalized": false,
106
+ "rstrip": false,
107
+ "single_word": false
108
+ },
109
+ "sep_token": {
110
+ "content": "[SEP]",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false
115
+ },
116
+ "unk_token": {
117
+ "content": "[UNK]",
118
+ "lstrip": false,
119
+ "normalized": true,
120
+ "rstrip": false,
121
+ "single_word": false
122
+ }
123
+ }
spm.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c679fbf93643d19aab7ee10c0b99e460bdbc02fedf34b92b05af343b4af586fd
3
+ size 2464616
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "[CLS]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "[SEP]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "[UNK]",
29
+ "lstrip": false,
30
+ "normalized": true,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "128000": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ },
43
+ "128001": {
44
+ "content": "[SEP_STRUCT]",
45
+ "lstrip": false,
46
+ "normalized": false,
47
+ "rstrip": false,
48
+ "single_word": false,
49
+ "special": true
50
+ },
51
+ "128002": {
52
+ "content": "[SEP_TEXT]",
53
+ "lstrip": false,
54
+ "normalized": false,
55
+ "rstrip": false,
56
+ "single_word": false,
57
+ "special": true
58
+ },
59
+ "128003": {
60
+ "content": "[P]",
61
+ "lstrip": false,
62
+ "normalized": false,
63
+ "rstrip": false,
64
+ "single_word": false,
65
+ "special": true
66
+ },
67
+ "128004": {
68
+ "content": "[C]",
69
+ "lstrip": false,
70
+ "normalized": false,
71
+ "rstrip": false,
72
+ "single_word": false,
73
+ "special": true
74
+ },
75
+ "128005": {
76
+ "content": "[E]",
77
+ "lstrip": false,
78
+ "normalized": false,
79
+ "rstrip": false,
80
+ "single_word": false,
81
+ "special": true
82
+ },
83
+ "128006": {
84
+ "content": "[R]",
85
+ "lstrip": false,
86
+ "normalized": false,
87
+ "rstrip": false,
88
+ "single_word": false,
89
+ "special": true
90
+ },
91
+ "128007": {
92
+ "content": "[L]",
93
+ "lstrip": false,
94
+ "normalized": false,
95
+ "rstrip": false,
96
+ "single_word": false,
97
+ "special": true
98
+ },
99
+ "128008": {
100
+ "content": "[EXAMPLE]",
101
+ "lstrip": false,
102
+ "normalized": false,
103
+ "rstrip": false,
104
+ "single_word": false,
105
+ "special": true
106
+ },
107
+ "128009": {
108
+ "content": "[OUTPUT]",
109
+ "lstrip": false,
110
+ "normalized": false,
111
+ "rstrip": false,
112
+ "single_word": false,
113
+ "special": true
114
+ },
115
+ "128010": {
116
+ "content": "[DESCRIPTION]",
117
+ "lstrip": false,
118
+ "normalized": false,
119
+ "rstrip": false,
120
+ "single_word": false,
121
+ "special": true
122
+ }
123
+ },
124
+ "additional_special_tokens": [
125
+ "[SEP_STRUCT]",
126
+ "[SEP_TEXT]",
127
+ "[P]",
128
+ "[C]",
129
+ "[E]",
130
+ "[R]",
131
+ "[L]",
132
+ "[EXAMPLE]",
133
+ "[OUTPUT]",
134
+ "[DESCRIPTION]"
135
+ ],
136
+ "bos_token": "[CLS]",
137
+ "clean_up_tokenization_spaces": false,
138
+ "cls_token": "[CLS]",
139
+ "do_lower_case": false,
140
+ "eos_token": "[SEP]",
141
+ "extra_special_tokens": {},
142
+ "mask_token": "[MASK]",
143
+ "model_max_length": 1000000000000000019884624838656,
144
+ "pad_token": "[PAD]",
145
+ "sep_token": "[SEP]",
146
+ "sp_model_kwargs": {},
147
+ "split_by_punct": false,
148
+ "tokenizer_class": "DebertaV2TokenizerFast",
149
+ "unk_token": "[UNK]",
150
+ "vocab_type": "spm"
151
+ }