Duplicate from fastino/gliner2-base-v1

Browse files

Co-authored-by: Urchade Zaratiana <urchade@users.noreply.huggingface.co>

Files changed (10) hide show

.gitattributes +35 -0
README.md +161 -0
added_tokens.json +13 -0
config.json +9 -0
encoder_config/config.json +33 -0
model.safetensors +3 -0
special_tokens_map.json +123 -0
spm.model +3 -0
tokenizer.json +0 -0
tokenizer_config.json +151 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1,35 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text

README.md ADDED Viewed

	@@ -0,0 +1,161 @@

+---
+library_name: gliner2
+---
+## Model Description
+GLiNER2 extends the original GLiNER architecture to support multi-task information extraction with a schema-driven interface. This base model provides efficient CPU-based inference while maintaining high accuracy across diverse extraction tasks.
+**Key Features:**
+- Multi-task capability: NER, classification, and structured extraction
+- Schema-driven interface with field types and constraints
+- CPU-first design for fast inference without GPU requirements
+- 100% local processing: inference requires no external API or service calls
+## Installation
+```bash
+pip install gliner2
+```
+## Usage
+### Entity Extraction
+```python
+from gliner2 import GLiNER2
+# Load the model
+extractor = GLiNER2.from_pretrained("fastino/gliner2-base-v1")
+# Extract entities
+text = "Apple CEO Tim Cook announced iPhone 15 in Cupertino yesterday."
+result = extractor.extract_entities(text, ["company", "person", "product", "location"])
+print(result)
+# Output: {'entities': {'company': ['Apple'], 'person': ['Tim Cook'], 'product': ['iPhone 15'], 'location': ['Cupertino']}}
+```
+### Text Classification
+```python
+# Single-label classification
+result = extractor.classify_text(
+    "This laptop has amazing performance but terrible battery life!",
+    {"sentiment": ["positive", "negative", "neutral"]}
+)
+print(result)
+# Output: {'sentiment': 'negative'}
+# Multi-label classification
+result = extractor.classify_text(
+    "Great camera quality, decent performance, but poor battery life.",
+    {
+        "aspects": {
+            "labels": ["camera", "performance", "battery", "display", "price"],
+            "multi_label": True,
+            "cls_threshold": 0.4
+        }
+    }
+)
+print(result)
+# Output: {'aspects': ['camera', 'performance', 'battery']}
+```
+### Structured Data Extraction
+```python
+text = "iPhone 15 Pro Max with 256GB storage, A17 Pro chip, priced at $1199."
+result = extractor.extract_json(
+    text,
+    {
+        "product": [
+            "name::str::Full product name and model",
+            "storage::str::Storage capacity",
+            "processor::str::Chip or processor information",
+            "price::str::Product price with currency"
+        ]
+    }
+)
+print(result)
+# Output: {
+#     'product': [{
+#         'name': 'iPhone 15 Pro Max',
+#         'storage': '256GB',
+#         'processor': 'A17 Pro chip',
+#         'price': '$1199'
+#     }]
+# }
+```
+### Multi-Task Schema Composition
+```python
+# Combine all extraction types
+schema = (extractor.create_schema()
+    .entities({
+        "person": "Names of people or individuals",
+        "company": "Organization or business names",
+        "product": "Products or services mentioned"
+    })
+    .classification("sentiment", ["positive", "negative", "neutral"])
+    .structure("product_info")
+        .field("name", dtype="str")
+        .field("price", dtype="str")
+        .field("features", dtype="list")
+)
+text = "Apple CEO Tim Cook unveiled the iPhone 15 Pro for $999."
+results = extractor.extract(text, schema)
+print(results)
+# Output: {
+#     'entities': {'person': ['Tim Cook'], 'company': ['Apple'], 'product': ['iPhone 15 Pro']},
+#     'sentiment': 'positive',
+#     'product_info': [{'name': 'iPhone 15 Pro', 'price': '$999', 'features': [...]}]
+# }
+```
+## Model Details
+- **Model Type:** Bidirectional Transformer Encoder (DeBERTa-v3-based)
+- **Parameters:** 205M
+- **Input:** Text sequences
+- **Output:** Entities, classifications, and structured data
+- **Architecture:** Based on GLiNER with multi-task extensions
+- **Training Data:** Multi-domain datasets for NER, classification, and structured extraction
+## Performance
+This model is optimized for:
+- Fast CPU inference (no GPU required)
+- Low latency applications
+- Resource-constrained environments
+- Multi-task extraction scenarios
+## Citation
+If you use this model in your research, please cite:
+```bibtex
+@misc{zaratiana2025gliner2efficientmultitaskinformation,
+      title={GLiNER2: An Efficient Multi-Task Information Extraction System with Schema-Driven Interface},
+      author={Urchade Zaratiana and Gil Pasternak and Oliver Boyd and George Hurn-Maloney and Ash Lewis},
+      year={2025},
+      eprint={2507.18546},
+      archivePrefix={arXiv},
+      primaryClass={cs.CL},
+      url={https://arxiv.org/abs/2507.18546},
+}
+```
+## License
+This project is licensed under the Apache License 2.0.
+## Links
+- **Repository:** https://github.com/fastino-ai/GLiNER2
+- **Paper:** https://arxiv.org/abs/2507.18546
+- **Organization:** [Fastino AI](https://fastino.ai)

added_tokens.json ADDED Viewed

	@@ -0,0 +1,13 @@

+{
+  "[C]": 128004,
+  "[DESCRIPTION]": 128010,
+  "[EXAMPLE]": 128008,
+  "[E]": 128005,
+  "[L]": 128007,
+  "[MASK]": 128000,
+  "[OUTPUT]": 128009,
+  "[P]": 128003,
+  "[R]": 128006,
+  "[SEP_STRUCT]": 128001,
+  "[SEP_TEXT]": 128002
+}

config.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+  "_attn_implementation_autoset": true,
+  "counting_layer": "count_lstm_v2",
+  "max_width": 8,
+  "model_name": "microsoft/deberta-v3-base",
+  "model_type": "extractor",
+  "token_pooling": "first",
+  "transformers_version": "4.51.0"
+}

encoder_config/config.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+  "_attn_implementation_autoset": true,
+  "attention_probs_dropout_prob": 0.1,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "layer_norm_eps": 1e-07,
+  "legacy": true,
+  "max_position_embeddings": 512,
+  "max_relative_positions": -1,
+  "model_type": "deberta-v2",
+  "norm_rel_ebd": "layer_norm",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "pad_token_id": 0,
+  "pooler_dropout": 0,
+  "pooler_hidden_act": "gelu",
+  "pooler_hidden_size": 768,
+  "pos_att_type": [
+    "p2c",
+    "c2p"
+  ],
+  "position_biased_input": false,
+  "position_buckets": 256,
+  "relative_attention": true,
+  "share_att_key": true,
+  "torch_dtype": "float32",
+  "transformers_version": "4.51.0",
+  "type_vocab_size": 0,
+  "vocab_size": 128011
+}

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:845fc4bd93c525b86124c58ab4f56c9eacf8587953086b14c501fab25957c007
+size 833938108

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,123 @@

+{
+  "additional_special_tokens": [
+    {
+      "content": "[SEP_STRUCT]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "[SEP_TEXT]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "[P]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "[C]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "[E]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "[R]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "[L]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "[EXAMPLE]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "[OUTPUT]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "[DESCRIPTION]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    }
+  ],
+  "bos_token": {
+    "content": "[CLS]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "cls_token": {
+    "content": "[CLS]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "[SEP]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "mask_token": {
+    "content": "[MASK]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "[PAD]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "sep_token": {
+    "content": "[SEP]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "[UNK]",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  }
+}

spm.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c679fbf93643d19aab7ee10c0b99e460bdbc02fedf34b92b05af343b4af586fd
+size 2464616

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,151 @@

+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "[CLS]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "[SEP]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "[UNK]",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128000": {
+      "content": "[MASK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128001": {
+      "content": "[SEP_STRUCT]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128002": {
+      "content": "[SEP_TEXT]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128003": {
+      "content": "[P]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128004": {
+      "content": "[C]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128005": {
+      "content": "[E]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128006": {
+      "content": "[R]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128007": {
+      "content": "[L]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128008": {
+      "content": "[EXAMPLE]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128009": {
+      "content": "[OUTPUT]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128010": {
+      "content": "[DESCRIPTION]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "[SEP_STRUCT]",
+    "[SEP_TEXT]",
+    "[P]",
+    "[C]",
+    "[E]",
+    "[R]",
+    "[L]",
+    "[EXAMPLE]",
+    "[OUTPUT]",
+    "[DESCRIPTION]"
+  ],
+  "bos_token": "[CLS]",
+  "clean_up_tokenization_spaces": false,
+  "cls_token": "[CLS]",
+  "do_lower_case": false,
+  "eos_token": "[SEP]",
+  "extra_special_tokens": {},
+  "mask_token": "[MASK]",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "sp_model_kwargs": {},
+  "split_by_punct": false,
+  "tokenizer_class": "DebertaV2TokenizerFast",
+  "unk_token": "[UNK]",
+  "vocab_type": "spm"
+}