Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,103 +1,78 @@
|
|
| 1 |
from typing import Dict, Union
|
| 2 |
from gliner import GLiNER
|
| 3 |
import gradio as gr
|
|
|
|
| 4 |
|
| 5 |
-
model = GLiNER.from_pretrained(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
|
| 7 |
examples = [
|
| 8 |
[
|
| 9 |
-
"
|
| 10 |
-
"
|
| 11 |
0.5,
|
| 12 |
False,
|
| 13 |
],
|
| 14 |
[
|
| 15 |
-
"
|
| 16 |
-
"
|
| 17 |
0.5,
|
| 18 |
False,
|
| 19 |
],
|
| 20 |
[
|
| 21 |
-
"
|
| 22 |
-
"
|
| 23 |
0.5,
|
| 24 |
False,
|
| 25 |
],
|
| 26 |
[
|
| 27 |
-
"
|
| 28 |
-
"
|
| 29 |
0.5,
|
| 30 |
False,
|
| 31 |
],
|
| 32 |
[
|
| 33 |
-
"
|
| 34 |
-
"
|
| 35 |
0.5,
|
| 36 |
False,
|
| 37 |
],
|
| 38 |
[
|
| 39 |
-
"
|
| 40 |
-
"
|
| 41 |
0.5,
|
| 42 |
False,
|
| 43 |
],
|
| 44 |
[
|
| 45 |
-
"
|
| 46 |
-
"
|
| 47 |
0.5,
|
| 48 |
False,
|
| 49 |
],
|
| 50 |
[
|
| 51 |
-
"
|
| 52 |
-
"
|
| 53 |
0.5,
|
| 54 |
False,
|
| 55 |
],
|
| 56 |
[
|
| 57 |
-
"
|
| 58 |
-
"
|
| 59 |
0.5,
|
| 60 |
False,
|
| 61 |
],
|
| 62 |
[
|
| 63 |
-
"
|
| 64 |
-
"
|
| 65 |
0.5,
|
| 66 |
False,
|
| 67 |
],
|
| 68 |
-
[
|
| 69 |
-
"Elisa Müller, eine Künstlerin aus Berlin, hat ihre neueste Skulptur im öffentlichen Park am Alexanderplatz ausgestellt. Ihre Künstlernummer lautet DE-112233445.",
|
| 70 |
-
"person, profession, location, artist ID number",
|
| 71 |
-
0.5,
|
| 72 |
-
False,
|
| 73 |
-
],
|
| 74 |
-
[
|
| 75 |
-
"Federico García, un jugador de fútbol de Sevilla, ha firmado un contrato de tres años con el club Real Betis. Su número de licencia deportiva es ES-9876543210.",
|
| 76 |
-
"person, profession, organization, sports license number",
|
| 77 |
-
0.5,
|
| 78 |
-
False,
|
| 79 |
-
],
|
| 80 |
-
[
|
| 81 |
-
"Sarah White, a London-based actress, will be performing in 'Hamlet' at the Globe Theatre located at 21 New Globe Walk. Her Equity membership number is UK-1234567.",
|
| 82 |
-
"person, profession, location, address, membership number",
|
| 83 |
-
0.5,
|
| 84 |
-
False,
|
| 85 |
-
],
|
| 86 |
-
[
|
| 87 |
-
"Ricardo Mello, engenheiro civil, trabalha na construção da nova barragem no Rio Douro, Portugal. Seu número de registro profissional é PT-987654321.",
|
| 88 |
-
"person, profession, project location, professional registration number",
|
| 89 |
-
0.5,
|
| 90 |
-
False,
|
| 91 |
-
],
|
| 92 |
-
[
|
| 93 |
-
"Giuseppe Conti, un cliente di Milano, ha fatto un acquisto presso il negozio La Rinascente situato in Piazza Duomo. Il numero della sua carta di credito è IT-4567891234567891.",
|
| 94 |
-
"person, location, address, credit card number",
|
| 95 |
-
0.5,
|
| 96 |
-
False,
|
| 97 |
-
]
|
| 98 |
]
|
| 99 |
|
| 100 |
-
|
| 101 |
def ner(
|
| 102 |
text, labels: str, threshold: float, nested_ner: bool
|
| 103 |
) -> Dict[str, Union[str, int, float]]:
|
|
@@ -122,19 +97,11 @@ def ner(
|
|
| 122 |
with gr.Blocks(title="GLiNER-M-v2.1") as demo:
|
| 123 |
gr.Markdown(
|
| 124 |
"""
|
| 125 |
-
#
|
| 126 |
|
| 127 |
GLiNER is a Named Entity Recognition (NER) model capable of identifying any entity type using a bidirectional transformer encoder (BERT-like). It provides a practical alternative to traditional NER models, which are limited to predefined entities, and Large Language Models (LLMs) that, despite their flexibility, are costly and large for resource-constrained scenarios.
|
| 128 |
-
|
| 129 |
-
The model has been trained by fine-tuning urchade/gliner_multi-v2.1 on the urchade/synthetic-pii-ner-mistral-v1 dataset.
|
| 130 |
-
|
| 131 |
-
## Links
|
| 132 |
-
|
| 133 |
-
* Model: https://huggingface.co/urchade/gliner_multi_pii-v1
|
| 134 |
-
* All GLiNER models: https://huggingface.co/models?library=gliner
|
| 135 |
-
* Paper: https://arxiv.org/abs/2311.08526
|
| 136 |
-
* Repository: https://github.com/urchade/GLiNER
|
| 137 |
"""
|
|
|
|
| 138 |
)
|
| 139 |
with gr.Accordion("How to run this model locally", open=False):
|
| 140 |
gr.Markdown(
|
|
@@ -150,31 +117,33 @@ with gr.Blocks(title="GLiNER-M-v2.1") as demo:
|
|
| 150 |
"""
|
| 151 |
)
|
| 152 |
gr.Code(
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 161 |
|
| 162 |
-
labels = ["work", "booking number", "personally identifiable information", "driver licence", "person", "book", "full address", "company", "actor", "character", "email", "passport number", "Social Security Number", "phone number"]
|
| 163 |
entities = model.predict_entities(text, labels)
|
| 164 |
|
| 165 |
for entity in entities:
|
| 166 |
-
print(entity["text"], "=>", entity["label"])
|
| 167 |
-
''',
|
| 168 |
-
language="python",
|
| 169 |
-
)
|
| 170 |
-
gr.Code(
|
| 171 |
-
"""
|
| 172 |
-
Harilala Rasoanaivo => person
|
| 173 |
-
Rasoanaivo Enterprises => company
|
| 174 |
-
Lot II M 92 Antohomadinika => full address
|
| 175 |
-
+261 32 22 345 67 => phone number
|
| 176 |
-
harilala.rasoanaivo@telma.mg => email
|
| 177 |
-
501-02-1234 => Social Security Number
|
| 178 |
"""
|
| 179 |
)
|
| 180 |
|
|
|
|
| 1 |
from typing import Dict, Union
|
| 2 |
from gliner import GLiNER
|
| 3 |
import gradio as gr
|
| 4 |
+
import os
|
| 5 |
|
| 6 |
+
model = GLiNER.from_pretrained(
|
| 7 |
+
"gravitee-io/gliner-pii-detection",
|
| 8 |
+
token=os.getenv("HUGGINGFACE_TOKEN"),
|
| 9 |
+
load_onnx_model=True,
|
| 10 |
+
load_tokenizer=True, onnx_model_file="model.onnx"
|
| 11 |
+
)
|
| 12 |
|
| 13 |
examples = [
|
| 14 |
[
|
| 15 |
+
"Jana Kowalczyk's driver license number is PL-DL-55443322 and she resides at 78 Ulica Nowowiejska, Wrocław. Her contact email is jana.k@example.com.",
|
| 16 |
+
"name, driver_license_number, street_address, email",
|
| 17 |
0.5,
|
| 18 |
False,
|
| 19 |
],
|
| 20 |
[
|
| 21 |
+
"Nguyen Van Long from Hanoi logs in from the IP 10.0.0.5 and uses the API key: 12ab34cd56ef78gh90ij. His company is VietNet Global.",
|
| 22 |
+
"name, street_address, ipv4, api_key, company",
|
| 23 |
0.5,
|
| 24 |
False,
|
| 25 |
],
|
| 26 |
[
|
| 27 |
+
"Sarah Johnson made a transaction using the credit card 379354508162306 and the CVV 834. Her IBAN is GB29 NWBK 6016 1331 9268 19.",
|
| 28 |
+
"name, credit_card_number, credit_card_security_code, iban",
|
| 29 |
0.5,
|
| 30 |
False,
|
| 31 |
],
|
| 32 |
[
|
| 33 |
+
"Employee Thomas Becker has the ID DE-EMP-44991 and joined DataFlux GmbH on 2021-12-01. His internal email is t.becker@dataflux.de.",
|
| 34 |
+
"name, employee_id, company, date, email",
|
| 35 |
0.5,
|
| 36 |
False,
|
| 37 |
],
|
| 38 |
[
|
| 39 |
+
"Laura Rossi lives at Via Roma 101, Milano. Her social security number is IT-9988776655 and she was born on 1982-07-14.",
|
| 40 |
+
"name, street_address, ssn, date_of_birth",
|
| 41 |
0.5,
|
| 42 |
False,
|
| 43 |
],
|
| 44 |
[
|
| 45 |
+
"Omar El-Zein uses the SWIFT code BOFAUS3N and his bank routing number is 026009593. He lives near 12 Al-Azhar Street, Cairo.",
|
| 46 |
+
"name, swift_bic_code, bank_routing_number, street_address",
|
| 47 |
0.5,
|
| 48 |
False,
|
| 49 |
],
|
| 50 |
[
|
| 51 |
+
"Chen Wei's employee badge shows ID EMP-CN-8899. He signed the contract on 2023-03-20 at 10:15 AM using the password Dragon@123.",
|
| 52 |
+
"name, employee_id, date, time, password",
|
| 53 |
0.5,
|
| 54 |
False,
|
| 55 |
],
|
| 56 |
[
|
| 57 |
+
"Fatoumata Diarra, born 1994-04-04, lives at 45 Avenue de la Liberté, Bamako. Her BBAN is ML2930012345678901234567890.",
|
| 58 |
+
"name, date_of_birth, street_address, bban",
|
| 59 |
0.5,
|
| 60 |
False,
|
| 61 |
],
|
| 62 |
[
|
| 63 |
+
"Daniel Evans has a passport number K01234567 and a permanent address at 500 Pine Street, Seattle. His contact number is +1-206-555-0199.",
|
| 64 |
+
"name, passport_number, street_address, phone_number",
|
| 65 |
0.5,
|
| 66 |
False,
|
| 67 |
],
|
| 68 |
[
|
| 69 |
+
"Alejandro Torres created his customer account on 2024-08-30 using email ale.torres@correo.mx and ID CUST-MX-1122.",
|
| 70 |
+
"name, date, email, customer_id",
|
| 71 |
0.5,
|
| 72 |
False,
|
| 73 |
],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
]
|
| 75 |
|
|
|
|
| 76 |
def ner(
|
| 77 |
text, labels: str, threshold: float, nested_ner: bool
|
| 78 |
) -> Dict[str, Union[str, int, float]]:
|
|
|
|
| 97 |
with gr.Blocks(title="GLiNER-M-v2.1") as demo:
|
| 98 |
gr.Markdown(
|
| 99 |
"""
|
| 100 |
+
# Gravitee PII (Personally Identifiable Information) extraction
|
| 101 |
|
| 102 |
GLiNER is a Named Entity Recognition (NER) model capable of identifying any entity type using a bidirectional transformer encoder (BERT-like). It provides a practical alternative to traditional NER models, which are limited to predefined entities, and Large Language Models (LLMs) that, despite their flexibility, are costly and large for resource-constrained scenarios.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 103 |
"""
|
| 104 |
+
|
| 105 |
)
|
| 106 |
with gr.Accordion("How to run this model locally", open=False):
|
| 107 |
gr.Markdown(
|
|
|
|
| 117 |
"""
|
| 118 |
)
|
| 119 |
gr.Code(
|
| 120 |
+
"""
|
| 121 |
+
model = GLiNER.from_pretrained(
|
| 122 |
+
"gravitee-io/gliner-pii-detection",
|
| 123 |
+
load_onnx_model=True,
|
| 124 |
+
load_tokenizer=True, onnx_model_file="model.onnx"
|
| 125 |
+
)
|
| 126 |
+
|
| 127 |
+
text = '''
|
| 128 |
+
Hey, just a quick update. I talked to David yesterday.
|
| 129 |
+
He sent over the files from his private email (david.doe@example.com), and we should be careful with his SSN: 123-45-6789.
|
| 130 |
+
Also, please don't push the GitHub repo until we remove the API key: ghp_abcdEfgh1234567890.
|
| 131 |
+
He mentioned his new address is 123 Maple Street in New York.
|
| 132 |
+
His PC address is 192.168.1.100.
|
| 133 |
+
'''
|
| 134 |
+
|
| 135 |
+
labels = ["name",
|
| 136 |
+
"email",
|
| 137 |
+
"ssn",
|
| 138 |
+
"api_key",
|
| 139 |
+
"street_address",
|
| 140 |
+
"date",
|
| 141 |
+
"ipv4"]
|
| 142 |
|
|
|
|
| 143 |
entities = model.predict_entities(text, labels)
|
| 144 |
|
| 145 |
for entity in entities:
|
| 146 |
+
print(entity["text"], "=>", entity["label"], "=>", entity["score"])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 147 |
"""
|
| 148 |
)
|
| 149 |
|