MikeG27 committed on
Commit
ee1da13
·
verified ·
1 Parent(s): df65a41

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +52 -83
app.py CHANGED
@@ -1,103 +1,78 @@
1
  from typing import Dict, Union
2
  from gliner import GLiNER
3
  import gradio as gr
 
4
 
5
- model = GLiNER.from_pretrained("urchade/gliner_multi_pii-v1")
 
 
 
 
 
6
 
7
  examples = [
8
  [
9
- "Pierre Dubois, résident de Paris, a fondé sa propre entreprise, Le Petit Café, située au 15 Rue de la Paix. Son numéro d'entreprise est FR-987654321-1, et il utilise le compte bancaire 9876543210 pour les transactions.",
10
- "person, organization, address, company registration number, bank account number",
11
  0.5,
12
  False,
13
  ],
14
  [
15
- "Leticia Ramírez, una habitante de Barcelona, tiene una cita médica programada en el Hospital General de Cataluña, situado en 10 Calle de los Ángeles. Su número de la seguridad social es ES-123456789-A y su grupo sanguíneo es AB+.",
16
- "person, location, address, social security number, blood type",
17
  0.5,
18
  False,
19
  ],
20
  [
21
- "John Smith, from London, teaches mathematics at Royal Academy located at 25 King’s Road. His employee ID is UK-987654-321 and he has been working there since 2015.",
22
- "person, profession, organization, address, employee ID number",
23
  0.5,
24
  False,
25
  ],
26
  [
27
- "In Frankfurt, Claudia Weber frequently visits her local bank branch, Deutsche Bank, at 48 Hauptstraße. Her account number is DE-1234567890123456, used primarily for her mortgage payments.",
28
- "person, location, address, bank account number",
29
  0.5,
30
  False,
31
  ],
32
  [
33
- "Marta Rossi, residente a Roma, ha acquistato un appartamento al 123 Via Condotti. Il numero di registrazione della proprietà è IT-654321-2018 e il mutuo è gestito tramite la Banca d'Italia con numero di conto 3216549870.",
34
- "person, address, property registration number, bank account number",
35
  0.5,
36
  False,
37
  ],
38
  [
39
- "Paulo Coelho, um turista do Brasil, fez um seguro de viagem com a empresa Seguros PT antes de sua viagem para Lisboa. O número da apólice é BR-987654321-123 e inclui cobertura médica.",
40
- "person, nationality, company, insurance policy number, coverage",
41
  0.5,
42
  False,
43
  ],
44
  [
45
- "Julia Fischer, eine Kundin aus München, hat bei der BayWa AG, einem großen Anbieter von Baustoffen mit Sitz am 77 Industriestraße, einen Kredit aufgenommen. Die Kreditnummer lautet DE-12345678.",
46
- "person, city, organization, address, loan number",
47
  0.5,
48
  False,
49
  ],
50
  [
51
- "Carlos Sánchez, profesor en la Universidad de Madrid, reside en el 5 Calle de Alcalá. Su número de identificación de profesor es ES-192837465 y tiene un doctorado en filosofía.",
52
- "person, profession, address, teacher ID number, degree",
53
  0.5,
54
  False,
55
  ],
56
  [
57
- "Sophie Dupont, une journaliste française, travaille pour Le Monde, basé au 33 rue des Écoles à Paris. Son numéro d'identification de presse est FR-75649023.",
58
- "person, profession, organization, address, press ID number",
59
  0.5,
60
  False,
61
  ],
62
  [
63
- "Manuel Oliveira, um agricultor em Porto, possui uma grande plantação de vinhas na Rua da Estrada, 120. O número de registro agrícola é PT-5678912345.",
64
- "person, profession, address, agricultural registration number",
65
  0.5,
66
  False,
67
  ],
68
- [
69
- "Elisa Müller, eine Künstlerin aus Berlin, hat ihre neueste Skulptur im öffentlichen Park am Alexanderplatz ausgestellt. Ihre Künstlernummer lautet DE-112233445.",
70
- "person, profession, location, artist ID number",
71
- 0.5,
72
- False,
73
- ],
74
- [
75
- "Federico García, un jugador de fútbol de Sevilla, ha firmado un contrato de tres años con el club Real Betis. Su número de licencia deportiva es ES-9876543210.",
76
- "person, profession, organization, sports license number",
77
- 0.5,
78
- False,
79
- ],
80
- [
81
- "Sarah White, a London-based actress, will be performing in 'Hamlet' at the Globe Theatre located at 21 New Globe Walk. Her Equity membership number is UK-1234567.",
82
- "person, profession, location, address, membership number",
83
- 0.5,
84
- False,
85
- ],
86
- [
87
- "Ricardo Mello, engenheiro civil, trabalha na construção da nova barragem no Rio Douro, Portugal. Seu número de registro profissional é PT-987654321.",
88
- "person, profession, project location, professional registration number",
89
- 0.5,
90
- False,
91
- ],
92
- [
93
- "Giuseppe Conti, un cliente di Milano, ha fatto un acquisto presso il negozio La Rinascente situato in Piazza Duomo. Il numero della sua carta di credito è IT-4567891234567891.",
94
- "person, location, address, credit card number",
95
- 0.5,
96
- False,
97
- ]
98
  ]
99
 
100
-
101
  def ner(
102
  text, labels: str, threshold: float, nested_ner: bool
103
  ) -> Dict[str, Union[str, int, float]]:
@@ -122,19 +97,11 @@ def ner(
122
  with gr.Blocks(title="GLiNER-M-v2.1") as demo:
123
  gr.Markdown(
124
  """
125
- # GLiNER-PII (Personnally Identifiable Information extraction)
126
 
127
  GLiNER is a Named Entity Recognition (NER) model capable of identifying any entity type using a bidirectional transformer encoder (BERT-like). It provides a practical alternative to traditional NER models, which are limited to predefined entities, and Large Language Models (LLMs) that, despite their flexibility, are costly and large for resource-constrained scenarios.
128
-
129
- The model has been trained by fine-tuning urchade/gliner_multi-v2.1 on the urchade/synthetic-pii-ner-mistral-v1 dataset.
130
-
131
- ## Links
132
-
133
- * Model: https://huggingface.co/urchade/gliner_multi_pii-v1
134
- * All GLiNER models: https://huggingface.co/models?library=gliner
135
- * Paper: https://arxiv.org/abs/2311.08526
136
- * Repository: https://github.com/urchade/GLiNER
137
  """
 
138
  )
139
  with gr.Accordion("How to run this model locally", open=False):
140
  gr.Markdown(
@@ -150,31 +117,33 @@ with gr.Blocks(title="GLiNER-M-v2.1") as demo:
150
  """
151
  )
152
  gr.Code(
153
- '''
154
- from gliner import GLiNER
155
-
156
- model = GLiNER.from_pretrained("urchade/gliner_multi_pii-v1")
157
-
158
- text = """
159
- Harilala Rasoanaivo, un homme d'affaires local d'Antananarivo, a enregistré une nouvelle société nommée "Rasoanaivo Enterprises" au Lot II M 92 Antohomadinika. Son numéro est le +261 32 22 345 67, et son adresse électronique est harilala.rasoanaivo@telma.mg. Il a fourni son numéro de sécu 501-02-1234 pour l'enregistrement.
160
- """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
161
 
162
- labels = ["work", "booking number", "personally identifiable information", "driver licence", "person", "book", "full address", "company", "actor", "character", "email", "passport number", "Social Security Number", "phone number"]
163
  entities = model.predict_entities(text, labels)
164
 
165
  for entity in entities:
166
- print(entity["text"], "=>", entity["label"])
167
- ''',
168
- language="python",
169
- )
170
- gr.Code(
171
- """
172
- Harilala Rasoanaivo => person
173
- Rasoanaivo Enterprises => company
174
- Lot II M 92 Antohomadinika => full address
175
- +261 32 22 345 67 => phone number
176
- harilala.rasoanaivo@telma.mg => email
177
- 501-02-1234 => Social Security Number
178
  """
179
  )
180
 
 
1
  from typing import Dict, Union
2
  from gliner import GLiNER
3
  import gradio as gr
4
+ import os
5
 
6
+ model = GLiNER.from_pretrained(
7
+ "gravitee-io/gliner-pii-detection",
8
+ token=os.getenv("HUGGINGFACE_TOKEN"),
9
+ load_onnx_model=True,
10
+ load_tokenizer=True, onnx_model_file="model.onnx"
11
+ )
12
 
13
  examples = [
14
  [
15
+ "Jana Kowalczyk's driver license number is PL-DL-55443322 and she resides at 78 Ulica Nowowiejska, Wrocław. Her contact email is jana.k@example.com.",
16
+ "name, driver_license_number, street_address, email",
17
  0.5,
18
  False,
19
  ],
20
  [
21
+ "Nguyen Van Long from Hanoi logs in from the IP 10.0.0.5 and uses the API key: 12ab34cd56ef78gh90ij. His company is VietNet Global.",
22
+ "name, street_address, ipv4, api_key, company",
23
  0.5,
24
  False,
25
  ],
26
  [
27
+ "Sarah Johnson made a transaction using the credit card 379354508162306 and the CVV 834. Her IBAN is GB29 NWBK 6016 1331 9268 19.",
28
+ "name, credit_card_number, credit_card_security_code, iban",
29
  0.5,
30
  False,
31
  ],
32
  [
33
+ "Employee Thomas Becker has the ID DE-EMP-44991 and joined DataFlux GmbH on 2021-12-01. His internal email is t.becker@dataflux.de.",
34
+ "name, employee_id, company, date, email",
35
  0.5,
36
  False,
37
  ],
38
  [
39
+ "Laura Rossi lives at Via Roma 101, Milano. Her social security number is IT-9988776655 and she was born on 1982-07-14.",
40
+ "name, street_address, ssn, date_of_birth",
41
  0.5,
42
  False,
43
  ],
44
  [
45
+ "Omar El-Zein uses the SWIFT code BOFAUS3N and his bank routing number is 026009593. He lives near 12 Al-Azhar Street, Cairo.",
46
+ "name, swift_bic_code, bank_routing_number, street_address",
47
  0.5,
48
  False,
49
  ],
50
  [
51
+ "Chen Wei's employee badge shows ID EMP-CN-8899. He signed the contract on 2023-03-20 at 10:15 AM using the password Dragon@123.",
52
+ "name, employee_id, date, time, password",
53
  0.5,
54
  False,
55
  ],
56
  [
57
+ "Fatoumata Diarra, born 1994-04-04, lives at 45 Avenue de la Liberté, Bamako. Her BBAN is ML2930012345678901234567890.",
58
+ "name, date_of_birth, street_address, bban",
59
  0.5,
60
  False,
61
  ],
62
  [
63
+ "Daniel Evans has a passport number K01234567 and a permanent address at 500 Pine Street, Seattle. His contact number is +1-206-555-0199.",
64
+ "name, passport_number, street_address, phone_number",
65
  0.5,
66
  False,
67
  ],
68
  [
69
+ "Alejandro Torres created his customer account on 2024-08-30 using email ale.torres@correo.mx and ID CUST-MX-1122.",
70
+ "name, date, email, customer_id",
71
  0.5,
72
  False,
73
  ],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
  ]
75
 
 
76
  def ner(
77
  text, labels: str, threshold: float, nested_ner: bool
78
  ) -> Dict[str, Union[str, int, float]]:
 
97
  with gr.Blocks(title="GLiNER-M-v2.1") as demo:
98
  gr.Markdown(
99
  """
100
+ # Gravitee PII (Personally Identifiable Information extraction)
101
 
102
  GLiNER is a Named Entity Recognition (NER) model capable of identifying any entity type using a bidirectional transformer encoder (BERT-like). It provides a practical alternative to traditional NER models, which are limited to predefined entities, and Large Language Models (LLMs) that, despite their flexibility, are costly and large for resource-constrained scenarios.
 
 
 
 
 
 
 
 
 
103
  """
104
+
105
  )
106
  with gr.Accordion("How to run this model locally", open=False):
107
  gr.Markdown(
 
117
  """
118
  )
119
  gr.Code(
120
+ """
121
+ model = GLiNER.from_pretrained(
122
+ "gravitee-io/gliner-pii-detection",
123
+ load_onnx_model=True,
124
+ load_tokenizer=True, onnx_model_file="model.onnx"
125
+ )
126
+
127
+ text = '''
128
+ Hey, just a quick update. I talked to David yesterday.
129
+ He sent over the files from his private email (david.doe@example.com), and we should be careful with his SSN: 123-45-6789.
130
+ Also, please don't push the GitHub repo until we remove the API key: ghp_abcdEfgh1234567890.
131
+ He mentioned his new address is 123 Maple Street in New York.
132
+ His PC address is 192.168.1.100.
133
+ '''
134
+
135
+ labels = ["name",
136
+ "email",
137
+ "ssn",
138
+ "api_key",
139
+ "street_address",
140
+ "date",
141
+ "ipv4"]
142
 
 
143
  entities = model.predict_entities(text, labels)
144
 
145
  for entity in entities:
146
+ print(entity["text"], "=>", entity["label"], "=>", entity["score"])
 
 
 
 
 
 
 
 
 
 
 
147
  """
148
  )
149