vrashad commited on
Commit
07c826b
·
verified ·
1 Parent(s): 841be26

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +187 -79
README.md CHANGED
@@ -101,89 +101,197 @@ To use the model for named entity recognition:
101
 
102
  ```python
103
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification

model_id = "LocalDoc/private_ner_azerbaijani_v2"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForTokenClassification.from_pretrained(model_id)

test_text = (
    "Salam, mənim adım Əli Hüseynovdur. Doğum tarixim 15.05.1990-dır. "
    "Bakı şəhərində, Nizami küçəsində, 25/31 ünvanında yaşayıram. "
    "Telefon nömrəm +994552345678-dir."
)

# Offsets map each sub-word token back to character positions in test_text.
inputs = tokenizer(test_text, return_tensors="pt", return_offsets_mapping=True)

# The model's forward() does not accept offset_mapping, so remove it first.
offset_mapping = inputs.pop("offset_mapping")

with torch.no_grad():
    outputs = model(**inputs)

predictions = torch.argmax(outputs.logits, dim=2)

tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
offset_mapping = offset_mapping[0].tolist()
predicted_labels = [model.config.id2label[pred.item()] for pred in predictions[0]]
word_ids = inputs.word_ids(batch_index=0)

# Group sub-word tokens back into whole words; the first sub-token's label
# is taken as the word's label.
aggregated = []
prev_word_id = None
for idx, word_id in enumerate(word_ids):
    if word_id is None:  # special tokens (<s>, </s>, padding)
        continue
    if word_id != prev_word_id:
        aggregated.append({
            "word_id": word_id,
            "tokens": [tokens[idx]],
            "offsets": [offset_mapping[idx]],
            "label": predicted_labels[idx],
        })
    else:
        aggregated[-1]["tokens"].append(tokens[idx])
        aggregated[-1]["offsets"].append(offset_mapping[idx])
    prev_word_id = word_id


def _new_entity(word):
    # Start a fresh entity span from one aggregated word.
    return {
        "type": word["label"],
        "start": word["offsets"][0][0],
        "end": word["offsets"][-1][1],
    }


# Merge consecutive words with the same label into one entity span.
entities = []
current_entity = None
for word in aggregated:
    if word["label"] == "O":
        if current_entity is not None:
            entities.append(current_entity)
            current_entity = None
    elif current_entity is None:
        current_entity = _new_entity(word)
    elif word["label"] == current_entity["type"]:
        current_entity["end"] = word["offsets"][-1][1]
    else:
        entities.append(current_entity)
        current_entity = _new_entity(word)
if current_entity is not None:
    entities.append(current_entity)

for entity in entities:
    entity["text"] = test_text[entity["start"]:entity["end"]]
    print(entity)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
178
  ```
179
 
180
- ```json
181
- {'type': 'FIRSTNAME', 'start': 18, 'end': 21, 'text': 'Əli'}
182
- {'type': 'LASTNAME', 'start': 22, 'end': 34, 'text': 'Hüseynovdur.'}
183
- {'type': 'DOB', 'start': 49, 'end': 64, 'text': '15.05.1990-dır.'}
184
- {'type': 'STREET', 'start': 81, 'end': 87, 'text': 'Nizami'}
185
- {'type': 'BUILDINGNUMBER', 'start': 99, 'end': 104, 'text': '25/31'}
186
- {'type': 'PHONENUMBER', 'start': 141, 'end': 159, 'text': '+994552345678-dir.'}
 
 
 
 
 
 
 
 
 
 
 
 
187
  ```
188
 
189
 
 
101
 
102
  ```python
103
import torch
from transformers import AutoModelForTokenClassification, XLMRobertaTokenizerFast
import numpy as np
from typing import List, Dict, Tuple


class AzerbaijaniNER:
    """Token-classification NER for Azerbaijani personal data (PII).

    Wraps an XLM-RoBERTa token classifier and exposes entity extraction,
    inline highlighting, and character-masking anonymization.
    """

    def __init__(self, model_name_or_path="LocalDoc/private_ner_azerbaijani_v2"):
        self.model = AutoModelForTokenClassification.from_pretrained(model_name_or_path)
        self.tokenizer = XLMRobertaTokenizerFast.from_pretrained("xlm-roberta-base")

        self.model.eval()

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)

        # Model-output id -> BIO tag. NOTE(review): assumed to match the
        # checkpoint's training label order — confirm against its config.
        self.id_to_label = {
            0: "O",
            1: "B-AGE", 2: "B-BUILDINGNUM", 3: "B-CITY", 4: "B-CREDITCARDNUMBER",
            5: "B-DATE", 6: "B-DRIVERLICENSENUM", 7: "B-EMAIL", 8: "B-GIVENNAME",
            9: "B-IDCARDNUM", 10: "B-PASSPORTNUM", 11: "B-STREET", 12: "B-SURNAME",
            13: "B-TAXNUM", 14: "B-TELEPHONENUM", 15: "B-TIME", 16: "B-ZIPCODE",
            17: "I-AGE", 18: "I-BUILDINGNUM", 19: "I-CITY", 20: "I-CREDITCARDNUMBER",
            21: "I-DATE", 22: "I-DRIVERLICENSENUM", 23: "I-EMAIL", 24: "I-GIVENNAME",
            25: "I-IDCARDNUM", 26: "I-PASSPORTNUM", 27: "I-STREET", 28: "I-SURNAME",
            29: "I-TAXNUM", 30: "I-TELEPHONENUM", 31: "I-TIME", 32: "I-ZIPCODE"
        }

        # Human-readable names used in highlight output.
        self.entity_types = {
            "AGE": "Age",
            "BUILDINGNUM": "Building Number",
            "CITY": "City",
            "CREDITCARDNUMBER": "Credit Card Number",
            "DATE": "Date",
            "DRIVERLICENSENUM": "Driver License Number",
            "EMAIL": "Email",
            "GIVENNAME": "Given Name",
            "IDCARDNUM": "ID Card Number",
            "PASSPORTNUM": "Passport Number",
            "STREET": "Street",
            "SURNAME": "Surname",
            "TAXNUM": "Tax ID Number",
            "TELEPHONENUM": "Phone Number",
            "TIME": "Time",
            "ZIPCODE": "Zip Code"
        }

    @staticmethod
    def _lowercase_preserving_offsets(text: str) -> str:
        """Lowercase *text* without changing its length.

        Plain ``str.lower()`` expands the dotted capital 'İ' (common in
        Azerbaijani) into TWO code points ('i' + combining dot), which would
        shift every character offset after it and corrupt the spans used by
        anonymize_text()/highlight_entities(). Map 'İ' to a plain 'i' first
        so len(lowered) == len(text).
        """
        return text.replace("İ", "i").lower()

    def predict(self, text: str, max_length: int = 512) -> List[Dict]:
        """Extract entities from *text*.

        Returns a list of dicts with keys ``label`` (tag name), ``name``
        (human-readable type), ``start``/``end`` (character offsets into the
        ORIGINAL text), and ``value`` (the lowercased matched span).
        """
        # Length-preserving lowering keeps start/end valid for the original text.
        text = self._lowercase_preserving_offsets(text)

        # No padding needed for a single sequence: special/pad tokens would be
        # skipped below anyway, so predictions are identical and faster.
        inputs = self.tokenizer(
            text,
            return_tensors="pt",
            max_length=max_length,
            truncation=True,
            return_offsets_mapping=True
        )

        offset_mapping = inputs.pop("offset_mapping").numpy()[0]

        inputs = {k: v.to(self.device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = self.model(**inputs)
            predictions = outputs.logits.argmax(dim=2)

        predictions = predictions[0].cpu().numpy()

        entities = []
        current_entity = None

        for offset, pred_id in zip(offset_mapping, predictions):
            if offset[0] == 0 and offset[1] == 0:
                # (0, 0) marks special tokens — not part of the input text.
                continue

            pred_label = self.id_to_label[int(pred_id)]

            if pred_label.startswith("B-"):
                if current_entity:
                    entities.append(current_entity)

                entity_type = pred_label[2:]
                current_entity = {
                    "label": entity_type,
                    "name": self.entity_types.get(entity_type, entity_type),
                    "start": int(offset[0]),
                    "end": int(offset[1]),
                    "value": text[int(offset[0]):int(offset[1])]
                }

            elif pred_label.startswith("I-") and current_entity is not None:
                entity_type = pred_label[2:]

                if entity_type == current_entity["label"]:
                    # Extend the open entity through this token.
                    current_entity["end"] = int(offset[1])
                    current_entity["value"] = text[current_entity["start"]:current_entity["end"]]
                else:
                    # Type mismatch: close the open entity; an orphan I- tag
                    # (no matching B-) is deliberately discarded.
                    entities.append(current_entity)
                    current_entity = None

            elif pred_label == "O" and current_entity is not None:
                entities.append(current_entity)
                current_entity = None

        if current_entity:
            entities.append(current_entity)

        return entities

    def anonymize_text(self, text: str, replacement_char: str = "X") -> Tuple[str, List[Dict]]:
        """Mask every detected entity in *text* with *replacement_char*.

        Returns (anonymized_text, entities) where entities are sorted by
        start offset.
        """
        entities = self.predict(text)

        if not entities:
            return text, []

        # Replace right-to-left so earlier offsets stay valid while editing.
        entities.sort(key=lambda e: e["start"], reverse=True)

        anonymized_text = text
        for entity in entities:
            start, end = entity["start"], entity["end"]
            anonymized_text = (
                anonymized_text[:start]
                + replacement_char * (end - start)
                + anonymized_text[end:]
            )

        entities.sort(key=lambda e: e["start"])

        return anonymized_text, entities

    def highlight_entities(self, text: str) -> str:
        """Return *text* with each entity rewritten as ``[Type: value]``."""
        entities = self.predict(text)

        if not entities:
            return text

        # Right-to-left so insertions don't invalidate remaining offsets.
        entities.sort(key=lambda e: e["start"], reverse=True)

        highlighted_text = text
        for entity in entities:
            highlighted_text = (
                highlighted_text[:entity["start"]] +
                f"[{entity['name']}: {entity['value']}]" +
                highlighted_text[entity["end"]:]
            )

        return highlighted_text


if __name__ == "__main__":
    ner = AzerbaijaniNER()

    test_text = """Salam, mənim adım Əli Hüseynovdu. Doğum tarixim 15.05.1990-dır. Bakı şəhərində, 28 may küçəsi 4 ünvanında yaşayıram. Telefon nömrəm +994552345678-dir. Mən 4169741358254152 nömrəli kartdan ödəniş etmişəm. Sifarişim nə vaxt çatdırılcaq ?"""

    print("=== Original Text ===")
    print(test_text)
    print("\n=== Found Entities ===")

    entities = ner.predict(test_text)
    for entity in entities:
        print(f"{entity['name']}: {entity['value']} (positions {entity['start']}-{entity['end']})")

    print("\n=== Text with Highlighted Entities ===")
    highlighted_text = ner.highlight_entities(test_text)
    print(highlighted_text)

    print("\n=== Anonymized Text ===")
    anonymized_text, _ = ner.anonymize_text(test_text)
    print(anonymized_text)
274
  ```
275
 
276
+ ```
277
+ === Original Text ===
278
+ Salam, mənim adım Əli Hüseynovdu. Doğum tarixim 15.05.1990-dır. Bakı şəhərində, 28 may küçəsi 4 ünvanında yaşayıram. Telefon nömrəm +994552345678-dir. Mən 4169741358254152 nömrəli kartdan ödəniş etmişəm. Sifarişim nə vaxt çatdırılcaq ?
279
+
280
+ === Found Entities ===
281
+ Given Name: əli (positions 18-21)
282
+ Surname: hüseynov (positions 22-30)
283
+ Date: 15.05.1990 (positions 48-58)
284
+ City: bakı (positions 64-68)
285
+ Street: 28 may küçəsi (positions 80-93)
286
+ Building Number: 4 (positions 94-95)
287
+ Phone Number: +994552345678 (positions 132-145)
288
+ Credit Card Number: 4169741358254152 (positions 155-171)
289
+
290
+ === Text with Highlighted Entities ===
291
+ Salam, mənim adım [Given Name: əli] [Surname: hüseynov]du. Doğum tarixim [Date: 15.05.1990]-dır. [City: bakı] şəhərində, [Street: 28 may küçəsi] [Building Number: 4] ünvanında yaşayıram. Telefon nömrəm [Phone Number: +994552345678]-dir. Mən [Credit Card Number: 4169741358254152] nömrəli kartdan ödəniş etmişəm. Sifarişim nə vaxt çatdırılcaq ?
292
+
293
+ === Anonymized Text ===
294
+ Salam, mənim adım XXX XXXXXXXXdu. Doğum tarixim XXXXXXXXXX-dır. XXXX şəhərində, XXXXXXXXXXXXX X ünvanında yaşayıram. Telefon nömrəm XXXXXXXXXXXXX-dir. Mən XXXXXXXXXXXXXXXX nömrəli kartdan ödəniş etmişəm. Sifarişim nə vaxt çatdırılcaq ?
295
  ```
296
 
297