Update README.md — rewritten usage-example section.

File: README.md (changed)
Diff hunk: @@ -101,89 +101,197 @@ To use the model for spell correction:
| 101 |
|
| 102 |
```python
|
| 103 |
import torch
|
| 104 |
-
from transformers import
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
)
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
"
|
| 137 |
-
"
|
| 138 |
-
"
|
| 139 |
-
"
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 165 |
current_entity = {
|
| 166 |
-
"
|
| 167 |
-
"
|
| 168 |
-
"
|
|
|
|
|
|
|
| 169 |
}
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 178 |
```
|
| 179 |
|
| 180 |
-
```
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 187 |
```
|
| 188 |
|
| 189 |
|
|
|
|
| 101 |
|
| 102 |
```python
|
| 103 |
import torch
|
| 104 |
+
from transformers import AutoModelForTokenClassification, XLMRobertaTokenizerFast
|
| 105 |
+
import numpy as np
|
| 106 |
+
from typing import List, Dict, Tuple
|
| 107 |
+
|
| 108 |
+
class AzerbaijaniNER:
    """Named-entity recognition for Azerbaijani PII.

    Wraps a fine-tuned XLM-RoBERTa token classifier and provides helpers to
    extract entities, highlight them inline, and anonymize them in free text.
    Entity character offsets always refer to the caller's input string.
    """

    # Length-preserving pre-lowering table. Python's str.lower() maps the
    # Azerbaijani dotted capital 'İ' (U+0130) to TWO code points
    # ('i' + U+0307 combining dot), which would shift every character offset
    # after it and corrupt the spans returned to callers. Mapping 'İ' -> 'i'
    # first keeps lowering 1:1 with the original string.
    # NOTE(review): Azerbaijani also lowercases 'I' -> dotless 'ı'; the model
    # may or may not expect that — left as plain .lower() ('I' -> 'i') to
    # preserve the original behavior. Confirm against the training data.
    _SAFE_LOWER = str.maketrans({"İ": "i"})

    def __init__(self, model_name_or_path="LocalDoc/private_ner_azerbaijani_v2"):
        """Load model and tokenizer; move the model to GPU when available.

        Args:
            model_name_or_path: HF hub id or local path of the fine-tuned
                token-classification checkpoint.
        """
        self.model = AutoModelForTokenClassification.from_pretrained(model_name_or_path)
        # Tokenizer comes from the base checkpoint; must be the "fast"
        # variant because predict() relies on return_offsets_mapping.
        self.tokenizer = XLMRobertaTokenizerFast.from_pretrained("xlm-roberta-base")

        self.model.eval()

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)

        # Classification-head index -> BIO tag.
        self.id_to_label = {
            0: "O",
            1: "B-AGE", 2: "B-BUILDINGNUM", 3: "B-CITY", 4: "B-CREDITCARDNUMBER",
            5: "B-DATE", 6: "B-DRIVERLICENSENUM", 7: "B-EMAIL", 8: "B-GIVENNAME",
            9: "B-IDCARDNUM", 10: "B-PASSPORTNUM", 11: "B-STREET", 12: "B-SURNAME",
            13: "B-TAXNUM", 14: "B-TELEPHONENUM", 15: "B-TIME", 16: "B-ZIPCODE",
            17: "I-AGE", 18: "I-BUILDINGNUM", 19: "I-CITY", 20: "I-CREDITCARDNUMBER",
            21: "I-DATE", 22: "I-DRIVERLICENSENUM", 23: "I-EMAIL", 24: "I-GIVENNAME",
            25: "I-IDCARDNUM", 26: "I-PASSPORTNUM", 27: "I-STREET", 28: "I-SURNAME",
            29: "I-TAXNUM", 30: "I-TELEPHONENUM", 31: "I-TIME", 32: "I-ZIPCODE"
        }

        # Raw entity type -> human-readable display name (used in the
        # "name" field of predict() results and in highlight_entities()).
        self.entity_types = {
            "AGE": "Age",
            "BUILDINGNUM": "Building Number",
            "CITY": "City",
            "CREDITCARDNUMBER": "Credit Card Number",
            "DATE": "Date",
            "DRIVERLICENSENUM": "Driver License Number",
            "EMAIL": "Email",
            "GIVENNAME": "Given Name",
            "IDCARDNUM": "ID Card Number",
            "PASSPORTNUM": "Passport Number",
            "STREET": "Street",
            "SURNAME": "Surname",
            "TAXNUM": "Tax ID Number",
            "TELEPHONENUM": "Phone Number",
            "TIME": "Time",
            "ZIPCODE": "Zip Code"
        }

    def predict(self, text: str, max_length: int = 512) -> List[Dict]:
        """Extract PII entities from *text*.

        Args:
            text: Input string (any casing; it is lowercased internally).
            max_length: Token budget; text beyond it is silently truncated.

        Returns:
            List of dicts with keys "label" (raw type, e.g. "CITY"),
            "name" (display name), "start"/"end" (character offsets into
            *text*), and "value" (the matched span, lowercased).
        """
        # The model expects lowercased input. Use the length-preserving
        # pre-lowering so tokenizer offsets stay valid for the caller's
        # original string (plain .lower() breaks on 'İ' — see _SAFE_LOWER).
        text = text.translate(self._SAFE_LOWER).lower()

        inputs = self.tokenizer(
            text,
            return_tensors="pt",
            max_length=max_length,
            padding="max_length",
            truncation=True,
            return_offsets_mapping=True
        )

        # Per-token (start, end) character spans; (0, 0) marks special
        # tokens and padding.
        offset_mapping = inputs.pop("offset_mapping").numpy()[0]

        inputs = {k: v.to(self.device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = self.model(**inputs)
            predictions = outputs.logits.argmax(dim=2)

        predictions = predictions[0].cpu().numpy()

        entities = []
        current_entity = None

        # Greedy BIO decoding: open on B-, extend on matching I-, close on
        # O or on a tag-type mismatch.
        for offset, pred_id in zip(offset_mapping, predictions):
            if offset[0] == 0 and offset[1] == 0:
                continue  # special token / padding

            pred_label = self.id_to_label[pred_id]

            if pred_label.startswith("B-"):
                if current_entity:
                    entities.append(current_entity)

                entity_type = pred_label[2:]
                current_entity = {
                    "label": entity_type,
                    "name": self.entity_types.get(entity_type, entity_type),
                    "start": int(offset[0]),
                    "end": int(offset[1]),
                    "value": text[offset[0]:offset[1]]
                }

            elif pred_label.startswith("I-") and current_entity is not None:
                entity_type = pred_label[2:]

                if entity_type == current_entity["label"]:
                    # Same type: grow the open entity through this token.
                    current_entity["end"] = int(offset[1])
                    current_entity["value"] = text[current_entity["start"]:current_entity["end"]]
                else:
                    # I- of a different type: close the open entity; the
                    # stray I- token itself is discarded.
                    entities.append(current_entity)
                    current_entity = None

            elif pred_label == "O" and current_entity is not None:
                entities.append(current_entity)
                current_entity = None

        # Flush an entity still open at end of sequence.
        if current_entity:
            entities.append(current_entity)

        return entities

    def anonymize_text(self, text: str, replacement_char: str = "X") -> Tuple[str, List[Dict]]:
        """Mask every detected entity with *replacement_char*.

        Returns:
            (masked_text, entities) where entities is sorted by start offset
            and masked_text has each entity span replaced character-for-
            character, preserving the original length.
        """
        entities = self.predict(text)

        if not entities:
            return text, []

        # Replace right-to-left so earlier offsets stay valid while slicing.
        entities.sort(key=lambda x: x["start"], reverse=True)

        anonymized_text = text
        for entity in entities:
            start = entity["start"]
            end = entity["end"]
            length = end - start
            anonymized_text = anonymized_text[:start] + replacement_char * length + anonymized_text[end:]

        # Hand entities back in reading order.
        entities.sort(key=lambda x: x["start"])

        return anonymized_text, entities

    def highlight_entities(self, text: str) -> str:
        """Return *text* with each entity rewritten as "[Type: value]"."""
        entities = self.predict(text)

        if not entities:
            return text

        # Right-to-left so insertions don't invalidate earlier offsets.
        entities.sort(key=lambda x: x["start"], reverse=True)

        highlighted_text = text
        for entity in entities:
            start = entity["start"]
            end = entity["end"]
            entity_value = entity["value"]
            entity_type = entity["name"]

            highlighted_text = (
                highlighted_text[:start] +
                f"[{entity_type}: {entity_value}]" +
                highlighted_text[end:]
            )

        return highlighted_text
|
| 253 |
+
|
| 254 |
+
if __name__ == "__main__":
    # Demo: run the pipeline once over a sample sentence and show the three
    # views — raw entities, inline highlights, and the anonymized text.
    recognizer = AzerbaijaniNER()

    sample = """Salam, mənim adım Əli Hüseynovdu. Doğum tarixim 15.05.1990-dır. Bakı şəhərində, 28 may küçəsi 4 ünvanında yaşayıram. Telefon nömrəm +994552345678-dir. Mən 4169741358254152 nömrəli kartdan ödəniş etmişəm. Sifarişim nə vaxt çatdırılcaq ?"""

    print("=== Original Text ===")
    print(sample)
    print("\n=== Found Entities ===")

    for found in recognizer.predict(sample):
        print(f"{found['name']}: {found['value']} (positions {found['start']}-{found['end']})")

    print("\n=== Text with Highlighted Entities ===")
    print(recognizer.highlight_entities(sample))

    print("\n=== Anonymized Text ===")
    masked, _ = recognizer.anonymize_text(sample)
    print(masked)
|
| 274 |
```
|
| 275 |
|
| 276 |
+
```
|
| 277 |
+
=== Original Text ===
|
| 278 |
+
Salam, mənim adım Əli Hüseynovdu. Doğum tarixim 15.05.1990-dır. Bakı şəhərində, 28 may küçəsi 4 ünvanında yaşayıram. Telefon nömrəm +994552345678-dir. Mən 4169741358254152 nömrəli kartdan ödəniş etmişəm. Sifarişim nə vaxt çatdırılcaq ?
|
| 279 |
+
|
| 280 |
+
=== Found Entities ===
|
| 281 |
+
Given Name: əli (positions 18-21)
|
| 282 |
+
Surname: hüseynov (positions 22-30)
|
| 283 |
+
Date: 15.05.1990 (positions 48-58)
|
| 284 |
+
City: bakı (positions 64-68)
|
| 285 |
+
Street: 28 may küçəsi (positions 80-93)
|
| 286 |
+
Building Number: 4 (positions 94-95)
|
| 287 |
+
Phone Number: +994552345678 (positions 132-145)
|
| 288 |
+
Credit Card Number: 4169741358254152 (positions 155-171)
|
| 289 |
+
|
| 290 |
+
=== Text with Highlighted Entities ===
|
| 291 |
+
Salam, mənim adım [Given Name: əli] [Surname: hüseynov]du. Doğum tarixim [Date: 15.05.1990]-dır. [City: bakı] şəhərində, [Street: 28 may küçəsi] [Building Number: 4] ünvanında yaşayıram. Telefon nömrəm [Phone Number: +994552345678]-dir. Mən [Credit Card Number: 4169741358254152] nömrəli kartdan ödəniş etmişəm. Sifarişim nə vaxt çatdırılcaq ?
|
| 292 |
+
|
| 293 |
+
=== Anonymized Text ===
|
| 294 |
+
Salam, mənim adım XXX XXXXXXXXdu. Doğum tarixim XXXXXXXXXX-dır. XXXX şəhərində, XXXXXXXXXXXXX X ünvanında yaşayıram. Telefon nömrəm XXXXXXXXXXXXX-dir. Mən XXXXXXXXXXXXXXXX nömrəli kartdan ödəniş etmişəm. Sifarişim nə vaxt çatdırılcaq ?
|
| 295 |
```
|
| 296 |
|
| 297 |
|