Update README.md
Browse files
README.md
CHANGED
|
@@ -99,7 +99,7 @@ formats = {
|
|
| 99 |
"sensitivity": """<|im_start|>system\nSensitivity<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant\n""",
|
| 100 |
"complexity": """<|im_start|>system\nComplexity<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant\n""",
|
| 101 |
"entity_detection": """<|im_start|>system\nEntity Detection<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant\n""",
|
| 102 |
-
"entity_swapping": """<|im_start|>system\nEntity Swapping<|im_end|>\n<|im_start|>user\nentities
|
| 103 |
}
|
| 104 |
|
| 105 |
def model_inference(text, mode="anonymization", max_new_tokens=2028, config=None, entity_mapping=None, return_entities=False, reverse_mapping=False):
|
|
@@ -288,26 +288,26 @@ def select_entities_based_on_json(prediction_json, entity_json):
|
|
| 288 |
|
| 289 |
Example text
|
| 290 |
```
|
| 291 |
-
We have a community picnic
|
| 292 |
```
|
| 293 |
|
| 294 |
The sensitivity analysis feature evaluates the sensitivity of a given text, and the complexity feature rates its complexity.
|
| 295 |
```python
|
| 296 |
-
text = "We have a community picnic
|
| 297 |
|
| 298 |
# Generate sensitivity score
|
| 299 |
sensitivity_score = model_inference(text, mode="sensitivity")
|
| 300 |
-
print(f"Sensitivity Score: {sensitivity_score}"
|
| 301 |
|
| 302 |
# Generate complexity score
|
| 303 |
complexity_score = model_inference(text, mode="complexity")
|
| 304 |
-
print(f"Complexity: {complexity_score}"
|
| 305 |
```
|
| 306 |
|
| 307 |
Output
|
| 308 |
```
|
| 309 |
Sensitivity Score: 0
|
| 310 |
-
Complexity
|
| 311 |
```
|
| 312 |
|
| 313 |
### 3. Anonymization and Re-Anonymization
|
|
@@ -325,58 +325,20 @@ print(f"Anonymized Text: {anonymized_text}\n")
|
|
| 325 |
# Restore the original text
|
| 326 |
anonymized_text, entity_mapping = model_inference(text, mode="anonymization", return_entities=True)
|
| 327 |
print(f"Entity Mapping:\n{entity_mapping}\n")
|
| 328 |
-
print(f"Anonymized Text: {anonymized_text}\n")
|
| 329 |
restored_text = model_inference(anonymized_text, mode="entity_swapping", entity_mapping=entity_mapping, reverse_mapping=True)
|
| 330 |
print(f"Restored Text: {restored_text}")
|
| 331 |
```
|
| 332 |
|
| 333 |
Output
|
| 334 |
```
|
| 335 |
-
Anonymized Text: We have a community picnic
|
| 336 |
|
| 337 |
-
|
| 338 |
-
|
|
|
|
|
|
|
| 339 |
|
| 340 |
-
|
| 341 |
-
```python
|
| 342 |
-
print(f"{json.dumps(entity_mapping, indent=4)}")
|
| 343 |
-
```
|
| 344 |
-
Output
|
| 345 |
-
```
|
| 346 |
-
{
|
| 347 |
-
"Greenfield Park": {
|
| 348 |
-
"TYPE": "LOC",
|
| 349 |
-
"RANDOM": "Maplewood Park",
|
| 350 |
-
"GENERAL": [
|
| 351 |
-
[
|
| 352 |
-
"Local Park",
|
| 353 |
-
"3"
|
| 354 |
-
],
|
| 355 |
-
[
|
| 356 |
-
"Public Park",
|
| 357 |
-
"5"
|
| 358 |
-
],
|
| 359 |
-
[
|
| 360 |
-
"Recreational Area",
|
| 361 |
-
"7"
|
| 362 |
-
]
|
| 363 |
-
]
|
| 364 |
-
},
|
| 365 |
-
"11 AM": {
|
| 366 |
-
"TYPE": "DATETIME",
|
| 367 |
-
"RANDOM": "1 PM",
|
| 368 |
-
"GENERAL": [
|
| 369 |
-
[
|
| 370 |
-
"Late Morning",
|
| 371 |
-
"2"
|
| 372 |
-
],
|
| 373 |
-
[
|
| 374 |
-
"A",
|
| 375 |
-
"4"
|
| 376 |
-
]
|
| 377 |
-
]
|
| 378 |
-
}
|
| 379 |
-
}
|
| 380 |
```
|
| 381 |
|
| 382 |
Normally you would process the anonymized version with an LLM and then re-anonymize the result, i.e. swap the original entities back in.
|
|
|
|
| 99 |
"sensitivity": """<|im_start|>system\nSensitivity<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant\n""",
|
| 100 |
"complexity": """<|im_start|>system\nComplexity<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant\n""",
|
| 101 |
"entity_detection": """<|im_start|>system\nEntity Detection<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant\n""",
|
| 102 |
+
"entity_swapping": """<|im_start|>system\nEntity Swapping<|im_end|>\n<|im_start|>user\nentities:\n{entities}\ntext:\n{text}<|im_end|>\n<|im_start|>assistant\n"""
|
| 103 |
}
|
| 104 |
|
| 105 |
def model_inference(text, mode="anonymization", max_new_tokens=2028, config=None, entity_mapping=None, return_entities=False, reverse_mapping=False):
|
|
|
|
| 288 |
|
| 289 |
Example text
|
| 290 |
```
|
| 291 |
+
We have a community picnic at Greenfield Park, it is on thursday at 11 AM. Write me an e-mail annoucment!
|
| 292 |
```
|
| 293 |
|
| 294 |
The sensitivity analysis feature evaluates the sensitivity of a given text, and the complexity feature rates its complexity.
|
| 295 |
```python
|
| 296 |
+
text = "We have a community picnic at Greenfield Park, it is on thursday at 11 AM. Write me an e-mail annoucment!"
|
| 297 |
|
| 298 |
# Generate sensitivity score
|
| 299 |
sensitivity_score = model_inference(text, mode="sensitivity")
|
| 300 |
+
print(f"Sensitivity Score: {sensitivity_score}")
|
| 301 |
|
| 302 |
# Generate complexity score
|
| 303 |
complexity_score = model_inference(text, mode="complexity")
|
| 304 |
+
print(f"Complexity: {complexity_score}")
|
| 305 |
```
|
| 306 |
|
| 307 |
Output
|
| 308 |
```
|
| 309 |
Sensitivity Score: 0
|
| 310 |
+
Complexity: 3
|
| 311 |
```
|
| 312 |
|
| 313 |
### 3. Anonymization and Re-Anonymization
|
|
|
|
| 325 |
# Restore the original text
|
| 326 |
anonymized_text, entity_mapping = model_inference(text, mode="anonymization", return_entities=True)
|
| 327 |
print(f"Entity Mapping:\n{entity_mapping}\n")
|
|
|
|
| 328 |
restored_text = model_inference(anonymized_text, mode="entity_swapping", entity_mapping=entity_mapping, reverse_mapping=True)
|
| 329 |
print(f"Restored Text: {restored_text}")
|
| 330 |
```
|
| 331 |
|
| 332 |
Output
|
| 333 |
```
|
| 334 |
+
Anonymized Text: We have a community picnic at Sunnyvale Park, it is on A Day of the Week at Morning. Write me an e-mail announcement!
|
| 335 |
|
| 336 |
+
Entity Mapping:
|
| 337 |
+
Greenfield Park : Sunnyvale Park
|
| 338 |
+
thursday : A Day of the Week
|
| 339 |
+
11 AM : Morning
|
| 340 |
|
| 341 |
+
Restored Text: We have a community picnic at Greenfield Park, it is on thursday at 11 AM. Write me an e-mail announcement!
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 342 |
```
|
| 343 |
|
| 344 |
Normally you would process the anonymized version with an LLM and then re-anonymize the result, i.e. swap the original entities back in.
|