Spaces:
Sleeping
Sleeping
langdonholmes
committed on
Commit
·
b0e291c
1
Parent(s):
6449ca4
inherited from names_database
Browse files- .gitignore +1 -3
- Pipfile +1 -0
- Pipfile.lock +53 -39
- data/ascii_fb_names_small.parquet +3 -0
- match_replace.py +26 -65
- names_database.py +28 -0
.gitignore
CHANGED
|
@@ -1,3 +1 @@
|
|
| 1 |
-
__pycache__
|
| 2 |
-
__pycache__/spacy_analyzer.cpython-310.pyc
|
| 3 |
-
__pycache__/spacy_recognizer.cpython-310.pyc
|
|
|
|
| 1 |
+
__pycache__/*
|
|
|
|
|
|
Pipfile
CHANGED
|
@@ -14,6 +14,7 @@ streamlit = "==1.17.0"
|
|
| 14 |
tokenizers = "==0.12.1"
|
| 15 |
torch = "==1.12.0"
|
| 16 |
en-student-name-detector = {file = "https://huggingface.co/langdonholmes/en_student_name_detector/resolve/main/en_student_name_detector-any-py3-none-any.whl"}
|
|
|
|
| 17 |
|
| 18 |
[dev-packages]
|
| 19 |
|
|
|
|
| 14 |
tokenizers = "==0.12.1"
|
| 15 |
torch = "==1.12.0"
|
| 16 |
en-student-name-detector = {file = "https://huggingface.co/langdonholmes/en_student_name_detector/resolve/main/en_student_name_detector-any-py3-none-any.whl"}
|
| 17 |
+
names-dataset = "*"
|
| 18 |
|
| 19 |
[dev-packages]
|
| 20 |
|
Pipfile.lock
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
{
|
| 2 |
"_meta": {
|
| 3 |
"hash": {
|
| 4 |
-
"sha256": "
|
| 5 |
},
|
| 6 |
"pipfile-spec": 6,
|
| 7 |
"requires": {
|
|
@@ -463,39 +463,46 @@
|
|
| 463 |
"markers": "python_version >= '3.6'",
|
| 464 |
"version": "==1.0.9"
|
| 465 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 466 |
"numpy": {
|
| 467 |
"hashes": [
|
| 468 |
-
"sha256:
|
| 469 |
-
"sha256:
|
| 470 |
-
"sha256:
|
| 471 |
-
"sha256:
|
| 472 |
-
"sha256:
|
| 473 |
-
"sha256:
|
| 474 |
-
"sha256:
|
| 475 |
-
"sha256:
|
| 476 |
-
"sha256:
|
| 477 |
-
"sha256:
|
| 478 |
-
"sha256:
|
| 479 |
-
"sha256:
|
| 480 |
-
"sha256:
|
| 481 |
-
"sha256:
|
| 482 |
-
"sha256:
|
| 483 |
-
"sha256:
|
| 484 |
-
"sha256:
|
| 485 |
-
"sha256:
|
| 486 |
-
"sha256:
|
| 487 |
-
"sha256:
|
| 488 |
-
"sha256:
|
| 489 |
-
"sha256:
|
| 490 |
-
"sha256:
|
| 491 |
-
"sha256:
|
| 492 |
-
"sha256:
|
| 493 |
-
"sha256:
|
| 494 |
-
"sha256:
|
| 495 |
-
"sha256:
|
| 496 |
],
|
| 497 |
"markers": "python_version >= '3.10'",
|
| 498 |
-
"version": "==1.24.
|
| 499 |
},
|
| 500 |
"packaging": {
|
| 501 |
"hashes": [
|
|
@@ -542,10 +549,10 @@
|
|
| 542 |
},
|
| 543 |
"phonenumbers": {
|
| 544 |
"hashes": [
|
| 545 |
-
"sha256:
|
| 546 |
-
"sha256:
|
| 547 |
],
|
| 548 |
-
"version": "==8.13.
|
| 549 |
},
|
| 550 |
"pillow": {
|
| 551 |
"hashes": [
|
|
@@ -737,6 +744,13 @@
|
|
| 737 |
"markers": "python_version >= '3.7'",
|
| 738 |
"version": "==11.0.0"
|
| 739 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 740 |
"pycryptodome": {
|
| 741 |
"hashes": [
|
| 742 |
"sha256:04779cc588ad8f13c80a060b0b1c9d1c203d051d8a43879117fe6b8aaf1cd3fa",
|
|
@@ -1070,11 +1084,11 @@
|
|
| 1070 |
},
|
| 1071 |
"setuptools": {
|
| 1072 |
"hashes": [
|
| 1073 |
-
"sha256:
|
| 1074 |
-
"sha256:
|
| 1075 |
],
|
| 1076 |
"markers": "python_version >= '3.7'",
|
| 1077 |
-
"version": "==67.
|
| 1078 |
},
|
| 1079 |
"six": {
|
| 1080 |
"hashes": [
|
|
@@ -1480,11 +1494,11 @@
|
|
| 1480 |
},
|
| 1481 |
"zipp": {
|
| 1482 |
"hashes": [
|
| 1483 |
-
"sha256:
|
| 1484 |
-
"sha256:
|
| 1485 |
],
|
| 1486 |
"markers": "python_version >= '3.7'",
|
| 1487 |
-
"version": "==3.
|
| 1488 |
}
|
| 1489 |
},
|
| 1490 |
"develop": {}
|
|
|
|
| 1 |
{
|
| 2 |
"_meta": {
|
| 3 |
"hash": {
|
| 4 |
+
"sha256": "6a4aa8c782c5b5fd8f5f0b3d7ba6cb6541f37295823bdee26d3fd575533c5999"
|
| 5 |
},
|
| 6 |
"pipfile-spec": 6,
|
| 7 |
"requires": {
|
|
|
|
| 463 |
"markers": "python_version >= '3.6'",
|
| 464 |
"version": "==1.0.9"
|
| 465 |
},
|
| 466 |
+
"names-dataset": {
|
| 467 |
+
"hashes": [
|
| 468 |
+
"sha256:69eea12c9d97e1ae32b6db955bb9b39f7816eb2727d3c6abc726cb475ad160ac"
|
| 469 |
+
],
|
| 470 |
+
"index": "pypi",
|
| 471 |
+
"version": "==3.1.0"
|
| 472 |
+
},
|
| 473 |
"numpy": {
|
| 474 |
"hashes": [
|
| 475 |
+
"sha256:003a9f530e880cb2cd177cba1af7220b9aa42def9c4afc2a2fc3ee6be7eb2b22",
|
| 476 |
+
"sha256:150947adbdfeceec4e5926d956a06865c1c690f2fd902efede4ca6fe2e657c3f",
|
| 477 |
+
"sha256:2620e8592136e073bd12ee4536149380695fbe9ebeae845b81237f986479ffc9",
|
| 478 |
+
"sha256:2eabd64ddb96a1239791da78fa5f4e1693ae2dadc82a76bc76a14cbb2b966e96",
|
| 479 |
+
"sha256:4173bde9fa2a005c2c6e2ea8ac1618e2ed2c1c6ec8a7657237854d42094123a0",
|
| 480 |
+
"sha256:4199e7cfc307a778f72d293372736223e39ec9ac096ff0a2e64853b866a8e18a",
|
| 481 |
+
"sha256:4cecaed30dc14123020f77b03601559fff3e6cd0c048f8b5289f4eeabb0eb281",
|
| 482 |
+
"sha256:557d42778a6869c2162deb40ad82612645e21d79e11c1dc62c6e82a2220ffb04",
|
| 483 |
+
"sha256:63e45511ee4d9d976637d11e6c9864eae50e12dc9598f531c035265991910468",
|
| 484 |
+
"sha256:6524630f71631be2dabe0c541e7675db82651eb998496bbe16bc4f77f0772253",
|
| 485 |
+
"sha256:76807b4063f0002c8532cfeac47a3068a69561e9c8715efdad3c642eb27c0756",
|
| 486 |
+
"sha256:7de8fdde0003f4294655aa5d5f0a89c26b9f22c0a58790c38fae1ed392d44a5a",
|
| 487 |
+
"sha256:889b2cc88b837d86eda1b17008ebeb679d82875022200c6e8e4ce6cf549b7acb",
|
| 488 |
+
"sha256:92011118955724465fb6853def593cf397b4a1367495e0b59a7e69d40c4eb71d",
|
| 489 |
+
"sha256:97cf27e51fa078078c649a51d7ade3c92d9e709ba2bfb97493007103c741f1d0",
|
| 490 |
+
"sha256:9a23f8440561a633204a67fb44617ce2a299beecf3295f0d13c495518908e910",
|
| 491 |
+
"sha256:a51725a815a6188c662fb66fb32077709a9ca38053f0274640293a14fdd22978",
|
| 492 |
+
"sha256:a77d3e1163a7770164404607b7ba3967fb49b24782a6ef85d9b5f54126cc39e5",
|
| 493 |
+
"sha256:adbdce121896fd3a17a77ab0b0b5eedf05a9834a18699db6829a64e1dfccca7f",
|
| 494 |
+
"sha256:c29e6bd0ec49a44d7690ecb623a8eac5ab8a923bce0bea6293953992edf3a76a",
|
| 495 |
+
"sha256:c72a6b2f4af1adfe193f7beb91ddf708ff867a3f977ef2ec53c0ffb8283ab9f5",
|
| 496 |
+
"sha256:d0a2db9d20117bf523dde15858398e7c0858aadca7c0f088ac0d6edd360e9ad2",
|
| 497 |
+
"sha256:e3ab5d32784e843fc0dd3ab6dcafc67ef806e6b6828dc6af2f689be0eb4d781d",
|
| 498 |
+
"sha256:e428c4fbfa085f947b536706a2fc349245d7baa8334f0c5723c56a10595f9b95",
|
| 499 |
+
"sha256:e8d2859428712785e8a8b7d2b3ef0a1d1565892367b32f915c4a4df44d0e64f5",
|
| 500 |
+
"sha256:eef70b4fc1e872ebddc38cddacc87c19a3709c0e3e5d20bf3954c147b1dd941d",
|
| 501 |
+
"sha256:f64bb98ac59b3ea3bf74b02f13836eb2e24e48e0ab0145bbda646295769bd780",
|
| 502 |
+
"sha256:f9006288bcf4895917d02583cf3411f98631275bc67cce355a7f39f8c14338fa"
|
| 503 |
],
|
| 504 |
"markers": "python_version >= '3.10'",
|
| 505 |
+
"version": "==1.24.2"
|
| 506 |
},
|
| 507 |
"packaging": {
|
| 508 |
"hashes": [
|
|
|
|
| 549 |
},
|
| 550 |
"phonenumbers": {
|
| 551 |
"hashes": [
|
| 552 |
+
"sha256:1531b42c8c49a1f06b08598441bf1f11fe2618f707c6fc96b581b44aa4f2b0e3",
|
| 553 |
+
"sha256:f8bd92975ba7463b7828ae2f95e1037b7e0ab8f023e9e8ffb7c560fd7f5d66d7"
|
| 554 |
],
|
| 555 |
+
"version": "==8.13.6"
|
| 556 |
},
|
| 557 |
"pillow": {
|
| 558 |
"hashes": [
|
|
|
|
| 744 |
"markers": "python_version >= '3.7'",
|
| 745 |
"version": "==11.0.0"
|
| 746 |
},
|
| 747 |
+
"pycountry": {
|
| 748 |
+
"hashes": [
|
| 749 |
+
"sha256:b2163a246c585894d808f18783e19137cb70a0c18fb36748dc01fc6f109c1646"
|
| 750 |
+
],
|
| 751 |
+
"markers": "python_version >= '3.6' and python_version < '4'",
|
| 752 |
+
"version": "==22.3.5"
|
| 753 |
+
},
|
| 754 |
"pycryptodome": {
|
| 755 |
"hashes": [
|
| 756 |
"sha256:04779cc588ad8f13c80a060b0b1c9d1c203d051d8a43879117fe6b8aaf1cd3fa",
|
|
|
|
| 1084 |
},
|
| 1085 |
"setuptools": {
|
| 1086 |
"hashes": [
|
| 1087 |
+
"sha256:16ccf598aab3b506593c17378473978908a2734d7336755a8769b480906bec1c",
|
| 1088 |
+
"sha256:b440ee5f7e607bb8c9de15259dba2583dd41a38879a7abc1d43a71c59524da48"
|
| 1089 |
],
|
| 1090 |
"markers": "python_version >= '3.7'",
|
| 1091 |
+
"version": "==67.2.0"
|
| 1092 |
},
|
| 1093 |
"six": {
|
| 1094 |
"hashes": [
|
|
|
|
| 1494 |
},
|
| 1495 |
"zipp": {
|
| 1496 |
"hashes": [
|
| 1497 |
+
"sha256:23f70e964bc11a34cef175bc90ba2914e1e4545ea1e3e2f67c079671883f9cb6",
|
| 1498 |
+
"sha256:e8b2a36ea17df80ffe9e2c4fda3f693c3dad6df1697d3cd3af232db680950b0b"
|
| 1499 |
],
|
| 1500 |
"markers": "python_version >= '3.7'",
|
| 1501 |
+
"version": "==3.13.0"
|
| 1502 |
}
|
| 1503 |
},
|
| 1504 |
"develop": {}
|
data/ascii_fb_names_small.parquet
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:baf5cf2fa43dc172c613f72793641f668e33c30b4e23932616de36cc0ce3447d
|
| 3 |
+
size 33601747
|
match_replace.py
CHANGED
|
@@ -1,69 +1,31 @@
|
|
| 1 |
import pandas as pd
|
| 2 |
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
while(surrogate_name == original_name):
|
| 30 |
-
# situation when gender can be matched
|
| 31 |
-
if not gender:
|
| 32 |
-
gender_df = fb_df[fb_df["gender"] == gender]
|
| 33 |
-
gender_c_df = gender_df[gender_df["country"] == country_code]
|
| 34 |
-
# situations: whether country code can be matched
|
| 35 |
-
if gender_c_df.shape[0] > 0:
|
| 36 |
-
surrogate_name = gender_c_df[f_l].sample(n=1).to_string()
|
| 37 |
-
# if gender match, country not match: randomly return from gender df
|
| 38 |
-
else:
|
| 39 |
-
surrogate_name = gender_df[f_l].sample(n=1).to_string()
|
| 40 |
-
else:
|
| 41 |
-
# situation when gender cannot be match: gender is None
|
| 42 |
-
country_df = fb_df[fb_df["country"] == country_code]
|
| 43 |
-
# situation when country can be matched
|
| 44 |
-
if country_df.shape[0] > 0:
|
| 45 |
-
surrogate_name = country_df[f_l].sample(n=1).to_string()
|
| 46 |
-
# situation when neither gender nor country can be matched
|
| 47 |
-
# randomly return one name from the whole dataset
|
| 48 |
-
else:
|
| 49 |
-
surrogate_name = fb_df[f_l].sample(n=1).to_string()
|
| 50 |
-
|
| 51 |
-
return surrogate_name
|
| 52 |
-
|
| 53 |
-
def match_entity(original_info, entity):
|
| 54 |
-
# TODO: need refinement for each kind of entity
|
| 55 |
-
if entity == 'STUDENT':
|
| 56 |
-
# TODO: here, change between 1 and 2
|
| 57 |
-
return match_name_2(original_info)
|
| 58 |
-
elif entity == 'EMAIL_ADDRESS':
|
| 59 |
-
return 'JaneDoe@mail.com'
|
| 60 |
-
elif entity == 'PHONE_NUMBER':
|
| 61 |
-
#TODO: specific form of number will be returned for consistency
|
| 62 |
-
return '000-000-0000'
|
| 63 |
-
elif entity == 'URL':
|
| 64 |
-
return 'google.com'
|
| 65 |
-
else:
|
| 66 |
-
pass
|
| 67 |
|
| 68 |
def match_name(original_name):
|
| 69 |
# FIXME: take too LONG time to run (large df used multi-times), how to improve
|
|
@@ -74,7 +36,6 @@ def match_name(original_name):
|
|
| 74 |
# FIXME: since it is completely random, the same original name may be diff after replacing. How to know whether the two names is the same person?
|
| 75 |
first_name = original_name.split()[0]
|
| 76 |
global fb_df
|
| 77 |
-
fb_df = pd.read_parquet('ascii_fb_names_small.parquet')
|
| 78 |
names = fb_df[fb_df['first']==first_name]
|
| 79 |
if not names.empty:
|
| 80 |
name_df = names.sample(n=1)
|
|
|
|
| 1 |
import pandas as pd
|
| 2 |
|
| 3 |
+
from names_database import NameDatabase
|
| 4 |
+
|
| 5 |
+
# Instantiate the database once at import time; the lookups below call
# instance methods, so binding the bare class here would not work.
names_db = NameDatabase()


def describe_name(first_names, last_names):
    '''Look up the gender and country associated with a split name.

    first_names: given-name string, or None/empty when unavailable.
    last_names: family-name string, or None/empty when unavailable.

    Returns a (gender, country) tuple. Each element is None when the
    corresponding name part is missing; otherwise it is whatever
    names_db.get_gender / names_db.get_country return for that part.
    '''
    # Pass the name being described; the lookups need it to search the dataset.
    gender = names_db.get_gender(first_names) if first_names else None
    country = names_db.get_country(last_names) if last_names else None
    return gender, country
|
| 11 |
+
|
| 12 |
+
def split_name(all_names):
    '''Split a whitespace-separated name into (first, last) parts.

    One token: treated as a first name; last is None.
    Two tokens: first name and last name.
    Three tokens: one first name and two last names.
    Four tokens: two first names and two last names.
    Any other token count (zero, or more than four): (None, None).
    '''
    tokens = all_names.split()
    token_count = len(tokens)
    if token_count == 0 or token_count > 4:
        return None, None

    # Only the four-token form carries two given names.
    first_len = 2 if token_count == 4 else 1
    first = ' '.join(tokens[:first_len])
    last = ' '.join(tokens[first_len:]) if token_count > 1 else None
    return first, last
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
|
| 30 |
def match_name(original_name):
|
| 31 |
# FIXME: take too LONG time to run (large df used multi-times), how to improve
|
|
|
|
| 36 |
# FIXME: since it is completely random, the same original name may be diff after replacing. How to know whether the two names is the same person?
|
| 37 |
first_name = original_name.split()[0]
|
| 38 |
global fb_df
|
|
|
|
| 39 |
names = fb_df[fb_df['first']==first_name]
|
| 40 |
if not names.empty:
|
| 41 |
name_df = names.sample(n=1)
|
names_database.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Optional

import pandas as pd
from names_dataset import NameDataset, NameWrapper
|
| 3 |
+
|
| 4 |
+
class NameDatabase(NameDataset):
    '''NameDataset extended with a table of names to draw random names from.'''

    def __init__(
        self,
        names_path: str = 'data/ascii_fb_names_small.parquet'
    ) -> None:
        '''Load the base dataset and the parquet table of sampleable names.

        names_path: location of the parquet file. The default points at
            the copy tracked under data/ and is resolved relative to the
            current working directory.
        '''
        super().__init__()
        self.names = pd.read_parquet(names_path)

    def get_random_name(
        self,
        country: Optional[str] = None,
        gender: Optional[str] = None
    ):
        '''Sample one row from the names table, optionally filtered.

        country: ISO country code in 'alpha 2' format
        gender: "M" or "F"

        Returns the one-row DataFrame produced by DataFrame.sample.
        NOTE(review): if the filters leave no rows, sample() raises
        ValueError; callers currently have to handle that themselves.
        '''
        names_view = self.names
        if country:
            names_view = names_view[names_view['country'] == country]
        if gender:
            names_view = names_view[names_view['gender'] == gender]
        # Weight by the 'count' column. Attribute access (names_view.count)
        # resolves to the DataFrame.count method, not the column.
        # Assumes the table has a 'count' column - TODO confirm.
        return names_view.sample(n=1, weights='count')

    def get_gender(self, first_names: str):
        '''Return the gender NameWrapper reports for a first-name search.'''
        return NameWrapper(self.search(first_names)).gender

    def get_country(self, last_names: str):
        '''Return the country NameWrapper reports for a last-name search.'''
        return NameWrapper(self.search(last_names)).country
|