Spaces:
Running
Running
Subhajit Chakraborty
committed on
Commit
·
99de885
1
Parent(s):
6445c44
update files(6)
Browse files
src/post_extraction_tools/data_quality_enhancer.py
CHANGED
|
@@ -54,6 +54,12 @@ key_industry_types = [
|
|
| 54 |
"Aviation & Airlines"
|
| 55 |
]
|
| 56 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
def enhancer(data: object, embedder) -> list:
|
| 58 |
"""
|
| 59 |
Enhances the data quality by removing duplicates
|
|
@@ -73,7 +79,7 @@ def enhancer(data: object, embedder) -> list:
|
|
| 73 |
c1 = companies[i]
|
| 74 |
# name1 = tokenize(c1.get("company_name", ""))
|
| 75 |
# ind1 = tokenize(c1.get("industry_type", ""))
|
| 76 |
-
c1_name_embedding = embedder.encode(
|
| 77 |
c1_ind_embedding = embedder.encode([c1.get("industry_type", "")])
|
| 78 |
# c1["ind_embedding"] = c1_ind_embedding
|
| 79 |
c1_country = c1.get("country", "").lower().strip()
|
|
@@ -83,7 +89,7 @@ def enhancer(data: object, embedder) -> list:
|
|
| 83 |
if j in duplicate_idx:
|
| 84 |
continue
|
| 85 |
c2 = companies[j]
|
| 86 |
-
c2_name_embedding = embedder.encode(
|
| 87 |
c2_ind_embedding = embedder.encode([c2.get("industry_type", "")])
|
| 88 |
# c2["ind_embedding"] = c2_ind_embedding
|
| 89 |
c2_country = c2.get("country", "").lower().strip()
|
|
|
|
| 54 |
"Aviation & Airlines"
|
| 55 |
]
|
| 56 |
|
| 57 |
+
def clean_company_name(name: str) -> list:
    """Normalize a company name so near-identical names compare as equal.

    The name is lower-cased, common legal-form and generic suffix words
    (for example "inc", "ltd", "gmbh", "holdings") are removed, every
    character other than a-z, 0-9 and whitespace is dropped, and runs of
    whitespace are collapsed to a single space.

    Args:
        name: Raw company name.

    Returns:
        A single-element list holding the cleaned name. The list wrapper
        lets the result be handed directly to an encoder that takes a
        batch of strings.
    """
    name = name.lower()
    # Strip legal-form / generic suffix words.
    # NOTE: the "s.a." alternative has an optional final dot. With a
    # mandatory trailing dot the closing \b could never match before
    # whitespace or at end of string (both sides are non-word characters),
    # so "acme s.a." used to keep its suffix and came out as "acme sa".
    # Any leftover dot is removed by the punctuation pass below.
    name = re.sub(
        r'\b(inc\.?|ltd\.?|llc\.?|co\.?|corp\.?|corporation|limited|pvt'
        r'|private|public|plc|gmbh|s\.a\.?|srl|bv|ag|oy|ab|spa|sas|sdn bhd'
        r'|holdings|group|company|enterprises|technologies)\b',
        '',
        name,
    )
    # Drop punctuation and any other non-alphanumeric, non-space character.
    name = re.sub(r'[^a-z0-9\s]', '', name)
    # Collapse the gaps left by the removals and trim the ends.
    return [re.sub(r'\s+', ' ', name).strip()]
|
| 62 |
+
|
| 63 |
def enhancer(data: object, embedder) -> list:
|
| 64 |
"""
|
| 65 |
Enhances the data quality by removing duplicates
|
|
|
|
| 79 |
c1 = companies[i]
|
| 80 |
# name1 = tokenize(c1.get("company_name", ""))
|
| 81 |
# ind1 = tokenize(c1.get("industry_type", ""))
|
| 82 |
+
c1_name_embedding = embedder.encode(clean_company_name(c1.get("company_name", "")))
|
| 83 |
c1_ind_embedding = embedder.encode([c1.get("industry_type", "")])
|
| 84 |
# c1["ind_embedding"] = c1_ind_embedding
|
| 85 |
c1_country = c1.get("country", "").lower().strip()
|
|
|
|
| 89 |
if j in duplicate_idx:
|
| 90 |
continue
|
| 91 |
c2 = companies[j]
|
| 92 |
+
c2_name_embedding = embedder.encode(clean_company_name(c2.get("company_name", "")))
|
| 93 |
c2_ind_embedding = embedder.encode([c2.get("industry_type", "")])
|
| 94 |
# c2["ind_embedding"] = c2_ind_embedding
|
| 95 |
c2_country = c2.get("country", "").lower().strip()
|