Subhajit Chakraborty committed on
Commit
99de885
·
1 Parent(s): 6445c44

update files(6)

Browse files
src/post_extraction_tools/data_quality_enhancer.py CHANGED
@@ -54,6 +54,12 @@ key_industry_types = [
54
  "Aviation & Airlines"
55
  ]
56
 
 
 
 
 
 
 
57
  def enhancer(data: object, embedder) -> list:
58
  """
59
  Enhances the data quality by removing duplicates
@@ -73,7 +79,7 @@ def enhancer(data: object, embedder) -> list:
73
  c1 = companies[i]
74
  # name1 = tokenize(c1.get("company_name", ""))
75
  # ind1 = tokenize(c1.get("industry_type", ""))
76
- c1_name_embedding = embedder.encode([c1.get("company_name", "")])
77
  c1_ind_embedding = embedder.encode([c1.get("industry_type", "")])
78
  # c1["ind_embedding"] = c1_ind_embedding
79
  c1_country = c1.get("country", "").lower().strip()
@@ -83,7 +89,7 @@ def enhancer(data: object, embedder) -> list:
83
  if j in duplicate_idx:
84
  continue
85
  c2 = companies[j]
86
- c2_name_embedding = embedder.encode([c2.get("company_name", "")])
87
  c2_ind_embedding = embedder.encode([c2.get("industry_type", "")])
88
  # c2["ind_embedding"] = c2_ind_embedding
89
  c2_country = c2.get("country", "").lower().strip()
 
54
  "Aviation & Airlines"
55
  ]
56
 
57
def clean_company_name(name: str) -> list:
    """
    Normalizes a company name before it is embedded for duplicate detection.

    The name is lowercased, common legal-entity suffixes and generic words
    (inc, ltd, llc, gmbh, s.a., group, ...) are removed, every character that
    is not a lowercase ASCII letter, digit or whitespace is dropped, and runs
    of whitespace are collapsed to a single space.

    Args:
        name: Raw company name. ``None`` or an empty string is treated as an
            empty name instead of raising.

    Returns:
        A single-element list containing the cleaned name, so the result can
        be handed directly to ``embedder.encode``, which is called with a
        list of strings.
    """
    # dict.get("company_name", "") still yields None when the key exists with
    # a null value, so guard before calling .lower().
    if not name:
        return [""]

    name = name.lower()

    # "s.a." ends in a non-word character, so the trailing \b of the main
    # pattern below never matches it as a stand-alone suffix ("acme s.a.").
    # Strip it explicitly; the look-behind keeps dotted acronyms such as
    # "u.s.a." intact.
    name = re.sub(r'(?<![\w.])s\.a\.(?!\w)', '', name)

    # Legal-entity suffixes and generic corporate words.
    name = re.sub(r'\b(inc\.?|ltd\.?|llc\.?|co\.?|corp\.?|corporation|limited|pvt|private|public|plc|gmbh|s\.a\.|srl|bv|ag|oy|ab|spa|sas|sdn bhd|holdings|group|company|enterprises|technologies)\b', '', name)

    # Drop punctuation and any other non-alphanumeric, non-space characters.
    name = re.sub(r'[^a-z0-9\s]', '', name)

    # Collapse the gaps left behind by the removals above.
    return [re.sub(r'\s+', ' ', name).strip()]
62
+
63
  def enhancer(data: object, embedder) -> list:
64
  """
65
  Enhances the data quality by removing duplicates
 
79
  c1 = companies[i]
80
  # name1 = tokenize(c1.get("company_name", ""))
81
  # ind1 = tokenize(c1.get("industry_type", ""))
82
+ c1_name_embedding = embedder.encode(clean_company_name(c1.get("company_name", "")))
83
  c1_ind_embedding = embedder.encode([c1.get("industry_type", "")])
84
  # c1["ind_embedding"] = c1_ind_embedding
85
  c1_country = c1.get("country", "").lower().strip()
 
89
  if j in duplicate_idx:
90
  continue
91
  c2 = companies[j]
92
+ c2_name_embedding = embedder.encode(clean_company_name(c2.get("company_name", "")))
93
  c2_ind_embedding = embedder.encode([c2.get("industry_type", "")])
94
  # c2["ind_embedding"] = c2_ind_embedding
95
  c2_country = c2.get("country", "").lower().strip()