Subhajit Chakraborty committed on
Commit
eb93708
·
1 Parent(s): dcc5dd1

modify(1)

Browse files
.gitignore CHANGED
@@ -1,4 +1,5 @@
1
  .env
 
2
  __pycache__/
3
  .pytest_cache/
4
  *.pyc
 
1
  .env
2
+ .venv/
3
  __pycache__/
4
  .pytest_cache/
5
  *.pyc
src/app.py CHANGED
@@ -2,13 +2,14 @@ import streamlit as st
2
  import os
3
  from services.queryService import QService
4
  from services.llm_client import LLMClient
 
5
  from sentence_transformers import SentenceTransformer
6
  from post_extraction_tools import (
7
  website_adder,
8
  clean_json,
9
  lead_scoring,
10
  data_quality_enhancer,
11
- chart_data
12
  )
13
  from services.add_leads import add_leads_f
14
  import json
@@ -25,13 +26,16 @@ def load_model():
25
  os.environ["SENTENCE_TRANSFORMERS_HOME"] = "/tmp/huggingface/sentence_transformers"
26
 
27
  os.makedirs("/tmp/huggingface", exist_ok=True)
28
- return SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", cache_folder="/tmp/huggingface")
 
 
29
 
30
 
31
  embedder = load_model()
32
  lead_scorer = lead_scoring.LeadScoring(llm, embedder)
33
 
34
  st.set_page_config(page_title="Caprae Capital Lead Generation Tool", layout="wide")
 
35
  # st.title("Lead Management Dashboard")
36
  # This is the navigation section
37
  if "page" not in st.session_state:
@@ -62,12 +66,13 @@ with st.sidebar:
62
  st.session_state.page = page_name
63
 
64
  if st.session_state.page == "Dashboard":
65
- main_lead_info_file = "data/all_cleaned_companies.json"
66
  st.header("Welcome!!")
67
  st.text("Here you will find all about your leads.")
68
  if st.session_state.data_enhancement == True:
69
- if os.path.isdir(main_lead_info_file):
70
- fig_ind, fig_coun, fig_btype, fig_rev = chart_data.create_chart("data/all_cleaned_companies.json")
 
 
71
  col1, col2, col3 = st.columns(3)
72
 
73
  with col1:
@@ -81,30 +86,36 @@ if st.session_state.page == "Dashboard":
81
  with col3:
82
  st.subheader("Business type-wise Distribution")
83
  st.plotly_chart(fig_btype, use_container_width=True)
84
-
85
  st.subheader("Revenue-based Distribution")
86
  st.plotly_chart(fig_rev, use_container_width=True)
87
  else:
88
- st.subheader("Do Data Enhancement first in order to view the diagramatic details!!")
89
- if os.path.isdir(main_lead_info_file):
90
- df_display = chart_data.df_creator_from_json_and_process(main_lead_info_file).sort_values(by="score", ascending=False).rename(columns={
91
- "company_name": "Company Name",
92
- "key_industry": "Industry Type",
93
- "industry_type": "Speciality",
94
- "street": "Street",
95
- "city": "City",
96
- "state": "State",
97
- "country": "Country",
98
- "phone": "Phone",
99
- "email": "Email",
100
- "company_size": "Number of Employees",
101
- "approx_revenue": "Revenue",
102
- "business_type": "Business Type",
103
- "website_url": "Website",
104
- "country": "Country"
105
- })
 
 
 
 
 
106
  st.subheader("All Company Details")
107
  st.dataframe(df_display)
 
108
  else:
109
  # st.subheader("All Company Details")
110
  st.write("_There are no leads yet.Go to Data Enrichment to create leads!!_")
@@ -151,9 +162,15 @@ if st.session_state.page == "Enrich Companies":
151
  key="manual_entry_b",
152
  )
153
  if manual_entry_button:
154
- if not (st.session_state.data_enhancement and st.session_state.intelliscore):
155
- st.warning("Complete the Data Enhancement and Intelliscore Lead Scoring first!!")
 
 
 
 
 
156
  pass
 
157
  else:
158
  lead_data = {
159
  "company_name": lead_name,
@@ -174,9 +191,9 @@ if st.session_state.page == "Enrich Companies":
174
  cleaned_data = clean_json.clean_json_f(lead_data)
175
  cleaned_data_obj = json.loads(cleaned_data)
176
  cleaned_data_obj = add_leads_f(
177
- "data/all_cleaned_companies.json", cleaned_data_obj
178
  )
179
- with open("data/all_cleaned_companies.json", "w") as f:
180
  json.dump(cleaned_data_obj, f, indent=2)
181
 
182
  print("Cleaned JSON saved to all_cleaned_companies.json")
@@ -192,7 +209,6 @@ if st.session_state.page == "Enrich Companies":
192
  st.session_state.data_enhancement = False
193
  st.session_state.intelliscore = False
194
  st.session_state.lead_conditions = False
195
-
196
 
197
  with tab2:
198
  st.subheader("Advanced Intelligent Scrapper and Data Completion")
@@ -258,9 +274,9 @@ if st.session_state.page == "Enrich Companies":
258
  cleaned_data = clean_json.clean_json_f(data)
259
  cleaned_data_obj = json.loads(cleaned_data)
260
  cleaned_data_obj = add_leads_f(
261
- "data/all_cleaned_companies.json", cleaned_data_obj
262
  )
263
- with open("data/all_cleaned_companies.json", "w") as f:
264
  json.dump(cleaned_data_obj, f, indent=2)
265
 
266
  print("Cleaned JSON saved to all_cleaned_companies.json")
@@ -278,7 +294,7 @@ if st.session_state.page == "Enrich Companies":
278
  final_data, embedder
279
  )[0]
280
 
281
- with open("data/all_cleaned_companies.json", "w") as f:
282
  json.dump(enhanced_data, f, indent=2)
283
 
284
  print(
@@ -291,7 +307,7 @@ if st.session_state.page == "Enrich Companies":
291
  res = lead_scorer.scrape_and_augment(
292
  own_comp_info, own_comp_web_url
293
  )
294
- with open("data/lead_conditions.json", "w") as f:
295
  json.dump(res, f, indent=2)
296
 
297
  scored_leads = lead_scorer.score(enhanced_data, res)
@@ -308,39 +324,47 @@ if st.session_state.page == "Enhance Data Quality":
308
  st.text("This tool uses embedding model to ensure clean and reliable data quality.")
309
  with st.container(border=True):
310
  st.subheader("Your Current Data")
311
- with open("data/all_cleaned_companies.json", "r") as f:
312
- temp_data = json.load(f)
313
- temp_df = pd.DataFrame(temp_data.get("companies", []))
314
- st.dataframe(temp_df)
315
- col13, col14, col15 = st.columns([1, 1, 1])
316
- with col14:
317
- enhance_data_b = st.button(
318
- "Enhance Data", type="primary", use_container_width=True
319
- )
320
- if enhance_data_b and st.session_state.data_enhancement == False and st.session_state.pipeline_executed == False:
321
- with st.spinner("Enhancing the data..."):
322
- enhancer_output = data_quality_enhancer.enhancer(
323
- temp_data, embedder
324
- )
325
- enhanced_data, duplicate_comps = (
326
- enhancer_output[0],
327
- enhancer_output[1]["duplicate_company_names"],
328
- )
329
- st.success("Enhancement Completed!!")
 
 
 
 
 
 
330
 
331
- with open("data/all_cleaned_companies.json", "w") as f:
332
- json.dump(enhanced_data, f, indent=2)
333
- if duplicate_comps == []:
334
- st.text("No Duplicate Entries Found!!")
335
- else:
336
- st.text(f"Removed {len(duplicate_comps)} duplicate companies!!")
337
- st.text("Removed Companies: ")
338
- for c in duplicate_comps:
339
- st.text(c)
340
- st.session_state.data_enhancement = True
341
 
342
- elif enhance_data_b and st.session_state.data_enhancement == True:
343
- st.text("Already Enhanced!!")
 
 
344
 
345
  if st.session_state.page == "IntelliSCORE":
346
  st.subheader("Advanced Lead Scoring Tool")
@@ -374,7 +398,7 @@ if st.session_state.page == "IntelliSCORE":
374
  )
375
  if intelliscore_b:
376
  if st.session_state.data_enhancement == True:
377
- with open("data/all_cleaned_companies.json", "r") as f:
378
  leads = json.load(f)
379
 
380
  if ask_scrap_per == "yes" or ask_scrap_per == None:
@@ -382,13 +406,13 @@ if st.session_state.page == "IntelliSCORE":
382
  res = lead_scorer.scrape_and_augment(
383
  additional_info, comp_url
384
  )
385
- with open("data/lead_conditions.json", "w") as f:
386
  json.dump(res, f, indent=2)
387
  st.success("Scrapping Completed!")
388
  if res and "error" not in res:
389
  st.session_state.lead_conditions = True
390
 
391
- with open("data/lead_conditions.json", "r") as f:
392
  lead_cond = json.load(f)
393
  with st.spinner("Scoring the leads..."):
394
  scored_leads = lead_scorer.score(leads, lead_cond)
@@ -399,7 +423,7 @@ if st.session_state.page == "IntelliSCORE":
399
 
400
  else:
401
  st.text("Skipping url scrapping...")
402
- with open("data/lead_conditions.json", "r") as f:
403
  lead_cond = json.load(f)
404
  with st.spinner("Scoring the leads..."):
405
  scored_leads = lead_scorer.score(leads, lead_cond)
@@ -409,4 +433,4 @@ if st.session_state.page == "IntelliSCORE":
409
  st.session_state.intelliscore = True
410
 
411
  else:
412
- st.warning("Complete the Data Enhancement first!!")
 
2
  import os
3
  from services.queryService import QService
4
  from services.llm_client import LLMClient
5
+ from services.get_file_status import check_lead_existance
6
  from sentence_transformers import SentenceTransformer
7
  from post_extraction_tools import (
8
  website_adder,
9
  clean_json,
10
  lead_scoring,
11
  data_quality_enhancer,
12
+ chart_data,
13
  )
14
  from services.add_leads import add_leads_f
15
  import json
 
26
  os.environ["SENTENCE_TRANSFORMERS_HOME"] = "/tmp/huggingface/sentence_transformers"
27
 
28
  os.makedirs("/tmp/huggingface", exist_ok=True)
29
+ return SentenceTransformer(
30
+ "sentence-transformers/all-MiniLM-L6-v2", cache_folder="/tmp/huggingface"
31
+ )
32
 
33
 
34
  embedder = load_model()
35
  lead_scorer = lead_scoring.LeadScoring(llm, embedder)
36
 
37
  st.set_page_config(page_title="Caprae Capital Lead Generation Tool", layout="wide")
38
+ main_lead_info_file = "src/data/all_cleaned_companies.json"
39
  # st.title("Lead Management Dashboard")
40
  # This is the navigation section
41
  if "page" not in st.session_state:
 
66
  st.session_state.page = page_name
67
 
68
  if st.session_state.page == "Dashboard":
 
69
  st.header("Welcome!!")
70
  st.text("Here you will find all about your leads.")
71
  if st.session_state.data_enhancement == True:
72
+ if check_lead_existance(main_lead_info_file):
73
+ fig_ind, fig_coun, fig_btype, fig_rev = chart_data.create_chart(
74
+ main_lead_info_file
75
+ )
76
  col1, col2, col3 = st.columns(3)
77
 
78
  with col1:
 
86
  with col3:
87
  st.subheader("Business type-wise Distribution")
88
  st.plotly_chart(fig_btype, use_container_width=True)
89
+
90
  st.subheader("Revenue-based Distribution")
91
  st.plotly_chart(fig_rev, use_container_width=True)
92
  else:
93
+ st.subheader(
94
+ "Do Data Enhancement first in order to view the diagramatic details!!"
95
+ )
96
+
97
+ if check_lead_existance(main_lead_info_file):
98
+ df_display = chart_data.df_creator_from_json_and_process(main_lead_info_file).sort_values(by="score", ascending=False).rename(
99
+ columns={
100
+ "company_name": "Company Name",
101
+ "key_industry": "Industry Type",
102
+ "industry_type": "Speciality",
103
+ "street": "Street",
104
+ "city": "City",
105
+ "state": "State",
106
+ "country": "Country",
107
+ "phone": "Phone",
108
+ "email": "Email",
109
+ "company_size": "Number of Employees",
110
+ "approx_revenue": "Revenue",
111
+ "business_type": "Business Type",
112
+ "website_url": "Website",
113
+ "country": "Country",
114
+ }
115
+ )
116
  st.subheader("All Company Details")
117
  st.dataframe(df_display)
118
+
119
  else:
120
  # st.subheader("All Company Details")
121
  st.write("_There are no leads yet.Go to Data Enrichment to create leads!!_")
 
162
  key="manual_entry_b",
163
  )
164
  if manual_entry_button:
165
+ if (check_lead_existance(main_lead_info_file)) and not (
166
+ st.session_state.data_enhancement
167
+ and st.session_state.intelliscore
168
+ ):
169
+ st.warning(
170
+ "Complete the Data Enhancement and Intelliscore Lead Scoring first!!"
171
+ )
172
  pass
173
+
174
  else:
175
  lead_data = {
176
  "company_name": lead_name,
 
191
  cleaned_data = clean_json.clean_json_f(lead_data)
192
  cleaned_data_obj = json.loads(cleaned_data)
193
  cleaned_data_obj = add_leads_f(
194
+ main_lead_info_file, cleaned_data_obj
195
  )
196
+ with open(main_lead_info_file, "w") as f:
197
  json.dump(cleaned_data_obj, f, indent=2)
198
 
199
  print("Cleaned JSON saved to all_cleaned_companies.json")
 
209
  st.session_state.data_enhancement = False
210
  st.session_state.intelliscore = False
211
  st.session_state.lead_conditions = False
 
212
 
213
  with tab2:
214
  st.subheader("Advanced Intelligent Scrapper and Data Completion")
 
274
  cleaned_data = clean_json.clean_json_f(data)
275
  cleaned_data_obj = json.loads(cleaned_data)
276
  cleaned_data_obj = add_leads_f(
277
+ main_lead_info_file, cleaned_data_obj
278
  )
279
+ with open(main_lead_info_file, "w") as f:
280
  json.dump(cleaned_data_obj, f, indent=2)
281
 
282
  print("Cleaned JSON saved to all_cleaned_companies.json")
 
294
  final_data, embedder
295
  )[0]
296
 
297
+ with open(main_lead_info_file, "w") as f:
298
  json.dump(enhanced_data, f, indent=2)
299
 
300
  print(
 
307
  res = lead_scorer.scrape_and_augment(
308
  own_comp_info, own_comp_web_url
309
  )
310
+ with open("src/data/lead_conditions.json", "w") as f:
311
  json.dump(res, f, indent=2)
312
 
313
  scored_leads = lead_scorer.score(enhanced_data, res)
 
324
  st.text("This tool uses embedding model to ensure clean and reliable data quality.")
325
  with st.container(border=True):
326
  st.subheader("Your Current Data")
327
+ if check_lead_existance(main_lead_info_file):
328
+ with open(main_lead_info_file, "r") as f:
329
+ temp_data = json.load(f)
330
+
331
+ temp_df = pd.DataFrame(temp_data.get("companies", []))
332
+ st.dataframe(temp_df)
333
+ col13, col14, col15 = st.columns([1, 1, 1])
334
+ with col14:
335
+ enhance_data_b = st.button(
336
+ "Enhance Data", type="primary", use_container_width=True
337
+ )
338
+ if (
339
+ enhance_data_b
340
+ and st.session_state.data_enhancement == False
341
+ and st.session_state.pipeline_executed == False
342
+ ):
343
+ with st.spinner("Enhancing the data..."):
344
+ enhancer_output = data_quality_enhancer.enhancer(
345
+ temp_data, embedder
346
+ )
347
+ enhanced_data, duplicate_comps = (
348
+ enhancer_output[0],
349
+ enhancer_output[1]["duplicate_company_names"],
350
+ )
351
+ st.success("Enhancement Completed!!")
352
 
353
+ with open(main_lead_info_file, "w") as f:
354
+ json.dump(enhanced_data, f, indent=2)
355
+ if duplicate_comps == []:
356
+ st.text("No Duplicate Entries Found!!")
357
+ else:
358
+ st.text(f"Removed {len(duplicate_comps)} duplicate companies!!")
359
+ st.text("Removed Companies: ")
360
+ for c in duplicate_comps:
361
+ st.text(c)
362
+ st.session_state.data_enhancement = True
363
 
364
+ elif enhance_data_b and st.session_state.data_enhancement == True:
365
+ st.text("Already Enhanced!!")
366
+ else:
367
+ st.warning("No Leads Found! Go to Enrichment tool to add leads.")
368
 
369
  if st.session_state.page == "IntelliSCORE":
370
  st.subheader("Advanced Lead Scoring Tool")
 
398
  )
399
  if intelliscore_b:
400
  if st.session_state.data_enhancement == True:
401
+ with open(main_lead_info_file, "r") as f:
402
  leads = json.load(f)
403
 
404
  if ask_scrap_per == "yes" or ask_scrap_per == None:
 
406
  res = lead_scorer.scrape_and_augment(
407
  additional_info, comp_url
408
  )
409
+ with open("src/data/lead_conditions.json", "w") as f:
410
  json.dump(res, f, indent=2)
411
  st.success("Scrapping Completed!")
412
  if res and "error" not in res:
413
  st.session_state.lead_conditions = True
414
 
415
+ with open("src/data/lead_conditions.json", "r") as f:
416
  lead_cond = json.load(f)
417
  with st.spinner("Scoring the leads..."):
418
  scored_leads = lead_scorer.score(leads, lead_cond)
 
423
 
424
  else:
425
  st.text("Skipping url scrapping...")
426
+ with open("src/data/lead_conditions.json", "r") as f:
427
  lead_cond = json.load(f)
428
  with st.spinner("Scoring the leads..."):
429
  scored_leads = lead_scorer.score(leads, lead_cond)
 
433
  st.session_state.intelliscore = True
434
 
435
  else:
436
+ st.warning("Complete the Data Enhancement first!!")
src/data/all_cleaned_companies.json DELETED
@@ -1,191 +0,0 @@
1
- {
2
- "companies": [
3
- {
4
- "company_name": "Sprout Social",
5
- "industry_type": "Social Media Management Software",
6
- "location": "Chicago, Illinois, USA",
7
- "company_size": "1001",
8
- "street": "131 S Dearborn St",
9
- "city": "Chicago",
10
- "state": "IL",
11
- "country": "USA",
12
- "phone": "(312) 593-3600",
13
- "email": null,
14
- "approx_revenue": "$340.2 million",
15
- "business_type": "B2B",
16
- "website_url": "https://sproutsocial.com/",
17
- "score": 2.16,
18
- "key_industry": "Software & SaaS"
19
- },
20
- {
21
- "company_name": "Oracle",
22
- "industry_type": "Software Development, Cloud Computing, Database Technology",
23
- "location": "Bangalore, India",
24
- "company_size": "170000",
25
- "street": "Bagmane Tech Park, Outer Ring Road, Doddanekundi Village, Mahadevapura",
26
- "city": "Bangalore",
27
- "state": "Karnataka",
28
- "country": "India",
29
- "phone": "91-80-41070000",
30
- "email": null,
31
- "approx_revenue": "$53 billion",
32
- "business_type": "B2B",
33
- "website_url": "https://www.oracle.com/",
34
- "key_industry": "Cloud Computing & DevOps",
35
- "score": 2.4699999999999998
36
- },
37
- {
38
- "company_name": "Microsoft",
39
- "industry_type": "Software Development, Cloud Computing, Operating Systems",
40
- "location": "Bangalore, India",
41
- "company_size": "221000",
42
- "street": "Microsoft Signature Building, Embassy Golf Links Business Park, Off Intermediate Ring Road, Domlur",
43
- "city": "Bangalore",
44
- "state": "Karnataka",
45
- "country": "India",
46
- "phone": "080 4010 3000",
47
- "email": null,
48
- "approx_revenue": "$211.9 billion",
49
- "business_type": "B2B",
50
- "website_url": "https://www.microsoft.com/en-in",
51
- "key_industry": "Cloud Computing & DevOps",
52
- "score": 3.54
53
- },
54
- {
55
- "company_name": "Accenture",
56
- "industry_type": "IT Services, Consulting, Software Development",
57
- "location": "Bangalore, India",
58
- "company_size": "733000",
59
- "street": "1, Old Madras Road, Bagmane Constellation Business Park, Doddanekundi, Marathahalli",
60
- "city": "Bangalore",
61
- "state": "Karnataka",
62
- "country": "India",
63
- "phone": "91-80-62150000",
64
- "email": null,
65
- "approx_revenue": "$64.1 billion",
66
- "business_type": "B2B",
67
- "website_url": "https://www.accenture.com/in-en",
68
- "key_industry": "Consulting & Business Services",
69
- "score": 2.52
70
- },
71
- {
72
- "company_name": "Infosys",
73
- "industry_type": "IT Services, Business Consulting, Software Development, Outsourcing",
74
- "location": "Bangalore, India",
75
- "company_size": "317000",
76
- "street": "Plot No. 44 and 97A, Electronics City, Hosur Road",
77
- "city": "Bangalore",
78
- "state": "Karnataka",
79
- "country": "India",
80
- "phone": "080 2852 0261",
81
- "email": null,
82
- "approx_revenue": "$18.2 billion",
83
- "business_type": "B2B",
84
- "website_url": "https://www.infosys.com/",
85
- "key_industry": "Consulting & Business Services",
86
- "score": 2.52
87
- },
88
- {
89
- "company_name": "Wipro",
90
- "industry_type": "IT Services, Consulting, Business Process Services, Software Development",
91
- "location": "Bangalore, India",
92
- "company_size": "240000",
93
- "street": "Doddakannelli, Sarjapur Road",
94
- "city": "Bangalore",
95
- "state": "Karnataka",
96
- "country": "India",
97
- "phone": "080 2844 0011",
98
- "email": null,
99
- "approx_revenue": "$10.8 billion",
100
- "business_type": "B2B",
101
- "website_url": "https://www.wipro.com/",
102
- "key_industry": "Consulting & Business Services",
103
- "score": 2.49
104
- },
105
- {
106
- "company_name": "The Kraft Heinz Company",
107
- "industry_type": "Food and Beverage",
108
- "location": "Chicago, Illinois, USA",
109
- "company_size": "36000",
110
- "street": "Aon Center, 200 E Randolph St",
111
- "city": "Chicago",
112
- "state": "IL",
113
- "country": "USA",
114
- "phone": null,
115
- "email": null,
116
- "approx_revenue": "$26 billion",
117
- "business_type": "Both",
118
- "website_url": "https://www.kraftheinzcompany.com/",
119
- "key_industry": "Food & Beverages",
120
- "score": 2.2
121
- },
122
- {
123
- "company_name": "Mondel\u00c4\u201cz International",
124
- "industry_type": "Food and Beverage",
125
- "location": "Chicago, Illinois, USA",
126
- "company_size": "91000",
127
- "street": "905 W Fulton Market",
128
- "city": "Chicago",
129
- "state": "IL",
130
- "country": "USA",
131
- "phone": "+1 847 943 4000",
132
- "email": null,
133
- "approx_revenue": "$36.01 billion",
134
- "business_type": "Both",
135
- "website_url": "https://www.mondelezinternational.com/",
136
- "key_industry": "Food & Beverages",
137
- "score": 2.2
138
- },
139
- {
140
- "company_name": "US Foods",
141
- "industry_type": "Foodservice Distributor",
142
- "location": "Rosemont, Illinois, USA",
143
- "company_size": "29000",
144
- "street": "9399 W. Higgins Road, Suite 100",
145
- "city": "Rosemont",
146
- "state": "IL",
147
- "country": "USA",
148
- "phone": "847-720-8000",
149
- "email": null,
150
- "approx_revenue": "$35 billion",
151
- "business_type": "B2B",
152
- "website_url": "https://www.usfoods.com/",
153
- "key_industry": "Logistics, Supply Chain & Warehousing",
154
- "score": 3.1799999999999997
155
- },
156
- {
157
- "company_name": "The Quaker Oats Company",
158
- "industry_type": "Food and Beverage",
159
- "location": "Chicago, Illinois, USA",
160
- "company_size": "2500",
161
- "street": "433 W Van Buren St",
162
- "city": "Chicago",
163
- "state": "IL",
164
- "country": "USA",
165
- "phone": "+1 312 821 1000",
166
- "email": null,
167
- "approx_revenue": "$2.8 billion",
168
- "business_type": "Both",
169
- "website_url": "https://www.quakeroats.com/",
170
- "key_industry": "Food & Beverages",
171
- "score": 2.2
172
- },
173
- {
174
- "company_name": "Conagra Brands, Inc.",
175
- "industry_type": "Food and Beverage",
176
- "location": "Chicago, Illinois, USA",
177
- "company_size": "18000",
178
- "street": "222 W Merchandise Mart Plaza, 13th Fl",
179
- "city": "Chicago",
180
- "state": "IL",
181
- "country": "USA",
182
- "phone": "+1 312 549 5000",
183
- "email": null,
184
- "approx_revenue": "$12.28 billion",
185
- "business_type": "Both",
186
- "website_url": "https://www.conagrabrands.com/",
187
- "key_industry": "Food & Beverages",
188
- "score": 2.2
189
- }
190
- ]
191
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/post_extraction_tools/chart_data.py CHANGED
@@ -21,7 +21,9 @@ def parse_revenue(rev_str):
21
  def df_creator_from_json_and_process(filepath: str):
22
  with open(filepath, "r") as f:
23
  data = json.load(f)["companies"]
24
-
 
 
25
  return pd.DataFrame(data)
26
 
27
  def create_chart(filepath: str):
 
21
def df_creator_from_json_and_process(filepath: str):
    """Load the ``"companies"`` list from a JSON lead file as a DataFrame.

    Records lacking a ``"score"`` key get an explicit ``None`` placeholder,
    so the resulting frame always carries a ``score`` column and downstream
    ``sort_values(by="score")`` calls cannot fail on a missing column.

    :param filepath: path to a JSON file shaped like
        ``{"companies": [{...}, ...]}``
    :return: a pandas DataFrame with one row per company
    """
    with open(filepath, "r") as f:
        companies = json.load(f)["companies"]
    # setdefault leaves existing scores untouched and backfills absent ones.
    for record in companies:
        record.setdefault("score", None)
    return pd.DataFrame(companies)
28
 
29
  def create_chart(filepath: str):
src/post_extraction_tools/data_quality_enhancer.py CHANGED
@@ -124,7 +124,7 @@ def enhancer(data: object, embedder) -> list:
124
 
125
 
126
  def add_ind_key(data: list, embedder) -> list:
127
- with open("data/key_industry_embeddings.json", "r") as f:
128
  key_ind_embs = json.load(f)["industry_embeddings"]
129
  for c in data:
130
  if "key_industry" not in c:
 
124
 
125
 
126
  def add_ind_key(data: list, embedder) -> list:
127
+ with open("src/data/key_industry_embeddings.json", "r") as f:
128
  key_ind_embs = json.load(f)["industry_embeddings"]
129
  for c in data:
130
  if "key_industry" not in c:
src/services/get_file_status.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import json


def check_lead_existance(filepath: str) -> bool:
    """Return True if *filepath* holds a lead file with at least one company.

    A "lead file" is a JSON document with a top-level ``"companies"`` list
    (see all_cleaned_companies.json elsewhere in this repo).

    This is a safe predicate: a missing, empty, corrupt, or schema-less file
    yields False rather than raising, so UI code can call it unconditionally.

    :param filepath: path to the candidate JSON lead file
    :return: True only when the file exists, is non-empty, parses as JSON,
             and its "companies" list contains at least one entry
    """
    # Cheap filesystem checks first: avoid opening/parsing an absent or
    # zero-byte file.
    if not (os.path.isfile(filepath) and os.path.getsize(filepath)):
        return False
    try:
        with open(filepath, "r") as f:
            companies = json.load(f)["companies"]
    except (json.JSONDecodeError, KeyError, OSError):
        # A corrupt or malformed lead file means "no usable leads",
        # not a crash in the caller.
        return False
    return len(companies) > 0