Subhajit Chakraborty committed on
Commit
8f6827d
·
1 Parent(s): 9cdbd5b

update files(3)

Browse files
src/app.py CHANGED
@@ -205,15 +205,15 @@ if st.session_state.page == "Enrich Companies":
205
  with open(main_lead_info_file, "w") as f:
206
  json.dump(cleaned_data_obj, f, indent=2)
207
 
208
- print("Cleaned JSON saved to all_cleaned_companies.json")
209
- print("Now enriching the data with website URLs...")
210
 
211
  companies = cleaned_data_obj.get("companies", [])
212
  intermediate_data = website_adder.find_all_company_websites(
213
  companies
214
  )
215
  final_data = website_adder.wiki_search_mode(intermediate_data)
216
- print("Website URL enrichment completed.")
217
  st.session_state.pipeline_executed = False
218
  st.session_state.data_enhancement = False
219
  st.session_state.intelliscore = False
@@ -276,7 +276,7 @@ if st.session_state.page == "Enrich Companies":
276
  )
277
  response = qservice.query()
278
  print(response)
279
- print("Initial extraction is done. Now cleaning the JSON...")
280
  with open("/tmp/data/uncleaned_companies.json", "r") as f:
281
  data = json.load(f)
282
 
@@ -288,17 +288,17 @@ if st.session_state.page == "Enrich Companies":
288
  with open(main_lead_info_file, "w") as f:
289
  json.dump(cleaned_data_obj, f, indent=2)
290
 
291
- print("Cleaned JSON saved to all_cleaned_companies.json")
292
- print("Now enriching the data with website URLs...")
293
 
294
  companies = cleaned_data_obj.get("companies", [])
295
  intermediate_data = website_adder.find_all_company_websites(
296
  companies
297
  )
298
  final_data = website_adder.wiki_search_mode(intermediate_data)
299
- print("Website URL enrichment completed.")
300
 
301
- print("Now enhancing the data quality by removing duplicates...")
302
  enhanced_data = data_quality_enhancer.enhancer(
303
  final_data, embedder
304
  )[0]
@@ -307,10 +307,10 @@ if st.session_state.page == "Enrich Companies":
307
  json.dump(enhanced_data, f, indent=2)
308
 
309
  print(
310
- "Data quality enhancement completed. Cleaned data saved to all_cleaned_companies.json"
311
  )
312
  print(
313
- "Now scoring the leads based on relevance (Intelligent scoring)..."
314
  )
315
 
316
  res = lead_scorer.scrape_and_augment(
@@ -320,8 +320,8 @@ if st.session_state.page == "Enrich Companies":
320
  json.dump(res, f, indent=2)
321
 
322
  scored_leads = lead_scorer.score(enhanced_data, res)
323
- print("Lead scoring completed. Here are the scored leads:")
324
- print(scored_leads)
325
  st.session_state.pipeline_executed = True
326
  st.session_state.data_enhancement = True
327
  st.session_state.intelliscore = True
@@ -415,7 +415,7 @@ if st.session_state.page == "IntelliSCORE":
415
  res = lead_scorer.scrape_and_augment(
416
  additional_info, comp_url
417
  )
418
- print(res)
419
  with open(os.path.join(DATA_DIR, "lead_conditions.json"), "w") as f:
420
  json.dump(res, f, indent=2)
421
  st.success("Scrapping Completed!")
 
205
  with open(main_lead_info_file, "w") as f:
206
  json.dump(cleaned_data_obj, f, indent=2)
207
 
208
+ print("Cleaned JSON saved to all_cleaned_companies.json",flush=True)
209
+ print("Now enriching the data with website URLs...",flush=True)
210
 
211
  companies = cleaned_data_obj.get("companies", [])
212
  intermediate_data = website_adder.find_all_company_websites(
213
  companies
214
  )
215
  final_data = website_adder.wiki_search_mode(intermediate_data)
216
+ print("Website URL enrichment completed.", flush=True)
217
  st.session_state.pipeline_executed = False
218
  st.session_state.data_enhancement = False
219
  st.session_state.intelliscore = False
 
276
  )
277
  response = qservice.query()
278
  print(response)
279
+ print("Initial extraction is done. Now cleaning the JSON...",flush=True)
280
  with open("/tmp/data/uncleaned_companies.json", "r") as f:
281
  data = json.load(f)
282
 
 
288
  with open(main_lead_info_file, "w") as f:
289
  json.dump(cleaned_data_obj, f, indent=2)
290
 
291
+ print("Cleaned JSON saved to all_cleaned_companies.json", flush=True)
292
+ print("Now enriching the data with website URLs...", flush=True)
293
 
294
  companies = cleaned_data_obj.get("companies", [])
295
  intermediate_data = website_adder.find_all_company_websites(
296
  companies
297
  )
298
  final_data = website_adder.wiki_search_mode(intermediate_data)
299
+ print("Website URL enrichment completed.", flush=True)
300
 
301
+ print("Now enhancing the data quality by removing duplicates...", flush=True)
302
  enhanced_data = data_quality_enhancer.enhancer(
303
  final_data, embedder
304
  )[0]
 
307
  json.dump(enhanced_data, f, indent=2)
308
 
309
  print(
310
+ "Data quality enhancement completed. Cleaned data saved to all_cleaned_companies.json", flush=True
311
  )
312
  print(
313
+ "Now scoring the leads based on relevance (Intelligent scoring)...", flush=True
314
  )
315
 
316
  res = lead_scorer.scrape_and_augment(
 
320
  json.dump(res, f, indent=2)
321
 
322
  scored_leads = lead_scorer.score(enhanced_data, res)
323
+ print("Lead scoring completed. Here are the scored leads:", flush=True)
324
+ print(scored_leads, flush=True)
325
  st.session_state.pipeline_executed = True
326
  st.session_state.data_enhancement = True
327
  st.session_state.intelliscore = True
 
415
  res = lead_scorer.scrape_and_augment(
416
  additional_info, comp_url
417
  )
418
+ print(res, flush=True)
419
  with open(os.path.join(DATA_DIR, "lead_conditions.json"), "w") as f:
420
  json.dump(res, f, indent=2)
421
  st.success("Scrapping Completed!")
src/post_extraction_tools/data_quality_enhancer.py CHANGED
@@ -110,13 +110,13 @@ def enhancer(data: object, embedder) -> list:
110
  print(duplicate_idx)
111
  duplicate_comps = [companies[i]["company_name"] for i in duplicate_idx]
112
  companies = [c for idx, c in enumerate(companies) if idx not in duplicate_idx]
113
- print(f"Removed {len(duplicate_idx)} duplicate entries.")
114
  else:
115
- print("No duplicate entries found.")
116
 
117
  print("Now adding the industry keys...")
118
  companies = add_ind_key(companies, embedder)
119
- print("Added Industry keys")
120
 
121
  return [{"companies": companies}, {"duplicate_company_names": duplicate_comps}]
122
 
 
110
  print(duplicate_idx)
111
  duplicate_comps = [companies[i]["company_name"] for i in duplicate_idx]
112
  companies = [c for idx, c in enumerate(companies) if idx not in duplicate_idx]
113
+ print(f"Removed {len(duplicate_idx)} duplicate entries.",flush=True)
114
  else:
115
+ print("No duplicate entries found.",flush=True)
116
 
117
  print("Now adding the industry keys...")
118
  companies = add_ind_key(companies, embedder)
119
+ print("Added Industry keys",flush=True)
120
 
121
  return [{"companies": companies}, {"duplicate_company_names": duplicate_comps}]
122
 
src/post_extraction_tools/website_adder.py CHANGED
@@ -149,9 +149,9 @@ def wiki_search_mode(companies):
149
  website_url = extract_website_from_tables(soup, mod_comp_name)
150
  if website_url:
151
  c["website_url"] = clean_url(website_url)
152
- print(f"Found website via Wikipedia: {c['website_url']}")
153
  else:
154
- print(f"No website found on Wikipedia for {c['company_name']}")
155
  except Exception as e:
156
  print(f"Error accessing Wikipedia for {c['company_name']}: {str(e)}")
157
  continue
@@ -167,7 +167,7 @@ def wiki_search_mode(companies):
167
 
168
  with open(file_path, "w") as f:
169
  json.dump({"companies": companies}, f, indent=2)
170
- print("Enriched company list saved to all_cleaned_companies.json")
171
  return {"companies": companies}
172
 
173
 
 
149
  website_url = extract_website_from_tables(soup, mod_comp_name)
150
  if website_url:
151
  c["website_url"] = clean_url(website_url)
152
+ print(f"Found website via Wikipedia: {c['website_url']}",flush=True)
153
  else:
154
+ print(f"No website found on Wikipedia for {c['company_name']}", flush=True)
155
  except Exception as e:
156
  print(f"Error accessing Wikipedia for {c['company_name']}: {str(e)}")
157
  continue
 
167
 
168
  with open(file_path, "w") as f:
169
  json.dump({"companies": companies}, f, indent=2)
170
+ print("Enriched company list saved to all_cleaned_companies.json",flush=True)
171
  return {"companies": companies}
172
 
173