Spaces:
Running
Running
Subhajit Chakraborty
committed on
Commit
·
8f6827d
1
Parent(s):
9cdbd5b
update files(3)
Browse files
src/app.py
CHANGED
|
@@ -205,15 +205,15 @@ if st.session_state.page == "Enrich Companies":
|
|
| 205 |
with open(main_lead_info_file, "w") as f:
|
| 206 |
json.dump(cleaned_data_obj, f, indent=2)
|
| 207 |
|
| 208 |
-
print("Cleaned JSON saved to all_cleaned_companies.json")
|
| 209 |
-
print("Now enriching the data with website URLs...")
|
| 210 |
|
| 211 |
companies = cleaned_data_obj.get("companies", [])
|
| 212 |
intermediate_data = website_adder.find_all_company_websites(
|
| 213 |
companies
|
| 214 |
)
|
| 215 |
final_data = website_adder.wiki_search_mode(intermediate_data)
|
| 216 |
-
print("Website URL enrichment completed.")
|
| 217 |
st.session_state.pipeline_executed = False
|
| 218 |
st.session_state.data_enhancement = False
|
| 219 |
st.session_state.intelliscore = False
|
|
@@ -276,7 +276,7 @@ if st.session_state.page == "Enrich Companies":
|
|
| 276 |
)
|
| 277 |
response = qservice.query()
|
| 278 |
print(response)
|
| 279 |
-
print("Initial extraction is done. Now cleaning the JSON...")
|
| 280 |
with open("/tmp/data/uncleaned_companies.json", "r") as f:
|
| 281 |
data = json.load(f)
|
| 282 |
|
|
@@ -288,17 +288,17 @@ if st.session_state.page == "Enrich Companies":
|
|
| 288 |
with open(main_lead_info_file, "w") as f:
|
| 289 |
json.dump(cleaned_data_obj, f, indent=2)
|
| 290 |
|
| 291 |
-
print("Cleaned JSON saved to all_cleaned_companies.json")
|
| 292 |
-
print("Now enriching the data with website URLs...")
|
| 293 |
|
| 294 |
companies = cleaned_data_obj.get("companies", [])
|
| 295 |
intermediate_data = website_adder.find_all_company_websites(
|
| 296 |
companies
|
| 297 |
)
|
| 298 |
final_data = website_adder.wiki_search_mode(intermediate_data)
|
| 299 |
-
print("Website URL enrichment completed.")
|
| 300 |
|
| 301 |
-
print("Now enhancing the data quality by removing duplicates...")
|
| 302 |
enhanced_data = data_quality_enhancer.enhancer(
|
| 303 |
final_data, embedder
|
| 304 |
)[0]
|
|
@@ -307,10 +307,10 @@ if st.session_state.page == "Enrich Companies":
|
|
| 307 |
json.dump(enhanced_data, f, indent=2)
|
| 308 |
|
| 309 |
print(
|
| 310 |
-
"Data quality enhancement completed. Cleaned data saved to all_cleaned_companies.json"
|
| 311 |
)
|
| 312 |
print(
|
| 313 |
-
"Now scoring the leads based on relevance (Intelligent scoring)..."
|
| 314 |
)
|
| 315 |
|
| 316 |
res = lead_scorer.scrape_and_augment(
|
|
@@ -320,8 +320,8 @@ if st.session_state.page == "Enrich Companies":
|
|
| 320 |
json.dump(res, f, indent=2)
|
| 321 |
|
| 322 |
scored_leads = lead_scorer.score(enhanced_data, res)
|
| 323 |
-
print("Lead scoring completed. Here are the scored leads:")
|
| 324 |
-
print(scored_leads)
|
| 325 |
st.session_state.pipeline_executed = True
|
| 326 |
st.session_state.data_enhancement = True
|
| 327 |
st.session_state.intelliscore = True
|
|
@@ -415,7 +415,7 @@ if st.session_state.page == "IntelliSCORE":
|
|
| 415 |
res = lead_scorer.scrape_and_augment(
|
| 416 |
additional_info, comp_url
|
| 417 |
)
|
| 418 |
-
print(res)
|
| 419 |
with open(os.path.join(DATA_DIR, "lead_conditions.json"), "w") as f:
|
| 420 |
json.dump(res, f, indent=2)
|
| 421 |
st.success("Scrapping Completed!")
|
|
|
|
| 205 |
with open(main_lead_info_file, "w") as f:
|
| 206 |
json.dump(cleaned_data_obj, f, indent=2)
|
| 207 |
|
| 208 |
+
print("Cleaned JSON saved to all_cleaned_companies.json",flush=True)
|
| 209 |
+
print("Now enriching the data with website URLs...",flush=True)
|
| 210 |
|
| 211 |
companies = cleaned_data_obj.get("companies", [])
|
| 212 |
intermediate_data = website_adder.find_all_company_websites(
|
| 213 |
companies
|
| 214 |
)
|
| 215 |
final_data = website_adder.wiki_search_mode(intermediate_data)
|
| 216 |
+
print("Website URL enrichment completed.", flush=True)
|
| 217 |
st.session_state.pipeline_executed = False
|
| 218 |
st.session_state.data_enhancement = False
|
| 219 |
st.session_state.intelliscore = False
|
|
|
|
| 276 |
)
|
| 277 |
response = qservice.query()
|
| 278 |
print(response)
|
| 279 |
+
print("Initial extraction is done. Now cleaning the JSON...",flush=True)
|
| 280 |
with open("/tmp/data/uncleaned_companies.json", "r") as f:
|
| 281 |
data = json.load(f)
|
| 282 |
|
|
|
|
| 288 |
with open(main_lead_info_file, "w") as f:
|
| 289 |
json.dump(cleaned_data_obj, f, indent=2)
|
| 290 |
|
| 291 |
+
print("Cleaned JSON saved to all_cleaned_companies.json", flush=True)
|
| 292 |
+
print("Now enriching the data with website URLs...", flush=True)
|
| 293 |
|
| 294 |
companies = cleaned_data_obj.get("companies", [])
|
| 295 |
intermediate_data = website_adder.find_all_company_websites(
|
| 296 |
companies
|
| 297 |
)
|
| 298 |
final_data = website_adder.wiki_search_mode(intermediate_data)
|
| 299 |
+
print("Website URL enrichment completed.", flush=True)
|
| 300 |
|
| 301 |
+
print("Now enhancing the data quality by removing duplicates...", flush=True)
|
| 302 |
enhanced_data = data_quality_enhancer.enhancer(
|
| 303 |
final_data, embedder
|
| 304 |
)[0]
|
|
|
|
| 307 |
json.dump(enhanced_data, f, indent=2)
|
| 308 |
|
| 309 |
print(
|
| 310 |
+
"Data quality enhancement completed. Cleaned data saved to all_cleaned_companies.json", flush=True
|
| 311 |
)
|
| 312 |
print(
|
| 313 |
+
"Now scoring the leads based on relevance (Intelligent scoring)...", flush=True
|
| 314 |
)
|
| 315 |
|
| 316 |
res = lead_scorer.scrape_and_augment(
|
|
|
|
| 320 |
json.dump(res, f, indent=2)
|
| 321 |
|
| 322 |
scored_leads = lead_scorer.score(enhanced_data, res)
|
| 323 |
+
print("Lead scoring completed. Here are the scored leads:", flush=True)
|
| 324 |
+
print(scored_leads, flush=True)
|
| 325 |
st.session_state.pipeline_executed = True
|
| 326 |
st.session_state.data_enhancement = True
|
| 327 |
st.session_state.intelliscore = True
|
|
|
|
| 415 |
res = lead_scorer.scrape_and_augment(
|
| 416 |
additional_info, comp_url
|
| 417 |
)
|
| 418 |
+
print(res, flush=True)
|
| 419 |
with open(os.path.join(DATA_DIR, "lead_conditions.json"), "w") as f:
|
| 420 |
json.dump(res, f, indent=2)
|
| 421 |
st.success("Scrapping Completed!")
|
src/post_extraction_tools/data_quality_enhancer.py
CHANGED
|
@@ -110,13 +110,13 @@ def enhancer(data: object, embedder) -> list:
|
|
| 110 |
print(duplicate_idx)
|
| 111 |
duplicate_comps = [companies[i]["company_name"] for i in duplicate_idx]
|
| 112 |
companies = [c for idx, c in enumerate(companies) if idx not in duplicate_idx]
|
| 113 |
-
print(f"Removed {len(duplicate_idx)} duplicate entries.")
|
| 114 |
else:
|
| 115 |
-
print("No duplicate entries found.")
|
| 116 |
|
| 117 |
print("Now adding the industry keys...")
|
| 118 |
companies = add_ind_key(companies, embedder)
|
| 119 |
-
print("Added Industry keys")
|
| 120 |
|
| 121 |
return [{"companies": companies}, {"duplicate_company_names": duplicate_comps}]
|
| 122 |
|
|
|
|
| 110 |
print(duplicate_idx)
|
| 111 |
duplicate_comps = [companies[i]["company_name"] for i in duplicate_idx]
|
| 112 |
companies = [c for idx, c in enumerate(companies) if idx not in duplicate_idx]
|
| 113 |
+
print(f"Removed {len(duplicate_idx)} duplicate entries.",flush=True)
|
| 114 |
else:
|
| 115 |
+
print("No duplicate entries found.",flush=True)
|
| 116 |
|
| 117 |
print("Now adding the industry keys...")
|
| 118 |
companies = add_ind_key(companies, embedder)
|
| 119 |
+
print("Added Industry keys",flush=True)
|
| 120 |
|
| 121 |
return [{"companies": companies}, {"duplicate_company_names": duplicate_comps}]
|
| 122 |
|
src/post_extraction_tools/website_adder.py
CHANGED
|
@@ -149,9 +149,9 @@ def wiki_search_mode(companies):
|
|
| 149 |
website_url = extract_website_from_tables(soup, mod_comp_name)
|
| 150 |
if website_url:
|
| 151 |
c["website_url"] = clean_url(website_url)
|
| 152 |
-
print(f"Found website via Wikipedia: {c['website_url']}")
|
| 153 |
else:
|
| 154 |
-
print(f"No website found on Wikipedia for {c['company_name']}")
|
| 155 |
except Exception as e:
|
| 156 |
print(f"Error accessing Wikipedia for {c['company_name']}: {str(e)}")
|
| 157 |
continue
|
|
@@ -167,7 +167,7 @@ def wiki_search_mode(companies):
|
|
| 167 |
|
| 168 |
with open(file_path, "w") as f:
|
| 169 |
json.dump({"companies": companies}, f, indent=2)
|
| 170 |
-
print("Enriched company list saved to all_cleaned_companies.json")
|
| 171 |
return {"companies": companies}
|
| 172 |
|
| 173 |
|
|
|
|
| 149 |
website_url = extract_website_from_tables(soup, mod_comp_name)
|
| 150 |
if website_url:
|
| 151 |
c["website_url"] = clean_url(website_url)
|
| 152 |
+
print(f"Found website via Wikipedia: {c['website_url']}",flush=True)
|
| 153 |
else:
|
| 154 |
+
print(f"No website found on Wikipedia for {c['company_name']}", flush=True)
|
| 155 |
except Exception as e:
|
| 156 |
print(f"Error accessing Wikipedia for {c['company_name']}: {str(e)}")
|
| 157 |
continue
|
|
|
|
| 167 |
|
| 168 |
with open(file_path, "w") as f:
|
| 169 |
json.dump({"companies": companies}, f, indent=2)
|
| 170 |
+
print("Enriched company list saved to all_cleaned_companies.json",flush=True)
|
| 171 |
return {"companies": companies}
|
| 172 |
|
| 173 |
|