Spaces:
Running
Running
Subhajit Chakraborty
committed on
Commit
·
8f6827d
1
Parent(s):
9cdbd5b
update files(3)
Browse files
src/app.py
CHANGED
|
@@ -205,15 +205,15 @@ if st.session_state.page == "Enrich Companies":
|
|
| 205 |
with open(main_lead_info_file, "w") as f:
|
| 206 |
json.dump(cleaned_data_obj, f, indent=2)
|
| 207 |
|
| 208 |
-
print("Cleaned JSON saved to all_cleaned_companies.json")
|
| 209 |
-
print("Now enriching the data with website URLs...")
|
| 210 |
|
| 211 |
companies = cleaned_data_obj.get("companies", [])
|
| 212 |
intermediate_data = website_adder.find_all_company_websites(
|
| 213 |
companies
|
| 214 |
)
|
| 215 |
final_data = website_adder.wiki_search_mode(intermediate_data)
|
| 216 |
-
print("Website URL enrichment completed.")
|
| 217 |
st.session_state.pipeline_executed = False
|
| 218 |
st.session_state.data_enhancement = False
|
| 219 |
st.session_state.intelliscore = False
|
|
@@ -276,7 +276,7 @@ if st.session_state.page == "Enrich Companies":
|
|
| 276 |
)
|
| 277 |
response = qservice.query()
|
| 278 |
print(response)
|
| 279 |
-
print("Initial extraction is done. Now cleaning the JSON...")
|
| 280 |
with open("/tmp/data/uncleaned_companies.json", "r") as f:
|
| 281 |
data = json.load(f)
|
| 282 |
|
|
@@ -288,17 +288,17 @@ if st.session_state.page == "Enrich Companies":
|
|
| 288 |
with open(main_lead_info_file, "w") as f:
|
| 289 |
json.dump(cleaned_data_obj, f, indent=2)
|
| 290 |
|
| 291 |
-
print("Cleaned JSON saved to all_cleaned_companies.json")
|
| 292 |
-
print("Now enriching the data with website URLs...")
|
| 293 |
|
| 294 |
companies = cleaned_data_obj.get("companies", [])
|
| 295 |
intermediate_data = website_adder.find_all_company_websites(
|
| 296 |
companies
|
| 297 |
)
|
| 298 |
final_data = website_adder.wiki_search_mode(intermediate_data)
|
| 299 |
-
print("Website URL enrichment completed.")
|
| 300 |
|
| 301 |
-
print("Now enhancing the data quality by removing duplicates...")
|
| 302 |
enhanced_data = data_quality_enhancer.enhancer(
|
| 303 |
final_data, embedder
|
| 304 |
)[0]
|
|
@@ -307,10 +307,10 @@ if st.session_state.page == "Enrich Companies":
|
|
| 307 |
json.dump(enhanced_data, f, indent=2)
|
| 308 |
|
| 309 |
print(
|
| 310 |
-
"Data quality enhancement completed. Cleaned data saved to all_cleaned_companies.json"
|
| 311 |
)
|
| 312 |
print(
|
| 313 |
-
"Now scoring the leads based on relevance (Intelligent scoring)..."
|
| 314 |
)
|
| 315 |
|
| 316 |
res = lead_scorer.scrape_and_augment(
|
|
@@ -320,8 +320,8 @@ if st.session_state.page == "Enrich Companies":
|
|
| 320 |
json.dump(res, f, indent=2)
|
| 321 |
|
| 322 |
scored_leads = lead_scorer.score(enhanced_data, res)
|
| 323 |
-
print("Lead scoring completed. Here are the scored leads:")
|
| 324 |
-
print(scored_leads)
|
| 325 |
st.session_state.pipeline_executed = True
|
| 326 |
st.session_state.data_enhancement = True
|
| 327 |
st.session_state.intelliscore = True
|
|
@@ -415,7 +415,7 @@ if st.session_state.page == "IntelliSCORE":
|
|
| 415 |
res = lead_scorer.scrape_and_augment(
|
| 416 |
additional_info, comp_url
|
| 417 |
)
|
| 418 |
-
print(res)
|
| 419 |
with open(os.path.join(DATA_DIR, "lead_conditions.json"), "w") as f:
|
| 420 |
json.dump(res, f, indent=2)
|
| 421 |
st.success("Scrapping Completed!")
|
|
|
|
| 205 |
with open(main_lead_info_file, "w") as f:
|
| 206 |
json.dump(cleaned_data_obj, f, indent=2)
|
| 207 |
|
| 208 |
+
print("Cleaned JSON saved to all_cleaned_companies.json",flush=True)
|
| 209 |
+
print("Now enriching the data with website URLs...",flush=True)
|
| 210 |
|
| 211 |
companies = cleaned_data_obj.get("companies", [])
|
| 212 |
intermediate_data = website_adder.find_all_company_websites(
|
| 213 |
companies
|
| 214 |
)
|
| 215 |
final_data = website_adder.wiki_search_mode(intermediate_data)
|
| 216 |
+
print("Website URL enrichment completed.", flush=True)
|
| 217 |
st.session_state.pipeline_executed = False
|
| 218 |
st.session_state.data_enhancement = False
|
| 219 |
st.session_state.intelliscore = False
|
|
|
|
| 276 |
)
|
| 277 |
response = qservice.query()
|
| 278 |
print(response)
|
| 279 |
+
print("Initial extraction is done. Now cleaning the JSON...",flush=True)
|
| 280 |
with open("/tmp/data/uncleaned_companies.json", "r") as f:
|
| 281 |
data = json.load(f)
|
| 282 |
|
|
|
|
| 288 |
with open(main_lead_info_file, "w") as f:
|
| 289 |
json.dump(cleaned_data_obj, f, indent=2)
|
| 290 |
|
| 291 |
+
print("Cleaned JSON saved to all_cleaned_companies.json", flush=True)
|
| 292 |
+
print("Now enriching the data with website URLs...", flush=True)
|
| 293 |
|
| 294 |
companies = cleaned_data_obj.get("companies", [])
|
| 295 |
intermediate_data = website_adder.find_all_company_websites(
|
| 296 |
companies
|
| 297 |
)
|
| 298 |
final_data = website_adder.wiki_search_mode(intermediate_data)
|
| 299 |
+
print("Website URL enrichment completed.", flush=True)
|
| 300 |
|
| 301 |
+
print("Now enhancing the data quality by removing duplicates...", flush=True)
|
| 302 |
enhanced_data = data_quality_enhancer.enhancer(
|
| 303 |
final_data, embedder
|
| 304 |
)[0]
|
|
|
|
| 307 |
json.dump(enhanced_data, f, indent=2)
|
| 308 |
|
| 309 |
print(
|
| 310 |
+
"Data quality enhancement completed. Cleaned data saved to all_cleaned_companies.json", flush=True
|
| 311 |
)
|
| 312 |
print(
|
| 313 |
+
"Now scoring the leads based on relevance (Intelligent scoring)...", flush=True
|
| 314 |
)
|
| 315 |
|
| 316 |
res = lead_scorer.scrape_and_augment(
|
|
|
|
| 320 |
json.dump(res, f, indent=2)
|
| 321 |
|
| 322 |
scored_leads = lead_scorer.score(enhanced_data, res)
|
| 323 |
+
print("Lead scoring completed. Here are the scored leads:", flush=True)
|
| 324 |
+
print(scored_leads, flush=True)
|
| 325 |
st.session_state.pipeline_executed = True
|
| 326 |
st.session_state.data_enhancement = True
|
| 327 |
st.session_state.intelliscore = True
|
|
|
|
| 415 |
res = lead_scorer.scrape_and_augment(
|
| 416 |
additional_info, comp_url
|
| 417 |
)
|
| 418 |
+
print(res, flush=True)
|
| 419 |
with open(os.path.join(DATA_DIR, "lead_conditions.json"), "w") as f:
|
| 420 |
json.dump(res, f, indent=2)
|
| 421 |
st.success("Scrapping Completed!")
|
src/post_extraction_tools/data_quality_enhancer.py
CHANGED
|
@@ -110,13 +110,13 @@ def enhancer(data: object, embedder) -> list:
|
|
| 110 |
print(duplicate_idx)
|
| 111 |
duplicate_comps = [companies[i]["company_name"] for i in duplicate_idx]
|
| 112 |
companies = [c for idx, c in enumerate(companies) if idx not in duplicate_idx]
|
| 113 |
-
print(f"Removed {len(duplicate_idx)} duplicate entries.")
|
| 114 |
else:
|
| 115 |
-
print("No duplicate entries found.")
|
| 116 |
|
| 117 |
print("Now adding the industry keys...")
|
| 118 |
companies = add_ind_key(companies, embedder)
|
| 119 |
-
print("Added Industry keys")
|
| 120 |
|
| 121 |
return [{"companies": companies}, {"duplicate_company_names": duplicate_comps}]
|
| 122 |
|
|
|
|
| 110 |
print(duplicate_idx)
|
| 111 |
duplicate_comps = [companies[i]["company_name"] for i in duplicate_idx]
|
| 112 |
companies = [c for idx, c in enumerate(companies) if idx not in duplicate_idx]
|
| 113 |
+
print(f"Removed {len(duplicate_idx)} duplicate entries.",flush=True)
|
| 114 |
else:
|
| 115 |
+
print("No duplicate entries found.",flush=True)
|
| 116 |
|
| 117 |
print("Now adding the industry keys...")
|
| 118 |
companies = add_ind_key(companies, embedder)
|
| 119 |
+
print("Added Industry keys",flush=True)
|
| 120 |
|
| 121 |
return [{"companies": companies}, {"duplicate_company_names": duplicate_comps}]
|
| 122 |
|
src/post_extraction_tools/website_adder.py
CHANGED
|
@@ -149,9 +149,9 @@ def wiki_search_mode(companies):
|
|
| 149 |
website_url = extract_website_from_tables(soup, mod_comp_name)
|
| 150 |
if website_url:
|
| 151 |
c["website_url"] = clean_url(website_url)
|
| 152 |
-
print(f"Found website via Wikipedia: {c['website_url']}")
|
| 153 |
else:
|
| 154 |
-
print(f"No website found on Wikipedia for {c['company_name']}")
|
| 155 |
except Exception as e:
|
| 156 |
print(f"Error accessing Wikipedia for {c['company_name']}: {str(e)}")
|
| 157 |
continue
|
|
@@ -167,7 +167,7 @@ def wiki_search_mode(companies):
|
|
| 167 |
|
| 168 |
with open(file_path, "w") as f:
|
| 169 |
json.dump({"companies": companies}, f, indent=2)
|
| 170 |
-
print("Enriched company list saved to all_cleaned_companies.json")
|
| 171 |
return {"companies": companies}
|
| 172 |
|
| 173 |
|
|
|
|
| 149 |
website_url = extract_website_from_tables(soup, mod_comp_name)
|
| 150 |
if website_url:
|
| 151 |
c["website_url"] = clean_url(website_url)
|
| 152 |
+
print(f"Found website via Wikipedia: {c['website_url']}",flush=True)
|
| 153 |
else:
|
| 154 |
+
print(f"No website found on Wikipedia for {c['company_name']}", flush=True)
|
| 155 |
except Exception as e:
|
| 156 |
print(f"Error accessing Wikipedia for {c['company_name']}: {str(e)}")
|
| 157 |
continue
|
|
|
|
| 167 |
|
| 168 |
with open(file_path, "w") as f:
|
| 169 |
json.dump({"companies": companies}, f, indent=2)
|
| 170 |
+
print("Enriched company list saved to all_cleaned_companies.json",flush=True)
|
| 171 |
return {"companies": companies}
|
| 172 |
|
| 173 |
|