Spaces:

Subhajit01
/

SmartLead

Sleeping

App Files Files Community

Subhajit Chakraborty committed on Oct 21, 2025

Commit

45021e5

1 Parent(s): 99de885

update files(7)

Browse files

Files changed (6) hide show

src/app.py +30 -19
src/post_extraction_tools/add_key_industry.py +0 -0
src/post_extraction_tools/lead_scoring.py +4 -4
src/post_extraction_tools/website_adder.py +4 -4
src/services/queryService.py +10 -6
src/services/session_utils.py +37 -0

src/app.py CHANGED Viewed

@@ -8,7 +8,6 @@ os.makedirs("/tmp/huggingface", exist_ok=True)
 DATA_DIR = "/tmp/data"
 os.makedirs(DATA_DIR, exist_ok=True)
-main_lead_info_file = os.path.join(DATA_DIR, "all_cleaned_companies.json")
 import streamlit as st
@@ -24,12 +23,14 @@ from post_extraction_tools import (
     chart_data,
 )
 from services.add_leads import add_leads_f
 import json
 import pandas as pd
 # INITIALIZATION
 llm = LLMClient().client
 @st.cache_resource
 def load_model():
@@ -59,7 +60,10 @@ if "lead_conditions" not in st.session_state:
     st.session_state.lead_conditions = False
 if "ask_for_scrap_per" not in st.session_state:
     st.session_state.ask_scrap_per = False
 with st.sidebar:
     for page_name in [
         "Dashboard",
@@ -196,6 +200,9 @@ if st.session_state.page == "Enrich Companies":
                             "score": None
                         }
                         lead_data = {"companies": [lead_data]}
                         cleaned_data = clean_json.clean_json_f(lead_data)
                         print(cleaned_data)
                         cleaned_data_obj = json.loads(cleaned_data)
@@ -212,7 +219,7 @@ if st.session_state.page == "Enrich Companies":
                         intermediate_data = website_adder.find_all_company_websites(
                             companies
                         )
-                        final_data = website_adder.wiki_search_mode(intermediate_data)
                         print("Website URL enrichment completed.", flush=True)
                         st.session_state.pipeline_executed = False
                         st.session_state.data_enhancement = False
@@ -277,9 +284,9 @@ if st.session_state.page == "Enrich Companies":
                     response = qservice.query()
                     print(response)
                     print("Initial extraction is done. Now cleaning the JSON...",flush=True)
-                    with open("/tmp/data/uncleaned_companies.json", "r") as f:
-                        data = json.load(f)
                     cleaned_data = clean_json.clean_json_f(data)
                     cleaned_data_obj = json.loads(cleaned_data)
                     cleaned_data_obj = add_leads_f(
@@ -295,7 +302,7 @@ if st.session_state.page == "Enrich Companies":
                     intermediate_data = website_adder.find_all_company_websites(
                         companies
                     )
-                    final_data = website_adder.wiki_search_mode(intermediate_data)
                     print("Website URL enrichment completed.", flush=True)
                     print("Now enhancing the data quality by removing duplicates...", flush=True)
@@ -316,10 +323,11 @@ if st.session_state.page == "Enrich Companies":
                     res = lead_scorer.scrape_and_augment(
                         own_comp_info, own_comp_web_url
                     )
-                    with open(os.path.join(DATA_DIR, "lead_conditions.json"), "w") as f:
-                        json.dump(res, f, indent=2)
-                    scored_leads = lead_scorer.score(enhanced_data, res)
                     print("Lead scoring completed. Here are the scored leads:", flush=True)
                     print(scored_leads, flush=True)
                     st.session_state.pipeline_executed = True
@@ -416,16 +424,18 @@ if st.session_state.page == "IntelliSCORE":
                                 additional_info, comp_url
                             )
                             print(res, flush=True)
-                            with open(os.path.join(DATA_DIR, "lead_conditions.json"), "w") as f:
-                                json.dump(res, f, indent=2)
                             st.success("Scrapping Completed!")
                             if res and "error" not in res:
                                 st.session_state.lead_conditions = True
-                        with open(os.path.join(DATA_DIR, "lead_conditions.json"), "r") as f:
-                            lead_cond = json.load(f)
                         with st.spinner("Scoring the leads..."):
-                            scored_leads = lead_scorer.score(leads, lead_cond)
                             st.success("Scoring Completed!")
                         st.text("See Dashboard for latest scored leads!!")
@@ -433,10 +443,11 @@ if st.session_state.page == "IntelliSCORE":
                     else:
                         st.text("Skipping url scrapping...")
-                        with open(os.path.join(DATA_DIR, "lead_conditions.json"), "r") as f:
-                            lead_cond = json.load(f)
                         with st.spinner("Scoring the leads..."):
-                            scored_leads = lead_scorer.score(leads, lead_cond)
                             st.success("Scoring Completed!")
                         st.text("See Dashboard for latest scored leads!!")

 DATA_DIR = "/tmp/data"
 os.makedirs(DATA_DIR, exist_ok=True)
 import streamlit as st
     chart_data,
 )
 from services.add_leads import add_leads_f
+from services.session_utils import get_session_temp_dir
 import json
 import pandas as pd
 # INITIALIZATION
 llm = LLMClient().client
+ISOLATED_SESSION_DIR = get_session_temp_dir(DATA_DIR)
+main_lead_info_file = os.path.join(ISOLATED_SESSION_DIR, "all_cleaned_companies.json")
 @st.cache_resource
 def load_model():
     st.session_state.lead_conditions = False
 if "ask_for_scrap_per" not in st.session_state:
     st.session_state.ask_scrap_per = False
+if 'uncleaned_companies' not in st.session_state:
+    st.session_state.uncleaned_companies = {}
+if "lead_conditions_data" not in st.session_state:
+    st.session_state.lead_conditions_data = {}
 with st.sidebar:
     for page_name in [
         "Dashboard",
                             "score": None
                         }
                         lead_data = {"companies": [lead_data]}
+                        st.session_state.uncleaned_companies = lead_data
+                        # print("Type of actual data: ", type(st.session_state.uncleaned_companies))
+                        # print("json data: ", json.loads(st.session_state.uncleaned_companies))
                         cleaned_data = clean_json.clean_json_f(lead_data)
                         print(cleaned_data)
                         cleaned_data_obj = json.loads(cleaned_data)
                         intermediate_data = website_adder.find_all_company_websites(
                             companies
                         )
+                        final_data = website_adder.wiki_search_mode(intermediate_data, ISOLATED_SESSION_DIR)
                         print("Website URL enrichment completed.", flush=True)
                         st.session_state.pipeline_executed = False
                         st.session_state.data_enhancement = False
                     response = qservice.query()
                     print(response)
                     print("Initial extraction is done. Now cleaning the JSON...",flush=True)
+                    # with open("/tmp/data/uncleaned_companies.json", "r") as f:
+                    #     data = json.load(f)
+                    data = st.session_state.uncleaned_companies
                     cleaned_data = clean_json.clean_json_f(data)
                     cleaned_data_obj = json.loads(cleaned_data)
                     cleaned_data_obj = add_leads_f(
                     intermediate_data = website_adder.find_all_company_websites(
                         companies
                     )
+                    final_data = website_adder.wiki_search_mode(intermediate_data, ISOLATED_SESSION_DIR)
                     print("Website URL enrichment completed.", flush=True)
                     print("Now enhancing the data quality by removing duplicates...", flush=True)
                     res = lead_scorer.scrape_and_augment(
                         own_comp_info, own_comp_web_url
                     )
+                    # with open(os.path.join(DATA_DIR, "lead_conditions.json"), "w") as f:
+                    #     json.dump(res, f, indent=2)
+                    st.session_state.lead_conditions_data = res
+                    scored_leads = lead_scorer.score(enhanced_data, res, ISOLATED_SESSION_DIR)
                     print("Lead scoring completed. Here are the scored leads:", flush=True)
                     print(scored_leads, flush=True)
                     st.session_state.pipeline_executed = True
                                 additional_info, comp_url
                             )
                             print(res, flush=True)
+                            # with open(os.path.join(DATA_DIR, "lead_conditions.json"), "w") as f:
+                            #     json.dump(res, f, indent=2)
+                            st.session_state.lead_conditions_data = res
                             st.success("Scrapping Completed!")
                             if res and "error" not in res:
                                 st.session_state.lead_conditions = True
+                        # with open(os.path.join(DATA_DIR, "lead_conditions.json"), "r") as f:
+                        #     lead_cond = json.load(f)
+                        lead_cond = st.session_state.lead_conditions_data
                         with st.spinner("Scoring the leads..."):
+                            scored_leads = lead_scorer.score(leads, lead_cond, ISOLATED_SESSION_DIR)
                             st.success("Scoring Completed!")
                         st.text("See Dashboard for latest scored leads!!")
                     else:
                         st.text("Skipping url scrapping...")
+                        # with open(os.path.join(DATA_DIR, "lead_conditions.json"), "r") as f:
+                        #     lead_cond = json.load(f)
+                        lead_cond = st.session_state.lead_conditions_data
                         with st.spinner("Scoring the leads..."):
+                            scored_leads = lead_scorer.score(leads, lead_cond, ISOLATED_SESSION_DIR)
                             st.success("Scoring Completed!")
                         st.text("See Dashboard for latest scored leads!!")

src/post_extraction_tools/add_key_industry.py DELETED Viewed

File without changes

src/post_extraction_tools/lead_scoring.py CHANGED Viewed

@@ -68,7 +68,7 @@ class LeadScoring:
         parsed_res = self.parser.parse(res.content)
         return parsed_res
-    def score(self, leads: object, conditions: object):
         # scored_leads = []
         for lead in leads["companies"]:
             if (lead["score"] is None or lead["score"] == 0):
@@ -113,10 +113,10 @@ class LeadScoring:
                 lead['score'] = score
                 # scored_leads.append(lead)
-                data_folder = "/tmp/data"
-                os.makedirs(data_folder, exist_ok=True)
-                file_path = os.path.join(data_folder, "all_cleaned_companies.json")
                 with open(file_path, "w") as f:
                     json.dump(leads, f, indent=2)

         parsed_res = self.parser.parse(res.content)
         return parsed_res
+    def score(self, leads: object, conditions: object, main_data_folder):
         # scored_leads = []
         for lead in leads["companies"]:
             if (lead["score"] is None or lead["score"] == 0):
                 lead['score'] = score
                 # scored_leads.append(lead)
+                # data_folder = "/tmp/data"
+                os.makedirs(main_data_folder, exist_ok=True)
+                file_path = os.path.join(main_data_folder, "all_cleaned_companies.json")
                 with open(file_path, "w") as f:
                     json.dump(leads, f, indent=2)

src/post_extraction_tools/website_adder.py CHANGED Viewed

@@ -120,7 +120,7 @@ def check_percent_with_urls(companies):
     percent_with_urls = sum(1 for c in companies if c.get("website_url")) / len(companies) * 100
     return percent_with_urls
-def wiki_search_mode(companies):
     percent_with_urls = check_percent_with_urls(companies)
     if percent_with_urls < 100:
         print("Less than 100% of companies have website URLs. Going to wikisearch mode...")
@@ -160,10 +160,10 @@ def wiki_search_mode(companies):
     else:
         print("All companies already have website URLs. Skipping wikisearch mode...")
     print("Saving results...")
-    data_folder = "/tmp/data"
-    os.makedirs(data_folder, exist_ok=True)
-    file_path = os.path.join(data_folder, "all_cleaned_companies.json")
     with open(file_path, "w") as f:
         json.dump({"companies": companies}, f, indent=2)

     percent_with_urls = sum(1 for c in companies if c.get("website_url")) / len(companies) * 100
     return percent_with_urls
+def wiki_search_mode(companies, main_data_folder):
     percent_with_urls = check_percent_with_urls(companies)
     if percent_with_urls < 100:
         print("Less than 100% of companies have website URLs. Going to wikisearch mode...")
     else:
         print("All companies already have website URLs. Skipping wikisearch mode...")
     print("Saving results...")
+    # data_folder = "/tmp/data"
+    os.makedirs(main_data_folder, exist_ok=True)
+    file_path = os.path.join(main_data_folder, "all_cleaned_companies.json")
     with open(file_path, "w") as f:
         json.dump({"companies": companies}, f, indent=2)

src/services/queryService.py CHANGED Viewed

@@ -3,6 +3,7 @@ from post_extraction_tools.jsonparser import JSONOutputParser
 from services.parametricSearch import ParametricSearch
 from langchain.output_parsers import StructuredOutputParser # Import the parser
 from data_models import CompanyList
 import re
 import json
@@ -70,7 +71,8 @@ class QService:
                 ```
             8. You must return a minimum of 5 companies that meet the criteria. If you cannot find enough companies, return as many as you can.
-            **YOUR FINAL OUTPUT MUST FOLLOW THIS STRUCTURE (INCLUDING THE THOUGHT AND FINAL ANSWER TAGS):**
             Thought: I have successfully gathered the required data. I will now output the final answer in the requested JSON format.
             Final Answer:
@@ -112,12 +114,14 @@ class QService:
         json_output = final_response.model_dump_json(indent=2)
         # filename = f"companies_{self.industry_type}_{self.location}.json".replace(" ", "_").lower()
-        filename = "uncleaned_companies.json"
         try:
-            with open(f"/tmp/data/{filename}", 'w', encoding="utf-8") as f:
-                f.write(json_output)
-                print(f"Data successfully written to {filename}")
         except Exception as e:
-            print(f"Error writing to file: {str(e)}")
         return final_response

 from services.parametricSearch import ParametricSearch
 from langchain.output_parsers import StructuredOutputParser # Import the parser
 from data_models import CompanyList
+import streamlit as st
 import re
 import json
                 ```
             8. You must return a minimum of 5 companies that meet the criteria. If you cannot find enough companies, return as many as you can.
+            **YOUR FINAL OUTPUT MUST FOLLOW THIS STRUCTURE:**
+            ... (intermediate Action/Thought/Observation blocks)
             Thought: I have successfully gathered the required data. I will now output the final answer in the requested JSON format.
             Final Answer:
         json_output = final_response.model_dump_json(indent=2)
         # filename = f"companies_{self.industry_type}_{self.location}.json".replace(" ", "_").lower()
+        # filename = "uncleaned_companies.json"
         try:
+            # with open(f"/tmp/data/{filename}", 'w', encoding="utf-8") as f:
+            #     f.write(json_output)
+            #     print(f"Data successfully written to {filename}")
+            st.session_state.uncleaned_companies = json.loads(json_output)
         except Exception as e:
+            print(f"Error writing to session state: {str(e)}")
         return final_response

src/services/session_utils.py ADDED Viewed

	@@ -0,0 +1,37 @@

+import uuid
+import streamlit as st
+import os
+import time
+import sys
+import shutil
+MAX_DIR_AGE_SECONDS = 1800
def get_session_id():
    """Return the identifier stored for the current Streamlit session.

    On first use a random UUID4 string is generated and stored under the
    ``session_id`` key of ``st.session_state``; later calls that see the
    key return the stored value unchanged.
    """
    state = st.session_state
    if 'session_id' in state:
        return state.session_id
    # First call for this session: mint an identifier and remember it.
    fresh_id = str(uuid.uuid4())
    state.session_id = fresh_id
    return fresh_id
def cleanup_stale_directories(base_dir: str, max_age_seconds=None):
    """Delete directories directly under ``base_dir`` whose mtime is too old.

    Only immediate children that are directories are considered; plain
    files are left alone. Cleanup is best-effort: a failure on one
    directory is reported on stderr and the remaining ones are still
    processed.

    Args:
        base_dir: Directory whose sub-directories are inspected. If it does
            not exist (or is not a directory) the function does nothing.
        max_age_seconds: Age threshold in seconds. A directory is removed
            when ``now - mtime`` is strictly greater than this value.
            Defaults to the module-level ``MAX_DIR_AGE_SECONDS``.

    Returns:
        None.
    """
    # os.listdir raises on a missing path; there is nothing to clean then.
    if not os.path.isdir(base_dir):
        return
    if max_age_seconds is None:
        max_age_seconds = MAX_DIR_AGE_SECONDS

    now = time.time()
    try:
        item_names = os.listdir(base_dir)
    except OSError as e:
        print(f"Error listing directory {base_dir}: {e}", file=sys.stderr)
        return

    for item_name in item_names:
        item_path = os.path.join(base_dir, item_name)
        if not os.path.isdir(item_path):
            continue
        try:
            # The entry may vanish between listing and inspection (e.g. a
            # concurrent cleanup); that surfaces as OSError and is logged.
            if (now - os.path.getmtime(item_path)) > max_age_seconds:
                shutil.rmtree(item_path)
                print(f"Cleaned up stale directory: {item_path}", file=sys.stderr)
        except OSError as e:
            print(f"Error cleaning directory {item_path}: {e}", file=sys.stderr)
def get_session_temp_dir(main_data_dir):
    """Return (creating if needed) the current session's private data directory.

    The directory is ``<main_data_dir>/<session id>``. Stale directories
    under ``main_data_dir`` are cleaned up as a side effect.

    Args:
        main_data_dir: Parent directory that holds one sub-directory per
            session. It is created if missing.

    Returns:
        Path of the session's directory, which exists on return.
    """
    session_id = get_session_id()
    temp_dir = os.path.join(main_data_dir, session_id)
    os.makedirs(temp_dir, exist_ok=True)
    # A directory's mtime does not advance when files inside it are merely
    # overwritten, so a long-lived session could look "stale". Refresh the
    # timestamp BEFORE cleanup so the caller's own data is never deleted.
    try:
        os.utime(temp_dir, None)
    except OSError as e:
        print(f"Error refreshing directory {temp_dir}: {e}", file=sys.stderr)
    cleanup_stale_directories(main_data_dir)
    return temp_dir