Subhajit Chakraborty commited on
Commit
45021e5
·
1 Parent(s): 99de885

update files(7)

Browse files
src/app.py CHANGED
@@ -8,7 +8,6 @@ os.makedirs("/tmp/huggingface", exist_ok=True)
8
 
9
  DATA_DIR = "/tmp/data"
10
  os.makedirs(DATA_DIR, exist_ok=True)
11
- main_lead_info_file = os.path.join(DATA_DIR, "all_cleaned_companies.json")
12
 
13
  import streamlit as st
14
 
@@ -24,12 +23,14 @@ from post_extraction_tools import (
24
  chart_data,
25
  )
26
  from services.add_leads import add_leads_f
 
27
  import json
28
  import pandas as pd
29
 
30
  # INITIALIZATION
31
  llm = LLMClient().client
32
-
 
33
 
34
  @st.cache_resource
35
  def load_model():
@@ -59,7 +60,10 @@ if "lead_conditions" not in st.session_state:
59
  st.session_state.lead_conditions = False
60
  if "ask_for_scrap_per" not in st.session_state:
61
  st.session_state.ask_scrap_per = False
62
-
 
 
 
63
  with st.sidebar:
64
  for page_name in [
65
  "Dashboard",
@@ -196,6 +200,9 @@ if st.session_state.page == "Enrich Companies":
196
  "score": None
197
  }
198
  lead_data = {"companies": [lead_data]}
 
 
 
199
  cleaned_data = clean_json.clean_json_f(lead_data)
200
  print(cleaned_data)
201
  cleaned_data_obj = json.loads(cleaned_data)
@@ -212,7 +219,7 @@ if st.session_state.page == "Enrich Companies":
212
  intermediate_data = website_adder.find_all_company_websites(
213
  companies
214
  )
215
- final_data = website_adder.wiki_search_mode(intermediate_data)
216
  print("Website URL enrichment completed.", flush=True)
217
  st.session_state.pipeline_executed = False
218
  st.session_state.data_enhancement = False
@@ -277,9 +284,9 @@ if st.session_state.page == "Enrich Companies":
277
  response = qservice.query()
278
  print(response)
279
  print("Initial extraction is done. Now cleaning the JSON...",flush=True)
280
- with open("/tmp/data/uncleaned_companies.json", "r") as f:
281
- data = json.load(f)
282
-
283
  cleaned_data = clean_json.clean_json_f(data)
284
  cleaned_data_obj = json.loads(cleaned_data)
285
  cleaned_data_obj = add_leads_f(
@@ -295,7 +302,7 @@ if st.session_state.page == "Enrich Companies":
295
  intermediate_data = website_adder.find_all_company_websites(
296
  companies
297
  )
298
- final_data = website_adder.wiki_search_mode(intermediate_data)
299
  print("Website URL enrichment completed.", flush=True)
300
 
301
  print("Now enhancing the data quality by removing duplicates...", flush=True)
@@ -316,10 +323,11 @@ if st.session_state.page == "Enrich Companies":
316
  res = lead_scorer.scrape_and_augment(
317
  own_comp_info, own_comp_web_url
318
  )
319
- with open(os.path.join(DATA_DIR, "lead_conditions.json"), "w") as f:
320
- json.dump(res, f, indent=2)
321
 
322
- scored_leads = lead_scorer.score(enhanced_data, res)
 
323
  print("Lead scoring completed. Here are the scored leads:", flush=True)
324
  print(scored_leads, flush=True)
325
  st.session_state.pipeline_executed = True
@@ -416,16 +424,18 @@ if st.session_state.page == "IntelliSCORE":
416
  additional_info, comp_url
417
  )
418
  print(res, flush=True)
419
- with open(os.path.join(DATA_DIR, "lead_conditions.json"), "w") as f:
420
- json.dump(res, f, indent=2)
 
421
  st.success("Scrapping Completed!")
422
  if res and "error" not in res:
423
  st.session_state.lead_conditions = True
424
 
425
- with open(os.path.join(DATA_DIR, "lead_conditions.json"), "r") as f:
426
- lead_cond = json.load(f)
 
427
  with st.spinner("Scoring the leads..."):
428
- scored_leads = lead_scorer.score(leads, lead_cond)
429
  st.success("Scoring Completed!")
430
 
431
  st.text("See Dashboard for latest scored leads!!")
@@ -433,10 +443,11 @@ if st.session_state.page == "IntelliSCORE":
433
 
434
  else:
435
  st.text("Skipping url scrapping...")
436
- with open(os.path.join(DATA_DIR, "lead_conditions.json"), "r") as f:
437
- lead_cond = json.load(f)
 
438
  with st.spinner("Scoring the leads..."):
439
- scored_leads = lead_scorer.score(leads, lead_cond)
440
  st.success("Scoring Completed!")
441
 
442
  st.text("See Dashboard for latest scored leads!!")
 
8
 
9
  DATA_DIR = "/tmp/data"
10
  os.makedirs(DATA_DIR, exist_ok=True)
 
11
 
12
  import streamlit as st
13
 
 
23
  chart_data,
24
  )
25
  from services.add_leads import add_leads_f
26
+ from services.session_utils import get_session_temp_dir
27
  import json
28
  import pandas as pd
29
 
30
  # INITIALIZATION
31
  llm = LLMClient().client
32
+ ISOLATED_SESSION_DIR = get_session_temp_dir(DATA_DIR)
33
+ main_lead_info_file = os.path.join(ISOLATED_SESSION_DIR, "all_cleaned_companies.json")
34
 
35
  @st.cache_resource
36
  def load_model():
 
60
  st.session_state.lead_conditions = False
61
  if "ask_for_scrap_per" not in st.session_state:
62
  st.session_state.ask_scrap_per = False
63
+ if 'uncleaned_companies' not in st.session_state:
64
+ st.session_state.uncleaned_companies = {}
65
+ if "lead_conditions_data" not in st.session_state:
66
+ st.session_state.lead_conditions_data = {}
67
  with st.sidebar:
68
  for page_name in [
69
  "Dashboard",
 
200
  "score": None
201
  }
202
  lead_data = {"companies": [lead_data]}
203
+ st.session_state.uncleaned_companies = lead_data
204
+ # print("Type of actual data: ", type(st.session_state.uncleaned_companies))
205
+ # print("json data: ", json.loads(st.session_state.uncleaned_companies))
206
  cleaned_data = clean_json.clean_json_f(lead_data)
207
  print(cleaned_data)
208
  cleaned_data_obj = json.loads(cleaned_data)
 
219
  intermediate_data = website_adder.find_all_company_websites(
220
  companies
221
  )
222
+ final_data = website_adder.wiki_search_mode(intermediate_data, ISOLATED_SESSION_DIR)
223
  print("Website URL enrichment completed.", flush=True)
224
  st.session_state.pipeline_executed = False
225
  st.session_state.data_enhancement = False
 
284
  response = qservice.query()
285
  print(response)
286
  print("Initial extraction is done. Now cleaning the JSON...",flush=True)
287
+ # with open("/tmp/data/uncleaned_companies.json", "r") as f:
288
+ # data = json.load(f)
289
+ data = st.session_state.uncleaned_companies
290
  cleaned_data = clean_json.clean_json_f(data)
291
  cleaned_data_obj = json.loads(cleaned_data)
292
  cleaned_data_obj = add_leads_f(
 
302
  intermediate_data = website_adder.find_all_company_websites(
303
  companies
304
  )
305
+ final_data = website_adder.wiki_search_mode(intermediate_data, ISOLATED_SESSION_DIR)
306
  print("Website URL enrichment completed.", flush=True)
307
 
308
  print("Now enhancing the data quality by removing duplicates...", flush=True)
 
323
  res = lead_scorer.scrape_and_augment(
324
  own_comp_info, own_comp_web_url
325
  )
326
+ # with open(os.path.join(DATA_DIR, "lead_conditions.json"), "w") as f:
327
+ # json.dump(res, f, indent=2)
328
 
329
+ st.session_state.lead_conditions_data = res
330
+ scored_leads = lead_scorer.score(enhanced_data, res, ISOLATED_SESSION_DIR)
331
  print("Lead scoring completed. Here are the scored leads:", flush=True)
332
  print(scored_leads, flush=True)
333
  st.session_state.pipeline_executed = True
 
424
  additional_info, comp_url
425
  )
426
  print(res, flush=True)
427
+ # with open(os.path.join(DATA_DIR, "lead_conditions.json"), "w") as f:
428
+ # json.dump(res, f, indent=2)
429
+ st.session_state.lead_conditions_data = res
430
  st.success("Scrapping Completed!")
431
  if res and "error" not in res:
432
  st.session_state.lead_conditions = True
433
 
434
+ # with open(os.path.join(DATA_DIR, "lead_conditions.json"), "r") as f:
435
+ # lead_cond = json.load(f)
436
+ lead_cond = st.session_state.lead_conditions_data
437
  with st.spinner("Scoring the leads..."):
438
+ scored_leads = lead_scorer.score(leads, lead_cond, ISOLATED_SESSION_DIR)
439
  st.success("Scoring Completed!")
440
 
441
  st.text("See Dashboard for latest scored leads!!")
 
443
 
444
  else:
445
  st.text("Skipping url scrapping...")
446
+ # with open(os.path.join(DATA_DIR, "lead_conditions.json"), "r") as f:
447
+ # lead_cond = json.load(f)
448
+ lead_cond = st.session_state.lead_conditions_data
449
  with st.spinner("Scoring the leads..."):
450
+ scored_leads = lead_scorer.score(leads, lead_cond, ISOLATED_SESSION_DIR)
451
  st.success("Scoring Completed!")
452
 
453
  st.text("See Dashboard for latest scored leads!!")
src/post_extraction_tools/add_key_industry.py DELETED
File without changes
src/post_extraction_tools/lead_scoring.py CHANGED
@@ -68,7 +68,7 @@ class LeadScoring:
68
  parsed_res = self.parser.parse(res.content)
69
  return parsed_res
70
 
71
- def score(self, leads: object, conditions: object):
72
  # scored_leads = []
73
  for lead in leads["companies"]:
74
  if (lead["score"] is None or lead["score"] == 0):
@@ -113,10 +113,10 @@ class LeadScoring:
113
 
114
  lead['score'] = score
115
  # scored_leads.append(lead)
116
- data_folder = "/tmp/data"
117
- os.makedirs(data_folder, exist_ok=True)
118
 
119
- file_path = os.path.join(data_folder, "all_cleaned_companies.json")
120
 
121
  with open(file_path, "w") as f:
122
  json.dump(leads, f, indent=2)
 
68
  parsed_res = self.parser.parse(res.content)
69
  return parsed_res
70
 
71
+ def score(self, leads: object, conditions: object, main_data_folder):
72
  # scored_leads = []
73
  for lead in leads["companies"]:
74
  if (lead["score"] is None or lead["score"] == 0):
 
113
 
114
  lead['score'] = score
115
  # scored_leads.append(lead)
116
+ # data_folder = "/tmp/data"
117
+ os.makedirs(main_data_folder, exist_ok=True)
118
 
119
+ file_path = os.path.join(main_data_folder, "all_cleaned_companies.json")
120
 
121
  with open(file_path, "w") as f:
122
  json.dump(leads, f, indent=2)
src/post_extraction_tools/website_adder.py CHANGED
@@ -120,7 +120,7 @@ def check_percent_with_urls(companies):
120
  percent_with_urls = sum(1 for c in companies if c.get("website_url")) / len(companies) * 100
121
  return percent_with_urls
122
 
123
- def wiki_search_mode(companies):
124
  percent_with_urls = check_percent_with_urls(companies)
125
  if percent_with_urls < 100:
126
  print("Less than 100% of companies have website URLs. Going to wikisearch mode...")
@@ -160,10 +160,10 @@ def wiki_search_mode(companies):
160
  else:
161
  print("All companies already have website URLs. Skipping wikisearch mode...")
162
  print("Saving results...")
163
- data_folder = "/tmp/data"
164
- os.makedirs(data_folder, exist_ok=True)
165
 
166
- file_path = os.path.join(data_folder, "all_cleaned_companies.json")
167
 
168
  with open(file_path, "w") as f:
169
  json.dump({"companies": companies}, f, indent=2)
 
120
  percent_with_urls = sum(1 for c in companies if c.get("website_url")) / len(companies) * 100
121
  return percent_with_urls
122
 
123
+ def wiki_search_mode(companies, main_data_folder):
124
  percent_with_urls = check_percent_with_urls(companies)
125
  if percent_with_urls < 100:
126
  print("Less than 100% of companies have website URLs. Going to wikisearch mode...")
 
160
  else:
161
  print("All companies already have website URLs. Skipping wikisearch mode...")
162
  print("Saving results...")
163
+ # data_folder = "/tmp/data"
164
+ os.makedirs(main_data_folder, exist_ok=True)
165
 
166
+ file_path = os.path.join(main_data_folder, "all_cleaned_companies.json")
167
 
168
  with open(file_path, "w") as f:
169
  json.dump({"companies": companies}, f, indent=2)
src/services/queryService.py CHANGED
@@ -3,6 +3,7 @@ from post_extraction_tools.jsonparser import JSONOutputParser
3
  from services.parametricSearch import ParametricSearch
4
  from langchain.output_parsers import StructuredOutputParser # Import the parser
5
  from data_models import CompanyList
 
6
  import re
7
  import json
8
 
@@ -70,7 +71,8 @@ class QService:
70
  ```
71
  8. You must return a minimum of 5 companies that meet the criteria. If you cannot find enough companies, return as many as you can.
72
 
73
- **YOUR FINAL OUTPUT MUST FOLLOW THIS STRUCTURE (INCLUDING THE THOUGHT AND FINAL ANSWER TAGS):**
 
74
 
75
  Thought: I have successfully gathered the required data. I will now output the final answer in the requested JSON format.
76
  Final Answer:
@@ -112,12 +114,14 @@ class QService:
112
 
113
  json_output = final_response.model_dump_json(indent=2)
114
  # filename = f"companies_{self.industry_type}_{self.location}.json".replace(" ", "_").lower()
115
- filename = "uncleaned_companies.json"
 
116
  try:
117
- with open(f"/tmp/data/{filename}", 'w', encoding="utf-8") as f:
118
- f.write(json_output)
119
- print(f"Data successfully written to {filename}")
 
120
  except Exception as e:
121
- print(f"Error writing to file: {str(e)}")
122
 
123
  return final_response
 
3
  from services.parametricSearch import ParametricSearch
4
  from langchain.output_parsers import StructuredOutputParser # Import the parser
5
  from data_models import CompanyList
6
+ import streamlit as st
7
  import re
8
  import json
9
 
 
71
  ```
72
  8. You must return a minimum of 5 companies that meet the criteria. If you cannot find enough companies, return as many as you can.
73
 
74
+ **YOUR FINAL OUTPUT MUST FOLLOW THIS STRUCTURE:**
75
+ ... (intermediate Action/Thought/Observation blocks)
76
 
77
  Thought: I have successfully gathered the required data. I will now output the final answer in the requested JSON format.
78
  Final Answer:
 
114
 
115
  json_output = final_response.model_dump_json(indent=2)
116
  # filename = f"companies_{self.industry_type}_{self.location}.json".replace(" ", "_").lower()
117
+ # filename = "uncleaned_companies.json"
118
+
119
  try:
120
+ # with open(f"/tmp/data/{filename}", 'w', encoding="utf-8") as f:
121
+ # f.write(json_output)
122
+ # print(f"Data successfully written to {filename}")
123
+ st.session_state.uncleaned_companies = json.loads(json_output)
124
  except Exception as e:
125
+ print(f"Error writing to session state: {str(e)}")
126
 
127
  return final_response
src/services/session_utils.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import uuid
2
+ import streamlit as st
3
+ import os
4
+ import time
5
+ import sys
6
+ import shutil
7
+
8
+ MAX_DIR_AGE_SECONDS = 1800
9
+
10
+ def get_session_id():
11
+ if 'session_id' not in st.session_state:
12
+ st.session_state.session_id = str(uuid.uuid4())
13
+ return st.session_state.session_id
14
+
15
+
16
+ def cleanup_stale_directories(base_dir: str):
17
+ """Deletes directories under base_dir older than MAX_DIR_AGE_SECONDS."""
18
+ now = time.time()
19
+ for item_name in os.listdir(base_dir):
20
+ item_path = os.path.join(base_dir, item_name)
21
+
22
+ if os.path.isdir(item_path):
23
+ try:
24
+ mtime = os.path.getmtime(item_path)
25
+ if (now - mtime) > MAX_DIR_AGE_SECONDS:
26
+ shutil.rmtree(item_path)
27
+ print(f"Cleaned up stale directory: {item_path}", file=sys.stderr)
28
+ except Exception as e:
29
+ print(f"Error cleaning directory {item_path}: {e}", file=sys.stderr)
30
+ pass
31
+
32
+ def get_session_temp_dir(main_data_dir):
33
+ cleanup_stale_directories(main_data_dir)
34
+ session_id = get_session_id()
35
+ temp_dir = os.path.join(main_data_dir, session_id)
36
+ os.makedirs(temp_dir, exist_ok=True)
37
+ return temp_dir