Lebossoti committed
Commit e54d8fa · verified · 1 Parent(s): f91e546

Create app.py

Files changed (1): app.py (+820, -0)
app.py ADDED
@@ -0,0 +1,820 @@
+ import streamlit as st
+ import pandas as pd
+ import requests
+ import openai
+ import tabula
+ import time
+ import json
+ import os
+ from datetime import datetime
+ from urllib.parse import urlparse
+ from bs4 import BeautifulSoup
+ from requests.exceptions import RequestException
+ from selenium import webdriver
+ from selenium.webdriver.chrome.service import Service
+ from selenium.webdriver.common.by import By
+ from selenium.common.exceptions import WebDriverException
+ from webdriver_manager.chrome import ChromeDriverManager
+ from tqdm import tqdm
+ from streamlit.runtime.scriptrunner import RerunException, RerunData
+
+
+ # API credentials are read from environment variables; secrets must never be hardcoded in source
+ openai.api_key = os.getenv("OPENAI_API_KEY")
+
+ # Google Custom Search API credentials
+ API_KEY = os.getenv("GOOGLE_API_KEY")
+ CSE_ID = os.getenv("GOOGLE_CSE_ID")
+
+ # Initialize session state
+ if 'processed_df' not in st.session_state:
+     st.session_state.processed_df = None
+ if 'saved_path' not in st.session_state:
+     st.session_state.saved_path = None
+ if 'start_step' not in st.session_state:
+     st.session_state.start_step = "Step 1: Upload & Process Raw Data"
+
+ # Function to read an input file (PDF, Excel, CSV) into a single DataFrame
+ def read_input_file(file):
+     if file.name.endswith('.pdf'):
+         tables = tabula.read_pdf(file, pages='all', multiple_tables=True)
+         df = pd.concat(tables, axis=0, ignore_index=True)
+     elif file.name.endswith('.xlsx') or file.name.endswith('.xls'):
+         df = pd.read_excel(file, sheet_name=None)
+         df = pd.concat(df.values(), ignore_index=True)
+     elif file.name.endswith('.csv'):
+         df = pd.read_csv(file, delimiter=';', on_bad_lines='skip')
+     else:
+         st.error("Unsupported file format!")
+         return None
+     return df
+
+ def query_openai_api(prompt):
+     try:
+         response = openai.ChatCompletion.create(
+             model="gpt-4o",
+             messages=[{"role": "system", "content": "You are a helpful assistant."},
+                       {"role": "user", "content": prompt}],
+             max_tokens=2500,
+             temperature=0.1
+         )
+         responses = [choice['message']['content'].strip() for choice in response['choices']]
+         return responses
+     except Exception as e:
+         print(f"Error querying OpenAI API: {e}")
+         return []
+
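+ # Illustrative usage: query_openai_api("Summarize this page") returns a list with one
+ # string per completion choice (a single choice here), or [] when the request fails.
+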
+ def process_data(df):
+     if df is None or not isinstance(df, pd.DataFrame):
+         st.error("Invalid input: DataFrame is None or not a pandas DataFrame")
+         return None
+
+     prompt = f'''Here are my dataframe's columns: {list(df.columns)}.
+ Generate Python code to:
+ 1. Rename columns to: 'Name', 'City', 'contact_info', 'website'
+    (map from closest matching columns, use your judgment)
+ 2. Select only these four columns
+ 3. If any columns are missing, create them with NA values
+
+ Return ONLY the code as two lines:
+ - First line: df.rename() with all column mappings
+ - Second line: df[] with column selection
+ No explanations, no markdown, just the two lines of code.'''
+
+     api_response = query_openai_api(prompt)
+
+     if not api_response:
+         st.error("No response from OpenAI API")
+         return df
+
+     formatted_code = ""  # Keep defined so the except block can display it
+     try:
+         raw_code = api_response[0].strip()
+         code_lines = []
+         for line in raw_code.split('\n'):
+             line = line.strip()
+             if line and not line.startswith('```'):
+                 code_lines.append(line)
+
+         formatted_code = '\n'.join(code_lines[:2])
+         st.write("Generated Code:")
+         st.code(formatted_code)
+
+         exec_globals = {'pd': pd}
+         exec_locals = {'df': df.copy()}
+         exec(formatted_code, exec_globals, exec_locals)
+
+         processed_df = exec_locals['df']
+         required_columns = ['Name', 'City', 'contact_info', 'website']
+         missing_cols = [col for col in required_columns if col not in processed_df.columns]
+
+         if missing_cols:
+             for col in missing_cols:
+                 processed_df[col] = pd.NA
+             processed_df = processed_df[required_columns]
+             st.warning(f"Added missing columns: {missing_cols}")
+
+         processed_df.drop_duplicates(inplace=True)
+         processed_df.reset_index(drop=True, inplace=True)
+         return processed_df
+
+     except Exception as e:
+         st.error(f"Error executing generated code: {str(e)}")
+         st.error("Generated code that failed:")
+         st.code(formatted_code)
+         return df
+
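+ # For illustration only (the real mapping depends on the uploaded columns), the model is
+ # expected to return two lines shaped like:
+ #   df = df.rename(columns={'Company': 'Name', 'Town': 'City', 'Phone': 'contact_info', 'URL': 'website'})
+ #   df = df[['Name', 'City', 'contact_info', 'website']]
+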
+ def google_search(query, api_key, cse_id, **kwargs):
+     url = 'https://www.googleapis.com/customsearch/v1'
+     params = {'q': query, 'key': api_key, 'cx': cse_id, **kwargs}
+     response = requests.get(url, params=params)
+     response.raise_for_status()
+     results = response.json()
+     return results.get('items', [])
+
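+ # Illustrative call (names are hypothetical): google_search("Acme AG website Zurich",
+ # API_KEY, CSE_ID, num=5) returns the parsed 'items' list; each item carries 'title',
+ # 'link' and 'snippet' fields per the Custom Search JSON API.
+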
+ def score_domain(link, company_name):
+     if not link:
+         return -1
+     parsed = urlparse(link)
+     domain = parsed.netloc.lower()
+     path = parsed.path.lower()
+     core_name = company_name.split()[0].lower()
+
+     score = 0
+     if f"www.{core_name}" in domain:
+         return 100
+     if core_name in domain:
+         score += 5
+     if path == "/" or path == "":
+         score += 5
+     elif len(path.strip("/").split("/")) == 1:
+         score += 2
+     score -= domain.count(".")
+     return score
+
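+ # Worked example (hypothetical names): score_domain("https://www.acme.ch/", "Acme Capital")
+ # short-circuits to 100 because "www.acme" appears in the domain, while
+ # score_domain("https://jobs.example.com/acme", "Acme Capital") scores 0 + 2 - 2 = 0
+ # (one path segment, no "acme" in the domain, two dots in the domain).
+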
+ def add_google_links_to_df(df, start_index=0, sleep_time=1):
+     for i in range(start_index, len(df)):
+         row = df.iloc[i]
+         if pd.isna(row['website']):
+             query = row['Name'] + ' website' + ' ' + row['City']
+             print(f"Row {i} - Fetching link for: {query}")
+
+             try:
+                 items = google_search(query, API_KEY, CSE_ID)
+                 best_link = None
+                 best_score = -float('inf')
+
+                 for item in items:
+                     potential_link = item.get('link')
+                     score = score_domain(potential_link, row['Name'])
+                     if score > best_score:
+                         best_link = potential_link
+                         best_score = score
+                     if best_score > 90:
+                         print(f"Match found! Title: {item.get('title')}")
+                         print(f"Link: {best_link}")
+                         print(f"Score: {best_score}")
+                         df.at[i, 'website'] = best_link
+                         break
+
+                 print(f"Best link for row {i}: {best_link} with score {best_score}")
+                 df.at[i, 'website'] = best_link if best_link else pd.NA
+                 time.sleep(sleep_time)
+
+             except requests.exceptions.HTTPError as e:
+                 # Often a quota/rate-limit error once the daily Custom Search allowance is
+                 # used up; stop cleanly so the partial results can still be saved.
+                 print(f"HTTP Error occurred: {e}")
+                 break
+     return df
+
+ # Step 3 functions
+ def treat_link(url):
+     if pd.isna(url):
+         return None
+     elif url.startswith("http://www."):
+         return url.replace("http://www.", "https://www.")
+     elif url.startswith("http://"):
+         return url.replace("http://", "https://")
+     elif url.startswith("www."):
+         return "https://" + url
+     elif url.startswith("https://"):
+         return url
+     else:
+         return "https://www." + url
+
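+ # Example normalizations (illustrative inputs): treat_link("www.acme.ch") -> "https://www.acme.ch";
+ # treat_link("http://acme.ch/team") -> "https://acme.ch/team"; treat_link("acme.ch") -> "https://www.acme.ch".
+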
+ def get_relevant_links(url):
+     relevant_links = []
+     links = []
+
+     # First attempt using requests
+     try:
+         headers = {
+             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
+         }
+         response = requests.get(url, headers=headers, timeout=3)
+         response.raise_for_status()  # Raises an exception for 4xx/5xx responses
+
+         soup = BeautifulSoup(response.text, "html.parser")
+         links = soup.find_all("a", href=True)
+
+     except RequestException as e:
+         print(f"Error with requests: {e}")
+         links = None  # Trigger the Selenium fallback
+
+     # If no links were retrieved, fall back to Selenium
+     if not links:
+         print("Falling back to Selenium...")
+         try:
+             # Set up the Chrome driver (ensure ChromeDriver is available)
+             driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
+             driver.get(url)
+
+             # Wait for the page to load (adjust if necessary)
+             driver.implicitly_wait(3)
+
+             # Extract all anchor tags using Selenium
+             selenium_links = driver.find_elements(By.TAG_NAME, 'a')
+             selenium_links = list(set(selenium_links))  # Remove duplicates
+
+             # Filter and collect relevant links
+             for link in selenium_links:
+                 href = link.get_attribute('href')
+                 if href:
+                     if any(keyword in href.lower() or keyword in link.text.lower() for keyword in ['a-propos', 'board', 'portrait', 'portrat', 'ueber_uns', 'About', 'presentation', 'about', 'profil', 'kontakt', 'famille', 'ueber-uns', 'contact', 'team', 'members', 'equipe', 'about-us', 'house', 'who-we-are', 'our-experts', 'company', 'board-of-directors', 'présentation', 'à-propos', 'membres', 'équipe', 'nostri-esperti', 'chi-siamo', 'consiglio-di-amministrazione', 'people']):
+                         relevant_links.append(href)
+
+             driver.quit()  # Close the browser after scraping
+
+         except WebDriverException as e:
+             print(f"Error with Selenium (WebDriverException): {e}")
+             relevant_links = []  # Empty list in case of failure
+
+     else:
+         # Links were retrieved using requests
+         links = list(set(links))  # Remove duplicates
+         for link in links:
+             if any(keyword in link.get('href').lower() or keyword in link.text.lower() for keyword in ['a-propos', 'board', 'portrait', 'portrat', 'presentation', 'About', 'about', 'kontakt', 'profil', 'famille', 'ueber-uns', 'ueber_uns', 'contact', 'team', 'members', 'equipe', 'about-us', 'la-maison', 'gouvernance', 'who-we-are', 'company', 'our-experts', 'board-of-directors', 'the-company', 'people']):
+                 relevant_links.append(link['href'])
+
+     # Remove any duplicates and return the relevant links
+     relevant_links = list(set(relevant_links))
+     print(f"Relevant links found for {url}: {relevant_links}")
+     if len(relevant_links) == 0:
+         # Return the base URL as a one-element list so callers can always iterate over links
+         return [url]
+     return relevant_links
+
+ def filter_links(link_dict):
+     # Define priority categories
+     team_related_keywords = ['team', 'portrait', 'portrat', 'board', 'members', 'equipe', 'about', 'our-experts', 'board-of-directors', 'famille', 'la-maison', 'gouvernance', 'presentation', 'membres', 'équipe', 'nostri-esperti', 'chi-siamo', 'consiglio-di-amministrazione', 'profil', 'people']
+     about_related_keywords = ['About', 'a-propos', 'about', 'about-us', 'the-company', 'ueber-uns', 'ueber_uns', 'who-we-are', 'présentation', 'profil', 'à-propos', 'a-proposito', 'company']
+     contact_related_keywords = ['kontakt', 'contact']
+
+     filtered_dict = {}
+
+     for key, links in link_dict.items():
+         # Create empty lists for each category
+         team_links = []
+         about_links = []
+         contact_links = []
+
+         # Classify the links by category
+         for link in links:
+             if any(keyword in link for keyword in team_related_keywords):
+                 team_links.append(link)
+             elif any(keyword in link for keyword in about_related_keywords):
+                 about_links.append(link)
+             elif any(keyword in link for keyword in contact_related_keywords):
+                 contact_links.append(link)
+
+         # Prioritize team links, then about links, then contact links
+         if team_links:
+             # Keep only the shortest team-related link
+             filtered_dict[key] = min(team_links, key=len)
+         elif about_links:
+             filtered_dict[key] = about_links[0]  # Keep only the first about-related link
+         elif contact_links:
+             filtered_dict[key] = contact_links[0]  # Keep only the first contact-related link
+         else:
+             filtered_dict[key] = key  # No matches: fall back to the base URL itself
+
+     return filtered_dict
+
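+ # Illustration: {"https://acme.ch": ["https://acme.ch/team", "https://acme.ch/contact"]}
+ # filters to {"https://acme.ch": "https://acme.ch/team"}, because team pages outrank
+ # about and contact pages in the priority order above.
+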
+ def get_jina(url):
+     # Route the page through the r.jina.ai reader proxy, which returns a plain-text /
+     # markdown rendering of the page. The documented reader form keeps the full URL,
+     # scheme included: https://r.jina.ai/https://example.com/team
+     return 'https://r.jina.ai/' + url
+
+
+ # Register tqdm with pandas so progress bars can be used with DataFrame.apply
+ tqdm.pandas()
+
+
+ def apply_pipeline(row):
+     print(f"Processing row: {row['Name']}")
+     base_url = row['website']
+     # Normalize the URL and reduce it to its scheme + host
+     base_url = treat_link(base_url)
+     parsed_url = urlparse(base_url)
+     base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
+
+     relevant_links = get_relevant_links(base_url)
+     print(f"Relevant links for {base_url}:")
+     print(relevant_links)
+     # Resolve relative links against the base URL
+     relevant_links = [base_url + link if link.startswith('/')
+                       else link if link.startswith('https://')
+                       else base_url + '/' + link
+                       for link in relevant_links]
+
+     # Keep the single best link per site
+     filtered_links = filter_links({base_url: relevant_links})
+
+     # If no link was found, fall back to the base URL
+     if not filtered_links.get(base_url):
+         row['Processed_Links'] = get_jina(base_url)
+     else:
+         chosen = filtered_links[base_url]
+         print(f"Chosen link for {base_url}: {get_jina(chosen)}")
+         row['Processed_Links'] = get_jina(chosen)
+
+     return row
+
+
+ def get_text(url):
+     try:
+         response = requests.get(url)
+         response.raise_for_status()
+         soup = BeautifulSoup(response.text, "html.parser")
+         text = soup.get_text()
+         return text
+     except requests.exceptions.RequestException as e:
+         print(f"Error with requests: {e}")
+         return None
+
+ def process_in_chunks(df, chunk_size, output_file):
+     first_chunk = not os.path.exists(output_file)
+     for start in range(0, len(df), chunk_size):
+         chunk = df.iloc[start:start + chunk_size].copy()
+         chunk['Text'] = chunk['Processed_Links'].apply(get_text)
+         time.sleep(1)
+         df.loc[start:start + chunk_size - 1, 'Text'] = chunk['Text']
+         if first_chunk:
+             chunk.to_csv(output_file, mode='w', index=False, header=True)
+             first_chunk = False
+         else:
+             chunk.to_csv(output_file, mode='a', index=False, header=False)
+         print(f"Processed chunk {start // chunk_size + 1} and saved.")
+     return df
+
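+ # Each chunk is appended to the output CSV as soon as it is scraped, so an interrupted
+ # run loses at most chunk_size rows of fetched text and the partial results survive on disk.
+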
+ def step3(df):
+     st.write("Starting Step 3 processing...")
+
+     # Create progress bar
+     progress_bar = st.progress(0)
+     status_text = st.empty()
+
+     df = df.apply(apply_pipeline, axis=1)
+     progress_bar.progress(50)
+
+     # Extract text in chunks
+     status_text.text("Step 4/4: Extracting text from websites...")
+     os.makedirs("processed_data", exist_ok=True)
+     output_file = "processed_data/step3_output.csv"
+     df = process_in_chunks(df, chunk_size=5, output_file=output_file)
+     progress_bar.progress(100)
+
+     status_text.text("Processing complete!")
+     time.sleep(1)
+     status_text.empty()
+
+     return df
+
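+ # step3 leaves the DataFrame with a 'Processed_Links' column (reader-proxy URLs) and a
+ # 'Text' column (the scraped page text), mirrored to processed_data/step3_output.csv.
+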
+ def count_closing_braces_between_companies(input_string):
+     first_company_pos = input_string.find('"company"')
+     if first_company_pos == -1:
+         return 0  # "company" not found
+
+     second_company_pos = input_string.find('"company"', first_company_pos + 1)
+     if second_company_pos == -1:
+         return 0  # Only one "company" found
+
+     substring_between = input_string[first_company_pos:second_company_pos]
+
+     closing_braces_count = substring_between.count('}')
+
+     return closing_braces_count
+
+ def fix_incomplete_json(json_input):
+     json_clean = json_input.strip()
+     if json_clean.endswith('}]'):
+         return json_clean
+
+     m = count_closing_braces_between_companies(json_clean)
+     if m == 2:
+         last_valid_index = -1
+         last_brace = 0
+
+         for i in range(len(json_clean) - 1, 0, -1):
+             if json_clean[i] == '}':
+                 if last_brace != 0:
+                     last_valid_index = last_brace
+                     break
+                 else:
+                     last_brace = i
+             if json_clean[i] == '{':
+                 last_brace = 0
+
+         if last_valid_index != -1:
+             json_clean = json_clean[:last_valid_index + 1] + ']'
+     else:
+         last_valid_index = json_clean.rfind('}')
+
+         if last_valid_index != -1:
+             json_clean = json_clean[:last_valid_index + 1] + ']'
+
+     return json_clean
+
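+ # Illustration: a response truncated mid-object, e.g. '[{"company": {...}}, {"company": {"na',
+ # is cut back after its last fully closed object and re-terminated with ']', yielding
+ # '[{"company": {...}}]'.
+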
+ def json_to_pandas(json_input):
+     lines = json_input.strip().splitlines()
+
+     # Strip surrounding markdown code fences, if present
+     if lines and lines[0].startswith("```"):
+         lines = lines[1:]
+     if lines and lines[-1].startswith("```"):
+         lines = lines[:-1]
+
+     json_clean = "\n".join(lines)
+
+     try:
+         data = json.loads(json_clean)
+     except json.JSONDecodeError:
+         # Repair a truncated response and retry
+         json_clean = fix_incomplete_json(json_clean)
+         data = json.loads(json_clean)
+
+     if isinstance(data, dict):
+         data = [data]
+
+     return pd.json_normalize(data)
+
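+ # Example: '[{"company": {"name": "Acme", "team_member_name": "J. Doe"}}]' (hypothetical)
+ # normalizes to dotted columns 'company.name' and 'company.team_member_name', which is
+ # why the Step 4 merging prompt references names like 'company.name'.
+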
+ def save_df(df, tag="processed"):
+     os.makedirs("processed_data", exist_ok=True)
+     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+     filename = f"{tag}_{timestamp}.csv"
+     save_path = os.path.join("processed_data", filename)
+     df.to_csv(save_path, index=False)
+     st.session_state.processed_df = df
+     st.session_state.saved_path = save_path
+     return save_path
+
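+ # Illustrative output path (timestamp is hypothetical): save_df(df, tag="with_links")
+ # writes processed_data/with_links_20250101_120000.csv and records both the DataFrame
+ # and the path in session state.
+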
+ # Streamlit UI
+ st.title("Data Processing Pipeline")
+
+ # Single source of truth for step labels; the original duplicated this list and used a
+ # mismatched Step 3 label ("Execute Next Processing Step"), so the Step 3 branch never matched.
+ PIPELINE_STEPS = [
+     "Step 1: Upload & Process Raw Data",
+     "Step 2: Add Website Links (CSV only)",
+     "Step 3: Extract Contact Info (CSV only)",
+     "Step 4: Process and Combine Team Info (CSV only)",
+     "Step 5: Fetch LinkedIn URLs (CSV only)"
+ ]
+
+ # Sidebar: choose the starting step
+ with st.sidebar:
+     st.header("Pipeline Options")
+     start_step = st.selectbox(
+         "Select Starting Step",
+         PIPELINE_STEPS,
+         index=PIPELINE_STEPS.index(st.session_state.start_step)
+     )
+
+ st.session_state.start_step = start_step
+
+ # Step 1
+ if st.session_state.start_step == "Step 1: Upload & Process Raw Data":
+     st.sidebar.markdown("Upload a raw PDF, CSV, or Excel file to start processing.")
+     uploaded_file = st.sidebar.file_uploader("Choose a file", type=['pdf', 'csv', 'xlsx'])
+
+     if uploaded_file:
+         st.success("File uploaded successfully!")
+         st.write(f"Filename: {uploaded_file.name}")
+         df = read_input_file(uploaded_file)
+         if df is not None:
+             st.subheader("Initial Data Preview")
+             st.dataframe(df.head())
+             if st.button("Process Data"):
+                 with st.spinner("Processing data..."):
+                     processed_df = process_data(df)
+                     if processed_df is not None:
+                         save_path = save_df(processed_df, tag="processed")
+                         st.success("Data processing complete!")
+                         st.session_state.start_step = "Step 2: Add Website Links (CSV only)"
+                         raise RerunException(RerunData())
+                     else:
+                         st.error("Data processing failed")
+         else:
+             st.error("Failed to read the uploaded file")
+     else:
+         st.warning("Please upload a file to begin Step 1")
+
+ # Step 2
+ elif st.session_state.start_step == "Step 2: Add Website Links (CSV only)":
+     st.sidebar.markdown("Upload a CSV of your initial dataframe to add website links.")
+     if st.session_state.processed_df is not None:
+         df = st.session_state.processed_df
+     else:
+         uploaded_csv = st.sidebar.file_uploader("Upload CSV", type=['csv'])
+         if uploaded_csv:
+             try:
+                 df = pd.read_csv(uploaded_csv)
+                 st.session_state.processed_df = df
+             except Exception as e:
+                 st.error(f"Error reading CSV: {e}")
+                 df = None
+         else:
+             st.warning("Please upload a CSV file to begin Step 2")
+             df = None
+
+     if df is not None:
+         st.subheader("Data Preview (before adding links)")
+         st.dataframe(df.head())
+         if st.button("Step 2: Add Website Links"):
+             with st.spinner("Searching for websites..."):
+                 df_with_links = add_google_links_to_df(df)
+                 save_path = save_df(df_with_links, tag="with_links")
+                 st.success("Website links added!")
+                 st.session_state.start_step = "Step 3: Extract Contact Info (CSV only)"
+                 raise RerunException(RerunData())
+
+ # Step 3
+ elif st.session_state.start_step == "Step 3: Extract Contact Info (CSV only)":
+     st.sidebar.markdown("Upload a CSV with websites already added to extract contact info.")
+     if st.session_state.processed_df is not None:
+         df = st.session_state.processed_df
+     else:
+         uploaded_csv = st.sidebar.file_uploader("Upload CSV", type=['csv'])
+         if uploaded_csv:
+             try:
+                 df = pd.read_csv(uploaded_csv)
+                 st.session_state.processed_df = df
+             except Exception as e:
+                 st.error(f"Error reading CSV: {e}")
+                 df = None
+         else:
+             st.warning("Please upload a CSV file to begin Step 3")
+             df = None
+
+     if df is not None:
+         st.subheader("Data Preview (before Step 3)")
+         st.dataframe(df.head())
+
+         st.warning("Note: Step 3 will:")
+         st.markdown("- Normalize website URLs to ensure proper formatting")
+         st.markdown("- Find relevant contact/about pages")
+         st.markdown("- Extract text content from these pages")
+         st.markdown("- This process may take several minutes")
+
+         if st.button("Step 3: Extract Page Content"):
+             with st.spinner("Extracting (this may take several minutes)..."):
+                 df_next = step3(df)
+                 save_path = save_df(df_next, tag="step3")
+                 st.success("Step 3 complete!")
+                 st.subheader("Processed Data Preview")
+                 st.dataframe(df_next.head())
+
+                 # Offer a download button
+                 csv = df_next.to_csv(index=False).encode('utf-8')
+                 st.download_button(
+                     label="Download Processed Data",
+                     data=csv,
+                     file_name='processed_data_with_text.csv',
+                     mime='text/csv'
+                 )
+
+ # Step 4: Process and Combine Team Info
+ elif st.session_state.start_step == "Step 4: Process and Combine Team Info (CSV only)":
+     st.sidebar.markdown("Upload a CSV to process and combine team information.")
+     if st.session_state.processed_df is not None:
+         df = st.session_state.processed_df
+     else:
+         uploaded_csv = st.sidebar.file_uploader("Upload CSV", type=['csv'])
+         if uploaded_csv:
+             try:
+                 df = pd.read_csv(uploaded_csv)
+                 st.session_state.processed_df = df
+             except Exception as e:
+                 st.error(f"Error reading CSV: {e}")
+                 df = None
+         else:
+             st.warning("Please upload a CSV to begin Step 4")
+             df = None
+
+     if df is not None:
+         st.subheader("Data Preview (before combining)")
+         st.dataframe(df.head())
+         if st.button("Execute Step 4: Combine and Process"):
+             with st.spinner("Running team info combination..."):
+                 # Extract company and team member information from each scraped page
+                 final_res = pd.DataFrame()
+                 for i, markdown_input in enumerate(df['Text']):
+                     try:
+                         prompt = f"""
+ Extract company information from the following markdown:
+ {markdown_input}
+ For EACH MEMBER OF THE COMPANY, provide the following information in JSON format based on the structure below:
+
+ - **company**:
+   - **name**: Name of the company.
+   - **team_member_name**: The name of the team member.
+   - **position**: The role or position of the team member in the company.
+   - **contact_info**: Contact information of the team member, including:
+     - **email**: The email address.
+     - **phone**: The phone number.
+   - **company_description**: A brief, factual, and objective description of the company (maximum 5 words).
+
+ Make sure to follow this structure exactly. If some info is missing, keep the key in the JSON with the value `None`.
+ """
+
+                         res = query_openai_api(prompt)
+                         text_fixed = res[0]
+
+                         # Convert the JSON result into a pandas DataFrame
+                         df_json = json_to_pandas(text_fixed)
+
+                         # Append the current result to the final DataFrame
+                         final_res = pd.concat([final_res, df_json], ignore_index=True)
+
+                     except Exception as e:
+                         print(f"Error processing markdown {i + 1}: {e}")
+
+                 st.write("Extracted DataFrame:")
+                 st.write(final_res.head())
+
+                 prompt2 = f'''Here is final_res.head(): {final_res.head()}. I want to merge the columns based on their names.
+
+ Always combine company and team_member_name:
+ company should merge the values from columns that seem related to the company name (like company.name).
+ team_member_name should merge the values from columns that seem related to the team member name (like company.team_member_name, name).
+ For other columns:
+ Based on the column headers, the script should identify and merge the appropriate columns into the target ones.
+ The merging should prioritize non-null values, using combine_first() or similar logic in pandas.
+ If no matching columns are found for a target, skip the merging or leave the target as None or empty.
+ Example target columns might include email, phone, position, company_description, etc.
+ From the column name, just try to extract the simplest name possible.
+ I need the API to:
+ Identify the relevant columns by their names.
+ Merge the columns dynamically based on similarity to target column names.
+ Handle missing columns gracefully, not causing any errors if a source column is missing.
+ Please provide Python code that does the above. I want only code, no introduction, no conclusion, only code.'''
+
+                 # Strip the markdown fences and extract the Python code
+                 res2 = query_openai_api(prompt2)
+                 code_lines2 = [l for l in res2[0].splitlines() if not l.strip().startswith("```")]
+                 formatted_code2 = "\n".join(code_lines2).strip()
+
+                 # Print the formatted code to verify
+                 print("Formatted Code:\n", formatted_code2)
+                 # Execute the formatted code
+                 try:
+                     exec(formatted_code2)
+                     print("Code executed successfully.")
+                 except Exception as e:
+                     print(f"Error executing code: {e}")
+                 # Display the modified DataFrame
+                 st.write("Final DataFrame:")
+                 st.write(final_res.head())
+
+                 # Save and download the final result
+                 save_path = save_df(final_res, tag="final_team_info")
+                 st.success("Step 4 complete: Combined team info ready!")
+                 st.download_button(
+                     label="Download Final CSV",
+                     data=open(save_path, 'rb'),
+                     file_name=os.path.basename(save_path),
+                     mime='text/csv'
+                 )
+
+         # Button to move straight on to Step 5
+         if st.button("Proceed to Step 5: Fetch LinkedIn URLs"):
+             st.session_state.start_step = "Step 5: Fetch LinkedIn URLs (CSV only)"
+             st.experimental_rerun()
+
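+ # Note: the exec() call in Step 4 relies on the generated snippet rebinding `final_res`
+ # at module scope; an illustrative merge would collapse 'company.name' into 'company' and
+ # 'company.team_member_name' into 'team_member_name' via combine_first().
+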
+ # Step 5: Fetch LinkedIn URLs
+ elif st.session_state.start_step == "Step 5: Fetch LinkedIn URLs (CSV only)":
+     st.sidebar.markdown("Upload a CSV to add LinkedIn URLs.")
+
+     if st.session_state.processed_df is not None:
+         df = st.session_state.processed_df.copy()
+     else:
+         uploaded_csv = st.sidebar.file_uploader("Upload CSV for Step 5", type=['csv'])
+         if uploaded_csv:
+             df = pd.read_csv(uploaded_csv)
+             st.session_state.processed_df = df
+         else:
+             df = None
+             st.warning("Please upload a CSV to begin Step 5.")
+
+     if df is not None:
+         st.subheader("Data Preview (before fetching LinkedIn URLs)")
+         st.dataframe(df.head())
+
+         if st.button("Execute Step 5: Fetch LinkedIn URLs"):
+             with st.spinner("Fetching LinkedIn URLs..."):
+
+                 # Function to add LinkedIn links
+                 def add_linkedin_to_df(df, batch_size=10, sleep_time=0.2, output_file="linkedin_results.csv"):
+                     start_index = 0
+
+                     for i in range(start_index, len(df)):
+                         row = df.iloc[i]
+                         row_tn = row['team_member_name'] if pd.notna(row['team_member_name']) else " "
+                         row_cp = row['company'] if pd.notna(row['company']) else " "
+                         query = row_tn + " " + row_cp + " linkedin"
+
+                         st.write(f"Fetching link for: {query}")
+
+                         gs = google_search(query, API_KEY, CSE_ID)
+                         if gs:
+                             link = gs[0]['link']
+                         else:
+                             link = None
+                             st.warning(f"No results found for query: {query}")
+
+                         df.loc[i, 'linkedin'] = link
+
+                         time.sleep(sleep_time)
+
+                         # Persist progress after every batch and after the final row
+                         if (i + 1) % batch_size == 0 or i == len(df) - 1:
+                             df.to_csv(output_file, index=False)
+                             st.info(f"Batch {(i // batch_size) + 1} processed and saved.")
+
+                     return df
+
+                 # Execute LinkedIn URL fetching
+                 df_linkedin = add_linkedin_to_df(df, batch_size=10, sleep_time=0.2, output_file="linkedin_results.csv")
+
+                 prompt_3 = f'''Given the following list of job titles at investment-related companies, select only the positions that are relevant for contacting in the context of investor relations, investments, advisory, or general management.
+ Keep associates and senior-level positions.
+ Drop roles that are strictly non-investment or operational, such as marketing, HR, middle office, project management, legal, or talent operations.
+ Return exactly two Python lists named kept_positions and dropped_positions.
+ Don't include any other text or explanation, just the 2 lists.
+ {df_linkedin['position'].unique()}'''
+
+                 res_3 = query_openai_api(prompt_3)[0]
+
+                 # Strip the markdown fences from the generated code
+                 code_lines_3 = [l for l in res_3.splitlines() if not l.strip().startswith("```")]
+                 formatted_code_3 = "\n".join(code_lines_3).strip()
+
+                 # Execute the generated code; it is expected to define kept_positions
+                 try:
+                     exec(formatted_code_3)
+                     print("Code executed successfully.")
+                 except Exception as e:
+                     print(f"Error executing code: {e}")
+
+                 # Keep only rows whose position survived the filter
+                 df_linkedin = df_linkedin[df_linkedin['position'].isin(kept_positions)]
+
+                 save_path = save_df(df_linkedin, tag="final_with_linkedin")
+                 st.success("Step 5 complete: LinkedIn URLs fetched!")
+                 st.download_button(
+                     label="Download Final CSV with LinkedIn",
+                     data=open(save_path, 'rb'),
+                     file_name=os.path.basename(save_path),
+                     mime='text/csv'
+                 )