import os
import time
import json
from datetime import datetime
from urllib.parse import urlparse

import streamlit as st
import pandas as pd
import requests
from requests.exceptions import RequestException
import openai
import tabula
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.common.exceptions import WebDriverException
from webdriver_manager.chrome import ChromeDriverManager
from tqdm import tqdm
from streamlit.runtime.scriptrunner import RerunException, RerunData
from dotenv import load_dotenv

# Load environment variables from the .env file
load_dotenv()

# Retrieve the API keys
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
CSE_ID = os.getenv("GOOGLE_CSE_ID")
client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
# Initialize session state
if 'processed_df' not in st.session_state:
    st.session_state.processed_df = None
if 'saved_path' not in st.session_state:
    st.session_state.saved_path = None
if 'start_step' not in st.session_state:
    st.session_state.start_step = "Step 1: Upload & Process Raw Data"
# Read an uploaded file (PDF, Excel, or CSV) into a single DataFrame
def read_input_file(file):
    if file.name.endswith('.pdf'):
        # Extract every table from the PDF and stack them vertically
        tables = tabula.read_pdf(file, pages='all', multiple_tables=True)
        df = pd.concat(tables, axis=0, ignore_index=True)
    elif file.name.endswith('.xlsx') or file.name.endswith('.xls'):
        # Read all sheets and concatenate them
        df = pd.read_excel(file, sheet_name=None)
        df = pd.concat(df.values(), ignore_index=True)
    elif file.name.endswith('.csv'):
        # Assumes semicolon-delimited CSVs; malformed rows are skipped
        df = pd.read_csv(file, delimiter=';', on_bad_lines='skip')
    else:
        st.error("Unsupported file format!")
        return None
    return df
def query_openai_api(prompt):
    try:
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": prompt}
            ],
            max_tokens=2500,
            temperature=0.1
        )
        return [response.choices[0].message.content.strip()]
    except Exception as e:
        st.error(f"Error querying OpenAI API: {e}")
        return []
def process_data(df):
    if df is None or not isinstance(df, pd.DataFrame):
        st.error("Invalid input: DataFrame is None or not a pandas DataFrame")
        return None
    prompt = f'''Here are my dataframe columns: {list(df.columns)}.
Generate Python code to:
1. Rename columns to: 'Name', 'City', 'contact_info', 'website'
   (map from the closest matching columns, use your judgment).
   The columns MUST BE RENAMED.
2. Select only these four columns.
3. If any columns are missing, create them with NA values.
Return ONLY the code as two lines:
- First line: df.rename() with all column mappings
- Second line: df = df.reindex(columns=['Name', 'City', 'contact_info', 'website'])
No explanations, no markdown, just the two lines of code.'''
    api_response = query_openai_api(prompt)
    if not api_response:
        st.error("No response from OpenAI API")
        time.sleep(3)
        return df
    formatted_code = ""
    try:
        raw_code = api_response[0].strip()
        # Drop markdown fences the model may add despite instructions
        code_lines = []
        for line in raw_code.split('\n'):
            line = line.strip()
            if line and not line.startswith('```'):
                code_lines.append(line)
        formatted_code = '\n'.join(code_lines[:2])
        st.code(formatted_code)
        # Run the generated code against a copy so a failure can't corrupt df
        exec_globals = {'pd': pd}
        exec_locals = {'df': df.copy()}
        exec(formatted_code, exec_globals, exec_locals)
        processed_df = exec_locals['df']
        required_columns = ['Name', 'City', 'contact_info', 'website']
        missing_cols = [col for col in required_columns if col not in processed_df.columns]
        if missing_cols:
            for col in missing_cols:
                processed_df[col] = pd.NA
            st.warning(f"Added missing columns: {missing_cols}")
        processed_df = processed_df[required_columns]
        processed_df.drop_duplicates(inplace=True)
        processed_df.reset_index(drop=True, inplace=True)
        return processed_df
    except Exception as e:
        st.error(f"Error executing generated code: {str(e)}")
        st.error("Generated code that failed:")
        st.code(formatted_code)
        return df
def google_search(query, api_key, cse_id, **kwargs):
    url = 'https://www.googleapis.com/customsearch/v1'
    params = {'q': query, 'key': api_key, 'cx': cse_id, **kwargs}
    response = requests.get(url, params=params)
    response.raise_for_status()
    results = response.json()
    return results.get('items', [])
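
# Illustrative usage (not executed) -- `num` is an optional Custom Search
# parameter capping the result count; the return value is the raw list of
# result items, each a dict with keys such as 'link' and 'title':
#   items = google_search("Acme Capital website Zurich", GOOGLE_API_KEY, CSE_ID, num=5)
#   links = [item['link'] for item in items]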
# Heuristic score for how likely a search-result link is the company's homepage
def score_domain(link, company_name):
    if not link:
        return -1
    parsed = urlparse(link)
    domain = parsed.netloc.lower()
    path = parsed.path.lower()
    core_name = company_name.split()[0].lower()
    score = 0
    # Exact "www.<name>" domains are accepted immediately
    if f"www.{core_name}" in domain:
        return 100
    if core_name in domain:
        score += 5
    # Prefer homepages over deep pages
    if path == "/" or path == "":
        score += 5
    elif len(path.strip("/").split("/")) == 1:
        score += 2
    # Penalize subdomain-heavy hosts (e.g. directories, aggregators)
    score -= domain.count(".")
    return score
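
# Illustrative examples (not executed), derived from the rules above:
#   score_domain("https://www.acme.com/", "Acme Capital")             -> 100
#   score_domain("https://directory.example.com/acme", "Acme Capital") -> 0
#   (+2 for a single path segment, -2 for the two dots in the host)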
def add_google_links_to_df(df, start_index=0, sleep_time=1):
    for i in range(start_index, len(df)):
        row = df.iloc[i]
        if pd.isna(row['website']):
            # Guard against a missing City value when building the query
            city = row['City'] if pd.notna(row['City']) else ""
            query = f"{row['Name']} website {city}".strip()
            print(f"Row {i} - Fetching link for: {query}")
            try:
                items = google_search(query, GOOGLE_API_KEY, CSE_ID)
                best_link = None
                best_score = -float('inf')
                for item in items:
                    potential_link = item.get('link')
                    score = score_domain(potential_link, row['Name'])
                    if score > best_score:
                        best_link = potential_link
                        best_score = score
                    # A near-certain match: stop scanning further results
                    if best_score > 90:
                        print(f"Match found! Title: {item.get('title')}")
                        print(f"Link: {best_link}")
                        print(f"Score: {best_score}")
                        df.at[i, 'website'] = best_link
                        break
                print(f"Best link for row {i}: {best_link} with score {best_score}")
                df.at[i, 'website'] = best_link if best_link else pd.NA
                time.sleep(sleep_time)
            except requests.exceptions.HTTPError as e:
                # Typically a quota error; stop so progress so far is kept
                print(f"HTTP Error occurred: {e}")
                break
    return df
# Step 3 functions
def treat_link(url):
    # Normalize a stored URL to an https:// form
    if pd.isna(url):
        return None
    elif url.startswith("http://www."):
        return url.replace("http://www.", "https://www.")
    elif url.startswith("http://"):
        return url.replace("http://", "https://")
    elif url.startswith("www."):
        return "https://" + url
    elif url.startswith("https://"):
        return url
    else:
        return "https://www." + url
# Keywords that flag team/about/contact pages (shared by both fetch paths)
RELEVANT_KEYWORDS = [
    'a-propos', 'board', 'portrait', 'portrat', 'governance', 'gouvernance',
    'presentation', 'présentation', 'about', 'about-us', 'à-propos', 'profil',
    'kontakt', 'contact', 'famille', 'ueber-uns', 'ueber_uns', 'team',
    'members', 'membres', 'equipe', 'équipe', 'house', 'la-maison',
    'who-we-are', 'our-experts', 'company', 'the-company',
    'board-of-directors', 'nostri-esperti', 'chi-siamo',
    'consiglio-di-amministrazione', 'people'
]

def get_relevant_links(url):
    relevant_links = []
    links = []
    # First attempt: plain requests
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
        }
        response = requests.get(url, headers=headers, timeout=3)
        response.raise_for_status()  # Raises for 4xx/5xx responses
        soup = BeautifulSoup(response.text, "html.parser")
        links = soup.find_all("a", href=True)
    except RequestException as e:
        print(f"Error with requests: {e}")
        links = None  # Trigger the Selenium fallback
    if not links:
        # Fallback: render the page with Selenium (handles JS-heavy sites)
        print("Falling back to Selenium...")
        try:
            driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
            driver.get(url)
            driver.implicitly_wait(3)  # Wait for elements to load
            selenium_links = driver.find_elements(By.TAG_NAME, 'a')
            selenium_links = list(set(selenium_links))  # Remove duplicates
            for link in selenium_links:
                href = link.get_attribute('href')
                if href and any(kw in href.lower() or kw in link.text.lower()
                                for kw in RELEVANT_KEYWORDS):
                    relevant_links.append(href)
            driver.quit()  # Close the browser after scraping
        except WebDriverException as e:
            print(f"Error with Selenium (WebDriverException): {e}")
            relevant_links = []
    else:
        # Links were retrieved via requests
        links = list(set(links))  # Remove duplicates
        for link in links:
            if any(kw in link.get('href').lower() or kw in link.text.lower()
                   for kw in RELEVANT_KEYWORDS):
                relevant_links.append(link['href'])
    relevant_links = list(set(relevant_links))
    print(f"Relevant links found for {url}: {relevant_links}")
    if len(relevant_links) == 0:
        # Return the base URL as a one-element list so callers can always
        # iterate over links (the original returned a bare string here)
        return [url]
    return relevant_links
def filter_links(link_dict):
    # Priority categories: team pages beat about pages beat contact pages
    team_related_keywords = ['team', 'portrait', 'portrat', 'board', 'members', 'equipe', 'governance', 'gouvernance', 'our-experts', 'board-of-directors', 'famille', 'la-maison', 'presentation', 'membres', 'équipe', 'nostri-esperti', 'chi-siamo', 'consiglio-di-amministrazione', 'profil', 'people']
    about_related_keywords = ['a-propos', 'about', 'about-us', 'the-company', 'ueber-uns', 'ueber_uns', 'who-we-are', 'présentation', 'profil', 'à-propos', 'a-proposito', 'company']
    contact_related_keywords = ['kontakt', 'contact']
    filtered_dict = {}
    for key, links in link_dict.items():
        team_links = []
        about_links = []
        contact_links = []
        # Classify each link into the first matching category
        for link in links:
            if any(keyword in link for keyword in team_related_keywords):
                team_links.append(link)
            elif any(keyword in link for keyword in about_related_keywords):
                about_links.append(link)
            elif any(keyword in link for keyword in contact_related_keywords):
                contact_links.append(link)
        if team_links:
            # Keep only the shortest team-related link
            filtered_dict[key] = min(team_links, key=len)
        elif about_links:
            filtered_dict[key] = about_links[0]  # First about-related link
        elif contact_links:
            filtered_dict[key] = contact_links[0]  # First contact-related link
        else:
            filtered_dict[key] = key  # No matches: fall back to the base URL
    return filtered_dict
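
# Illustrative example (not executed): team pages win over about pages,
# and the shortest team link is kept:
#   filter_links({"https://x.com": ["https://x.com/about", "https://x.com/team"]})
#   -> {"https://x.com": "https://x.com/team"}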
def get_jina(url):
    # Prefix the URL with the r.jina.ai reader host; assumes the URL starts
    # with "https://" (8 characters), which treat_link guarantees
    return url[0:8] + 'r.jina.ai/' + url[8:]
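
# Illustrative example (not executed):
#   get_jina("https://example.com/team") -> "https://r.jina.ai/example.com/team"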
# Enable tqdm progress bars on pandas apply (via .progress_apply)
tqdm.pandas()
def apply_pipeline(row):
    print(f"Processing row: {row['Name']}")
    # Normalize the URL and reduce it to scheme + host
    base_url = treat_link(row['website'])
    if base_url is None:
        # No website found in Step 2; nothing to scrape
        row['Processed_Links'] = pd.NA
        return row
    parsed_url = urlparse(base_url)
    base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
    relevant_links = get_relevant_links(base_url)
    print(f"Relevant links for {base_url}:")
    print(relevant_links)
    # Resolve relative hrefs against the base URL
    relevant_links = [base_url + link if link.startswith('/')
                      else link if link.startswith(('http://', 'https://'))
                      else base_url + '/' + link
                      for link in relevant_links]
    # Pick the single best link per base URL (falls back to the base URL)
    filtered_links = filter_links({base_url: relevant_links})
    chosen = filtered_links.get(base_url, base_url)
    print(f"Chosen link for {base_url}:")
    print(get_jina(chosen))
    row['Processed_Links'] = get_jina(chosen)
    return row
def get_text(url):
    try:
        # A timeout keeps one hung request from stalling the whole pipeline
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        text = soup.get_text()
        return text
    except requests.exceptions.RequestException as e:
        print(f"Error with requests: {e}")
        return None
def process_in_chunks(df, chunk_size, output_file):
    # Fetch text chunk by chunk, appending each chunk to the output CSV so
    # partial progress survives a crash
    out_dir = os.path.dirname(output_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    first_chunk = not os.path.exists(output_file)
    for start in range(0, len(df), chunk_size):
        chunk = df.iloc[start:start + chunk_size].copy()  # copy avoids SettingWithCopyWarning
        chunk['Text'] = chunk['Processed_Links'].apply(get_text)
        time.sleep(1)
        # Label-based slice: assumes the default RangeIndex (inclusive bounds)
        df.loc[start:start + chunk_size - 1, 'Text'] = chunk['Text']
        if first_chunk:
            chunk.to_csv(output_file, mode='w', index=False, header=True)
            first_chunk = False
        else:
            chunk.to_csv(output_file, mode='a', index=False, header=False)
        print(f"Processed chunk {start // chunk_size + 1} and saved.")
    return df
def step3(df):
    st.write("Starting Step 3 processing...")
    # Create progress bar
    progress_bar = st.progress(0)
    status_text = st.empty()
    # Stage 1: find the best team/about/contact page per company
    status_text.text("Stage 1/2: Locating relevant pages...")
    df = df.progress_apply(apply_pipeline, axis=1)
    progress_bar.progress(50)
    # Stage 2: extract text in chunks
    status_text.text("Stage 2/2: Extracting text from websites...")
    output_file = "processed_data/step3_output.csv"
    df = process_in_chunks(df, chunk_size=5, output_file=output_file)
    progress_bar.progress(100)
    status_text.text("Processing complete!")
    time.sleep(1)
    status_text.empty()
    return df
def count_closing_braces_between_companies(input_string):
    # Count '}' between the first two '"company"' keys; this tells us how
    # deeply each record is nested, which fix_incomplete_json relies on
    first_company_pos = input_string.find('"company"')
    if first_company_pos == -1:
        return 0  # "company" not found
    second_company_pos = input_string.find('"company"', first_company_pos + 1)
    if second_company_pos == -1:
        return 0  # Only one "company" found
    substring_between = input_string[first_company_pos:second_company_pos]
    return substring_between.count('}')
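
# Illustrative example (not executed): two '}' close each nested record here:
#   s = '[{"company": {"name": "A"}}, {"company": {"name": "B"}}]'
#   count_closing_braces_between_companies(s) -> 2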
def fix_incomplete_json(json_input):
    # Repair a JSON array the model truncated mid-record by cutting back
    # to the last complete object and closing the array
    json_clean = json_input.strip()
    if json_clean.endswith('}]'):
        return json_clean  # Already a complete array
    m = count_closing_braces_between_companies(json_clean)
    if m == 2:
        # Records are nested one level deep: a complete record ends in '}}',
        # so scan backwards for the last adjacent pair of closing braces
        last_valid_index = -1
        last_brace = 0
        for i in range(len(json_clean) - 1, 0, -1):
            if json_clean[i] == '}':
                if last_brace != 0:
                    last_valid_index = last_brace
                    break
                else:
                    last_brace = i
            if json_clean[i] == '{':
                last_brace = 0
        if last_valid_index != -1:
            json_clean = json_clean[:last_valid_index + 1] + ']'
    else:
        # Flat records: cut at the last closing brace
        last_valid_index = json_clean.rfind('}')
        if last_valid_index != -1:
            json_clean = json_clean[:last_valid_index + 1] + ']'
    return json_clean
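
# Illustrative example (not executed): a truncated second record is dropped
# and the array is re-closed:
#   fix_incomplete_json('[{"company": {"name": "A"}}, {"company": {"name": "B')
#   -> '[{"company": {"name": "A"}}]'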
def json_to_pandas(json_input):
    # Strip optional markdown fences around the model's JSON output
    lines = json_input.strip().splitlines()
    if lines and lines[0].startswith("```"):
        lines = lines[1:]
    if lines and lines[-1].startswith("```"):
        lines = lines[:-1]
    json_clean = "\n".join(lines)
    try:
        data = json.loads(json_clean)
    except json.JSONDecodeError:
        # Try to salvage a truncated response before parsing again
        json_clean = fix_incomplete_json(json_clean)
        data = json.loads(json_clean)
    if isinstance(data, dict):
        data = [data]
    # Flatten nested objects into dot-separated columns (e.g. company.name)
    return pd.json_normalize(data)
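
# Illustrative example (not executed): nested keys become dot columns:
#   json_to_pandas('[{"company": {"name": "A", "team_member_name": "Jo"}}]')
#   -> DataFrame with columns ['company.name', 'company.team_member_name']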
def save_df(df, tag="processed"):
    # Persist the DataFrame under a timestamped name and remember it in session state
    os.makedirs("processed_data", exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"{tag}_{timestamp}.csv"
    save_path = os.path.join("processed_data", filename)
    df.to_csv(save_path, index=False)
    st.session_state.processed_df = df
    st.session_state.saved_path = save_path
    return save_path
# Streamlit UI
st.title("Data Processing Pipeline")

# Sidebar: choose starting step
STEP_OPTIONS = [
    "Step 1: Upload & Process Raw Data",
    "Step 2: Add Website Links (CSV only)",
    "Step 3: Extract Contact Info (CSV only)",
    "Step 4: Process and Combine Team Info (CSV only)",
    "Step 5: Fetch LinkedIn URLs (CSV only)"
]
with st.sidebar:
    st.header("Pipeline Options")
    start_step = st.selectbox(
        "Select Starting Step",
        STEP_OPTIONS,
        index=STEP_OPTIONS.index(st.session_state.start_step)
    )
    st.session_state.start_step = start_step
# Step 1
if st.session_state.start_step == "Step 1: Upload & Process Raw Data":
    st.sidebar.markdown("Upload a raw PDF, CSV, or Excel file to start processing.")
    uploaded_file = st.sidebar.file_uploader("Choose a file", type=['pdf', 'csv', 'xlsx'])
    if uploaded_file:
        st.success("File uploaded successfully!")
        st.write(f"Filename: {uploaded_file.name}")
        df = read_input_file(uploaded_file)
        if df is not None:
            st.subheader("Initial Data Preview")
            st.dataframe(df.head())
            if st.button("Process Data"):
                with st.spinner("Processing data..."):
                    processed_df = process_data(df)
                    if processed_df is not None:
                        save_path = save_df(processed_df, tag="processed")
                        st.success("Data processing complete!")
                        st.write("Processed data:")
                        st.write(processed_df.head())
                        st.session_state.start_step = "Step 2: Add Website Links (CSV only)"
                        raise RerunException(RerunData())
                    else:
                        st.error("Data processing failed")
        else:
            st.error("Failed to read the uploaded file")
    else:
        st.warning("Please upload a file to begin Step 1")
# Step 2
elif st.session_state.start_step == "Step 2: Add Website Links (CSV only)":
    st.sidebar.markdown("Upload a CSV of your initial dataframe to add website links.")
    if st.session_state.processed_df is not None:
        df = st.session_state.processed_df
    else:
        uploaded_csv = st.sidebar.file_uploader("Upload CSV", type=['csv'])
        if uploaded_csv:
            try:
                df = pd.read_csv(uploaded_csv)
                st.session_state.processed_df = df
            except Exception as e:
                st.error(f"Error reading CSV: {e}")
                df = None
        else:
            st.warning("Please upload a CSV file to begin Step 2")
            df = None
    if df is not None:
        st.subheader("Data Preview (before adding links)")
        st.dataframe(df.head())
        if st.button("Step 2: Add Website Links"):
            with st.spinner("Searching for websites..."):
                df_with_links = add_google_links_to_df(df)
                save_path = save_df(df_with_links, tag="with_links")
                st.success("Website links added!")
                st.session_state.start_step = "Step 3: Extract Contact Info (CSV only)"
                raise RerunException(RerunData())
# Step 3
elif st.session_state.start_step == "Step 3: Extract Contact Info (CSV only)":
    st.sidebar.markdown("Upload a CSV with websites already added to extract contact info.")
    if st.session_state.processed_df is not None:
        df = st.session_state.processed_df
    else:
        uploaded_csv = st.sidebar.file_uploader("Upload CSV", type=['csv'])
        if uploaded_csv:
            try:
                df = pd.read_csv(uploaded_csv)
                st.session_state.processed_df = df
            except Exception as e:
                st.error(f"Error reading CSV: {e}")
                df = None
        else:
            st.warning("Please upload a CSV file to begin Step 3")
            df = None
    if df is not None:
        st.subheader("Data Preview (before Step 3)")
        st.dataframe(df.head())
        st.warning("Note: Step 3 will:")
        st.markdown("- Normalize website URLs to a proper https:// format")
        st.markdown("- Find relevant contact/about pages")
        st.markdown("- Extract text content from these pages")
        st.markdown("- This process may take several minutes")
        if st.button("Step 3: Extract Page Content"):
            with st.spinner("Extracting (this may take several minutes)..."):
                df_next = step3(df)
                save_path = save_df(df_next, tag="step3")
                st.success("Step 3 complete!")
                st.subheader("Processed Data Preview")
                st.dataframe(df_next.head())
                # Offer a download button
                csv = df_next.to_csv(index=False).encode('utf-8')
                st.download_button(
                    label="Download Processed Data",
                    data=csv,
                    file_name='processed_data_with_text.csv',
                    mime='text/csv'
                )
                st.session_state.start_step = "Step 4: Process and Combine Team Info (CSV only)"
                raise RerunException(RerunData())
# Step 4: Process and Combine Team Info
elif st.session_state.start_step == "Step 4: Process and Combine Team Info (CSV only)":
    st.sidebar.markdown("Upload a CSV to process and combine team information.")
    if st.session_state.processed_df is not None:
        df = st.session_state.processed_df
    else:
        uploaded_csv = st.sidebar.file_uploader("Upload CSV", type=['csv'])
        if uploaded_csv:
            try:
                df = pd.read_csv(uploaded_csv)
                st.session_state.processed_df = df
            except Exception as e:
                st.error(f"Error reading CSV: {e}")
                df = None
        else:
            st.warning("Please upload a CSV to begin Step 4")
            df = None
    if df is not None:
        st.subheader("Data Preview (before combining)")
        st.dataframe(df.head())
| if st.button("Execute Step 4: Combine and Process"): | |
| with st.spinner("Running team info combination..."): | |
| # Process the markdown in the DataFrame to extract and combine company and team member information | |
| for i, markdown_input in enumerate(df['Text']): | |
| try: | |
| prompt = f""" | |
| Extract company information from the following markdown: | |
| {markdown_input} | |
| Return for EACH MEMBER OF THE COMPANY, please provide the following information in JSON format based on the structure below: | |
| - **company**: | |
| - **name**: Name of the company. | |
| - **team_member_name**: The name of the team member. | |
| - **position**: The role or position of the team member in the company. | |
| - **contact_info**: Contact information of the team member, including: | |
| - **email**: The email address. | |
| - **phone**: The phone number. | |
| - **company_description**: A brief, factual, and objective description of the company (maximum 5 words). | |
| Make sure to follow this structure exactly. If some info is missing, just put the column name in the JSON with the value `None`. | |
| """ | |
| res = query_openai_api(prompt) # Replace with actual OpenAI query | |
| text_fixed = res[0] | |
| # Convert the JSON result into a pandas DataFrame | |
| df_json = json_to_pandas(text_fixed) | |
| if 'final_res' not in locals(): | |
| final_res = pd.DataFrame() | |
| # Append the current result to the final DataFrame | |
| final_res = pd.concat([final_res, df_json], ignore_index=True) | |
| except Exception as e: | |
| print(f"Error processing markdown {i + 1}: {e}") | |
| st.write(" DataFrame:") | |
| st.write(final_res.head()) | |
                prompt2 = f'''Here is final_res.head(): {final_res.head()}. I want to merge the columns based on their names.
Always combine company and team_member_name:
- company should merge the values from columns that seem related to the company name (like company.name).
- team_member_name should merge the values from columns that seem related to the team member name (like company.team_member_name, name).
For other columns:
- Based on the column headers, identify and merge the appropriate columns into the target ones.
- The merging should prioritize non-null values, using combine_first() or similar pandas logic.
- If no matching columns are found for a target, skip the merge or leave the target as None or empty.
- Example target columns might include email, phone, position, company_description, etc.
- From the column name, extract the simplest name possible.
The code should:
- Identify the relevant columns by their names.
- Merge the columns dynamically based on similarity to target column names.
- Handle missing columns gracefully, without raising if a source column is missing.
The DataFrame is named final_res. Provide only Python code, no introduction, no conclusion.'''
                res2 = query_openai_api(prompt2)
                # Strip markdown fences around the generated code
                formatted_code2 = res2[0].strip().removeprefix("```python").removesuffix("```").strip()
                print("Formatted Code:\n", formatted_code2)
                # Execute the generated merge code in an explicit namespace
                exec_ns = {'pd': pd, 'final_res': final_res}
                try:
                    exec(formatted_code2, exec_ns)
                    final_res = exec_ns['final_res']
                    print("Code executed successfully.")
                except Exception as e:
                    print(f"Error executing code: {e}")
                # Display the merged DataFrame, dropping leftover raw name columns
                st.write("Final DataFrame:")
                if 'Name' in final_res.columns:
                    final_res.drop(columns=['Name'], inplace=True)
                if 'name' in final_res.columns:
                    final_res.drop(columns=['name'], inplace=True)
                st.write(final_res.head())
                # Save and offer the final result for download
                save_path = save_df(final_res, tag="final_team_info")
                st.success("Step 4 complete: Combined team info ready!")
                st.download_button(
                    label="Download Final CSV",
                    data=open(save_path, 'rb'),
                    file_name=os.path.basename(save_path),
                    mime='text/csv'
                )
        # Button to proceed to Step 5 (save_df above already stored the
        # combined DataFrame in st.session_state.processed_df)
        if st.button("Proceed to Step 5: Fetch LinkedIn URLs"):
            st.session_state.start_step = "Step 5: Fetch LinkedIn URLs (CSV only)"
            raise RerunException(RerunData())
# Step 5: Fetch LinkedIn URLs
elif st.session_state.start_step == "Step 5: Fetch LinkedIn URLs (CSV only)":
    st.sidebar.markdown("Upload a CSV to add LinkedIn URLs.")
    if st.session_state.processed_df is not None:
        df = st.session_state.processed_df.copy()
    else:
        uploaded_csv = st.sidebar.file_uploader("Upload CSV for Step 5", type=['csv'])
        if uploaded_csv:
            df = pd.read_csv(uploaded_csv)
            st.session_state.processed_df = df
        else:
            df = None
            st.warning("Please upload a CSV to begin Step 5.")
    if df is not None:
        st.subheader("Data Preview (before fetching LinkedIn URLs)")
        st.dataframe(df.head())
| if st.button("Execute Step 5: Fetch LinkedIn URLs"): | |
| with st.spinner("Fetching LinkedIn URLs..."): | |
| # Function to add LinkedIn links | |
| def add_linkedin_to_df(df, batch_size=10, sleep_time=0.2, output_file="linkedin_results.csv"): | |
| start_index = 0 | |
| for i in range(start_index, len(df)): | |
| row = df.iloc[i] | |
| row_tn = row['team_member_name'] if pd.notna(row['team_member_name']) else " " | |
| row_cp = row['company'] if pd.notna(row['company']) else " " | |
| query = row_tn + " " + row_cp + " linkedin" | |
| st.write(f"Fetching link for: {query}") | |
| gs = google_search(query, GOOGLE_API_KEY, CSE_ID) | |
| if gs: | |
| link = gs[0]['link'] | |
| else: | |
| link = None | |
| st.warning(f"No results found for query: {query}") | |
| df.loc[i, 'linkedin'] = link | |
| time.sleep(sleep_time) | |
| if (i + 1) % batch_size == 0 or i == len(df) - 1: | |
| df.to_csv(output_file, index=False) | |
| st.info(f"Batch {(i // batch_size) + 1} processed and saved.") | |
| return df | |
| # Execute LinkedIn URL fetching | |
| df_linkedin = add_linkedin_to_df(df, batch_size=10, sleep_time=0.2, output_file="linkedin_results.csv") | |
                prompt_3 = f'''Given the following list of job titles at investment-related companies, select only the positions that are relevant for contacting in the context of investor relations, investments, advisory, or general management.
Keep associate and senior-level positions.
Drop roles that are strictly non-investment or operational, such as marketing, HR, middle office, project management, legal, or talent operations.
Return exactly two Python lists named kept_positions and dropped_positions.
Don't include any other text or explanation, just the two lists.
{df_linkedin['position'].unique()}'''
                res_3 = query_openai_api(prompt_3)[0]
                formatted_code_3 = res_3.strip().removeprefix("```python").removesuffix("```").strip()
                # Execute the generated lists in an isolated namespace
                ns_3 = {}
                try:
                    exec(formatted_code_3, ns_3)
                    print("Code executed successfully.")
                except Exception as e:
                    print(f"Error executing code: {e}")
                # Fall back to keeping every position if the lists are missing
                kept_positions = ns_3.get('kept_positions', list(df_linkedin['position'].unique()))
                df_linkedin = df_linkedin[df_linkedin['position'].isin(kept_positions)]
                df_linkedin = df_linkedin.dropna(axis=1, how='all')
                save_path = save_df(df_linkedin, tag="final_with_linkedin")
                st.success("Step 5 complete: LinkedIn URLs fetched!")
                st.download_button(
                    label="Download Final CSV with LinkedIn",
                    data=open(save_path, 'rb'),
                    file_name=os.path.basename(save_path),
                    mime='text/csv'
                )