Lebossoti committed
Commit e54d8fa · verified · 1 Parent(s): f91e546

Create app.py

Files changed (1): app.py (+820, -0)
app.py ADDED
@@ -0,0 +1,820 @@
+ import streamlit as st
+ import pandas as pd
+ import requests
+ import openai
+ import tabula
+ import time
+ import json
+ import os
+ from datetime import datetime
+ from urllib.parse import urlparse
+ from bs4 import BeautifulSoup
+ from requests.exceptions import RequestException
+ from selenium import webdriver
+ from selenium.webdriver.chrome.service import Service
+ from selenium.webdriver.common.by import By
+ from selenium.common.exceptions import WebDriverException
+ from webdriver_manager.chrome import ChromeDriverManager
+ from tqdm import tqdm
+ from streamlit.runtime.scriptrunner import RerunException, RerunData
+
+
+ # API credentials are read from environment variables; secrets must never be hardcoded in source
+ openai.api_key = os.getenv("OPENAI_API_KEY")
+
+ # Google Custom Search API credentials
+ API_KEY = os.getenv("GOOGLE_API_KEY")
+ CSE_ID = os.getenv("GOOGLE_CSE_ID")
+
+ # Initialize session state
+ if 'processed_df' not in st.session_state:
+     st.session_state.processed_df = None
+ if 'saved_path' not in st.session_state:
+     st.session_state.saved_path = None
+ if 'start_step' not in st.session_state:
+     st.session_state.start_step = "Step 1: Upload & Process Raw Data"
+
+ # Function to read an input file (PDF, Excel, CSV) into a single DataFrame
+ def read_input_file(file):
+     if file.name.endswith('.pdf'):
+         tables = tabula.read_pdf(file, pages='all', multiple_tables=True)
+         df = pd.concat(tables, axis=0, ignore_index=True)
+     elif file.name.endswith('.xlsx') or file.name.endswith('.xls'):
+         df = pd.read_excel(file, sheet_name=None)
+         df = pd.concat(df.values(), ignore_index=True)
+     elif file.name.endswith('.csv'):
+         df = pd.read_csv(file, delimiter=';', on_bad_lines='skip')
+     else:
+         st.error("Unsupported file format!")
+         return None
+     return df
+
+ def query_openai_api(prompt):
+     try:
+         response = openai.ChatCompletion.create(
+             model="gpt-4o",
+             messages=[{"role": "system", "content": "You are a helpful assistant."},
+                       {"role": "user", "content": prompt}],
+             max_tokens=2500,
+             temperature=0.1
+         )
+         responses = [choice['message']['content'].strip() for choice in response['choices']]
+         return responses
+     except Exception as e:
+         print(f"Error querying OpenAI API: {e}")
+         return []
+
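+ # Illustrative usage: query_openai_api("Summarize this page") returns a list with one
+ # string per completion choice (a single choice here), or [] when the request fails.
+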
+ def process_data(df):
+     if df is None or not isinstance(df, pd.DataFrame):
+         st.error("Invalid input: DataFrame is None or not a pandas DataFrame")
+         return None
+
+     prompt = f'''Here are my dataframe's columns: {list(df.columns)}.
+ Generate Python code to:
+ 1. Rename columns to: 'Name', 'City', 'contact_info', 'website'
+    (map from closest matching columns, use your judgment)
+ 2. Select only these four columns
+ 3. If any columns are missing, create them with NA values
+
+ Return ONLY the code as two lines:
+ - First line: df.rename() with all column mappings
+ - Second line: df[] with column selection
+ No explanations, no markdown, just the two lines of code.'''
+
+     api_response = query_openai_api(prompt)
+
+     if not api_response:
+         st.error("No response from OpenAI API")
+         return df
+
+     formatted_code = ""  # Keep defined so the except block can display it
+     try:
+         raw_code = api_response[0].strip()
+         code_lines = []
+         for line in raw_code.split('\n'):
+             line = line.strip()
+             if line and not line.startswith('```'):
+                 code_lines.append(line)
+
+         formatted_code = '\n'.join(code_lines[:2])
+         st.write("Generated Code:")
+         st.code(formatted_code)
+
+         exec_globals = {'pd': pd}
+         exec_locals = {'df': df.copy()}
+         exec(formatted_code, exec_globals, exec_locals)
+
+         processed_df = exec_locals['df']
+         required_columns = ['Name', 'City', 'contact_info', 'website']
+         missing_cols = [col for col in required_columns if col not in processed_df.columns]
+
+         if missing_cols:
+             for col in missing_cols:
+                 processed_df[col] = pd.NA
+             processed_df = processed_df[required_columns]
+             st.warning(f"Added missing columns: {missing_cols}")
+
+         processed_df.drop_duplicates(inplace=True)
+         processed_df.reset_index(drop=True, inplace=True)
+         return processed_df
+
+     except Exception as e:
+         st.error(f"Error executing generated code: {str(e)}")
+         st.error("Generated code that failed:")
+         st.code(formatted_code)
+         return df
+
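+ # For illustration only (the real mapping depends on the uploaded columns), the model is
+ # expected to return two lines shaped like:
+ #   df = df.rename(columns={'Company': 'Name', 'Town': 'City', 'Phone': 'contact_info', 'URL': 'website'})
+ #   df = df[['Name', 'City', 'contact_info', 'website']]
+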
+ def google_search(query, api_key, cse_id, **kwargs):
+     url = 'https://www.googleapis.com/customsearch/v1'
+     params = {'q': query, 'key': api_key, 'cx': cse_id, **kwargs}
+     response = requests.get(url, params=params)
+     response.raise_for_status()
+     results = response.json()
+     return results.get('items', [])
+
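+ # Illustrative call (names are hypothetical): google_search("Acme AG website Zurich",
+ # API_KEY, CSE_ID, num=5) returns the parsed 'items' list; each item carries 'title',
+ # 'link' and 'snippet' fields per the Custom Search JSON API.
+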
+ def score_domain(link, company_name):
+     if not link:
+         return -1
+     parsed = urlparse(link)
+     domain = parsed.netloc.lower()
+     path = parsed.path.lower()
+     core_name = company_name.split()[0].lower()
+
+     score = 0
+     if f"www.{core_name}" in domain:
+         return 100
+     if core_name in domain:
+         score += 5
+     if path == "/" or path == "":
+         score += 5
+     elif len(path.strip("/").split("/")) == 1:
+         score += 2
+     score -= domain.count(".")
+     return score
+
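+ # Worked example (hypothetical names): score_domain("https://www.acme.ch/", "Acme Capital")
+ # short-circuits to 100 because "www.acme" appears in the domain, while
+ # score_domain("https://jobs.example.com/acme", "Acme Capital") scores 0 + 2 - 2 = 0
+ # (one path segment, no "acme" in the domain, two dots in the domain).
+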
+ def add_google_links_to_df(df, start_index=0, sleep_time=1):
+     for i in range(start_index, len(df)):
+         row = df.iloc[i]
+         if pd.isna(row['website']):
+             query = row['Name'] + ' website' + ' ' + row['City']
+             print(f"Row {i} - Fetching link for: {query}")
+
+             try:
+                 items = google_search(query, API_KEY, CSE_ID)
+                 best_link = None
+                 best_score = -float('inf')
+
+                 for item in items:
+                     potential_link = item.get('link')
+                     score = score_domain(potential_link, row['Name'])
+                     if score > best_score:
+                         best_link = potential_link
+                         best_score = score
+                     if best_score > 90:
+                         print(f"Match found! Title: {item.get('title')}")
+                         print(f"Link: {best_link}")
+                         print(f"Score: {best_score}")
+                         df.at[i, 'website'] = best_link
+                         break
+
+                 print(f"Best link for row {i}: {best_link} with score {best_score}")
+                 df.at[i, 'website'] = best_link if best_link else pd.NA
+                 time.sleep(sleep_time)
+
+             except requests.exceptions.HTTPError as e:
+                 # Often a quota/rate-limit error once the daily Custom Search allowance is
+                 # used up; stop cleanly so the partial results can still be saved.
+                 print(f"HTTP Error occurred: {e}")
+                 break
+     return df
+
+ # Step 3 functions
+ def treat_link(url):
+     if pd.isna(url):
+         return None
+     elif url.startswith("http://www."):
+         return url.replace("http://www.", "https://www.")
+     elif url.startswith("http://"):
+         return url.replace("http://", "https://")
+     elif url.startswith("www."):
+         return "https://" + url
+     elif url.startswith("https://"):
+         return url
+     else:
+         return "https://www." + url
+
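+ # Example normalizations (illustrative inputs): treat_link("www.acme.ch") -> "https://www.acme.ch";
+ # treat_link("http://acme.ch/team") -> "https://acme.ch/team"; treat_link("acme.ch") -> "https://www.acme.ch".
+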
+ def get_relevant_links(url):
+     relevant_links = []
+     links = []
+
+     # First attempt using requests
+     try:
+         headers = {
+             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
+         }
+         response = requests.get(url, headers=headers, timeout=3)
+         response.raise_for_status()  # Raises an exception for 4xx/5xx responses
+
+         soup = BeautifulSoup(response.text, "html.parser")
+         links = soup.find_all("a", href=True)
+
+     except RequestException as e:
+         print(f"Error with requests: {e}")
+         links = None  # Trigger the Selenium fallback
+
+     # If no links were retrieved, fall back to Selenium
+     if not links:
+         print("Falling back to Selenium...")
+         try:
+             # Set up the Chrome driver (ensure ChromeDriver is available)
+             driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
+             driver.get(url)
+
+             # Wait for the page to load (adjust if necessary)
+             driver.implicitly_wait(3)
+
+             # Extract all anchor tags using Selenium
+             selenium_links = driver.find_elements(By.TAG_NAME, 'a')
+             selenium_links = list(set(selenium_links))  # Remove duplicates
+
+             # Filter and collect relevant links
+             for link in selenium_links:
+                 href = link.get_attribute('href')
+                 if href:
+                     if any(keyword in href.lower() or keyword in link.text.lower() for keyword in ['a-propos', 'board', 'portrait', 'portrat', 'ueber_uns', 'About', 'presentation', 'about', 'profil', 'kontakt', 'famille', 'ueber-uns', 'contact', 'team', 'members', 'equipe', 'about-us', 'house', 'who-we-are', 'our-experts', 'company', 'board-of-directors', 'présentation', 'à-propos', 'membres', 'équipe', 'nostri-esperti', 'chi-siamo', 'consiglio-di-amministrazione', 'people']):
+                         relevant_links.append(href)
+
+             driver.quit()  # Close the browser after scraping
+
+         except WebDriverException as e:
+             print(f"Error with Selenium (WebDriverException): {e}")
+             relevant_links = []  # Empty list in case of failure
+
+     else:
+         # Links were retrieved using requests
+         links = list(set(links))  # Remove duplicates
+         for link in links:
+             if any(keyword in link.get('href').lower() or keyword in link.text.lower() for keyword in ['a-propos', 'board', 'portrait', 'portrat', 'presentation', 'About', 'about', 'kontakt', 'profil', 'famille', 'ueber-uns', 'ueber_uns', 'contact', 'team', 'members', 'equipe', 'about-us', 'la-maison', 'gouvernance', 'who-we-are', 'company', 'our-experts', 'board-of-directors', 'the-company', 'people']):
+                 relevant_links.append(link['href'])
+
+     # Remove any duplicates and return the relevant links
+     relevant_links = list(set(relevant_links))
+     print(f"Relevant links found for {url}: {relevant_links}")
+     if len(relevant_links) == 0:
+         # Return the base URL as a one-element list so callers can always iterate over links
+         return [url]
+     return relevant_links
+
+ def filter_links(link_dict):
+     # Define priority categories
+     team_related_keywords = ['team', 'portrait', 'portrat', 'board', 'members', 'equipe', 'about', 'our-experts', 'board-of-directors', 'famille', 'la-maison', 'gouvernance', 'presentation', 'membres', 'équipe', 'nostri-esperti', 'chi-siamo', 'consiglio-di-amministrazione', 'profil', 'people']
+     about_related_keywords = ['About', 'a-propos', 'about', 'about-us', 'the-company', 'ueber-uns', 'ueber_uns', 'who-we-are', 'présentation', 'profil', 'à-propos', 'a-proposito', 'company']
+     contact_related_keywords = ['kontakt', 'contact']
+
+     filtered_dict = {}
+
+     for key, links in link_dict.items():
+         # Create empty lists for each category
+         team_links = []
+         about_links = []
+         contact_links = []
+
+         # Classify the links by category
+         for link in links:
+             if any(keyword in link for keyword in team_related_keywords):
+                 team_links.append(link)
+             elif any(keyword in link for keyword in about_related_keywords):
+                 about_links.append(link)
+             elif any(keyword in link for keyword in contact_related_keywords):
+                 contact_links.append(link)
+
+         # Prioritize team links, then about links, then contact links
+         if team_links:
+             # Keep only the shortest team-related link
+             filtered_dict[key] = min(team_links, key=len)
+         elif about_links:
+             filtered_dict[key] = about_links[0]  # Keep only the first about-related link
+         elif contact_links:
+             filtered_dict[key] = contact_links[0]  # Keep only the first contact-related link
+         else:
+             filtered_dict[key] = key  # No matches: fall back to the base URL itself
+
+     return filtered_dict
+
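+ # Illustration: {"https://acme.ch": ["https://acme.ch/team", "https://acme.ch/contact"]}
+ # filters to {"https://acme.ch": "https://acme.ch/team"}, because team pages outrank
+ # about and contact pages in the priority order above.
+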
+ def get_jina(url):
+     # Route the page through the r.jina.ai reader proxy, which returns a plain-text /
+     # markdown rendering of the page. The documented reader form keeps the full URL,
+     # scheme included: https://r.jina.ai/https://example.com/team
+     return 'https://r.jina.ai/' + url
+
+
+ # Register tqdm with pandas so progress bars can be used with DataFrame.apply
+ tqdm.pandas()
+
+
+ def apply_pipeline(row):
+     print(f"Processing row: {row['Name']}")
+     base_url = row['website']
+     # Normalize the URL and reduce it to its scheme + host
+     base_url = treat_link(base_url)
+     parsed_url = urlparse(base_url)
+     base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
+
+     relevant_links = get_relevant_links(base_url)
+     print(f"Relevant links for {base_url}:")
+     print(relevant_links)
+     # Resolve relative links against the base URL
+     relevant_links = [base_url + link if link.startswith('/')
+                       else link if link.startswith('https://')
+                       else base_url + '/' + link
+                       for link in relevant_links]
+
+     # Keep the single best link per site
+     filtered_links = filter_links({base_url: relevant_links})
+
+     # If no link was found, fall back to the base URL
+     if not filtered_links.get(base_url):
+         row['Processed_Links'] = get_jina(base_url)
+     else:
+         chosen = filtered_links[base_url]
+         print(f"Chosen link for {base_url}: {get_jina(chosen)}")
+         row['Processed_Links'] = get_jina(chosen)
+
+     return row
+
+
+ def get_text(url):
+     try:
+         response = requests.get(url)
+         response.raise_for_status()
+         soup = BeautifulSoup(response.text, "html.parser")
+         text = soup.get_text()
+         return text
+     except requests.exceptions.RequestException as e:
+         print(f"Error with requests: {e}")
+         return None
+
+ def process_in_chunks(df, chunk_size, output_file):
+     first_chunk = not os.path.exists(output_file)
+     for start in range(0, len(df), chunk_size):
+         chunk = df.iloc[start:start + chunk_size].copy()
+         chunk['Text'] = chunk['Processed_Links'].apply(get_text)
+         time.sleep(1)
+         df.loc[start:start + chunk_size - 1, 'Text'] = chunk['Text']
+         if first_chunk:
+             chunk.to_csv(output_file, mode='w', index=False, header=True)
+             first_chunk = False
+         else:
+             chunk.to_csv(output_file, mode='a', index=False, header=False)
+         print(f"Processed chunk {start // chunk_size + 1} and saved.")
+     return df
+
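+ # Each chunk is appended to the output CSV as soon as it is scraped, so an interrupted
+ # run loses at most chunk_size rows of fetched text and the partial results survive on disk.
+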
+ def step3(df):
+     st.write("Starting Step 3 processing...")
+
+     # Create progress bar
+     progress_bar = st.progress(0)
+     status_text = st.empty()
+
+     df = df.apply(apply_pipeline, axis=1)
+     progress_bar.progress(50)
+
+     # Extract text in chunks
+     status_text.text("Step 4/4: Extracting text from websites...")
+     os.makedirs("processed_data", exist_ok=True)
+     output_file = "processed_data/step3_output.csv"
+     df = process_in_chunks(df, chunk_size=5, output_file=output_file)
+     progress_bar.progress(100)
+
+     status_text.text("Processing complete!")
+     time.sleep(1)
+     status_text.empty()
+
+     return df
+
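+ # step3 leaves the DataFrame with a 'Processed_Links' column (reader-proxy URLs) and a
+ # 'Text' column (the scraped page text), mirrored to processed_data/step3_output.csv.
+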
+ def count_closing_braces_between_companies(input_string):
+     first_company_pos = input_string.find('"company"')
+     if first_company_pos == -1:
+         return 0  # "company" not found
+
+     second_company_pos = input_string.find('"company"', first_company_pos + 1)
+     if second_company_pos == -1:
+         return 0  # Only one "company" found
+
+     substring_between = input_string[first_company_pos:second_company_pos]
+
+     closing_braces_count = substring_between.count('}')
+
+     return closing_braces_count
+
+ def fix_incomplete_json(json_input):
+     json_clean = json_input.strip()
+     if json_clean.endswith('}]'):
+         return json_clean
+
+     m = count_closing_braces_between_companies(json_clean)
+     if m == 2:
+         last_valid_index = -1
+         last_brace = 0
+
+         for i in range(len(json_clean) - 1, 0, -1):
+             if json_clean[i] == '}':
+                 if last_brace != 0:
+                     last_valid_index = last_brace
+                     break
+                 else:
+                     last_brace = i
+             if json_clean[i] == '{':
+                 last_brace = 0
+
+         if last_valid_index != -1:
+             json_clean = json_clean[:last_valid_index + 1] + ']'
+     else:
+         last_valid_index = json_clean.rfind('}')
+
+         if last_valid_index != -1:
+             json_clean = json_clean[:last_valid_index + 1] + ']'
+
+     return json_clean
+
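+ # Illustration: a response truncated mid-object, e.g. '[{"company": {...}}, {"company": {"na',
+ # is cut back after its last fully closed object and re-terminated with ']', yielding
+ # '[{"company": {...}}]'.
+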
+ def json_to_pandas(json_input):
+     lines = json_input.strip().splitlines()
+
+     # Strip surrounding markdown code fences, if present
+     if lines and lines[0].startswith("```"):
+         lines = lines[1:]
+     if lines and lines[-1].startswith("```"):
+         lines = lines[:-1]
+
+     json_clean = "\n".join(lines)
+
+     try:
+         data = json.loads(json_clean)
+     except json.JSONDecodeError:
+         # Repair a truncated response and retry
+         json_clean = fix_incomplete_json(json_clean)
+         data = json.loads(json_clean)
+
+     if isinstance(data, dict):
+         data = [data]
+
+     return pd.json_normalize(data)
+
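+ # Example: '[{"company": {"name": "Acme", "team_member_name": "J. Doe"}}]' (hypothetical)
+ # normalizes to dotted columns 'company.name' and 'company.team_member_name', which is
+ # why the Step 4 merging prompt references names like 'company.name'.
+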
+ def save_df(df, tag="processed"):
+     os.makedirs("processed_data", exist_ok=True)
+     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+     filename = f"{tag}_{timestamp}.csv"
+     save_path = os.path.join("processed_data", filename)
+     df.to_csv(save_path, index=False)
+     st.session_state.processed_df = df
+     st.session_state.saved_path = save_path
+     return save_path
+
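+ # Illustrative output path (timestamp is hypothetical): save_df(df, tag="with_links")
+ # writes processed_data/with_links_20250101_120000.csv and records both the DataFrame
+ # and the path in session state.
+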
+ # Streamlit UI
+ st.title("Data Processing Pipeline")
+
+ # Single source of truth for step labels; the original duplicated this list and used a
+ # mismatched Step 3 label ("Execute Next Processing Step"), so the Step 3 branch never matched.
+ PIPELINE_STEPS = [
+     "Step 1: Upload & Process Raw Data",
+     "Step 2: Add Website Links (CSV only)",
+     "Step 3: Extract Contact Info (CSV only)",
+     "Step 4: Process and Combine Team Info (CSV only)",
+     "Step 5: Fetch LinkedIn URLs (CSV only)"
+ ]
+
+ # Sidebar: choose the starting step
+ with st.sidebar:
+     st.header("Pipeline Options")
+     start_step = st.selectbox(
+         "Select Starting Step",
+         PIPELINE_STEPS,
+         index=PIPELINE_STEPS.index(st.session_state.start_step)
+     )
+
+ st.session_state.start_step = start_step
+
+ # Step 1
+ if st.session_state.start_step == "Step 1: Upload & Process Raw Data":
+     st.sidebar.markdown("Upload a raw PDF, CSV, or Excel file to start processing.")
+     uploaded_file = st.sidebar.file_uploader("Choose a file", type=['pdf', 'csv', 'xlsx'])
+
+     if uploaded_file:
+         st.success("File uploaded successfully!")
+         st.write(f"Filename: {uploaded_file.name}")
+         df = read_input_file(uploaded_file)
+         if df is not None:
+             st.subheader("Initial Data Preview")
+             st.dataframe(df.head())
+             if st.button("Process Data"):
+                 with st.spinner("Processing data..."):
+                     processed_df = process_data(df)
+                     if processed_df is not None:
+                         save_path = save_df(processed_df, tag="processed")
+                         st.success("Data processing complete!")
+                         st.session_state.start_step = "Step 2: Add Website Links (CSV only)"
+                         raise RerunException(RerunData())
+                     else:
+                         st.error("Data processing failed")
+         else:
+             st.error("Failed to read the uploaded file")
+     else:
+         st.warning("Please upload a file to begin Step 1")
+
+ # Step 2
+ elif st.session_state.start_step == "Step 2: Add Website Links (CSV only)":
+     st.sidebar.markdown("Upload a CSV of your initial dataframe to add website links.")
+     if st.session_state.processed_df is not None:
+         df = st.session_state.processed_df
+     else:
+         uploaded_csv = st.sidebar.file_uploader("Upload CSV", type=['csv'])
+         if uploaded_csv:
+             try:
+                 df = pd.read_csv(uploaded_csv)
+                 st.session_state.processed_df = df
+             except Exception as e:
+                 st.error(f"Error reading CSV: {e}")
+                 df = None
+         else:
+             st.warning("Please upload a CSV file to begin Step 2")
+             df = None
+
+     if df is not None:
+         st.subheader("Data Preview (before adding links)")
+         st.dataframe(df.head())
+         if st.button("Step 2: Add Website Links"):
+             with st.spinner("Searching for websites..."):
+                 df_with_links = add_google_links_to_df(df)
+                 save_path = save_df(df_with_links, tag="with_links")
+                 st.success("Website links added!")
+                 st.session_state.start_step = "Step 3: Extract Contact Info (CSV only)"
+                 raise RerunException(RerunData())
+
+ # Step 3
+ elif st.session_state.start_step == "Step 3: Extract Contact Info (CSV only)":
+     st.sidebar.markdown("Upload a CSV with websites already added to extract contact info.")
+     if st.session_state.processed_df is not None:
+         df = st.session_state.processed_df
+     else:
+         uploaded_csv = st.sidebar.file_uploader("Upload CSV", type=['csv'])
+         if uploaded_csv:
+             try:
+                 df = pd.read_csv(uploaded_csv)
+                 st.session_state.processed_df = df
+             except Exception as e:
+                 st.error(f"Error reading CSV: {e}")
+                 df = None
+         else:
+             st.warning("Please upload a CSV file to begin Step 3")
+             df = None
+
+     if df is not None:
+         st.subheader("Data Preview (before Step 3)")
+         st.dataframe(df.head())
+
+         st.warning("Note: Step 3 will:")
+         st.markdown("- Normalize website URLs to ensure proper formatting")
+         st.markdown("- Find relevant contact/about pages")
+         st.markdown("- Extract text content from these pages")
+         st.markdown("- This process may take several minutes")
+
+         if st.button("Step 3: Extract Page Content"):
+             with st.spinner("Extracting (this may take several minutes)..."):
+                 df_next = step3(df)
+                 save_path = save_df(df_next, tag="step3")
+                 st.success("Step 3 complete!")
+                 st.subheader("Processed Data Preview")
+                 st.dataframe(df_next.head())
+
+                 # Offer a download button
+                 csv = df_next.to_csv(index=False).encode('utf-8')
+                 st.download_button(
+                     label="Download Processed Data",
+                     data=csv,
+                     file_name='processed_data_with_text.csv',
+                     mime='text/csv'
+                 )
+
+ # Step 4: Process and Combine Team Info
+ elif st.session_state.start_step == "Step 4: Process and Combine Team Info (CSV only)":
+     st.sidebar.markdown("Upload a CSV to process and combine team information.")
+     if st.session_state.processed_df is not None:
+         df = st.session_state.processed_df
+     else:
+         uploaded_csv = st.sidebar.file_uploader("Upload CSV", type=['csv'])
+         if uploaded_csv:
+             try:
+                 df = pd.read_csv(uploaded_csv)
+                 st.session_state.processed_df = df
+             except Exception as e:
+                 st.error(f"Error reading CSV: {e}")
+                 df = None
+         else:
+             st.warning("Please upload a CSV to begin Step 4")
+             df = None
+
+     if df is not None:
+         st.subheader("Data Preview (before combining)")
+         st.dataframe(df.head())
+         if st.button("Execute Step 4: Combine and Process"):
+             with st.spinner("Running team info combination..."):
+                 # Extract company and team member information from each scraped page
+                 final_res = pd.DataFrame()
+                 for i, markdown_input in enumerate(df['Text']):
+                     try:
+                         prompt = f"""
+ Extract company information from the following markdown:
+ {markdown_input}
+ For EACH MEMBER OF THE COMPANY, provide the following information in JSON format based on the structure below:
+
+ - **company**:
+   - **name**: Name of the company.
+   - **team_member_name**: The name of the team member.
+   - **position**: The role or position of the team member in the company.
+   - **contact_info**: Contact information of the team member, including:
+     - **email**: The email address.
+     - **phone**: The phone number.
+   - **company_description**: A brief, factual, and objective description of the company (maximum 5 words).
+
+ Make sure to follow this structure exactly. If some info is missing, keep the key in the JSON with the value `None`.
+ """
+
+                         res = query_openai_api(prompt)
+                         text_fixed = res[0]
+
+                         # Convert the JSON result into a pandas DataFrame
+                         df_json = json_to_pandas(text_fixed)
+
+                         # Append the current result to the final DataFrame
+                         final_res = pd.concat([final_res, df_json], ignore_index=True)
+
+                     except Exception as e:
+                         print(f"Error processing markdown {i + 1}: {e}")
+
+                 st.write("Extracted DataFrame:")
+                 st.write(final_res.head())
+
+                 prompt2 = f'''Here is final_res.head(): {final_res.head()}. I want to merge the columns based on their names.
+
+ Always combine company and team_member_name:
+ company should merge the values from columns that seem related to the company name (like company.name).
+ team_member_name should merge the values from columns that seem related to the team member name (like company.team_member_name, name).
+ For other columns:
+ Based on the column headers, the script should identify and merge the appropriate columns into the target ones.
+ The merging should prioritize non-null values, using combine_first() or similar logic in pandas.
+ If no matching columns are found for a target, skip the merging or leave the target as None or empty.
+ Example target columns might include email, phone, position, company_description, etc.
+ From the column name, just try to extract the simplest name possible.
+ I need the API to:
+ Identify the relevant columns by their names.
+ Merge the columns dynamically based on similarity to target column names.
+ Handle missing columns gracefully, not causing any errors if a source column is missing.
+ Please provide Python code that does the above. I want only code, no introduction, no conclusion, only code.'''
+
+                 # Strip the markdown fences and extract the Python code
+                 res2 = query_openai_api(prompt2)
+                 code_lines2 = [l for l in res2[0].splitlines() if not l.strip().startswith("```")]
+                 formatted_code2 = "\n".join(code_lines2).strip()
+
+                 # Print the formatted code to verify
+                 print("Formatted Code:\n", formatted_code2)
+                 # Execute the formatted code
+                 try:
+                     exec(formatted_code2)
+                     print("Code executed successfully.")
+                 except Exception as e:
+                     print(f"Error executing code: {e}")
+                 # Display the modified DataFrame
+                 st.write("Final DataFrame:")
+                 st.write(final_res.head())
+
+                 # Save and download the final result
+                 save_path = save_df(final_res, tag="final_team_info")
+                 st.success("Step 4 complete: Combined team info ready!")
+                 st.download_button(
+                     label="Download Final CSV",
+                     data=open(save_path, 'rb'),
+                     file_name=os.path.basename(save_path),
+                     mime='text/csv'
+                 )
+
+         # Button to move straight on to Step 5
+         if st.button("Proceed to Step 5: Fetch LinkedIn URLs"):
+             st.session_state.start_step = "Step 5: Fetch LinkedIn URLs (CSV only)"
+             st.experimental_rerun()
+
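+ # Note: the exec() call in Step 4 relies on the generated snippet rebinding `final_res`
+ # at module scope; an illustrative merge would collapse 'company.name' into 'company' and
+ # 'company.team_member_name' into 'team_member_name' via combine_first().
+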
+ # Step 5: Fetch LinkedIn URLs
+ elif st.session_state.start_step == "Step 5: Fetch LinkedIn URLs (CSV only)":
+     st.sidebar.markdown("Upload a CSV to add LinkedIn URLs.")
+
+     if st.session_state.processed_df is not None:
+         df = st.session_state.processed_df.copy()
+     else:
+         uploaded_csv = st.sidebar.file_uploader("Upload CSV for Step 5", type=['csv'])
+         if uploaded_csv:
+             df = pd.read_csv(uploaded_csv)
+             st.session_state.processed_df = df
+         else:
+             df = None
+             st.warning("Please upload a CSV to begin Step 5.")
+
+     if df is not None:
+         st.subheader("Data Preview (before fetching LinkedIn URLs)")
+         st.dataframe(df.head())
+
+         if st.button("Execute Step 5: Fetch LinkedIn URLs"):
+             with st.spinner("Fetching LinkedIn URLs..."):
+
+                 # Function to add LinkedIn links
+                 def add_linkedin_to_df(df, batch_size=10, sleep_time=0.2, output_file="linkedin_results.csv"):
+                     start_index = 0
+
+                     for i in range(start_index, len(df)):
+                         row = df.iloc[i]
+                         row_tn = row['team_member_name'] if pd.notna(row['team_member_name']) else " "
+                         row_cp = row['company'] if pd.notna(row['company']) else " "
+                         query = row_tn + " " + row_cp + " linkedin"
+
+                         st.write(f"Fetching link for: {query}")
+
+                         gs = google_search(query, API_KEY, CSE_ID)
+                         if gs:
+                             link = gs[0]['link']
+                         else:
+                             link = None
+                             st.warning(f"No results found for query: {query}")
+
+                         df.loc[i, 'linkedin'] = link
+
+                         time.sleep(sleep_time)
+
+                         # Persist progress after every batch and after the final row
+                         if (i + 1) % batch_size == 0 or i == len(df) - 1:
+                             df.to_csv(output_file, index=False)
+                             st.info(f"Batch {(i // batch_size) + 1} processed and saved.")
+
+                     return df
+
+                 # Execute LinkedIn URL fetching
+                 df_linkedin = add_linkedin_to_df(df, batch_size=10, sleep_time=0.2, output_file="linkedin_results.csv")
+
+                 prompt_3 = f'''Given the following list of job titles at investment-related companies, select only the positions that are relevant for contacting in the context of investor relations, investments, advisory, or general management.
+ Keep associates and senior-level positions.
+ Drop roles that are strictly non-investment or operational, such as marketing, HR, middle office, project management, legal, or talent operations.
+ Return exactly two Python lists named kept_positions and dropped_positions.
+ Don't include any other text or explanation, just the 2 lists.
+ {df_linkedin['position'].unique()}'''
+
+                 res_3 = query_openai_api(prompt_3)[0]
+
+                 # Strip the markdown fences from the generated code
+                 code_lines_3 = [l for l in res_3.splitlines() if not l.strip().startswith("```")]
+                 formatted_code_3 = "\n".join(code_lines_3).strip()
+
+                 # Execute the generated code; it is expected to define kept_positions
+                 try:
+                     exec(formatted_code_3)
+                     print("Code executed successfully.")
+                 except Exception as e:
+                     print(f"Error executing code: {e}")
+
+                 # Keep only rows whose position survived the filter
+                 df_linkedin = df_linkedin[df_linkedin['position'].isin(kept_positions)]
+
+                 save_path = save_df(df_linkedin, tag="final_with_linkedin")
+                 st.success("Step 5 complete: LinkedIn URLs fetched!")
+                 st.download_button(
+                     label="Download Final CSV with LinkedIn",
+                     data=open(save_path, 'rb'),
+                     file_name=os.path.basename(save_path),
+                     mime='text/csv'
+                 )