"""Gradio app that scrapes basic company facts from a Crunchbase page."""

import requests
from bs4 import BeautifulSoup
import pandas as pd
import gradio as gr

# Placeholder stored in the table when a field cannot be found on the page.
_MISSING = 'N/A'

# Fields whose on-page label is also used verbatim as the output column name.
# The order here is the column order of the resulting DataFrame.
_SAME_NAME_LABELS = (
    'City',
    'Region',
    'Country',
    'Continent',
    'Rank Org Company',
    'Operating Status',
    'Last Funding Type',
    'Total Rounds',
    'Total Investors',
    'Total Money Raised USD',
    'Last Round Money Raised USD',
    'Most Recent Funding Date',
    'Industries',
    'Similar Companies Permalinks',
    'Min Employees',
    'Max Employees',
    'Max Score',
)


def _is_mailto(href):
    """Return True when *href* is a non-empty string containing 'mailto:'."""
    return bool(href) and 'mailto:' in href


def _is_http(href):
    """Return True when *href* is a non-empty string containing 'http'."""
    return bool(href) and 'http' in href


def _labelled_value(soup, label):
    """Return the text of the first <span> following the <span> reading *label*.

    Falls back to 'N/A' when either the label span or a following span is
    missing, so a partially matching page never raises.
    """
    label_span = soup.find('span', string=label)
    if label_span is None:
        return _MISSING
    value_span = label_span.find_next('span')
    if value_span is None:
        return _MISSING
    return value_span.text.strip()


def scrape_crunchbase(url, timeout=10.0):
    """Download *url* and extract company fields into a one-row DataFrame.

    Args:
        url: Address of the page to scrape.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block forever on an unresponsive host.

    Returns:
        A ``pandas.DataFrame`` with exactly one row. Every field that could
        not be located in the HTML holds the string 'N/A'.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with a 4xx/5xx status.
    """
    response = requests.get(url, timeout=timeout)
    # An error page would otherwise be parsed and reported as a row of 'N/A'.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # Look each element up once and reuse the result.
    heading = soup.find('h1')
    description_meta = soup.find('meta', attrs={'name': 'description'})
    mail_link = soup.find('a', href=_is_mailto)
    # NOTE(review): this is simply the first anchor whose href contains
    # 'http', which is not necessarily the company's own website.
    http_link = soup.find('a', href=_is_http)

    data = {
        'Url': url,
        'Company Name': heading.text.strip() if heading else _MISSING,
        'Short Description': (
            description_meta.get('content', _MISSING)
            if description_meta else _MISSING
        ),
        'Founded On': _labelled_value(soup, 'Founded'),
        'Ipo Status': _labelled_value(soup, 'IPO Status'),
        'Contact Email': (
            mail_link['href'].replace('mailto:', '')
            if mail_link else _MISSING
        ),
        'Legal Name': _labelled_value(soup, 'Legal Name'),
        'Website': http_link['href'] if http_link else _MISSING,
    }
    # Remaining columns share their name with the on-page label.
    for label in _SAME_NAME_LABELS:
        data[label] = _labelled_value(soup, label)

    # A single dict becomes a single-row DataFrame.
    return pd.DataFrame([data])


def scrape_and_display(url):
    """Gradio callback: scrape *url* and return the resulting DataFrame.

    Network and HTTP failures are re-raised as ``gr.Error`` so the user sees
    a readable message in the UI instead of a generic failure.
    """
    try:
        return scrape_crunchbase(url)
    except requests.RequestException as exc:
        raise gr.Error(f"Could not fetch {url!r}: {exc}") from exc


# Create a Gradio interface
iface = gr.Interface(
    fn=scrape_and_display,
    inputs="text",
    outputs="dataframe",
    title="Crunchbase Scraper",
    description="Enter a Crunchbase URL to scrape company information."
)

# Launched at import time, matching the original script's behaviour.
iface.launch()