Spaces:

jaothan
/

crunchbase_test1

Sleeping

App Files Files Community

crunchbase_test1 / app3.py

jaothan

Rename app.py to app3.py

3adbf17 verified 12 months ago

raw

history blame contribute delete

6.9 kB

	import requests
	from bs4 import BeautifulSoup
	import pandas as pd
	import gradio as gr

	# Function to scrape Crunchbase for companies matching the description
	def _labelled_span_text(soup, label, default='N/A'):
	    """Return the stripped text of the <span> that follows the <span> whose text is *label*.

	    Falls back to *default* when either the label span or the following
	    span is absent from *soup*.
	    """
	    label_span = soup.find('span', string=label)
	    if label_span is None:
	        return default
	    value_span = label_span.find_next('span')
	    if value_span is None:
	        return default
	    return value_span.text.strip()


	def _first_href_containing(soup, fragment):
	    """Return the href of the first <a> in *soup* whose href contains *fragment*, else None."""
	    link = soup.find('a', href=lambda href: bool(href) and fragment in href)
	    return link['href'] if link is not None else None


	def scrape_crunchbase(description):
	    """Search Crunchbase for *description* and return details of the top matches.

	    The search-result page is scraped for result cards; for each of the
	    first five usable cards the company's own page is fetched and a fixed
	    set of fields is extracted. Any field that cannot be found on the page
	    is reported as ``'N/A'``.

	    Args:
	        description: Free-text description of the company/services to
	            search for. ``None``, empty, or whitespace-only input returns
	            an empty frame without touching the network.

	    Returns:
	        A ``pandas.DataFrame`` with at most five rows and a fixed column
	        set (``'Url'``, ``'Company Name'``, ... ``'Max Score'``). The
	        columns are present even when there are no rows.

	    Raises:
	        requests.HTTPError: If the search request returns an HTTP error
	            status.
	        requests.RequestException: On connection problems or timeouts.
	    """
	    # Function-scope import so the file's import header does not need to change.
	    from urllib.parse import urljoin

	    base_url = "https://www.crunchbase.com"
	    max_results = 5
	    timeout_seconds = 15  # never let a stalled connection hang the UI
	    missing = 'N/A'

	    # Output columns, in display order.
	    columns = [
	        'Url',
	        'Company Name',
	        'Short Description',
	        'Founded On',
	        'Ipo Status',
	        'Contact Email',
	        'Legal Name',
	        'Website',
	        'City',
	        'Region',
	        'Country',
	        'Continent',
	        'Rank Org Company',
	        'Operating Status',
	        'Last Funding Type',
	        'Total Rounds',
	        'Total Investors',
	        'Total Money Raised USD',
	        'Last Round Money Raised USD',
	        'Most Recent Funding Date',
	        'Industries',
	        'Similar Companies Permalinks',
	        'Min Employees',
	        'Max Employees',
	        'Max Score',
	    ]
	    # Columns whose on-page label text differs from the column name; every
	    # other label-based column is looked up under its own name.
	    label_overrides = {
	        'Founded On': 'Founded',
	        'Ipo Status': 'IPO Status',
	    }

	    # Nothing to search for: return an empty, correctly shaped frame.
	    if description is None or not str(description).strip():
	        return pd.DataFrame(columns=columns)
	    query = str(description).strip()

	    headers = {
	        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
	    }

	    # Passing the query via ``params`` lets requests URL-encode it, so
	    # characters such as '&' or '#' cannot corrupt the query string.
	    response = requests.get(
	        f"{base_url}/textsearch",
	        params={"q": query},
	        headers=headers,
	        timeout=timeout_seconds,
	    )
	    # Surface a blocked/failed search instead of parsing an error page
	    # into an empty table.
	    response.raise_for_status()
	    soup = BeautifulSoup(response.content, 'html.parser')

	    companies = []
	    # NOTE: the selectors below depend on Crunchbase's HTML structure and
	    # may need adjusting if the site markup differs.
	    for item in soup.find_all('div', class_='result-info'):
	        # Only the first five companies are returned, so stop before
	        # fetching detail pages that would be discarded.
	        if len(companies) >= max_results:
	            break

	        name_link = item.find('a', class_='result-name')
	        href = name_link.get('href') if name_link is not None else None
	        if not href:
	            continue  # malformed result card: no link to follow

	        # urljoin copes with both site-relative and absolute hrefs.
	        company_url = urljoin(base_url, href)

	        # Fetch additional details from the company's Crunchbase page.
	        company_response = requests.get(
	            company_url, headers=headers, timeout=timeout_seconds
	        )
	        # A failed detail page still yields a row (name + URL); its other
	        # fields simply come out as 'N/A' rather than being scraped from
	        # an error page.
	        page_html = company_response.content if company_response.ok else b''
	        company_soup = BeautifulSoup(page_html, 'html.parser')

	        meta = company_soup.find('meta', attrs={'name': 'description'})
	        mail_href = _first_href_containing(company_soup, 'mailto:')
	        site_href = _first_href_containing(company_soup, 'http')

	        # Fields that need bespoke extraction.
	        row = {
	            'Url': company_url,
	            'Company Name': name_link.text.strip(),
	            'Short Description': meta.get('content', missing) if meta is not None else missing,
	            'Contact Email': mail_href.replace('mailto:', '') if mail_href else missing,
	            'Website': site_href if site_href else missing,
	        }
	        # Every remaining column is a "<span>Label</span><span>value</span>" lookup.
	        for column in columns:
	            if column not in row:
	                row[column] = _labelled_span_text(
	                    company_soup, label_overrides.get(column, column), missing
	                )

	        companies.append(row)

	    # Explicit columns keep the schema stable even when nothing matched.
	    return pd.DataFrame(companies, columns=columns)

	# Gradio Interface
	def gradio_interface(description):
	    """Gradio callback: run the Crunchbase lookup for *description*.

	    Args:
	        description: Text entered by the user in the Gradio text box.

	    Returns:
	        The DataFrame produced by ``scrape_crunchbase`` for that text,
	        passed through unchanged.
	    """
	    return scrape_crunchbase(description)

	# Launch Gradio App
	iface = gr.Interface(
	    # Presentation: heading and help text shown on the page.
	    title="Crunchbase Company Search",
	    description="Enter a company services description to find the top 5 matching companies on Crunchbase.",
	    # Wiring: one text box in, one dataframe out, handled by gradio_interface.
	    fn=gradio_interface,
	    inputs="text",
	    outputs="dataframe",
	)

	# Runs at import time, exactly as before: start serving the interface.
	iface.launch()