Spaces:

jaothan
/

crunchbase_test1

Sleeping

App Files Files Community

crunchbase_test1 / app.py

jaothan

Rename app1.py to app.py

14aeb5f verified 12 months ago

raw

history blame contribute delete

5.34 kB

	import requests
	from bs4 import BeautifulSoup
	import pandas as pd
	import gradio as gr

	# Seconds to wait for the server before giving up; without a timeout
	# requests.get() can block forever on a stalled connection.
	_REQUEST_TIMEOUT_SECONDS = 30


	def _text_after_label(soup, label):
	    """Return the text of the <span> that follows the <span> reading *label*.

	    Looks up the first <span> whose string is exactly *label*, then takes
	    the next <span> in document order and returns its stripped text.
	    Returns 'N/A' when either span is missing.
	    """
	    label_span = soup.find('span', string=label)
	    if label_span is None:
	        return 'N/A'
	    value_span = label_span.find_next('span')
	    if value_span is None:
	        # The label was the last <span> in the document.
	        return 'N/A'
	    return value_span.text.strip()


	def scrape_crunchbase(url):
	    """Fetch *url* and extract company details into a one-row DataFrame.

	    The page is parsed with BeautifulSoup's 'html.parser'. Each field is
	    looked up independently, and any field that cannot be found in the
	    markup is reported as the string 'N/A'.

	    Args:
	        url: Address of the page to download and parse.

	    Returns:
	        A pandas DataFrame with a single row whose columns are 'Url',
	        'Company Name', 'Short Description', the labelled detail fields,
	        'Contact Email' and 'Website'.

	    Raises:
	        requests.RequestException: If the request fails, times out, or
	            the server answers with an HTTP error status.
	    """
	    response = requests.get(url, timeout=_REQUEST_TIMEOUT_SECONDS)
	    # Fail loudly on 4xx/5xx instead of scraping an error page as if it
	    # were a company profile.
	    response.raise_for_status()
	    soup = BeautifulSoup(response.content, 'html.parser')

	    # Run each structural lookup once and reuse the result.
	    heading = soup.find('h1')
	    description_meta = soup.find('meta', attrs={'name': 'description'})
	    email_link = soup.find('a', href=lambda href: href and 'mailto:' in href)
	    # First anchor whose href contains 'http' anywhere in the document.
	    website_link = soup.find('a', href=lambda href: href and 'http' in href)

	    company_name = heading.text.strip() if heading is not None else 'N/A'
	    # .get() guards against a <meta name="description"> without content.
	    short_description = (
	        description_meta.get('content', 'N/A')
	        if description_meta is not None
	        else 'N/A'
	    )
	    contact_email = (
	        email_link['href'].replace('mailto:', '')
	        if email_link is not None
	        else 'N/A'
	    )
	    website = website_link['href'] if website_link is not None else 'N/A'

	    # Key order defines the column order of the resulting DataFrame.
	    data = {
	        'Url': url,
	        'Company Name': company_name,
	        'Short Description': short_description,
	        'Founded On': _text_after_label(soup, 'Founded'),
	        'Ipo Status': _text_after_label(soup, 'IPO Status'),
	        'Contact Email': contact_email,
	        'Legal Name': _text_after_label(soup, 'Legal Name'),
	        'Website': website,
	        'City': _text_after_label(soup, 'City'),
	        'Region': _text_after_label(soup, 'Region'),
	        'Country': _text_after_label(soup, 'Country'),
	        'Continent': _text_after_label(soup, 'Continent'),
	        'Rank Org Company': _text_after_label(soup, 'Rank Org Company'),
	        'Operating Status': _text_after_label(soup, 'Operating Status'),
	        'Last Funding Type': _text_after_label(soup, 'Last Funding Type'),
	        'Total Rounds': _text_after_label(soup, 'Total Rounds'),
	        'Total Investors': _text_after_label(soup, 'Total Investors'),
	        'Total Money Raised USD': _text_after_label(soup, 'Total Money Raised USD'),
	        'Last Round Money Raised USD': _text_after_label(soup, 'Last Round Money Raised USD'),
	        'Most Recent Funding Date': _text_after_label(soup, 'Most Recent Funding Date'),
	        'Industries': _text_after_label(soup, 'Industries'),
	        'Similar Companies Permalinks': _text_after_label(soup, 'Similar Companies Permalinks'),
	        'Min Employees': _text_after_label(soup, 'Min Employees'),
	        'Max Employees': _text_after_label(soup, 'Max Employees'),
	        'Max Score': _text_after_label(soup, 'Max Score'),
	    }

	    # A list holding one dict yields a single-row DataFrame.
	    return pd.DataFrame([data])

	def scrape_and_display(url):
	    """Gradio callback: scrape *url* and return the resulting DataFrame.

	    Delegates entirely to scrape_crunchbase and hands its result back
	    unchanged.
	    """
	    return scrape_crunchbase(url)

	# Wire the scraper into a Gradio UI: a text input feeds
	# scrape_and_display and its result is rendered as a dataframe.
	iface = gr.Interface(
	    scrape_and_display,  # fn
	    "text",  # inputs
	    "dataframe",  # outputs
	    title="Crunchbase Scraper",
	    description="Enter a Crunchbase URL to scrape company information.",
	)

	# Start serving the interface.
	iface.launch()