Spaces:
Sleeping
Sleeping
| import requests | |
| from bs4 import BeautifulSoup | |
| import pandas as pd | |
| import gradio as gr | |
# Value stored for any field that cannot be located on the page.
_NOT_AVAILABLE = 'N/A'

# Labels whose output column carries exactly the same name as the label text
# searched for on the page, listed in output-column order.
_SAME_NAME_LABELS = (
    'City',
    'Region',
    'Country',
    'Continent',
    'Rank Org Company',
    'Operating Status',
    'Last Funding Type',
    'Total Rounds',
    'Total Investors',
    'Total Money Raised USD',
    'Last Round Money Raised USD',
    'Most Recent Funding Date',
    'Industries',
    'Similar Companies Permalinks',
    'Min Employees',
    'Max Employees',
    'Max Score',
)


def _labelled_value(soup, label):
    """Return the stripped text of the first <span> after the <span> reading *label*.

    Falls back to 'N/A' when either the label span or a following span is
    missing, so a partially rendered page never raises.
    """
    label_tag = soup.find('span', string=label)
    if label_tag is None:
        return _NOT_AVAILABLE
    value_tag = label_tag.find_next('span')
    if value_tag is None:
        return _NOT_AVAILABLE
    return value_tag.text.strip()


def _first_href_containing(soup, marker):
    """Return the href of the first <a> whose href contains *marker*, or 'N/A'."""
    anchor = soup.find('a', href=lambda href: href and marker in href)
    if anchor is None:
        return _NOT_AVAILABLE
    return anchor['href']


def scrape_crunchbase(url: str, timeout: float = 30) -> pd.DataFrame:
    """Fetch *url* and extract company fields into a one-row DataFrame.

    Args:
        url: Address of the page to download and parse.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely.

    Returns:
        A DataFrame with a single row. Columns are 'Url', 'Company Name',
        'Short Description', 'Founded On', 'Ipo Status', 'Contact Email',
        'Legal Name', 'Website', followed by the names in
        ``_SAME_NAME_LABELS``. Any field not found on the page is 'N/A'.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status. Raising here avoids
            scraping an error page as if it were a company profile.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # Each element is looked up once and reused, instead of once for the
    # existence check and again for the value.
    heading = soup.find('h1')
    company_name = heading.text.strip() if heading is not None else _NOT_AVAILABLE

    description_tag = soup.find('meta', attrs={'name': 'description'})
    if description_tag is None:
        short_description = _NOT_AVAILABLE
    else:
        # .get() tolerates a description <meta> without a content attribute.
        short_description = description_tag.get('content', _NOT_AVAILABLE)

    contact_email = _first_href_containing(soup, 'mailto:').replace('mailto:', '')

    # NOTE(review): this is simply the first link on the page whose href
    # contains 'http'; it is not guaranteed to be the company's own site.
    website = _first_href_containing(soup, 'http')

    # Key order defines the DataFrame column order, so it is kept explicit.
    data = {
        'Url': url,
        'Company Name': company_name,
        'Short Description': short_description,
        'Founded On': _labelled_value(soup, 'Founded'),
        'Ipo Status': _labelled_value(soup, 'IPO Status'),
        'Contact Email': contact_email,
        'Legal Name': _labelled_value(soup, 'Legal Name'),
        'Website': website,
    }
    data.update(
        {label: _labelled_value(soup, label) for label in _SAME_NAME_LABELS}
    )

    return pd.DataFrame([data])
def scrape_and_display(url):
    """Gradio callback: return the scraped DataFrame for *url*.

    Delegates entirely to ``scrape_crunchbase`` and hands its result back
    unchanged.
    """
    return scrape_crunchbase(url)
# Create a Gradio interface: one text input (the URL to scrape) and one
# dataframe output (the row returned by scrape_and_display).
iface = gr.Interface(
    fn=scrape_and_display,
    inputs="text",
    outputs="dataframe",
    title="Crunchbase Scraper",
    description="Enter a Crunchbase URL to scrape company information.",
)

# Launch only when this file is executed as a script, so that importing the
# module (for example to reuse the scraper) does not start a web server.
if __name__ == "__main__":
    iface.launch()