jaothan committed on
Commit
c08f144
·
verified ·
1 Parent(s): cdab4ac

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +83 -0
  2. requirements.txt +4 -0
app.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ from bs4 import BeautifulSoup
3
+ import pandas as pd
4
+ import gradio as gr
5
+
6
_NOT_AVAILABLE = 'N/A'


def _text_or_na(tag):
    """Return the stripped text of *tag*, or 'N/A' when *tag* is None."""
    return tag.text.strip() if tag is not None else _NOT_AVAILABLE


def _labelled_value(soup, label):
    """Return the text of the first <span> following the <span> whose string is *label*.

    Yields 'N/A' when either the label span or a following span is missing.
    """
    label_tag = soup.find('span', string=label)
    if label_tag is None:
        return _NOT_AVAILABLE
    # find_next() returns None when nothing follows the label; guard against
    # that instead of failing on ``None.text``.
    return _text_or_na(label_tag.find_next('span'))


def scrape_crunchbase(url, timeout=10):
    """Fetch *url* and extract company fields into a one-row DataFrame.

    Each field is looked up in the parsed HTML; any field that cannot be
    found is reported as the string 'N/A'.

    Args:
        url: Address of the page to download and parse.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely.

    Returns:
        A ``pandas.DataFrame`` with a single row and one column per field.

    Raises:
        requests.exceptions.RequestException: If the request itself fails
            (invalid URL, connection error, timeout).
    """
    # NOTE: the HTTP status code is not checked, so an error page is parsed
    # like any other page and will mostly produce 'N/A' values.
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Look each element up once and reuse the result.
    description_tag = soup.find('meta', attrs={'name': 'description'})
    email_tag = soup.find('a', href=lambda href: href and 'mailto:' in href)
    # First anchor whose href contains 'http'.
    # NOTE(review): this may not be the company's own site -- confirm.
    website_tag = soup.find('a', href=lambda href: href and 'http' in href)

    def field(label):
        return _labelled_value(soup, label)

    data = {
        'Url': url,
        'Company Name': _text_or_na(soup.find('h1')),
        'Short Description': (
            description_tag.get('content', _NOT_AVAILABLE)
            if description_tag is not None
            else _NOT_AVAILABLE
        ),
        'Founded On': field('Founded'),
        'Ipo Status': field('IPO Status'),
        'Contact Email': (
            email_tag['href'].replace('mailto:', '')
            if email_tag is not None
            else _NOT_AVAILABLE
        ),
        'Legal Name': field('Legal Name'),
        'Website': website_tag['href'] if website_tag is not None else _NOT_AVAILABLE,
        'City': field('City'),
        'Region': field('Region'),
        'Country': field('Country'),
        'Continent': field('Continent'),
        'Rank Org Company': field('Rank Org Company'),
        'Operating Status': field('Operating Status'),
        'Last Funding Type': field('Last Funding Type'),
        'Total Rounds': field('Total Rounds'),
        'Total Investors': field('Total Investors'),
        'Total Money Raised USD': field('Total Money Raised USD'),
        'Last Round Money Raised USD': field('Last Round Money Raised USD'),
        'Most Recent Funding Date': field('Most Recent Funding Date'),
        'Industries': field('Industries'),
        'Similar Companies Permalinks': field('Similar Companies Permalinks'),
        'Min Employees': field('Min Employees'),
        'Max Employees': field('Max Employees'),
        'Max Score': field('Max Score'),
    }

    # A list holding one dict becomes a single-row DataFrame.
    return pd.DataFrame([data])
69
+
70
def scrape_and_display(url):
    """Scrape *url* with ``scrape_crunchbase`` and return its DataFrame unchanged."""
    return scrape_crunchbase(url)
73
+
74
# Wire the scraper into a Gradio UI: one text input (the URL) and one
# dataframe output (the scraped fields).
iface = gr.Interface(
    title="Crunchbase Scraper",
    description="Enter a Crunchbase URL to scrape company information.",
    fn=scrape_and_display,
    inputs="text",
    outputs="dataframe",
)

# Start serving the interface.
iface.launch()
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ requests
2
+ beautifulsoup4
3
+ pandas
4
+ gradio