jaothan committed on
Commit
986bd24
·
verified ·
1 Parent(s): 91a8fa2

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +98 -0
app.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ from bs4 import BeautifulSoup
3
+ import pandas as pd
4
+ import gradio as gr
5
+
6
+ # Function to scrape Crunchbase for companies matching the description
7
_CRUNCHBASE_BASE_URL = "https://www.crunchbase.com"
_REQUEST_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}
_MISSING = "N/A"

# Output column -> text of the label <span>; the value is read from the
# <span> that follows the label in the company page.
_LABELLED_FIELDS = {
    "Founded On": "Founded",
    "Ipo Status": "IPO Status",
    "Legal Name": "Legal Name",
    "City": "City",
    "Region": "Region",
    "Country": "Country",
    "Continent": "Continent",
    "Rank Org Company": "Rank Org Company",
    "Operating Status": "Operating Status",
    "Last Funding Type": "Last Funding Type",
    "Total Rounds": "Total Rounds",
    "Total Investors": "Total Investors",
    "Total Money Raised USD": "Total Money Raised USD",
    "Last Round Money Raised USD": "Last Round Money Raised USD",
    "Most Recent Funding Date": "Most Recent Funding Date",
    "Industries": "Industries",
    "Similar Companies Permalinks": "Similar Companies Permalinks",
    "Min Employees": "Min Employees",
    "Max Employees": "Max Employees",
    "Max Score": "Max Score",
}

# Column order of the returned DataFrame (also used when there are no rows,
# so the table keeps a stable schema).
_COLUMNS = (
    "Url",
    "Company Name",
    "Short Description",
    "Founded On",
    "Ipo Status",
    "Contact Email",
    "Legal Name",
    "Website",
    "City",
    "Region",
    "Country",
    "Continent",
    "Rank Org Company",
    "Operating Status",
    "Last Funding Type",
    "Total Rounds",
    "Total Investors",
    "Total Money Raised USD",
    "Last Round Money Raised USD",
    "Most Recent Funding Date",
    "Industries",
    "Similar Companies Permalinks",
    "Min Employees",
    "Max Employees",
    "Max Score",
)


def _labelled_value(soup, label):
    """Return the stripped text of the <span> following the <span> whose text is *label*."""
    label_tag = soup.find("span", string=label)
    if label_tag is None:
        return _MISSING
    value_tag = label_tag.find_next("span")
    if value_tag is None:
        return _MISSING
    return value_tag.text.strip()


def _first_href_containing(soup, needle):
    """Return the href of the first <a> whose href contains *needle*, or None."""
    link = soup.find("a", href=lambda href: href and needle in href)
    return None if link is None else link["href"]


def _company_details(soup):
    """Extract every per-company column (except URL and name) from a parsed company page."""
    details = {
        column: _labelled_value(soup, label)
        for column, label in _LABELLED_FIELDS.items()
    }

    meta = soup.find("meta", attrs={"name": "description"})
    details["Short Description"] = (
        meta.get("content", _MISSING) if meta is not None else _MISSING
    )

    mailto = _first_href_containing(soup, "mailto:")
    details["Contact Email"] = (
        mailto.replace("mailto:", "") if mailto is not None else _MISSING
    )

    # NOTE(review): this takes the first link on the page whose href contains
    # "http", which is not necessarily the company's own site - confirm
    # against the real page markup.
    website = _first_href_containing(soup, "http")
    details["Website"] = website if website is not None else _MISSING
    return details


def scrape_crunchbase(description, max_results=5, timeout=10):
    """Search Crunchbase for *description* and return details of the top matches.

    Args:
        description: Free-text description used as the search query.
        max_results: Maximum number of companies to return. Only this many
            company pages are fetched.
        timeout: Per-request timeout in seconds.

    Returns:
        A pandas DataFrame with one row per company and the columns listed in
        ``_COLUMNS``. Fields that cannot be found on the page are ``'N/A'``.
        The frame is empty (but keeps its columns) when the query is blank or
        nothing matches.

    Raises:
        requests.HTTPError: If the search page responds with an error status.
        requests.RequestException: On connection failures or timeouts.
    """
    query = (description or "").strip()
    if not query:
        return pd.DataFrame(columns=list(_COLUMNS))

    rows = []
    with requests.Session() as session:
        session.headers.update(_REQUEST_HEADERS)

        # Let requests build the query string so spaces, '&', '#', ... in the
        # user's text are percent-encoded instead of corrupting the URL.
        search_response = session.get(
            _CRUNCHBASE_BASE_URL + "/textsearch",
            params={"q": query},
            timeout=timeout,
        )
        # Surface a blocked/failed search instead of parsing an error page
        # and silently reporting "no results".
        search_response.raise_for_status()
        search_soup = BeautifulSoup(search_response.content, "html.parser")

        # Adjust the selectors below based on Crunchbase's HTML structure.
        for item in search_soup.find_all("div", class_="result-info"):
            # Stop before fetching pages whose rows would be discarded anyway.
            if len(rows) >= max_results:
                break

            name_link = item.find("a", class_="result-name")
            if name_link is None or not name_link.get("href"):
                continue  # Result card without a usable company link.

            company_url = _CRUNCHBASE_BASE_URL + name_link["href"]
            company_response = session.get(company_url, timeout=timeout)
            # A failed detail page yields 'N/A' fields rather than values
            # scraped from an error page or an aborted search.
            page_html = company_response.content if company_response.ok else b""
            company_soup = BeautifulSoup(page_html, "html.parser")

            row = {
                "Url": company_url,
                "Company Name": name_link.text.strip(),
            }
            row.update(_company_details(company_soup))
            rows.append(row)

    return pd.DataFrame(rows, columns=list(_COLUMNS))
83
+
84
+ # Gradio Interface
85
def gradio_interface(description):
    """Gradio callback: look up companies matching *description*.

    Delegates to ``scrape_crunchbase`` and returns its DataFrame unchanged.
    """
    return scrape_crunchbase(description)
88
+
89
+ # Launch Gradio App
90
# One free-text input wired to gradio_interface, rendered as a table.
_interface_options = {
    "fn": gradio_interface,
    "inputs": "text",
    "outputs": "dataframe",
    "title": "Crunchbase Company Search",
    "description": "Enter a company services description to find the top 5 matching companies on Crunchbase.",
}
iface = gr.Interface(**_interface_options)

# Start serving as soon as this module is executed.
iface.launch()