# aggregator.py — startup funding data aggregator (SEC Form D + TechCrunch + Y Combinator)
import feedparser
import json
import requests
import time
import os
import re
from bs4 import BeautifulSoup
from datetime import datetime
# Configuration
# Descriptive User-Agent with contact info — SEC EDGAR requires one for automated access.
USER_AGENT = "Funding Tracker Public Tool (contact@example.com)"
# EDGAR Atom feed of the 100 most recent Form D (exempt offering) filings.
SEC_RSS_URL = "https://www.sec.gov/cgi-bin/browse-edgar?action=getcurrent&type=D&owner=exclude&start=0&count=100&output=atom"
# TechCrunch "startups" category RSS feed.
TC_RSS_URL = "https://techcrunch.com/category/startups/feed/"
# Community-maintained JSON dump of Y Combinator companies (yc-oss project).
YC_DATA_URL = "https://raw.githubusercontent.com/yc-oss/api/main/data/companies.json"
# Shared headers for every outbound HTTP request.
HEADERS = {'User-Agent': USER_AGENT}
def slugify(text):
    """Lowercase *text*, collapse every run of non-alphanumeric characters
    into a single hyphen, and trim hyphens from both ends."""
    collapsed = re.sub(r'[^a-z0-9]+', '-', text.lower())
    return collapsed.strip('-')
def fetch_sec_filings():
    """Fetch recent SEC Form D filings from the EDGAR Atom feed.

    Returns a list of dicts with company name, slug, filing date, link,
    CIK id and a placeholder funding amount (enriched later by
    enrich_sec_filing).  Returns an empty list on any HTTP failure.
    """
    print("Fetching SEC Form D filings...")
    try:
        # Fix: original had no timeout and no handling for network errors,
        # so a hung or refused connection would stall/crash the whole run.
        response = requests.get(SEC_RSS_URL, headers=HEADERS, timeout=30)
    except requests.RequestException:
        return []
    if response.status_code != 200:
        return []
    feed = feedparser.parse(response.text)
    filings = []
    for entry in feed.entries:
        # EDGAR titles look like: "D - Company Name (0001234567) (Filer)"
        if "D" in entry.title or "Form D" in entry.title:
            try:
                parts = entry.title.split(" - ")
                if len(parts) < 2:
                    continue
                company_name = parts[1].split(" (")[0]
                cik = entry.title.split("(")[-1].split(")")[0]
                filings.append({
                    "src": "SEC",
                    "company_name": company_name,
                    "slug": slugify(company_name),
                    "date": entry.updated,
                    "link": entry.link,
                    "id": cik,
                    "type": "Form D",
                    "funding_amount": "Unknown"  # Will be enriched
                })
            # Fix: bare `except:` also swallowed KeyboardInterrupt/SystemExit;
            # only skip entries with missing/odd fields.
            except (AttributeError, IndexError):
                continue
    return filings
def fetch_techcrunch():
    """Fetch funding-related headlines from the TechCrunch startups feed.

    An entry qualifies when its title contains a funding keyword.  The
    company name is a heuristic: everything before " raises " / " funding".
    Returns a list of normalized dicts; empty list on any HTTP failure.
    """
    print("Fetching TechCrunch news...")
    try:
        # Fix: original had no timeout and no handling for network errors.
        response = requests.get(TC_RSS_URL, headers=HEADERS, timeout=30)
    except requests.RequestException:
        return []
    if response.status_code != 200:
        return []
    feed = feedparser.parse(response.text)
    # Fix: the original ran the same re.search twice per entry; compile
    # once and search once.
    amount_re = re.compile(r'\$\d+(?:\.\d+)?(?:M|B|K)?')
    funding_keywords = ("funding", "raise", "seed", "series a", "series b",
                        "series c", "backed")
    news = []
    for entry in feed.entries:
        title_lower = entry.title.lower()
        if not any(keyword in title_lower for keyword in funding_keywords):
            continue
        company_name = entry.title.split(" raises ")[0].split(" funding")[0]
        amount_match = amount_re.search(entry.title)
        news.append({
            "src": "TechCrunch",
            "company_name": company_name,
            "slug": slugify(company_name),
            "date": entry.published if hasattr(entry, 'published') else datetime.now().isoformat(),
            "link": entry.link,
            "summary": BeautifulSoup(entry.summary, 'html.parser').text[:500] + "...",
            "type": "News",
            "funding_amount": amount_match.group(0) if amount_match else "Check Article"
        })
    return news
def fetch_yc_startups():
    """Fetch a sample (first 20 records) of Y Combinator companies from
    the yc-oss dataset.

    Returns a list of normalized dicts; empty list on HTTP failure or
    malformed JSON.  Individual malformed records are skipped instead of
    aborting the whole batch (the original bare ``except`` discarded all
    results if any single record raised).
    """
    print("Fetching YC startups...")
    try:
        # Fix: original had no timeout and no handling for network errors.
        response = requests.get(YC_DATA_URL, headers=HEADERS, timeout=30)
    except requests.RequestException:
        return []
    if response.status_code != 200:
        return []
    try:
        data = response.json()
    except ValueError:
        # Body was not valid JSON.
        return []
    startups = []
    for co in data[:20]:
        name = co.get('name')
        if not name:
            continue  # skip records without a usable company name
        # Fix: co['website'] raised KeyError for records missing both
        # 'slug' and 'website', which the old bare except turned into
        # "return []" — dropping every startup. Fall back to "".
        link = (f"https://www.ycombinator.com/companies/{co['slug']}"
                if 'slug' in co else co.get('website', ''))
        startups.append({
            "src": "Y Combinator",
            "company_name": name,
            "slug": slugify(name),
            "date": datetime.now().isoformat(),
            "link": link,
            "summary": co.get('description', 'YC Startup'),
            "type": "Batch",
            "funding_amount": "YC Standard"
        })
    return startups
def enrich_sec_filing(filing):
    """Enrich a SEC Form D *filing* dict in place from its EDGAR index page.

    Downloads the filing index, locates primary_doc.xml, then fills in
    funding amount, industry summary and business/mailing addresses on
    ``filing``.

    Returns the list of related persons (founders/executives) with
    derived (simulated) contact details, or [] when nothing was found or
    any error occurred.  (Fix: the original implicitly returned None
    when no primary_doc.xml row matched, though callers expect a list.)
    """
    try:
        resp = requests.get(filing['link'], headers=HEADERS, timeout=30)
        soup = BeautifulSoup(resp.text, 'html.parser')
        tables = soup.find_all('table', {'summary': 'Document Format Files'})
        if tables:
            rows = tables[0].find_all('tr')
            for row in rows:
                if 'primary_doc.xml' in row.text:
                    xml_link = "https://www.sec.gov" + row.find('a')['href']
                    xml_resp = requests.get(xml_link, headers=HEADERS, timeout=30)
                    xml_soup = BeautifulSoup(xml_resp.text, 'xml')
                    # Extract Amount
                    total_amt = xml_soup.find('totalOfferingAmount')
                    if total_amt:
                        # Non-numeric values (e.g. "Indefinite") are kept verbatim.
                        filing['funding_amount'] = f"${total_amt.text}" if total_amt.text.isdigit() else total_amt.text
                    # Extract Industry
                    industry = xml_soup.find('industryGroup')
                    if industry:
                        group_type = industry.find('industryGroupType')
                        filing['summary'] = f"Industry: {group_type.text if group_type else 'Miscellaneous'}"

                    # Extract Addresses
                    def get_addr(addr_node):
                        """Format an address node as one line, or 'Not Available'."""
                        if not addr_node:
                            return "Not Available"

                        def part(tag):
                            node = addr_node.find(tag)
                            return node.text if node else ""

                        joined = (f"{part('street1')} {part('street2')}, "
                                  f"{part('city')}, {part('stateOrCountry')} {part('zipCode')}")
                        # Fix: original `.replace(" ", " ")` was a no-op;
                        # collapse whitespace runs left by missing parts.
                        return re.sub(r'\s+', ' ', joined).strip(", ")

                    filing['business_address'] = get_addr(xml_soup.find('businessAddress'))
                    filing['mailing_address'] = get_addr(xml_soup.find('mailingAddress'))

                    founders = []
                    for p in xml_soup.find_all('relatedPersonInfo'):
                        first = p.find('firstName').text if p.find('firstName') else ""
                        last = p.find('lastName').text if p.find('lastName') else ""
                        title = p.find('relationshipRole').text if p.find('relationshipRole') else "Executive"
                        # Derived contacts (simulated as requested before)
                        handle = (first + last).lower().replace(" ", "")
                        email = f"{handle}@{filing['company_name'].lower().split(' ')[0]}.com" if first and last else ""
                        x_account = f"https://x.com/{handle}" if first and last else ""
                        founders.append({
                            "name": f"{first} {last}",
                            "title": title,
                            "email": email,
                            "x_account": x_account
                        })
                    return founders
        # No document table or no primary_doc.xml row — still return a list.
        return []
    except Exception as e:
        # Broad catch is deliberate at this boundary: enrichment is
        # best-effort and must not abort the aggregation run.
        print(f"Error enriching {filing['company_name']}: {e}")
        return []
def main():
    """Aggregate SEC, TechCrunch and YC data into a single deduplicated,
    date-sorted list and write it to data.json (locally, and to the
    frontend public folder when that directory exists).
    """
    from email.utils import parsedate_to_datetime

    def date_key(item):
        """Best-effort sort key for mixed date formats.

        SEC/YC dates are ISO-8601 while TechCrunch's are RFC-2822, so the
        original lexicographic string sort interleaved them incorrectly.
        Parse with either parser and compare as POSIX timestamps;
        unparseable dates sort last (oldest).
        """
        raw = item.get('date', '')
        for parse in (datetime.fromisoformat, parsedate_to_datetime):
            try:
                return parse(raw).timestamp()
            except (ValueError, TypeError):
                continue
        return 0.0

    all_data = []
    sec = fetch_sec_filings()[:50]
    tc = fetch_techcrunch()[:30]
    yc = fetch_yc_startups()[:50]
    for filing in sec:
        print(f"Enriching {filing['company_name']}...")
        filing['founders'] = enrich_sec_filing(filing)
        all_data.append(filing)
        time.sleep(0.5)  # rate-limit: be polite to sec.gov
    for item in tc + yc:
        item['founders'] = []
        all_data.append(item)
    all_data.sort(key=date_key, reverse=True)
    # Deduplicate by slug; after the sort, the first occurrence is the newest.
    seen = set()
    deduped = []
    for d in all_data:
        if d['slug'] not in seen:
            deduped.append(d)
            seen.add(d['slug'])
    # Sync to Frontend Public Folder
    # We try to find the web/public path relative to this script
    script_dir = os.path.dirname(os.path.abspath(__file__))
    frontend_public_path = os.path.join(script_dir, "..", "web", "public", "data.json")
    # Save locally and to frontend
    paths_to_save = ["data.json"]
    if os.path.exists(os.path.dirname(frontend_public_path)):
        paths_to_save.append(frontend_public_path)
    for path in paths_to_save:
        # Fix: explicit encoding so output is identical across platforms
        # (the default encoding is locale-dependent on Windows).
        with open(path, "w", encoding="utf-8") as out:
            json.dump(deduped, out, indent=4)
        print(f"Success! Aggregated {len(deduped)} startups into {path}")


if __name__ == "__main__":
    main()