# aggregator.py — startup funding data aggregator (SEC Form D + TechCrunch + Y Combinator)
import feedparser
import json
import requests
import time
import os
import re
from bs4 import BeautifulSoup
from datetime import datetime
# Configuration
# Descriptive User-Agent with contact info — SEC EDGAR requires one for automated access.
USER_AGENT = "Funding Tracker Public Tool (contact@example.com)"
# EDGAR Atom feed of the 100 most recent Form D (exempt offering) filings.
SEC_RSS_URL = "https://www.sec.gov/cgi-bin/browse-edgar?action=getcurrent&type=D&owner=exclude&start=0&count=100&output=atom"
# TechCrunch "startups" category RSS feed.
TC_RSS_URL = "https://techcrunch.com/category/startups/feed/"
# Community-maintained JSON dump of Y Combinator companies (yc-oss project).
YC_DATA_URL = "https://raw.githubusercontent.com/yc-oss/api/main/data/companies.json"
# Shared headers for every outbound HTTP request.
HEADERS = {'User-Agent': USER_AGENT}
def slugify(text):
    """Lowercase *text*, collapse every run of non-alphanumeric characters
    into a single hyphen, and trim hyphens from both ends."""
    collapsed = re.sub(r'[^a-z0-9]+', '-', text.lower())
    return collapsed.strip('-')
def fetch_sec_filings():
    """Fetch recent SEC Form D filings from the EDGAR Atom feed.

    Returns a list of dicts with company name, slug, filing date, link,
    CIK id and a placeholder funding amount (enriched later by
    enrich_sec_filing).  Returns an empty list on any HTTP failure.
    """
    print("Fetching SEC Form D filings...")
    try:
        # Fix: original had no timeout and no handling for network errors,
        # so a hung or refused connection would stall/crash the whole run.
        response = requests.get(SEC_RSS_URL, headers=HEADERS, timeout=30)
    except requests.RequestException:
        return []
    if response.status_code != 200:
        return []
    feed = feedparser.parse(response.text)
    filings = []
    for entry in feed.entries:
        # EDGAR titles look like: "D - Company Name (0001234567) (Filer)"
        if "D" in entry.title or "Form D" in entry.title:
            try:
                parts = entry.title.split(" - ")
                if len(parts) < 2:
                    continue
                company_name = parts[1].split(" (")[0]
                cik = entry.title.split("(")[-1].split(")")[0]
                filings.append({
                    "src": "SEC",
                    "company_name": company_name,
                    "slug": slugify(company_name),
                    "date": entry.updated,
                    "link": entry.link,
                    "id": cik,
                    "type": "Form D",
                    "funding_amount": "Unknown"  # Will be enriched
                })
            # Fix: bare `except:` also swallowed KeyboardInterrupt/SystemExit;
            # only skip entries with missing/odd fields.
            except (AttributeError, IndexError):
                continue
    return filings
def fetch_techcrunch():
    """Fetch funding-related headlines from the TechCrunch startups feed.

    An entry qualifies when its title contains a funding keyword.  The
    company name is a heuristic: everything before " raises " / " funding".
    Returns a list of normalized dicts; empty list on any HTTP failure.
    """
    print("Fetching TechCrunch news...")
    try:
        # Fix: original had no timeout and no handling for network errors.
        response = requests.get(TC_RSS_URL, headers=HEADERS, timeout=30)
    except requests.RequestException:
        return []
    if response.status_code != 200:
        return []
    feed = feedparser.parse(response.text)
    # Fix: the original ran the same re.search twice per entry; compile
    # once and search once.
    amount_re = re.compile(r'\$\d+(?:\.\d+)?(?:M|B|K)?')
    funding_keywords = ("funding", "raise", "seed", "series a", "series b",
                        "series c", "backed")
    news = []
    for entry in feed.entries:
        title_lower = entry.title.lower()
        if not any(keyword in title_lower for keyword in funding_keywords):
            continue
        company_name = entry.title.split(" raises ")[0].split(" funding")[0]
        amount_match = amount_re.search(entry.title)
        news.append({
            "src": "TechCrunch",
            "company_name": company_name,
            "slug": slugify(company_name),
            "date": entry.published if hasattr(entry, 'published') else datetime.now().isoformat(),
            "link": entry.link,
            "summary": BeautifulSoup(entry.summary, 'html.parser').text[:500] + "...",
            "type": "News",
            "funding_amount": amount_match.group(0) if amount_match else "Check Article"
        })
    return news
def fetch_yc_startups():
    """Fetch a sample (first 20 records) of Y Combinator companies from
    the yc-oss dataset.

    Returns a list of normalized dicts; empty list on HTTP failure or
    malformed JSON.  Individual malformed records are skipped instead of
    aborting the whole batch (the original bare ``except`` discarded all
    results if any single record raised).
    """
    print("Fetching YC startups...")
    try:
        # Fix: original had no timeout and no handling for network errors.
        response = requests.get(YC_DATA_URL, headers=HEADERS, timeout=30)
    except requests.RequestException:
        return []
    if response.status_code != 200:
        return []
    try:
        data = response.json()
    except ValueError:
        # Body was not valid JSON.
        return []
    startups = []
    for co in data[:20]:
        name = co.get('name')
        if not name:
            continue  # skip records without a usable company name
        # Fix: co['website'] raised KeyError for records missing both
        # 'slug' and 'website', which the old bare except turned into
        # "return []" — dropping every startup. Fall back to "".
        link = (f"https://www.ycombinator.com/companies/{co['slug']}"
                if 'slug' in co else co.get('website', ''))
        startups.append({
            "src": "Y Combinator",
            "company_name": name,
            "slug": slugify(name),
            "date": datetime.now().isoformat(),
            "link": link,
            "summary": co.get('description', 'YC Startup'),
            "type": "Batch",
            "funding_amount": "YC Standard"
        })
    return startups
def enrich_sec_filing(filing):
    """Enrich a SEC Form D *filing* dict in place from its EDGAR index page.

    Downloads the filing index, locates primary_doc.xml, then fills in
    funding amount, industry summary and business/mailing addresses on
    ``filing``.

    Returns the list of related persons (founders/executives) with
    derived (simulated) contact details, or [] when nothing was found or
    any error occurred.  (Fix: the original implicitly returned None
    when no primary_doc.xml row matched, though callers expect a list.)
    """
    try:
        resp = requests.get(filing['link'], headers=HEADERS, timeout=30)
        soup = BeautifulSoup(resp.text, 'html.parser')
        tables = soup.find_all('table', {'summary': 'Document Format Files'})
        if tables:
            rows = tables[0].find_all('tr')
            for row in rows:
                if 'primary_doc.xml' in row.text:
                    xml_link = "https://www.sec.gov" + row.find('a')['href']
                    xml_resp = requests.get(xml_link, headers=HEADERS, timeout=30)
                    xml_soup = BeautifulSoup(xml_resp.text, 'xml')
                    # Extract Amount
                    total_amt = xml_soup.find('totalOfferingAmount')
                    if total_amt:
                        # Non-numeric values (e.g. "Indefinite") are kept verbatim.
                        filing['funding_amount'] = f"${total_amt.text}" if total_amt.text.isdigit() else total_amt.text
                    # Extract Industry
                    industry = xml_soup.find('industryGroup')
                    if industry:
                        group_type = industry.find('industryGroupType')
                        filing['summary'] = f"Industry: {group_type.text if group_type else 'Miscellaneous'}"

                    # Extract Addresses
                    def get_addr(addr_node):
                        """Format an address node as one line, or 'Not Available'."""
                        if not addr_node:
                            return "Not Available"

                        def part(tag):
                            node = addr_node.find(tag)
                            return node.text if node else ""

                        joined = (f"{part('street1')} {part('street2')}, "
                                  f"{part('city')}, {part('stateOrCountry')} {part('zipCode')}")
                        # Fix: original `.replace(" ", " ")` was a no-op;
                        # collapse whitespace runs left by missing parts.
                        return re.sub(r'\s+', ' ', joined).strip(", ")

                    filing['business_address'] = get_addr(xml_soup.find('businessAddress'))
                    filing['mailing_address'] = get_addr(xml_soup.find('mailingAddress'))

                    founders = []
                    for p in xml_soup.find_all('relatedPersonInfo'):
                        first = p.find('firstName').text if p.find('firstName') else ""
                        last = p.find('lastName').text if p.find('lastName') else ""
                        title = p.find('relationshipRole').text if p.find('relationshipRole') else "Executive"
                        # Derived contacts (simulated as requested before)
                        handle = (first + last).lower().replace(" ", "")
                        email = f"{handle}@{filing['company_name'].lower().split(' ')[0]}.com" if first and last else ""
                        x_account = f"https://x.com/{handle}" if first and last else ""
                        founders.append({
                            "name": f"{first} {last}",
                            "title": title,
                            "email": email,
                            "x_account": x_account
                        })
                    return founders
        # No document table or no primary_doc.xml row — still return a list.
        return []
    except Exception as e:
        # Broad catch is deliberate at this boundary: enrichment is
        # best-effort and must not abort the aggregation run.
        print(f"Error enriching {filing['company_name']}: {e}")
        return []
def main():
    """Aggregate SEC, TechCrunch and YC data into a single deduplicated,
    date-sorted list and write it to data.json (locally, and to the
    frontend public folder when that directory exists).
    """
    from email.utils import parsedate_to_datetime

    def date_key(item):
        """Best-effort sort key for mixed date formats.

        SEC/YC dates are ISO-8601 while TechCrunch's are RFC-2822, so the
        original lexicographic string sort interleaved them incorrectly.
        Parse with either parser and compare as POSIX timestamps;
        unparseable dates sort last (oldest).
        """
        raw = item.get('date', '')
        for parse in (datetime.fromisoformat, parsedate_to_datetime):
            try:
                return parse(raw).timestamp()
            except (ValueError, TypeError):
                continue
        return 0.0

    all_data = []
    sec = fetch_sec_filings()[:50]
    tc = fetch_techcrunch()[:30]
    yc = fetch_yc_startups()[:50]
    for filing in sec:
        print(f"Enriching {filing['company_name']}...")
        filing['founders'] = enrich_sec_filing(filing)
        all_data.append(filing)
        time.sleep(0.5)  # rate-limit: be polite to sec.gov
    for item in tc + yc:
        item['founders'] = []
        all_data.append(item)
    all_data.sort(key=date_key, reverse=True)
    # Deduplicate by slug; after the sort, the first occurrence is the newest.
    seen = set()
    deduped = []
    for d in all_data:
        if d['slug'] not in seen:
            deduped.append(d)
            seen.add(d['slug'])
    # Sync to Frontend Public Folder
    # We try to find the web/public path relative to this script
    script_dir = os.path.dirname(os.path.abspath(__file__))
    frontend_public_path = os.path.join(script_dir, "..", "web", "public", "data.json")
    # Save locally and to frontend
    paths_to_save = ["data.json"]
    if os.path.exists(os.path.dirname(frontend_public_path)):
        paths_to_save.append(frontend_public_path)
    for path in paths_to_save:
        # Fix: explicit encoding so output is identical across platforms
        # (the default encoding is locale-dependent on Windows).
        with open(path, "w", encoding="utf-8") as out:
            json.dump(deduped, out, indent=4)
        print(f"Success! Aggregated {len(deduped)} startups into {path}")


if __name__ == "__main__":
    main()