"""Aggregate startup funding signals from SEC EDGAR, TechCrunch and Y Combinator.

Fetches recent Form D filings, funding headlines and YC batch companies,
enriches SEC filings with offering amount / addresses / related persons from
the primary_doc.xml, deduplicates by company slug, and writes the merged list
to ``data.json`` (and the frontend's public folder when it exists).
"""

import json
import os
import re
import time
from datetime import datetime, timezone
from email.utils import parsedate_to_datetime

import feedparser
import requests
from bs4 import BeautifulSoup

# --- Configuration -----------------------------------------------------------
# SEC requires a descriptive User-Agent containing a contact address.
USER_AGENT = "Funding Tracker Public Tool (contact@example.com)"
SEC_RSS_URL = (
    "https://www.sec.gov/cgi-bin/browse-edgar?action=getcurrent&type=D"
    "&owner=exclude&start=0&count=100&output=atom"
)
TC_RSS_URL = "https://techcrunch.com/category/startups/feed/"
YC_DATA_URL = "https://raw.githubusercontent.com/yc-oss/api/main/data/companies.json"
HEADERS = {'User-Agent': USER_AGENT}
# A hung socket must not stall the whole aggregation run.
REQUEST_TIMEOUT = 30  # seconds

# Compiled once: dollar amounts such as "$12M", "$1.5B", "$500K" in headlines.
_AMOUNT_RE = re.compile(r'\$\d+(?:\.\d+)?(?:M|B|K)?')


def slugify(text):
    """Lower-case *text* and collapse every non-alphanumeric run into a hyphen.

    Used as the cross-source deduplication key, e.g. "Acme Corp!" -> "acme-corp".
    """
    text = text.lower()
    text = re.sub(r'[^a-z0-9]+', '-', text)
    return text.strip('-')


def _parse_date(value):
    """Best-effort parse of an RFC 2822 (RSS/Atom) or ISO-8601 date string.

    Returns a naive UTC ``datetime`` so entries from different feeds sort
    chronologically; unparseable values sort last (``datetime.min``).
    """
    for parse in (parsedate_to_datetime, datetime.fromisoformat):
        try:
            dt = parse(value)
        except (TypeError, ValueError):
            continue
        if dt.tzinfo is not None:
            # Normalize aware datetimes so naive/aware values stay comparable.
            dt = dt.astimezone(timezone.utc).replace(tzinfo=None)
        return dt
    return datetime.min


def fetch_sec_filings():
    """Return recent SEC Form D filings parsed from the EDGAR Atom feed.

    Each item is a dict with src/company_name/slug/date/link/id/type keys;
    ``funding_amount`` starts as "Unknown" and is filled by enrich_sec_filing().
    """
    print("Fetching SEC Form D filings...")
    response = requests.get(SEC_RSS_URL, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    if response.status_code != 200:
        return []
    feed = feedparser.parse(response.text)
    filings = []
    for entry in feed.entries:
        if "D" in entry.title or "Form D" in entry.title:
            try:
                # Titles look like "D - Company Name (0001234567) (Filer)".
                parts = entry.title.split(" - ")
                if len(parts) < 2:
                    continue
                company_name = parts[1].split(" (")[0]
                cik = entry.title.split("(")[-1].split(")")[0]
            except (IndexError, AttributeError):
                # Skip entries whose title doesn't match the expected shape.
                continue
            filings.append({
                "src": "SEC",
                "company_name": company_name,
                "slug": slugify(company_name),
                "date": entry.updated,
                "link": entry.link,
                "id": cik,
                "type": "Form D",
                "funding_amount": "Unknown",  # Will be enriched
            })
    return filings


def fetch_techcrunch():
    """Return funding-related stories from the TechCrunch startups RSS feed."""
    print("Fetching TechCrunch news...")
    response = requests.get(TC_RSS_URL, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    if response.status_code != 200:
        return []
    feed = feedparser.parse(response.text)
    keywords = ("funding", "raise", "seed", "series a", "series b", "series c", "backed")
    news = []
    for entry in feed.entries:
        title_lower = entry.title.lower()
        if not any(keyword in title_lower for keyword in keywords):
            continue
        # Heuristic: company name is whatever precedes " raises " / " funding".
        company_name = entry.title.split(" raises ")[0].split(" funding")[0]
        # Search the headline once for a dollar figure like "$12M".
        amount_match = _AMOUNT_RE.search(entry.title)
        news.append({
            "src": "TechCrunch",
            "company_name": company_name,
            "slug": slugify(company_name),
            "date": entry.published if hasattr(entry, 'published') else datetime.now().isoformat(),
            "link": entry.link,
            "summary": BeautifulSoup(entry.summary, 'html.parser').text[:500] + "...",
            "type": "News",
            "funding_amount": amount_match.group(0) if amount_match else "Check Article",
        })
    return news


def fetch_yc_startups():
    """Return the first 20 companies from the public YC companies dataset."""
    print("Fetching YC startups...")
    response = requests.get(YC_DATA_URL, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    if response.status_code != 200:
        return []
    try:
        data = response.json()
        startups = []
        for co in data[:20]:
            if 'slug' in co:
                link = f"https://www.ycombinator.com/companies/{co['slug']}"
            else:
                # .get() so a record without 'website' doesn't abort the batch.
                link = co.get('website', '')
            startups.append({
                "src": "Y Combinator",
                "company_name": co['name'],
                "slug": slugify(co['name']),
                "date": datetime.now().isoformat(),
                "link": link,
                "summary": co.get('description', 'YC Startup'),
                "type": "Batch",
                "funding_amount": "YC Standard",
            })
        return startups
    except (ValueError, KeyError, TypeError):
        # Malformed JSON payload or unexpected record shape.
        return []


def _extract_address(addr_node):
    """Flatten a businessAddress/mailingAddress XML node into one line."""
    if not addr_node:
        return "Not Available"
    def _text(tag):
        node = addr_node.find(tag)
        return node.text if node else ""
    raw = (f"{_text('street1')} {_text('street2')}, {_text('city')}, "
           f"{_text('stateOrCountry')} {_text('zipCode')}")
    # Collapse the doubled spaces left behind by empty components.
    return re.sub(r'\s+', ' ', raw).strip(", ")


def enrich_sec_filing(filing):
    """Enrich *filing* in place from its EDGAR primary_doc.xml; return founders.

    Adds funding_amount, summary (industry), business/mailing addresses to the
    ``filing`` dict, and returns a (possibly empty) list of related-person
    dicts. Always returns a list, even when the XML document is not found.
    """
    founders = []
    try:
        resp = requests.get(filing['link'], headers=HEADERS, timeout=REQUEST_TIMEOUT)
        soup = BeautifulSoup(resp.text, 'html.parser')
        tables = soup.find_all('table', {'summary': 'Document Format Files'})
        if not tables:
            return founders
        xml_link = None
        for row in tables[0].find_all('tr'):
            if 'primary_doc.xml' in row.text:
                xml_link = "https://www.sec.gov" + row.find('a')['href']
                break
        if xml_link is None:
            return founders
        xml_resp = requests.get(xml_link, headers=HEADERS, timeout=REQUEST_TIMEOUT)
        xml_soup = BeautifulSoup(xml_resp.text, 'xml')

        # Total offering amount (numeric values get a leading "$").
        total_amt = xml_soup.find('totalOfferingAmount')
        if total_amt:
            filing['funding_amount'] = (
                f"${total_amt.text}" if total_amt.text.isdigit() else total_amt.text
            )

        # Industry classification.
        industry = xml_soup.find('industryGroup')
        if industry:
            group_type = industry.find('industryGroupType')
            filing['summary'] = (
                f"Industry: {group_type.text if group_type else 'Miscellaneous'}"
            )

        # Addresses.
        filing['business_address'] = _extract_address(xml_soup.find('businessAddress'))
        filing['mailing_address'] = _extract_address(xml_soup.find('mailingAddress'))

        # Related persons (officers/directors) with derived contact handles.
        for p in xml_soup.find_all('relatedPersonInfo'):
            first = p.find('firstName').text if p.find('firstName') else ""
            last = p.find('lastName').text if p.find('lastName') else ""
            title = p.find('relationshipRole').text if p.find('relationshipRole') else "Executive"
            # Derived contacts (simulated as requested before) — guesses, not
            # verified addresses.
            handle = (first + last).lower().replace(" ", "")
            email = (
                f"{handle}@{filing['company_name'].lower().split(' ')[0]}.com"
                if first and last else ""
            )
            x_account = f"https://x.com/{handle}" if first and last else ""
            founders.append({
                "name": f"{first} {last}",
                "title": title,
                "email": email,
                "x_account": x_account,
            })
        return founders
    except Exception as e:
        # Best-effort enrichment: log and fall back to whatever was gathered.
        print(f"Error enriching {filing['company_name']}: {e}")
        return founders


def main():
    """Aggregate all sources, enrich, dedupe, and write data.json."""
    all_data = []
    sec = fetch_sec_filings()[:50]
    tc = fetch_techcrunch()[:30]
    yc = fetch_yc_startups()[:50]

    for f in sec:
        print(f"Enriching {f['company_name']}...")
        f['founders'] = enrich_sec_filing(f)
        all_data.append(f)
        time.sleep(0.5)  # be polite to SEC servers between filing fetches

    for item in tc + yc:
        item['founders'] = []
        all_data.append(item)

    # Sort newest first using parsed dates (sources mix RFC 2822 and ISO-8601
    # strings, so a plain string sort would interleave them incorrectly).
    all_data.sort(key=lambda x: _parse_date(x['date']), reverse=True)

    # Deduplicate by slug, keeping the first (most recent) occurrence.
    seen = set()
    deduped = []
    for d in all_data:
        if d['slug'] not in seen:
            deduped.append(d)
            seen.add(d['slug'])

    # Sync to Frontend Public Folder: locate web/public relative to this script.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    frontend_public_path = os.path.join(script_dir, "..", "web", "public", "data.json")

    # Save locally and, when the frontend folder exists, there too.
    paths_to_save = ["data.json"]
    if os.path.exists(os.path.dirname(frontend_public_path)):
        paths_to_save.append(frontend_public_path)
    for path in paths_to_save:
        with open(path, "w", encoding="utf-8") as f:
            json.dump(deduped, f, indent=4)
        print(f"Success! Aggregated {len(deduped)} startups into {path}")


if __name__ == "__main__":
    main()