| import feedparser
|
| import json
|
| import requests
|
| import time
|
| import os
|
| import re
|
| from bs4 import BeautifulSoup
|
| from datetime import datetime
|
|
|
|
|
| USER_AGENT = "Funding Tracker Public Tool (contact@example.com)"
|
| SEC_RSS_URL = "https://www.sec.gov/cgi-bin/browse-edgar?action=getcurrent&type=D&owner=exclude&start=0&count=100&output=atom"
|
| TC_RSS_URL = "https://techcrunch.com/category/startups/feed/"
|
| YC_DATA_URL = "https://raw.githubusercontent.com/yc-oss/api/main/data/companies.json"
|
|
|
| HEADERS = {'User-Agent': USER_AGENT}
|
|
|
def slugify(text):
    """Return a URL-safe slug: lowercase, runs of non-alphanumerics collapsed
    to single hyphens, with no leading or trailing hyphen."""
    return re.sub(r'[^a-z0-9]+', '-', text.lower()).strip('-')
|
|
|
def fetch_sec_filings():
    """Fetch recent Form D filings from the SEC EDGAR Atom feed.

    Returns:
        list[dict]: one dict per filing with keys src, company_name, slug,
        date, link, id (CIK), type, funding_amount. Returns [] when the
        feed request fails.
    """
    print("Fetching SEC Form D filings...")
    # timeout= keeps the script from hanging forever on a stalled connection
    response = requests.get(SEC_RSS_URL, headers=HEADERS, timeout=30)
    if response.status_code != 200:
        return []

    feed = feedparser.parse(response.text)
    filings = []
    for entry in feed.entries:
        if "D" not in entry.title and "Form D" not in entry.title:
            continue
        try:
            # Entry titles look like: "D - Company Name (0001234567) (Filer)"
            parts = entry.title.split(" - ")
            if len(parts) < 2:
                continue
            company_name = parts[1].split(" (")[0]
            # CIK is the content of the last "(...)" group in the title
            cik = entry.title.split("(")[-1].split(")")[0]
            filings.append({
                "src": "SEC",
                "company_name": company_name,
                "slug": slugify(company_name),
                "date": entry.updated,
                "link": entry.link,
                "id": cik,
                "type": "Form D",
                "funding_amount": "Unknown"
            })
        except (AttributeError, IndexError):
            # Malformed entry (missing title/updated/link attribute or an
            # unexpected title shape): skip it instead of aborting the fetch.
            # Narrowed from a bare `except:` so real bugs are not swallowed.
            continue
    return filings
|
|
|
def fetch_techcrunch():
    """Fetch funding-related articles from the TechCrunch startups RSS feed.

    Keeps only entries whose title mentions a funding keyword.

    Returns:
        list[dict]: one dict per article with keys src, company_name, slug,
        date, link, summary, type, funding_amount. Returns [] when the
        feed request fails.
    """
    print("Fetching TechCrunch news...")
    # timeout= keeps the script from hanging forever on a stalled connection
    response = requests.get(TC_RSS_URL, headers=HEADERS, timeout=30)
    if response.status_code != 200:
        return []

    feed = feedparser.parse(response.text)
    keywords = ("funding", "raise", "seed", "series a", "series b", "series c", "backed")
    news = []
    for entry in feed.entries:
        title_lower = entry.title.lower()
        if not any(keyword in title_lower for keyword in keywords):
            continue
        # Heuristic: company name is whatever precedes " raises "/" funding"
        company_name = entry.title.split(" raises ")[0].split(" funding")[0]
        # Search once and reuse the match (the original ran the same
        # re.search twice per entry).
        amount_match = re.search(r'\$\d+(?:\.\d+)?(?:M|B|K)?', entry.title)
        # Guard summary like `published` is guarded: feed entries are not
        # guaranteed to carry a summary attribute.
        summary_html = getattr(entry, 'summary', '')
        news.append({
            "src": "TechCrunch",
            "company_name": company_name,
            "slug": slugify(company_name),
            "date": entry.published if hasattr(entry, 'published') else datetime.now().isoformat(),
            "link": entry.link,
            "summary": BeautifulSoup(summary_html, 'html.parser').text[:500] + "...",
            "type": "News",
            "funding_amount": amount_match.group(0) if amount_match else "Check Article"
        })
    return news
|
|
|
def fetch_yc_startups():
    """Fetch the first 20 companies from the yc-oss companies dataset.

    Returns:
        list[dict]: one dict per company with keys src, company_name, slug,
        date, link, summary, type, funding_amount. Returns [] when the
        request fails or the payload is malformed.
    """
    print("Fetching YC startups...")
    # timeout= keeps the script from hanging forever on a stalled connection
    response = requests.get(YC_DATA_URL, headers=HEADERS, timeout=30)
    if response.status_code != 200:
        return []

    try:
        data = response.json()
        startups = []
        for co in data[:20]:
            startups.append({
                "src": "Y Combinator",
                "company_name": co['name'],
                "slug": slugify(co['name']),
                "date": datetime.now().isoformat(),
                "link": f"https://www.ycombinator.com/companies/{co['slug']}" if 'slug' in co else co['website'],
                "summary": co.get('description', 'YC Startup'),
                "type": "Batch",
                "funding_amount": "YC Standard"
            })
        return startups
    except (ValueError, KeyError, TypeError):
        # ValueError: response body was not JSON; KeyError/TypeError: the
        # dataset schema did not match expectations. Narrowed from a bare
        # `except:` so unrelated bugs are not silently hidden.
        return []
|
|
|
def enrich_sec_filing(filing):
    """Enrich a SEC filing dict with details from its primary_doc.xml.

    Fetches the filing's index page, locates the primary_doc.xml document,
    and mutates `filing` in place: may set funding_amount, summary,
    business_address and mailing_address.

    Returns:
        list[dict]: related persons (name, title, plus guessed email and
        x_account — these are fabricated from the person's name, not read
        from the filing). Always a list: [] on any error OR when no
        primary_doc.xml is found. (Bug fix: the original fell off the end
        and returned None in the not-found case, while callers store the
        result as a founders list.)
    """
    try:
        resp = requests.get(filing['link'], headers=HEADERS, timeout=30)
        soup = BeautifulSoup(resp.text, 'html.parser')
        tables = soup.find_all('table', {'summary': 'Document Format Files'})
        if tables:
            rows = tables[0].find_all('tr')
            for row in rows:
                if 'primary_doc.xml' in row.text:
                    xml_link = "https://www.sec.gov" + row.find('a')['href']
                    xml_resp = requests.get(xml_link, headers=HEADERS, timeout=30)
                    xml_soup = BeautifulSoup(xml_resp.text, 'xml')

                    total_amt = xml_soup.find('totalOfferingAmount')
                    if total_amt:
                        # "Indefinite" offerings have non-numeric text; only
                        # prefix "$" when the value is a plain number.
                        filing['funding_amount'] = f"${total_amt.text}" if total_amt.text.isdigit() else total_amt.text

                    industry = xml_soup.find('industryGroup')
                    if industry:
                        filing['summary'] = f"Industry: {industry.find('industryGroupType').text if industry.find('industryGroupType') else 'Miscellaneous'}"

                    def get_addr(addr_node):
                        # Flatten an address node into one display string;
                        # each field is optional in the XML.
                        if not addr_node: return "Not Available"
                        s1 = addr_node.find('street1').text if addr_node.find('street1') else ""
                        s2 = addr_node.find('street2').text if addr_node.find('street2') else ""
                        city = addr_node.find('city').text if addr_node.find('city') else ""
                        state = addr_node.find('stateOrCountry').text if addr_node.find('stateOrCountry') else ""
                        zip_code = addr_node.find('zipCode').text if addr_node.find('zipCode') else ""
                        return f"{s1} {s2}, {city}, {state} {zip_code}".replace("  ", " ").strip(", ")

                    bus_addr = xml_soup.find('businessAddress')
                    mail_addr = xml_soup.find('mailingAddress')
                    filing['business_address'] = get_addr(bus_addr)
                    filing['mailing_address'] = get_addr(mail_addr)

                    founders = []
                    for p in xml_soup.find_all('relatedPersonInfo'):
                        first = p.find('firstName').text if p.find('firstName') else ""
                        last = p.find('lastName').text if p.find('lastName') else ""
                        title = p.find('relationshipRole').text if p.find('relationshipRole') else "Executive"

                        # NOTE: email/x_account are guesses built from the
                        # person's name, not data from the filing.
                        handle = (first + last).lower().replace(" ", "")
                        email = f"{handle}@{filing['company_name'].lower().split(' ')[0]}.com" if first and last else ""
                        x_account = f"https://x.com/{handle}" if first and last else ""

                        founders.append({
                            "name": f"{first} {last}",
                            "title": title,
                            "email": email,
                            "x_account": x_account
                        })
                    return founders
        # No document table / no primary_doc.xml row: return an empty
        # founders list so callers always receive a list, never None.
        return []
    except Exception as e:
        print(f"Error enriching {filing['company_name']}: {e}")
        return []
|
|
|
def main():
    """Aggregate startup data from SEC, TechCrunch and Y Combinator,
    enrich SEC filings, sort, dedupe by slug, and write data.json
    (locally and, when present, into the frontend's public folder)."""
    aggregated = []

    sec_filings = fetch_sec_filings()[:50]
    tc_articles = fetch_techcrunch()[:30]
    yc_companies = fetch_yc_startups()[:50]

    for f in sec_filings:
        print(f"Enriching {f['company_name']}...")
        f['founders'] = enrich_sec_filing(f)
        aggregated.append(f)
        time.sleep(0.5)  # be polite to sec.gov between detail requests

    for record in tc_articles + yc_companies:
        record['founders'] = []
        aggregated.append(record)

    # Newest first (string sort on the source-provided date field)
    aggregated.sort(key=lambda record: record['date'], reverse=True)

    # Keep only the first (most recent) record per slug.
    seen_slugs = set()
    deduped = []
    for record in aggregated:
        if record['slug'] in seen_slugs:
            continue
        seen_slugs.add(record['slug'])
        deduped.append(record)

    here = os.path.dirname(os.path.abspath(__file__))
    frontend_public_path = os.path.join(here, "..", "web", "public", "data.json")

    # Always write next to the CWD; mirror into the frontend when it exists.
    output_paths = ["data.json"]
    if os.path.exists(os.path.dirname(frontend_public_path)):
        output_paths.append(frontend_public_path)

    for path in output_paths:
        with open(path, "w") as out:
            json.dump(deduped, out, indent=4)
        print(f"Success! Aggregated {len(deduped)} startups into {path}")
|
|
|
# Script entry point: run the full aggregation pipeline when executed directly.
if __name__ == "__main__":

    main()
|
|
|