Spaces:

babaTEEpe
/

startups

Sleeping

File size: 8,454 Bytes

655b3af

import feedparser
import json
import requests
import time
import os
import re
from bs4 import BeautifulSoup
from datetime import datetime

# Configuration
USER_AGENT = "Funding Tracker Public Tool (contact@example.com)"
SEC_RSS_URL = "https://www.sec.gov/cgi-bin/browse-edgar?action=getcurrent&type=D&owner=exclude&start=0&count=100&output=atom"
TC_RSS_URL = "https://techcrunch.com/category/startups/feed/"
YC_DATA_URL = "https://raw.githubusercontent.com/yc-oss/api/main/data/companies.json"

HEADERS = {'User-Agent': USER_AGENT}

def slugify(text):
    text = text.lower()
    text = re.sub(r'[^a-z0-9]+', '-', text)
    return text.strip('-')

def fetch_sec_filings():
    print("Fetching SEC Form D filings...")
    response = requests.get(SEC_RSS_URL, headers=HEADERS)
    if response.status_code != 200: return []
    
    feed = feedparser.parse(response.text)
    filings = []
    for entry in feed.entries:
        if "D" in entry.title or "Form D" in entry.title:
            try:
                parts = entry.title.split(" - ")
                if len(parts) < 2: continue
                company_name = parts[1].split(" (")[0]
                cik = entry.title.split("(")[-1].split(")")[0]
                filings.append({
                    "src": "SEC",
                    "company_name": company_name,
                    "slug": slugify(company_name),
                    "date": entry.updated,
                    "link": entry.link,
                    "id": cik,
                    "type": "Form D",
                    "funding_amount": "Unknown" # Will be enriched
                })
            except: continue
    return filings

def fetch_techcrunch():
    print("Fetching TechCrunch news...")
    response = requests.get(TC_RSS_URL, headers=HEADERS)
    if response.status_code != 200: return []
    
    feed = feedparser.parse(response.text)
    news = []
    for entry in feed.entries:
        if any(keyword in entry.title.lower() for keyword in ["funding", "raise", "seed", "series a", "series b", "series c", "backed"]):
            company_name = entry.title.split(" raises ")[0].split(" funding")[0]
            news.append({
                "src": "TechCrunch",
                "company_name": company_name,
                "slug": slugify(company_name),
                "date": entry.published if hasattr(entry, 'published') else datetime.now().isoformat(),
                "link": entry.link,
                "summary": BeautifulSoup(entry.summary, 'html.parser').text[:500] + "...",
                "type": "News",
                "funding_amount": re.search(r'\$\d+(?:\.\d+)?(?:M|B|K)?', entry.title).group(0) if re.search(r'\$\d+(?:\.\d+)?(?:M|B|K)?', entry.title) else "Check Article"
            })
    return news

def fetch_yc_startups():
    print("Fetching YC startups...")
    response = requests.get(YC_DATA_URL, headers=HEADERS)
    if response.status_code != 200: return []
    
    try:
        data = response.json()
        startups = []
        for co in data[:20]:
            startups.append({
                "src": "Y Combinator",
                "company_name": co['name'],
                "slug": slugify(co['name']),
                "date": datetime.now().isoformat(),
                "link": f"https://www.ycombinator.com/companies/{co['slug']}" if 'slug' in co else co['website'],
                "summary": co.get('description', 'YC Startup'),
                "type": "Batch",
                "funding_amount": "YC Standard"
            })
        return startups
    except: return []

def enrich_sec_filing(filing):
    try:
        resp = requests.get(filing['link'], headers=HEADERS)
        soup = BeautifulSoup(resp.text, 'html.parser')
        tables = soup.find_all('table', {'summary': 'Document Format Files'})
        if tables:
            rows = tables[0].find_all('tr')
            for row in rows:
                if 'primary_doc.xml' in row.text:
                    xml_link = "https://www.sec.gov" + row.find('a')['href']
                    xml_resp = requests.get(xml_link, headers=HEADERS)
                    xml_soup = BeautifulSoup(xml_resp.text, 'xml')
                    
                    # Extract Amount
                    total_amt = xml_soup.find('totalOfferingAmount')
                    if total_amt:
                        filing['funding_amount'] = f"${total_amt.text}" if total_amt.text.isdigit() else total_amt.text
                    
                    # Extract Industry
                    industry = xml_soup.find('industryGroup')
                    if industry:
                        filing['summary'] = f"Industry: {industry.find('industryGroupType').text if industry.find('industryGroupType') else 'Miscellaneous'}"

                    # Extract Addresses
                    def get_addr(addr_node):
                        if not addr_node: return "Not Available"
                        s1 = addr_node.find('street1').text if addr_node.find('street1') else ""
                        s2 = addr_node.find('street2').text if addr_node.find('street2') else ""
                        city = addr_node.find('city').text if addr_node.find('city') else ""
                        state = addr_node.find('stateOrCountry').text if addr_node.find('stateOrCountry') else ""
                        zip_code = addr_node.find('zipCode').text if addr_node.find('zipCode') else ""
                        return f"{s1} {s2}, {city}, {state} {zip_code}".replace("  ", " ").strip(", ")

                    bus_addr = xml_soup.find('businessAddress')
                    mail_addr = xml_soup.find('mailingAddress')
                    filing['business_address'] = get_addr(bus_addr)
                    filing['mailing_address'] = get_addr(mail_addr)

                    founders = []
                    for p in xml_soup.find_all('relatedPersonInfo'):
                        first = p.find('firstName').text if p.find('firstName') else ""
                        last = p.find('lastName').text if p.find('lastName') else ""
                        title = p.find('relationshipRole').text if p.find('relationshipRole') else "Executive"
                        
                        # Derived contacts (simulated as requested before)
                        handle = (first + last).lower().replace(" ", "")
                        email = f"{handle}@{filing['company_name'].lower().split(' ')[0]}.com" if first and last else ""
                        x_account = f"https://x.com/{handle}" if first and last else ""
                        
                        founders.append({
                            "name": f"{first} {last}",
                            "title": title,
                            "email": email,
                            "x_account": x_account
                        })
                    return founders
    except Exception as e:
        print(f"Error enriching {filing['company_name']}: {e}")
    return []

def main():
    all_data = []
    
    sec = fetch_sec_filings()[:50]
    tc = fetch_techcrunch()[:30]
    yc = fetch_yc_startups()[:50]
    
    for f in sec:
        print(f"Enriching {f['company_name']}...")
        f['founders'] = enrich_sec_filing(f)
        all_data.append(f)
        time.sleep(0.5)

    for item in tc + yc:
        item['founders'] = [] 
        all_data.append(item)

    all_data.sort(key=lambda x: x['date'], reverse=True)

    # deduplicate by slug
    seen = set()
    deduped = []
    for d in all_data:
        if d['slug'] not in seen:
            deduped.append(d)
            seen.add(d['slug'])

    # Sync to Frontend Public Folder
    # We try to find the web/public path relative to this script
    script_dir = os.path.dirname(os.path.abspath(__file__))
    frontend_public_path = os.path.join(script_dir, "..", "web", "public", "data.json")
    
    # Save locally and to frontend
    paths_to_save = ["data.json"]
    if os.path.exists(os.path.dirname(frontend_public_path)):
        paths_to_save.append(frontend_public_path)

    for path in paths_to_save:
        with open(path, "w") as f:
            json.dump(deduped, f, indent=4)
        print(f"Success! Aggregated {len(deduped)} startups into {path}")

if __name__ == "__main__":
    main()