File size: 8,454 Bytes
655b3af
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
import feedparser
import json
import requests
import time
import os
import re
from bs4 import BeautifulSoup
from datetime import datetime

# Configuration
# NOTE(review): SEC asks automated tools to declare a descriptive User-Agent
# with a contact address — presumably why one is set here; confirm the
# contact email is real before deploying.
USER_AGENT = "Funding Tracker Public Tool (contact@example.com)"
# EDGAR Atom feed of the most recent Form D (exempt offering) filings.
SEC_RSS_URL = "https://www.sec.gov/cgi-bin/browse-edgar?action=getcurrent&type=D&owner=exclude&start=0&count=100&output=atom"
# TechCrunch "Startups" category RSS feed (funding news source).
TC_RSS_URL = "https://techcrunch.com/category/startups/feed/"
# Community-maintained JSON dump of Y Combinator companies.
YC_DATA_URL = "https://raw.githubusercontent.com/yc-oss/api/main/data/companies.json"

# Shared headers for every outbound HTTP request in this script.
HEADERS = {'User-Agent': USER_AGENT}

def slugify(text):
    """Return a URL-safe slug: lowercased, runs of non-alphanumerics become '-'."""
    collapsed = re.sub(r'[^a-z0-9]+', '-', text.lower())
    return collapsed.strip('-')

def fetch_sec_filings():
    """Fetch recent SEC Form D filings from the EDGAR Atom feed.

    Returns a list of dicts (src, company_name, slug, date, link, id, type,
    funding_amount) parsed from entry titles. The funding_amount is a
    placeholder later filled in by enrich_sec_filing(). Returns [] on any
    non-200 response.
    """
    print("Fetching SEC Form D filings...")
    # Explicit timeout so a stalled SEC endpoint cannot hang the script.
    response = requests.get(SEC_RSS_URL, headers=HEADERS, timeout=30)
    if response.status_code != 200:
        return []

    feed = feedparser.parse(response.text)
    filings = []
    for entry in feed.entries:
        # NOTE(review): '"D" in entry.title' matches any title containing the
        # letter D; the feed URL is already filtered to type=D, so this acts
        # only as a loose sanity check — confirm intent.
        if "D" in entry.title or "Form D" in entry.title:
            try:
                parts = entry.title.split(" - ")
                if len(parts) < 2:
                    continue
                company_name = parts[1].split(" (")[0]
                # The CIK is taken from the last parenthesized group in the title.
                cik = entry.title.split("(")[-1].split(")")[0]
                filings.append({
                    "src": "SEC",
                    "company_name": company_name,
                    "slug": slugify(company_name),
                    "date": entry.updated,
                    "link": entry.link,
                    "id": cik,
                    "type": "Form D",
                    "funding_amount": "Unknown",  # enriched later from the XML filing
                })
            except (AttributeError, IndexError, KeyError):
                # Skip malformed feed entries; narrowed from a bare `except`
                # that silently swallowed every error, including KeyboardInterrupt.
                continue
    return filings

# Compiled once: dollar amounts such as "$5M", "$1.2B", "$500K" in headlines.
_AMOUNT_RE = re.compile(r'\$\d+(?:\.\d+)?(?:M|B|K)?')

# Headline keywords indicating a funding story.
_FUNDING_KEYWORDS = ("funding", "raise", "seed", "series a", "series b", "series c", "backed")


def fetch_techcrunch():
    """Fetch funding-related startup stories from the TechCrunch RSS feed.

    Returns a list of dicts shaped like the SEC entries, with a plain-text
    summary and a best-effort funding amount parsed from the headline.
    Returns [] on any non-200 response.
    """
    print("Fetching TechCrunch news...")
    # Explicit timeout so a stalled feed cannot hang the script.
    response = requests.get(TC_RSS_URL, headers=HEADERS, timeout=30)
    if response.status_code != 200:
        return []

    feed = feedparser.parse(response.text)
    news = []
    for entry in feed.entries:
        title_lower = entry.title.lower()
        if not any(keyword in title_lower for keyword in _FUNDING_KEYWORDS):
            continue
        # Heuristic: the company name is whatever precedes " raises "/" funding".
        company_name = entry.title.split(" raises ")[0].split(" funding")[0]
        # Search the headline once (was searched twice with the same pattern).
        amount_match = _AMOUNT_RE.search(entry.title)
        summary_text = BeautifulSoup(entry.summary, 'html.parser').text
        # Only append an ellipsis when the summary was actually truncated
        # (previously "..." was added even to short summaries).
        if len(summary_text) > 500:
            summary_text = summary_text[:500] + "..."
        news.append({
            "src": "TechCrunch",
            "company_name": company_name,
            "slug": slugify(company_name),
            "date": entry.published if hasattr(entry, 'published') else datetime.now().isoformat(),
            "link": entry.link,
            "summary": summary_text,
            "type": "News",
            "funding_amount": amount_match.group(0) if amount_match else "Check Article",
        })
    return news

def fetch_yc_startups():
    """Fetch a sample of Y Combinator companies from the yc-oss dataset.

    Returns up to 20 entries shaped like the other sources, each with a
    fixed "YC Standard" funding amount. Returns [] on HTTP errors or
    malformed/unexpected JSON.
    """
    print("Fetching YC startups...")
    # Explicit timeout so a stalled endpoint cannot hang the script.
    response = requests.get(YC_DATA_URL, headers=HEADERS, timeout=30)
    if response.status_code != 200:
        return []

    try:
        data = response.json()
        startups = []
        for co in data[:20]:
            # Prefer the canonical YC profile URL; fall back to the company
            # website via .get() so a missing 'website' key cannot abort the
            # whole batch (previously co['website'] could raise KeyError).
            if 'slug' in co:
                link = f"https://www.ycombinator.com/companies/{co['slug']}"
            else:
                link = co.get('website', '')
            startups.append({
                "src": "Y Combinator",
                "company_name": co['name'],
                "slug": slugify(co['name']),
                "date": datetime.now().isoformat(),
                "link": link,
                "summary": co.get('description', 'YC Startup'),
                "type": "Batch",
                "funding_amount": "YC Standard",
            })
        return startups
    except (ValueError, KeyError, TypeError):
        # ValueError covers JSON decode failures; narrowed from a bare
        # `except` that hid every error. Skip this source on bad data.
        return []

def _tag_text(parent, tag, default=""):
    """Return the text of the first child *tag* under *parent*, or *default*."""
    node = parent.find(tag)
    return node.text if node else default


def _format_address(addr_node):
    """Render an EDGAR address node as a single display line."""
    if not addr_node:
        return "Not Available"
    street1 = _tag_text(addr_node, 'street1')
    street2 = _tag_text(addr_node, 'street2')
    city = _tag_text(addr_node, 'city')
    state = _tag_text(addr_node, 'stateOrCountry')
    zip_code = _tag_text(addr_node, 'zipCode')
    return f"{street1} {street2}, {city}, {state} {zip_code}".replace("  ", " ").strip(", ")


def enrich_sec_filing(filing):
    """Enrich a SEC filing dict in place from its primary_doc.xml document.

    Fetches the filing index page, locates primary_doc.xml, and fills in
    funding_amount, summary (industry group), business_address and
    mailing_address on *filing*. Returns the list of related-person
    ("founder") dicts with derived (simulated) contact details, or []
    when the XML cannot be located or any request/parse step fails.
    """
    try:
        # Timeouts so a stalled SEC endpoint cannot hang the enrichment loop.
        resp = requests.get(filing['link'], headers=HEADERS, timeout=30)
        soup = BeautifulSoup(resp.text, 'html.parser')
        tables = soup.find_all('table', {'summary': 'Document Format Files'})
        if tables:
            for row in tables[0].find_all('tr'):
                if 'primary_doc.xml' not in row.text:
                    continue
                xml_link = "https://www.sec.gov" + row.find('a')['href']
                xml_resp = requests.get(xml_link, headers=HEADERS, timeout=30)
                # NOTE(review): the 'xml' feature requires lxml to be installed.
                xml_soup = BeautifulSoup(xml_resp.text, 'xml')

                # Total offering amount; non-numeric values (e.g. "Indefinite")
                # are kept verbatim without a "$" prefix.
                total_amt = xml_soup.find('totalOfferingAmount')
                if total_amt:
                    filing['funding_amount'] = f"${total_amt.text}" if total_amt.text.isdigit() else total_amt.text

                # Industry group -> one-line summary.
                industry = xml_soup.find('industryGroup')
                if industry:
                    filing['summary'] = f"Industry: {_tag_text(industry, 'industryGroupType', 'Miscellaneous')}"

                filing['business_address'] = _format_address(xml_soup.find('businessAddress'))
                filing['mailing_address'] = _format_address(xml_soup.find('mailingAddress'))

                founders = []
                for person in xml_soup.find_all('relatedPersonInfo'):
                    first = _tag_text(person, 'firstName')
                    last = _tag_text(person, 'lastName')
                    title = _tag_text(person, 'relationshipRole', 'Executive')

                    # Derived (simulated) contact details — not real data.
                    handle = (first + last).lower().replace(" ", "")
                    if first and last:
                        email = f"{handle}@{filing['company_name'].lower().split(' ')[0]}.com"
                        x_account = f"https://x.com/{handle}"
                    else:
                        email = ""
                        x_account = ""

                    founders.append({
                        "name": f"{first} {last}",
                        "title": title,
                        "email": email,
                        "x_account": x_account,
                    })
                # Stop after the first primary_doc.xml row, as before.
                return founders
    except Exception as e:
        # Boundary catch: enrichment is best-effort; log and fall through.
        print(f"Error enriching {filing['company_name']}: {e}")
    return []

def main():
    """Aggregate all sources, enrich SEC entries, dedupe, and write JSON out.

    Writes data.json next to the script and, when the sibling web/public
    directory exists, mirrors the same file there for the frontend.
    """
    all_data = []

    sec = fetch_sec_filings()[:50]
    tc = fetch_techcrunch()[:30]
    yc = fetch_yc_startups()[:50]

    for filing in sec:
        print(f"Enriching {filing['company_name']}...")
        filing['founders'] = enrich_sec_filing(filing)
        all_data.append(filing)
        time.sleep(0.5)  # be polite to sec.gov between detail requests

    for item in tc + yc:
        item['founders'] = []
        all_data.append(item)

    # NOTE(review): 'date' mixes RSS date strings and ISO-8601 timestamps, so
    # this lexicographic sort is only approximately newest-first — confirm
    # whether real date parsing is needed.
    all_data.sort(key=lambda x: x['date'], reverse=True)

    # Deduplicate by slug, keeping the first (i.e. most recent) occurrence.
    seen = set()
    deduped = []
    for record in all_data:
        if record['slug'] not in seen:
            deduped.append(record)
            seen.add(record['slug'])

    # Locate the frontend public folder relative to this script.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    frontend_public_path = os.path.join(script_dir, "..", "web", "public", "data.json")

    # Always save locally; also mirror to the frontend when its folder exists.
    paths_to_save = ["data.json"]
    if os.path.exists(os.path.dirname(frontend_public_path)):
        paths_to_save.append(frontend_public_path)

    for path in paths_to_save:
        # Explicit UTF-8 (previously platform-dependent) and ensure_ascii=False
        # so non-ASCII company names are stored readably instead of \u-escaped.
        with open(path, "w", encoding="utf-8") as out:
            json.dump(deduped, out, indent=4, ensure_ascii=False)
        print(f"Success! Aggregated {len(deduped)} startups into {path}")

# Entry point: run the full aggregation pipeline when executed directly.
if __name__ == "__main__":
    main()