"""Export Y Combinator companies for selected batches to an Excel workbook.

The script reads the Algolia API key from the YC companies page, queries the
``YCCompany_production`` index once per batch in ``BATCHES``, and writes one
row per company to an ``.xlsx`` file.
"""
import json
import re
import sys

import requests
from openpyxl import Workbook
from openpyxl.utils import get_column_letter

APP = "45BWZJ1SGC"


def get_key():
    """Return the Algolia API key found in the YC companies page.

    The page text is searched for an ``AlgoliaOpts = {...}`` assignment whose
    object literal is parsed as JSON; the value under ``"key"`` is returned.

    Raises:
        requests.HTTPError: if the page request returns an error status.
        RuntimeError: if no ``AlgoliaOpts`` assignment is found in the page.
    """
    r = requests.get(
        "https://www.ycombinator.com/companies",
        headers={"User-Agent": "Mozilla/5.0"},
        timeout=30,
    )
    r.raise_for_status()
    m = re.search(r'AlgoliaOpts\s*=\s*(\{[^}]*\})', r.text)
    if m is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise RuntimeError("AlgoliaOpts not found in the YC companies page")
    return json.loads(m.group(1))["key"]


KEY = get_key()
URL = f"https://{APP.lower()}-dsn.algolia.net/1/indexes/YCCompany_production/query"
HDR = {
    "X-Algolia-Application-Id": APP,
    "X-Algolia-API-Key": KEY,
    "Content-Type": "application/json",
}
# Batch names, used verbatim in the ``batch:<name>`` facet filter.
BATCHES = ["Fall 2025", "Winter 2026", "Spring 2026", "Summer 2026"]

# (header, column width) pairs, in sheet order; kept together so the header
# row and the column widths cannot drift apart.
COLUMNS = [
    ("Name", 22),
    ("Batch", 14),
    ("Website", 40),
    ("One-liner", 55),
    ("Location", 28),
    ("Industry", 18),
    ("Team Size", 11),
    ("Status", 10),
    ("Hiring", 8),
    ("Tags", 40),
    ("YC Page", 50),
]


def fetch_batch(batch):
    """Return all hits for one batch, following pagination.

    Pages are requested starting at 0 until the next page index reaches the
    ``nbPages`` value reported by the response.

    Args:
        batch: Batch name as used in the ``batch`` facet (e.g. "Fall 2025").

    Returns:
        A list of hit dicts as returned under the response's ``"hits"`` key.

    Raises:
        requests.HTTPError: if any query returns an error status. Without
            this check an error payload (no ``hits``/``nbPages``) would be
            treated as an empty batch.
    """
    hits = []
    page = 0
    while True:
        body = {
            "query": "",
            "facetFilters": [[f"batch:{batch}"]],
            "hitsPerPage": 1000,
            "page": page,
        }
        r = requests.post(URL, headers=HDR, data=json.dumps(body), timeout=30)
        r.raise_for_status()
        d = r.json()
        hits.extend(d.get("hits", []))
        if page + 1 >= d.get("nbPages", 0):
            break
        page += 1
    return hits


wb = Workbook()
ws = wb.active
ws.title = "YC Startups"
ws.append([header for header, _ in COLUMNS])

# Per-batch hit counts, reported at the end.
totals = {}
all_hits = []
for b in BATCHES:
    hits = fetch_batch(b)
    totals[b] = len(hits)
    print(f"{b}: {len(hits)}", flush=True)
    all_hits.extend(hits)

for h in all_hits:
    ws.append([
        h.get("name", ""),
        h.get("batch", ""),
        h.get("website", ""),
        h.get("one_liner", ""),
        h.get("all_locations", ""),
        h.get("industry", ""),
        h.get("team_size", ""),
        h.get("status", ""),
        "Yes" if h.get("isHiring") else "No",
        # ``tags`` may be missing or None; either way join an empty list.
        ", ".join(h.get("tags", []) or []),
        f"https://www.ycombinator.com/companies/{h.get('slug','')}",
    ])

for col, (_, width) in enumerate(COLUMNS, start=1):
    # get_column_letter stays correct past column Z, unlike chr(64 + col).
    ws.column_dimensions[get_column_letter(col)].width = width

out = "/home/azureuser/yc_companies.xlsx"
wb.save(out)
print(f"\nTOTAL: {sum(totals.values())} companies")
print(f"Saved: {out}")