azure-scripts / yc_scrape.py
vivekvar's picture
azure home scripts: data gen, training, misc
a70eb3d verified
import requests, json, re, sys
from openpyxl import Workbook
# Algolia application ID: sent as the X-Algolia-Application-Id header and,
# lower-cased, used as the "<id>-dsn.algolia.net" host prefix of the query URL.
APP = "45BWZJ1SGC"
def get_key():
    """Return the Algolia API key embedded in the YC companies page.

    Downloads https://www.ycombinator.com/companies, locates the inline
    ``AlgoliaOpts = {...}`` object in the HTML, parses it as JSON and
    returns its ``"key"`` entry.

    Returns:
        The value stored under ``"key"`` in the ``AlgoliaOpts`` object.

    Raises:
        requests.HTTPError: if the page request returns a 4xx/5xx status.
        RuntimeError: if no ``AlgoliaOpts = {...}`` assignment is found.
        json.JSONDecodeError: if the matched object is not valid JSON.
        KeyError: if the parsed object has no ``"key"`` entry.
    """
    r = requests.get("https://www.ycombinator.com/companies",
                     headers={"User-Agent": "Mozilla/5.0"}, timeout=30)
    # Fail loudly on an error page instead of trying to parse its HTML.
    r.raise_for_status()
    m = re.search(r'AlgoliaOpts\s*=\s*(\{[^}]*\})', r.text)
    if m is None:
        # Without this guard a markup change surfaces as an opaque
        # "'NoneType' object has no attribute 'group'".
        raise RuntimeError(
            "AlgoliaOpts object not found in the YC companies page")
    return json.loads(m.group(1))["key"]
# API key read from the live companies page; note that this performs a
# network request as soon as the module is executed.
KEY = get_key()

# Query endpoint of the YCCompany_production index on the app's DSN host.
URL = f"https://{APP.lower()}-dsn.algolia.net/1/indexes/YCCompany_production/query"

# Headers attached to every query request.
HDR = {
    "X-Algolia-Application-Id": APP,
    "X-Algolia-API-Key": KEY,
    "Content-Type": "application/json",
}

# Batch names to export; each one is used as a `batch:<name>` facet filter.
BATCHES = [
    "Fall 2025",
    "Winter 2026",
    "Spring 2026",
    "Summer 2026",
]
def fetch_batch(batch):
    """Return every index hit whose ``batch`` facet equals *batch*.

    Posts an empty-query search restricted by a ``batch:<batch>`` facet
    filter to ``URL`` and walks the result pages (1000 hits per page)
    until the page count reported in ``nbPages`` is exhausted.

    Args:
        batch: Batch name to filter on, e.g. ``"Winter 2026"``.

    Returns:
        A list with the ``hits`` entries of all pages, in page order.

    Raises:
        requests.HTTPError: if any page request returns a 4xx/5xx status.
    """
    hits = []
    page = 0
    while True:
        body = {"query": "", "facetFilters": [[f"batch:{batch}"]],
                "hitsPerPage": 1000, "page": page}
        r = requests.post(URL, headers=HDR, data=json.dumps(body), timeout=30)
        # An error reply is still JSON but carries no "hits"/"nbPages", so
        # without this check a rejected request would look like an empty
        # batch and the loop would end silently.
        r.raise_for_status()
        d = r.json()
        hits.extend(d.get("hits", []))
        # Pages are zero-based; stop once the last reported page was read
        # (a missing "nbPages" counts as zero pages).
        if page + 1 >= d.get("nbPages", 0):
            break
        page += 1
    return hits
# --- Workbook setup: a single sheet whose first row is the header ---------
wb = Workbook()
ws = wb.active
ws.title = "YC Startups"
header = [
    "Name", "Batch", "Website", "One-liner", "Location",
    "Industry", "Team Size", "Status", "Hiring", "Tags", "YC Page",
]
ws.append(header)

# --- Download each batch, reporting its size as soon as it arrives --------
totals = {}
all_hits = []
for b in BATCHES:
    hits = fetch_batch(b)
    totals[b] = len(hits)
    print(f"{b}: {len(hits)}", flush=True)
    all_hits += hits

# --- One spreadsheet row per company --------------------------------------
# Fields copied straight from the hit, in column order; absent keys become "".
plain_keys = (
    "name", "batch", "website", "one_liner",
    "all_locations", "industry", "team_size", "status",
)
for h in all_hits:
    row = [h.get(key, "") for key in plain_keys]
    # Hiring flag is rendered as Yes/No text.
    row.append("Yes" if h.get("isHiring") else "No")
    # `or []` covers a "tags" key that is present but empty/None.
    row.append(", ".join(h.get("tags", []) or []))
    row.append(f"https://www.ycombinator.com/companies/{h.get('slug','')}")
    ws.append(row)

# --- Column widths for columns A..K, in header order ----------------------
widths = [22, 14, 40, 55, 28, 18, 11, 10, 8, 40, 50]
for offset, width in enumerate(widths):
    letter = chr(ord("A") + offset)
    ws.column_dimensions[letter].width = width

# --- Save and summarise ----------------------------------------------------
out = "/home/azureuser/yc_companies.xlsx"
wb.save(out)
print(f"\nTOTAL: {sum(totals.values())} companies")
print(f"Saved: {out}")