cbio-vec / pull_pdfs.py
jim-bo's picture
initial commit
56689a3
#!/usr/bin/env python3
import csv, sys, time, requests
# Root URL of the cBioPortal REST API; endpoint paths are appended to it.
BASE = "https://www.cbioportal.org/api"
# Headers sent with every API request.
HEADERS = {"Accept": "application/json"} # add 'X-API-KEY' here if your instance needs it
def get_all_studies(page_size=500):
    """Download every study record from the ``/studies`` endpoint.

    Pages are requested one after another (``pageSize``/``pageNumber``
    query parameters) until the server answers with an empty page.

    Args:
        page_size: Number of studies requested per page.

    Returns:
        A list with the decoded JSON items of all pages, in page order.

    Raises:
        requests.HTTPError: If any page request returns an error status.
    """
    collected = []
    page_number = 0
    while True:
        response = requests.get(
            f"{BASE}/studies",
            headers=HEADERS,
            params={"pageSize": page_size, "pageNumber": page_number},
            timeout=60,
        )
        response.raise_for_status()
        page_items = response.json()
        if not page_items:
            # An empty page means we have walked past the last study.
            return collected
        collected.extend(page_items)
        page_number += 1
        # Short pause between pages so we do not hammer the server.
        time.sleep(0.2)
def to_list(x):
    """Coerce *x* into a list.

    * a list is returned unchanged (the same object);
    * ``None`` becomes an empty list;
    * anything else is converted to ``str`` and split on commas, with
      surrounding whitespace stripped and empty pieces dropped (some
      portals store several values as one comma-separated string).
    """
    if isinstance(x, list):
        return x
    if x is None:
        return []
    stripped_pieces = (piece.strip() for piece in str(x).split(","))
    return [piece for piece in stripped_pieces if piece]
def main(out_csv="cbioportal_study_pmids.csv"):
    """Write one ``studyId,pmid`` CSV row for every PMID of every study.

    All studies are fetched with ``get_all_studies``; each study's ``pmid``
    field is normalised with ``to_list`` so that a study listing several
    PMIDs yields several rows. Studies without a PMID yield no rows.

    Args:
        out_csv: Path of the CSV file to create (overwritten if it exists).
    """
    # The column names must match the row keys exactly: csv.DictWriter
    # raises ValueError for any row key that is not listed in fieldnames.
    fieldnames = ["studyId", "pmid"]

    rows = []
    for study in get_all_studies():
        study_id = study.get("studyId")
        for pmid in to_list(study.get("pmid")):
            rows.append({"studyId": study_id, "pmid": pmid})

    # newline="" hands line-ending control to the csv module.
    with open(out_csv, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)

    print(f"wrote {len(rows)} rows to {out_csv}")
if __name__ == "__main__":
    # An optional first command-line argument overrides the output path.
    if len(sys.argv) > 1:
        main(sys.argv[1])
    else:
        main("cbioportal_study_pmids.csv")