|
|
|
|
|
import csv, sys, time, requests |
|
|
|
|
|
# Root URL of the cBioPortal REST API; endpoint paths are appended to it.
BASE = "https://www.cbioportal.org/api"

# Headers sent with each request: ask the server for a JSON response body.
HEADERS = {"Accept": "application/json"}
|
|
|
|
|
def get_all_studies(page_size=500):
    """Download every study from the ``/studies`` endpoint, page by page.

    Pages are requested in order starting at page 0 and accumulated until
    the server returns an empty page.

    Args:
        page_size: Value sent as the ``pageSize`` query parameter.

    Returns:
        A list with the decoded JSON items of all pages, in fetch order.

    Raises:
        requests.HTTPError: If any page request returns an error status
            (via ``raise_for_status``).
    """
    collected = []
    page_number = 0
    while True:
        response = requests.get(
            f"{BASE}/studies",
            headers=HEADERS,
            params={"pageSize": page_size, "pageNumber": page_number},
            timeout=60,
        )
        response.raise_for_status()

        page_items = response.json()
        if not page_items:
            # An empty page marks the end of the listing.
            return collected

        collected += page_items
        page_number += 1

        # Short pause between consecutive page requests.
        time.sleep(0.2)
|
|
|
|
|
def to_list(x):
    """Coerce *x* to a list.

    * ``None`` yields a new empty list.
    * A list is returned unchanged (the very same object).
    * Anything else is converted with ``str`` and split on commas; each piece
      is stripped of surrounding whitespace and empty pieces are dropped.
    """
    if isinstance(x, list):
        return x
    if x is None:
        return []

    stripped = (piece.strip() for piece in str(x).split(","))
    return [piece for piece in stripped if piece]
|
|
|
|
|
def main(out_csv="cbioportal_study_pmids.csv"):
    """Write one ``(studyId, pmid)`` CSV row per PMID of every study.

    All studies are fetched with ``get_all_studies``; each study's ``"pmid"``
    value is normalized with ``to_list`` (presumably a comma-separated string
    of PubMed IDs -- confirm against the API), and one row is emitted per
    resulting entry. Studies without a PMID contribute no rows.

    Args:
        out_csv: Path of the CSV file to create or overwrite.
    """
    studies = get_all_studies()

    rows = []
    for s in studies:
        for pmid in to_list(s.get("pmid")):
            rows.append({
                "studyId": s.get("studyId"),
                "pmid": pmid,
            })

    with open(out_csv, "w", newline="", encoding="utf-8") as f:
        # The field names must match the row keys exactly: DictWriter raises
        # ValueError for a row key that is missing from ``fieldnames``
        # (the header was previously declared as "pmids", so writing failed).
        w = csv.DictWriter(f, fieldnames=["studyId", "pmid"])
        w.writeheader()
        w.writerows(rows)

    print(f"wrote {len(rows)} rows to {out_csv}")
|
|
|
|
|
if __name__ == "__main__":
    # The first command-line argument, when given, overrides the output path.
    cli_args = sys.argv[1:]
    main(cli_args[0] if cli_args else "cbioportal_study_pmids.csv")
|
|
|