jim-bo committed on
Commit
56689a3
·
1 Parent(s): 43ba613

initial commit

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ index_dir/* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ batch_pdfs/
2
+ Pipfile
Dockerfile ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Use an official slim Python runtime as the parent image
FROM python:3.12-slim

# All following relative paths resolve against /app inside the container
WORKDIR /app

# Copy only the requirements file first so the dependency layer stays cached
# until requirements.txt itself changes
COPY requirements.txt ./
# --no-cache-dir keeps pip's download cache out of the image layer
RUN pip install --no-cache-dir -r requirements.txt

# Copy the rest of the application code
COPY . .

# Document the port the server listens on (publishing it still needs -p/--publish)
EXPOSE 8123

# Index location, relative to WORKDIR (presumably read by mcp_server.py - confirm)
ENV INDEX_DIR=./index_dir

# Run mcp_server.py when the container launches
CMD ["python", "mcp_server.py", "--host", "0.0.0.0", "--port", "8123"]
app.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import os
3
+ from langchain_community.vectorstores import Chroma
4
+ from langchain_openai import OpenAIEmbeddings
5
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
6
+ from langchain_community.document_loaders import PyPDFLoader
7
+
8
# Ensure the 'pdfs' input directory exists. exist_ok=True makes this a single
# race-free call instead of a separate exists() check followed by makedirs().
os.makedirs("pdfs", exist_ok=True)
11
+
12
def get_pdf_files():
    """Return the names of the PDF files in the 'pdfs' directory.

    The extension check is case-insensitive, so files such as ``REPORT.PDF``
    are included, and the names are sorted so the listing is stable
    (``os.listdir`` returns entries in arbitrary order).

    Returns:
        list[str]: Bare file names, without the directory component.
    """
    return sorted(
        name for name in os.listdir("pdfs") if name.lower().endswith(".pdf")
    )
15
+
16
def index_pdfs():
    """Index every PDF in the 'pdfs' directory into the Chroma store at ./db.

    Each file is loaded with PyPDFLoader, split with a
    RecursiveCharacterTextSplitter (chunk_size=1000, no overlap) and embedded
    with OpenAIEmbeddings. A failure on one file is recorded and does not stop
    the remaining files from being processed.

    Returns:
        str: A status message listing the files that were indexed and the
        files that failed (with the reason), or a note that nothing was found.
    """
    pdf_files = get_pdf_files()
    if not pdf_files:
        return "No PDF files found in the 'pdfs' directory."

    success_files = []
    failed_files = []

    # The splitter configuration is the same for every file, so build it once.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    # Created on first use inside the try block: a construction failure is then
    # still reported per file, while a successful client is reused.
    embeddings = None

    for pdf_file in pdf_files:
        try:
            file_path = os.path.join("pdfs", pdf_file)
            if os.path.getsize(file_path) == 0:
                failed_files.append(f"{pdf_file} (file is empty)")
                continue

            documents = PyPDFLoader(file_path).load()
            texts = text_splitter.split_documents(documents)
            if not texts:
                # Nothing to embed (e.g. an image-only PDF); report it clearly
                # rather than letting the vector store fail on an empty batch.
                failed_files.append(f"{pdf_file} (no extractable text)")
                continue

            if embeddings is None:
                embeddings = OpenAIEmbeddings()
            db = Chroma.from_documents(texts, embeddings, persist_directory="./db")
            db.persist()
            success_files.append(pdf_file)
        except Exception as e:
            # Best-effort batch: record the failure and move on to the next file.
            failed_files.append(f"{pdf_file} (Error: {e})")

    status = ""
    if success_files:
        status += f"Successfully indexed: {', '.join(success_files)}\n"
    if failed_files:
        status += f"Failed to index: {', '.join(failed_files)}"

    return status if status else "No files were processed."
50
+
51
def search(query):
    """Search the indexed PDFs for the given query.

    Args:
        query: Free-text search string entered by the user.

    Returns:
        str: One "Source/Content" entry per matching chunk; a prompt to enter
        a query when *query* is blank; or a "no results" message when the
        store returns nothing.
    """
    # Reject blank input before creating an embeddings client or opening the store.
    if not query or not query.strip():
        return "Please enter a search query."

    embeddings = OpenAIEmbeddings()
    db = Chroma(persist_directory="./db", embedding_function=embeddings)
    docs = db.similarity_search(query)
    if not docs:
        return "No results found."

    # .get() keeps one chunk with incomplete metadata from failing the whole search.
    return "".join(
        f"Source: {doc.metadata.get('source', 'unknown')}\n"
        f"Content: {doc.page_content}\n\n"
        for doc in docs
    )
61
+
62
# Two-tab Gradio interface: one tab triggers indexing, the other runs searches.
with gr.Blocks() as demo:
    gr.Markdown("# Simple Semantic Search App")

    with gr.Tab("Index PDFs"):
        # The file listing is evaluated once, while the UI is being built.
        pdf_files_display = gr.Textbox(
            label="Available PDF Files",
            interactive=False,
            value="\n".join(get_pdf_files()),
        )
        index_button = gr.Button("Index PDFs")
        index_status = gr.Textbox(label="Indexing Status", interactive=False)
        index_button.click(fn=index_pdfs, inputs=None, outputs=index_status)

    with gr.Tab("Search"):
        search_query = gr.Textbox(label="Search Query")
        search_button = gr.Button("Search")
        search_results = gr.Textbox(label="Search Results", interactive=False)
        search_button.click(fn=search, inputs=search_query, outputs=search_results)

# Start the web server only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()
app2.py ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import os
3
+ import json
4
+ from pathlib import Path
5
+ from typing import List, Dict
6
+
7
+ import numpy as np
8
+ from sentence_transformers import SentenceTransformer, CrossEncoder
9
+ import faiss
10
+
11
# --- Configuration ---
# Directory holding faiss.index and meta.jsonl. The INDEX_DIR environment
# variable (exported by the project's Dockerfile) overrides the default.
INDEX_DIR = os.environ.get("INDEX_DIR", "./index_dir")
EMBED_MODEL = "intfloat/e5-base-v2"
RERANKER_MODEL = "cross-encoder/ms-marco-MiniLM-L-6-v2"
FETCH_K = 40  # candidates fetched from the FAISS index before reranking
TOP_K = 8  # reranked results included in the output

# --- Global variables to hold models and data ---
# All four stay None until load_models_and_data() assigns them.
index = None
meta = None
embedder = None
reranker = None
23
+
24
+ # --- Utility Functions (adapted from pdf_semsearch.py) ---
25
+
26
def e5_prefix(text: str, is_query: bool, model_name: str) -> str:
    """Prepend the E5 role prefix when the model name contains "e5".

    Args:
        text: The raw query or passage text.
        is_query: True selects the "query" prefix, False selects "passage".
        model_name: Embedding model name; matched case-insensitively.

    Returns:
        The prefixed text for e5 models, otherwise *text* unchanged.
    """
    if "e5" not in model_name.lower():
        return text
    role = "query" if is_query else "passage"
    return f"{role}: {text}"
31
+
32
def read_metadata(meta_path: str) -> List[Dict]:
    """Read metadata records from a JSONL file.

    Args:
        meta_path: Path to a UTF-8 text file with one JSON document per line.

    Returns:
        The parsed records, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(meta_path, "r", encoding="utf-8") as f:
        # Blank lines (e.g. a trailing newline at end of file) carry no record;
        # skip them instead of letting json.loads raise on an empty string.
        return [json.loads(line) for line in f if line.strip()]
39
+
40
def pretty_snippet(s: str, max_len: int = 320) -> str:
    """Collapse whitespace runs to single spaces and truncate for display.

    Args:
        s: Text to clean up.
        max_len: Length above which the text is cut and ends with an ellipsis.

    Returns:
        The cleaned text, shortened to ``max_len - 1`` characters plus "…"
        when it is longer than *max_len*.
    """
    collapsed = " ".join(s.split())
    if len(collapsed) > max_len:
        return collapsed[: max_len - 1] + "…"
    return collapsed
44
+
45
+ # --- Model and Data Loading ---
46
+
47
def load_models_and_data():
    """Populate the module-level index, metadata, embedder and reranker.

    Reads ``faiss.index`` and ``meta.jsonl`` from INDEX_DIR, then instantiates
    the embedding and reranker models named by EMBED_MODEL and RERANKER_MODEL.

    Raises:
        FileNotFoundError: If either index file is missing from INDEX_DIR.
    """
    global index, meta, embedder, reranker

    index_path = os.path.join(INDEX_DIR, "faiss.index")
    meta_path = os.path.join(INDEX_DIR, "meta.jsonl")

    # Both artefacts are required; fail before loading any model.
    if not all(os.path.exists(path) for path in (index_path, meta_path)):
        message = (
            f"Index not found in '{INDEX_DIR}'. "
            "Please run the indexing command from pdf_semsearch.py first."
        )
        raise FileNotFoundError(message)

    print(f"[*] Loading FAISS index: {index_path}")
    index = faiss.read_index(index_path)

    print("[*] Loading metadata…")
    meta = read_metadata(meta_path)

    print(f"[*] Loading embedding model: {EMBED_MODEL}")
    embedder = SentenceTransformer(EMBED_MODEL)

    print(f"[*] Loading reranker model: {RERANKER_MODEL}")
    reranker = CrossEncoder(RERANKER_MODEL)

    print("[✓] Models and data loaded.")
72
+
73
+ # --- Search Function ---
74
+
75
def search(query: str):
    """Run a semantic search over the loaded index and format the results.

    The query is embedded, FETCH_K nearest chunks are fetched from the FAISS
    index, the candidates are rescored with the reranker, and the TOP_K best
    are rendered as Markdown.

    Args:
        query: Free-text search string entered by the user.

    Returns:
        str: Markdown with one entry per result, or a short message when the
        query is blank, the models are not loaded, or nothing matched.
    """
    if not query or not query.strip():
        return "Please enter a search query."

    # Test for None explicitly: "loaded" means "assigned", and relying on the
    # truthiness of model/index objects (or of an empty list) is fragile.
    if any(obj is None for obj in (index, meta, embedder, reranker)):
        return "Error: Models or data not loaded. Please check the console."

    # 1. Embed the query
    query_text = e5_prefix(query, is_query=True, model_name=EMBED_MODEL)
    qvec = embedder.encode([query_text], normalize_embeddings=True).astype("float32")

    # 2. Search the FAISS index
    D, I = index.search(qvec, FETCH_K)

    # 3. Retrieve candidates
    candidates = []
    for j, idx in enumerate(I[0]):
        # -1 is skipped as in the original; ids beyond the metadata (index and
        # meta.jsonl out of sync) are skipped instead of raising IndexError.
        if idx < 0 or idx >= len(meta):
            continue
        rec = dict(meta[idx])
        rec["ann_score"] = float(D[0][j])
        candidates.append(rec)

    if not candidates:
        return "No results found."

    # 4. Rerank the candidates
    pairs = [(query, c["text"]) for c in candidates]
    scores = reranker.predict(pairs)
    for c, s in zip(candidates, scores):
        c["rerank_score"] = float(s)

    candidates.sort(key=lambda x: x["rerank_score"], reverse=True)

    # 5. Format the top results for display
    parts = [f"## Results for: \"{query}\"\n\n"]
    for i, r in enumerate(candidates[:TOP_K], start=1):
        base = Path(r["doc_path"]).name
        score = r.get("rerank_score", r["ann_score"])
        parts.append(
            f"**{i}. {base} (Page: {r['page']}, Score: {score:.3f})**\n\n"
            f"> {pretty_snippet(r['text'])}\n\n"
            # The rule needs its own paragraph; without the trailing blank
            # line the next result's header is glued onto the "---".
            "---\n\n"
        )

    return "".join(parts)
126
+
127
+ # --- Gradio App ---
128
+
129
def create_gradio_app():
    """Build the Gradio Blocks UI and return it without launching.

    The layout is a header, a row with the query textbox and a search button,
    and a Markdown pane that receives the output of ``search``.
    """
    with gr.Blocks(theme=gr.themes.Soft()) as iface:
        gr.Markdown(
            """
            # Semantic PDF Search
            Enter a query to search through the indexed PDF documents.
            The index must be created first using `pdf_semsearch.py`.
            """
        )

        with gr.Row():
            query_input = gr.Textbox(
                label="Search Query",
                placeholder="e.g., KRAS G12C eligibility in lung cancer",
                lines=1,
                scale=4,
            )
            search_button = gr.Button("Search", variant="primary", scale=1)

        results_output = gr.Markdown(label="Search Results")

        # Clicking the button and submitting the textbox trigger the same search.
        for register in (search_button.click, query_input.submit):
            register(fn=search, inputs=query_input, outputs=results_output)

    return iface
161
+
162
if __name__ == "__main__":
    # Load the index and models first so the first query finds them ready.
    load_models_and_data()
    create_gradio_app().launch()
cbioportal_study_pmids.csv ADDED
@@ -0,0 +1,733 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ studyId,pmid
2
+ all_stjude_2015,25730765
3
+ acyc_fmi_2014,24418857
4
+ acyc_mgh_2016,26829750
5
+ appendiceal_msk_2022,36493333
6
+ blca_plasmacytoid_mskcc_2016,26901067
7
+ blca_mskcc_solit_2014,25092538
8
+ blca_nmibc_2017,28583311
9
+ brca_mapk_hp_msk_2021,34795269
10
+ bowel_colitis_msk_2022,36611031
11
+ bladder_columbia_msk_2018,29625057
12
+ bladder_msk_2023,37682528
13
+ bm_nsclc_mskcc_2023,37591896
14
+ cfdna_msk_2019,31768066
15
+ ccrcc_dfci_2019,29301960
16
+ cervix_msk_2023,37643132
17
+ chol_jhu_2013,24185509
18
+ chol_nccs_2013,24185513
19
+ chol_nus_2012,22561520
20
+ coadread_mskcc,25164765
21
+ cllsll_icgc_2011,22158541
22
+ coad_caseccc_2015,25583493
23
+ chol_msk_2018,29848569
24
+ chol_icgc_2017,28667006
25
+ coadread_mskresistance_2022,36355783
26
+ cscc_dfarber_2015,25589618
27
+ ctcl_columbia_2015,26551667
28
+ crc_eo_2020,34405229
29
+ crc_apc_impact_2020,32730818
30
+ crc_nigerian_2020,34819518
31
+ crc_dd_2022,35235413
32
+ difg_msk_2023,37910594
33
+ escc_ucla_2014,24686850
34
+ hcc_msk_venturaa_2018,30052636
35
+ gct_msk_2020,32897884
36
+ egc_msk_tp53_ccr_2022,35377946
37
+ gbc_mskcc_2022,36228155
38
+ gist_msk_2022,36593350
39
+ egc_msk_2023,37699004
40
+ hcc_jcopo_msk_2023,37769223
41
+ histiocytosis_cobi_msk_2019,30867592
42
+ ihch_ismms_2015,25608663
43
+ lgsoc_mapk_msk_2022,35443055
44
+ luad_tsp,18948947
45
+ lymphoma_cellline_msk_2020,33067607
46
+ lung_msk_mind_2020,36038778
47
+ mbc_msk_2021,33863915
48
+ mnm_washu_2016,27959731
49
+ metastatic_solid_tumors_mich_2017,28783718
50
+ mixed_kunga_msk_2022,35585047
51
+ msk_ch_ped_2021,35078859
52
+ mtnn_msk_2022,37078708
53
+ msk_ch_2023,38147626
54
+ nsclc_unito_2016,27346245
55
+ nsclc_ctdx_msk_2022,36357680
56
+ pediatric_dkfz_2017,29489754
57
+ paired_bladder_2022,36543146
58
+ scco_mskcc,24658004
59
+ sarc_mskcc,20601955
60
+ rectal_msk_2019,31591597
61
+ rbl_cfdna_msk_2020,32633890
62
+ rbl_mskcc_2020,33466343
63
+ rms_msk_2023,37315267
64
+ sarcoma_msk_2023,37350195
65
+ skcm_vanderbilt_mskcc_2015,32913971
66
+ soft_tissue_msk_2023,37730754
67
+ ucec_ccr_msk_2022,35849120
68
+ ucec_ancestry_cds_msk_2023,37651310
69
+ ucec_msk_2024,38653864
70
+ urcc_mskcc_2016,27713405
71
+ utuc_mskcc_2015,26278805
72
+ utuc_msk_2019,32332851
73
+ utuc_pdx_msk_2019,32332851
74
+ plmeso_msk_2024,38630790
75
+ pancreas_msk_2024,39214094
76
+ lms_msk_2024,38488807
77
+ prostate_msk_2024,38949888
78
+ panet_msk_2018,30687805
79
+ makeanimpact_ccr_2023,36862133
80
+ acbc_mskcc_2015,26095796
81
+ blca_tcga_pan_can_atlas_2018,29625048
82
+ blca_tcga_pan_can_atlas_2018,29596782
83
+ blca_tcga_pan_can_atlas_2018,29622463
84
+ blca_tcga_pan_can_atlas_2018,29617662
85
+ blca_tcga_pan_can_atlas_2018,29625055
86
+ blca_tcga_pan_can_atlas_2018,29625050
87
+ blca_tcga_pan_can_atlas_2018,29617662
88
+ blca_tcga_pan_can_atlas_2018,32214244
89
+ blca_tcga_pan_can_atlas_2018,29625049
90
+ blca_tcga_pan_can_atlas_2018,29850653
91
+ blca_tcga_pan_can_atlas_2018,36334560
92
+ acc_2019,31483290
93
+ blca_msk_tcga_2020,30290956
94
+ pcnsl_msk_2024,38995739
95
+ msk_ctdna_vte_2024,39147831
96
+ cellline_ccle_broad,22460905
97
+ ccrcc_irc_2014,24487277
98
+ ccrcc_utokyo_2013,23797736
99
+ coadread_genentech,22895193
100
+ cellline_nci60,22802077
101
+ cll_iuopa_2015,26200345
102
+ brca_metabric,27161491
103
+ brca_metabric,30867590
104
+ brca_metabric,22522925
105
+ coadread_dfci_2016,27149842
106
+ cll_broad_2015,26466571
107
+ brca_tcga_pan_can_atlas_2018,29625048
108
+ brca_tcga_pan_can_atlas_2018,29596782
109
+ brca_tcga_pan_can_atlas_2018,29622463
110
+ brca_tcga_pan_can_atlas_2018,29617662
111
+ brca_tcga_pan_can_atlas_2018,29625055
112
+ brca_tcga_pan_can_atlas_2018,29625050
113
+ brca_tcga_pan_can_atlas_2018,29617662
114
+ brca_tcga_pan_can_atlas_2018,30643250
115
+ brca_tcga_pan_can_atlas_2018,32214244
116
+ brca_tcga_pan_can_atlas_2018,29625049
117
+ brca_tcga_pan_can_atlas_2018,29850653
118
+ brca_tcga_pan_can_atlas_2018,36334560
119
+ cesc_tcga_pan_can_atlas_2018,29625048
120
+ cesc_tcga_pan_can_atlas_2018,29596782
121
+ cesc_tcga_pan_can_atlas_2018,29622463
122
+ cesc_tcga_pan_can_atlas_2018,29617662
123
+ cesc_tcga_pan_can_atlas_2018,29625055
124
+ cesc_tcga_pan_can_atlas_2018,29625050
125
+ cesc_tcga_pan_can_atlas_2018,29617662
126
+ cesc_tcga_pan_can_atlas_2018,30643250
127
+ cesc_tcga_pan_can_atlas_2018,32214244
128
+ cesc_tcga_pan_can_atlas_2018,29625049
129
+ cesc_tcga_pan_can_atlas_2018,29850653
130
+ cesc_tcga_pan_can_atlas_2018,36334560
131
+ chol_tcga_pan_can_atlas_2018,29625048
132
+ chol_tcga_pan_can_atlas_2018,29596782
133
+ chol_tcga_pan_can_atlas_2018,29622463
134
+ chol_tcga_pan_can_atlas_2018,29617662
135
+ chol_tcga_pan_can_atlas_2018,29625055
136
+ chol_tcga_pan_can_atlas_2018,29625050
137
+ chol_tcga_pan_can_atlas_2018,29617662
138
+ chol_tcga_pan_can_atlas_2018,32214244
139
+ chol_tcga_pan_can_atlas_2018,29625049
140
+ chol_tcga_pan_can_atlas_2018,29850653
141
+ chol_tcga_pan_can_atlas_2018,36334560
142
+ ccle_broad_2019,31068700
143
+ ccle_broad_2019,31978347
144
+ coad_cptac_2019,31031003
145
+ coadread_cass_2020,32888432
146
+ cll_broad_2022,35927489
147
+ coadread_tcga_pub,22810696
148
+ desm_broad_2015,26343386
149
+ dlbc_broad_2012,22343534
150
+ cscc_hgsc_bcm_2014,25303977
151
+ coadread_tcga_pan_can_atlas_2018,29625048
152
+ coadread_tcga_pan_can_atlas_2018,29596782
153
+ coadread_tcga_pan_can_atlas_2018,29622463
154
+ coadread_tcga_pan_can_atlas_2018,29617662
155
+ coadread_tcga_pan_can_atlas_2018,29625055
156
+ coadread_tcga_pan_can_atlas_2018,29625050
157
+ coadread_tcga_pan_can_atlas_2018,29617662
158
+ coadread_tcga_pan_can_atlas_2018,30643250
159
+ coadread_tcga_pan_can_atlas_2018,32214244
160
+ coadread_tcga_pan_can_atlas_2018,29625049
161
+ coadread_tcga_pan_can_atlas_2018,29850653
162
+ coadread_tcga_pan_can_atlas_2018,36334560
163
+ dlbc_tcga_pan_can_atlas_2018,29625048
164
+ dlbc_tcga_pan_can_atlas_2018,29596782
165
+ dlbc_tcga_pan_can_atlas_2018,29622463
166
+ dlbc_tcga_pan_can_atlas_2018,29617662
167
+ dlbc_tcga_pan_can_atlas_2018,29625055
168
+ dlbc_tcga_pan_can_atlas_2018,29625050
169
+ dlbc_tcga_pan_can_atlas_2018,29617662
170
+ dlbc_tcga_pan_can_atlas_2018,32214244
171
+ dlbc_tcga_pan_can_atlas_2018,29625049
172
+ dlbc_tcga_pan_can_atlas_2018,29850653
173
+ dlbc_tcga_pan_can_atlas_2018,36334560
174
+ difg_glass_2019,31748746
175
+ cscc_ucsf_2021,34272401
176
+ cscc_ranson_2022,35982973
177
+ difg_glass,35649412
178
+ difg_glass,38117484
179
+ es_dfarber_broad_2014,25186949
180
+ esca_tcga_pan_can_atlas_2018,29625048
181
+ esca_tcga_pan_can_atlas_2018,29596782
182
+ esca_tcga_pan_can_atlas_2018,29622463
183
+ esca_tcga_pan_can_atlas_2018,29617662
184
+ esca_tcga_pan_can_atlas_2018,29625055
185
+ esca_tcga_pan_can_atlas_2018,29625050
186
+ esca_tcga_pan_can_atlas_2018,29617662
187
+ esca_tcga_pan_can_atlas_2018,32214244
188
+ esca_tcga_pan_can_atlas_2018,29625049
189
+ esca_tcga_pan_can_atlas_2018,29850653
190
+ esca_tcga_pan_can_atlas_2018,36334560
191
+ gist_msk_2023,37477937
192
+ kirc_tcga_pub,23792563
193
+ hcc_msk_2024,38864854
194
+ laml_tcga_pub,23634996
195
+ luad_mskcc_2023_met_organotropism,37084736
196
+ mbl_sickkids_2016,26760213
197
+ mixed_pipseq_2017,28007021
198
+ mds_mskcc_2020,27276561
199
+ mds_mskcc_2020,30333627
200
+ mds_mskcc_2020,24030381
201
+ mixed_msk_tcga_2021,34635660
202
+ nhl_bcgsc_2011,21796119
203
+ nhl_bcgsc_2013,23699601
204
+ prad_broad_2013,23622249
205
+ prad_mskcc,20579941
206
+ prad_tcga_pub,26544944
207
+ pcpg_tcga_pub,28162975
208
+ pptc_2019,31693904
209
+ prad_cdk12_mskcc_2020,32317181
210
+ prad_pik3r1_msk_2021,35670774
211
+ pog570_bcgsc_2020,35121966
212
+ prad_organoids_msk_2022,35617398
213
+ ptad_msk_2024,38758238
214
+ prad_msk_mdanderson_2023,38488813
215
+ stad_tcga_pub,25079317
216
+ rectal_msk_2022,35970919
217
+ sarcoma_msk_2022,35705558
218
+ hnsc_tcga_pan_can_atlas_2018,29625048
219
+ hnsc_tcga_pan_can_atlas_2018,29596782
220
+ hnsc_tcga_pan_can_atlas_2018,29622463
221
+ hnsc_tcga_pan_can_atlas_2018,29617662
222
+ hnsc_tcga_pan_can_atlas_2018,29625055
223
+ hnsc_tcga_pan_can_atlas_2018,29625050
224
+ hnsc_tcga_pan_can_atlas_2018,29617662
225
+ hnsc_tcga_pan_can_atlas_2018,30643250
226
+ hnsc_tcga_pan_can_atlas_2018,32214244
227
+ hnsc_tcga_pan_can_atlas_2018,29625049
228
+ hnsc_tcga_pan_can_atlas_2018,29850653
229
+ hnsc_tcga_pan_can_atlas_2018,36334560
230
+ kich_tcga_pan_can_atlas_2018,29625048
231
+ kich_tcga_pan_can_atlas_2018,29596782
232
+ kich_tcga_pan_can_atlas_2018,29622463
233
+ kich_tcga_pan_can_atlas_2018,29617662
234
+ kich_tcga_pan_can_atlas_2018,29625055
235
+ kich_tcga_pan_can_atlas_2018,29625050
236
+ kich_tcga_pan_can_atlas_2018,29617662
237
+ kich_tcga_pan_can_atlas_2018,32214244
238
+ kich_tcga_pan_can_atlas_2018,29625049
239
+ kich_tcga_pan_can_atlas_2018,29850653
240
+ kich_tcga_pan_can_atlas_2018,36334560
241
+ kirc_tcga_pan_can_atlas_2018,29625048
242
+ kirc_tcga_pan_can_atlas_2018,29596782
243
+ kirc_tcga_pan_can_atlas_2018,29622463
244
+ kirc_tcga_pan_can_atlas_2018,29617662
245
+ kirc_tcga_pan_can_atlas_2018,29625055
246
+ kirc_tcga_pan_can_atlas_2018,29625050
247
+ kirc_tcga_pan_can_atlas_2018,29617662
248
+ kirc_tcga_pan_can_atlas_2018,30643250
249
+ kirc_tcga_pan_can_atlas_2018,32214244
250
+ kirc_tcga_pan_can_atlas_2018,29625049
251
+ kirc_tcga_pan_can_atlas_2018,29850653
252
+ kirc_tcga_pan_can_atlas_2018,36334560
253
+ kirp_tcga_pan_can_atlas_2018,29625048
254
+ kirp_tcga_pan_can_atlas_2018,29596782
255
+ kirp_tcga_pan_can_atlas_2018,29622463
256
+ kirp_tcga_pan_can_atlas_2018,29617662
257
+ kirp_tcga_pan_can_atlas_2018,29625055
258
+ kirp_tcga_pan_can_atlas_2018,29625050
259
+ kirp_tcga_pan_can_atlas_2018,29617662
260
+ kirp_tcga_pan_can_atlas_2018,30643250
261
+ kirp_tcga_pan_can_atlas_2018,32214244
262
+ kirp_tcga_pan_can_atlas_2018,29625049
263
+ kirp_tcga_pan_can_atlas_2018,29850653
264
+ kirp_tcga_pan_can_atlas_2018,36334560
265
+ laml_tcga_pan_can_atlas_2018,29625048
266
+ laml_tcga_pan_can_atlas_2018,29596782
267
+ laml_tcga_pan_can_atlas_2018,29622463
268
+ laml_tcga_pan_can_atlas_2018,29617662
269
+ laml_tcga_pan_can_atlas_2018,29625055
270
+ laml_tcga_pan_can_atlas_2018,29625050
271
+ laml_tcga_pan_can_atlas_2018,29617662
272
+ laml_tcga_pan_can_atlas_2018,32214244
273
+ laml_tcga_pan_can_atlas_2018,29625049
274
+ laml_tcga_pan_can_atlas_2018,29850653
275
+ laml_tcga_pan_can_atlas_2018,36334560
276
+ lihc_tcga_pan_can_atlas_2018,29625048
277
+ lihc_tcga_pan_can_atlas_2018,29596782
278
+ lihc_tcga_pan_can_atlas_2018,29622463
279
+ lihc_tcga_pan_can_atlas_2018,29617662
280
+ lihc_tcga_pan_can_atlas_2018,29625055
281
+ lihc_tcga_pan_can_atlas_2018,29625050
282
+ lihc_tcga_pan_can_atlas_2018,29617662
283
+ lihc_tcga_pan_can_atlas_2018,30643250
284
+ lihc_tcga_pan_can_atlas_2018,32214244
285
+ lihc_tcga_pan_can_atlas_2018,29625049
286
+ lihc_tcga_pan_can_atlas_2018,29850653
287
+ lihc_tcga_pan_can_atlas_2018,36334560
288
+ luad_tcga_pan_can_atlas_2018,29625048
289
+ luad_tcga_pan_can_atlas_2018,29596782
290
+ luad_tcga_pan_can_atlas_2018,29622463
291
+ luad_tcga_pan_can_atlas_2018,29617662
292
+ luad_tcga_pan_can_atlas_2018,29625055
293
+ luad_tcga_pan_can_atlas_2018,29625050
294
+ luad_tcga_pan_can_atlas_2018,29617662
295
+ luad_tcga_pan_can_atlas_2018,30643250
296
+ luad_tcga_pan_can_atlas_2018,32214244
297
+ luad_tcga_pan_can_atlas_2018,29625049
298
+ luad_tcga_pan_can_atlas_2018,29850653
299
+ luad_tcga_pan_can_atlas_2018,36334560
300
+ lusc_tcga_pan_can_atlas_2018,29625048
301
+ lusc_tcga_pan_can_atlas_2018,29596782
302
+ lusc_tcga_pan_can_atlas_2018,29622463
303
+ lusc_tcga_pan_can_atlas_2018,29617662
304
+ lusc_tcga_pan_can_atlas_2018,29625055
305
+ lusc_tcga_pan_can_atlas_2018,29625050
306
+ lusc_tcga_pan_can_atlas_2018,29617662
307
+ lusc_tcga_pan_can_atlas_2018,30643250
308
+ lusc_tcga_pan_can_atlas_2018,32214244
309
+ lusc_tcga_pan_can_atlas_2018,29625049
310
+ lusc_tcga_pan_can_atlas_2018,29850653
311
+ lusc_tcga_pan_can_atlas_2018,36334560
312
+ meso_tcga_pan_can_atlas_2018,29625048
313
+ meso_tcga_pan_can_atlas_2018,29596782
314
+ meso_tcga_pan_can_atlas_2018,29622463
315
+ meso_tcga_pan_can_atlas_2018,29617662
316
+ meso_tcga_pan_can_atlas_2018,29625055
317
+ meso_tcga_pan_can_atlas_2018,29625050
318
+ meso_tcga_pan_can_atlas_2018,29617662
319
+ meso_tcga_pan_can_atlas_2018,32214244
320
+ meso_tcga_pan_can_atlas_2018,29625049
321
+ meso_tcga_pan_can_atlas_2018,29850653
322
+ meso_tcga_pan_can_atlas_2018,36334560
323
+ ov_tcga_pan_can_atlas_2018,29625048
324
+ ov_tcga_pan_can_atlas_2018,29596782
325
+ ov_tcga_pan_can_atlas_2018,29622463
326
+ ov_tcga_pan_can_atlas_2018,29617662
327
+ ov_tcga_pan_can_atlas_2018,29625055
328
+ ov_tcga_pan_can_atlas_2018,29625050
329
+ ov_tcga_pan_can_atlas_2018,29617662
330
+ ov_tcga_pan_can_atlas_2018,30643250
331
+ ov_tcga_pan_can_atlas_2018,32214244
332
+ ov_tcga_pan_can_atlas_2018,29625049
333
+ ov_tcga_pan_can_atlas_2018,29850653
334
+ ov_tcga_pan_can_atlas_2018,36334560
335
+ paad_tcga_pan_can_atlas_2018,29625048
336
+ paad_tcga_pan_can_atlas_2018,29596782
337
+ paad_tcga_pan_can_atlas_2018,29622463
338
+ paad_tcga_pan_can_atlas_2018,29617662
339
+ paad_tcga_pan_can_atlas_2018,29625055
340
+ paad_tcga_pan_can_atlas_2018,29625050
341
+ paad_tcga_pan_can_atlas_2018,29617662
342
+ paad_tcga_pan_can_atlas_2018,30643250
343
+ paad_tcga_pan_can_atlas_2018,32214244
344
+ paad_tcga_pan_can_atlas_2018,29625049
345
+ paad_tcga_pan_can_atlas_2018,29850653
346
+ paad_tcga_pan_can_atlas_2018,36334560
347
+ pcpg_tcga_pan_can_atlas_2018,29625048
348
+ pcpg_tcga_pan_can_atlas_2018,29596782
349
+ pcpg_tcga_pan_can_atlas_2018,29622463
350
+ pcpg_tcga_pan_can_atlas_2018,29617662
351
+ pcpg_tcga_pan_can_atlas_2018,29625055
352
+ pcpg_tcga_pan_can_atlas_2018,29625050
353
+ pcpg_tcga_pan_can_atlas_2018,29617662
354
+ pcpg_tcga_pan_can_atlas_2018,30643250
355
+ pcpg_tcga_pan_can_atlas_2018,32214244
356
+ pcpg_tcga_pan_can_atlas_2018,29625049
357
+ pcpg_tcga_pan_can_atlas_2018,29850653
358
+ pcpg_tcga_pan_can_atlas_2018,36334560
359
+ prad_tcga_pan_can_atlas_2018,29625048
360
+ prad_tcga_pan_can_atlas_2018,29596782
361
+ prad_tcga_pan_can_atlas_2018,29622463
362
+ prad_tcga_pan_can_atlas_2018,29617662
363
+ prad_tcga_pan_can_atlas_2018,29625055
364
+ prad_tcga_pan_can_atlas_2018,29625050
365
+ prad_tcga_pan_can_atlas_2018,29617662
366
+ prad_tcga_pan_can_atlas_2018,30643250
367
+ prad_tcga_pan_can_atlas_2018,32214244
368
+ prad_tcga_pan_can_atlas_2018,29625049
369
+ prad_tcga_pan_can_atlas_2018,29850653
370
+ prad_tcga_pan_can_atlas_2018,36334560
371
+ sarc_tcga_pan_can_atlas_2018,29625048
372
+ sarc_tcga_pan_can_atlas_2018,29596782
373
+ sarc_tcga_pan_can_atlas_2018,29622463
374
+ sarc_tcga_pan_can_atlas_2018,29617662
375
+ sarc_tcga_pan_can_atlas_2018,29625055
376
+ sarc_tcga_pan_can_atlas_2018,29625050
377
+ sarc_tcga_pan_can_atlas_2018,29617662
378
+ sarc_tcga_pan_can_atlas_2018,32214244
379
+ sarc_tcga_pan_can_atlas_2018,29625049
380
+ sarc_tcga_pan_can_atlas_2018,29850653
381
+ sarc_tcga_pan_can_atlas_2018,36334560
382
+ skcm_tcga_pan_can_atlas_2018,29625048
383
+ skcm_tcga_pan_can_atlas_2018,29596782
384
+ skcm_tcga_pan_can_atlas_2018,29622463
385
+ skcm_tcga_pan_can_atlas_2018,29617662
386
+ skcm_tcga_pan_can_atlas_2018,29625055
387
+ skcm_tcga_pan_can_atlas_2018,29625050
388
+ skcm_tcga_pan_can_atlas_2018,29617662
389
+ skcm_tcga_pan_can_atlas_2018,30643250
390
+ skcm_tcga_pan_can_atlas_2018,32214244
391
+ skcm_tcga_pan_can_atlas_2018,29625049
392
+ skcm_tcga_pan_can_atlas_2018,29850653
393
+ skcm_tcga_pan_can_atlas_2018,36334560
394
+ stad_tcga_pan_can_atlas_2018,29625048
395
+ stad_tcga_pan_can_atlas_2018,29596782
396
+ stad_tcga_pan_can_atlas_2018,29622463
397
+ stad_tcga_pan_can_atlas_2018,29617662
398
+ stad_tcga_pan_can_atlas_2018,29625055
399
+ stad_tcga_pan_can_atlas_2018,29625050
400
+ stad_tcga_pan_can_atlas_2018,29617662
401
+ stad_tcga_pan_can_atlas_2018,32214244
402
+ stad_tcga_pan_can_atlas_2018,29625049
403
+ stad_tcga_pan_can_atlas_2018,29850653
404
+ stad_tcga_pan_can_atlas_2018,36334560
405
+ tgct_tcga_pan_can_atlas_2018,29625048
406
+ tgct_tcga_pan_can_atlas_2018,29596782
407
+ tgct_tcga_pan_can_atlas_2018,29622463
408
+ tgct_tcga_pan_can_atlas_2018,29617662
409
+ tgct_tcga_pan_can_atlas_2018,29625055
410
+ tgct_tcga_pan_can_atlas_2018,29625050
411
+ tgct_tcga_pan_can_atlas_2018,29617662
412
+ tgct_tcga_pan_can_atlas_2018,32214244
413
+ tgct_tcga_pan_can_atlas_2018,29625049
414
+ tgct_tcga_pan_can_atlas_2018,29850653
415
+ tgct_tcga_pan_can_atlas_2018,36334560
416
+ thca_tcga_pan_can_atlas_2018,29625048
417
+ thca_tcga_pan_can_atlas_2018,29596782
418
+ thca_tcga_pan_can_atlas_2018,29622463
419
+ thca_tcga_pan_can_atlas_2018,29617662
420
+ thca_tcga_pan_can_atlas_2018,29625055
421
+ thca_tcga_pan_can_atlas_2018,29625050
422
+ thca_tcga_pan_can_atlas_2018,29617662
423
+ thca_tcga_pan_can_atlas_2018,30643250
424
+ thca_tcga_pan_can_atlas_2018,32214244
425
+ thca_tcga_pan_can_atlas_2018,29625049
426
+ thca_tcga_pan_can_atlas_2018,29850653
427
+ thca_tcga_pan_can_atlas_2018,36334560
428
+ thym_tcga_pan_can_atlas_2018,29625048
429
+ thym_tcga_pan_can_atlas_2018,29596782
430
+ thym_tcga_pan_can_atlas_2018,29622463
431
+ thym_tcga_pan_can_atlas_2018,29617662
432
+ thym_tcga_pan_can_atlas_2018,29625055
433
+ thym_tcga_pan_can_atlas_2018,29625050
434
+ thym_tcga_pan_can_atlas_2018,29617662
435
+ thym_tcga_pan_can_atlas_2018,32214244
436
+ thym_tcga_pan_can_atlas_2018,29625049
437
+ thym_tcga_pan_can_atlas_2018,29850653
438
+ thym_tcga_pan_can_atlas_2018,36334560
439
+ ucec_tcga_pan_can_atlas_2018,29625048
440
+ ucec_tcga_pan_can_atlas_2018,29596782
441
+ ucec_tcga_pan_can_atlas_2018,29622463
442
+ ucec_tcga_pan_can_atlas_2018,29617662
443
+ ucec_tcga_pan_can_atlas_2018,29625055
444
+ ucec_tcga_pan_can_atlas_2018,29625050
445
+ ucec_tcga_pan_can_atlas_2018,29617662
446
+ ucec_tcga_pan_can_atlas_2018,30643250
447
+ ucec_tcga_pan_can_atlas_2018,32214244
448
+ ucec_tcga_pan_can_atlas_2018,29625049
449
+ ucec_tcga_pan_can_atlas_2018,29850653
450
+ ucec_tcga_pan_can_atlas_2018,36334560
451
+ ucs_tcga_pan_can_atlas_2018,29625048
452
+ ucs_tcga_pan_can_atlas_2018,29596782
453
+ ucs_tcga_pan_can_atlas_2018,29622463
454
+ ucs_tcga_pan_can_atlas_2018,29617662
455
+ ucs_tcga_pan_can_atlas_2018,29625055
456
+ ucs_tcga_pan_can_atlas_2018,29625050
457
+ ucs_tcga_pan_can_atlas_2018,29617662
458
+ ucs_tcga_pan_can_atlas_2018,32214244
459
+ ucs_tcga_pan_can_atlas_2018,29625049
460
+ ucs_tcga_pan_can_atlas_2018,29850653
461
+ ucs_tcga_pan_can_atlas_2018,36334560
462
+ uvm_tcga_pan_can_atlas_2018,29625048
463
+ uvm_tcga_pan_can_atlas_2018,29596782
464
+ uvm_tcga_pan_can_atlas_2018,29622463
465
+ uvm_tcga_pan_can_atlas_2018,29617662
466
+ uvm_tcga_pan_can_atlas_2018,29625055
467
+ uvm_tcga_pan_can_atlas_2018,29625050
468
+ uvm_tcga_pan_can_atlas_2018,29617662
469
+ uvm_tcga_pan_can_atlas_2018,32214244
470
+ uvm_tcga_pan_can_atlas_2018,29625049
471
+ uvm_tcga_pan_can_atlas_2018,29850653
472
+ uvm_tcga_pan_can_atlas_2018,36334560
473
+ coad_silu_2022,37202560
474
+ acc_tcga_pan_can_atlas_2018,29625048
475
+ acc_tcga_pan_can_atlas_2018,29596782
476
+ acc_tcga_pan_can_atlas_2018,29622463
477
+ acc_tcga_pan_can_atlas_2018,29617662
478
+ acc_tcga_pan_can_atlas_2018,29625055
479
+ acc_tcga_pan_can_atlas_2018,29625050
480
+ acc_tcga_pan_can_atlas_2018,29617662
481
+ acc_tcga_pan_can_atlas_2018,32214244
482
+ acc_tcga_pan_can_atlas_2018,29625049
483
+ acc_tcga_pan_can_atlas_2018,29850653
484
+ acc_tcga_pan_can_atlas_2018,36334560
485
+ msk_chord_2024,39506116
486
+ pancan_mappyacts_2022,35292802
487
+ msk_met_2021,35120664
488
+ blca_msk_2024,39499893
489
+ brca_fuscc_2020,32719455
490
+ thyroid_gatci_2024,38412093
491
+ braf_msk_impact_2024,38922339
492
+ bcc_unige_2016,26950094
493
+ ampca_bcm_2016,26804919
494
+ blca_dfarber_mskcc_2014,25096233
495
+ blca_mskcc_solit_2012,23897969
496
+ blca_bgi,24121792
497
+ all_stjude_2013,23334668
498
+ acyc_mskcc_2013,23685749
499
+ acyc_jhu_2016,26862087
500
+ acyc_mda_2015,26631609
501
+ acyc_sanger_2013,23778141
502
+ all_stjude_2016,27776115
503
+ angs_project_painter_2018,32042194
504
+ bfn_duke_nus_2015,26437033
505
+ blca_cornell_2016,27749842
506
+ aml_ohsu_2018,30333627
507
+ blca_bcan_hcrn_2022,36333289
508
+ aml_ohsu_2022,35868306
509
+ asclc_msk_2024,39185963
510
+ brca_bccrc_xenograft_2014,25470049
511
+ brca_broad,22722202
512
+ brca_bccrc,22495314
513
+ brca_igr_2015,28027327
514
+ blca_tcga_pub_2017,28988769
515
+ brca_mskcc_2019,31552290
516
+ brca_jup_msk_2020,33263939
517
+ brain_cptac_2020,33242424
518
+ brca_cptac_2020,33212010
519
+ brca_dfci_2020,32404308
520
+ brca_sanger,22722201
521
+ brca_tcga_pub,23000897
522
+ breast_msk_2018,30205045
523
+ breast_alpelisib_2020,32864625
524
+ brca_smc_2018,29713003
525
+ breast_ink4_msk_2021,34544752
526
+ brca_pareja_msk_2020,32220886
527
+ crc_msk_2017,29316426
528
+ pancan_pcawg_2020,32025007
529
+ pdac_msk_2024,39753968
530
+ braf_msk_archer_2024,38922339
531
+ sarcoma_ucla_2024,39305899
532
+ csf_msk_2024,39289779
533
+ normal_skin_fibroblast_2024,39091884
534
+ normal_skin_keratinocytes_2024,39091884
535
+ normal_skin_melanocytes_2024,33029006
536
+ normal_skin_melanocytes_2024,39091884
537
+ normal_skin_melanocytes_2024,38895302
538
+ normal_skin_melanocytes_2024,39975212
539
+ chl_sccc_2023,36723991
540
+ blca_msk_2025,40256659
541
+ esca_broad,23525077
542
+ escc_icgc,24670651
543
+ es_iocurie_2014,25223734
544
+ gbc_shanghai_2014,24997986
545
+ egc_tmucih_2015,25583476
546
+ egc_msk_2017,29122777
547
+ dlbcl_duke_2017,28985567
548
+ dlbcl_dfci_2018,29713087
549
+ gbc_msk_2018,30427539
550
+ egc_trap_msk_2020,32437664
551
+ egc_mskcc_2020,33795256
552
+ egc_trap_ccr_msk_2023,37406106
553
+ hnsc_broad,21798893
554
+ hnc_mskcc_2016,27442865
555
+ hcc_inserm_fr_2015,25822088
556
+ gct_msk_2016,27646943
557
+ hcc_mskimpact_2018,30373752
558
+ glioma_mskcc_2019,31263031
559
+ glioma_msk_2018,30675060
560
+ hccihch_pku_2019,31130341
561
+ hgsoc_msk_2021,34819508
562
+ hcc_meric_2021,35508466
563
+ hcc_clca_2024,38355797
564
+ kirc_bgi,22138691
565
+ kich_tcga_pub,25155756
566
+ hnsc_jhu,21798897
567
+ hnsc_mdanderson_2013,23619168
568
+ ihch_smmu_2014,25526346
569
+ ihch_mskcc_2020,33963001
570
+ ihch_msk_2021,33765338
571
+ lgg_ucsf_2014,24336570
572
+ lgggbm_tcga_pub,26824661
573
+ lihc_amc_prv,24798001
574
+ lihc_riken,22634756
575
+ luad_mskcc_2015,25765070
576
+ luad_broad,22980975
577
+ liad_inserm_fr_2014,24735922
578
+ lcll_broad_2013,23415222
579
+ luad_msk_npjpo_2021,34290393
580
+ luad_cptac_2020,32649874
581
+ lusc_tcga_pub,22960745
582
+ mbl_broad_2012,22820256
583
+ mbl_icgc,22832583
584
+ mbl_pcgp,22722829
585
+ lung_msk_2017,28336552
586
+ luad_mskcc_2020,32791233
587
+ luad_oncosg_2020,32015526
588
+ lung_smc_2016,27634761
589
+ lung_pdx_msk_2021,35440124
590
+ mbl_dkfz_2017,28726821
591
+ lusc_cptac_2021,34358469
592
+ lung_nci_2022,34493867
593
+ mm_broad,24434212
594
+ mcl_idibips_2013,24145436
595
+ mds_tokyo_2011,21909114
596
+ mel_tsam_liang_2017,28373299
597
+ mel_ucla_2016,26997480
598
+ mixed_allen_2018,30150660
599
+ mixed_selpercatinib_2020,35304457
600
+ mixed_cfdna_msk_2020,34059130
601
+ mel_dfci_2019,31792460
602
+ mel_mskimpact_2020,33509808
603
+ mbn_sfu_2023,36201743
604
+ mbn_msk_2024,38497151
605
+ npc_nusingapore,24952746
606
+ nepc_wcm_2016,26855148
607
+ nbl_ucologne_2015,26466568
608
+ nbl_broad_2013,23334666
609
+ mrt_bcgsc_2016,26977886
610
+ mpn_cimr_2013,24325359
611
+ nsclc_mskcc_2015,25765070
612
+ nsclc_mskcc_2018,29657128
613
+ msk_access_2021,34145282
614
+ mng_utoronto_2021,34433969
615
+ mpnst_mskcc,25240281
616
+ nbl_amc_2012,22367537
617
+ nccrcc_genentech_2014,25401301
618
+ ov_tcga_pub,21720365
619
+ paac_jhu_2014,24293293
620
+ paad_icgc,23103869
621
+ paad_utsw_2015,25855536
622
+ nsclc_tcga_broad_2016,27158780
623
+ paad_qcmg_uq_2016,26909576
624
+ pact_jhu_2011,22158988
625
+ nsclc_tracerx_2017,28445112
626
+ nsclc_tracerx_2017,28445469
627
+ nsclc_pd1_msk_2018,29337640
628
+ ntrk_msk_2019,31871300
629
+ pan_origimed_2020,35871175
630
+ paad_cptac_2021,34534465
631
+ nst_nfosi_ntap,32561749
632
+ panet_jhu_2011,21252315
633
+ pcnsl_mayo_2015,25991819
634
+ prad_broad,22610119
635
+ crc_hta11_htan_2021,34910928
636
+ panet_shanghai_2013,24326773
637
+ plmeso_nyu_2015,25488749
638
+ prad_cpcg_2017,28068672
639
+ panet_arcnet_2017,28199314
640
+ past_dkfz_heidelberg_2013,23817572
641
+ prad_eururol_2017,28927585
642
+ prad_fhcrc,26928463
643
+ prad_mich,22722839
644
+ prad_mskcc_2014,25024180
645
+ prad_su2c_2015,26000489
646
+ prad_mskcc_2017,28825054
647
+ prad_p1000,29610475
648
+ prad_su2c_2019,31061129
649
+ prostate_dkfz_2018,30537516
650
+ prad_msk_2019,31564440
651
+ prad_mskcc_cheny1_organoids_2014,25201530
652
+ prad_mcspc_mskcc_2020,32220891
653
+ prad_msk_stopsack_2021,34667026
654
+ prostate_pcbm_swiss_2019,35504881
655
+ sclc_clcgp,22941188
656
+ sclc_jhu,22941189
657
+ skcm_broad,22817889
658
+ rms_nih_2014,24436047
659
+ sarc_tcga_pub,29100075
660
+ sclc_cancercell_gardner_2017,28196596
661
+ sclc_ucologne_2015,26168399
662
+ sarcoma_mskcc_2022,35705560
663
+ skcm_broad_dfarber,22622578
664
+ skcm_yale,22842228
665
+ stad_pfizer_uhongkong,24816253
666
+ skcm_broad_brafresist_2012,24265153
667
+ skcm_mskcc_2014,25409260
668
+ skcm_tcga_pub_2015,26091043
669
+ skcm_dfci_2015,26359337
670
+ stad_uhongkong,22037554
671
+ stad_utokyo,24816255
672
+ tet_nci_2014,24974848
673
+ thyroid_mskcc_2016,26878173
674
+ stes_tcga_pub,28052061
675
+ summit_2018,29420467
676
+ stmyec_wcm_2022,36577525
677
+ ucs_jhu_2014,25233892
678
+ ucec_tcga_pub,23636398
679
+ um_qimr_2016,26683228
680
+ ucec_msk_2018,30068706
681
+ uccc_nih_2017,28485815
682
+ tmb_mskcc_2018,30643254
683
+ ucec_cptac_2020,32059776
684
+ ucec_ccr_cfdna_msk_2022,36007103
685
+ vsc_cuk_2018,29422544
686
+ utuc_cornell_baylor_mdacc_2019,31278255
687
+ usarc_msk_2020,32299819
688
+ utuc_igbmc_2021,33397444
689
+ lgg_tcga_pan_can_atlas_2018,29625048
690
+ lgg_tcga_pan_can_atlas_2018,29596782
691
+ lgg_tcga_pan_can_atlas_2018,29622463
692
+ lgg_tcga_pan_can_atlas_2018,29617662
693
+ lgg_tcga_pan_can_atlas_2018,29625055
694
+ lgg_tcga_pan_can_atlas_2018,29625050
695
+ lgg_tcga_pan_can_atlas_2018,29617662
696
+ lgg_tcga_pan_can_atlas_2018,30643250
697
+ lgg_tcga_pan_can_atlas_2018,32214244
698
+ lgg_tcga_pan_can_atlas_2018,29625049
699
+ lgg_tcga_pan_can_atlas_2018,29850653
700
+ lgg_tcga_pan_can_atlas_2018,36334560
701
+ crc_orion_2024,39386479
702
+ brca_aurora_2023,36585450
703
+ schw_ctf_synodos_2025,33025139
704
+ ovary_geomx_gray_foundation_2024,39386723
705
+ brca_tcga_pub2015,26451490
706
+ hnsc_tcga_pub,25631445
707
+ luad_tcga_pub,25079552
708
+ thca_tcga_pub,25417114
709
+ blca_tcga_pub,24476821
710
+ msk_ch_2020,33106634
711
+ msk_spectrum_tme_2022,36517593
712
+ pancan_mimsi_msk_2024,39746944
713
+ mel_iatlas_riaz_nivolumab_2017,29033130
714
+ stad_oncosg_2018,29670109
715
+ gbm_tcga_pub,18772890
716
+ gbm_tcga_pub2013,24120142
717
+ odg_msk_2017,28472509
718
+ gbm_tcga_pan_can_atlas_2018,29625048
719
+ gbm_tcga_pan_can_atlas_2018,29596782
720
+ gbm_tcga_pan_can_atlas_2018,29622463
721
+ gbm_tcga_pan_can_atlas_2018,29617662
722
+ gbm_tcga_pan_can_atlas_2018,29625055
723
+ gbm_tcga_pan_can_atlas_2018,29625050
724
+ gbm_tcga_pan_can_atlas_2018,29617662
725
+ gbm_tcga_pan_can_atlas_2018,30643250
726
+ gbm_tcga_pan_can_atlas_2018,32214244
727
+ gbm_tcga_pan_can_atlas_2018,29625049
728
+ gbm_tcga_pan_can_atlas_2018,29850653
729
+ gbm_tcga_pan_can_atlas_2018,36334560
730
+ gbm_mayo_pdx_sarkaria_2019,31852831
731
+ gbm_columbia_2019,30742119
732
+ gbm_cptac_2021,33577785
733
+ msk_impact_2017,28481359
index_dir/faiss.index ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5bf5bafdb8fd3f9bbf8bcc66f81d773d6831262a9de7f72a9eba16985cf24a7c
3
+ size 115104813
index_dir/meta.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2ef5e13e32fdc66c13c2bf468a9601f371d7a2538430af05be29c8d4a91e242b
3
+ size 31852250
mcp_server.py ADDED
@@ -0,0 +1,200 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Minimal MCP server using fastmcp to expose the PDF search script as a tool.
4
+ Uses argparse to configure the server (host, port).
5
+ Uses environment variables for the index config (INDEX_DIR, etc.).
6
+ """
7
+
8
+ import argparse # <-- Added this import
9
+ import json
10
+ import os
11
+ from contextlib import asynccontextmanager
12
+ from pathlib import Path
13
+ from typing import Annotated, Dict, List, Optional
14
+
15
+ # --- Vector index ---
16
+ import faiss
17
+
18
+ # --- NLP / embeddings ---
19
+ import numpy as np
20
+
21
+ # --- MCP Server ---
22
+ from fastmcp import Context, FastMCP
23
+ from sentence_transformers import CrossEncoder, SentenceTransformer
24
+
25
+ # ---------------------------
26
+ # Configuration (from Environment Variables)
27
+ # ---------------------------
28
# All settings below can be overridden through environment variables.
INDEX_DIR = Path(os.getenv("INDEX_DIR", "./index_dir"))
EMBED_MODEL = os.getenv("EMBED_MODEL", "intfloat/e5-base-v2")
RERANKER_MODEL = os.getenv(
    "RERANKER_MODEL",
    "cross-encoder/ms-marco-MiniLM-L-6-v2",
)
# Number of neighbours requested from the FAISS index per query.
FETCH_K = int(os.getenv("FETCH_K", 40))
32
+
33
+ # ---------------------------
34
+ # Global state to hold models
35
+ # ---------------------------
36
+ models = {}
37
+
38
+ # ---------------------------
39
+ # Copied Utilities
40
+ # ---------------------------
41
def read_metadata(meta_path: str) -> List[Dict]:
    """Load chunk metadata from a JSON Lines file.

    Args:
        meta_path: Path to a UTF-8 file holding one JSON value per line.

    Returns:
        The decoded records, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(meta_path, "r", encoding="utf-8") as f:
        # Blank lines (for example a stray trailing newline) carry no record;
        # skipping them avoids a JSONDecodeError on an otherwise valid file.
        return [json.loads(line) for line in f if line.strip()]
47
+
48
def e5_prefix(text: str, is_query: bool, model_name: str) -> str:
    """Add the ``query: `` / ``passage: `` marker when the model name contains "e5".

    Args:
        text: Text about to be embedded.
        is_query: True to use the "query" marker, False for "passage".
        model_name: Embedding model name; matched case-insensitively for "e5".

    Returns:
        The prefixed text for E5 models, otherwise ``text`` unchanged.
    """
    if "e5" not in model_name.lower():
        return text
    role = "query" if is_query else "passage"
    return f"{role}: {text}"
52
+
53
+ # ---------------------------
54
+ # Server Startup & Shutdown
55
+ # ---------------------------
56
@asynccontextmanager
async def lifespan(mcp: FastMCP):
    """
    Load the index and models on startup and clear them on shutdown.

    When ``faiss.index`` or ``meta.jsonl`` is missing from INDEX_DIR, an error
    is printed and nothing is loaded; the server still starts and the search
    tool reports that the index is unavailable.
    """
    print("[*] Server starting... Loading models...")
    index_path = INDEX_DIR / "faiss.index"
    meta_path = INDEX_DIR / "meta.jsonl"

    if not index_path.exists() or not meta_path.exists():
        print(f"[!] ERROR: Index files not found in '{INDEX_DIR}'.")
    else:
        print(f"[*] Loading FAISS index: {index_path}")
        models["index"] = faiss.read_index(str(index_path))

        print(f"[*] Loading metadata: {meta_path}")
        models["meta"] = read_metadata(str(meta_path))

        print(f"[*] Loading embedding model: {EMBED_MODEL}")
        models["embedder"] = SentenceTransformer(EMBED_MODEL)

        print(f"[*] Loading reranker model: {RERANKER_MODEL}")
        models["reranker"] = CrossEncoder(RERANKER_MODEL)

        print("[✓] All models and index loaded. Server is ready.")

    try:
        yield
    finally:
        # Run the cleanup even when an exception is thrown into the context
        # manager at the yield point; without try/finally it would be skipped.
        print("[*] Server shutting down... Clearing models.")
        models.clear()
86
+
87
+ # ---------------------------
88
+ # Create the MCP Server
89
+ # ---------------------------
90
# Server instance; `lifespan` is handed over so the index and models are
# loaded on startup and cleared on shutdown.
mcp = FastMCP(name="PDF Corpus Search Tool", lifespan=lifespan)
94
+
95
+ # ---------------------------
96
+ # The MCP Tool
97
+ # ---------------------------
98
# Annotated aliases for the tool parameters. The strings are human-readable
# descriptions (presumably surfaced by fastmcp in the tool schema - confirm).
Query = Annotated[
    str,
    "The semantic search query to run against the documents.",
]
TopK = Annotated[
    int,
    "The final number of results to return.",
    "default: 5",
]
Rerank = Annotated[
    bool,
    "Whether to use a cross-encoder to rerank results. Default: true.",
    "default: true",
]
101
+
102
@mcp.tool()
def search_pdf_corpus(
    query: Query,
    top_k: TopK = 5,
    rerank: Rerank = True
) -> List[Dict]:
    """
    Searches a private corpus of PDF documents for relevant text chunks.
    Use this to answer questions about specific topics found in the user's files.
    """
    # NOTE: the docstring above is kept verbatim on purpose; it may be exposed
    # to clients as the tool description (confirm against fastmcp).
    #
    # Returns a list of {"doc_path", "page", "score", "text"} dicts, best
    # first, or a single {"error": ...} dict when the index was never loaded.
    if "index" not in models:
        return [{"error": "Index is not loaded. Check server logs."}]

    # A negative top_k would slice from the END of the ranked list
    # (candidates[:-n]); treat any non-positive request as "no results".
    if top_k <= 0:
        return []

    # 1. Get pre-loaded assets
    index = models["index"]
    meta = models["meta"]
    embedder = models["embedder"]

    # 2. Embed Query
    query_text = e5_prefix(query, is_query=True, model_name=EMBED_MODEL)
    qvec = embedder.encode([query_text], normalize_embeddings=True).astype("float32")

    # 3. FAISS Search. Fetch at least top_k candidates, otherwise a request
    #    for more than FETCH_K results is silently capped at FETCH_K.
    fetch_k = max(FETCH_K, top_k)
    D, I = index.search(qvec, fetch_k)

    # 4. Get Candidates (FAISS pads missing neighbours with id -1)
    candidates = []
    for score, idx in zip(D[0], I[0]):
        if idx < 0:
            continue
        rec = dict(meta[idx])  # copy so the shared metadata is never mutated
        rec["ann_score"] = float(score)
        candidates.append(rec)

    if not candidates:
        return []

    # 5. Optional Reranking
    if rerank:
        pairs = [(query, c["text"]) for c in candidates]
        scores = models["reranker"].predict(pairs)
        for c, s in zip(candidates, scores):
            c["rerank_score"] = float(s)
        candidates.sort(key=lambda x: x["rerank_score"], reverse=True)
    else:
        candidates.sort(key=lambda x: x["ann_score"], reverse=True)

    # 6. Format and return top_k results
    return [
        {
            "doc_path": r["doc_path"],
            "page": r["page"],
            "score": r.get("rerank_score", r["ann_score"]),
            "text": r["text"],
        }
        for r in candidates[:top_k]
    ]
160
+
161
+ # ---------------------------
162
+ # Run the Server
163
+ # ---------------------------
164
if __name__ == "__main__":
    # Server options (host, port, transport) come from the command line;
    # index and model settings come from environment variables.
    parser = argparse.ArgumentParser(description="Run the PDF Search MCP Server")

    parser.add_argument(
        "--host",
        type=str,
        default="localhost",
        # The help text previously advertised 0.0.0.0, which is not the default.
        help="Host to bind the server to (default: localhost)"
    )
    parser.add_argument(
        "--port",
        type=int,
        default=8123,
        help="Port to run the server on (default: 8123)"
    )
    parser.add_argument(
        "--transport",
        type=str,
        default="http",
        choices=["http"],  # fastmcp primarily uses http
        help="Server transport protocol (default: http)"
    )

    args = parser.parse_args()

    print(f"--- Starting PDF Search MCP Server on {args.transport}://{args.host}:{args.port} ---")
    print(f"--- Using INDEX_DIR: {INDEX_DIR.resolve()} ---")

    # Pass the parsed arguments to mcp.run()
    mcp.run(
        transport=args.transport,
        host=args.host,
        port=args.port
    )
pdf_semsearch.py ADDED
@@ -0,0 +1,337 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Tiny CLI for open-source semantic search over PDFs.
4
+ - Index: extract → chunk → embed → FAISS
5
+ - Search: embed query → ANN → (optional) rerank
6
+
7
+ Examples:
8
+ # Index all PDFs in ./pdfs into ./index_dir
9
+ python pdf_semsearch.py index --pdf-dir ./pdfs --index-dir ./index_dir
10
+
11
+ # Search with reranking
12
+ python pdf_semsearch.py search --index-dir ./index_dir -q "KRAS G12C eligibility in lung cancer" --top-k 5 --rerank
13
+ """
14
+
15
+ import os
16
+ import sys
17
+ import json
18
+ import argparse
19
+ import hashlib
20
+ from pathlib import Path
21
+ from typing import List, Dict, Tuple, Optional
22
+
23
+ import numpy as np
24
+ from tqdm import tqdm
25
+
26
+ # --- PDF parsing / OCR ---
27
+ import pdfplumber
28
+
29
+ # OCR is optional; only imported if --ocr is used or needed
30
+ try:
31
+ from pdf2image import convert_from_path # requires poppler
32
+ import pytesseract # requires tesseract runtime
33
+ _OCR_AVAILABLE = True
34
+ except Exception:
35
+ _OCR_AVAILABLE = False
36
+
37
+ # --- NLP / embeddings ---
38
+ import spacy
39
+ from sentence_transformers import SentenceTransformer, CrossEncoder
40
+
41
+ # --- Vector index ---
42
+ import faiss
43
+
44
+
45
+ # ---------------------------
46
+ # Utilities
47
+ # ---------------------------
48
def sha1_16(s: str) -> str:
    """Return the first 16 hex characters of the SHA-1 digest of *s* (UTF-8 encoded)."""
    digest = hashlib.sha1(s.encode("utf-8"))
    return digest.hexdigest()[:16]
50
+
51
+
52
def ensure_dir(p: str):
    """Create directory *p* together with any missing parents; no error if it exists."""
    target = Path(p)
    target.mkdir(parents=True, exist_ok=True)
54
+
55
+
56
def load_spacy(model: str = "en_core_web_sm"):
    """Load and return the spaCy pipeline named *model*.

    If ``spacy.load`` raises ``OSError``, an install hint is printed and the
    same exception is re-raised.
    """
    try:
        nlp = spacy.load(model)
    except OSError as e:
        hint = (
            f"[!] spaCy model '{model}' not found. Install it once with:\n"
            f" python -m spacy download {model}\n"
        )
        print(hint)
        raise e
    return nlp
65
+
66
+
67
def e5_prefix(text: str, is_query: bool, model_name: str) -> str:
    """Return *text* with an E5-style role prefix when *model_name* contains "e5".

    Queries get ``query: `` and documents get ``passage: ``; for any other
    model the text is returned unchanged.
    """
    uses_e5 = "e5" in model_name.lower()
    if not uses_e5:
        return text  # BGE & others usually don't need prefixes
    marker = "query" if is_query else "passage"
    return f"{marker}: {text}"
72
+
73
+
74
def chunk_sentences(nlp, text: str, target_chars: int = 900, overlap: int = 120) -> List[str]:
    """Sentence-aware chunking around target_chars with soft overlap.

    Args:
        nlp: Callable returning an object whose ``.sents`` yields spans with a
            ``.text`` attribute (a spaCy pipeline in this script).
        text: Text to split.
        target_chars: Soft upper bound on chunk length; a single sentence
            longer than this still becomes its own chunk.
        overlap: Number of trailing characters of a finished chunk that are
            carried over as the start of the next one (0 disables overlap).

    Returns:
        The chunks in document order; empty when *text* is blank.
    """
    doc = nlp(text)
    sents = [s.text.strip() for s in doc.sents if s.text.strip()]

    chunks: List[str] = []
    cur: List[str] = []
    cur_len = 0  # length of " ".join(cur)
    for s in sents:
        # Count the joining space as well; the earlier accounting ignored it,
        # so joined chunks could end up longer than target_chars.
        projected = cur_len + 1 + len(s) if cur else len(s)
        if cur and projected > target_chars:
            chunk = " ".join(cur)
            chunks.append(chunk)
            tail = chunk[-overlap:] if overlap > 0 else ""
            cur = [tail, s] if tail else [s]
            cur_len = len(" ".join(cur))
        else:
            cur.append(s)
            cur_len = projected
    if cur:
        chunks.append(" ".join(cur))

    # Fallback if text had no sentence boundaries: window over the WHOLE text.
    # The previous fallback kept only the first target_chars characters and
    # silently dropped the rest.
    if not chunks and text.strip():
        step = max(1, target_chars)
        windows = (text[i:i + step] for i in range(0, len(text), step))
        chunks = [w for w in windows if w.strip()]
    return chunks
95
+
96
+
97
def extract_pdf_text(pdf_path: str) -> List[Tuple[int, str]]:
    """Return [(page_num, text)] using pdfplumber only (born-digital PDFs).

    Page numbers start at 1; a page with no extractable text yields "".
    """
    with pdfplumber.open(pdf_path) as pdf:
        return [
            (page_no, page.extract_text() or "")
            for page_no, page in enumerate(pdf.pages, start=1)
        ]
105
+
106
+
107
def extract_pdf_text_with_ocr(pdf_path: str, dpi: int = 300, min_len: int = 20) -> List[Tuple[int, str]]:
    """
    Return [(page_num, text)] using pdfplumber and selective OCR if page text is too short.
    Requires poppler & tesseract installed.

    Args:
        pdf_path: Path of the PDF to read.
        dpi: Rendering resolution passed to pdf2image for pages that need OCR.
        min_len: Pages whose stripped embedded text is shorter than this are OCR'd.

    Raises:
        RuntimeError: If the optional OCR dependencies could not be imported.
    """
    if not _OCR_AVAILABLE:
        raise RuntimeError("OCR requested but pdf2image/pytesseract not available.")

    out = []
    # Open the document once and walk its pages, as extract_pdf_text does;
    # the previous version re-opened the PDF for every single page.
    with pdfplumber.open(pdf_path) as pdf:
        for i, page in enumerate(pdf.pages, start=1):
            txt = (page.extract_text() or "").strip()

            if len(txt) >= min_len:
                out.append((i, txt))
                continue

            # OCR fallback for this page only
            images = convert_from_path(pdf_path, first_page=i, last_page=i, dpi=dpi)
            # Guard against an empty render result instead of raising IndexError.
            ocr_txt = pytesseract.image_to_string(images[0], lang="eng") if images else ""
            out.append((i, ocr_txt or ""))

    return out
133
+
134
+
135
def build_corpus(pdf_dir: str, use_ocr: bool, nlp, chunk_chars: int, overlap: int, min_text_len_for_ocr: int) -> List[Dict]:
    """Read every PDF under *pdf_dir* (recursively) and return its text chunks.

    Each record has the keys "doc_path", "page", "chunk_id" (position of the
    chunk within its page) and "text". PDFs that fail to read are reported on
    stdout and skipped; blank pages contribute nothing.
    """
    corpus: List[Dict] = []
    for pdf_file in tqdm(sorted(Path(pdf_dir).glob("**/*.pdf")), desc="Reading PDFs"):
        source = str(pdf_file)
        try:
            if use_ocr:
                pages = extract_pdf_text_with_ocr(source, min_len=min_text_len_for_ocr)
            else:
                pages = extract_pdf_text(source)
        except Exception as e:
            print(f"[!] Failed to read {pdf_file}: {e}")
            continue

        for page_num, txt in pages:
            if not (txt and txt.strip()):
                continue
            pieces = chunk_sentences(nlp, txt, target_chars=chunk_chars, overlap=overlap)
            corpus.extend(
                {
                    "doc_path": source,
                    "page": page_num,
                    "chunk_id": idx,
                    "text": chunk,
                }
                for idx, chunk in enumerate(pieces)
            )
    return corpus
157
+
158
+
159
def write_metadata(meta_path: str, corpus: List[Dict]):
    """Write *corpus* to *meta_path* as UTF-8 JSON Lines, one record per line."""
    with open(meta_path, "w", encoding="utf-8") as f:
        f.writelines(
            json.dumps(rec, ensure_ascii=False) + "\n" for rec in corpus
        )
163
+
164
+
165
def read_metadata(meta_path: str) -> List[Dict]:
    """Read the JSON Lines metadata file at *meta_path*.

    Returns:
        One decoded record per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    out = []
    with open(meta_path, "r", encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline added by hand);
            # json.loads("") would otherwise abort the whole load.
            if not line.strip():
                continue
            out.append(json.loads(line))
    return out
171
+
172
+
173
+ # ---------------------------
174
+ # Indexing
175
+ # ---------------------------
176
def cmd_index(args):
    """Build the chunk corpus from PDFs, embed it and write the index files.

    Reads ``pdf_dir``, ``index_dir``, ``ocr``, ``ocr_min_text_len``,
    ``chunk_chars``, ``overlap``, ``embed_model`` and ``batch_size`` from
    *args*. Exits with status 2 when OCR is requested but unavailable and
    with status 1 when no text could be extracted.
    """
    ensure_dir(args.index_dir)

    if args.ocr and not _OCR_AVAILABLE:
        print("[!] --ocr requested but OCR deps not available. Install poppler, tesseract, pdf2image, pytesseract.")
        sys.exit(2)

    print("[*] Loading spaCy...")
    nlp = load_spacy("en_core_web_sm")

    print("[*] Building corpus from PDFs...")
    corpus = build_corpus(
        pdf_dir=args.pdf_dir,
        use_ocr=args.ocr,
        nlp=nlp,
        chunk_chars=args.chunk_chars,
        overlap=args.overlap,
        min_text_len_for_ocr=args.ocr_min_text_len
    )
    if not corpus:
        print("[!] No text found. Are your PDFs scanned? Try --ocr.")
        sys.exit(1)

    print(f"[*] Loading embedding model: {args.embed_model}")
    embedder = SentenceTransformer(args.embed_model)

    texts = [e5_prefix(rec["text"], is_query=False, model_name=args.embed_model) for rec in corpus]

    print("[*] Encoding chunks...")
    embeddings = embedder.encode(
        texts,
        batch_size=args.batch_size,
        normalize_embeddings=True,
        show_progress_bar=True
    ).astype("float32")

    dim = embeddings.shape[1]
    index = faiss.IndexFlatIP(dim)  # cosine via normalized vectors + inner product
    index.add(embeddings)

    # Persist metadata and index together, only after embedding succeeded.
    # Writing meta.jsonl up front meant a failed run could leave new metadata
    # next to an older faiss.index whose row ids no longer match it.
    meta_path = os.path.join(args.index_dir, "meta.jsonl")
    write_metadata(meta_path, corpus)
    print(f"[*] Wrote metadata for {len(corpus)} chunks to {meta_path}")

    index_path = os.path.join(args.index_dir, "faiss.index")
    faiss.write_index(index, index_path)
    print(f"[*] Wrote FAISS index to {index_path}")

    print("[✓] Indexing complete.")
225
+
226
+
227
+ # ---------------------------
228
+ # Searching
229
+ # ---------------------------
230
def pretty_snippet(s: str, max_len: int = 320) -> str:
    """Collapse whitespace runs in *s* and cap the result at *max_len* characters.

    Text that is too long is cut to ``max_len - 1`` characters plus an ellipsis.
    """
    collapsed = " ".join(s.split())
    if len(collapsed) > max_len:
        return collapsed[: max_len - 1] + "…"
    return collapsed
233
+
234
+
235
def cmd_search(args):
    """Run one query against an existing index and print the best matches.

    Reads ``index_dir``, ``query``, ``top_k``, ``fetch_k``, ``embed_model``,
    ``rerank``, ``reranker_model`` and ``jsonl`` from *args*. Exits with
    status 1 when the index files are missing and with status 0 when the
    search yields no candidates.
    """
    index_path = os.path.join(args.index_dir, "faiss.index")
    meta_path = os.path.join(args.index_dir, "meta.jsonl")
    if not os.path.exists(index_path) or not os.path.exists(meta_path):
        print("[!] Index not found. Run 'index' first.")
        sys.exit(1)

    print(f"[*] Loading FAISS index: {index_path}")
    index = faiss.read_index(index_path)

    print("[*] Loading metadata…")
    meta = read_metadata(meta_path)

    print(f"[*] Loading embedding model: {args.embed_model}")
    embedder = SentenceTransformer(args.embed_model)

    query_text = e5_prefix(args.query, is_query=True, model_name=args.embed_model)
    qvec = embedder.encode([query_text], normalize_embeddings=True).astype("float32")

    # Never fetch fewer candidates than will be shown; otherwise
    # --top-k larger than --fetch-k is silently capped at fetch_k.
    # The floor of 1 keeps the FAISS k positive for odd inputs.
    fetch_k = max(args.fetch_k, args.top_k, 1)
    D, I = index.search(qvec, fetch_k)

    candidates = []
    for score, idx in zip(D[0], I[0]):
        if idx < 0:  # FAISS pads missing neighbours with -1
            continue
        rec = dict(meta[idx])
        rec["ann_score"] = float(score)
        candidates.append(rec)

    if not candidates:
        print("[!] No results.")
        sys.exit(0)

    # Optional reranking
    if args.rerank:
        print(f"[*] Reranking top {len(candidates)} with {args.reranker_model}…")
        reranker = CrossEncoder(args.reranker_model)
        pairs = [(args.query, c["text"]) for c in candidates]
        scores = reranker.predict(pairs)
        for c, s in zip(candidates, scores):
            c["rerank_score"] = float(s)
        candidates.sort(key=lambda x: x["rerank_score"], reverse=True)
    else:
        candidates.sort(key=lambda x: x["ann_score"], reverse=True)

    # A negative top_k would slice from the end of the list; clamp it to 0.
    results = candidates[: max(args.top_k, 0)]

    # Print nicely
    print("\n=== Results ===\n")
    for i, r in enumerate(results, start=1):
        base = Path(r["doc_path"]).name
        score = r.get("rerank_score", r["ann_score"])
        print(f"{i}. {base} p.{r['page']} score={score:.3f}")
        print(f" {pretty_snippet(r['text'])}\n")

    if args.jsonl:
        out = [
            {
                "doc_path": r["doc_path"],
                "page": r["page"],
                "score": r.get("rerank_score", r["ann_score"]),
                "text": r["text"],
            }
            for r in results
        ]
        print(json.dumps(out, ensure_ascii=False, indent=2))
299
+
300
+
301
+ # ---------------------------
302
+ # Main (argparse)
303
+ # ---------------------------
304
def main():
    """Parse the command line and dispatch to the ``index`` or ``search`` handler."""
    parser = argparse.ArgumentParser(description="Tiny CLI for semantic PDF search (FAISS + Sentence-Transformers)")
    sub = parser.add_subparsers(dest="cmd", required=True)

    # Each entry is (flags, add_argument keyword options), registered in order.
    index_options = [
        (("--pdf-dir",), dict(required=True, help="Folder with PDFs")),
        (("--index-dir",), dict(required=True, help="Folder to write index & metadata")),
        (("--embed-model",), dict(default="intfloat/e5-base-v2", help="Sentence-Transformers model name")),
        (("--batch-size",), dict(type=int, default=64, help="Embedding batch size")),
        (("--chunk-chars",), dict(type=int, default=900, help="Target characters per chunk")),
        (("--overlap",), dict(type=int, default=120, help="Overlap characters between chunks")),
        (("--ocr",), dict(action="store_true", help="Enable OCR fallback for scan-like pages")),
        (("--ocr-min-text-len",), dict(type=int, default=20, help="If page text < N chars, OCR that page")),
    ]
    search_options = [
        (("--index-dir",), dict(required=True, help="Folder with faiss.index and meta.jsonl")),
        (("-q", "--query"), dict(required=True, help="Search query")),
        (("--top-k",), dict(type=int, default=8, help="How many results to show")),
        (("--fetch-k",), dict(type=int, default=40, help="First-stage ANN fetch depth (before rerank)")),
        (("--embed-model",), dict(default="intfloat/e5-base-v2", help="Sentence-Transformers model name")),
        (("--rerank",), dict(action="store_true", help="Enable CrossEncoder reranking")),
        (("--reranker-model",), dict(default="cross-encoder/ms-marco-MiniLM-L-6-v2", help="CrossEncoder name")),
        (("--jsonl",), dict(action="store_true", help="Also print results as JSON to stdout")),
    ]

    # index
    index_cmd = sub.add_parser("index", help="Index PDFs into a FAISS index")
    for flags, options in index_options:
        index_cmd.add_argument(*flags, **options)
    index_cmd.set_defaults(func=cmd_index)

    # search
    search_cmd = sub.add_parser("search", help="Search an existing index")
    for flags, options in search_options:
        search_cmd.add_argument(*flags, **options)
    search_cmd.set_defaults(func=cmd_search)

    args = parser.parse_args()
    args.func(args)


if __name__ == "__main__":
    main()
pmc_pdf_downloader.py ADDED
@@ -0,0 +1,245 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Download NIH/PMC PDFs in batches from a CSV produced by pmid2pmcid.py.
4
+
5
+ Input CSV schema (minimum):
6
+ pmid,pmcid,doi,status,errmsg
7
+ Only rows with a non-empty PMCID are attempted (NIH/PMC full text).
8
+
9
+ Features
10
+ - Batch processing with per-batch delay
11
+ - Concurrency (threaded) with polite throttling
12
+ - Robust URL strategy (handles /pdf/ and /pdf/<PMCID>.pdf)
13
+ - Retries with backoff for 429/5xx
14
+ - Resume: skips already-downloaded files unless --overwrite
15
+ - Manifest CSV of successes/failures
16
+
17
+ Examples
18
+ python pmc_pdf_downloader.py --in pmid_to_pmcid.csv --out-dir ./pmc_pdfs \
19
+ --batch-size 40 --concurrency 4 --delay 1.0 --email you@org.edu
20
+
21
+ # Overwrite existing PDFs and be extra slow/polite
22
+ python pmc_pdf_downloader.py --in map.csv --out-dir ./pmc_pdfs --overwrite \
23
+ --batch-size 20 --concurrency 2 --delay 2.0
24
+ """
25
+ from __future__ import annotations
26
+
27
+ import argparse
28
+ import csv
29
+ import os
30
+ import time
31
+ import sys
32
+ import math
33
+ import re
34
+ import pathlib
35
+ from typing import List, Dict, Optional, Tuple
36
+ from concurrent.futures import ThreadPoolExecutor, as_completed
37
+
38
+ import requests
39
+
40
# User-Agent value sent with every request.
UA = "pmc-pdf-downloader/1.0 (+https://example.org)"

# Hosts are tried in this order.
PMC_HOSTS = ["https://www.ncbi.nlm.nih.gov", "https://pmc.ncbi.nlm.nih.gov"]

# Strategies we try in order for each host
PMC_PATH_PATTERNS = [
    # canonical "directory" that serves main PDF
    "/pmc/articles/{pmcid}/pdf/",
    # explicit filename
    "/pmc/articles/{pmcid}/pdf/{pmcid}.pdf",
]
50
+
51
def read_rows(csv_path: str) -> List[Dict[str, str]]:
    """Read the input CSV into a list of dicts with whitespace-stripped values.

    Args:
        csv_path: Path to a CSV file with a header row.

    Returns:
        One dict per data row, keyed by header name; missing cells become "".

    Raises:
        ValueError: If the file has no header row.
    """
    # "utf-8-sig" strips a leading BOM (common in spreadsheet exports) that
    # would otherwise be glued onto the first header name; plain UTF-8 files
    # are read exactly as before.
    with open(csv_path, newline="", encoding="utf-8-sig") as f:
        rd = csv.DictReader(f)
        if not rd.fieldnames:
            raise ValueError("CSV appears to have no header row.")
        # DictReader stores surplus cells as a list under the key None;
        # drop that entry, since calling .strip() on a list would crash.
        return [
            {k: (v or "").strip() for k, v in r.items() if k is not None}
            for r in rd
        ]
60
+
61
def valid_pmcid(pmcid: str) -> bool:
    """Return True when *pmcid* is "PMC" followed by digits (case-insensitive).

    Empty or ``None`` input is treated as invalid.
    """
    candidate = pmcid or ""
    # Accept forms like "PMC12345" (case-insensitive)
    return re.fullmatch(r"(?i)PMC\d+", candidate) is not None
64
+
65
def ensure_dir(path: str):
    """Create *path* and any missing parent directories; do nothing if it exists."""
    directory = pathlib.Path(path)
    directory.mkdir(parents=True, exist_ok=True)
67
+
68
def sanitize_filename(name: str) -> str:
    """Replace each run of characters other than letters, digits, ".", "_" or "-" with "_"."""
    unsafe_run = re.compile(r"[^A-Za-z0-9._\-]+")
    return unsafe_run.sub("_", name)
70
+
71
def pick_filename(pmcid: str, pmid: str = "", doi: str = "") -> str:
    """Return the sanitized local PDF file name for an article.

    The upper-cased PMCID is preferred, then the PMID, then "UNKNOWN".
    ``doi`` is accepted but not used.
    """
    if pmcid:
        base = pmcid.upper()
    else:
        base = pmid or "UNKNOWN"
    return sanitize_filename(f"{base}.pdf")
74
+
75
def stream_download(url: str, dest_path: str, timeout: int, session: requests.Session) -> Tuple[bool, str]:
    """Stream to disk; returns (ok, message).

    Args:
        url: Address to fetch.
        dest_path: Final location of the downloaded file.
        timeout: Timeout passed to ``session.get``.
        session: Session used for the request.

    Returns:
        ``(True, "ok")`` once the body has been written to *dest_path*, or
        ``(False, "HTTP <status> CT=<content-type>")`` when the response is
        not a 200 with a PDF content type.

    Raises:
        requests.RequestException: On connection or read failures.
        OSError: If the file cannot be written.
    """
    # The file always ends up at dest_path. The previous version renamed it
    # after the Content-Disposition header, while the caller kept reporting
    # (and the resume check kept probing) the originally requested path.
    part_path = dest_path + ".part"
    with session.get(url, stream=True, timeout=timeout) as r:
        content_type = r.headers.get("Content-Type")
        if r.status_code != 200 or "application/pdf" not in (content_type or "").lower():
            return False, f"HTTP {r.status_code} CT={content_type}"
        try:
            with open(part_path, "wb") as f:
                for chunk in r.iter_content(chunk_size=1024 * 256):
                    if chunk:
                        f.write(chunk)
        except BaseException:
            # Do not leave a truncated file behind after an interrupted download.
            try:
                os.remove(part_path)
            except OSError:
                pass
            raise
    # Move into place only after a complete download, so an existing
    # dest_path is never a partial file that resume would treat as done.
    os.replace(part_path, dest_path)
    return True, "ok"
93
+
94
def try_download_pmc_pdf(pmcid: str, out_dir: str, timeout: int, session: requests.Session) -> Tuple[bool, str, str]:
    """
    Attempt multiple PMC URL variants across hosts. Returns (ok, msg, final_path_or_empty).

    Raises:
        requests.HTTPError: When no variant succeeded and at least one of them
            answered 429 or 5xx, so that a retrying caller (``polite_retry``)
            backs off and tries again instead of recording a permanent miss.
        requests.RequestException: Propagated from the underlying download.
    """
    # The destination does not depend on host or pattern; compute it once.
    target = os.path.join(out_dir, pick_filename(pmcid))
    transient = ""
    for host in PMC_HOSTS:
        for pattern in PMC_PATH_PATTERNS:
            url = f"{host}{pattern.format(pmcid=pmcid)}"
            ok, msg = stream_download(url, target, timeout=timeout, session=session)
            if ok:
                return True, f"{host} {msg}", target
            # Failure messages have the form "HTTP <status> CT=...".
            status = re.match(r"HTTP (\d+)", msg)
            if status and (status.group(1) == "429" or int(status.group(1)) >= 500):
                transient = f"{msg} from {url}"
    if transient:
        # Rate limiting and server errors are temporary; without raising, the
        # retry wrapper never sees them and the article is marked as failed.
        raise requests.HTTPError(transient)
    return False, "no_pdf_found", ""
106
+
107
def polite_retry(fn, *, retries=4, backoff=1.5, initial_delay=0.0, on_retry=None):
    """Wrap *fn* so that ``requests.RequestException`` triggers a delayed retry.

    Args:
        fn: Callable to wrap.
        retries: Maximum number of retries after the first attempt.
        backoff: Factor by which the pause grows after each retry.
        initial_delay: Pause before the first retry; never less than 0.25 s.
        on_retry: Optional ``on_retry(attempt, exc)`` hook called before each pause.

    Returns:
        A wrapper with the same call signature as *fn*. Once the retries are
        exhausted, the last exception is re-raised; other exception types
        propagate immediately.
    """
    import functools

    @functools.wraps(fn)
    def wrapper(*args, **kwargs):
        delay = initial_delay
        attempt = 0
        while True:
            try:
                return fn(*args, **kwargs)
            except requests.RequestException as e:
                attempt += 1
                if attempt > retries:
                    raise
                if on_retry:
                    on_retry(attempt, e)
                pause = max(0.25, delay)
                time.sleep(pause)
                # Grow from the pause actually used. Multiplying the raw delay
                # kept it at 0 forever when initial_delay was 0.0, so the
                # backoff never increased.
                delay = pause * backoff
    return wrapper
123
+
124
def worker(row: Dict[str, str],
           out_dir: str,
           timeout: int,
           overwrite: bool,
           email: Optional[str]) -> Dict[str, str]:
    """Download the PDF for one input row and describe the outcome.

    Returns a manifest record with keys ``pmid``, ``pmcid``, ``doi``,
    ``status``, ``message`` and ``file``. ``status`` is one of:

    * ``skip``      - the row has no valid PMCID,
    * ``ok_cached`` - the target file already exists and *overwrite* is off,
    * ``ok``        - the PDF was downloaded,
    * ``fail``      - no URL variant returned a PDF,
    * ``error``     - an exception was raised while downloading.
    """
    pmid = row.get("pmid", "")
    pmcid = row.get("pmcid", "")
    doi = row.get("doi", "")
    result = dict(pmid=pmid, pmcid=pmcid, doi=doi, status="", message="", file="")

    if not valid_pmcid(pmcid):
        result.update(status="skip", message="no_pmcid")
        return result

    cached_path = os.path.join(out_dir, pick_filename(pmcid, pmid, doi))
    if not overwrite and os.path.exists(cached_path):
        result.update(status="ok_cached", file=cached_path)
        return result

    request_headers = {"User-Agent": UA}
    if email:
        request_headers["From"] = email  # be nice; some servers log contact

    def _report_retry(attempt, exc):
        # Minimal retry trace on stderr.
        sys.stderr.write(f"[retry] {pmcid} attempt {attempt}: {exc}\n")

    with requests.Session() as s:
        s.headers.update(request_headers)

        def _attempt():
            return try_download_pmc_pdf(pmcid, out_dir, timeout, s)

        safe_download = polite_retry(
            _attempt,
            retries=4, backoff=1.8, initial_delay=0.5, on_retry=_report_retry
        )

        try:
            ok, msg, final_path = safe_download()
        except Exception as e:
            result["status"] = "error"
            result["message"] = f"{type(e).__name__}: {e}"
        else:
            result["message"] = msg
            if ok:
                result["status"] = "ok"
                result["file"] = final_path
            else:
                result["status"] = "fail"

    return result
182
+
183
def write_manifest(path: str, rows: List[Dict[str, str]]):
    """Write *rows* to *path* as a UTF-8 CSV manifest with a header row.

    Columns, in order: pmid, pmcid, doi, status, message, file.
    """
    columns = ["pmid", "pmcid", "doi", "status", "message", "file"]
    with open(path, "w", newline="", encoding="utf-8") as handle:
        writer = csv.DictWriter(handle, fieldnames=columns)
        writer.writeheader()
        writer.writerows(rows)
190
+
191
def chunked(seq, size):
    """Yield consecutive slices of *seq*, each at most *size* items long."""
    for start in range(0, len(seq), size):
        stop = start + size
        yield seq[start:stop]
194
+
195
def main():
    """CLI entry point: download PMC PDFs in batches and write a manifest.

    Reads the input CSV, keeps rows with a valid PMCID, downloads them in
    batches using a thread pool per batch, prints one line per item, and
    writes a CSV manifest of all outcomes followed by a summary line.

    Exits with status 0 when there is nothing to download, and with an
    argparse usage error when --batch-size or --concurrency is below 1.
    """
    ap = argparse.ArgumentParser(description="Download NIH/PMC PDFs in batches from a pmid→pmcid CSV.")
    ap.add_argument("--in", dest="in_csv", required=True, help="Input CSV (from pmid2pmcid.py)")
    ap.add_argument("--out-dir", required=True, help="Directory to write PDFs")
    ap.add_argument("--manifest", default="pmc_download_manifest.csv", help="CSV manifest output (default: pmc_download_manifest.csv)")
    ap.add_argument("--batch-size", type=int, default=40, help="Items per batch (default: 40)")
    ap.add_argument("--concurrency", type=int, default=4, help="Concurrent downloads per batch (default: 4)")
    ap.add_argument("--delay", type=float, default=1.0, help="Seconds to sleep between batches (default: 1.0)")
    ap.add_argument("--timeout", type=int, default=60, help="Per-request timeout seconds (default: 60)")
    ap.add_argument("--overwrite", action="store_true", help="Overwrite existing PDFs")
    ap.add_argument("--email", help="Contact email (sent in headers)")
    args = ap.parse_args()

    # Reject values that would otherwise fail later with a raw traceback
    # (range() step of 0, ThreadPoolExecutor with no workers).
    if args.batch_size < 1:
        ap.error("--batch-size must be at least 1")
    if args.concurrency < 1:
        ap.error("--concurrency must be at least 1")

    ensure_dir(args.out_dir)
    rows = read_rows(args.in_csv)
    # keep only rows with PMCID
    todo = [r for r in rows if valid_pmcid(r.get("pmcid", ""))]

    results: List[Dict[str, str]] = []
    total = len(todo)
    if total == 0:
        print("No rows with a valid PMCID found. Nothing to download.")
        sys.exit(0)

    print(f"Found {total} entries with PMCID. Starting downloads…")
    batches = list(chunked(todo, args.batch_size))
    try:
        for batch_num, batch in enumerate(batches, start=1):
            print(f"Batch {batch_num}: {len(batch)} items")
            with ThreadPoolExecutor(max_workers=args.concurrency) as ex:
                futs = [ex.submit(worker, r, args.out_dir, args.timeout, args.overwrite, args.email) for r in batch]
                for fut in as_completed(futs):
                    res = fut.result()
                    results.append(res)
                    if res["status"].startswith("ok"):
                        print(f" ✓ {res['pmcid']} → {os.path.basename(res['file'])}")
                    elif res["status"] == "skip":
                        print(f" - {res['pmcid']} skipped ({res['message']})")
                    else:
                        print(f" ✗ {res['pmcid']} ({res['message']})")
            # polite pause between batches (nothing to wait for after the last)
            if batch_num < len(batches):
                time.sleep(max(0.0, args.delay))
    finally:
        # Always record what was completed, even if the run is interrupted
        # or a worker raises unexpectedly.
        write_manifest(args.manifest, results)

    ok = sum(1 for r in results if r["status"].startswith("ok"))
    fail = sum(1 for r in results if r["status"] in ("fail", "error"))
    skip = sum(1 for r in results if r["status"] == "skip")
    print(f"\nDone. ok={ok}, fail={fail}, skip={skip}. Manifest: {args.manifest}")
243
+
244
+ if __name__ == "__main__":
245
+ main()
pmcids.txt ADDED
@@ -0,0 +1,732 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ PMC4553269
2
+ PMC11834963
3
+ PMC4767593
4
+ PMC10043565
5
+ PMC4827439
6
+ PMC4312739
7
+ PMC6007852
8
+ PMC8602441
9
+ PMC9825391
10
+ PMC5890941
11
+ PMC11233068
12
+ PMC10435547
13
+ PMC7061455
14
+ PMC6035749
15
+ PMC10644000
16
+ PMC4013720
17
+ PMC12468851
18
+ PMC12468851
19
+ PMC4189196
20
+ PMC12541443
21
+ PMC4313860
22
+ PMC6642361
23
+ PMC5628134
24
+ PMC9827113
25
+ PMC4359951
26
+ PMC4878831
27
+ PMC8634406
28
+ PMC7680360
29
+ PMC8613248
30
+ PMC8906458
31
+ PMC10841595
32
+ PMC4070589
33
+ PMC6063411
34
+ PMC7685753
35
+ PMC9197876
36
+ PMC9772093
37
+ PMC9807588
38
+ PMC10852615
39
+ PMC10581608
40
+ PMC6438729
41
+ PMC12463604
42
+ PMC9582036
43
+ PMC2694412
44
+ PMC8057264
45
+ PMC9586871
46
+ PMC8052452
47
+ PMC5217532
48
+ PMC5995337
49
+ PMC9117241
50
+ PMC9983778
51
+ PMC10480533
52
+ PMC10875331
53
+ PMC12433863
54
+ PMC10338177
55
+ PMC12541558
56
+ PMC9882421
57
+ PMC5699446
58
+ PMC2911503
59
+ PMC7385919
60
+ PMC7476838
61
+ PMC7796332
62
+ PMC10309566
63
+ PMC10756077
64
+ PMC7446400
65
+ PMC10511463
66
+ PMC9529954
67
+ PMC11149479
68
+ PMC11108776
69
+ PMC5059781
70
+ PMC4675454
71
+ PMC7181640
72
+ PMC7181640
73
+ PMC11216861
74
+ PMC11419252
75
+ PMC11096044
76
+ PMC11371520
77
+ PMC6345401
78
+ PMC10330105
79
+ PMC4676955
80
+ PMC5957518
81
+ PMC6075717
82
+ PMC6028190
83
+ PMC5916809
84
+ PMC6066282
85
+ PMC6070353
86
+ PMC5916809
87
+ PMC7500457
88
+ PMC5916814
89
+ PMC5972025
90
+ PMC12390932
91
+ PMC6763222
92
+ PMC6339572
93
+ PMC11398981
94
+ PMC11405286
95
+ PMC3320027
96
+ PMC4636053
97
+ PMC12533196
98
+ PMC3690621
99
+ PMC3399763
100
+ PMC12487679
101
+ PMC4866047
102
+ PMC6647838
103
+ PMC3440846
104
+ PMC4850357
105
+ PMC4815041
106
+ PMC5957518
107
+ PMC6075717
108
+ PMC6028190
109
+ PMC5916809
110
+ PMC6066282
111
+ PMC6070353
112
+ PMC5916809
113
+ PMC12521747
114
+ PMC7500457
115
+ PMC5916814
116
+ PMC5972025
117
+ PMC12390932
118
+ PMC5957518
119
+ PMC6075717
120
+ PMC6028190
121
+ PMC5916809
122
+ PMC6066282
123
+ PMC6070353
124
+ PMC5916809
125
+ PMC12521747
126
+ PMC7500457
127
+ PMC5916814
128
+ PMC5972025
129
+ PMC12390932
130
+ PMC5957518
131
+ PMC6075717
132
+ PMC6028190
133
+ PMC5916809
134
+ PMC6066282
135
+ PMC6070353
136
+ PMC5916809
137
+ PMC7500457
138
+ PMC5916814
139
+ PMC5972025
140
+ PMC12390932
141
+ PMC6697103
142
+ PMC7339254
143
+ PMC6768830
144
+ PMC12545938
145
+ PMC10084830
146
+ PMC3401966
147
+ PMC4589486
148
+ PMC3309757
149
+ PMC4367811
150
+ PMC5957518
151
+ PMC6075717
152
+ PMC6028190
153
+ PMC5916809
154
+ PMC6066282
155
+ PMC6070353
156
+ PMC5916809
157
+ PMC12521747
158
+ PMC7500457
159
+ PMC5916814
160
+ PMC5972025
161
+ PMC12390932
162
+ PMC5957518
163
+ PMC6075717
164
+ PMC6028190
165
+ PMC5916809
166
+ PMC6066282
167
+ PMC6070353
168
+ PMC5916809
169
+ PMC7500457
170
+ PMC5916814
171
+ PMC5972025
172
+ PMC12390932
173
+ PMC6897368
174
+ PMC8285521
175
+ PMC9379253
176
+ PMC9189056
177
+ PMC10911804
178
+ PMC12552549
179
+ PMC5957518
180
+ PMC6075717
181
+ PMC6028190
182
+ PMC5916809
183
+ PMC6066282
184
+ PMC6070353
185
+ PMC5916809
186
+ PMC7500457
187
+ PMC5916814
188
+ PMC5972025
189
+ PMC12390932
190
+ PMC11095631
191
+ PMC3771322
192
+ PMC11326964
193
+ PMC3767041
194
+ PMC10391526
195
+ PMC4936195
196
+ PMC5180407
197
+ PMC4979995
198
+ PMC6280667
199
+ PMC3837510
200
+ PMC8505423
201
+ PMC3210554
202
+ PMC3744992
203
+ PMC3690918
204
+ PMC3198787
205
+ PMC4695400
206
+ PMC5643159
207
+ PMC6880934
208
+ PMC7572747
209
+ PMC9438279
210
+ PMC12504664
211
+ PMC9299269
212
+ PMC11101347
213
+ PMC11094415
214
+ PMC4170219
215
+ PMC9801308
216
+ PMC9200814
217
+ PMC5957518
218
+ PMC6075717
219
+ PMC6028190
220
+ PMC5916809
221
+ PMC6066282
222
+ PMC6070353
223
+ PMC5916809
224
+ PMC12521747
225
+ PMC7500457
226
+ PMC5916814
227
+ PMC5972025
228
+ PMC12390932
229
+ PMC5957518
230
+ PMC6075717
231
+ PMC6028190
232
+ PMC5916809
233
+ PMC6066282
234
+ PMC6070353
235
+ PMC5916809
236
+ PMC7500457
237
+ PMC5916814
238
+ PMC5972025
239
+ PMC12390932
240
+ PMC5957518
241
+ PMC6075717
242
+ PMC6028190
243
+ PMC5916809
244
+ PMC6066282
245
+ PMC6070353
246
+ PMC5916809
247
+ PMC12521747
248
+ PMC7500457
249
+ PMC5916814
250
+ PMC5972025
251
+ PMC12390932
252
+ PMC5957518
253
+ PMC6075717
254
+ PMC6028190
255
+ PMC5916809
256
+ PMC6066282
257
+ PMC6070353
258
+ PMC5916809
259
+ PMC12521747
260
+ PMC7500457
261
+ PMC5916814
262
+ PMC5972025
263
+ PMC12390932
264
+ PMC5957518
265
+ PMC6075717
266
+ PMC6028190
267
+ PMC5916809
268
+ PMC6066282
269
+ PMC6070353
270
+ PMC5916809
271
+ PMC7500457
272
+ PMC5916814
273
+ PMC5972025
274
+ PMC12390932
275
+ PMC5957518
276
+ PMC6075717
277
+ PMC6028190
278
+ PMC5916809
279
+ PMC6066282
280
+ PMC6070353
281
+ PMC5916809
282
+ PMC12521747
283
+ PMC7500457
284
+ PMC5916814
285
+ PMC5972025
286
+ PMC12390932
287
+ PMC5957518
288
+ PMC6075717
289
+ PMC6028190
290
+ PMC5916809
291
+ PMC6066282
292
+ PMC6070353
293
+ PMC5916809
294
+ PMC12521747
295
+ PMC7500457
296
+ PMC5916814
297
+ PMC5972025
298
+ PMC12390932
299
+ PMC5957518
300
+ PMC6075717
301
+ PMC6028190
302
+ PMC5916809
303
+ PMC6066282
304
+ PMC6070353
305
+ PMC5916809
306
+ PMC12521747
307
+ PMC7500457
308
+ PMC5916814
309
+ PMC5972025
310
+ PMC12390932
311
+ PMC5957518
312
+ PMC6075717
313
+ PMC6028190
314
+ PMC5916809
315
+ PMC6066282
316
+ PMC6070353
317
+ PMC5916809
318
+ PMC7500457
319
+ PMC5916814
320
+ PMC5972025
321
+ PMC12390932
322
+ PMC5957518
323
+ PMC6075717
324
+ PMC6028190
325
+ PMC5916809
326
+ PMC6066282
327
+ PMC6070353
328
+ PMC5916809
329
+ PMC12521747
330
+ PMC7500457
331
+ PMC5916814
332
+ PMC5972025
333
+ PMC12390932
334
+ PMC5957518
335
+ PMC6075717
336
+ PMC6028190
337
+ PMC5916809
338
+ PMC6066282
339
+ PMC6070353
340
+ PMC5916809
341
+ PMC12521747
342
+ PMC7500457
343
+ PMC5916814
344
+ PMC5972025
345
+ PMC12390932
346
+ PMC5957518
347
+ PMC6075717
348
+ PMC6028190
349
+ PMC5916809
350
+ PMC6066282
351
+ PMC6070353
352
+ PMC5916809
353
+ PMC12521747
354
+ PMC7500457
355
+ PMC5916814
356
+ PMC5972025
357
+ PMC12390932
358
+ PMC5957518
359
+ PMC6075717
360
+ PMC6028190
361
+ PMC5916809
362
+ PMC6066282
363
+ PMC6070353
364
+ PMC5916809
365
+ PMC12521747
366
+ PMC7500457
367
+ PMC5916814
368
+ PMC5972025
369
+ PMC12390932
370
+ PMC5957518
371
+ PMC6075717
372
+ PMC6028190
373
+ PMC5916809
374
+ PMC6066282
375
+ PMC6070353
376
+ PMC5916809
377
+ PMC7500457
378
+ PMC5916814
379
+ PMC5972025
380
+ PMC12390932
381
+ PMC5957518
382
+ PMC6075717
383
+ PMC6028190
384
+ PMC5916809
385
+ PMC6066282
386
+ PMC6070353
387
+ PMC5916809
388
+ PMC12521747
389
+ PMC7500457
390
+ PMC5916814
391
+ PMC5972025
392
+ PMC12390932
393
+ PMC5957518
394
+ PMC6075717
395
+ PMC6028190
396
+ PMC5916809
397
+ PMC6066282
398
+ PMC6070353
399
+ PMC5916809
400
+ PMC7500457
401
+ PMC5916814
402
+ PMC5972025
403
+ PMC12390932
404
+ PMC5957518
405
+ PMC6075717
406
+ PMC6028190
407
+ PMC5916809
408
+ PMC6066282
409
+ PMC6070353
410
+ PMC5916809
411
+ PMC7500457
412
+ PMC5916814
413
+ PMC5972025
414
+ PMC12390932
415
+ PMC5957518
416
+ PMC6075717
417
+ PMC6028190
418
+ PMC5916809
419
+ PMC6066282
420
+ PMC6070353
421
+ PMC5916809
422
+ PMC12521747
423
+ PMC7500457
424
+ PMC5916814
425
+ PMC5972025
426
+ PMC12390932
427
+ PMC5957518
428
+ PMC6075717
429
+ PMC6028190
430
+ PMC5916809
431
+ PMC6066282
432
+ PMC6070353
433
+ PMC5916809
434
+ PMC7500457
435
+ PMC5916814
436
+ PMC5972025
437
+ PMC12390932
438
+ PMC5957518
439
+ PMC6075717
440
+ PMC6028190
441
+ PMC5916809
442
+ PMC6066282
443
+ PMC6070353
444
+ PMC5916809
445
+ PMC12521747
446
+ PMC7500457
447
+ PMC5916814
448
+ PMC5972025
449
+ PMC12390932
450
+ PMC5957518
451
+ PMC6075717
452
+ PMC6028190
453
+ PMC5916809
454
+ PMC6066282
455
+ PMC6070353
456
+ PMC5916809
457
+ PMC7500457
458
+ PMC5916814
459
+ PMC5972025
460
+ PMC12390932
461
+ PMC5957518
462
+ PMC6075717
463
+ PMC6028190
464
+ PMC5916809
465
+ PMC6066282
466
+ PMC6070353
467
+ PMC5916809
468
+ PMC7500457
469
+ PMC5916814
470
+ PMC5972025
471
+ PMC12390932
472
+ PMC10202816
473
+ PMC5957518
474
+ PMC6075717
475
+ PMC6028190
476
+ PMC5916809
477
+ PMC6066282
478
+ PMC6070353
479
+ PMC5916809
480
+ PMC7500457
481
+ PMC5916814
482
+ PMC5972025
483
+ PMC12390932
484
+ PMC11655358
485
+ PMC9394403
486
+ PMC9147702
487
+ PMC12088707
488
+ PMC8027015
489
+ PMC11077417
490
+ PMC11371517
491
+ PMC12448954
492
+ PMC4982376
493
+ PMC4238969
494
+ PMC3753703
495
+ PMC7512009
496
+ PMC3919793
497
+ PMC3708595
498
+ PMC4818686
499
+ PMC4807116
500
+ PMC3999050
501
+ PMC5144107
502
+ PMC12534164
503
+ PMC12458290
504
+ PMC5549141
505
+ PMC6280667
506
+ PMC9636269
507
+ PMC9378589
508
+ PMC11726019
509
+ PMC4864027
510
+ PMC4148686
511
+ PMC3863681
512
+ PMC5189935
513
+ PMC5687509
514
+ PMC6757060
515
+ PMC7869928
516
+ PMC8143193
517
+ PMC8077737
518
+ PMC8815415
519
+ PMC3428862
520
+ PMC3465532
521
+ PMC6327853
522
+ PMC7450824
523
+ PMC5928087
524
+ PMC8831444
525
+ PMC7367727
526
+ PMC5765991
527
+ PMC7025898
528
+ PMC11835752
529
+ PMC11371517
530
+ PMC12318355
531
+ PMC11406943
532
+ PMC11291049
533
+ PMC11291049
534
+ PMC7581540
535
+ PMC11291049
536
+ PMC11185634
537
+ PMC11839034
538
+ PMC10150291
539
+ PMC12008543
540
+ PMC3678719
541
+ PMC12524419
542
+ PMC4264969
543
+ PMC12363069
544
+ PMC4313862
545
+ PMC5813492
546
+ PMC5659841
547
+ PMC6613387
548
+ PMC6636637
549
+ PMC8229851
550
+ PMC8228505
551
+ PMC10502449
552
+ PMC3415217
553
+ PMC5253129
554
+ PMC4587544
555
+ PMC5477828
556
+ PMC6689131
557
+ PMC6753053
558
+ PMC6457907
559
+ PMC8317046
560
+ PMC8613272
561
+ PMC9068765
562
+ PMC12548999
563
+ PMC12432380
564
+ PMC4160352
565
+ PMC3162986
566
+ PMC3858325
567
+ PMC12468851
568
+ PMC8282702
569
+ PMC8713028
570
+ PMC3998672
571
+ PMC4754110
572
+ PMC12452113
573
+ PMC12529571
574
+ PMC4993154
575
+ PMC3557932
576
+ PMC12261305
577
+ PMC3575604
578
+ PMC8295366
579
+ PMC7373300
580
+ PMC3466113
581
+ PMC3413789
582
+ PMC3662966
583
+ PMC3412905
584
+ PMC5482929
585
+ PMC7704768
586
+ PMC12443039
587
+ PMC10937974
588
+ PMC9018685
589
+ PMC5905700
590
+ PMC8475722
591
+ PMC8432745
592
+ PMC4241387
593
+ PMC3831489
594
+ PMC12550706
595
+ PMC5378171
596
+ PMC4808437
597
+ PMC6119118
598
+ PMC8933489
599
+ PMC8165771
600
+ PMC6898788
601
+ PMC8046739
602
+ PMC10023728
603
+ PMC11215372
604
+ PMC12468275
605
+ PMC4777652
606
+ PMC4881306
607
+ PMC3682833
608
+ PMC5094835
609
+ PMC3966280
610
+ PMC4993154
611
+ PMC5953836
612
+ PMC8213710
613
+ PMC11604310
614
+ PMC4249650
615
+ PMC12508281
616
+ PMC4489427
617
+ PMC3163504
618
+ PMC4048021
619
+ PMC3530898
620
+ PMC4403382
621
+ PMC4884143
622
+ PMC12553238
623
+ PMC3248495
624
+ PMC12553238
625
+ PMC5812436
626
+ PMC6075848
627
+ PMC7124988
628
+ PMC9308789
629
+ PMC8654574
630
+ PMC7305302
631
+ PMC3144496
632
+ PMC4558226
633
+ PMC3673022
634
+ PMC8941949
635
+ PMC12056210
636
+ PMC12404184
637
+ PMC12513462
638
+ PMC12533323
639
+ PMC3951336
640
+ PMC12508145
641
+ PMC5045679
642
+ PMC3396711
643
+ PMC4121784
644
+ PMC4484602
645
+ PMC5558263
646
+ PMC6107367
647
+ PMC6561293
648
+ PMC7444093
649
+ PMC6949382
650
+ PMC4237931
651
+ PMC7334067
652
+ PMC8776579
653
+ PMC9065149
654
+ PMC4915822
655
+ PMC3557461
656
+ PMC3600117
657
+ PMC4462130
658
+ PMC5693358
659
+ PMC5313262
660
+ PMC4861069
661
+ PMC9200818
662
+ PMC3367798
663
+ PMC3432702
664
+ PMC12468435
665
+ PMC3947264
666
+ PMC4315319
667
+ PMC4580370
668
+ PMC5054517
669
+ PMC12489181
670
+ PMC12523349
671
+ PMC5705185
672
+ PMC4767360
673
+ PMC5651175
674
+ PMC5808581
675
+ PMC9808553
676
+ PMC4354107
677
+ PMC3704730
678
+ PMC4826231
679
+ PMC6279519
680
+ PMC5587124
681
+ PMC6365097
682
+ PMC7233456
683
+ PMC9852004
684
+ PMC5903820
685
+ PMC6611775
686
+ PMC7367750
687
+ PMC7780630
688
+ PMC5957518
689
+ PMC6075717
690
+ PMC6028190
691
+ PMC5916809
692
+ PMC6066282
693
+ PMC6070353
694
+ PMC5916809
695
+ PMC12521747
696
+ PMC7500457
697
+ PMC5916814
698
+ PMC5972025
699
+ PMC12390932
700
+ PMC11463659
701
+ PMC9886551
702
+ PMC7785562
703
+ PMC11463462
704
+ PMC4603750
705
+ PMC4311405
706
+ PMC4231481
707
+ PMC4243044
708
+ PMC3962515
709
+ PMC7891089
710
+ PMC9771812
711
+ PMC11696176
712
+ PMC5685550
713
+ PMC5906695
714
+ PMC2671642
715
+ PMC3910500
716
+ PMC5596171
717
+ PMC5957518
718
+ PMC6075717
719
+ PMC6028190
720
+ PMC5916809
721
+ PMC6066282
722
+ PMC6070353
723
+ PMC5916809
724
+ PMC12521747
725
+ PMC7500457
726
+ PMC5916814
727
+ PMC5972025
728
+ PMC12390932
729
+ PMC7056576
730
+ PMC6810613
731
+ PMC8044053
732
+ PMC5461196
pmid2pmcid.py ADDED
@@ -0,0 +1,219 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ import argparse, csv, sys, time, requests
3
+ from typing import Iterable, List, Dict, Optional, Tuple
4
+ try:
5
+ from Bio import Entrez
6
+ except Exception:
7
+ Entrez = None
8
+
9
+ IDCONV_URL = "https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/"
10
+ ELINK_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi"
11
+ UA = "pmid2pmcid-cli/1.2 (+https://example.org)"
12
+
13
+ # -------------------- IO --------------------
14
def read_study_pmids_from_csv(path: str) -> List[Dict[str, str]]:
    """Read (studyId, pmid) pairs from a CSV file with a header row.

    The PMID column is the first of ``pmid``, ``PMID``, ``id``, ``Id``,
    ``ID`` present in the header. ``studyId`` is optional; when the column
    is absent every row gets an empty studyId. Rows with an empty PMID are
    skipped. Values are stripped of surrounding whitespace.

    Returns:
        List of dicts: ``{'studyId': <str or ''>, 'pmid': <str>}``.

    Raises:
        ValueError: if the file has no header row or no pmid-like column.
    """
    rows = []
    # "utf-8-sig" reads plain UTF-8 as well, but also drops a leading BOM
    # (as written by Excel); otherwise the BOM sticks to the first header
    # name and that column is silently not recognised.
    with open(path, newline="", encoding="utf-8-sig") as f:
        reader = csv.DictReader(f)
        if not reader.fieldnames:
            raise ValueError("No header row found in CSV.")

        # Discover columns
        study_col = "studyId" if "studyId" in reader.fieldnames else None

        pmid_col = next(
            (c for c in ("pmid", "PMID", "id", "Id", "ID") if c in reader.fieldnames),
            None,
        )
        if not pmid_col:
            raise ValueError(f"No pmid-like column in {reader.fieldnames}")

        for row in reader:
            pmid = (row.get(pmid_col) or "").strip()
            if not pmid:
                continue
            study = (row.get(study_col) or "").strip() if study_col else ""
            rows.append({"studyId": study, "pmid": pmid})
    return rows
46
+
47
def normalize_pmid(p: str) -> str:
    """Return *p* as a stripped string; reduce ``PMID...`` forms to digits.

    Values that start with ``pmid`` (any case) keep only their digit
    characters. Anything else is returned stripped but otherwise unchanged.
    Blank input gives ``""``.
    """
    text = str(p).strip()
    if text and text.lower().startswith("pmid"):
        return "".join(filter(str.isdigit, text))
    return text
54
+
55
def unique_pmids(rows: List[Dict[str, str]]) -> List[str]:
    """Return the distinct normalized PMIDs of *rows* in first-seen order.

    Rows whose PMID normalizes to an empty string are ignored.
    """
    normalized = (normalize_pmid(r["pmid"]) for r in rows)
    # dict keys keep insertion order, which gives order-preserving dedup.
    return list(dict.fromkeys(p for p in normalized if p))
64
+
65
+ # -------------------- NIH resolvers --------------------
66
def idconv_batch(pmids: List[str], email: Optional[str], verbose: bool) -> Dict[str, Dict]:
    """Resolve one batch of PMIDs through the ID Converter endpoint.

    Returns a mapping ``{pmid(str): {pmid, pmcid, doi, status, errmsg}}``.
    Every input PMID gets an entry; those absent from the response get
    empty fields. Raises on HTTP errors via ``raise_for_status``.
    """
    query = {
        "ids": ",".join(pmids),
        "format": "json",
        "tool": "pmid2pmcid-cli",
    }
    if email:
        query["email"] = email

    response = requests.get(IDCONV_URL, params=query, timeout=60,
                            headers={"User-Agent": UA})
    response.raise_for_status()
    payload = response.json()
    if verbose:
        print("[idconv] records:", len(payload.get("records", [])))

    resolved: Dict[str, Dict] = {}
    for rec in payload.get("records", []):
        # Keys are forced to str so lookups match the normalized inputs.
        key = str(rec.get("pmid") or rec.get("requested-id") or "").strip()
        resolved[key] = dict(
            pmid=key,
            pmcid=rec.get("pmcid") or "",
            doi=rec.get("doi") or "",
            status=rec.get("status") or "ok",
            errmsg=rec.get("errmsg") or "",
        )

    # Guarantee an entry for every requested PMID.
    for requested in pmids:
        key = str(requested).strip()
        if key not in resolved:
            resolved[key] = dict(pmid=key, pmcid="", doi="", status="", errmsg="")
    return resolved
100
+
101
def resolve_idconv_all(pmids: List[str], email: Optional[str], sleep=0.34, verbose=False) -> Dict[str, Dict]:
    """Resolve *pmids* via ``idconv_batch`` in slices of 200 and merge results.

    Sleeps *sleep* seconds after each batch request.
    """
    batch_size = 200  # NIH allows up to 200 per request
    merged: Dict[str, Dict] = {}
    for start in range(0, len(pmids), batch_size):
        piece = pmids[start:start + batch_size]
        merged.update(idconv_batch(piece, email=email, verbose=verbose))
        time.sleep(sleep)
    return merged
110
+
111
def elink_pubmed_to_pmc(pmid: str, email: Optional[str], api_key: Optional[str]) -> str:
    """Look up the PMCID of a PubMed article through E-utilities ELink.

    Uses Biopython's ``Entrez`` when it was importable, otherwise a plain
    HTTP request to ``ELINK_URL``.

    Args:
        pmid: PubMed ID to resolve.
        email: Contact email; required on the Biopython path.
        api_key: Optional NCBI API key, sent on both paths.

    Returns:
        ``"PMC<id>"`` for the article itself, or ``""`` when there is no
        ``pubmed_pmc`` link.

    Raises:
        ValueError: Biopython is installed and *email* is empty.
        requests.RequestException: HTTP failure on the plain-HTTP path.
    """
    # Only the "pubmed_pmc" link set points at the article's own PMC record.
    # A pubmed -> pmc query can also return sets such as "pubmed_pmc_refs"
    # (PMC articles that cite this PMID); taking whichever set comes first
    # would hand back the PMCID of a different paper.
    link_name = "pubmed_pmc"

    if Entrez:
        if not email:
            raise ValueError("E-utilities elink requires --email when Biopython is installed.")
        Entrez.email = email
        if api_key:
            Entrez.api_key = api_key
        h = Entrez.elink(dbfrom="pubmed", db="pmc", id=pmid, linkname=link_name, retmode="xml")
        try:
            recs = Entrez.read(h)
        finally:
            h.close()  # release the handle even if parsing fails
        try:
            for link_set in recs[0]["LinkSetDb"]:
                if link_set.get("LinkName") == link_name and link_set["Link"]:
                    return "PMC" + link_set["Link"][0]["Id"]
        except (KeyError, IndexError, TypeError, AttributeError):
            # Unexpected response shape: treat as "no link found".
            return ""
        return ""
    else:
        params = {"dbfrom": "pubmed", "db": "pmc", "id": pmid, "linkname": link_name,
                  "retmode": "json", "tool": "pmid2pmcid-cli"}
        if email:
            params["email"] = email
        if api_key:
            params["api_key"] = api_key
        r = requests.get(ELINK_URL, params=params, timeout=30, headers={"User-Agent": UA})
        r.raise_for_status()
        j = r.json()
        try:
            for link_set in j["linksets"][0]["linksetdbs"]:
                if link_set.get("linkname") == link_name and link_set.get("links"):
                    return "PMC" + str(link_set["links"][0])
        except (KeyError, IndexError, TypeError, AttributeError):
            # Unexpected response shape: treat as "no link found".
            return ""
        return ""
141
+
142
def resolve_pmids(pmids: List[str], email: Optional[str], force_elink: bool,
                  fallback: bool, api_key: Optional[str], verbose: bool) -> Dict[str, Dict]:
    """Return a mapping pmid(str) -> resolved fields.

    Unless *force_elink* is set, IDs are first resolved through the ID
    Converter. ELink is then queried for every ID (*force_elink*) or only
    for IDs still lacking a PMCID (*fallback*). A PMCID found via ELink
    replaces the stored one and ``elink`` is appended to the status. ELink
    errors are swallowed and only reported when *verbose* is set.
    """
    cleaned = [q for q in (normalize_pmid(raw) for raw in pmids) if q]

    mapping: Dict[str, Dict] = {}
    for q in cleaned:
        mapping[q] = {"pmid": q, "pmcid": "", "doi": "", "status": "", "errmsg": ""}

    if not force_elink:
        mapping.update(resolve_idconv_all(cleaned, email=email, verbose=verbose))

    if not (force_elink or fallback):
        return mapping

    for q in cleaned:
        entry = mapping[q]
        if not force_elink and entry["pmcid"]:
            continue  # already resolved, no lookup needed
        try:
            found = elink_pubmed_to_pmc(q, email=email, api_key=api_key)
        except Exception as e:
            found = ""
            if verbose:
                print(f"[elink] {q} error: {e}")
        if found:
            entry["pmcid"] = found
            entry["status"] = (entry["status"] + ";elink").strip(";")

    return mapping
167
+
168
+ # -------------------- CLI --------------------
169
def main():
    """CLI entry point: resolve PMIDs to PMCID/DOI and write them as CSV.

    Input comes from ``--pmids`` (comma-separated) or ``--in-csv``. Each
    distinct PMID is resolved once, then one output row is written per
    input row so that studyId values are preserved. Exits with status 1
    when no input rows are found.
    """
    parser = argparse.ArgumentParser(description="Convert PMID → PMCID/DOI and keep studyId in the output when provided.")
    source = parser.add_mutually_exclusive_group(required=True)
    source.add_argument("--pmids", help="Comma-separated PMIDs, e.g. 29625048,37261122 (no studyId)")
    source.add_argument("--in-csv", help="CSV with columns 'studyId' and 'pmid' (at minimum)")
    parser.add_argument("--out", default="pmid_to_pmcid.csv", help="Output CSV path")
    parser.add_argument("--email", help="Your email (recommended; passed to NIH)")
    parser.add_argument("--api-key", help="NCBI API key (optional)")
    parser.add_argument("--fallback-elink", action="store_true", help="If ID Converter has no PMCID, try E-utilities")
    parser.add_argument("--force-elink", action="store_true", help="Use E-utilities for ALL IDs (skip ID Converter)")
    parser.add_argument("--verbose", action="store_true", help="Print raw API info summary")
    args = parser.parse_args()

    # Input rows, each with an optional studyId.
    if args.pmids:
        input_rows = [
            {"studyId": "", "pmid": token.strip()}
            for token in args.pmids.split(",")
            if token.strip()
        ]
    else:
        input_rows = read_study_pmids_from_csv(args.in_csv)

    if not input_rows:
        print("No input rows found.", file=sys.stderr)
        sys.exit(1)

    # Each distinct PMID is resolved exactly once.
    pmid_map = resolve_pmids(
        unique_pmids(input_rows),
        email=args.email,
        force_elink=args.force_elink,
        fallback=args.fallback_elink,
        api_key=args.api_key,
        verbose=args.verbose,
    )

    # Expand back to one output row per input row.
    resolved_fields = ("pmcid", "doi", "status", "errmsg")
    unresolved = {"pmcid": "", "doi": "", "status": "", "errmsg": ""}
    out_rows = []
    for r in input_rows:
        p = normalize_pmid(r["pmid"])
        res = pmid_map.get(p, unresolved)
        record = {"studyId": r.get("studyId", ""), "pmid": p}
        for field in resolved_fields:
            record[field] = res.get(field, "")
        out_rows.append(record)

    with open(args.out, "w", newline="", encoding="utf-8") as f:
        w = csv.DictWriter(f, fieldnames=["studyId", "pmid", "pmcid", "doi", "status", "errmsg"])
        w.writeheader()
        w.writerows(out_rows)

    print(f"[✓] Wrote {len(out_rows)} rows to {args.out}")
217
+
218
+ if __name__ == "__main__":
219
+ main()
pmid_to_pmcid.csv ADDED
@@ -0,0 +1,733 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ studyId,pmid,pmcid,doi,status,errmsg
2
+ all_stjude_2015,25730765,PMC4553269,10.1038/ng.3230,ok,
3
+ acyc_fmi_2014,24418857,PMC11834963,,error;elink,Identifier not found in PMC
4
+ acyc_mgh_2016,26829750,PMC4767593,10.1038/ng.3502,ok,
5
+ appendiceal_msk_2022,36493333,PMC10043565,10.1200/JCO.22.01392,ok,
6
+ blca_plasmacytoid_mskcc_2016,26901067,PMC4827439,10.1038/ng.3503,ok,
7
+ blca_mskcc_solit_2014,25092538,PMC4312739,10.1016/j.eururo.2014.06.050,ok,
8
+ blca_nmibc_2017,28583311,PMC6007852,10.1016/j.eururo.2017.05.032,ok,
9
+ brca_mapk_hp_msk_2021,34795269,PMC8602441,10.1038/s41467-021-27093-y,ok,
10
+ bowel_colitis_msk_2022,36611031,PMC9825391,10.1038/s41467-022-35592-9,ok,
11
+ bladder_columbia_msk_2018,29625057,PMC5890941,10.1016/j.cell.2018.03.017,ok,
12
+ bladder_msk_2023,37682528,PMC11233068,10.1158/1078-0432.CCR-23-1283,ok,
13
+ bm_nsclc_mskcc_2023,37591896,PMC10435547,10.1038/s41467-023-40793-x,ok,
14
+ cfdna_msk_2019,31768066,PMC7061455,10.1038/s41591-019-0652-7,ok,
15
+ ccrcc_dfci_2019,29301960,PMC6035749,10.1126/science.aan5951,ok,
16
+ cervix_msk_2023,37643132,PMC10644000,10.1158/1078-0432.CCR-23-1078,ok,
17
+ chol_jhu_2013,24185509,PMC4013720,10.1038/ng.2813,ok,
18
+ chol_nccs_2013,24185513,PMC12468851,,error;elink,Identifier not found in PMC
19
+ chol_nus_2012,22561520,PMC12468851,,error;elink,Identifier not found in PMC
20
+ coadread_mskcc,25164765,PMC4189196,10.1186/s13059-014-0454-7,ok,
21
+ cllsll_icgc_2011,22158541,PMC12541443,,error;elink,Identifier not found in PMC
22
+ coad_caseccc_2015,25583493,PMC4313860,10.1073/pnas.1417064112,ok,
23
+ chol_msk_2018,29848569,PMC6642361,10.1158/1078-0432.CCR-18-0078,ok,
24
+ chol_icgc_2017,28667006,PMC5628134,10.1158/2159-8290.CD-17-0368,ok,
25
+ coadread_mskresistance_2022,36355783,PMC9827113,10.1158/2159-8290.CD-22-0405,ok,
26
+ cscc_dfarber_2015,25589618,PMC4359951,10.1158/1078-0432.CCR-14-1773,ok,
27
+ ctcl_columbia_2015,26551667,PMC4878831,10.1038/ng.3442,ok,
28
+ crc_eo_2020,34405229,PMC8634406,10.1093/jnci/djab124,ok,
29
+ crc_apc_impact_2020,32730818,PMC7680360,10.1053/j.gastro.2020.07.041,ok,
30
+ crc_nigerian_2020,34819518,PMC8613248,10.1038/s41467-021-27106-w,ok,
31
+ crc_dd_2022,35235413,PMC8906458,10.1200/PO.21.00365,ok,
32
+ difg_msk_2023,37910594,PMC10841595,10.1158/1078-0432.CCR-23-1180,ok,
33
+ escc_ucla_2014,24686850,PMC4070589,10.1038/ng.2935,ok,
34
+ hcc_msk_venturaa_2018,30052636,PMC6063411,10.1371/journal.pone.0200776,ok,
35
+ gct_msk_2020,32897884,PMC7685753,10.1172/JCI139682,ok,
36
+ egc_msk_tp53_ccr_2022,35377946,PMC9197876,10.1158/1078-0432.CCR-21-4016,ok,
37
+ gbc_mskcc_2022,36228155,PMC9772093,10.1158/1078-0432.CCR-22-1954,ok,
38
+ gist_msk_2022,36593350,PMC9807588,10.1038/s41698-022-00342-z,ok,
39
+ egc_msk_2023,37699004,PMC10852615,10.1093/jnci/djad186,ok,
40
+ hcc_jcopo_msk_2023,37769223,PMC10581608,10.1200/PO.23.00272,ok,
41
+ histiocytosis_cobi_msk_2019,30867592,PMC6438729,10.1038/s41586-019-1012-y,ok,
42
+ ihch_ismms_2015,25608663,PMC12463604,,error;elink,Identifier not found in PMC
43
+ lgsoc_mapk_msk_2022,35443055,PMC9582036,10.1158/1078-0432.CCR-21-4183,ok,
44
+ luad_tsp,18948947,PMC2694412,10.1038/nature07423,ok,
45
+ lymphoma_cellline_msk_2020,33067607,PMC8057264,10.1182/blood.2020008017,ok,
46
+ lung_msk_mind_2020,36038778,PMC9586871,10.1038/s43018-022-00416-8,ok,
47
+ mbc_msk_2021,33863915,PMC8052452,10.1038/s41523-021-00250-8,ok,
48
+ mnm_washu_2016,27959731,PMC5217532,10.1056/NEJMoa1605949,ok,
49
+ metastatic_solid_tumors_mich_2017,28783718,PMC5995337,10.1038/nature23306,ok,
50
+ mixed_kunga_msk_2022,35585047,PMC9117241,10.1038/s41467-022-30233-7,ok,
51
+ msk_ch_ped_2021,35078859,PMC9983778,10.1158/1078-0432.CCR-21-2451,ok,
52
+ mtnn_msk_2022,37078708,PMC10480533,10.1182/bloodadvances.2023009953,ok,
53
+ msk_ch_2023,38147626,PMC10875331,10.1182/bloodadvances.2023011262,ok,
54
+ nsclc_unito_2016,27346245,PMC12433863,,error;elink,Identifier not found in PMC
55
+ nsclc_ctdx_msk_2022,36357680,PMC10338177,10.1038/s41591-022-02047-z,ok,
56
+ pediatric_dkfz_2017,29489754,PMC12541558,,error;elink,Identifier not found in PMC
57
+ paired_bladder_2022,36543146,PMC9882421,10.1016/j.celrep.2022.111859,ok,
58
+ scco_mskcc,24658004,PMC5699446,10.1038/ng.2922,ok,
59
+ sarc_mskcc,20601955,PMC2911503,10.1038/ng.619,ok,
60
+ rectal_msk_2019,31591597,PMC7385919,10.1038/s41591-019-0584-2,ok,
61
+ rbl_cfdna_msk_2020,32633890,PMC7476838,10.1002/cam4.3144,ok,
62
+ rbl_mskcc_2020,33466343,PMC7796332,10.3390/cancers13010149,ok,
63
+ rms_msk_2023,37315267,PMC10309566,10.1200/PO.22.00705,ok,
64
+ sarcoma_msk_2023,37350195,PMC10756077,10.1002/path.6137,ok,
65
+ skcm_vanderbilt_mskcc_2015,32913971,PMC7446400,10.1200/PO.16.00054,ok,
66
+ soft_tissue_msk_2023,37730754,PMC10511463,10.1038/s41698-023-00445-1,ok,
67
+ ucec_ccr_msk_2022,35849120,PMC9529954,10.1158/1078-0432.CCR-22-0713,ok,
68
+ ucec_ancestry_cds_msk_2023,37651310,PMC11149479,10.1158/2159-8290.CD-23-0546,ok,
69
+ ucec_msk_2024,38653864,PMC11108776,10.1038/s41591-024-02942-7,ok,
70
+ urcc_mskcc_2016,27713405,PMC5059781,10.1038/ncomms13131,ok,
71
+ utuc_mskcc_2015,26278805,PMC4675454,10.1016/j.eururo.2015.07.039,ok,
72
+ utuc_msk_2019,32332851,PMC7181640,10.1038/s41467-020-15885-7,ok,
73
+ utuc_pdx_msk_2019,32332851,PMC7181640,10.1038/s41467-020-15885-7,ok,
74
+ plmeso_msk_2024,38630790,PMC11216861,10.1158/1078-0432.CCR-24-0085,ok,
75
+ pancreas_msk_2024,39214094,PMC11419252,10.1016/j.ccell.2024.08.002,ok,
76
+ lms_msk_2024,38488807,PMC11096044,10.1158/1078-0432.CCR-24-0148,ok,
77
+ prostate_msk_2024,38949888,PMC11371520,10.1158/1078-0432.CCR-23-3403,ok,
78
+ panet_msk_2018,30687805,PMC6345401,10.1200/PO.17.00267,ok,
79
+ makeanimpact_ccr_2023,36862133,PMC10330105,10.1158/1078-0432.CCR-22-3247,ok,
80
+ acbc_mskcc_2015,26095796,PMC4676955,10.1002/path.4573,ok,
81
+ blca_tcga_pan_can_atlas_2018,29625048,PMC5957518,10.1016/j.cell.2018.03.022,ok,
82
+ blca_tcga_pan_can_atlas_2018,29596782,PMC6075717,10.1016/j.cels.2018.03.002,ok,
83
+ blca_tcga_pan_can_atlas_2018,29622463,PMC6028190,10.1016/j.ccell.2018.03.007,ok,
84
+ blca_tcga_pan_can_atlas_2018,29617662,PMC5916809,10.1016/j.celrep.2018.03.050,ok,
85
+ blca_tcga_pan_can_atlas_2018,29625055,PMC6066282,10.1016/j.cell.2018.02.052,ok,
86
+ blca_tcga_pan_can_atlas_2018,29625050,PMC6070353,10.1016/j.cell.2018.03.035,ok,
87
+ blca_tcga_pan_can_atlas_2018,29617662,PMC5916809,10.1016/j.celrep.2018.03.050,ok,
88
+ blca_tcga_pan_can_atlas_2018,32214244,PMC7500457,10.1038/s41586-020-2095-1,ok,
89
+ blca_tcga_pan_can_atlas_2018,29625049,PMC5916814,10.1016/j.cell.2018.03.033,ok,
90
+ blca_tcga_pan_can_atlas_2018,29850653,PMC5972025,10.1200/PO.17.00073,ok,
91
+ blca_tcga_pan_can_atlas_2018,36334560,PMC12390932,,error;elink,Identifier not found in PMC
92
+ acc_2019,31483290,PMC6763222,10.1172/JCI128227,ok,
93
+ blca_msk_tcga_2020,30290956,PMC6339572,10.1016/j.eururo.2018.09.002,ok,
94
+ pcnsl_msk_2024,38995739,PMC11398981,10.1158/1078-0432.CCR-24-0605,ok,
95
+ msk_ctdna_vte_2024,39147831,PMC11405286,10.1038/s41591-024-03195-0,ok,
96
+ cellline_ccle_broad,22460905,PMC3320027,10.1038/nature11003,ok,
97
+ ccrcc_irc_2014,24487277,PMC4636053,10.1038/ng.2891,ok,
98
+ ccrcc_utokyo_2013,23797736,PMC12533196,,error;elink,Identifier not found in PMC
99
+ coadread_genentech,22895193,PMC3690621,10.1038/nature11282,ok,
100
+ cellline_nci60,22802077,PMC3399763,10.1158/0008-5472.CAN-12-1370,ok,
101
+ cll_iuopa_2015,26200345,PMC12487679,,error;elink,Identifier not found in PMC
102
+ brca_metabric,27161491,PMC4866047,10.1038/ncomms11479,ok,
103
+ brca_metabric,30867590,PMC6647838,10.1038/s41586-019-1007-8,ok,
104
+ brca_metabric,22522925,PMC3440846,10.1038/nature10983,ok,
105
+ coadread_dfci_2016,27149842,PMC4850357,10.1016/j.celrep.2016.03.075,ok,
106
+ cll_broad_2015,26466571,PMC4815041,10.1038/nature15395,ok,
107
+ brca_tcga_pan_can_atlas_2018,29625048,PMC5957518,10.1016/j.cell.2018.03.022,ok,
108
+ brca_tcga_pan_can_atlas_2018,29596782,PMC6075717,10.1016/j.cels.2018.03.002,ok,
109
+ brca_tcga_pan_can_atlas_2018,29622463,PMC6028190,10.1016/j.ccell.2018.03.007,ok,
110
+ brca_tcga_pan_can_atlas_2018,29617662,PMC5916809,10.1016/j.celrep.2018.03.050,ok,
111
+ brca_tcga_pan_can_atlas_2018,29625055,PMC6066282,10.1016/j.cell.2018.02.052,ok,
112
+ brca_tcga_pan_can_atlas_2018,29625050,PMC6070353,10.1016/j.cell.2018.03.035,ok,
113
+ brca_tcga_pan_can_atlas_2018,29617662,PMC5916809,10.1016/j.celrep.2018.03.050,ok,
114
+ brca_tcga_pan_can_atlas_2018,30643250,PMC12521747,,error;elink,Identifier not found in PMC
115
+ brca_tcga_pan_can_atlas_2018,32214244,PMC7500457,10.1038/s41586-020-2095-1,ok,
116
+ brca_tcga_pan_can_atlas_2018,29625049,PMC5916814,10.1016/j.cell.2018.03.033,ok,
117
+ brca_tcga_pan_can_atlas_2018,29850653,PMC5972025,10.1200/PO.17.00073,ok,
118
+ brca_tcga_pan_can_atlas_2018,36334560,PMC12390932,,error;elink,Identifier not found in PMC
119
+ cesc_tcga_pan_can_atlas_2018,29625048,PMC5957518,10.1016/j.cell.2018.03.022,ok,
120
+ cesc_tcga_pan_can_atlas_2018,29596782,PMC6075717,10.1016/j.cels.2018.03.002,ok,
121
+ cesc_tcga_pan_can_atlas_2018,29622463,PMC6028190,10.1016/j.ccell.2018.03.007,ok,
122
+ cesc_tcga_pan_can_atlas_2018,29617662,PMC5916809,10.1016/j.celrep.2018.03.050,ok,
123
+ cesc_tcga_pan_can_atlas_2018,29625055,PMC6066282,10.1016/j.cell.2018.02.052,ok,
124
+ cesc_tcga_pan_can_atlas_2018,29625050,PMC6070353,10.1016/j.cell.2018.03.035,ok,
125
+ cesc_tcga_pan_can_atlas_2018,29617662,PMC5916809,10.1016/j.celrep.2018.03.050,ok,
126
+ cesc_tcga_pan_can_atlas_2018,30643250,PMC12521747,,error;elink,Identifier not found in PMC
127
+ cesc_tcga_pan_can_atlas_2018,32214244,PMC7500457,10.1038/s41586-020-2095-1,ok,
128
+ cesc_tcga_pan_can_atlas_2018,29625049,PMC5916814,10.1016/j.cell.2018.03.033,ok,
129
+ cesc_tcga_pan_can_atlas_2018,29850653,PMC5972025,10.1200/PO.17.00073,ok,
130
+ cesc_tcga_pan_can_atlas_2018,36334560,PMC12390932,,error;elink,Identifier not found in PMC
131
+ chol_tcga_pan_can_atlas_2018,29625048,PMC5957518,10.1016/j.cell.2018.03.022,ok,
132
+ chol_tcga_pan_can_atlas_2018,29596782,PMC6075717,10.1016/j.cels.2018.03.002,ok,
133
+ chol_tcga_pan_can_atlas_2018,29622463,PMC6028190,10.1016/j.ccell.2018.03.007,ok,
134
+ chol_tcga_pan_can_atlas_2018,29617662,PMC5916809,10.1016/j.celrep.2018.03.050,ok,
135
+ chol_tcga_pan_can_atlas_2018,29625055,PMC6066282,10.1016/j.cell.2018.02.052,ok,
136
+ chol_tcga_pan_can_atlas_2018,29625050,PMC6070353,10.1016/j.cell.2018.03.035,ok,
137
+ chol_tcga_pan_can_atlas_2018,29617662,PMC5916809,10.1016/j.celrep.2018.03.050,ok,
138
+ chol_tcga_pan_can_atlas_2018,32214244,PMC7500457,10.1038/s41586-020-2095-1,ok,
139
+ chol_tcga_pan_can_atlas_2018,29625049,PMC5916814,10.1016/j.cell.2018.03.033,ok,
140
+ chol_tcga_pan_can_atlas_2018,29850653,PMC5972025,10.1200/PO.17.00073,ok,
141
+ chol_tcga_pan_can_atlas_2018,36334560,PMC12390932,,error;elink,Identifier not found in PMC
142
+ ccle_broad_2019,31068700,PMC6697103,10.1038/s41586-019-1186-3,ok,
143
+ ccle_broad_2019,31978347,PMC7339254,10.1016/j.cell.2019.12.023,ok,
144
+ coad_cptac_2019,31031003,PMC6768830,10.1016/j.cell.2019.03.030,ok,
145
+ coadread_cass_2020,32888432,PMC12545938,,error;elink,Identifier not found in PMC
146
+ cll_broad_2022,35927489,PMC10084830,10.1038/s41588-022-01140-w,ok,
147
+ coadread_tcga_pub,22810696,PMC3401966,10.1038/nature11252,ok,
148
+ desm_broad_2015,26343386,PMC4589486,10.1038/ng.3382,ok,
149
+ dlbc_broad_2012,22343534,PMC3309757,10.1073/pnas.1121343109,ok,
150
+ cscc_hgsc_bcm_2014,25303977,PMC4367811,10.1158/1078-0432.CCR-14-1768,ok,
151
+ coadread_tcga_pan_can_atlas_2018,29625048,PMC5957518,10.1016/j.cell.2018.03.022,ok,
152
+ coadread_tcga_pan_can_atlas_2018,29596782,PMC6075717,10.1016/j.cels.2018.03.002,ok,
153
+ coadread_tcga_pan_can_atlas_2018,29622463,PMC6028190,10.1016/j.ccell.2018.03.007,ok,
154
+ coadread_tcga_pan_can_atlas_2018,29617662,PMC5916809,10.1016/j.celrep.2018.03.050,ok,
155
+ coadread_tcga_pan_can_atlas_2018,29625055,PMC6066282,10.1016/j.cell.2018.02.052,ok,
156
+ coadread_tcga_pan_can_atlas_2018,29625050,PMC6070353,10.1016/j.cell.2018.03.035,ok,
157
+ coadread_tcga_pan_can_atlas_2018,29617662,PMC5916809,10.1016/j.celrep.2018.03.050,ok,
158
+ coadread_tcga_pan_can_atlas_2018,30643250,PMC12521747,,error;elink,Identifier not found in PMC
159
+ coadread_tcga_pan_can_atlas_2018,32214244,PMC7500457,10.1038/s41586-020-2095-1,ok,
160
+ coadread_tcga_pan_can_atlas_2018,29625049,PMC5916814,10.1016/j.cell.2018.03.033,ok,
161
+ coadread_tcga_pan_can_atlas_2018,29850653,PMC5972025,10.1200/PO.17.00073,ok,
162
+ coadread_tcga_pan_can_atlas_2018,36334560,PMC12390932,,error;elink,Identifier not found in PMC
163
+ dlbc_tcga_pan_can_atlas_2018,29625048,PMC5957518,10.1016/j.cell.2018.03.022,ok,
164
+ dlbc_tcga_pan_can_atlas_2018,29596782,PMC6075717,10.1016/j.cels.2018.03.002,ok,
165
+ dlbc_tcga_pan_can_atlas_2018,29622463,PMC6028190,10.1016/j.ccell.2018.03.007,ok,
166
+ dlbc_tcga_pan_can_atlas_2018,29617662,PMC5916809,10.1016/j.celrep.2018.03.050,ok,
167
+ dlbc_tcga_pan_can_atlas_2018,29625055,PMC6066282,10.1016/j.cell.2018.02.052,ok,
168
+ dlbc_tcga_pan_can_atlas_2018,29625050,PMC6070353,10.1016/j.cell.2018.03.035,ok,
169
+ dlbc_tcga_pan_can_atlas_2018,29617662,PMC5916809,10.1016/j.celrep.2018.03.050,ok,
170
+ dlbc_tcga_pan_can_atlas_2018,32214244,PMC7500457,10.1038/s41586-020-2095-1,ok,
171
+ dlbc_tcga_pan_can_atlas_2018,29625049,PMC5916814,10.1016/j.cell.2018.03.033,ok,
172
+ dlbc_tcga_pan_can_atlas_2018,29850653,PMC5972025,10.1200/PO.17.00073,ok,
173
+ dlbc_tcga_pan_can_atlas_2018,36334560,PMC12390932,,error;elink,Identifier not found in PMC
174
+ difg_glass_2019,31748746,PMC6897368,10.1038/s41586-019-1775-1,ok,
175
+ cscc_ucsf_2021,34272401,PMC8285521,10.1038/s41525-021-00226-4,ok,
176
+ cscc_ranson_2022,35982973,PMC9379253,10.3389/fonc.2022.919118,ok,
177
+ difg_glass,35649412,PMC9189056,10.1016/j.cell.2022.04.038,ok,
178
+ difg_glass,38117484,PMC10911804,10.1158/0008-5472.CAN-23-2093,ok,
179
+ es_dfarber_broad_2014,25186949,PMC12552549,,error;elink,Identifier not found in PMC
180
+ esca_tcga_pan_can_atlas_2018,29625048,PMC5957518,10.1016/j.cell.2018.03.022,ok,
181
+ esca_tcga_pan_can_atlas_2018,29596782,PMC6075717,10.1016/j.cels.2018.03.002,ok,
182
+ esca_tcga_pan_can_atlas_2018,29622463,PMC6028190,10.1016/j.ccell.2018.03.007,ok,
183
+ esca_tcga_pan_can_atlas_2018,29617662,PMC5916809,10.1016/j.celrep.2018.03.050,ok,
184
+ esca_tcga_pan_can_atlas_2018,29625055,PMC6066282,10.1016/j.cell.2018.02.052,ok,
185
+ esca_tcga_pan_can_atlas_2018,29625050,PMC6070353,10.1016/j.cell.2018.03.035,ok,
186
+ esca_tcga_pan_can_atlas_2018,29617662,PMC5916809,10.1016/j.celrep.2018.03.050,ok,
187
+ esca_tcga_pan_can_atlas_2018,32214244,PMC7500457,10.1038/s41586-020-2095-1,ok,
188
+ esca_tcga_pan_can_atlas_2018,29625049,PMC5916814,10.1016/j.cell.2018.03.033,ok,
189
+ esca_tcga_pan_can_atlas_2018,29850653,PMC5972025,10.1200/PO.17.00073,ok,
190
+ esca_tcga_pan_can_atlas_2018,36334560,PMC12390932,,error;elink,Identifier not found in PMC
191
+ gist_msk_2023,37477937,PMC11095631,10.1158/1078-0432.CCR-23-1184,ok,
192
+ kirc_tcga_pub,23792563,PMC3771322,10.1038/nature12222,ok,
193
+ hcc_msk_2024,38864854,PMC11326964,10.1158/1078-0432.CCR-24-0657,ok,
194
+ laml_tcga_pub,23634996,PMC3767041,10.1056/NEJMoa1301689,ok,
195
+ luad_mskcc_2023_met_organotropism,37084736,PMC10391526,10.1016/j.ccell.2023.03.018,ok,
196
+ mbl_sickkids_2016,26760213,PMC4936195,10.1038/nature16478,ok,
197
+ mixed_pipseq_2017,28007021,PMC5180407,10.1186/s13073-016-0389-6,ok,
198
+ mds_mskcc_2020,27276561,PMC4979995,10.1056/NEJMoa1516192,ok,
199
+ mds_mskcc_2020,30333627,PMC6280667,10.1038/s41586-018-0623-z,ok,
200
+ mds_mskcc_2020,24030381,PMC3837510,10.1182/blood-2013-08-518886,ok,
201
+ mixed_msk_tcga_2021,34635660,PMC8505423,10.1038/s41523-021-00339-0,ok,
202
+ nhl_bcgsc_2011,21796119,PMC3210554,10.1038/nature10351,ok,
203
+ nhl_bcgsc_2013,23699601,PMC3744992,10.1182/blood-2013-02-483727,ok,
204
+ prad_broad_2013,23622249,PMC3690918,10.1016/j.cell.2013.03.021,ok,
205
+ prad_mskcc,20579941,PMC3198787,10.1016/j.ccr.2010.05.026,ok,
206
+ prad_tcga_pub,26544944,PMC4695400,10.1016/j.cell.2015.10.025,ok,
207
+ pcpg_tcga_pub,28162975,PMC5643159,10.1016/j.ccell.2017.01.001,ok,
208
+ pptc_2019,31693904,PMC6880934,10.1016/j.celrep.2019.09.071,ok,
209
+ prad_cdk12_mskcc_2020,32317181,PMC7572747,10.1016/j.eururo.2020.03.024,ok,
210
+ prad_pik3r1_msk_2021,35670774,PMC9438279,10.1158/1078-0432.CCR-21-4272,ok,
211
+ pog570_bcgsc_2020,35121966,PMC12504664,,error;elink,Identifier not found in PMC
212
+ prad_organoids_msk_2022,35617398,PMC9299269,10.1126/science.abe1505,ok,
213
+ ptad_msk_2024,38758238,PMC11101347,10.1007/s00401-024-02736-8,ok,
214
+ prad_msk_mdanderson_2023,38488813,PMC11094415,10.1158/1078-0432.CCR-23-2438,ok,
215
+ stad_tcga_pub,25079317,PMC4170219,10.1038/nature13480,ok,
216
+ rectal_msk_2022,35970919,PMC9801308,10.1038/s41591-022-01930-z,ok,
217
+ sarcoma_msk_2022,35705558,PMC9200814,10.1038/s41467-022-30496-0,ok,
218
+ hnsc_tcga_pan_can_atlas_2018,29625048,PMC5957518,10.1016/j.cell.2018.03.022,ok,
219
+ hnsc_tcga_pan_can_atlas_2018,29596782,PMC6075717,10.1016/j.cels.2018.03.002,ok,
220
+ hnsc_tcga_pan_can_atlas_2018,29622463,PMC6028190,10.1016/j.ccell.2018.03.007,ok,
221
+ hnsc_tcga_pan_can_atlas_2018,29617662,PMC5916809,10.1016/j.celrep.2018.03.050,ok,
222
+ hnsc_tcga_pan_can_atlas_2018,29625055,PMC6066282,10.1016/j.cell.2018.02.052,ok,
223
+ hnsc_tcga_pan_can_atlas_2018,29625050,PMC6070353,10.1016/j.cell.2018.03.035,ok,
224
+ hnsc_tcga_pan_can_atlas_2018,29617662,PMC5916809,10.1016/j.celrep.2018.03.050,ok,
225
+ hnsc_tcga_pan_can_atlas_2018,30643250,PMC12521747,,error;elink,Identifier not found in PMC
226
+ hnsc_tcga_pan_can_atlas_2018,32214244,PMC7500457,10.1038/s41586-020-2095-1,ok,
227
+ hnsc_tcga_pan_can_atlas_2018,29625049,PMC5916814,10.1016/j.cell.2018.03.033,ok,
228
+ hnsc_tcga_pan_can_atlas_2018,29850653,PMC5972025,10.1200/PO.17.00073,ok,
229
+ hnsc_tcga_pan_can_atlas_2018,36334560,PMC12390932,,error;elink,Identifier not found in PMC
230
+ kich_tcga_pan_can_atlas_2018,29625048,PMC5957518,10.1016/j.cell.2018.03.022,ok,
231
+ kich_tcga_pan_can_atlas_2018,29596782,PMC6075717,10.1016/j.cels.2018.03.002,ok,
232
+ kich_tcga_pan_can_atlas_2018,29622463,PMC6028190,10.1016/j.ccell.2018.03.007,ok,
233
+ kich_tcga_pan_can_atlas_2018,29617662,PMC5916809,10.1016/j.celrep.2018.03.050,ok,
234
+ kich_tcga_pan_can_atlas_2018,29625055,PMC6066282,10.1016/j.cell.2018.02.052,ok,
235
+ kich_tcga_pan_can_atlas_2018,29625050,PMC6070353,10.1016/j.cell.2018.03.035,ok,
236
+ kich_tcga_pan_can_atlas_2018,29617662,PMC5916809,10.1016/j.celrep.2018.03.050,ok,
237
+ kich_tcga_pan_can_atlas_2018,32214244,PMC7500457,10.1038/s41586-020-2095-1,ok,
238
+ kich_tcga_pan_can_atlas_2018,29625049,PMC5916814,10.1016/j.cell.2018.03.033,ok,
239
+ kich_tcga_pan_can_atlas_2018,29850653,PMC5972025,10.1200/PO.17.00073,ok,
240
+ kich_tcga_pan_can_atlas_2018,36334560,PMC12390932,,error;elink,Identifier not found in PMC
241
+ kirc_tcga_pan_can_atlas_2018,29625048,PMC5957518,10.1016/j.cell.2018.03.022,ok,
242
+ kirc_tcga_pan_can_atlas_2018,29596782,PMC6075717,10.1016/j.cels.2018.03.002,ok,
243
+ kirc_tcga_pan_can_atlas_2018,29622463,PMC6028190,10.1016/j.ccell.2018.03.007,ok,
244
+ kirc_tcga_pan_can_atlas_2018,29617662,PMC5916809,10.1016/j.celrep.2018.03.050,ok,
245
+ kirc_tcga_pan_can_atlas_2018,29625055,PMC6066282,10.1016/j.cell.2018.02.052,ok,
246
+ kirc_tcga_pan_can_atlas_2018,29625050,PMC6070353,10.1016/j.cell.2018.03.035,ok,
247
+ kirc_tcga_pan_can_atlas_2018,29617662,PMC5916809,10.1016/j.celrep.2018.03.050,ok,
248
+ kirc_tcga_pan_can_atlas_2018,30643250,PMC12521747,,error;elink,Identifier not found in PMC
249
+ kirc_tcga_pan_can_atlas_2018,32214244,PMC7500457,10.1038/s41586-020-2095-1,ok,
250
+ kirc_tcga_pan_can_atlas_2018,29625049,PMC5916814,10.1016/j.cell.2018.03.033,ok,
251
+ kirc_tcga_pan_can_atlas_2018,29850653,PMC5972025,10.1200/PO.17.00073,ok,
252
+ kirc_tcga_pan_can_atlas_2018,36334560,PMC12390932,,error;elink,Identifier not found in PMC
253
+ kirp_tcga_pan_can_atlas_2018,29625048,PMC5957518,10.1016/j.cell.2018.03.022,ok,
254
+ kirp_tcga_pan_can_atlas_2018,29596782,PMC6075717,10.1016/j.cels.2018.03.002,ok,
255
+ kirp_tcga_pan_can_atlas_2018,29622463,PMC6028190,10.1016/j.ccell.2018.03.007,ok,
256
+ kirp_tcga_pan_can_atlas_2018,29617662,PMC5916809,10.1016/j.celrep.2018.03.050,ok,
257
+ kirp_tcga_pan_can_atlas_2018,29625055,PMC6066282,10.1016/j.cell.2018.02.052,ok,
258
+ kirp_tcga_pan_can_atlas_2018,29625050,PMC6070353,10.1016/j.cell.2018.03.035,ok,
259
+ kirp_tcga_pan_can_atlas_2018,29617662,PMC5916809,10.1016/j.celrep.2018.03.050,ok,
260
+ kirp_tcga_pan_can_atlas_2018,30643250,PMC12521747,,error;elink,Identifier not found in PMC
261
+ kirp_tcga_pan_can_atlas_2018,32214244,PMC7500457,10.1038/s41586-020-2095-1,ok,
262
+ kirp_tcga_pan_can_atlas_2018,29625049,PMC5916814,10.1016/j.cell.2018.03.033,ok,
263
+ kirp_tcga_pan_can_atlas_2018,29850653,PMC5972025,10.1200/PO.17.00073,ok,
264
+ kirp_tcga_pan_can_atlas_2018,36334560,PMC12390932,,error;elink,Identifier not found in PMC
265
+ laml_tcga_pan_can_atlas_2018,29625048,PMC5957518,10.1016/j.cell.2018.03.022,ok,
266
+ laml_tcga_pan_can_atlas_2018,29596782,PMC6075717,10.1016/j.cels.2018.03.002,ok,
267
+ laml_tcga_pan_can_atlas_2018,29622463,PMC6028190,10.1016/j.ccell.2018.03.007,ok,
268
+ laml_tcga_pan_can_atlas_2018,29617662,PMC5916809,10.1016/j.celrep.2018.03.050,ok,
269
+ laml_tcga_pan_can_atlas_2018,29625055,PMC6066282,10.1016/j.cell.2018.02.052,ok,
270
+ laml_tcga_pan_can_atlas_2018,29625050,PMC6070353,10.1016/j.cell.2018.03.035,ok,
271
+ laml_tcga_pan_can_atlas_2018,29617662,PMC5916809,10.1016/j.celrep.2018.03.050,ok,
272
+ laml_tcga_pan_can_atlas_2018,32214244,PMC7500457,10.1038/s41586-020-2095-1,ok,
273
+ laml_tcga_pan_can_atlas_2018,29625049,PMC5916814,10.1016/j.cell.2018.03.033,ok,
274
+ laml_tcga_pan_can_atlas_2018,29850653,PMC5972025,10.1200/PO.17.00073,ok,
275
+ laml_tcga_pan_can_atlas_2018,36334560,PMC12390932,,error;elink,Identifier not found in PMC
276
+ lihc_tcga_pan_can_atlas_2018,29625048,PMC5957518,10.1016/j.cell.2018.03.022,ok,
277
+ lihc_tcga_pan_can_atlas_2018,29596782,PMC6075717,10.1016/j.cels.2018.03.002,ok,
278
+ lihc_tcga_pan_can_atlas_2018,29622463,PMC6028190,10.1016/j.ccell.2018.03.007,ok,
279
+ lihc_tcga_pan_can_atlas_2018,29617662,PMC5916809,10.1016/j.celrep.2018.03.050,ok,
280
+ lihc_tcga_pan_can_atlas_2018,29625055,PMC6066282,10.1016/j.cell.2018.02.052,ok,
281
+ lihc_tcga_pan_can_atlas_2018,29625050,PMC6070353,10.1016/j.cell.2018.03.035,ok,
282
+ lihc_tcga_pan_can_atlas_2018,29617662,PMC5916809,10.1016/j.celrep.2018.03.050,ok,
283
+ lihc_tcga_pan_can_atlas_2018,30643250,PMC12521747,,error;elink,Identifier not found in PMC
284
+ lihc_tcga_pan_can_atlas_2018,32214244,PMC7500457,10.1038/s41586-020-2095-1,ok,
285
+ lihc_tcga_pan_can_atlas_2018,29625049,PMC5916814,10.1016/j.cell.2018.03.033,ok,
286
+ lihc_tcga_pan_can_atlas_2018,29850653,PMC5972025,10.1200/PO.17.00073,ok,
287
+ lihc_tcga_pan_can_atlas_2018,36334560,PMC12390932,,error;elink,Identifier not found in PMC
288
+ luad_tcga_pan_can_atlas_2018,29625048,PMC5957518,10.1016/j.cell.2018.03.022,ok,
289
+ luad_tcga_pan_can_atlas_2018,29596782,PMC6075717,10.1016/j.cels.2018.03.002,ok,
290
+ luad_tcga_pan_can_atlas_2018,29622463,PMC6028190,10.1016/j.ccell.2018.03.007,ok,
291
+ luad_tcga_pan_can_atlas_2018,29617662,PMC5916809,10.1016/j.celrep.2018.03.050,ok,
292
+ luad_tcga_pan_can_atlas_2018,29625055,PMC6066282,10.1016/j.cell.2018.02.052,ok,
293
+ luad_tcga_pan_can_atlas_2018,29625050,PMC6070353,10.1016/j.cell.2018.03.035,ok,
294
+ luad_tcga_pan_can_atlas_2018,29617662,PMC5916809,10.1016/j.celrep.2018.03.050,ok,
295
+ luad_tcga_pan_can_atlas_2018,30643250,PMC12521747,,error;elink,Identifier not found in PMC
296
+ luad_tcga_pan_can_atlas_2018,32214244,PMC7500457,10.1038/s41586-020-2095-1,ok,
297
+ luad_tcga_pan_can_atlas_2018,29625049,PMC5916814,10.1016/j.cell.2018.03.033,ok,
298
+ luad_tcga_pan_can_atlas_2018,29850653,PMC5972025,10.1200/PO.17.00073,ok,
299
+ luad_tcga_pan_can_atlas_2018,36334560,PMC12390932,,error;elink,Identifier not found in PMC
300
+ lusc_tcga_pan_can_atlas_2018,29625048,PMC5957518,10.1016/j.cell.2018.03.022,ok,
301
+ lusc_tcga_pan_can_atlas_2018,29596782,PMC6075717,10.1016/j.cels.2018.03.002,ok,
302
+ lusc_tcga_pan_can_atlas_2018,29622463,PMC6028190,10.1016/j.ccell.2018.03.007,ok,
303
+ lusc_tcga_pan_can_atlas_2018,29617662,PMC5916809,10.1016/j.celrep.2018.03.050,ok,
304
+ lusc_tcga_pan_can_atlas_2018,29625055,PMC6066282,10.1016/j.cell.2018.02.052,ok,
305
+ lusc_tcga_pan_can_atlas_2018,29625050,PMC6070353,10.1016/j.cell.2018.03.035,ok,
306
+ lusc_tcga_pan_can_atlas_2018,29617662,PMC5916809,10.1016/j.celrep.2018.03.050,ok,
307
+ lusc_tcga_pan_can_atlas_2018,30643250,PMC12521747,,error;elink,Identifier not found in PMC
308
+ lusc_tcga_pan_can_atlas_2018,32214244,PMC7500457,10.1038/s41586-020-2095-1,ok,
309
+ lusc_tcga_pan_can_atlas_2018,29625049,PMC5916814,10.1016/j.cell.2018.03.033,ok,
310
+ lusc_tcga_pan_can_atlas_2018,29850653,PMC5972025,10.1200/PO.17.00073,ok,
311
+ lusc_tcga_pan_can_atlas_2018,36334560,PMC12390932,,error;elink,Identifier not found in PMC
312
+ meso_tcga_pan_can_atlas_2018,29625048,PMC5957518,10.1016/j.cell.2018.03.022,ok,
313
+ meso_tcga_pan_can_atlas_2018,29596782,PMC6075717,10.1016/j.cels.2018.03.002,ok,
314
+ meso_tcga_pan_can_atlas_2018,29622463,PMC6028190,10.1016/j.ccell.2018.03.007,ok,
315
+ meso_tcga_pan_can_atlas_2018,29617662,PMC5916809,10.1016/j.celrep.2018.03.050,ok,
316
+ meso_tcga_pan_can_atlas_2018,29625055,PMC6066282,10.1016/j.cell.2018.02.052,ok,
317
+ meso_tcga_pan_can_atlas_2018,29625050,PMC6070353,10.1016/j.cell.2018.03.035,ok,
318
+ meso_tcga_pan_can_atlas_2018,29617662,PMC5916809,10.1016/j.celrep.2018.03.050,ok,
319
+ meso_tcga_pan_can_atlas_2018,32214244,PMC7500457,10.1038/s41586-020-2095-1,ok,
320
+ meso_tcga_pan_can_atlas_2018,29625049,PMC5916814,10.1016/j.cell.2018.03.033,ok,
321
+ meso_tcga_pan_can_atlas_2018,29850653,PMC5972025,10.1200/PO.17.00073,ok,
322
+ meso_tcga_pan_can_atlas_2018,36334560,PMC12390932,,error;elink,Identifier not found in PMC
323
+ ov_tcga_pan_can_atlas_2018,29625048,PMC5957518,10.1016/j.cell.2018.03.022,ok,
324
+ ov_tcga_pan_can_atlas_2018,29596782,PMC6075717,10.1016/j.cels.2018.03.002,ok,
325
+ ov_tcga_pan_can_atlas_2018,29622463,PMC6028190,10.1016/j.ccell.2018.03.007,ok,
326
+ ov_tcga_pan_can_atlas_2018,29617662,PMC5916809,10.1016/j.celrep.2018.03.050,ok,
327
+ ov_tcga_pan_can_atlas_2018,29625055,PMC6066282,10.1016/j.cell.2018.02.052,ok,
328
+ ov_tcga_pan_can_atlas_2018,29625050,PMC6070353,10.1016/j.cell.2018.03.035,ok,
329
+ ov_tcga_pan_can_atlas_2018,29617662,PMC5916809,10.1016/j.celrep.2018.03.050,ok,
330
+ ov_tcga_pan_can_atlas_2018,30643250,PMC12521747,,error;elink,Identifier not found in PMC
331
+ ov_tcga_pan_can_atlas_2018,32214244,PMC7500457,10.1038/s41586-020-2095-1,ok,
332
+ ov_tcga_pan_can_atlas_2018,29625049,PMC5916814,10.1016/j.cell.2018.03.033,ok,
333
+ ov_tcga_pan_can_atlas_2018,29850653,PMC5972025,10.1200/PO.17.00073,ok,
334
+ ov_tcga_pan_can_atlas_2018,36334560,PMC12390932,,error;elink,Identifier not found in PMC
335
+ paad_tcga_pan_can_atlas_2018,29625048,PMC5957518,10.1016/j.cell.2018.03.022,ok,
336
+ paad_tcga_pan_can_atlas_2018,29596782,PMC6075717,10.1016/j.cels.2018.03.002,ok,
337
+ paad_tcga_pan_can_atlas_2018,29622463,PMC6028190,10.1016/j.ccell.2018.03.007,ok,
338
+ paad_tcga_pan_can_atlas_2018,29617662,PMC5916809,10.1016/j.celrep.2018.03.050,ok,
339
+ paad_tcga_pan_can_atlas_2018,29625055,PMC6066282,10.1016/j.cell.2018.02.052,ok,
340
+ paad_tcga_pan_can_atlas_2018,29625050,PMC6070353,10.1016/j.cell.2018.03.035,ok,
341
+ paad_tcga_pan_can_atlas_2018,29617662,PMC5916809,10.1016/j.celrep.2018.03.050,ok,
342
+ paad_tcga_pan_can_atlas_2018,30643250,PMC12521747,,error;elink,Identifier not found in PMC
343
+ paad_tcga_pan_can_atlas_2018,32214244,PMC7500457,10.1038/s41586-020-2095-1,ok,
344
+ paad_tcga_pan_can_atlas_2018,29625049,PMC5916814,10.1016/j.cell.2018.03.033,ok,
345
+ paad_tcga_pan_can_atlas_2018,29850653,PMC5972025,10.1200/PO.17.00073,ok,
346
+ paad_tcga_pan_can_atlas_2018,36334560,PMC12390932,,error;elink,Identifier not found in PMC
347
+ pcpg_tcga_pan_can_atlas_2018,29625048,PMC5957518,10.1016/j.cell.2018.03.022,ok,
348
+ pcpg_tcga_pan_can_atlas_2018,29596782,PMC6075717,10.1016/j.cels.2018.03.002,ok,
349
+ pcpg_tcga_pan_can_atlas_2018,29622463,PMC6028190,10.1016/j.ccell.2018.03.007,ok,
350
+ pcpg_tcga_pan_can_atlas_2018,29617662,PMC5916809,10.1016/j.celrep.2018.03.050,ok,
351
+ pcpg_tcga_pan_can_atlas_2018,29625055,PMC6066282,10.1016/j.cell.2018.02.052,ok,
352
+ pcpg_tcga_pan_can_atlas_2018,29625050,PMC6070353,10.1016/j.cell.2018.03.035,ok,
353
+ pcpg_tcga_pan_can_atlas_2018,29617662,PMC5916809,10.1016/j.celrep.2018.03.050,ok,
354
+ pcpg_tcga_pan_can_atlas_2018,30643250,PMC12521747,,error;elink,Identifier not found in PMC
355
+ pcpg_tcga_pan_can_atlas_2018,32214244,PMC7500457,10.1038/s41586-020-2095-1,ok,
356
+ pcpg_tcga_pan_can_atlas_2018,29625049,PMC5916814,10.1016/j.cell.2018.03.033,ok,
357
+ pcpg_tcga_pan_can_atlas_2018,29850653,PMC5972025,10.1200/PO.17.00073,ok,
358
+ pcpg_tcga_pan_can_atlas_2018,36334560,PMC12390932,,error;elink,Identifier not found in PMC
359
+ prad_tcga_pan_can_atlas_2018,29625048,PMC5957518,10.1016/j.cell.2018.03.022,ok,
360
+ prad_tcga_pan_can_atlas_2018,29596782,PMC6075717,10.1016/j.cels.2018.03.002,ok,
361
+ prad_tcga_pan_can_atlas_2018,29622463,PMC6028190,10.1016/j.ccell.2018.03.007,ok,
362
+ prad_tcga_pan_can_atlas_2018,29617662,PMC5916809,10.1016/j.celrep.2018.03.050,ok,
363
+ prad_tcga_pan_can_atlas_2018,29625055,PMC6066282,10.1016/j.cell.2018.02.052,ok,
364
+ prad_tcga_pan_can_atlas_2018,29625050,PMC6070353,10.1016/j.cell.2018.03.035,ok,
365
+ prad_tcga_pan_can_atlas_2018,29617662,PMC5916809,10.1016/j.celrep.2018.03.050,ok,
366
+ prad_tcga_pan_can_atlas_2018,30643250,PMC12521747,,error;elink,Identifier not found in PMC
367
+ prad_tcga_pan_can_atlas_2018,32214244,PMC7500457,10.1038/s41586-020-2095-1,ok,
368
+ prad_tcga_pan_can_atlas_2018,29625049,PMC5916814,10.1016/j.cell.2018.03.033,ok,
369
+ prad_tcga_pan_can_atlas_2018,29850653,PMC5972025,10.1200/PO.17.00073,ok,
370
+ prad_tcga_pan_can_atlas_2018,36334560,PMC12390932,,error;elink,Identifier not found in PMC
371
+ sarc_tcga_pan_can_atlas_2018,29625048,PMC5957518,10.1016/j.cell.2018.03.022,ok,
372
+ sarc_tcga_pan_can_atlas_2018,29596782,PMC6075717,10.1016/j.cels.2018.03.002,ok,
373
+ sarc_tcga_pan_can_atlas_2018,29622463,PMC6028190,10.1016/j.ccell.2018.03.007,ok,
374
+ sarc_tcga_pan_can_atlas_2018,29617662,PMC5916809,10.1016/j.celrep.2018.03.050,ok,
375
+ sarc_tcga_pan_can_atlas_2018,29625055,PMC6066282,10.1016/j.cell.2018.02.052,ok,
376
+ sarc_tcga_pan_can_atlas_2018,29625050,PMC6070353,10.1016/j.cell.2018.03.035,ok,
377
+ sarc_tcga_pan_can_atlas_2018,29617662,PMC5916809,10.1016/j.celrep.2018.03.050,ok,
378
+ sarc_tcga_pan_can_atlas_2018,32214244,PMC7500457,10.1038/s41586-020-2095-1,ok,
379
+ sarc_tcga_pan_can_atlas_2018,29625049,PMC5916814,10.1016/j.cell.2018.03.033,ok,
380
+ sarc_tcga_pan_can_atlas_2018,29850653,PMC5972025,10.1200/PO.17.00073,ok,
381
+ sarc_tcga_pan_can_atlas_2018,36334560,PMC12390932,,error;elink,Identifier not found in PMC
382
+ skcm_tcga_pan_can_atlas_2018,29625048,PMC5957518,10.1016/j.cell.2018.03.022,ok,
383
+ skcm_tcga_pan_can_atlas_2018,29596782,PMC6075717,10.1016/j.cels.2018.03.002,ok,
384
+ skcm_tcga_pan_can_atlas_2018,29622463,PMC6028190,10.1016/j.ccell.2018.03.007,ok,
385
+ skcm_tcga_pan_can_atlas_2018,29617662,PMC5916809,10.1016/j.celrep.2018.03.050,ok,
386
+ skcm_tcga_pan_can_atlas_2018,29625055,PMC6066282,10.1016/j.cell.2018.02.052,ok,
387
+ skcm_tcga_pan_can_atlas_2018,29625050,PMC6070353,10.1016/j.cell.2018.03.035,ok,
388
+ skcm_tcga_pan_can_atlas_2018,29617662,PMC5916809,10.1016/j.celrep.2018.03.050,ok,
389
+ skcm_tcga_pan_can_atlas_2018,30643250,PMC12521747,,error;elink,Identifier not found in PMC
390
+ skcm_tcga_pan_can_atlas_2018,32214244,PMC7500457,10.1038/s41586-020-2095-1,ok,
391
+ skcm_tcga_pan_can_atlas_2018,29625049,PMC5916814,10.1016/j.cell.2018.03.033,ok,
392
+ skcm_tcga_pan_can_atlas_2018,29850653,PMC5972025,10.1200/PO.17.00073,ok,
393
+ skcm_tcga_pan_can_atlas_2018,36334560,PMC12390932,,error;elink,Identifier not found in PMC
394
+ stad_tcga_pan_can_atlas_2018,29625048,PMC5957518,10.1016/j.cell.2018.03.022,ok,
395
+ stad_tcga_pan_can_atlas_2018,29596782,PMC6075717,10.1016/j.cels.2018.03.002,ok,
396
+ stad_tcga_pan_can_atlas_2018,29622463,PMC6028190,10.1016/j.ccell.2018.03.007,ok,
397
+ stad_tcga_pan_can_atlas_2018,29617662,PMC5916809,10.1016/j.celrep.2018.03.050,ok,
398
+ stad_tcga_pan_can_atlas_2018,29625055,PMC6066282,10.1016/j.cell.2018.02.052,ok,
399
+ stad_tcga_pan_can_atlas_2018,29625050,PMC6070353,10.1016/j.cell.2018.03.035,ok,
400
+ stad_tcga_pan_can_atlas_2018,29617662,PMC5916809,10.1016/j.celrep.2018.03.050,ok,
401
+ stad_tcga_pan_can_atlas_2018,32214244,PMC7500457,10.1038/s41586-020-2095-1,ok,
402
+ stad_tcga_pan_can_atlas_2018,29625049,PMC5916814,10.1016/j.cell.2018.03.033,ok,
403
+ stad_tcga_pan_can_atlas_2018,29850653,PMC5972025,10.1200/PO.17.00073,ok,
404
+ stad_tcga_pan_can_atlas_2018,36334560,PMC12390932,,error;elink,Identifier not found in PMC
405
+ tgct_tcga_pan_can_atlas_2018,29625048,PMC5957518,10.1016/j.cell.2018.03.022,ok,
406
+ tgct_tcga_pan_can_atlas_2018,29596782,PMC6075717,10.1016/j.cels.2018.03.002,ok,
407
+ tgct_tcga_pan_can_atlas_2018,29622463,PMC6028190,10.1016/j.ccell.2018.03.007,ok,
408
+ tgct_tcga_pan_can_atlas_2018,29617662,PMC5916809,10.1016/j.celrep.2018.03.050,ok,
409
+ tgct_tcga_pan_can_atlas_2018,29625055,PMC6066282,10.1016/j.cell.2018.02.052,ok,
410
+ tgct_tcga_pan_can_atlas_2018,29625050,PMC6070353,10.1016/j.cell.2018.03.035,ok,
411
+ tgct_tcga_pan_can_atlas_2018,29617662,PMC5916809,10.1016/j.celrep.2018.03.050,ok,
412
+ tgct_tcga_pan_can_atlas_2018,32214244,PMC7500457,10.1038/s41586-020-2095-1,ok,
413
+ tgct_tcga_pan_can_atlas_2018,29625049,PMC5916814,10.1016/j.cell.2018.03.033,ok,
414
+ tgct_tcga_pan_can_atlas_2018,29850653,PMC5972025,10.1200/PO.17.00073,ok,
415
+ tgct_tcga_pan_can_atlas_2018,36334560,PMC12390932,,error;elink,Identifier not found in PMC
416
+ thca_tcga_pan_can_atlas_2018,29625048,PMC5957518,10.1016/j.cell.2018.03.022,ok,
417
+ thca_tcga_pan_can_atlas_2018,29596782,PMC6075717,10.1016/j.cels.2018.03.002,ok,
418
+ thca_tcga_pan_can_atlas_2018,29622463,PMC6028190,10.1016/j.ccell.2018.03.007,ok,
419
+ thca_tcga_pan_can_atlas_2018,29617662,PMC5916809,10.1016/j.celrep.2018.03.050,ok,
420
+ thca_tcga_pan_can_atlas_2018,29625055,PMC6066282,10.1016/j.cell.2018.02.052,ok,
421
+ thca_tcga_pan_can_atlas_2018,29625050,PMC6070353,10.1016/j.cell.2018.03.035,ok,
422
+ thca_tcga_pan_can_atlas_2018,29617662,PMC5916809,10.1016/j.celrep.2018.03.050,ok,
423
+ thca_tcga_pan_can_atlas_2018,30643250,PMC12521747,,error;elink,Identifier not found in PMC
424
+ thca_tcga_pan_can_atlas_2018,32214244,PMC7500457,10.1038/s41586-020-2095-1,ok,
425
+ thca_tcga_pan_can_atlas_2018,29625049,PMC5916814,10.1016/j.cell.2018.03.033,ok,
426
+ thca_tcga_pan_can_atlas_2018,29850653,PMC5972025,10.1200/PO.17.00073,ok,
427
+ thca_tcga_pan_can_atlas_2018,36334560,PMC12390932,,error;elink,Identifier not found in PMC
428
+ thym_tcga_pan_can_atlas_2018,29625048,PMC5957518,10.1016/j.cell.2018.03.022,ok,
429
+ thym_tcga_pan_can_atlas_2018,29596782,PMC6075717,10.1016/j.cels.2018.03.002,ok,
430
+ thym_tcga_pan_can_atlas_2018,29622463,PMC6028190,10.1016/j.ccell.2018.03.007,ok,
431
+ thym_tcga_pan_can_atlas_2018,29617662,PMC5916809,10.1016/j.celrep.2018.03.050,ok,
432
+ thym_tcga_pan_can_atlas_2018,29625055,PMC6066282,10.1016/j.cell.2018.02.052,ok,
433
+ thym_tcga_pan_can_atlas_2018,29625050,PMC6070353,10.1016/j.cell.2018.03.035,ok,
434
+ thym_tcga_pan_can_atlas_2018,29617662,PMC5916809,10.1016/j.celrep.2018.03.050,ok,
435
+ thym_tcga_pan_can_atlas_2018,32214244,PMC7500457,10.1038/s41586-020-2095-1,ok,
436
+ thym_tcga_pan_can_atlas_2018,29625049,PMC5916814,10.1016/j.cell.2018.03.033,ok,
437
+ thym_tcga_pan_can_atlas_2018,29850653,PMC5972025,10.1200/PO.17.00073,ok,
438
+ thym_tcga_pan_can_atlas_2018,36334560,PMC12390932,,error;elink,Identifier not found in PMC
439
+ ucec_tcga_pan_can_atlas_2018,29625048,PMC5957518,10.1016/j.cell.2018.03.022,ok,
440
+ ucec_tcga_pan_can_atlas_2018,29596782,PMC6075717,10.1016/j.cels.2018.03.002,ok,
441
+ ucec_tcga_pan_can_atlas_2018,29622463,PMC6028190,10.1016/j.ccell.2018.03.007,ok,
442
+ ucec_tcga_pan_can_atlas_2018,29617662,PMC5916809,10.1016/j.celrep.2018.03.050,ok,
443
+ ucec_tcga_pan_can_atlas_2018,29625055,PMC6066282,10.1016/j.cell.2018.02.052,ok,
444
+ ucec_tcga_pan_can_atlas_2018,29625050,PMC6070353,10.1016/j.cell.2018.03.035,ok,
445
+ ucec_tcga_pan_can_atlas_2018,29617662,PMC5916809,10.1016/j.celrep.2018.03.050,ok,
446
+ ucec_tcga_pan_can_atlas_2018,30643250,PMC12521747,,error;elink,Identifier not found in PMC
447
+ ucec_tcga_pan_can_atlas_2018,32214244,PMC7500457,10.1038/s41586-020-2095-1,ok,
448
+ ucec_tcga_pan_can_atlas_2018,29625049,PMC5916814,10.1016/j.cell.2018.03.033,ok,
449
+ ucec_tcga_pan_can_atlas_2018,29850653,PMC5972025,10.1200/PO.17.00073,ok,
450
+ ucec_tcga_pan_can_atlas_2018,36334560,PMC12390932,,error;elink,Identifier not found in PMC
451
+ ucs_tcga_pan_can_atlas_2018,29625048,PMC5957518,10.1016/j.cell.2018.03.022,ok,
452
+ ucs_tcga_pan_can_atlas_2018,29596782,PMC6075717,10.1016/j.cels.2018.03.002,ok,
453
+ ucs_tcga_pan_can_atlas_2018,29622463,PMC6028190,10.1016/j.ccell.2018.03.007,ok,
454
+ ucs_tcga_pan_can_atlas_2018,29617662,PMC5916809,10.1016/j.celrep.2018.03.050,ok,
455
+ ucs_tcga_pan_can_atlas_2018,29625055,PMC6066282,10.1016/j.cell.2018.02.052,ok,
456
+ ucs_tcga_pan_can_atlas_2018,29625050,PMC6070353,10.1016/j.cell.2018.03.035,ok,
457
+ ucs_tcga_pan_can_atlas_2018,29617662,PMC5916809,10.1016/j.celrep.2018.03.050,ok,
458
+ ucs_tcga_pan_can_atlas_2018,32214244,PMC7500457,10.1038/s41586-020-2095-1,ok,
459
+ ucs_tcga_pan_can_atlas_2018,29625049,PMC5916814,10.1016/j.cell.2018.03.033,ok,
460
+ ucs_tcga_pan_can_atlas_2018,29850653,PMC5972025,10.1200/PO.17.00073,ok,
461
+ ucs_tcga_pan_can_atlas_2018,36334560,PMC12390932,,error;elink,Identifier not found in PMC
462
+ uvm_tcga_pan_can_atlas_2018,29625048,PMC5957518,10.1016/j.cell.2018.03.022,ok,
463
+ uvm_tcga_pan_can_atlas_2018,29596782,PMC6075717,10.1016/j.cels.2018.03.002,ok,
464
+ uvm_tcga_pan_can_atlas_2018,29622463,PMC6028190,10.1016/j.ccell.2018.03.007,ok,
465
+ uvm_tcga_pan_can_atlas_2018,29617662,PMC5916809,10.1016/j.celrep.2018.03.050,ok,
466
+ uvm_tcga_pan_can_atlas_2018,29625055,PMC6066282,10.1016/j.cell.2018.02.052,ok,
467
+ uvm_tcga_pan_can_atlas_2018,29625050,PMC6070353,10.1016/j.cell.2018.03.035,ok,
468
+ uvm_tcga_pan_can_atlas_2018,29617662,PMC5916809,10.1016/j.celrep.2018.03.050,ok,
469
+ uvm_tcga_pan_can_atlas_2018,32214244,PMC7500457,10.1038/s41586-020-2095-1,ok,
470
+ uvm_tcga_pan_can_atlas_2018,29625049,PMC5916814,10.1016/j.cell.2018.03.033,ok,
471
+ uvm_tcga_pan_can_atlas_2018,29850653,PMC5972025,10.1200/PO.17.00073,ok,
472
+ uvm_tcga_pan_can_atlas_2018,36334560,PMC12390932,,error;elink,Identifier not found in PMC
473
+ coad_silu_2022,37202560,PMC10202816,10.1038/s41591-023-02324-5,ok,
474
+ acc_tcga_pan_can_atlas_2018,29625048,PMC5957518,10.1016/j.cell.2018.03.022,ok,
475
+ acc_tcga_pan_can_atlas_2018,29596782,PMC6075717,10.1016/j.cels.2018.03.002,ok,
476
+ acc_tcga_pan_can_atlas_2018,29622463,PMC6028190,10.1016/j.ccell.2018.03.007,ok,
477
+ acc_tcga_pan_can_atlas_2018,29617662,PMC5916809,10.1016/j.celrep.2018.03.050,ok,
478
+ acc_tcga_pan_can_atlas_2018,29625055,PMC6066282,10.1016/j.cell.2018.02.052,ok,
479
+ acc_tcga_pan_can_atlas_2018,29625050,PMC6070353,10.1016/j.cell.2018.03.035,ok,
480
+ acc_tcga_pan_can_atlas_2018,29617662,PMC5916809,10.1016/j.celrep.2018.03.050,ok,
481
+ acc_tcga_pan_can_atlas_2018,32214244,PMC7500457,10.1038/s41586-020-2095-1,ok,
482
+ acc_tcga_pan_can_atlas_2018,29625049,PMC5916814,10.1016/j.cell.2018.03.033,ok,
483
+ acc_tcga_pan_can_atlas_2018,29850653,PMC5972025,10.1200/PO.17.00073,ok,
484
+ acc_tcga_pan_can_atlas_2018,36334560,PMC12390932,,error;elink,Identifier not found in PMC
485
+ msk_chord_2024,39506116,PMC11655358,10.1038/s41586-024-08167-5,ok,
486
+ pancan_mappyacts_2022,35292802,PMC9394403,10.1158/2159-8290.CD-21-1136,ok,
487
+ msk_met_2021,35120664,PMC9147702,10.1016/j.cell.2022.01.003,ok,
488
+ blca_msk_2024,39499893,PMC12088707,10.1200/PO.24.00287,ok,
489
+ brca_fuscc_2020,32719455,PMC8027015,10.1038/s41422-020-0375-9,ok,
490
+ thyroid_gatci_2024,38412093,PMC11077417,10.1016/j.celrep.2024.113826,ok,
491
+ braf_msk_impact_2024,38922339,PMC11371517,10.1158/1078-0432.CCR-23-3981,ok,
492
+ bcc_unige_2016,26950094,PMC12448954,,error;elink,Identifier not found in PMC
493
+ ampca_bcm_2016,26804919,PMC4982376,10.1016/j.celrep.2015.12.005,ok,
494
+ blca_dfarber_mskcc_2014,25096233,PMC4238969,10.1158/2159-8290.CD-14-0623,ok,
495
+ blca_mskcc_solit_2012,23897969,PMC3753703,10.1200/JCO.2012.46.5740,ok,
496
+ blca_bgi,24121792,PMC7512009,10.1038/ng.2798,ok,
497
+ all_stjude_2013,23334668,PMC3919793,10.1038/ng.2532,ok,
498
+ acyc_mskcc_2013,23685749,PMC3708595,10.1038/ng.2643,ok,
499
+ acyc_jhu_2016,26862087,PMC4818686,10.1158/1940-6207.CAPR-15-0316,ok,
500
+ acyc_mda_2015,26631609,PMC4807116,10.1158/1078-0432.CCR-15-2867-T,ok,
501
+ acyc_sanger_2013,23778141,PMC3999050,10.1172/JCI67201,ok,
502
+ all_stjude_2016,27776115,PMC5144107,10.1038/ng.3691,ok,
503
+ angs_project_painter_2018,32042194,PMC12534164,,error;elink,Identifier not found in PMC
504
+ bfn_duke_nus_2015,26437033,PMC12458290,,error;elink,Identifier not found in PMC
505
+ blca_cornell_2016,27749842,PMC5549141,10.1038/ng.3692,ok,
506
+ aml_ohsu_2018,30333627,PMC6280667,10.1038/s41586-018-0623-z,ok,
507
+ blca_bcan_hcrn_2022,36333289,PMC9636269,10.1038/s41467-022-33980-9,ok,
508
+ aml_ohsu_2022,35868306,PMC9378589,10.1016/j.ccell.2022.07.002,ok,
509
+ asclc_msk_2024,39185963,PMC11726019,10.1158/2159-8290.CD-24-0286,ok,
510
+ brca_bccrc_xenograft_2014,25470049,PMC4864027,10.1038/nature13952,ok,
511
+ brca_broad,22722202,PMC4148686,10.1038/nature11154,ok,
512
+ brca_bccrc,22495314,PMC3863681,10.1038/nature10933,ok,
513
+ brca_igr_2015,28027327,PMC5189935,10.1371/journal.pmed.1002201,ok,
514
+ blca_tcga_pub_2017,28988769,PMC5687509,10.1016/j.cell.2017.09.007,ok,
515
+ brca_mskcc_2019,31552290,PMC6757060,10.1038/s41523-019-0126-6,ok,
516
+ brca_jup_msk_2020,33263939,PMC7869928,10.1002/cjp2.190,ok,
517
+ brain_cptac_2020,33242424,PMC8143193,10.1016/j.cell.2020.10.044,ok,
518
+ brca_cptac_2020,33212010,PMC8077737,10.1016/j.cell.2020.10.036,ok,
519
+ brca_dfci_2020,32404308,PMC8815415,10.1158/2159-8290.CD-19-1390,ok,
520
+ brca_sanger,22722201,PMC3428862,10.1038/nature11017,ok,
521
+ brca_tcga_pub,23000897,PMC3465532,10.1038/nature11412,ok,
522
+ breast_msk_2018,30205045,PMC6327853,10.1016/j.ccell.2018.08.008,ok,
523
+ breast_alpelisib_2020,32864625,PMC7450824,10.1038/s43018-020-0047-1,ok,
524
+ brca_smc_2018,29713003,PMC5928087,10.1038/s41467-018-04129-4,ok,
525
+ breast_ink4_msk_2021,34544752,PMC8831444,10.1158/2159-8290.CD-20-1726,ok,
526
+ brca_pareja_msk_2020,32220886,PMC7367727,10.1158/1078-0432.CCR-19-2563,ok,
527
+ crc_msk_2017,29316426,PMC5765991,10.1016/j.ccell.2017.12.004,ok,
528
+ pancan_pcawg_2020,32025007,PMC7025898,10.1038/s41586-020-1969-6,ok,
529
+ pdac_msk_2024,39753968,PMC11835752,10.1038/s41591-024-03362-3,ok,
530
+ braf_msk_archer_2024,38922339,PMC11371517,10.1158/1078-0432.CCR-23-3981,ok,
531
+ sarcoma_ucla_2024,39305899,PMC12318355,10.1016/j.stem.2024.08.010,ok,
532
+ csf_msk_2024,39289779,PMC11406943,10.1186/s40478-024-01846-4,ok,
533
+ normal_skin_fibroblast_2024,39091884,PMC11291049,10.1101/2024.07.23.604673,ok,
534
+ normal_skin_keratinocytes_2024,39091884,PMC11291049,10.1101/2024.07.23.604673,ok,
535
+ normal_skin_melanocytes_2024,33029006,PMC7581540,10.1038/s41586-020-2785-8,ok,
536
+ normal_skin_melanocytes_2024,39091884,PMC11291049,10.1101/2024.07.23.604673,ok,
537
+ normal_skin_melanocytes_2024,38895302,PMC11185634,10.1101/2024.06.04.597225,ok,
538
+ normal_skin_melanocytes_2024,39975212,PMC11839034,10.1101/2025.02.07.637114,ok,
539
+ chl_sccc_2023,36723991,PMC10150291,10.1158/2643-3230.BCD-22-0128,ok,
540
+ blca_msk_2025,40256659,PMC12008543,10.1016/j.euros.2025.03.009,ok,
541
+ esca_broad,23525077,PMC3678719,10.1038/ng.2591,ok,
542
+ escc_icgc,24670651,PMC12524419,,error;elink,Identifier not found in PMC
543
+ es_iocurie_2014,25223734,PMC4264969,10.1158/2159-8290.CD-14-0622,ok,
544
+ gbc_shanghai_2014,24997986,PMC12363069,,error;elink,Identifier not found in PMC
545
+ egc_tmucih_2015,25583476,PMC4313862,10.1073/pnas.1422640112,ok,
546
+ egc_msk_2017,29122777,PMC5813492,10.1158/2159-8290.CD-17-0787,ok,
547
+ dlbcl_duke_2017,28985567,PMC5659841,10.1016/j.cell.2017.09.027,ok,
548
+ dlbcl_dfci_2018,29713087,PMC6613387,10.1038/s41591-018-0016-8,ok,
549
+ gbc_msk_2018,30427539,PMC6636637,10.1002/cncr.31850,ok,
550
+ egc_trap_msk_2020,32437664,PMC8229851,10.1016/S1470-2045(20)30169-8,ok,
551
+ egc_mskcc_2020,33795256,PMC8228505,10.1158/1078-0432.CCR-20-4707,ok,
552
+ egc_trap_ccr_msk_2023,37406106,PMC10502449,10.1158/1078-0432.CCR-22-3769,ok,
553
+ hnsc_broad,21798893,PMC3415217,10.1126/science.1208130,ok,
554
+ hnc_mskcc_2016,27442865,PMC5253129,10.1001/jamaoncol.2016.1790,ok,
555
+ hcc_inserm_fr_2015,25822088,PMC4587544,10.1038/ng.3252,ok,
556
+ gct_msk_2016,27646943,PMC5477828,10.1200/JCO.2016.68.7798,ok,
557
+ hcc_mskimpact_2018,30373752,PMC6689131,10.1158/1078-0432.CCR-18-2293,ok,
558
+ glioma_mskcc_2019,31263031,PMC6753053,10.1158/1078-0432.CCR-19-0032,ok,
559
+ glioma_msk_2018,30675060,PMC6457907,10.1038/s41586-019-0882-3,ok,
560
+ hccihch_pku_2019,31130341,PMC8317046,10.1016/j.ccell.2019.04.007,ok,
561
+ hgsoc_msk_2021,34819508,PMC8613272,10.1038/s41525-021-00259-9,ok,
562
+ hcc_meric_2021,35508466,PMC9068765,10.1038/s41467-022-29960-8,ok,
563
+ hcc_clca_2024,38355797,PMC12548999,,error;elink,Identifier not found in PMC
564
+ kirc_bgi,22138691,PMC12432380,,error;elink,Identifier not found in PMC
565
+ kich_tcga_pub,25155756,PMC4160352,10.1016/j.ccr.2014.07.014,ok,
566
+ hnsc_jhu,21798897,PMC3162986,10.1126/science.1206923,ok,
567
+ hnsc_mdanderson_2013,23619168,PMC3858325,10.1158/2159-8290.CD-12-0537,ok,
568
+ ihch_smmu_2014,25526346,PMC12468851,,error;elink,Identifier not found in PMC
569
+ ihch_mskcc_2020,33963001,PMC8282702,10.1158/1078-0432.CCR-21-0412,ok,
570
+ ihch_msk_2021,33765338,PMC8713028,10.1002/hep.31829,ok,
571
+ lgg_ucsf_2014,24336570,PMC3998672,10.1126/science.1239947,ok,
572
+ lgggbm_tcga_pub,26824661,PMC4754110,10.1016/j.cell.2015.12.028,ok,
573
+ lihc_amc_prv,24798001,PMC12452113,,error;elink,Identifier not found in PMC
574
+ lihc_riken,22634756,PMC12529571,,error;elink,Identifier not found in PMC
575
+ luad_mskcc_2015,25765070,PMC4993154,10.1126/science.aaa1348,ok,
576
+ luad_broad,22980975,PMC3557932,10.1016/j.cell.2012.08.029,ok,
577
+ liad_inserm_fr_2014,24735922,PMC12261305,,error;elink,Identifier not found in PMC
578
+ lcll_broad_2013,23415222,PMC3575604,10.1016/j.cell.2013.01.019,ok,
579
+ luad_msk_npjpo_2021,34290393,PMC8295366,10.1038/s41698-021-00210-2,ok,
580
+ luad_cptac_2020,32649874,PMC7373300,10.1016/j.cell.2020.06.013,ok,
581
+ lusc_tcga_pub,22960745,PMC3466113,10.1038/nature11404,ok,
582
+ mbl_broad_2012,22820256,PMC3413789,10.1038/nature11329,ok,
583
+ mbl_icgc,22832583,PMC3662966,10.1038/nature11284,ok,
584
+ mbl_pcgp,22722829,PMC3412905,10.1038/nature11213,ok,
585
+ lung_msk_2017,28336552,PMC5482929,10.1158/2159-8290.CD-16-1337,ok,
586
+ luad_mskcc_2020,32791233,PMC7704768,10.1016/j.jtho.2020.08.005,ok,
587
+ luad_oncosg_2020,32015526,PMC12443039,,error;elink,Identifier not found in PMC
588
+ lung_smc_2016,27634761,PMC10937974,,error;elink,Identifier not found in PMC
589
+ lung_pdx_msk_2021,35440124,PMC9018685,10.1038/s41467-022-29794-4,ok,
590
+ mbl_dkfz_2017,28726821,PMC5905700,10.1038/nature22973,ok,
591
+ lusc_cptac_2021,34358469,PMC8475722,10.1016/j.cell.2021.07.016,ok,
592
+ lung_nci_2022,34493867,PMC8432745,10.1038/s41588-021-00920-0,ok,
593
+ mm_broad,24434212,PMC4241387,10.1016/j.ccr.2013.12.015,ok,
594
+ mcl_idibips_2013,24145436,PMC3831489,10.1073/pnas.1314608110,ok,
595
+ mds_tokyo_2011,21909114,PMC12550706,,error;elink,Identifier not found in PMC
596
+ mel_tsam_liang_2017,28373299,PMC5378171,10.1101/gr.213348.116,ok,
597
+ mel_ucla_2016,26997480,PMC4808437,10.1016/j.cell.2016.02.065,ok,
598
+ mixed_allen_2018,30150660,PMC6119118,10.1038/s41588-018-0200-2,ok,
599
+ mixed_selpercatinib_2020,35304457,PMC8933489,10.1038/s41467-022-28848-x,ok,
600
+ mixed_cfdna_msk_2020,34059130,PMC8165771,10.1186/s13073-021-00898-8,ok,
601
+ mel_dfci_2019,31792460,PMC6898788,10.1038/s41591-019-0654-5,ok,
602
+ mel_mskimpact_2020,33509808,PMC8046739,10.1158/1078-0432.CCR-20-4189,ok,
603
+ mbn_sfu_2023,36201743,PMC10023728,10.1182/blood.2022016534,ok,
604
+ mbn_msk_2024,38497151,PMC11215372,10.3324/haematol.2023.284565,ok,
605
+ npc_nusingapore,24952746,PMC12468275,,error;elink,Identifier not found in PMC
606
+ nepc_wcm_2016,26855148,PMC4777652,10.1038/nm.4045,ok,
607
+ nbl_ucologne_2015,26466568,PMC4881306,10.1038/nature14980,ok,
608
+ nbl_broad_2013,23334666,PMC3682833,10.1038/ng.2529,ok,
609
+ mrt_bcgsc_2016,26977886,PMC5094835,10.1016/j.ccell.2016.02.009,ok,
610
+ mpn_cimr_2013,24325359,PMC3966280,10.1056/NEJMoa1312542,ok,
611
+ nsclc_mskcc_2015,25765070,PMC4993154,10.1126/science.aaa1348,ok,
612
+ nsclc_mskcc_2018,29657128,PMC5953836,10.1016/j.ccell.2018.03.018,ok,
613
+ msk_access_2021,34145282,PMC8213710,10.1038/s41467-021-24109-5,ok,
614
+ mng_utoronto_2021,34433969,PMC11604310,10.1038/s41586-021-03850-3,ok,
615
+ mpnst_mskcc,25240281,PMC4249650,10.1038/ng.3095,ok,
616
+ nbl_amc_2012,22367537,PMC12508281,,error;elink,Identifier not found in PMC
617
+ nccrcc_genentech_2014,25401301,PMC4489427,10.1038/ng.3146,ok,
618
+ ov_tcga_pub,21720365,PMC3163504,10.1038/nature10166,ok,
619
+ paac_jhu_2014,24293293,PMC4048021,10.1002/path.4310,ok,
620
+ paad_icgc,23103869,PMC3530898,10.1038/nature11547,ok,
621
+ paad_utsw_2015,25855536,PMC4403382,10.1038/ncomms7744,ok,
622
+ nsclc_tcga_broad_2016,27158780,PMC4884143,10.1038/ng.3564,ok,
623
+ paad_qcmg_uq_2016,26909576,PMC12553238,,error;elink,Identifier not found in PMC
624
+ pact_jhu_2011,22158988,PMC3248495,10.1073/pnas.1118046108,ok,
625
+ nsclc_tracerx_2017,28445112,PMC12553238,,error;elink,Identifier not found in PMC
626
+ nsclc_tracerx_2017,28445469,PMC5812436,10.1038/nature22364,ok,
627
+ nsclc_pd1_msk_2018,29337640,PMC6075848,10.1200/JCO.2017.75.3384,ok,
628
+ ntrk_msk_2019,31871300,PMC7124988,10.1158/1078-0432.CCR-19-3165,ok,
629
+ pan_origimed_2020,35871175,PMC9308789,10.1038/s41467-022-31780-9,ok,
630
+ paad_cptac_2021,34534465,PMC8654574,10.1016/j.cell.2021.08.023,ok,
631
+ nst_nfosi_ntap,32561749,PMC7305302,10.1038/s41597-020-0508-5,ok,
632
+ panet_jhu_2011,21252315,PMC3144496,10.1126/science.1200609,ok,
633
+ pcnsl_mayo_2015,25991819,PMC4558226,10.1158/1078-0432.CCR-14-2116,ok,
634
+ prad_broad,22610119,PMC3673022,10.1038/ng.2279,ok,
635
+ crc_hta11_htan_2021,34910928,PMC8941949,10.1016/j.cell.2021.11.031,ok,
636
+ panet_shanghai_2013,24326773,PMC12056210,,error;elink,Identifier not found in PMC
637
+ plmeso_nyu_2015,25488749,PMC12404184,,error;elink,Identifier not found in PMC
638
+ prad_cpcg_2017,28068672,PMC12513462,,error;elink,Identifier not found in PMC
639
+ panet_arcnet_2017,28199314,PMC12533323,,error;elink,Identifier not found in PMC
640
+ past_dkfz_heidelberg_2013,23817572,PMC3951336,10.1038/ng.2682,ok,
641
+ prad_eururol_2017,28927585,PMC12508145,,error;elink,Identifier not found in PMC
642
+ prad_fhcrc,26928463,PMC5045679,10.1038/nm.4053,ok,
643
+ prad_mich,22722839,PMC3396711,10.1038/nature11125,ok,
644
+ prad_mskcc_2014,25024180,PMC4121784,10.1073/pnas.1411446111,ok,
645
+ prad_su2c_2015,26000489,PMC4484602,10.1016/j.cell.2015.05.001,ok,
646
+ prad_mskcc_2017,28825054,PMC5558263,10.1200/PO.17.00029,ok,
647
+ prad_p1000,29610475,PMC6107367,10.1038/s41588-018-0078-z,ok,
648
+ prad_su2c_2019,31061129,PMC6561293,10.1073/pnas.1902651116,ok,
649
+ prostate_dkfz_2018,30537516,PMC7444093,10.1016/j.ccell.2018.10.016,ok,
650
+ prad_msk_2019,31564440,PMC6949382,10.1016/j.cmet.2019.08.024,ok,
651
+ prad_mskcc_cheny1_organoids_2014,25201530,PMC4237931,10.1016/j.cell.2014.08.016,ok,
652
+ prad_mcspc_mskcc_2020,32220891,PMC7334067,10.1158/1078-0432.CCR-20-0168,ok,
653
+ prad_msk_stopsack_2021,34667026,PMC8776579,10.1158/1078-0432.CCR-21-2577,ok,
654
+ prostate_pcbm_swiss_2019,35504881,PMC9065149,10.1038/s41467-022-30003-5,ok,
655
+ sclc_clcgp,22941188,PMC4915822,10.1038/ng.2396,ok,
656
+ sclc_jhu,22941189,PMC3557461,10.1038/ng.2405,ok,
657
+ skcm_broad,22817889,PMC3600117,10.1016/j.cell.2012.06.024,ok,
658
+ rms_nih_2014,24436047,PMC4462130,10.1158/2159-8290.CD-13-0639,ok,
659
+ sarc_tcga_pub,29100075,PMC5693358,10.1016/j.cell.2017.10.014,ok,
660
+ sclc_cancercell_gardner_2017,28196596,PMC5313262,10.1016/j.ccell.2017.01.006,ok,
661
+ sclc_ucologne_2015,26168399,PMC4861069,10.1038/nature14664,ok,
662
+ sarcoma_mskcc_2022,35705560,PMC9200818,10.1038/s41467-022-30453-x,ok,
663
+ skcm_broad_dfarber,22622578,PMC3367798,10.1038/nature11071,ok,
664
+ skcm_yale,22842228,PMC3432702,10.1038/ng.2359,ok,
665
+ stad_pfizer_uhongkong,24816253,PMC12468435,,error;elink,Identifier not found in PMC
666
+ skcm_broad_brafresist_2012,24265153,PMC3947264,10.1158/2159-8290.CD-13-0617,ok,
667
+ skcm_mskcc_2014,25409260,PMC4315319,10.1056/NEJMoa1406498,ok,
668
+ skcm_tcga_pub_2015,26091043,PMC4580370,10.1016/j.cell.2015.05.044,ok,
669
+ skcm_dfci_2015,26359337,PMC5054517,10.1126/science.aad0095,ok,
670
+ stad_uhongkong,22037554,PMC12489181,,error;elink,Identifier not found in PMC
671
+ stad_utokyo,24816255,PMC12523349,,error;elink,Identifier not found in PMC
672
+ tet_nci_2014,24974848,PMC5705185,10.1038/ng.3016,ok,
673
+ thyroid_mskcc_2016,26878173,PMC4767360,10.1172/JCI85271,ok,
674
+ stes_tcga_pub,28052061,PMC5651175,10.1038/nature20805,ok,
675
+ summit_2018,29420467,PMC5808581,10.1038/nature25475,ok,
676
+ stmyec_wcm_2022,36577525,PMC9808553,10.1101/mcs.a006227,ok,
677
+ ucs_jhu_2014,25233892,PMC4354107,10.1038/ncomms6006,ok,
678
+ ucec_tcga_pub,23636398,PMC3704730,10.1038/nature12113,ok,
679
+ um_qimr_2016,26683228,PMC4826231,10.18632/oncotarget.6614,ok,
680
+ ucec_msk_2018,30068706,PMC6279519,10.1158/1078-0432.CCR-18-0412,ok,
681
+ uccc_nih_2017,28485815,PMC5587124,10.1002/cncr.30745,ok,
682
+ tmb_mskcc_2018,30643254,PMC6365097,10.1038/s41588-018-0312-8,ok,
683
+ ucec_cptac_2020,32059776,PMC7233456,10.1016/j.cell.2020.01.026,ok,
684
+ ucec_ccr_cfdna_msk_2022,36007103,PMC9852004,10.1158/1078-0432.CCR-22-1134,ok,
685
+ vsc_cuk_2018,29422544,PMC5903820,10.1038/emm.2017.265,ok,
686
+ utuc_cornell_baylor_mdacc_2019,31278255,PMC6611775,10.1038/s41467-019-10873-y,ok,
687
+ usarc_msk_2020,32299819,PMC7367750,10.1158/1078-0432.CCR-19-3959,ok,
688
+ utuc_igbmc_2021,33397444,PMC7780630,10.1186/s13059-020-02230-w,ok,
689
+ lgg_tcga_pan_can_atlas_2018,29625048,PMC5957518,10.1016/j.cell.2018.03.022,ok,
690
+ lgg_tcga_pan_can_atlas_2018,29596782,PMC6075717,10.1016/j.cels.2018.03.002,ok,
691
+ lgg_tcga_pan_can_atlas_2018,29622463,PMC6028190,10.1016/j.ccell.2018.03.007,ok,
692
+ lgg_tcga_pan_can_atlas_2018,29617662,PMC5916809,10.1016/j.celrep.2018.03.050,ok,
693
+ lgg_tcga_pan_can_atlas_2018,29625055,PMC6066282,10.1016/j.cell.2018.02.052,ok,
694
+ lgg_tcga_pan_can_atlas_2018,29625050,PMC6070353,10.1016/j.cell.2018.03.035,ok,
695
+ lgg_tcga_pan_can_atlas_2018,29617662,PMC5916809,10.1016/j.celrep.2018.03.050,ok,
696
+ lgg_tcga_pan_can_atlas_2018,30643250,PMC12521747,,error;elink,Identifier not found in PMC
697
+ lgg_tcga_pan_can_atlas_2018,32214244,PMC7500457,10.1038/s41586-020-2095-1,ok,
698
+ lgg_tcga_pan_can_atlas_2018,29625049,PMC5916814,10.1016/j.cell.2018.03.033,ok,
699
+ lgg_tcga_pan_can_atlas_2018,29850653,PMC5972025,10.1200/PO.17.00073,ok,
700
+ lgg_tcga_pan_can_atlas_2018,36334560,PMC12390932,,error;elink,Identifier not found in PMC
701
+ crc_orion_2024,39386479,PMC11463659,10.1101/2024.09.24.614701,ok,
702
+ brca_aurora_2023,36585450,PMC9886551,10.1038/s43018-022-00491-x,ok,
703
+ schw_ctf_synodos_2025,33025139,PMC7785562,10.1007/s00401-020-02230-x,ok,
704
+ ovary_geomx_gray_foundation_2024,39386723,PMC11463462,10.1101/2024.09.25.615007,ok,
705
+ brca_tcga_pub2015,26451490,PMC4603750,10.1016/j.cell.2015.09.033,ok,
706
+ hnsc_tcga_pub,25631445,PMC4311405,10.1038/nature14129,ok,
707
+ luad_tcga_pub,25079552,PMC4231481,10.1038/nature13385,ok,
708
+ thca_tcga_pub,25417114,PMC4243044,10.1016/j.cell.2014.09.050,ok,
709
+ blca_tcga_pub,24476821,PMC3962515,10.1038/nature12965,ok,
710
+ msk_ch_2020,33106634,PMC7891089,10.1038/s41588-020-00710-0,ok,
711
+ msk_spectrum_tme_2022,36517593,PMC9771812,10.1038/s41586-022-05496-1,ok,
712
+ pancan_mimsi_msk_2024,39746944,PMC11696176,10.1038/s41467-024-54970-z,ok,
713
+ mel_iatlas_riaz_nivolumab_2017,29033130,PMC5685550,10.1016/j.cell.2017.09.028,ok,
714
+ stad_oncosg_2018,29670109,PMC5906695,10.1038/s41467-018-03828-2,ok,
715
+ gbm_tcga_pub,18772890,PMC2671642,10.1038/nature07385,ok,
716
+ gbm_tcga_pub2013,24120142,PMC3910500,10.1016/j.cell.2013.09.034,ok,
717
+ odg_msk_2017,28472509,PMC5596171,10.1093/neuonc/nox086,ok,
718
+ gbm_tcga_pan_can_atlas_2018,29625048,PMC5957518,10.1016/j.cell.2018.03.022,ok,
719
+ gbm_tcga_pan_can_atlas_2018,29596782,PMC6075717,10.1016/j.cels.2018.03.002,ok,
720
+ gbm_tcga_pan_can_atlas_2018,29622463,PMC6028190,10.1016/j.ccell.2018.03.007,ok,
721
+ gbm_tcga_pan_can_atlas_2018,29617662,PMC5916809,10.1016/j.celrep.2018.03.050,ok,
722
+ gbm_tcga_pan_can_atlas_2018,29625055,PMC6066282,10.1016/j.cell.2018.02.052,ok,
723
+ gbm_tcga_pan_can_atlas_2018,29625050,PMC6070353,10.1016/j.cell.2018.03.035,ok,
724
+ gbm_tcga_pan_can_atlas_2018,29617662,PMC5916809,10.1016/j.celrep.2018.03.050,ok,
725
+ gbm_tcga_pan_can_atlas_2018,30643250,PMC12521747,,error;elink,Identifier not found in PMC
726
+ gbm_tcga_pan_can_atlas_2018,32214244,PMC7500457,10.1038/s41586-020-2095-1,ok,
727
+ gbm_tcga_pan_can_atlas_2018,29625049,PMC5916814,10.1016/j.cell.2018.03.033,ok,
728
+ gbm_tcga_pan_can_atlas_2018,29850653,PMC5972025,10.1200/PO.17.00073,ok,
729
+ gbm_tcga_pan_can_atlas_2018,36334560,PMC12390932,,error;elink,Identifier not found in PMC
730
+ gbm_mayo_pdx_sarkaria_2019,31852831,PMC7056576,10.1158/1078-0432.CCR-19-0909,ok,
731
+ gbm_columbia_2019,30742119,PMC6810613,10.1038/s41591-019-0349-y,ok,
732
+ gbm_cptac_2021,33577785,PMC8044053,10.1016/j.ccell.2021.01.006,ok,
733
+ msk_impact_2017,28481359,PMC5461196,10.1038/nm.4333,ok,
pull_pdfs.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ import csv, sys, time, requests
3
+
4
# Root URL of the cBioPortal REST API that every request in this script targets.
BASE = "https://www.cbioportal.org/api"

# Headers sent with each request; add 'X-API-KEY' here if your instance needs it.
HEADERS = {"Accept": "application/json"}
6
+
7
def get_all_studies(page_size=500):
    """Fetch every study record from the ``{BASE}/studies`` endpoint.

    Pages through the endpoint with the ``pageSize``/``pageNumber`` query
    parameters, starting at page 0, until a page comes back empty.

    Args:
        page_size: Number of studies requested per page.

    Returns:
        A list with the decoded JSON items of all pages, in the order received.

    Raises:
        requests.HTTPError: If any page request returns an error status
            (via ``raise_for_status``).
    """
    collected = []
    page_number = 0
    while True:
        response = requests.get(
            f"{BASE}/studies",
            headers=HEADERS,
            params={"pageSize": page_size, "pageNumber": page_number},
            timeout=60,
        )
        response.raise_for_status()
        page_items = response.json()
        if not page_items:
            # An empty page marks the end of the listing.
            return collected
        collected.extend(page_items)
        page_number += 1
        # Brief pause between pages to stay friendly to the server.
        time.sleep(0.2)
23
+
24
def to_list(x):
    """Normalize a possibly missing or comma-separated value into a list.

    Args:
        x: ``None``, a list, or any other value (typically a
            comma-separated string, as some portals store it).

    Returns:
        ``[]`` for ``None``; the very same list object when *x* is already a
        list; otherwise ``str(x)`` split on commas with each piece stripped
        of surrounding whitespace and empty pieces dropped.
    """
    if isinstance(x, list):
        return x
    if x is None:
        return []
    stripped = (piece.strip() for piece in str(x).split(","))
    return [piece for piece in stripped if piece]
31
+
32
def main(out_csv="cbioportal_study_pmids.csv"):
    """Write one ``studyId,pmid`` CSV row for every PMID of every study.

    Fetches all studies with ``get_all_studies``, expands each study's
    ``pmid`` field with ``to_list`` and writes the resulting pairs to
    *out_csv*. A study without any PMID contributes no rows.

    Args:
        out_csv: Path of the CSV file to create (overwritten if it exists).
    """
    rows = []
    for study in get_all_studies():
        study_id = study.get("studyId")
        for pmid in to_list(study.get("pmid")):
            rows.append({"studyId": study_id, "pmid": pmid})

    # The header names must match the row-dict keys exactly: DictWriter raises
    # ValueError for a row key that is not listed in fieldnames.
    with open(out_csv, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=["studyId", "pmid"])
        writer.writeheader()
        writer.writerows(rows)
    print(f"wrote {len(rows)} rows to {out_csv}")
52
+
53
if __name__ == "__main__":
    # The optional first command-line argument overrides the output CSV path.
    if len(sys.argv) > 1:
        out = sys.argv[1]
    else:
        out = "cbioportal_study_pmids.csv"
    main(out)
requirements.txt ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ gradio
2
+ langchain
3
+ sqlalchemy
4
+ langchain-community
5
+ langchain-openai
6
+ pypdf
7
+ tiktoken
8
+ openai
9
+ langchain-text-splitters
10
+ pdfplumber
11
+ pillow
12
+ sentence-transformers
13
+ faiss-cpu
14
+ spacy
15
+ tqdm
16
+ fastmcp
17
+
18
+ # Only needed if you use --ocr
19
+ pdf2image
20
+ pytesseract
unfetched_pmcids.tsv ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ PMC12088707 PMC12088707
2
+ PMC12404184 PMC12404184