ervau committed on
Commit
3e35a58
·
1 Parent(s): 580fffd

fix: replace gdown with hf_hub_download for dataset file retrieval

Browse files
Files changed (2) hide show
  1. ProtHGT_app.py +3 -1
  2. run_prothgt_app.py +17 -5
ProtHGT_app.py CHANGED
@@ -320,8 +320,10 @@ with st.sidebar:
320
  st.warning("⚠️ Due to memory and computational constraints, the maximum number of proteins that can be processed at once is limited to 100 proteins. For larger datasets, please consider running the model locally using our [GitHub repository](https://github.com/HUBioDataLab/ProtHGT).")
321
 
322
  if st.session_state.submitted:
323
- with st.spinner("Generating predictions..."):
 
324
 
 
325
  # Generate predictions only if not already in session state
326
  if st.session_state.predictions_df is None:
327
 
 
320
  st.warning("⚠️ Due to memory and computational constraints, the maximum number of proteins that can be processed at once is limited to 100 proteins. For larger datasets, please consider running the model locally using our [GitHub repository](https://github.com/HUBioDataLab/ProtHGT).")
321
 
322
  if st.session_state.submitted:
323
+ kg_path = "data/prothgt-kg.pt"
324
+ first_run = not os.path.exists(kg_path)
325
 
326
+ with st.spinner("Downloading dataset files for the first time, this may take a few minutes..." if first_run else "Generating predictions..."):
327
  # Generate predictions only if not already in session state
328
  if st.session_state.predictions_df is None:
329
 
run_prothgt_app.py CHANGED
@@ -54,7 +54,9 @@ from torch_geometric.nn import HGTConv, MLP
54
  import pandas as pd
55
  import yaml
56
  from datasets import load_dataset
57
- import gdown
 
 
58
  import copy
59
  import json
60
  import gzip
@@ -214,14 +216,24 @@ def generate_prediction_df(protein_ids, model_paths, model_config_paths, go_cate
214
  # Load dataset once
215
  # heterodata = load_dataset('HUBioDataLab/ProtHGT-KG', data_files="prothgt-kg.json.gz")
216
  print('Loading data...')
217
- file_id = "18u1o2sm8YjMo9joFw4Ilwvg0-rUU0PXK"
218
  output = "data/prothgt-kg.pt"
219
 
220
  if not os.path.exists(output):
221
  try:
222
- url = f"https://drive.google.com/uc?id={file_id}"
223
- print(f"Downloading file from {url}...")
224
- gdown.download(url, output, quiet=False, fuzzy=True)
 
 
 
 
 
 
 
 
 
 
225
  print(f"File downloaded to {output}")
226
  except Exception as e:
227
  print(f"Error downloading file: {e}")
 
54
  import pandas as pd
55
  import yaml
56
  from datasets import load_dataset
57
+ # import gdown
58
+ import shutil
59
+ from huggingface_hub import hf_hub_download
60
  import copy
61
  import json
62
  import gzip
 
216
  # Load dataset once
217
  # heterodata = load_dataset('HUBioDataLab/ProtHGT-KG', data_files="prothgt-kg.json.gz")
218
  print('Loading data...')
219
+ # file_id = "18u1o2sm8YjMo9joFw4Ilwvg0-rUU0PXK"
220
  output = "data/prothgt-kg.pt"
221
 
222
  if not os.path.exists(output):
223
  try:
224
+ # url = f"https://drive.google.com/uc?id={file_id}"
225
+ # print(f"Downloading file from {url}...")
226
+ # gdown.download(url, output, quiet=False)
227
+ # print(f"File downloaded to {output}")
228
+ cached_path = hf_hub_download(
229
+ repo_id="HUBioDataLab/ProtHGT",
230
+ filename="knowledge_graphs/prothgt-kg.pt",
231
+ repo_type="dataset",
232
+ )
233
+
234
+ output = "data/prothgt-kg.pt"
235
+ os.makedirs("data", exist_ok=True)
236
+ shutil.copy(cached_path, output)
237
  print(f"File downloaded to {output}")
238
  except Exception as e:
239
  print(f"Error downloading file: {e}")