Spaces:

HUBioDataLab
/

ProtHGT

Sleeping

ervau committed 15 days ago

Commit

3e35a58

1 Parent(s): 580fffd

fix: replace gdown with hf_hub_download for dataset file retrieval

Files changed (2) hide show

ProtHGT_app.py CHANGED Viewed

@@ -320,8 +320,10 @@ with st.sidebar:
     st.warning("⚠️ Due to memory and computational constraints, the maximum number of proteins that can be processed at once is limited to 100 proteins. For larger datasets, please consider running the model locally using our [GitHub repository](https://github.com/HUBioDataLab/ProtHGT).")
 if st.session_state.submitted:
-    with st.spinner("Generating predictions..."):
         # Generate predictions only if not already in session state
         if st.session_state.predictions_df is None:

     st.warning("⚠️ Due to memory and computational constraints, the maximum number of proteins that can be processed at once is limited to 100 proteins. For larger datasets, please consider running the model locally using our [GitHub repository](https://github.com/HUBioDataLab/ProtHGT).")
 if st.session_state.submitted:
+    kg_path = "data/prothgt-kg.pt"
+    first_run = not os.path.exists(kg_path)
+    with st.spinner("Downloading dataset files for the first time, this may take a few minutes..." if first_run else "Generating predictions..."):
         # Generate predictions only if not already in session state
         if st.session_state.predictions_df is None:

run_prothgt_app.py CHANGED Viewed

@@ -54,7 +54,9 @@ from torch_geometric.nn import HGTConv, MLP
 import pandas as pd
 import yaml
 from datasets import load_dataset
-import gdown
 import copy
 import json
 import gzip
@@ -214,14 +216,24 @@ def generate_prediction_df(protein_ids, model_paths, model_config_paths, go_cate
     # Load dataset once
     # heterodata = load_dataset('HUBioDataLab/ProtHGT-KG', data_files="prothgt-kg.json.gz")
     print('Loading data...')
-    file_id = "18u1o2sm8YjMo9joFw4Ilwvg0-rUU0PXK"
     output = "data/prothgt-kg.pt"
     if not os.path.exists(output):
         try:
-            url = f"https://drive.google.com/uc?id={file_id}"
-            print(f"Downloading file from {url}...")
-            gdown.download(url, output, quiet=False, fuzzy=True)
             print(f"File downloaded to {output}")
         except Exception as e:
             print(f"Error downloading file: {e}")

 import pandas as pd
 import yaml
 from datasets import load_dataset
+# import gdown
+import shutil
+from huggingface_hub import hf_hub_download
 import copy
 import json
 import gzip
     # Load dataset once
     # heterodata = load_dataset('HUBioDataLab/ProtHGT-KG', data_files="prothgt-kg.json.gz")
     print('Loading data...')
+    # file_id = "18u1o2sm8YjMo9joFw4Ilwvg0-rUU0PXK"
     output = "data/prothgt-kg.pt"
     if not os.path.exists(output):
         try:
+            # url = f"https://drive.google.com/uc?id={file_id}"
+            # print(f"Downloading file from {url}...")
+            # gdown.download(url, output, quiet=False)
+            # print(f"File downloaded to {output}")
+            cached_path = hf_hub_download(
+                repo_id="HUBioDataLab/ProtHGT",
+                filename="knowledge_graphs/prothgt-kg.pt",
+                repo_type="dataset",
+            )
+            output = "data/prothgt-kg.pt"
+            os.makedirs("data", exist_ok=True)
+            shutil.copy(cached_path, output)
             print(f"File downloaded to {output}")
         except Exception as e:
             print(f"Error downloading file: {e}")