Spaces:
Sleeping
Sleeping
fix: replace gdown with hf_hub_download for dataset file retrieval
Browse files
- ProtHGT_app.py +3 -1
- run_prothgt_app.py +17 -5
ProtHGT_app.py
CHANGED
|
@@ -320,8 +320,10 @@ with st.sidebar:
|
|
| 320 |
st.warning("⚠️ Due to memory and computational constraints, the maximum number of proteins that can be processed at once is limited to 100 proteins. For larger datasets, please consider running the model locally using our [GitHub repository](https://github.com/HUBioDataLab/ProtHGT).")
|
| 321 |
|
| 322 |
if st.session_state.submitted:
|
| 323 |
-
|
|
|
|
| 324 |
|
|
|
|
| 325 |
# Generate predictions only if not already in session state
|
| 326 |
if st.session_state.predictions_df is None:
|
| 327 |
|
|
|
|
| 320 |
st.warning("⚠️ Due to memory and computational constraints, the maximum number of proteins that can be processed at once is limited to 100 proteins. For larger datasets, please consider running the model locally using our [GitHub repository](https://github.com/HUBioDataLab/ProtHGT).")
|
| 321 |
|
| 322 |
if st.session_state.submitted:
|
| 323 |
+
kg_path = "data/prothgt-kg.pt"
|
| 324 |
+
first_run = not os.path.exists(kg_path)
|
| 325 |
|
| 326 |
+
with st.spinner("Downloading dataset files for the first time, this may take a few minutes..." if first_run else "Generating predictions..."):
|
| 327 |
# Generate predictions only if not already in session state
|
| 328 |
if st.session_state.predictions_df is None:
|
| 329 |
|
run_prothgt_app.py
CHANGED
|
@@ -54,7 +54,9 @@ from torch_geometric.nn import HGTConv, MLP
|
|
| 54 |
import pandas as pd
|
| 55 |
import yaml
|
| 56 |
from datasets import load_dataset
|
| 57 |
-
import gdown
|
|
|
|
|
|
|
| 58 |
import copy
|
| 59 |
import json
|
| 60 |
import gzip
|
|
@@ -214,14 +216,24 @@ def generate_prediction_df(protein_ids, model_paths, model_config_paths, go_cate
|
|
| 214 |
# Load dataset once
|
| 215 |
# heterodata = load_dataset('HUBioDataLab/ProtHGT-KG', data_files="prothgt-kg.json.gz")
|
| 216 |
print('Loading data...')
|
| 217 |
-
file_id = "18u1o2sm8YjMo9joFw4Ilwvg0-rUU0PXK"
|
| 218 |
output = "data/prothgt-kg.pt"
|
| 219 |
|
| 220 |
if not os.path.exists(output):
|
| 221 |
try:
|
| 222 |
-
url = f"https://drive.google.com/uc?id={file_id}"
|
| 223 |
-
print(f"Downloading file from {url}...")
|
| 224 |
-
gdown.download(url, output, quiet=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 225 |
print(f"File downloaded to {output}")
|
| 226 |
except Exception as e:
|
| 227 |
print(f"Error downloading file: {e}")
|
|
|
|
| 54 |
import pandas as pd
|
| 55 |
import yaml
|
| 56 |
from datasets import load_dataset
|
| 57 |
+
# import gdown
|
| 58 |
+
import shutil
|
| 59 |
+
from huggingface_hub import hf_hub_download
|
| 60 |
import copy
|
| 61 |
import json
|
| 62 |
import gzip
|
|
|
|
| 216 |
# Load dataset once
|
| 217 |
# heterodata = load_dataset('HUBioDataLab/ProtHGT-KG', data_files="prothgt-kg.json.gz")
|
| 218 |
print('Loading data...')
|
| 219 |
+
# file_id = "18u1o2sm8YjMo9joFw4Ilwvg0-rUU0PXK"
|
| 220 |
output = "data/prothgt-kg.pt"
|
| 221 |
|
| 222 |
if not os.path.exists(output):
|
| 223 |
try:
|
| 224 |
+
# url = f"https://drive.google.com/uc?id={file_id}"
|
| 225 |
+
# print(f"Downloading file from {url}...")
|
| 226 |
+
# gdown.download(url, output, quiet=False)
|
| 227 |
+
# print(f"File downloaded to {output}")
|
| 228 |
+
cached_path = hf_hub_download(
|
| 229 |
+
repo_id="HUBioDataLab/ProtHGT",
|
| 230 |
+
filename="knowledge_graphs/prothgt-kg.pt",
|
| 231 |
+
repo_type="dataset",
|
| 232 |
+
)
|
| 233 |
+
|
| 234 |
+
output = "data/prothgt-kg.pt"
|
| 235 |
+
os.makedirs("data", exist_ok=True)
|
| 236 |
+
shutil.copy(cached_path, output)
|
| 237 |
print(f"File downloaded to {output}")
|
| 238 |
except Exception as e:
|
| 239 |
print(f"Error downloading file: {e}")
|