Erva Ulusoy committed on
Commit
23f3d8f
·
1 Parent(s): cdefd25

fix to avoid overwriting graph edges during inference + add prediction generation threshold

Browse files
Files changed (2) hide show
  1. ProtHGT_app.py +12 -1
  2. run_prothgt_app.py +50 -35
ProtHGT_app.py CHANGED
@@ -270,6 +270,16 @@ with st.sidebar:
270
  disabled=disabled
271
  )
272
 
 
 
 
 
 
 
 
 
 
 
273
  if selected_proteins and selected_go_category:
274
 
275
  button_disabled = st.session_state.submitted
@@ -355,7 +365,8 @@ if st.session_state.submitted:
355
  protein_ids=selected_proteins,
356
  model_paths=model_paths,
357
  model_config_paths=model_config_paths,
358
- go_category=go_categories
 
359
  )
360
 
361
  st.session_state.heterodata = heterodata
 
270
  disabled=disabled
271
  )
272
 
273
+ generation_threshold = st.number_input(
274
+ "Generation threshold (optional)",
275
+ min_value=0.0,
276
+ max_value=1.0,
277
+ value=0.0,
278
+ step=0.05,
279
+ help="If > 0, only predictions with Probability >= threshold are generated. This reduces output size and speeds up the app.",
280
+ disabled=disabled,
281
+ )
282
+
283
  if selected_proteins and selected_go_category:
284
 
285
  button_disabled = st.session_state.submitted
 
365
  protein_ids=selected_proteins,
366
  model_paths=model_paths,
367
  model_config_paths=model_config_paths,
368
+ go_category=go_categories,
369
+ threshold=generation_threshold,
370
  )
371
 
372
  st.session_state.heterodata = heterodata
run_prothgt_app.py CHANGED
@@ -48,44 +48,44 @@ class ProtHGT(torch.nn.Module):
48
 
49
  return self.mlp(z).view(-1), x_dict
50
 
51
- def _load_data(heterodata, protein_ids, go_category):
52
- """Process the loaded heterodata for specific proteins and GO categories."""
53
- # Get protein indices for all input proteins
54
- protein_indices = [heterodata['Protein']['id_mapping'][pid] for pid in protein_ids]
 
 
 
 
 
 
 
55
  n_terms = len(heterodata[go_category]['id_mapping'])
 
56
 
57
- all_edges = []
58
- for protein_idx in protein_indices:
59
- for term_idx in range(n_terms):
60
- all_edges.append([protein_idx, term_idx])
61
-
62
- edge_index = torch.tensor(all_edges).t()
63
-
64
- heterodata[('Protein', 'protein_function', go_category)].edge_index = edge_index
65
- heterodata[(go_category, 'rev_protein_function', 'Protein')].edge_index = torch.stack([edge_index[1], edge_index[0]])
66
-
67
- return heterodata
68
 
69
  def get_available_proteins(name_file='data/name_info.json.gz'):
70
  with gzip.open(name_file, 'rt', encoding='utf-8') as file:
71
  name_info = json.load(file)
72
  return list(name_info['Protein'].keys())
73
 
74
- def _generate_predictions(heterodata, model, target_type):
75
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
76
 
77
  model.to(device)
78
  model.eval()
79
  heterodata = heterodata.to(device)
 
80
 
81
  with torch.no_grad():
82
- edge_label_index = heterodata.edge_index_dict[('Protein', 'protein_function', target_type)]
83
  predictions, _ = model(heterodata.x_dict, heterodata.edge_index_dict, edge_label_index, target_type)
84
  predictions = torch.sigmoid(predictions)
85
 
86
  return predictions.cpu()
87
 
88
- def _create_prediction_df(predictions, heterodata, protein_ids, go_category):
89
  go_category_dict = {
90
  'GO_term_F': 'Molecular Function',
91
  'GO_term_P': 'Biological Process',
@@ -96,8 +96,8 @@ def _create_prediction_df(predictions, heterodata, protein_ids, go_category):
96
  with gzip.open('data/name_info.json.gz', 'rt', encoding='utf-8') as file:
97
  name_info = json.load(file)
98
 
99
- # Get number of GO terms for this category
100
- n_go_terms = len(heterodata[go_category]['id_mapping'])
101
 
102
  # Create lists to store the data
103
  all_proteins = []
@@ -107,8 +107,10 @@ def _create_prediction_df(predictions, heterodata, protein_ids, go_category):
107
  all_categories = []
108
  all_probabilities = []
109
 
110
- # Get list of GO terms once
111
- go_terms = list(heterodata[go_category]['id_mapping'].keys())
 
 
112
 
113
  # Process predictions for each protein
114
  for i, protein_id in enumerate(protein_ids):
@@ -116,16 +118,29 @@ def _create_prediction_df(predictions, heterodata, protein_ids, go_category):
116
  start_idx = i * n_go_terms
117
  end_idx = (i + 1) * n_go_terms
118
  protein_predictions = predictions[start_idx:end_idx]
 
 
 
 
 
 
 
 
 
 
 
119
 
120
  # Get protein name
121
  protein_name = name_info['Protein'].get(protein_id, protein_id)
122
 
123
  # Extend the lists
124
- all_proteins.extend([protein_id] * n_go_terms)
125
- all_protein_names.extend([protein_name] * n_go_terms)
126
- all_go_terms.extend(go_terms)
127
- all_go_term_names.extend([name_info['GO_term'].get(term_id, term_id) for term_id in go_terms])
128
- all_categories.extend([go_category_dict[go_category]] * n_go_terms)
 
 
129
  all_probabilities.extend(protein_predictions.tolist())
130
 
131
  # Create DataFrame
@@ -140,7 +155,7 @@ def _create_prediction_df(predictions, heterodata, protein_ids, go_category):
140
 
141
  return prediction_df
142
 
143
- def generate_prediction_df(protein_ids, model_paths, model_config_paths, go_category):
144
  all_predictions = []
145
 
146
  # Convert single protein ID to list if necessary
@@ -171,9 +186,9 @@ def generate_prediction_df(protein_ids, model_paths, model_config_paths, go_cate
171
 
172
  for go_cat, model_config_path, model_path in zip(go_category, model_config_paths, model_paths):
173
  print(f'Generating predictions for {go_cat}...')
174
-
175
- # Process data for current GO category
176
- processed_data = _load_data(copy.deepcopy(heterodata), protein_ids, go_cat)
177
 
178
  # Load model config
179
  with open(model_config_path, 'r') as file:
@@ -181,7 +196,7 @@ def generate_prediction_df(protein_ids, model_paths, model_config_paths, go_cate
181
 
182
  # Initialize model with configuration
183
  model = ProtHGT(
184
- processed_data,
185
  hidden_channels=model_config['hidden_channels'][0],
186
  num_heads=model_config['num_heads'],
187
  num_layers=model_config['num_layers'],
@@ -194,12 +209,12 @@ def generate_prediction_df(protein_ids, model_paths, model_config_paths, go_cate
194
  print(f'Loaded model weights from {model_path}')
195
 
196
  # Generate predictions
197
- predictions = _generate_predictions(processed_data, model, go_cat)
198
- prediction_df = _create_prediction_df(predictions, processed_data, protein_ids, go_cat)
199
  all_predictions.append(prediction_df)
200
 
201
  # Clean up memory
202
- del processed_data
203
  del model
204
  del predictions
205
  torch.cuda.empty_cache() # Clear CUDA cache if using GPU
 
48
 
49
  return self.mlp(z).view(-1), x_dict
50
 
51
+ def _build_edge_label_index(heterodata, protein_ids, go_category):
52
+ """
53
+ Build a dense candidate edge_label_index (Protein × GO terms) for inference.
54
+
55
+ IMPORTANT: Do NOT overwrite heterodata.edge_index_dict here.
56
+ Graph edges are used for message passing; candidate edges are only for scoring.
57
+ """
58
+ protein_indices = torch.tensor(
59
+ [heterodata['Protein']['id_mapping'][pid] for pid in protein_ids],
60
+ dtype=torch.long,
61
+ )
62
  n_terms = len(heterodata[go_category]['id_mapping'])
63
+ term_indices = torch.arange(n_terms, dtype=torch.long)
64
 
65
+ row = protein_indices.repeat_interleave(n_terms)
66
+ col = term_indices.repeat(len(protein_indices))
67
+ return torch.stack([row, col], dim=0)
 
 
 
 
 
 
 
 
68
 
69
  def get_available_proteins(name_file='data/name_info.json.gz'):
70
  with gzip.open(name_file, 'rt', encoding='utf-8') as file:
71
  name_info = json.load(file)
72
  return list(name_info['Protein'].keys())
73
 
74
+ def _generate_predictions(heterodata, model, edge_label_index, target_type):
75
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
76
 
77
  model.to(device)
78
  model.eval()
79
  heterodata = heterodata.to(device)
80
+ edge_label_index = edge_label_index.to(device)
81
 
82
  with torch.no_grad():
 
83
  predictions, _ = model(heterodata.x_dict, heterodata.edge_index_dict, edge_label_index, target_type)
84
  predictions = torch.sigmoid(predictions)
85
 
86
  return predictions.cpu()
87
 
88
+ def _create_prediction_df(predictions, heterodata, protein_ids, go_category, threshold: float = 0.0):
89
  go_category_dict = {
90
  'GO_term_F': 'Molecular Function',
91
  'GO_term_P': 'Biological Process',
 
96
  with gzip.open('data/name_info.json.gz', 'rt', encoding='utf-8') as file:
97
  name_info = json.load(file)
98
 
99
+ id_mapping = heterodata[go_category]['id_mapping'] # dict: GO_id -> index
100
+ n_go_terms = len(id_mapping)
101
 
102
  # Create lists to store the data
103
  all_proteins = []
 
107
  all_categories = []
108
  all_probabilities = []
109
 
110
+ # Build GO terms list aligned with their numeric indices (critical for correctness)
111
+ go_terms = [None] * n_go_terms
112
+ for go_id, idx in id_mapping.items():
113
+ go_terms[int(idx)] = go_id
114
 
115
  # Process predictions for each protein
116
  for i, protein_id in enumerate(protein_ids):
 
118
  start_idx = i * n_go_terms
119
  end_idx = (i + 1) * n_go_terms
120
  protein_predictions = predictions[start_idx:end_idx]
121
+
122
+ # Optional pre-filter for performance
123
+ if threshold and threshold > 0.0:
124
+ keep_mask = protein_predictions >= float(threshold)
125
+ if keep_mask.any():
126
+ keep_idx = torch.nonzero(keep_mask, as_tuple=False).view(-1)
127
+ protein_predictions = protein_predictions[keep_idx]
128
+ else:
129
+ continue
130
+ else:
131
+ keep_idx = torch.arange(n_go_terms)
132
 
133
  # Get protein name
134
  protein_name = name_info['Protein'].get(protein_id, protein_id)
135
 
136
  # Extend the lists
137
+ k = int(protein_predictions.numel())
138
+ all_proteins.extend([protein_id] * k)
139
+ all_protein_names.extend([protein_name] * k)
140
+ kept_go_ids = [go_terms[int(j)] for j in keep_idx.tolist()]
141
+ all_go_terms.extend(kept_go_ids)
142
+ all_go_term_names.extend([name_info['GO_term'].get(term_id, term_id) for term_id in kept_go_ids])
143
+ all_categories.extend([go_category_dict[go_category]] * k)
144
  all_probabilities.extend(protein_predictions.tolist())
145
 
146
  # Create DataFrame
 
155
 
156
  return prediction_df
157
 
158
+ def generate_prediction_df(protein_ids, model_paths, model_config_paths, go_category, threshold: float = 0.0):
159
  all_predictions = []
160
 
161
  # Convert single protein ID to list if necessary
 
186
 
187
  for go_cat, model_config_path, model_path in zip(go_category, model_config_paths, model_paths):
188
  print(f'Generating predictions for {go_cat}...')
189
+
190
+ # Build candidate edges for inference (do NOT modify graph edges)
191
+ edge_label_index = _build_edge_label_index(heterodata, protein_ids, go_cat)
192
 
193
  # Load model config
194
  with open(model_config_path, 'r') as file:
 
196
 
197
  # Initialize model with configuration
198
  model = ProtHGT(
199
+ heterodata,
200
  hidden_channels=model_config['hidden_channels'][0],
201
  num_heads=model_config['num_heads'],
202
  num_layers=model_config['num_layers'],
 
209
  print(f'Loaded model weights from {model_path}')
210
 
211
  # Generate predictions
212
+ predictions = _generate_predictions(heterodata, model, edge_label_index, go_cat)
213
+ prediction_df = _create_prediction_df(predictions, heterodata, protein_ids, go_cat, threshold=threshold)
214
  all_predictions.append(prediction_df)
215
 
216
  # Clean up memory
217
+ del edge_label_index
218
  del model
219
  del predictions
220
  torch.cuda.empty_cache() # Clear CUDA cache if using GPU