Spaces:

kaurm43
/

PolyFusionAgent

Running

App Files Files Community

kaurm43 committed 18 days ago

Commit

983d53f

verified ·

1 Parent(s): b5ed4b6

Update PolyFusion/CL.py

Browse files

Files changed (1) hide show

PolyFusion/CL.py +9 -13

PolyFusion/CL.py CHANGED Viewed

@@ -335,7 +335,7 @@ def prepare_or_load_data_streaming(
                 s = row.get("psmiles", "")
                 psmiles_raw = "" if s is None else str(s)
-            # Require at least 2 modalities to keep sample
             modalities_present = sum(
                 [1 if x is not None else 0 for x in [gine_sample, schnet_sample, fp_sample, psmiles_raw]]
             )
@@ -352,7 +352,7 @@ def prepare_or_load_data_streaming(
                     torch.save(sample, sample_path)
                 except Exception as save_e:
                     print("Warning: failed to torch.save sample:", save_e)
-                    # fallback JSON for debugging (kept from your original)
                     try:
                         with open(sample_path + ".json", "w") as fjson:
                             json.dump(sample, fjson)
@@ -396,7 +396,7 @@ class LazyMultimodalDataset(Dataset):
     def __getitem__(self, idx: int) -> Dict[str, Dict[str, torch.Tensor]]:
         sample_path = self.files[idx]
-        # prefer torch.load if .pt, else try json (kept behavior)
         if sample_path.endswith(".pt"):
             sample = torch.load(sample_path, map_location="cpu")
         else:
@@ -512,7 +512,6 @@ def multimodal_collate(batch_list: List[Dict[str, Dict[str, torch.Tensor]]]) ->
             ei_offset = g["edge_index"] + node_offset
             all_edge_index.append(ei_offset)
-            # REUSED helper from GINE.py
             ea = match_edge_attr_to_index(g["edge_index"], g["edge_attr"], target_dim=3)
             all_edge_attr.append(ea)
@@ -685,7 +684,7 @@ class MultimodalContrastiveModel(nn.Module):
     def forward(self, batch_mods: Dict[str, torch.Tensor], mask_target: str):
         """
-        Compute total loss = InfoNCE + REC_LOSS_WEIGHT * reconstruction_loss (if any labels exist).
         """
         device = next(self.parameters()).device
         embs = self.encode(batch_mods)
@@ -949,7 +948,6 @@ def mask_batch_for_modality(batch: dict, modality: str, tokenizer, p_mask: float
 def mm_batch_to_model_input(masked_batch: dict) -> dict:
     """
     Normalize the masked batch dict into the exact structure expected by MultimodalContrastiveModel.
-    (Kept identical semantics.)
     """
     mm = {}
     if "gine" in masked_batch:
@@ -1027,7 +1025,7 @@ def evaluate_multimodal(model: MultimodalContrastiveModel, val_loader: DataLoade
             acc = (preds == labels).float().mean().item()
             acc_sum += acc * B
-            # Weighted F1 over instance IDs (kept as in your prior logic)
             try:
                 labels_np = labels.cpu().numpy()
                 preds_np = preds.cpu().numpy()
@@ -1158,8 +1156,6 @@ class ContrastiveDataCollator:
 class VerboseTrainingCallback(TrainerCallback):
     """
     Console-first training callback with early stopping on eval_loss.
-    Behavior is kept consistent with your original callback; changes are comment/structure only.
     """
     def __init__(self, patience: int = 10):
@@ -1578,7 +1574,7 @@ def main():
     trainer.get_train_dataloader = lambda dataset=None: train_loader
     trainer.get_eval_dataloader = lambda eval_dataset=None: val_loader
-    # Optimizer (kept as in original script)
     _optimizer = torch.optim.AdamW(multimodal_model.parameters(), lr=training_args.learning_rate, weight_decay=training_args.weight_decay)
     total_params = sum(p.numel() for p in multimodal_model.parameters())
@@ -1597,12 +1593,12 @@ def main():
         except Exception:
             pass
-    # ---- train ----
     training_start_time = time.time()
     trainer.train()
     training_end_time = time.time()
-    # ---- save best ----
     best_dir = os.path.join(OUTPUT_DIR, "best")
     os.makedirs(best_dir, exist_ok=True)
@@ -1616,7 +1612,7 @@ def main():
     except Exception as e:
         print("Warning: failed to load/save best model from Trainer:", e)
-    # ---- final evaluation ----
     final_metrics = {}
     try:
         if trainer.state.best_model_checkpoint:

                 s = row.get("psmiles", "")
                 psmiles_raw = "" if s is None else str(s)
+            # Require at least 2 modalities
             modalities_present = sum(
                 [1 if x is not None else 0 for x in [gine_sample, schnet_sample, fp_sample, psmiles_raw]]
             )
                     torch.save(sample, sample_path)
                 except Exception as save_e:
                     print("Warning: failed to torch.save sample:", save_e)
+                    # fallback JSON for debugging
                     try:
                         with open(sample_path + ".json", "w") as fjson:
                             json.dump(sample, fjson)
     def __getitem__(self, idx: int) -> Dict[str, Dict[str, torch.Tensor]]:
         sample_path = self.files[idx]
+        # prefer torch.load if .pt, else try json
         if sample_path.endswith(".pt"):
             sample = torch.load(sample_path, map_location="cpu")
         else:
             ei_offset = g["edge_index"] + node_offset
             all_edge_index.append(ei_offset)
             ea = match_edge_attr_to_index(g["edge_index"], g["edge_attr"], target_dim=3)
             all_edge_attr.append(ea)
     def forward(self, batch_mods: Dict[str, torch.Tensor], mask_target: str):
         """
+        Compute total loss = InfoNCE + REC_LOSS_WEIGHT * reconstruction_loss
         """
         device = next(self.parameters()).device
         embs = self.encode(batch_mods)
 def mm_batch_to_model_input(masked_batch: dict) -> dict:
     """
     Normalize the masked batch dict into the exact structure expected by MultimodalContrastiveModel.
     """
     mm = {}
     if "gine" in masked_batch:
             acc = (preds == labels).float().mean().item()
             acc_sum += acc * B
+            # Weighted F1 over instance IDs
             try:
                 labels_np = labels.cpu().numpy()
                 preds_np = preds.cpu().numpy()
 class VerboseTrainingCallback(TrainerCallback):
     """
     Console-first training callback with early stopping on eval_loss.
     """
     def __init__(self, patience: int = 10):
     trainer.get_train_dataloader = lambda dataset=None: train_loader
     trainer.get_eval_dataloader = lambda eval_dataset=None: val_loader
+    # Optimizer
     _optimizer = torch.optim.AdamW(multimodal_model.parameters(), lr=training_args.learning_rate, weight_decay=training_args.weight_decay)
     total_params = sum(p.numel() for p in multimodal_model.parameters())
         except Exception:
             pass
+    # ---- Train ----
     training_start_time = time.time()
     trainer.train()
     training_end_time = time.time()
+    # ---- Save best ----
     best_dir = os.path.join(OUTPUT_DIR, "best")
     os.makedirs(best_dir, exist_ok=True)
     except Exception as e:
         print("Warning: failed to load/save best model from Trainer:", e)
+    # ---- Final Evaluation ----
     final_metrics = {}
     try:
         if trainer.state.best_model_checkpoint: