manpreet88 committed on
Commit
9c37841
·
1 Parent(s): a5954f7

Update Property_Prediction.py

Browse files
Downstream Tasks/Property_Prediction.py CHANGED
@@ -19,7 +19,7 @@ import csv
19
  import copy
20
  from typing import List, Dict, Optional, Tuple, Any
21
 
22
- # Increase CSV field size limit (PolyInfo modality JSON fields can be large).
23
  csv.field_size_limit(sys.maxsize)
24
 
25
  # =============================================================================
@@ -36,7 +36,7 @@ from PolyFusion.DeBERTav2 import PSMILESDebertaEncoder, build_psmiles_tokenizer
36
  BASE_DIR = "/path/to/Polymer_Foundational_Model"
37
  POLYINFO_PATH = "/path/to/polyinfo_with_modalities.csv"
38
 
39
- # Pretrained encoder directories (update these placeholders)
40
  PRETRAINED_MULTIMODAL_DIR = "/path/to/multimodal_output/best"
41
  BEST_GINE_DIR = "/path/to/gin_output/best"
42
  BEST_SCHNET_DIR = "/path/to/schnet_output/best"
@@ -50,7 +50,7 @@ OUTPUT_RESULTS = "/path/to/multimodal_downstream_results.txt"
50
  BEST_WEIGHTS_DIR = "/path/to/multimodal_downstream_bestweights"
51
 
52
  # -----------------------------------------------------------------------------
53
- # Model sizes / dims (must match your pretrained multimodal encoder settings)
54
  # -----------------------------------------------------------------------------
55
  MAX_ATOMIC_Z = 85
56
  MASK_ATOM_ID = MAX_ATOMIC_Z + 1
@@ -80,12 +80,12 @@ DEBERTA_HIDDEN = 600
80
  PSMILES_MAX_LEN = 128
81
 
82
  # -----------------------------------------------------------------------------
83
- # Uni-Poly style fusion + regression head hyperparameters
84
  # -----------------------------------------------------------------------------
85
  POLYF_EMB_DIM = 600
86
  POLYF_ATTN_HEADS = 8
87
  POLYF_DROPOUT = 0.1
88
- POLYF_FF_MULT = 4 # FFN hidden = 4*d (common transformer practice)
89
 
90
  # -----------------------------------------------------------------------------
91
  # Fine-tuning parameters (single-task per property)
@@ -99,7 +99,7 @@ WEIGHT_DECAY = 0.0
99
 
100
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
101
 
102
- # Properties to evaluate (order preserved)
103
  REQUESTED_PROPERTIES = [
104
  "density",
105
  "glass transition",
@@ -193,7 +193,6 @@ def summarize_state_dict_load(full_state: dict, model_state: dict, filtered_stat
193
  print(f" {k}: ckpt={tuple(full_state[k].shape)} model={tuple(model_state[k].shape)}")
194
  print("")
195
 
196
-
197
  def find_property_columns(columns):
198
  """
199
  Robust property column matching with guardrails:
@@ -878,7 +877,7 @@ class PolymerPropertyDataset(Dataset):
878
  psmiles_data = None
879
 
880
  # ---------------------------------------------------------------------
881
- # Fill defaults for missing modalities (keeps collate simpler)
882
  # ---------------------------------------------------------------------
883
  if gine_data is None:
884
  gine_data = {
@@ -1062,7 +1061,7 @@ def multimodal_collate_fn(batch):
1062
 
1063
 
1064
  # =============================================================================
1065
- # Single-task regressor head (Uni-Poly-style fine-tuning)
1066
  # =============================================================================
1067
  class PolyFPropertyRegressor(nn.Module):
1068
  """
@@ -1283,7 +1282,7 @@ def load_pretrained_multimodal(pretrained_path: str) -> MultimodalContrastiveMod
1283
  )
1284
 
1285
  # -------------------------
1286
- # Optional: load full multimodal checkpoint (projects/fusion, etc.)
1287
  # -------------------------
1288
  ckpt_path = os.path.join(pretrained_path, "pytorch_model.bin")
1289
  if os.path.isfile(ckpt_path):
@@ -1358,7 +1357,7 @@ def build_samples_for_property(df: pd.DataFrame, prop_col: str) -> List[dict]:
1358
  def run_polyf_downstream(property_list: List[str], property_cols: List[str], df_raw: pd.DataFrame,
1359
  pretrained_path: str, output_file: str):
1360
  """
1361
- Uni-Poly-matched downstream evaluation:
1362
 
1363
  For each property:
1364
  - Build samples from PolyInfo
@@ -1380,7 +1379,6 @@ def run_polyf_downstream(property_list: List[str], property_cols: List[str], df_
1380
  all_results = {"per_property": {}, "mode": "POLYF_MATCHED_SINGLE_TASK"}
1381
 
1382
  for pname, pcol in zip(property_list, property_cols):
1383
- print(f"\n=== [DOWNSTREAM] Uni-Poly matched fine-tuning: {pname} (col='{pcol}') ===")
1384
  samples = build_samples_for_property(df_proc, pcol)
1385
 
1386
  print(f"[DATA] {pname}: n_samples={len(samples)}")
 
19
  import copy
20
  from typing import List, Dict, Optional, Tuple, Any
21
 
22
+ # Increase CSV field size limit
23
  csv.field_size_limit(sys.maxsize)
24
 
25
  # =============================================================================
 
36
  BASE_DIR = "/path/to/Polymer_Foundational_Model"
37
  POLYINFO_PATH = "/path/to/polyinfo_with_modalities.csv"
38
 
39
+ # Pretrained encoder directories
40
  PRETRAINED_MULTIMODAL_DIR = "/path/to/multimodal_output/best"
41
  BEST_GINE_DIR = "/path/to/gin_output/best"
42
  BEST_SCHNET_DIR = "/path/to/schnet_output/best"
 
50
  BEST_WEIGHTS_DIR = "/path/to/multimodal_downstream_bestweights"
51
 
52
  # -----------------------------------------------------------------------------
53
+ # Model sizes / dims
54
  # -----------------------------------------------------------------------------
55
  MAX_ATOMIC_Z = 85
56
  MASK_ATOM_ID = MAX_ATOMIC_Z + 1
 
80
  PSMILES_MAX_LEN = 128
81
 
82
  # -----------------------------------------------------------------------------
83
+ # Fusion + regression head hyperparameters
84
  # -----------------------------------------------------------------------------
85
  POLYF_EMB_DIM = 600
86
  POLYF_ATTN_HEADS = 8
87
  POLYF_DROPOUT = 0.1
88
+ POLYF_FF_MULT = 4 # FFN hidden = 4*d
89
 
90
  # -----------------------------------------------------------------------------
91
  # Fine-tuning parameters (single-task per property)
 
99
 
100
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
101
 
102
+ # Properties to evaluate
103
  REQUESTED_PROPERTIES = [
104
  "density",
105
  "glass transition",
 
193
  print(f" {k}: ckpt={tuple(full_state[k].shape)} model={tuple(model_state[k].shape)}")
194
  print("")
195
 
 
196
  def find_property_columns(columns):
197
  """
198
  Robust property column matching with guardrails:
 
877
  psmiles_data = None
878
 
879
  # ---------------------------------------------------------------------
880
+ # Fill defaults for missing modalities
881
  # ---------------------------------------------------------------------
882
  if gine_data is None:
883
  gine_data = {
 
1061
 
1062
 
1063
  # =============================================================================
1064
+ # Single-task regressor head
1065
  # =============================================================================
1066
  class PolyFPropertyRegressor(nn.Module):
1067
  """
 
1282
  )
1283
 
1284
  # -------------------------
1285
+ # Optional: load full multimodal checkpoint
1286
  # -------------------------
1287
  ckpt_path = os.path.join(pretrained_path, "pytorch_model.bin")
1288
  if os.path.isfile(ckpt_path):
 
1357
  def run_polyf_downstream(property_list: List[str], property_cols: List[str], df_raw: pd.DataFrame,
1358
  pretrained_path: str, output_file: str):
1359
  """
1360
+ Downstream evaluation:
1361
 
1362
  For each property:
1363
  - Build samples from PolyInfo
 
1379
  all_results = {"per_property": {}, "mode": "POLYF_MATCHED_SINGLE_TASK"}
1380
 
1381
  for pname, pcol in zip(property_list, property_cols):
 
1382
  samples = build_samples_for_property(df_proc, pcol)
1383
 
1384
  print(f"[DATA] {pname}: n_samples={len(samples)}")