Spaces:
Running
Running
manpreet88
committed on
Commit
·
9c37841
1
Parent(s):
a5954f7
Update Property_Prediction.py
Browse files
Downstream Tasks/Property_Prediction.py
CHANGED
|
@@ -19,7 +19,7 @@ import csv
|
|
| 19 |
import copy
|
| 20 |
from typing import List, Dict, Optional, Tuple, Any
|
| 21 |
|
| 22 |
-
# Increase CSV field size limit
|
| 23 |
csv.field_size_limit(sys.maxsize)
|
| 24 |
|
| 25 |
# =============================================================================
|
|
@@ -36,7 +36,7 @@ from PolyFusion.DeBERTav2 import PSMILESDebertaEncoder, build_psmiles_tokenizer
|
|
| 36 |
BASE_DIR = "/path/to/Polymer_Foundational_Model"
|
| 37 |
POLYINFO_PATH = "/path/to/polyinfo_with_modalities.csv"
|
| 38 |
|
| 39 |
-
# Pretrained encoder directories
|
| 40 |
PRETRAINED_MULTIMODAL_DIR = "/path/to/multimodal_output/best"
|
| 41 |
BEST_GINE_DIR = "/path/to/gin_output/best"
|
| 42 |
BEST_SCHNET_DIR = "/path/to/schnet_output/best"
|
|
@@ -50,7 +50,7 @@ OUTPUT_RESULTS = "/path/to/multimodal_downstream_results.txt"
|
|
| 50 |
BEST_WEIGHTS_DIR = "/path/to/multimodal_downstream_bestweights"
|
| 51 |
|
| 52 |
# -----------------------------------------------------------------------------
|
| 53 |
-
# Model sizes / dims
|
| 54 |
# -----------------------------------------------------------------------------
|
| 55 |
MAX_ATOMIC_Z = 85
|
| 56 |
MASK_ATOM_ID = MAX_ATOMIC_Z + 1
|
|
@@ -80,12 +80,12 @@ DEBERTA_HIDDEN = 600
|
|
| 80 |
PSMILES_MAX_LEN = 128
|
| 81 |
|
| 82 |
# -----------------------------------------------------------------------------
|
| 83 |
-
#
|
| 84 |
# -----------------------------------------------------------------------------
|
| 85 |
POLYF_EMB_DIM = 600
|
| 86 |
POLYF_ATTN_HEADS = 8
|
| 87 |
POLYF_DROPOUT = 0.1
|
| 88 |
-
POLYF_FF_MULT = 4 # FFN hidden = 4*d
|
| 89 |
|
| 90 |
# -----------------------------------------------------------------------------
|
| 91 |
# Fine-tuning parameters (single-task per property)
|
|
@@ -99,7 +99,7 @@ WEIGHT_DECAY = 0.0
|
|
| 99 |
|
| 100 |
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
|
| 101 |
|
| 102 |
-
# Properties to evaluate
|
| 103 |
REQUESTED_PROPERTIES = [
|
| 104 |
"density",
|
| 105 |
"glass transition",
|
|
@@ -193,7 +193,6 @@ def summarize_state_dict_load(full_state: dict, model_state: dict, filtered_stat
|
|
| 193 |
print(f" {k}: ckpt={tuple(full_state[k].shape)} model={tuple(model_state[k].shape)}")
|
| 194 |
print("")
|
| 195 |
|
| 196 |
-
|
| 197 |
def find_property_columns(columns):
|
| 198 |
"""
|
| 199 |
Robust property column matching with guardrails:
|
|
@@ -878,7 +877,7 @@ class PolymerPropertyDataset(Dataset):
|
|
| 878 |
psmiles_data = None
|
| 879 |
|
| 880 |
# ---------------------------------------------------------------------
|
| 881 |
-
# Fill defaults for missing modalities
|
| 882 |
# ---------------------------------------------------------------------
|
| 883 |
if gine_data is None:
|
| 884 |
gine_data = {
|
|
@@ -1062,7 +1061,7 @@ def multimodal_collate_fn(batch):
|
|
| 1062 |
|
| 1063 |
|
| 1064 |
# =============================================================================
|
| 1065 |
-
# Single-task regressor head
|
| 1066 |
# =============================================================================
|
| 1067 |
class PolyFPropertyRegressor(nn.Module):
|
| 1068 |
"""
|
|
@@ -1283,7 +1282,7 @@ def load_pretrained_multimodal(pretrained_path: str) -> MultimodalContrastiveMod
|
|
| 1283 |
)
|
| 1284 |
|
| 1285 |
# -------------------------
|
| 1286 |
-
# Optional: load full multimodal checkpoint
|
| 1287 |
# -------------------------
|
| 1288 |
ckpt_path = os.path.join(pretrained_path, "pytorch_model.bin")
|
| 1289 |
if os.path.isfile(ckpt_path):
|
|
@@ -1358,7 +1357,7 @@ def build_samples_for_property(df: pd.DataFrame, prop_col: str) -> List[dict]:
|
|
| 1358 |
def run_polyf_downstream(property_list: List[str], property_cols: List[str], df_raw: pd.DataFrame,
|
| 1359 |
pretrained_path: str, output_file: str):
|
| 1360 |
"""
|
| 1361 |
-
|
| 1362 |
|
| 1363 |
For each property:
|
| 1364 |
- Build samples from PolyInfo
|
|
@@ -1380,7 +1379,6 @@ def run_polyf_downstream(property_list: List[str], property_cols: List[str], df_
|
|
| 1380 |
all_results = {"per_property": {}, "mode": "POLYF_MATCHED_SINGLE_TASK"}
|
| 1381 |
|
| 1382 |
for pname, pcol in zip(property_list, property_cols):
|
| 1383 |
-
print(f"\n=== [DOWNSTREAM] Uni-Poly matched fine-tuning: {pname} (col='{pcol}') ===")
|
| 1384 |
samples = build_samples_for_property(df_proc, pcol)
|
| 1385 |
|
| 1386 |
print(f"[DATA] {pname}: n_samples={len(samples)}")
|
|
|
|
| 19 |
import copy
|
| 20 |
from typing import List, Dict, Optional, Tuple, Any
|
| 21 |
|
| 22 |
+
# Increase CSV field size limit
|
| 23 |
csv.field_size_limit(sys.maxsize)
|
| 24 |
|
| 25 |
# =============================================================================
|
|
|
|
| 36 |
BASE_DIR = "/path/to/Polymer_Foundational_Model"
|
| 37 |
POLYINFO_PATH = "/path/to/polyinfo_with_modalities.csv"
|
| 38 |
|
| 39 |
+
# Pretrained encoder directories
|
| 40 |
PRETRAINED_MULTIMODAL_DIR = "/path/to/multimodal_output/best"
|
| 41 |
BEST_GINE_DIR = "/path/to/gin_output/best"
|
| 42 |
BEST_SCHNET_DIR = "/path/to/schnet_output/best"
|
|
|
|
| 50 |
BEST_WEIGHTS_DIR = "/path/to/multimodal_downstream_bestweights"
|
| 51 |
|
| 52 |
# -----------------------------------------------------------------------------
|
| 53 |
+
# Model sizes / dims
|
| 54 |
# -----------------------------------------------------------------------------
|
| 55 |
MAX_ATOMIC_Z = 85
|
| 56 |
MASK_ATOM_ID = MAX_ATOMIC_Z + 1
|
|
|
|
| 80 |
PSMILES_MAX_LEN = 128
|
| 81 |
|
| 82 |
# -----------------------------------------------------------------------------
|
| 83 |
+
# Fusion + regression head hyperparameters
|
| 84 |
# -----------------------------------------------------------------------------
|
| 85 |
POLYF_EMB_DIM = 600
|
| 86 |
POLYF_ATTN_HEADS = 8
|
| 87 |
POLYF_DROPOUT = 0.1
|
| 88 |
+
POLYF_FF_MULT = 4 # FFN hidden = 4*d
|
| 89 |
|
| 90 |
# -----------------------------------------------------------------------------
|
| 91 |
# Fine-tuning parameters (single-task per property)
|
|
|
|
| 99 |
|
| 100 |
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
|
| 101 |
|
| 102 |
+
# Properties to evaluate
|
| 103 |
REQUESTED_PROPERTIES = [
|
| 104 |
"density",
|
| 105 |
"glass transition",
|
|
|
|
| 193 |
print(f" {k}: ckpt={tuple(full_state[k].shape)} model={tuple(model_state[k].shape)}")
|
| 194 |
print("")
|
| 195 |
|
|
|
|
| 196 |
def find_property_columns(columns):
|
| 197 |
"""
|
| 198 |
Robust property column matching with guardrails:
|
|
|
|
| 877 |
psmiles_data = None
|
| 878 |
|
| 879 |
# ---------------------------------------------------------------------
|
| 880 |
+
# Fill defaults for missing modalities
|
| 881 |
# ---------------------------------------------------------------------
|
| 882 |
if gine_data is None:
|
| 883 |
gine_data = {
|
|
|
|
| 1061 |
|
| 1062 |
|
| 1063 |
# =============================================================================
|
| 1064 |
+
# Single-task regressor head
|
| 1065 |
# =============================================================================
|
| 1066 |
class PolyFPropertyRegressor(nn.Module):
|
| 1067 |
"""
|
|
|
|
| 1282 |
)
|
| 1283 |
|
| 1284 |
# -------------------------
|
| 1285 |
+
# Optional: load full multimodal checkpoint
|
| 1286 |
# -------------------------
|
| 1287 |
ckpt_path = os.path.join(pretrained_path, "pytorch_model.bin")
|
| 1288 |
if os.path.isfile(ckpt_path):
|
|
|
|
| 1357 |
def run_polyf_downstream(property_list: List[str], property_cols: List[str], df_raw: pd.DataFrame,
|
| 1358 |
pretrained_path: str, output_file: str):
|
| 1359 |
"""
|
| 1360 |
+
Downstream evaluation:
|
| 1361 |
|
| 1362 |
For each property:
|
| 1363 |
- Build samples from PolyInfo
|
|
|
|
| 1379 |
all_results = {"per_property": {}, "mode": "POLYF_MATCHED_SINGLE_TASK"}
|
| 1380 |
|
| 1381 |
for pname, pcol in zip(property_list, property_cols):
|
|
|
|
| 1382 |
samples = build_samples_for_property(df_proc, pcol)
|
| 1383 |
|
| 1384 |
print(f"[DATA] {pname}: n_samples={len(samples)}")
|