diff --git a/training_data/hemo-negative.npz b/training_data/hemo-negative.npz
index 37eac85dba8ee35d792caf5e71157166188ea6c4..ab11bf7dd8957596d82f583b291a1f94c924adce 100644
--- a/training_data/hemo-negative.npz
+++ b/training_data/hemo-negative.npz
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f83aad41f160deb6401bc0801bddc931488da6e1785749e6f72de6d0f154a37f
-size 109451
+oid sha256:bcf254803d7f5a809153007989de42b9ca9030c3a45b1f4048f40b289d010012
+size 11385064
diff --git a/training_data/hemo-positive.npz b/training_data/hemo-positive.npz
index 48c4df6ed11eef1cafb0ac10f7dd26e6256d94e0..287ec76b48b5b228d502beb96c096e943ec60a95 100644
--- a/training_data/hemo-positive.npz
+++ b/training_data/hemo-positive.npz
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:96cb24d5a7617f7e211cd48d2b0b424a46affa95716b96058058902068068d27
-size 27840
+oid sha256:c4dd8a83ebf887e285bd5d10e3cee919452d8ddf97463e02f52ee51789aebb41
+size 2775784
diff --git a/training_data/nf-negative.npz b/training_data/nf-negative.npz
index 5cda7af5e41bbfedbbac0eeb3fd25b72f69e509d..8fa50477c360227708a42c2bc355003258ae7974 100644
--- a/training_data/nf-negative.npz
+++ b/training_data/nf-negative.npz
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e506e52e101308dd3882ca6bd45833a6e0837f9f240aa85d575c2a41e305b854
-size 21845190
+oid sha256:697db57ca3cf2366caabc000b69019e3b84fe88fea5da98de3fd75b3c9920aeb
+size 21736264
diff --git a/training_data/nf-positive.npz b/training_data/nf-positive.npz
index 85b64212de7042581621d0cdd6f16335df8eb54a..cbbb9240f4296497e28c7ea1da773dc7867921d0 100644
--- a/training_data/nf-positive.npz
+++ b/training_data/nf-positive.npz
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:78caae183fe840b145275d9486a3f94a963989deb9d55a57995653bf1d497bf2
-size 41326
+oid sha256:2d49dea7969a0e408dfede599f165746bb45c83dc5bac1cc1a7d14e32de13406
+size 5760264
diff --git a/training_data_cleaned/hemolysis/hemo_meta_with_split.csv b/training_data_cleaned/hemolysis/hemo_meta_with_split.csv
new file mode 100644
index 0000000000000000000000000000000000000000..2ef1aac90abd02b1e0b2744569bfbafde4965a74
--- /dev/null
+++ b/training_data_cleaned/hemolysis/hemo_meta_with_split.csv
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d0326835d831524088e84ab86b1555cac366219fb5982fca7ac9ddddfc43b1b0
+size 233220
diff --git a/training_data_cleaned/hemolysis/hemo_smiles_meta_with_split.csv b/training_data_cleaned/hemolysis/hemo_smiles_meta_with_split.csv
new file mode 100644
index 0000000000000000000000000000000000000000..a238bb0dbf69175c4b5454875f91d2bed18648ad
--- /dev/null
+++ b/training_data_cleaned/hemolysis/hemo_smiles_meta_with_split.csv
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7e521d4f4344bebdce1b5aa57f9e7fb1c6dc848319cb980baec38574573f079f
+size 4726077
diff --git a/training_data_cleaned/hemolysis/hemo_smiles_with_embeddings/dataset_dict.json b/training_data_cleaned/hemolysis/hemo_smiles_with_embeddings/dataset_dict.json
new file mode 100644
index 0000000000000000000000000000000000000000..eda4a18f1db7243cbbde261db68cf05ae54dae74
--- /dev/null
+++ b/training_data_cleaned/hemolysis/hemo_smiles_with_embeddings/dataset_dict.json
@@ -0,0 +1 @@
+{"splits": ["train", "val"]}
\ No newline at end of file
diff --git a/training_data_cleaned/hemolysis/hemo_smiles_with_embeddings/train/data-00000-of-00001.arrow b/training_data_cleaned/hemolysis/hemo_smiles_with_embeddings/train/data-00000-of-00001.arrow
new file mode 100644
index 0000000000000000000000000000000000000000..e8b754a884498fb161ba5373c0cbcd52feb79e4b
--- /dev/null
+++ b/training_data_cleaned/hemolysis/hemo_smiles_with_embeddings/train/data-00000-of-00001.arrow
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1af97dd02933072c0522101cb6c382703093fcdeee3185e509c6edd6fc070b8a
+size 16876472
diff --git a/training_data_cleaned/hemolysis/hemo_smiles_with_embeddings/train/dataset_info.json b/training_data_cleaned/hemolysis/hemo_smiles_with_embeddings/train/dataset_info.json
new file mode 100644
index 0000000000000000000000000000000000000000..3b1be902ba0361c345a756bbe936d9ca5c70958d
--- /dev/null
+++ b/training_data_cleaned/hemolysis/hemo_smiles_with_embeddings/train/dataset_info.json
@@ -0,0 +1,23 @@
+{
+  "citation": "",
+  "description": "",
+  "features": {
+    "sequence": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "embedding": {
+      "feature": {
+        "dtype": "float32",
+        "_type": "Value"
+      },
+      "_type": "List"
+    },
+    "label": {
+      "dtype": "int64",
+      "_type": "Value"
+    }
+  },
+  "homepage": "",
+  "license": ""
+}
\ No newline at end of file
diff --git a/training_data_cleaned/hemolysis/hemo_smiles_with_embeddings/train/state.json b/training_data_cleaned/hemolysis/hemo_smiles_with_embeddings/train/state.json
new file mode 100644
index 0000000000000000000000000000000000000000..6beac364a8eca17bd45678af8505350ddb5c25ed
--- /dev/null
+++ b/training_data_cleaned/hemolysis/hemo_smiles_with_embeddings/train/state.json
@@ -0,0 +1,13 @@
+{
+  "_data_files": [
+    {
+      "filename": "data-00000-of-00001.arrow"
+    }
+  ],
+  "_fingerprint": "48acc3da44ca47b8",
+  "_format_columns": null,
+  "_format_kwargs": {},
+  "_format_type": null,
+  "_output_all_columns": false,
+  "_split": null
+}
\ No newline at end of file
diff --git a/training_data_cleaned/hemolysis/hemo_smiles_with_embeddings/val/data-00000-of-00001.arrow b/training_data_cleaned/hemolysis/hemo_smiles_with_embeddings/val/data-00000-of-00001.arrow
new file mode 100644
index 0000000000000000000000000000000000000000..185a1471e5832c5d3fe0a734d602c48fbff761b9
--- /dev/null
+++ b/training_data_cleaned/hemolysis/hemo_smiles_with_embeddings/val/data-00000-of-00001.arrow
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:40cb576c6993d53b26152f4cd954ba372c2b7817811ee4da9513036e9d2cc573
+size 4157120
diff --git a/training_data_cleaned/hemolysis/hemo_smiles_with_embeddings/val/dataset_info.json b/training_data_cleaned/hemolysis/hemo_smiles_with_embeddings/val/dataset_info.json
new file mode 100644
index 0000000000000000000000000000000000000000..3b1be902ba0361c345a756bbe936d9ca5c70958d
--- /dev/null
+++ b/training_data_cleaned/hemolysis/hemo_smiles_with_embeddings/val/dataset_info.json
@@ -0,0 +1,23 @@
+{
+  "citation": "",
+  "description": "",
+  "features": {
+    "sequence": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "embedding": {
+      "feature": {
+        "dtype": "float32",
+        "_type": "Value"
+      },
+      "_type": "List"
+    },
+    "label": {
+      "dtype": "int64",
+      "_type": "Value"
+    }
+  },
+  "homepage": "",
+  "license": ""
+}
\ No newline at end of file
diff --git a/training_data_cleaned/hemolysis/hemo_smiles_with_embeddings/val/state.json b/training_data_cleaned/hemolysis/hemo_smiles_with_embeddings/val/state.json
new file mode 100644
index 0000000000000000000000000000000000000000..932725dad0f4e809bc99396cea52f603167d9cdb
--- /dev/null
+++ b/training_data_cleaned/hemolysis/hemo_smiles_with_embeddings/val/state.json
@@ -0,0 +1,13 @@
+{
+  "_data_files": [
+    {
+      "filename": "data-00000-of-00001.arrow"
+    }
+  ],
+  "_fingerprint": "fd2db53d34e0b66a",
+  "_format_columns": null,
+  "_format_kwargs": {},
+  "_format_type": null,
+  "_output_all_columns": false,
+  "_split": null
+}
\ No newline at end of file
diff --git a/training_data_cleaned/hemolysis/hemo_wt_with_embeddings/dataset_dict.json b/training_data_cleaned/hemolysis/hemo_wt_with_embeddings/dataset_dict.json
new file mode 100644
index 0000000000000000000000000000000000000000..eda4a18f1db7243cbbde261db68cf05ae54dae74
--- /dev/null
+++ b/training_data_cleaned/hemolysis/hemo_wt_with_embeddings/dataset_dict.json
@@ -0,0 +1 @@
+{"splits": ["train", "val"]}
\ No newline at end of file
diff --git a/training_data_cleaned/hemolysis/hemo_wt_with_embeddings/train/data-00000-of-00001.arrow b/training_data_cleaned/hemolysis/hemo_wt_with_embeddings/train/data-00000-of-00001.arrow
new file mode 100644
index 0000000000000000000000000000000000000000..037847164e0782f3b3fa8cd6da2ad67b189e324a
--- /dev/null
+++ b/training_data_cleaned/hemolysis/hemo_wt_with_embeddings/train/data-00000-of-00001.arrow
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0b790e2b015c19f7e0af6b8c412543677d3cdb792591952c98717f5a38eb62fc
+size 25054912
diff --git a/training_data_cleaned/hemolysis/hemo_wt_with_embeddings/train/dataset_info.json b/training_data_cleaned/hemolysis/hemo_wt_with_embeddings/train/dataset_info.json
new file mode 100644
index 0000000000000000000000000000000000000000..3b1be902ba0361c345a756bbe936d9ca5c70958d
--- /dev/null
+++ b/training_data_cleaned/hemolysis/hemo_wt_with_embeddings/train/dataset_info.json
@@ -0,0 +1,23 @@
+{
+  "citation": "",
+  "description": "",
+  "features": {
+    "sequence": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "embedding": {
+      "feature": {
+        "dtype": "float32",
+        "_type": "Value"
+      },
+      "_type": "List"
+    },
+    "label": {
+      "dtype": "int64",
+      "_type": "Value"
+    }
+  },
+  "homepage": "",
+  "license": ""
+}
\ No newline at end of file
diff --git a/training_data_cleaned/hemolysis/hemo_wt_with_embeddings/train/state.json b/training_data_cleaned/hemolysis/hemo_wt_with_embeddings/train/state.json
new file mode 100644
index 0000000000000000000000000000000000000000..3d5346bae2aaab83391c2ef1d33546fb59950486
--- /dev/null
+++ b/training_data_cleaned/hemolysis/hemo_wt_with_embeddings/train/state.json
@@ -0,0 +1,13 @@
+{
+  "_data_files": [
+    {
+      "filename": "data-00000-of-00001.arrow"
+    }
+  ],
+  "_fingerprint": "e02b995aa75a9a40",
+  "_format_columns": null,
+  "_format_kwargs": {},
+  "_format_type": null,
+  "_output_all_columns": false,
+  "_split": null
+}
\ No newline at end of file
diff --git a/training_data_cleaned/hemolysis/hemo_wt_with_embeddings/val/data-00000-of-00001.arrow b/training_data_cleaned/hemolysis/hemo_wt_with_embeddings/val/data-00000-of-00001.arrow
new file mode 100644
index 0000000000000000000000000000000000000000..2a84a4b1b1776a1eac599361fbeaa51a98c6ae00
--- /dev/null
+++ b/training_data_cleaned/hemolysis/hemo_wt_with_embeddings/val/data-00000-of-00001.arrow
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:29ace9ae3f6248785f08e15fff28be45e2d4832aaf929008470b055d60b3a523
+size 6268920
diff --git a/training_data_cleaned/hemolysis/hemo_wt_with_embeddings/val/dataset_info.json b/training_data_cleaned/hemolysis/hemo_wt_with_embeddings/val/dataset_info.json
new file mode 100644
index 0000000000000000000000000000000000000000..3b1be902ba0361c345a756bbe936d9ca5c70958d
--- /dev/null
+++ b/training_data_cleaned/hemolysis/hemo_wt_with_embeddings/val/dataset_info.json
@@ -0,0 +1,23 @@
+{
+  "citation": "",
+  "description": "",
+  "features": {
+    "sequence": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "embedding": {
+      "feature": {
+        "dtype": "float32",
+        "_type": "Value"
+      },
+      "_type": "List"
+    },
+    "label": {
+      "dtype": "int64",
+      "_type": "Value"
+    }
+  },
+  "homepage": "",
+  "license": ""
+}
\ No newline at end of file
diff --git a/training_data_cleaned/hemolysis/hemo_wt_with_embeddings/val/state.json b/training_data_cleaned/hemolysis/hemo_wt_with_embeddings/val/state.json
new file mode 100644
index 0000000000000000000000000000000000000000..ef986403e8b8db21e99972c3d114fc38542dec1a
--- /dev/null
+++ b/training_data_cleaned/hemolysis/hemo_wt_with_embeddings/val/state.json
@@ -0,0 +1,13 @@
+{
+  "_data_files": [
+    {
+      "filename": "data-00000-of-00001.arrow"
+    }
+  ],
+  "_fingerprint": "b7d24c190523afa3",
+  "_format_columns": null,
+  "_format_kwargs": {},
+  "_format_type": null,
+  "_output_all_columns": false,
+  "_split": null
+}
\ No newline at end of file
diff --git a/training_data_cleaned/nf/nf_meta_with_split.csv b/training_data_cleaned/nf/nf_meta_with_split.csv
new file mode 100644
index 0000000000000000000000000000000000000000..f5998324773a84fe2140d1d66d32062e93378f11
--- /dev/null
+++ b/training_data_cleaned/nf/nf_meta_with_split.csv
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bcce644701612db54d5f3505dce201c351234837e3685739b2fde68d53c8cf5e
+size 1756049
diff --git a/training_data_cleaned/nf/nf_smiles_meta_with_split.csv b/training_data_cleaned/nf/nf_smiles_meta_with_split.csv
new file mode 100644
index 0000000000000000000000000000000000000000..c73a0c9da4a733889c4f525e041749d666e98e4a
--- /dev/null
+++ b/training_data_cleaned/nf/nf_smiles_meta_with_split.csv
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e36214670d615dca1a48df6500c203707cdbf31c88261bf719f7d7c1eabc201c
+size 57456921
diff --git a/training_data_cleaned/nf/nf_smiles_with_embeddings/dataset_dict.json b/training_data_cleaned/nf/nf_smiles_with_embeddings/dataset_dict.json
new file mode 100644
index 0000000000000000000000000000000000000000..eda4a18f1db7243cbbde261db68cf05ae54dae74
--- /dev/null
+++ b/training_data_cleaned/nf/nf_smiles_with_embeddings/dataset_dict.json
@@ -0,0 +1 @@
+{"splits": ["train", "val"]}
\ No newline at end of file
diff --git a/training_data_cleaned/nf/nf_smiles_with_embeddings/train/data-00000-of-00001.arrow b/training_data_cleaned/nf/nf_smiles_with_embeddings/train/data-00000-of-00001.arrow
new file mode 100644
index 0000000000000000000000000000000000000000..dbc1f5c4a995bd2ef9ee1b052c2c51d1332b7d0b
--- /dev/null
+++ b/training_data_cleaned/nf/nf_smiles_with_embeddings/train/data-00000-of-00001.arrow
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3d682c81cf7d3c0ce5f1f11ebf2b097d7f100f2d2a78a8ad8a40b1f38b94cc5a
+size 23581248
diff --git a/training_data_cleaned/nf/nf_smiles_with_embeddings/train/dataset_info.json b/training_data_cleaned/nf/nf_smiles_with_embeddings/train/dataset_info.json
new file mode 100644
index 0000000000000000000000000000000000000000..3b1be902ba0361c345a756bbe936d9ca5c70958d
--- /dev/null
+++ b/training_data_cleaned/nf/nf_smiles_with_embeddings/train/dataset_info.json
@@ -0,0 +1,23 @@
+{
+  "citation": "",
+  "description": "",
+  "features": {
+    "sequence": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "embedding": {
+      "feature": {
+        "dtype": "float32",
+        "_type": "Value"
+      },
+      "_type": "List"
+    },
+    "label": {
+      "dtype": "int64",
+      "_type": "Value"
+    }
+  },
+  "homepage": "",
+  "license": ""
+}
\ No newline at end of file
diff --git a/training_data_cleaned/nf/nf_smiles_with_embeddings/train/state.json b/training_data_cleaned/nf/nf_smiles_with_embeddings/train/state.json
new file mode 100644
index 0000000000000000000000000000000000000000..b1fbe3485974ee37405b23433f870451ee82c384
--- /dev/null
+++ b/training_data_cleaned/nf/nf_smiles_with_embeddings/train/state.json
@@ -0,0 +1,13 @@
+{
+  "_data_files": [
+    {
+      "filename": "data-00000-of-00001.arrow"
+    }
+  ],
+  "_fingerprint": "fb6f5e1e2e124220",
+  "_format_columns": null,
+  "_format_kwargs": {},
+  "_format_type": null,
+  "_output_all_columns": false,
+  "_split": null
+}
\ No newline at end of file
diff --git a/training_data_cleaned/nf/nf_smiles_with_embeddings/val/data-00000-of-00001.arrow b/training_data_cleaned/nf/nf_smiles_with_embeddings/val/data-00000-of-00001.arrow
new file mode 100644
index 0000000000000000000000000000000000000000..de2a336a5f9fb4e27a595ff5059f287768f7ece5
--- /dev/null
+++ b/training_data_cleaned/nf/nf_smiles_with_embeddings/val/data-00000-of-00001.arrow
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cd5724627360561f961366f3c57b8de1ed0f6187bbf670bd3a4254da1d0ba571
+size 57618824
diff --git a/training_data_cleaned/nf/nf_smiles_with_embeddings/val/dataset_info.json b/training_data_cleaned/nf/nf_smiles_with_embeddings/val/dataset_info.json
new file mode 100644
index 0000000000000000000000000000000000000000..3b1be902ba0361c345a756bbe936d9ca5c70958d
--- /dev/null
+++ b/training_data_cleaned/nf/nf_smiles_with_embeddings/val/dataset_info.json
@@ -0,0 +1,23 @@
+{
+  "citation": "",
+  "description": "",
+  "features": {
+    "sequence": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "embedding": {
+      "feature": {
+        "dtype": "float32",
+        "_type": "Value"
+      },
+      "_type": "List"
+    },
+    "label": {
+      "dtype": "int64",
+      "_type": "Value"
+    }
+  },
+  "homepage": "",
+  "license": ""
+}
\ No newline at end of file
diff --git a/training_data_cleaned/nf/nf_smiles_with_embeddings/val/state.json b/training_data_cleaned/nf/nf_smiles_with_embeddings/val/state.json
new file mode 100644
index 0000000000000000000000000000000000000000..ef50afcca563c66a099c0edb43bb31563b397f4e
--- /dev/null
+++ b/training_data_cleaned/nf/nf_smiles_with_embeddings/val/state.json
@@ -0,0 +1,13 @@
+{
+  "_data_files": [
+    {
+      "filename": "data-00000-of-00001.arrow"
+    }
+  ],
+  "_fingerprint": "51b64b0e80ee5ffd",
+  "_format_columns": null,
+  "_format_kwargs": {},
+  "_format_type": null,
+  "_output_all_columns": false,
+  "_split": null
+}
\ No newline at end of file
diff --git a/training_data_cleaned/nf/nf_wt_with_embeddings/dataset_dict.json b/training_data_cleaned/nf/nf_wt_with_embeddings/dataset_dict.json
new file mode 100644
index 0000000000000000000000000000000000000000..eda4a18f1db7243cbbde261db68cf05ae54dae74
--- /dev/null
+++ b/training_data_cleaned/nf/nf_wt_with_embeddings/dataset_dict.json
@@ -0,0 +1 @@
+{"splits": ["train", "val"]}
\ No newline at end of file
diff --git a/training_data_cleaned/nf/nf_wt_with_embeddings/train/data-00000-of-00001.arrow b/training_data_cleaned/nf/nf_wt_with_embeddings/train/data-00000-of-00001.arrow
new file mode 100644
index 0000000000000000000000000000000000000000..f7c83929acb35a7c40d150f417eae807a7a6dabd
--- /dev/null
+++ b/training_data_cleaned/nf/nf_wt_with_embeddings/train/data-00000-of-00001.arrow
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4f8f13aa3f2a7dadafc529956ed192be52d9aa20e592ae9f99ee298f6f22748f
+size 71732104
diff --git a/training_data_cleaned/nf/nf_wt_with_embeddings/train/dataset_info.json b/training_data_cleaned/nf/nf_wt_with_embeddings/train/dataset_info.json
new file mode 100644
index 0000000000000000000000000000000000000000..3b1be902ba0361c345a756bbe936d9ca5c70958d
--- /dev/null
+++ b/training_data_cleaned/nf/nf_wt_with_embeddings/train/dataset_info.json
@@ -0,0 +1,23 @@
+{
+  "citation": "",
+  "description": "",
+  "features": {
+    "sequence": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "embedding": {
+      "feature": {
+        "dtype": "float32",
+        "_type": "Value"
+      },
+      "_type": "List"
+    },
+    "label": {
+      "dtype": "int64",
+      "_type": "Value"
+    }
+  },
+  "homepage": "",
+  "license": ""
+}
\ No newline at end of file
diff --git a/training_data_cleaned/nf/nf_wt_with_embeddings/train/state.json b/training_data_cleaned/nf/nf_wt_with_embeddings/train/state.json
new file mode 100644
index 0000000000000000000000000000000000000000..6101c208c233c64e95405713907965e7911de24f
--- /dev/null
+++ b/training_data_cleaned/nf/nf_wt_with_embeddings/train/state.json
@@ -0,0 +1,13 @@
+{
+  "_data_files": [
+    {
+      "filename": "data-00000-of-00001.arrow"
+    }
+  ],
+  "_fingerprint": "7e9e61eb2e38bf25",
+  "_format_columns": null,
+  "_format_kwargs": {},
+  "_format_type": null,
+  "_output_all_columns": false,
+  "_split": null
+}
\ No newline at end of file
diff --git a/training_data_cleaned/nf/nf_wt_with_embeddings/val/data-00000-of-00001.arrow b/training_data_cleaned/nf/nf_wt_with_embeddings/val/data-00000-of-00001.arrow
new file mode 100644
index 0000000000000000000000000000000000000000..98b574e378f33d7dff6336dcdaebd76ee01d8fd7
--- /dev/null
+++ b/training_data_cleaned/nf/nf_wt_with_embeddings/val/data-00000-of-00001.arrow
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:480c6294956397a18619267bc07d880e29e96532fe9a9618a052dec3969b46cc
+size 17930608
diff --git a/training_data_cleaned/nf/nf_wt_with_embeddings/val/dataset_info.json b/training_data_cleaned/nf/nf_wt_with_embeddings/val/dataset_info.json
new file mode 100644
index 0000000000000000000000000000000000000000..3b1be902ba0361c345a756bbe936d9ca5c70958d
--- /dev/null
+++ b/training_data_cleaned/nf/nf_wt_with_embeddings/val/dataset_info.json
@@ -0,0 +1,23 @@
+{
+  "citation": "",
+  "description": "",
+  "features": {
+    "sequence": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "embedding": {
+      "feature": {
+        "dtype": "float32",
+        "_type": "Value"
+      },
+      "_type": "List"
+    },
+    "label": {
+      "dtype": "int64",
+      "_type": "Value"
+    }
+  },
+  "homepage": "",
+  "license": ""
+}
\ No newline at end of file
diff --git a/training_data_cleaned/nf/nf_wt_with_embeddings/val/state.json b/training_data_cleaned/nf/nf_wt_with_embeddings/val/state.json
new file mode 100644
index 0000000000000000000000000000000000000000..3122025540619145b8798a7c3653fdb364183e85
--- /dev/null
+++ b/training_data_cleaned/nf/nf_wt_with_embeddings/val/state.json
@@ -0,0 +1,13 @@
+{
+  "_data_files": [
+    {
+      "filename": "data-00000-of-00001.arrow"
+    }
+  ],
+  "_fingerprint": "52fcbc4e0da87fa3",
+  "_format_columns": null,
+  "_format_kwargs": {},
+  "_format_type": null,
+  "_output_all_columns": false,
+  "_split": null
+}
\ No newline at end of file
diff --git a/training_data_cleaned/permeability_caco2/caco2_meta_with_split.csv b/training_data_cleaned/permeability_caco2/caco2_meta_with_split.csv
new file mode 100644
index 0000000000000000000000000000000000000000..143791dc98561c9bce408089b72330fffab6cea7
--- /dev/null
+++ b/training_data_cleaned/permeability_caco2/caco2_meta_with_split.csv
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fa8f0fb32da50e69eafd3e585d68c7876710951fc54c20dd85c2501745dbb38c
+size 233334
diff --git a/training_data_cleaned/permeability_caco2/caco2_smiles_with_embeddings/dataset_dict.json b/training_data_cleaned/permeability_caco2/caco2_smiles_with_embeddings/dataset_dict.json
new file mode 100644
index 0000000000000000000000000000000000000000..eda4a18f1db7243cbbde261db68cf05ae54dae74
--- /dev/null
+++ b/training_data_cleaned/permeability_caco2/caco2_smiles_with_embeddings/dataset_dict.json
@@ -0,0 +1 @@
+{"splits": ["train", "val"]}
\ No newline at end of file
diff --git a/training_data_cleaned/permeability_caco2/caco2_smiles_with_embeddings/train/data-00000-of-00001.arrow b/training_data_cleaned/permeability_caco2/caco2_smiles_with_embeddings/train/data-00000-of-00001.arrow
new file mode 100644
index 0000000000000000000000000000000000000000..6ec6af6c7c53e5412634388fbcc1a8e9fe95a97c
--- /dev/null
+++ b/training_data_cleaned/permeability_caco2/caco2_smiles_with_embeddings/train/data-00000-of-00001.arrow
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b1ee6d2739e733de4465bc3b3229614c02efd9397c10178b2aac1509a9878b68
+size 1592344
diff --git a/training_data_cleaned/permeability_caco2/caco2_smiles_with_embeddings/train/dataset_info.json b/training_data_cleaned/permeability_caco2/caco2_smiles_with_embeddings/train/dataset_info.json
new file mode 100644
index 0000000000000000000000000000000000000000..8dda6fb9a59209053d809c5c6d31003dd524f076
--- /dev/null
+++ b/training_data_cleaned/permeability_caco2/caco2_smiles_with_embeddings/train/dataset_info.json
@@ -0,0 +1,23 @@
+{
+  "citation": "",
+  "description": "",
+  "features": {
+    "sequence": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "embedding": {
+      "feature": {
+        "dtype": "float32",
+        "_type": "Value"
+      },
+      "_type": "List"
+    },
+    "label": {
+      "dtype": "float64",
+      "_type": "Value"
+    }
+  },
+  "homepage": "",
+  "license": ""
+}
\ No newline at end of file
diff --git a/training_data_cleaned/permeability_caco2/caco2_smiles_with_embeddings/train/state.json b/training_data_cleaned/permeability_caco2/caco2_smiles_with_embeddings/train/state.json
new file mode 100644
index 0000000000000000000000000000000000000000..269e3dff3362f68f4bb87865311fe8df66816552
--- /dev/null
+++ b/training_data_cleaned/permeability_caco2/caco2_smiles_with_embeddings/train/state.json
@@ -0,0 +1,13 @@
+{
+  "_data_files": [
+    {
+      "filename": "data-00000-of-00001.arrow"
+    }
+  ],
+  "_fingerprint": "b67b8e734ab59271",
+  "_format_columns": null,
+  "_format_kwargs": {},
+  "_format_type": null,
+  "_output_all_columns": false,
+  "_split": null
+}
\ No newline at end of file
diff --git a/training_data_cleaned/permeability_caco2/caco2_smiles_with_embeddings/val/data-00000-of-00001.arrow b/training_data_cleaned/permeability_caco2/caco2_smiles_with_embeddings/val/data-00000-of-00001.arrow
new file mode 100644
index 0000000000000000000000000000000000000000..e92d555bb3a254652ccf6c8ee39db895169a50c5
--- /dev/null
+++ b/training_data_cleaned/permeability_caco2/caco2_smiles_with_embeddings/val/data-00000-of-00001.arrow
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e534dfe6744d9fbb89ecea63e54aeac9fcd319dd018f0c7939dfad856eb9599d
+size 392528
diff --git a/training_data_cleaned/permeability_caco2/caco2_smiles_with_embeddings/val/dataset_info.json b/training_data_cleaned/permeability_caco2/caco2_smiles_with_embeddings/val/dataset_info.json
new file mode 100644
index 0000000000000000000000000000000000000000..8dda6fb9a59209053d809c5c6d31003dd524f076
--- /dev/null
+++ b/training_data_cleaned/permeability_caco2/caco2_smiles_with_embeddings/val/dataset_info.json
@@ -0,0 +1,23 @@
+{
+  "citation": "",
+  "description": "",
+  "features": {
+    "sequence": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "embedding": {
+      "feature": {
+        "dtype": "float32",
+        "_type": "Value"
+      },
+      "_type": "List"
+    },
+    "label": {
+      "dtype": "float64",
+      "_type": "Value"
+    }
+  },
+  "homepage": "",
+  "license": ""
+}
\ No newline at end of file
diff --git a/training_data_cleaned/permeability_caco2/caco2_smiles_with_embeddings/val/state.json b/training_data_cleaned/permeability_caco2/caco2_smiles_with_embeddings/val/state.json
new file mode 100644
index 0000000000000000000000000000000000000000..c432ac0f9ae055ad919eaae1c264d81e02a68ab7
--- /dev/null
+++ b/training_data_cleaned/permeability_caco2/caco2_smiles_with_embeddings/val/state.json
@@ -0,0 +1,13 @@
+{
+  "_data_files": [
+    {
+      "filename": "data-00000-of-00001.arrow"
+    }
+  ],
+  "_fingerprint": "c5abdd50b2a6a84c",
+  "_format_columns": null,
+  "_format_kwargs": {},
+  "_format_type": null,
+  "_output_all_columns": false,
+  "_split": null
+}
\ No newline at end of file
diff --git a/training_data_cleaned/permeability_pampa/pampa_meta_with_split.csv b/training_data_cleaned/permeability_pampa/pampa_meta_with_split.csv
new file mode 100644
index 0000000000000000000000000000000000000000..f2195b98d28b57e61360600e625a223118798544
--- /dev/null
+++ b/training_data_cleaned/permeability_pampa/pampa_meta_with_split.csv
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d04d3767f03a4846003f404db6d03f8392ef9ad73830546064769beded3cfa80
+size 2180806
diff --git a/training_data_cleaned/permeability_pampa/pampa_smiles_with_embeddings/dataset_dict.json b/training_data_cleaned/permeability_pampa/pampa_smiles_with_embeddings/dataset_dict.json
new file mode 100644
index 0000000000000000000000000000000000000000..eda4a18f1db7243cbbde261db68cf05ae54dae74
--- /dev/null
+++ b/training_data_cleaned/permeability_pampa/pampa_smiles_with_embeddings/dataset_dict.json
@@ -0,0 +1 @@
+{"splits": ["train", "val"]}
\ No newline at end of file
diff --git a/training_data_cleaned/permeability_pampa/pampa_smiles_with_embeddings/train/data-00000-of-00001.arrow b/training_data_cleaned/permeability_pampa/pampa_smiles_with_embeddings/train/data-00000-of-00001.arrow
new file mode 100644
index 0000000000000000000000000000000000000000..0eecb0606fb050facdc5dce23d77b2b1c8da83bb
--- /dev/null
+++ b/training_data_cleaned/permeability_pampa/pampa_smiles_with_embeddings/train/data-00000-of-00001.arrow
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ac8d9409bc3e1f8ea60bf13881e59b431b494093d8cb211ecad75d9940ee9957
+size 16838472
diff --git a/training_data_cleaned/permeability_pampa/pampa_smiles_with_embeddings/train/dataset_info.json b/training_data_cleaned/permeability_pampa/pampa_smiles_with_embeddings/train/dataset_info.json
new file mode 100644
index 0000000000000000000000000000000000000000..8dda6fb9a59209053d809c5c6d31003dd524f076
--- /dev/null
+++ b/training_data_cleaned/permeability_pampa/pampa_smiles_with_embeddings/train/dataset_info.json
@@ -0,0 +1,23 @@
+{
+  "citation": "",
+  "description": "",
+  "features": {
+    "sequence": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "embedding": {
+      "feature": {
+        "dtype": "float32",
+        "_type": "Value"
+      },
+      "_type": "List"
+    },
+    "label": {
+      "dtype": "float64",
+      "_type": "Value"
+    }
+  },
+  "homepage": "",
+  "license": ""
+}
\ No newline at end of file
diff --git a/training_data_cleaned/permeability_pampa/pampa_smiles_with_embeddings/train/state.json b/training_data_cleaned/permeability_pampa/pampa_smiles_with_embeddings/train/state.json
new file mode 100644
index 0000000000000000000000000000000000000000..97112e0af0e587f25e8db709341d4738e31ec573
--- /dev/null
+++ b/training_data_cleaned/permeability_pampa/pampa_smiles_with_embeddings/train/state.json
@@ -0,0 +1,13 @@
+{
+  "_data_files": [
+    {
+      "filename": "data-00000-of-00001.arrow"
+    }
+  ],
+  "_fingerprint": "b3fc67db512e6dff",
+  "_format_columns": null,
+  "_format_kwargs": {},
+  "_format_type": null,
+  "_output_all_columns": false,
+  "_split": null
+}
\ No newline at end of file
diff --git a/training_data_cleaned/permeability_pampa/pampa_smiles_with_embeddings/val/data-00000-of-00001.arrow b/training_data_cleaned/permeability_pampa/pampa_smiles_with_embeddings/val/data-00000-of-00001.arrow
new file mode 100644
index 0000000000000000000000000000000000000000..0460ea1b5978f1823de87aa3c3011b2f53d7d585
--- /dev/null
+++ b/training_data_cleaned/permeability_pampa/pampa_smiles_with_embeddings/val/data-00000-of-00001.arrow
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:93ce6ce347c39cc97353985efbd9d57a18fa61c0cec0c258293008d06d04a1fe
+size 5412880
diff --git a/training_data_cleaned/permeability_pampa/pampa_smiles_with_embeddings/val/dataset_info.json b/training_data_cleaned/permeability_pampa/pampa_smiles_with_embeddings/val/dataset_info.json
new file mode 100644
index 0000000000000000000000000000000000000000..8dda6fb9a59209053d809c5c6d31003dd524f076
--- /dev/null
+++ b/training_data_cleaned/permeability_pampa/pampa_smiles_with_embeddings/val/dataset_info.json
@@ -0,0 +1,23 @@
+{
+  "citation": "",
+  "description": "",
+  "features": {
+    "sequence": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "embedding": {
+      "feature": {
+        "dtype": "float32",
+        "_type": "Value"
+      },
+      "_type": "List"
+    },
+    "label": {
+      "dtype": "float64",
+      "_type": "Value"
+    }
+  },
+  "homepage": "",
+  "license": ""
+}
\ No newline at end of file
diff --git a/training_data_cleaned/permeability_pampa/pampa_smiles_with_embeddings/val/state.json b/training_data_cleaned/permeability_pampa/pampa_smiles_with_embeddings/val/state.json
new file mode 100644
index 0000000000000000000000000000000000000000..891a4598c88c5d2a2d3d011db84f43720724de1d
--- /dev/null
+++ b/training_data_cleaned/permeability_pampa/pampa_smiles_with_embeddings/val/state.json
@@ -0,0 +1,13 @@
+{
+  "_data_files": [
+    {
+      "filename": "data-00000-of-00001.arrow"
+    }
+  ],
+  "_fingerprint": "10810b5ed6df45a9",
+  "_format_columns": null,
+  "_format_kwargs": {},
+  "_format_type": null,
+  "_output_all_columns": false,
+  "_split": null
+}
\ No newline at end of file
diff --git a/training_data_cleaned/solubility/sol_meta_with_split.csv b/training_data_cleaned/solubility/sol_meta_with_split.csv
new file mode 100644
index 0000000000000000000000000000000000000000..443d94951ed5ae87385aae8fe231d7049b87c0bf
--- /dev/null
+++ b/training_data_cleaned/solubility/sol_meta_with_split.csv
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fc9621a30eff95c3345e825557437befbb021d58bd608d71a6e47fdb03ccdcc3
+size 2928410
diff --git a/training_data_cleaned/solubility/sol_wt_with_embeddings/dataset_dict.json b/training_data_cleaned/solubility/sol_wt_with_embeddings/dataset_dict.json
new file mode 100644
index 0000000000000000000000000000000000000000..eda4a18f1db7243cbbde261db68cf05ae54dae74
--- /dev/null
+++ b/training_data_cleaned/solubility/sol_wt_with_embeddings/dataset_dict.json
@@ -0,0 +1 @@
+{"splits": ["train", "val"]}
\ No newline at end of file
diff --git a/training_data_cleaned/solubility/sol_wt_with_embeddings/train/data-00000-of-00001.arrow b/training_data_cleaned/solubility/sol_wt_with_embeddings/train/data-00000-of-00001.arrow
new file mode 100644
index 0000000000000000000000000000000000000000..1919b3768016e406bc821196215b8b130a36393d
--- /dev/null
+++ b/training_data_cleaned/solubility/sol_wt_with_embeddings/train/data-00000-of-00001.arrow
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c98ec7e6f1d0fd673fbfcc17bddc296163c84f932c90d767002214004749fe8c
+size 77876848
diff --git a/training_data_cleaned/solubility/sol_wt_with_embeddings/train/dataset_info.json b/training_data_cleaned/solubility/sol_wt_with_embeddings/train/dataset_info.json
new file mode 100644
index 0000000000000000000000000000000000000000..3b1be902ba0361c345a756bbe936d9ca5c70958d
--- /dev/null
+++ b/training_data_cleaned/solubility/sol_wt_with_embeddings/train/dataset_info.json
@@ -0,0 +1,23 @@
+{
+  "citation": "",
+  "description": "",
+  "features": {
+    "sequence": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "embedding": {
+      "feature": {
+        "dtype": "float32",
+        "_type": "Value"
+      },
+      "_type": "List"
+    },
+    "label": {
+      "dtype": "int64",
+      "_type": "Value"
+    }
+  },
+  "homepage": "",
+  "license": ""
+}
\ No newline at end of file
diff --git a/training_data_cleaned/solubility/sol_wt_with_embeddings/train/state.json b/training_data_cleaned/solubility/sol_wt_with_embeddings/train/state.json
new file mode 100644
index 0000000000000000000000000000000000000000..69f1a5d8ad6bccf805a6d4844c1fb7f0c822d873
--- /dev/null
+++ b/training_data_cleaned/solubility/sol_wt_with_embeddings/train/state.json
@@ -0,0 +1,13 @@
+{
+  "_data_files": [
+    {
+      "filename": "data-00000-of-00001.arrow"
+    }
+  ],
+  "_fingerprint": "2e7260d06cdf4b0a",
+  "_format_columns": null,
+  "_format_kwargs": {},
+  "_format_type": null,
+  "_output_all_columns": false,
+  "_split": null
+}
\ No newline at end of file
diff --git a/training_data_cleaned/solubility/sol_wt_with_embeddings/val/data-00000-of-00001.arrow b/training_data_cleaned/solubility/sol_wt_with_embeddings/val/data-00000-of-00001.arrow
new file mode 100644
index 0000000000000000000000000000000000000000..46cfffd97a92e24de58e737104dfc836027dc887
--- /dev/null
+++ b/training_data_cleaned/solubility/sol_wt_with_embeddings/val/data-00000-of-00001.arrow
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a4c76cc6d883b1ad797f40d5f25cdbfba5e08989436004b7b8e1ee0cb509c079
+size 19471728
diff --git a/training_data_cleaned/solubility/sol_wt_with_embeddings/val/dataset_info.json b/training_data_cleaned/solubility/sol_wt_with_embeddings/val/dataset_info.json
new file mode 100644
index 0000000000000000000000000000000000000000..3b1be902ba0361c345a756bbe936d9ca5c70958d
--- /dev/null
+++ b/training_data_cleaned/solubility/sol_wt_with_embeddings/val/dataset_info.json
@@ -0,0 +1,23 @@
+{
+  "citation": "",
+  "description": "",
+  "features": {
+    "sequence": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "embedding": {
+      "feature": {
+        "dtype": "float32",
+        "_type": "Value"
+      },
+      "_type": "List"
+    },
+    "label": {
+      "dtype": "int64",
+      "_type": "Value"
+    }
+  },
+  "homepage": "",
+  "license": ""
+}
\ No newline at end of file
diff --git a/training_data_cleaned/solubility/sol_wt_with_embeddings/val/state.json b/training_data_cleaned/solubility/sol_wt_with_embeddings/val/state.json
new file mode 100644
index 0000000000000000000000000000000000000000..ef620b13300b103e96e8fb7d345cc70ebca06303
--- /dev/null
+++ b/training_data_cleaned/solubility/sol_wt_with_embeddings/val/state.json
@@ -0,0 +1,13 @@
+{
+  "_data_files": [
+    {
+      "filename": "data-00000-of-00001.arrow"
+    }
+  ],
+  "_fingerprint": "499986169da9afde",
+  "_format_columns": null,
+  "_format_kwargs": {},
+  "_format_type": null,
+  "_output_all_columns": false,
+  "_split": null
+}
\ No newline at end of file