jpuglia committed on
Commit
4df44a7
·
1 Parent(s): 923a7e5

Streamline training of rf and svm models, renaming notebooks, minor changes to my_utils.py and creation of evaluations.csv

Browse files
Data/evaluations.csv ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ model,Accuracy,Recall,Precision,F1
2
+ Prost T5_rf,0.9494152841990753,0.9494152841990753,0.9500906030394936,0.9487261816656973
3
+ Prost T5_svm,0.9597497960293717,0.9597497960293717,0.9595957881278095,0.959225689183014
4
+ ESMC-300m_rf,0.939896654881697,0.939896654881697,0.9410635663803479,0.9399078225424956
5
+ ESMC-300m_svm,0.9621974435681262,0.9621974435681262,0.9622014817178194,0.961806189217868
6
+ ESMC-600m_rf,0.9472395974979603,0.9472395974979603,0.9471989241244075,0.9464063102910955
7
+ ESMC-600m_svm,0.9602937177046506,0.9602937177046506,0.9597863973858514,0.9596645033195284
notebooks/Get_embeddings.ipynb → Models/ESMC-300m_le_rf.joblib RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f4d087d9e61aa44b98adedab8e1a483a1d981137f826da03f14a897617f8ef53
3
- size 10847
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3c3533ba487721d516e3f29f3fb614d78459774cff74121f5d74aeaa4ab2e45f
3
+ size 635
notebooks/hyperparamsRF.ipynb → Models/ESMC-300m_le_svm.joblib RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:be08020829c6e68c1b659bca93f71ede388f4c5d6fba3b7bd4aa85b363806f28
3
- size 101568
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3c3533ba487721d516e3f29f3fb614d78459774cff74121f5d74aeaa4ab2e45f
3
+ size 635
Models/{svmESM600.joblib → ESMC-300m_rf.joblib} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b17cb4232fd3faae3336c91158ee48e56e2bc3605c98db530665a77870f79a8d
3
- size 23689781
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1314aee5d738f8ad952773301e6bcecab06f36a3dc84f3e3bbd62779ebef2b64
3
+ size 18049497
Models/{svmProst.joblib → ESMC-300m_svm.joblib} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3b2c74b8bfff1dff844283d247799a44d98dacfb77277345d9e8065caa752b22
3
- size 17969877
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bd3e89ce2d691e60852c7660e8a52209d93dbe5d4d2fd9723faff902aad34c7d
3
+ size 18294469
Models/{rfProst.joblib → ESMC-600m_le_rf.joblib} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:97b7afaa64ab3742b04fff5dbf0f48cf1a521e2714941b096a085071839cc944
3
- size 47713945
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3c3533ba487721d516e3f29f3fb614d78459774cff74121f5d74aeaa4ab2e45f
3
+ size 635
Models/{svm300.joblib → ESMC-600m_le_svm.joblib} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f6ae921216cd0b305c81411b8ea9df7bfc0f5c50f105f1feefed48737909349c
3
- size 17875413
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3c3533ba487721d516e3f29f3fb614d78459774cff74121f5d74aeaa4ab2e45f
3
+ size 635
Models/ESMC-600m_rf.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:89a1bbdfe47decc8acdb934f7c79297b9cd842e63d5567c39d058ce8ef4ebfb0
3
+ size 9024153
Models/ESMC-600m_svm.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a4e4d8f22fac3eecca31048d054bbd097219d2e345b05b5dbc5c98ad9d259e40
3
+ size 22787493
Models/Prost T5_le_rf.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3c3533ba487721d516e3f29f3fb614d78459774cff74121f5d74aeaa4ab2e45f
3
+ size 635
Models/Prost T5_le_svm.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3c3533ba487721d516e3f29f3fb614d78459774cff74121f5d74aeaa4ab2e45f
3
+ size 635
Models/Prost T5_rf.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4fa2d7e7bb0d6000314f955b77a27ffa5fdf9fb8492afc4714e952dab1d722cf
3
+ size 4842553
Models/Prost T5_svm.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:184f44563c41fc75ab38769f13e1d95488b1bee2767bb9b8cd80a62cd8e826a6
3
+ size 18267605
Models/esm_300m_le_rf.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3c3533ba487721d516e3f29f3fb614d78459774cff74121f5d74aeaa4ab2e45f
3
+ size 635
Models/esm_300m_le_svm.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3c3533ba487721d516e3f29f3fb614d78459774cff74121f5d74aeaa4ab2e45f
3
+ size 635
Models/esm_300m_rf.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:35685779464c1a40f0ba3f56437387caf4a7786368421239d9b97b496a42b49d
3
+ size 4513481
Models/esm_300m_svm.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0e9c14f7e1ae5eca0d17a36b92457cdf9ec59d3f364cd49f308c05a972b659cb
3
+ size 18294469
Models/esm_600m_le_rf.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3c3533ba487721d516e3f29f3fb614d78459774cff74121f5d74aeaa4ab2e45f
3
+ size 635
Models/esm_600m_le_svm.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3c3533ba487721d516e3f29f3fb614d78459774cff74121f5d74aeaa4ab2e45f
3
+ size 635
Models/esm_600m_rf.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9487c6ba0d19aba648ba17bbeaf8aedb9b8983b0b8ec43df7c8f291a05028be2
3
+ size 15835961
Models/esm_600m_svm.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fee69d1c43f36c0bbb19200159cbaffee3ee8aa53c68a2b00c5c11b96985499d
3
+ size 22787493
Models/prost_le_rf.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3c3533ba487721d516e3f29f3fb614d78459774cff74121f5d74aeaa4ab2e45f
3
+ size 635
Models/prost_le_svm.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3c3533ba487721d516e3f29f3fb614d78459774cff74121f5d74aeaa4ab2e45f
3
+ size 635
Models/prost_rf.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2ac62a197618dc89bce7e8c4979bd2c3b922772359cc7edb8a4ee06207d92e6b
3
+ size 9177529
Models/prost_svm.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ba95ca02d8b3af55683bfd1aa2e89334f0b103e740ecabdb55bb32874dcce95d
3
+ size 18267605
Models/rfESM300.joblib DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:4945e8970e2daa8a08ee1299d5037cf1bf26c994799a06cd44d4f0db6261d8c2
3
- size 128239785
 
 
 
 
Models/rfESM600.joblib DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:dd53311c8f33278c82be9867184e7a72a9328039e237cc46bd8c238a2d106db7
3
- size 125433513
 
 
 
 
notebooks/{EDA_Psort.ipynb → 01_EDA_Psort.ipynb} RENAMED
File without changes
notebooks/02_Get_embeddings.ipynb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2f052aee04179938913521a555b8224712135ec3671362d864b48721d005cc94
3
+ size 10859
notebooks/{EmbAnalisis.ipynb → 03_EmbAnalisis.ipynb} RENAMED
File without changes
notebooks/04_Training.ipynb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:17540ccc0a1db8bed0f6e4959369941aeae7bcadf80ec881075196f51c053d0c
3
+ size 580320
src/my_utils.py CHANGED
@@ -13,7 +13,7 @@ import numpy as np
13
 
14
  from sklearn.ensemble import RandomForestClassifier
15
  from sklearn import svm
16
- from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
17
  from sklearn.metrics import (
18
  classification_report,
19
  accuracy_score,
@@ -27,7 +27,6 @@ from sklearn.preprocessing import StandardScaler, LabelEncoder
27
  from sklearn.pipeline import Pipeline
28
  from sklearn.manifold import TSNE
29
  from sklearn.model_selection import train_test_split
30
- from sklearn.utils import resample
31
  from sklearn.base import BaseEstimator
32
 
33
  import umap
@@ -50,8 +49,6 @@ from joblib import load
50
 
51
  import torch
52
 
53
-
54
-
55
  # Load one chunk of embeddings
56
  def load_emb(path: str, acc: list[str]) -> np.ndarray:
57
 
@@ -357,7 +354,7 @@ def train_svm(title: str, x: np.ndarray, y: np.ndarray, params: dict) -> tuple[P
357
  svc_params = {k.replace('svm__', ''): v for k, v in params.items() if k.startswith('svm__')}
358
  pipeline = Pipeline([
359
  ('scaler', StandardScaler()),
360
- ('svm', svm.SVC(**svc_params, probability = True))
361
  ])
362
 
363
  pipeline.fit(x_train, y_train)
@@ -370,7 +367,7 @@ def train_svm(title: str, x: np.ndarray, y: np.ndarray, params: dict) -> tuple[P
370
  y_test_str = le.inverse_transform(y_test)
371
 
372
  confusion(title=title, y_true=y_test_str, y_pred=y_pred_str)
373
-
374
 
375
  print(classification_report(y_test, y_pred, zero_division=0, target_names = le.classes_))
376
 
@@ -396,9 +393,9 @@ def randomSVM(x: np.ndarray, y: np.ndarray) -> dict:
396
  'svm__gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1, 10],
397
  'svm__shrinking': [True, False],
398
  'svm__class_weight': ['balanced'],
 
399
  'svm__tol': [1e-5, 1e-4, 1e-3, 1e-2],
400
- 'svm__max_iter': [-1, 1000, 5000, 10000],
401
- 'svm__probability': [False, True],
402
  'svm__decision_function_shape': ['ovr', 'ovo'],
403
  'svm__cache_size': [200, 400, 600]
404
  }
@@ -406,22 +403,21 @@ def randomSVM(x: np.ndarray, y: np.ndarray) -> dict:
406
  random_search = RandomizedSearchCV(
407
  estimator=pipeline,
408
  param_distributions=param_distributions,
409
- n_iter=50,
410
  scoring='f1_weighted',
411
  cv=3,
412
- verbose=1,
413
  random_state=42,
414
  n_jobs=-1
415
  )
416
 
417
  random_search.fit(x_train, y_train)
418
- random_search.best_params_['svm__probability'] = True
419
 
420
  pprint(random_search.best_params_)
421
 
422
  return random_search.best_params_
423
 
424
- def randomSearch(x: np.ndarray, y: np.ndarray) -> dict:
425
 
426
  le = LabelEncoder()
427
  y_encoded = le.fit_transform(y)
@@ -430,7 +426,7 @@ def randomSearch(x: np.ndarray, y: np.ndarray) -> dict:
430
  classifier : RandomForestClassifier = RandomForestClassifier(random_state=42)
431
 
432
  param_grid = {
433
- 'n_estimators': list(np.arange(500,4000, 400)),
434
  'max_depth': [None, 10, 20, 30, 40, 50],
435
  'min_samples_split': [2, 5, 10, 15, 20],
436
  'min_samples_leaf': [1, 2, 4, 8, 10],
@@ -443,10 +439,10 @@ def randomSearch(x: np.ndarray, y: np.ndarray) -> dict:
443
 
444
  rf_random = RandomizedSearchCV(estimator = classifier,
445
  param_distributions = param_grid,
446
- n_iter= 50,
447
  scoring = 'f1_weighted',
448
  cv = 3,
449
- verbose = 1,
450
  n_jobs = -1)
451
 
452
  rf_random.fit(X = x_train, y = y_train)
@@ -456,31 +452,6 @@ def randomSearch(x: np.ndarray, y: np.ndarray) -> dict:
456
 
457
  return rf_random.best_params_
458
 
459
- def gridSearch(X: np.ndarray, y: np.ndarray, grid: dict):
460
-
461
- X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, stratify=y, random_state=42)
462
-
463
- # Initialize GridSearchCV with the base model and hyperparameters
464
- grid_search: GridSearchCV = GridSearchCV(
465
- estimator=RandomForestClassifier(random_state=42),
466
- param_grid=grid,
467
- cv=1,
468
- scoring = 'f1_weighted',
469
- verbose = 1,
470
- pre_dispatch = 5,
471
- n_jobs=-1
472
- )
473
-
474
- grid_search.fit(X = X_train, y = y_train)
475
-
476
- print('Best Estimator')
477
- pprint(grid_search.best_estimator_)
478
-
479
- evaluation = evaluate(grid_search, X_test, y_test)
480
-
481
- return grid_search, evaluation
482
-
483
-
484
  def fetch_uniprot_sequence(uniprot_id: str):
485
 
486
  """
 
13
 
14
  from sklearn.ensemble import RandomForestClassifier
15
  from sklearn import svm
16
+ from sklearn.model_selection import RandomizedSearchCV
17
  from sklearn.metrics import (
18
  classification_report,
19
  accuracy_score,
 
27
  from sklearn.pipeline import Pipeline
28
  from sklearn.manifold import TSNE
29
  from sklearn.model_selection import train_test_split
 
30
  from sklearn.base import BaseEstimator
31
 
32
  import umap
 
49
 
50
  import torch
51
 
 
 
52
  # Load one chunk of embeddings
53
  def load_emb(path: str, acc: list[str]) -> np.ndarray:
54
 
 
354
  svc_params = {k.replace('svm__', ''): v for k, v in params.items() if k.startswith('svm__')}
355
  pipeline = Pipeline([
356
  ('scaler', StandardScaler()),
357
+ ('svm', svm.SVC(**svc_params))
358
  ])
359
 
360
  pipeline.fit(x_train, y_train)
 
367
  y_test_str = le.inverse_transform(y_test)
368
 
369
  confusion(title=title, y_true=y_test_str, y_pred=y_pred_str)
370
+
371
 
372
  print(classification_report(y_test, y_pred, zero_division=0, target_names = le.classes_))
373
 
 
393
  'svm__gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1, 10],
394
  'svm__shrinking': [True, False],
395
  'svm__class_weight': ['balanced'],
396
+ 'svm__probability' : [True],
397
  'svm__tol': [1e-5, 1e-4, 1e-3, 1e-2],
398
+ 'svm__max_iter': [-1, 5000, 7500, 10000],
 
399
  'svm__decision_function_shape': ['ovr', 'ovo'],
400
  'svm__cache_size': [200, 400, 600]
401
  }
 
403
  random_search = RandomizedSearchCV(
404
  estimator=pipeline,
405
  param_distributions=param_distributions,
406
+ n_iter=10,
407
  scoring='f1_weighted',
408
  cv=3,
409
+ verbose=2,
410
  random_state=42,
411
  n_jobs=-1
412
  )
413
 
414
  random_search.fit(x_train, y_train)
 
415
 
416
  pprint(random_search.best_params_)
417
 
418
  return random_search.best_params_
419
 
420
+ def randomSearch(x: np.ndarray, y: np.ndarray) -> dict: #type: ignore
421
 
422
  le = LabelEncoder()
423
  y_encoded = le.fit_transform(y)
 
426
  classifier : RandomForestClassifier = RandomForestClassifier(random_state=42)
427
 
428
  param_grid = {
429
+ 'n_estimators': [100, 200, 300, 400, 500],
430
  'max_depth': [None, 10, 20, 30, 40, 50],
431
  'min_samples_split': [2, 5, 10, 15, 20],
432
  'min_samples_leaf': [1, 2, 4, 8, 10],
 
439
 
440
  rf_random = RandomizedSearchCV(estimator = classifier,
441
  param_distributions = param_grid,
442
+ n_iter= 10,
443
  scoring = 'f1_weighted',
444
  cv = 3,
445
+ verbose = 2,
446
  n_jobs = -1)
447
 
448
  rf_random.fit(X = x_train, y = y_train)
 
452
 
453
  return rf_random.best_params_
454
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
455
  def fetch_uniprot_sequence(uniprot_id: str):
456
 
457
  """