jpuglia committed on
Commit
fe2d945
·
1 Parent(s): 572d8c1

Refactor code structure for improved readability and maintainability

Browse files
Classification_Reports/ESMC-300m - Random Forest_classification_report.csv CHANGED
@@ -1,5 +1,5 @@
1
  ,Cellwall,Cytoplasmic,CytoplasmicMembrane,Extracellular,OuterMembrane,Periplasmic,accuracy,macro avg,weighted avg
2
- precision,0.6875,0.9736842105263158,0.9544658493870403,0.823321554770318,0.9058823529411765,0.7301587301587301,0.9409844982322546,0.8458354496305969,0.9431482190855852
3
- recall,0.7096774193548387,0.9907949790794979,0.8384615384615385,0.8859315589353612,0.8415300546448088,0.8625,0.9409844982322546,0.8548159250793409,0.9409844982322546
4
- f1-score,0.6984126984126984,0.9821650767316467,0.8927108927108927,0.8534798534798534,0.8725212464589235,0.7908309455587392,0.9409844982322546,0.8483534522254591,0.9409727898172672
5
- support,31.0,2390.0,650.0,263.0,183.0,160.0,0.9409844982322546,3677.0,3677.0
 
1
  ,Cellwall,Cytoplasmic,CytoplasmicMembrane,Extracellular,OuterMembrane,Periplasmic,accuracy,macro avg,weighted avg
2
+ precision,0.7692307692307693,0.9547456948338006,0.9548611111111112,0.8941176470588236,0.9316770186335404,0.8024691358024691,0.9415284199075333,0.8845168961117523,0.9410913744998538
3
+ recall,0.6451612903225806,0.9974895397489539,0.8461538461538461,0.8669201520912547,0.819672131147541,0.8125,0.9415284199075333,0.8313161599106961,0.9415284199075333
4
+ f1-score,0.7017543859649122,0.9756496828320033,0.8972267536704731,0.8803088803088803,0.872093023255814,0.8074534161490683,0.9415284199075333,0.8557476903635252,0.9401852932227653
5
+ support,31.0,2390.0,650.0,263.0,183.0,160.0,0.9415284199075333,3677.0,3677.0
Classification_Reports/ESMC-600m - Random Forest_classification_report.csv CHANGED
@@ -1,5 +1,5 @@
1
  ,Cellwall,Cytoplasmic,CytoplasmicMembrane,Extracellular,OuterMembrane,Periplasmic,accuracy,macro avg,weighted avg
2
- precision,0.7307692307692307,0.9733388022969647,0.975736568457539,0.8404255319148937,0.92,0.7821229050279329,0.9499592058743541,0.8703988397444268,0.9512357717810928
3
- recall,0.6129032258064516,0.992887029288703,0.8661538461538462,0.9011406844106464,0.8797814207650273,0.875,0.9499592058743541,0.8546443677374458,0.9499592058743541
4
- f1-score,0.6666666666666666,0.9830157415078707,0.9176854115729421,0.8697247706422019,0.8994413407821229,0.8259587020648967,0.9499592058743541,0.8604154388727835,0.9497031761667939
5
- support,31.0,2390.0,650.0,263.0,183.0,160.0,0.9499592058743541,3677.0,3677.0
 
1
  ,Cellwall,Cytoplasmic,CytoplasmicMembrane,Extracellular,OuterMembrane,Periplasmic,accuracy,macro avg,weighted avg
2
+ precision,0.8095238095238095,0.9662189662189662,0.9689119170984456,0.8368794326241135,0.9629629629629629,0.7897727272727273,0.9472395974979603,0.8890449692835043,0.9482829651451068
3
+ recall,0.5483870967741935,0.9933054393305439,0.8630769230769231,0.8973384030418251,0.8524590163934426,0.86875,0.9472395974979603,0.8372194797694882,0.9472395974979603
4
+ f1-score,0.6538461538461539,0.9795749948421704,0.9129373474369405,0.8660550458715597,0.9043478260869565,0.8273809523809523,0.9472395974979603,0.8573570534107889,0.9465629115842034
5
+ support,31.0,2390.0,650.0,263.0,183.0,160.0,0.9472395974979603,3677.0,3677.0
Classification_Reports/Prost T5 - Random Forest_classification_report.csv CHANGED
@@ -1,5 +1,5 @@
1
  ,Cellwall,Cytoplasmic,CytoplasmicMembrane,Extracellular,OuterMembrane,Periplasmic,accuracy,macro avg,weighted avg
2
- precision,0.8181818181818182,0.9783783783783784,0.9553429027113237,0.7916666666666666,0.9523809523809523,0.8391608391608392,0.9510470492249116,0.8891852595799964,0.9522492872817795
3
- recall,0.5806451612903226,0.9845188284518829,0.9215384615384615,0.9391634980988594,0.8743169398907104,0.75,0.9510470492249116,0.841697148211706,0.9510470492249116
4
- f1-score,0.6792452830188679,0.9814389989572472,0.9381362568519969,0.8591304347826086,0.9116809116809117,0.7920792079207921,0.9510470492249116,0.8602851822020706,0.9507767100048853
5
- support,31.0,2390.0,650.0,263.0,183.0,160.0,0.9510470492249116,3677.0,3677.0
 
1
  ,Cellwall,Cytoplasmic,CytoplasmicMembrane,Extracellular,OuterMembrane,Periplasmic,accuracy,macro avg,weighted avg
2
+ precision,0.6428571428571429,0.9832285115303984,0.9485530546623794,0.7954545454545454,0.9578313253012049,0.7619047619047619,0.9477835191732391,0.8483048902850722,0.9499038997569209
3
+ recall,0.5806451612903226,0.9811715481171548,0.9076923076923077,0.9315589353612167,0.8688524590163934,0.8,0.9477835191732391,0.8449867352462325,0.9477835191732391
4
+ f1-score,0.6101694915254238,0.9821989528795811,0.9276729559748428,0.8581436077057794,0.9111747851002865,0.7804878048780488,0.9477835191732391,0.8449745996773269,0.9482385032046764
5
+ support,31.0,2390.0,650.0,263.0,183.0,160.0,0.9477835191732391,3677.0,3677.0
Classification_Reports/results_ESM300_RF.csv DELETED
@@ -1,10 +0,0 @@
1
- ,precision,recall,f1-score,support
2
- Cellwall,0.8842105263157894,0.8842105263157894,0.8842105263157894,95.0
3
- Cytoplasmic,0.9852189528940462,0.9852189528940462,0.9852189528940462,7239.0
4
- CytoplasmicMembrane,0.9573234984193888,0.9228034535297105,0.9397465735712438,1969.0
5
- Extracellular,0.9062870699881376,0.958594730238394,0.9317073170731708,797.0
6
- OuterMembrane,0.9796296296296296,0.9531531531531532,0.9662100456621004,555.0
7
- Periplasmic,0.878095238095238,0.9505154639175257,0.9128712871287129,485.0
8
- accuracy,0.9683123877917414,0.9683123877917414,0.9683123877917414,0.9683123877917414
9
- macro avg,0.9317941525570382,0.9424160466747699,0.9366607837741773,11140.0
10
- weighted avg,0.9688376479433536,0.9683123877917414,0.9683950524837511,11140.0
 
 
 
 
 
 
 
 
 
 
 
Classification_Reports/results_ESM300_SVM.csv DELETED
@@ -1,10 +0,0 @@
1
- ,precision,recall,f1-score,support
2
- Cellwall,0.9540229885057471,0.8736842105263158,0.9120879120879121,95.0
3
- Cytoplasmic,0.9931299807639461,0.9984804531012571,0.995798029895984,7239.0
4
- CytoplasmicMembrane,0.990726429675425,0.9766378872524124,0.9836317135549872,1969.0
5
- Extracellular,0.9672955974842767,0.9648682559598495,0.9660804020100503,797.0
6
- OuterMembrane,0.9815157116451017,0.9567567567567568,0.968978102189781,555.0
7
- Periplasmic,0.9357429718875502,0.9608247422680413,0.948118006103764,485.0
8
- accuracy,0.9874326750448833,0.9874326750448833,0.9874326750448833,0.9874326750448833
9
- macro avg,0.9704056133270078,0.9552087176441054,0.962449027640413,11140.0
10
- weighted avg,0.9874462843099304,0.9874326750448833,0.9873956278395704,11140.0
 
 
 
 
 
 
 
 
 
 
 
Classification_Reports/results_ESM600_RF.csv DELETED
@@ -1,10 +0,0 @@
1
- ,precision,recall,f1-score,support
2
- Cellwall,0.8913043478260869,0.8631578947368421,0.8770053475935828,95.0
3
- Cytoplasmic,0.9903846153846154,0.995993921812405,0.9931813485777258,7239.0
4
- CytoplasmicMembrane,0.9894902785076195,0.9563230066023362,0.9726239669421488,1969.0
5
- Extracellular,0.9381067961165048,0.9698870765370138,0.9537322640345466,797.0
6
- OuterMembrane,0.9744058500914077,0.9603603603603603,0.9673321234119783,555.0
7
- Periplasmic,0.9331983805668016,0.9505154639175257,0.9417773237997957,485.0
8
- accuracy,0.9822262118491921,0.9822262118491921,0.9822262118491921,0.9822262118491921
9
- macro avg,0.9528150447488395,0.9493729539944139,0.9509420623932964,11140.0
10
- weighted avg,0.9823556624842636,0.9822262118491921,0.9822089610643375,11140.0
 
 
 
 
 
 
 
 
 
 
 
Classification_Reports/results_ESM600_SVM.csv DELETED
@@ -1,10 +0,0 @@
1
- ,precision,recall,f1-score,support
2
- Cellwall,0.9647058823529412,0.8631578947368421,0.9111111111111111,95.0
3
- Cytoplasmic,0.9927207801126219,0.9984804531012571,0.9955922865013774,7239.0
4
- CytoplasmicMembrane,0.986659825551565,0.9766378872524124,0.9816232771822359,1969.0
5
- Extracellular,0.9636135508155583,0.9636135508155583,0.9636135508155583,797.0
6
- OuterMembrane,0.9833948339483395,0.9603603603603603,0.9717411121239745,555.0
7
- Periplasmic,0.9465020576131687,0.9484536082474226,0.9474768280123584,485.0
8
- accuracy,0.9868940754039497,0.9868940754039497,0.9868940754039497,0.9868940754039497
9
- macro avg,0.9729328217323658,0.9517839590856422,0.9618596942911025,11140.0
10
- weighted avg,0.986851311791162,0.9868940754039497,0.9868318607832719,11140.0
 
 
 
 
 
 
 
 
 
 
 
Classification_Reports/results_PROST_RF.csv DELETED
@@ -1,10 +0,0 @@
1
- ,precision,recall,f1-score,support
2
- Cellwall,0.9529411764705882,0.8526315789473684,0.9,95.0
3
- Cytoplasmic,0.9911966987620358,0.9954413593037712,0.9933144944517196,7239.0
4
- CytoplasmicMembrane,0.9845360824742269,0.9700355510411376,0.9772320286518291,1969.0
5
- Extracellular,0.9207100591715977,0.9761606022584692,0.9476248477466505,797.0
6
- OuterMembrane,0.9887850467289719,0.9531531531531532,0.9706422018348624,555.0
7
- Periplasmic,0.9505376344086022,0.911340206185567,0.9305263157894736,485.0
8
- accuracy,0.9825852782764811,0.9825852782764811,0.9825852782764811,0.9825852782764811
9
- macro avg,0.9647844496693371,0.9431270751482446,0.9532233147457557,11140.0
10
- weighted avg,0.9827599848543402,0.9825852782764811,0.9825441812012363,11140.0
 
 
 
 
 
 
 
 
 
 
 
Classification_Reports/results_PROST_SVM.csv DELETED
@@ -1,10 +0,0 @@
1
- ,precision,recall,f1-score,support
2
- Cellwall,0.9761904761904762,0.8631578947368421,0.9162011173184358,95.0
3
- Cytoplasmic,0.9921638713225186,0.9969609062025142,0.9945566044236201,7239.0
4
- CytoplasmicMembrane,0.9886889460154241,0.9766378872524124,0.9826264690853347,1969.0
5
- Extracellular,0.9616336633663366,0.9749058971141782,0.9682242990654205,797.0
6
- OuterMembrane,0.9834862385321101,0.9657657657657658,0.9745454545454545,555.0
7
- Periplasmic,0.9442148760330579,0.9422680412371134,0.9432404540763674,485.0
8
- accuracy,0.9867145421903052,0.9867145421903052,0.9867145421903052,0.9867145421903052
9
- macro avg,0.9743963452433206,0.9532827320514711,0.9632323997524388,11140.0
10
- weighted avg,0.9867093358537257,0.9867145421903052,0.9866647214588659,11140.0
 
 
 
 
 
 
 
 
 
 
 
Data/evaluations.csv CHANGED
@@ -1,7 +1,7 @@
1
- model,Accuracy,Recall,Precision,F1
2
- Prost T5_rf,0.9510470492249116,0.9510470492249116,0.9522492872817795,0.9507767100048853
3
- Prost T5_svm,0.9597497960293717,0.9597497960293717,0.9595957881278095,0.959225689183014
4
- ESMC-300m_rf,0.9409844982322546,0.9409844982322546,0.9431482190855852,0.9409727898172672
5
- ESMC-300m_svm,0.9621974435681262,0.9621974435681262,0.9622014817178194,0.961806189217868
6
- ESMC-600m_rf,0.9499592058743541,0.9499592058743541,0.9512357717810928,0.9497031761667939
7
- ESMC-600m_svm,0.9602937177046506,0.9602937177046506,0.9597863973858514,0.9596645033195284
 
1
+ model,Recall_weighted,Precision_weighted,F1_weighted,Recall_micro,Precision_micro,F1_micro,Recall_macro,Precision_macro,F1_macro
2
+ Prost T5_rf,0.9477835191732391,0.9499038997569209,0.9482385032046764,0.8449867352462325,0.8483048902850722,0.8449745996773269,0.8449867352462325,0.8483048902850722,0.8449745996773269
3
+ Prost T5_svm,0.9597497960293717,0.9595957881278095,0.959225689183014,0.8576333576020237,0.917729657853231,0.8816910350147221,0.8576333576020237,0.917729657853231,0.8816910350147221
4
+ ESMC-300m_rf,0.9415284199075333,0.9410913744998538,0.9401852932227653,0.8313161599106961,0.8845168961117523,0.8557476903635252,0.8313161599106961,0.8845168961117523,0.8557476903635252
5
+ ESMC-300m_svm,0.9621974435681262,0.9622014817178194,0.961806189217868,0.8635980792180346,0.9057293449016118,0.8814818006164681,0.8635980792180346,0.9057293449016118,0.8814818006164681
6
+ ESMC-600m_rf,0.9472395974979603,0.9482829651451068,0.9465629115842034,0.8372194797694882,0.8890449692835043,0.8573570534107889,0.8372194797694882,0.8890449692835043,0.8573570534107889
7
+ ESMC-600m_svm,0.9602937177046506,0.9597863973858514,0.9596645033195284,0.8530897801883417,0.9115637055531502,0.8776369716697019,0.8530897801883417,0.9115637055531502,0.8776369716697019
Models/ESMC-300m_rf.joblib CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0395f0e953b36fdb2ab140c5dd1b957759a9932a00037a76bfd26150ffdedac1
3
- size 10191625
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6f753d32b23ac48f1f3f3084ae127c7cdeae04bc8965ab5c182bd97f6b5f0446
3
+ size 7672329
Models/ESMC-300m_svm.joblib CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2002c8a06e4fcf8ead2098ac214eb78327730bf9095e32e324245a9b9a3fdbc6
3
  size 18294469
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b5270c6c0a57613aeb214822256ac586f636432aac662d592148af0ec0519ae5
3
  size 18294469
Models/ESMC-600m_rf.joblib CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6c6f14ba2ad661fa7878a69b107280df0b82e762bc47215496d22f578974f606
3
- size 18049497
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3d378d8fd0fa931f5e99d8ac3359911974b1801481eabefca5d616a69004a29f
3
+ size 4371481
Models/ESMC-600m_svm.joblib CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:20b804e6ef0c58a22af1f5c68e10221f8da4d1745e88404df9fe8e3048674ae6
3
  size 22787493
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f603530e3336b61445344f497dabcb7e2c0c115b2760f98d614e556f13d6eaad
3
  size 22787493
Models/Prost T5_rf.joblib CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e4ed73692393a2a87059379e3f4d07aa54d3aed5659394a79419f9b6d9837179
3
- size 10359641
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:503b4c0ffcc0e89abb9c319ae0b7c2e983490676dd3cd14f74a17de37357f131
3
+ size 22161465
Models/Prost T5_svm.joblib CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4a3000d817cbf6466de239488ea378b6438d15fba03e7e102e0de5508351a54d
3
  size 18267605
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4aff572ed367bbfce4ed874058a06a987cdf5c0c629ab98bde1af651d97765ee
3
  size 18267605
Plots/ESMC-300m - Random Forest_confusion_matrix.png ADDED
Plots/ESMC-300m - Support Vector Machine_confusion_matrix.png ADDED
Plots/ESMC-600m - Random Forest_confusion_matrix.png ADDED
Plots/ESMC-600m - Support Vector Machine_confusion_matrix.png ADDED
Plots/Prost T5 - Random Forest_confusion_matrix.png ADDED
Plots/Prost T5 - Support Vector Machine_confusion_matrix.png ADDED
notebooks/04_Training.ipynb CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b903593cd4834f92401fa1423f8d5fd103e9c8cc961554e8f9cd883c4405b0aa
3
- size 8532
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:79f5956b989bbe058168dec81b42ee1d48a05ba0accace96af4bd6e8c86d7f63
3
+ size 728565
src/my_utils.py CHANGED
@@ -261,10 +261,18 @@ def evaluate(model: BaseEstimator,
261
  result = {}
262
  y_pred = model.predict(x_test) # type: ignore
263
 
264
- result['Accuracy'] = accuracy_score(y_test, y_pred)
265
- result['Recall'] = recall_score(y_test, y_pred, average = 'weighted')
266
- result['Precision'] = precision_score(y_test, y_pred, average='weighted')
267
- result['F1'] = f1_score(y_test, y_pred, average='weighted')
 
 
 
 
 
 
 
 
268
 
269
  pprint(result)
270
  return result
@@ -324,7 +332,9 @@ def train_rf(title: str,
324
  y_pred_str = le.inverse_transform(y_pred)
325
  y_test_str = le.inverse_transform(y_test)
326
 
327
- confusion(title=title, y_true=y_test_str, y_pred=y_pred_str)
 
 
328
 
329
  return classifier, evaluation, le
330
 
@@ -371,7 +381,8 @@ def train_svm(title: str, x: np.ndarray, y: np.ndarray, params: dict) -> tuple[P
371
  y_pred_str = le.inverse_transform(y_pred)
372
  y_test_str = le.inverse_transform(y_test)
373
 
374
- confusion(title=title, y_true=y_test_str, y_pred=y_pred_str)
 
375
 
376
 
377
  classification = classification_report(y_test,
 
261
  result = {}
262
  y_pred = model.predict(x_test) # type: ignore
263
 
264
+
265
+ result['Recall_weighted'] = recall_score(y_test, y_pred, average = 'weighted')
266
+ result['Precision_weighted'] = precision_score(y_test, y_pred, average='weighted')
267
+ result['F1_weighted'] = f1_score(y_test, y_pred, average='weighted')
268
+
269
+ result['Recall_micro'] = recall_score(y_test, y_pred, average = 'macro')
270
+ result['Precision_micro'] = precision_score(y_test, y_pred, average='macro')
271
+ result['F1_micro'] = f1_score(y_test, y_pred, average='macro')
272
+
273
+ result['Recall_macro'] = recall_score(y_test, y_pred, average = 'macro')
274
+ result['Precision_macro'] = precision_score(y_test, y_pred, average='macro')
275
+ result['F1_macro'] = f1_score(y_test, y_pred, average='macro')
276
 
277
  pprint(result)
278
  return result
 
332
  y_pred_str = le.inverse_transform(y_pred)
333
  y_test_str = le.inverse_transform(y_test)
334
 
335
+ fig = confusion(title=title, y_true=y_test_str, y_pred=y_pred_str)
336
+
337
+ fig.savefig(os.path.join(project_root, 'Plots', f'{title}_confusion_matrix.png'))
338
 
339
  return classifier, evaluation, le
340
 
 
381
  y_pred_str = le.inverse_transform(y_pred)
382
  y_test_str = le.inverse_transform(y_test)
383
 
384
+ fig = confusion(title=title, y_true=y_test_str, y_pred=y_pred_str)
385
+ fig.savefig(os.path.join(project_root, 'Plots', f'{title}_confusion_matrix.png'))
386
 
387
 
388
  classification = classification_report(y_test,