Update project structure and remove unused notebooks

- Added `*/embeddings` to .gitignore to exclude embedding directories.
- Updated the Git LFS pointer for `EDA_Psort.ipynb` (new object hash and file size).
- Deleted obsolete notebooks: `ESMC_600m.ipynb` and `ProstT5.ipynb`.
- Renamed `ESMC_300m.ipynb` to `Get_embeddings.ipynb` and updated it to streamline embedding processes.
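- Enhanced `my_utils.py`: the error handler in `esm_embed` now also catches `RuntimeError`, and `#type: ignore` comments were added to several plotting and scikit-learn calls.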

Files changed (6) hide show

.gitignore +2 -1
notebooks/EDA_Psort.ipynb +2 -2
notebooks/ESMC_600m.ipynb +0 -3
notebooks/{ESMC_300m.ipynb → Get_embeddings.ipynb} +2 -2
notebooks/ProstT5.ipynb +0 -3
src/my_utils.py +8 -8

.gitignore CHANGED Viewed

@@ -3,4 +3,5 @@
 __pycache__/
 *.pyc
 /home/juan/ProteinLocationPredictor/notebooks/__pycache__
-*.txt

 __pycache__/
 *.pyc
 /home/juan/ProteinLocationPredictor/notebooks/__pycache__
+*.txt
+*/embeddings

notebooks/EDA_Psort.ipynb CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ef532d89c742e3dad6e1aeea89215d4e1910ec825030b7c385594e59418998f1
-size 15116012

 version https://git-lfs.github.com/spec/v1
+oid sha256:f3676927b7ed8eeef1a881f840243f931c72e429e8c5af70db1cbc4b8d82e900
+size 15130990

notebooks/ESMC_600m.ipynb DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:06608effffc76bbb3ca65c67a84f06762d459b56a823bff9e61695cab83bb10c
-size 10350

notebooks/{ESMC_300m.ipynb → Get_embeddings.ipynb} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d7bcc1ec16a5d8992cfdb6ca4d61d8c69cad64b683a697f6622e0c1f0d921076
-size 13125

 version https://git-lfs.github.com/spec/v1
+oid sha256:0e07c094294b597ad35d3ab0bd89cf3c5708a68cc09ef7ad66f9ca77490e9461
+size 15520

notebooks/ProstT5.ipynb DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:1ccbc986f147709315a8cb774f4c8523f9da34af321c111781e15e2d8c30c5f1
-size 56006

src/my_utils.py CHANGED Viewed

@@ -91,7 +91,7 @@ def confusion(title : str, y_true: np.ndarray, y_pred: np.ndarray) -> None:
     class_names = np.unique(y_true)
     plt.figure(figsize=(6, 4))
     sns.heatmap(cm, annot=True, fmt='.2f', cmap='Blues',
-                xticklabels=class_names, yticklabels=class_names)
     plt.xlabel('Predicted Label')
     plt.ylabel('True Label')
@@ -149,7 +149,7 @@ def plot_umap(x: list[np.ndarray], y: list[str], title: str, org: list[str]) ->
 def plot_PCA(X: np.ndarray, labels: list[str], title: str, org : list[str], scale: bool) -> None:
-    X_array = np.vstack(X)
     pca = PCA(n_components=2, random_state=42)
     if scale:
@@ -271,7 +271,7 @@ def train_svm(title : str, X: np.ndarray, y: np.ndarray, params:dict) -> tuple[P
     confusion(title = title,
             y_true = y_test,
-            y_pred = y_pred)
     print(classification_report(y_test, y_pred, zero_division=0))
@@ -283,14 +283,14 @@ def randomSVM(X: np.ndarray, y = np.ndarray) -> dict:
     X_train, _, y_train, _ = train_test_split(X,
                                               y,
                                               test_size=0.33,
-                                              stratify=y,
-                                              random_state=42)
     X_sample, y_sample = resample(X_train,
                                    y_train,
                                      n_samples = 3500,
                                        stratify = y_train,
-                                         random_state = 42)
     pipeline = Pipeline([('scaler', StandardScaler()),
                          ('svm', svm.SVC())])
@@ -334,7 +334,7 @@ def randomSearch(X: np.ndarray, y: np.ndarray) -> dict:
                                   y_train,
                                   n_samples = 3500,
                                   stratify = y_train,
-                                  random_state = 42)
     param_grid = {
         'n_estimators': list(np.arange(500,4000, 400)),
@@ -579,7 +579,7 @@ def esm_embed(model: ESMC,
                 arr_output = arr_output.mean(axis=0)
             return arr_output
-    except ESMProteinError as e:
         messagebox.showerror("Error", f"Error processing {acc}: {e}")
         return

     class_names = np.unique(y_true)
     plt.figure(figsize=(6, 4))
     sns.heatmap(cm, annot=True, fmt='.2f', cmap='Blues',
+                xticklabels=class_names, yticklabels=class_names) #type: ignore
     plt.xlabel('Predicted Label')
     plt.ylabel('True Label')
 def plot_PCA(X: np.ndarray, labels: list[str], title: str, org : list[str], scale: bool) -> None:
+    X_array = np.vstack(X) #type: ignore
     pca = PCA(n_components=2, random_state=42)
     if scale:
     confusion(title = title,
             y_true = y_test,
+            y_pred = y_pred)#type: ignore
     print(classification_report(y_test, y_pred, zero_division=0))
     X_train, _, y_train, _ = train_test_split(X,
                                               y,
                                               test_size=0.33,
+                                              stratify=y,#type: ignore
+                                              random_state=42)
     X_sample, y_sample = resample(X_train,
                                    y_train,
                                      n_samples = 3500,
                                        stratify = y_train,
+                                         random_state = 42) #type: ignore
     pipeline = Pipeline([('scaler', StandardScaler()),
                          ('svm', svm.SVC())])
                                   y_train,
                                   n_samples = 3500,
                                   stratify = y_train,
+                                  random_state = 42) #type: ignore
     param_grid = {
         'n_estimators': list(np.arange(500,4000, 400)),
                 arr_output = arr_output.mean(axis=0)
             return arr_output
+    except (ESMProteinError, RuntimeError) as e:
         messagebox.showerror("Error", f"Error processing {acc}: {e}")
         return