jpuglia committed on
Commit
e10ca7d
·
1 Parent(s): 3be0ce1

Update project structure and remove unused notebooks

Browse files

- Added `*/embeddings` to .gitignore to exclude embedding directories.
- Updated the Git LFS pointer for `EDA_Psort.ipynb` (new content hash and file size).
- Deleted obsolete notebooks: `ESMC_300m.ipynb`, `ESMC_600m.ipynb`, and `ProstT5.ipynb`.
- Introduced `Get_embeddings.ipynb` to streamline embedding processes.
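- Enhanced `my_utils.py` with error handling for embedding functions: `esm_embed` now also catches `RuntimeError`, and `# type: ignore` comments were added to several calls.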

.gitignore CHANGED
@@ -3,4 +3,5 @@
3
  __pycache__/
4
  *.pyc
5
  /home/juan/ProteinLocationPredictor/notebooks/__pycache__
6
- *.txt
 
 
3
  __pycache__/
4
  *.pyc
5
  /home/juan/ProteinLocationPredictor/notebooks/__pycache__
6
+ *.txt
7
+ */embeddings
notebooks/EDA_Psort.ipynb CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ef532d89c742e3dad6e1aeea89215d4e1910ec825030b7c385594e59418998f1
3
- size 15116012
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f3676927b7ed8eeef1a881f840243f931c72e429e8c5af70db1cbc4b8d82e900
3
+ size 15130990
notebooks/ESMC_600m.ipynb DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:06608effffc76bbb3ca65c67a84f06762d459b56a823bff9e61695cab83bb10c
3
- size 10350
 
 
 
 
notebooks/{ESMC_300m.ipynb → Get_embeddings.ipynb} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d7bcc1ec16a5d8992cfdb6ca4d61d8c69cad64b683a697f6622e0c1f0d921076
3
- size 13125
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0e07c094294b597ad35d3ab0bd89cf3c5708a68cc09ef7ad66f9ca77490e9461
3
+ size 15520
notebooks/ProstT5.ipynb DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:1ccbc986f147709315a8cb774f4c8523f9da34af321c111781e15e2d8c30c5f1
3
- size 56006
 
 
 
 
src/my_utils.py CHANGED
@@ -91,7 +91,7 @@ def confusion(title : str, y_true: np.ndarray, y_pred: np.ndarray) -> None:
91
  class_names = np.unique(y_true)
92
  plt.figure(figsize=(6, 4))
93
  sns.heatmap(cm, annot=True, fmt='.2f', cmap='Blues',
94
- xticklabels=class_names, yticklabels=class_names)
95
 
96
  plt.xlabel('Predicted Label')
97
  plt.ylabel('True Label')
@@ -149,7 +149,7 @@ def plot_umap(x: list[np.ndarray], y: list[str], title: str, org: list[str]) ->
149
 
150
 
151
  def plot_PCA(X: np.ndarray, labels: list[str], title: str, org : list[str], scale: bool) -> None:
152
- X_array = np.vstack(X)
153
  pca = PCA(n_components=2, random_state=42)
154
 
155
  if scale:
@@ -271,7 +271,7 @@ def train_svm(title : str, X: np.ndarray, y: np.ndarray, params:dict) -> tuple[P
271
 
272
  confusion(title = title,
273
  y_true = y_test,
274
- y_pred = y_pred)
275
 
276
  print(classification_report(y_test, y_pred, zero_division=0))
277
 
@@ -283,14 +283,14 @@ def randomSVM(X: np.ndarray, y = np.ndarray) -> dict:
283
  X_train, _, y_train, _ = train_test_split(X,
284
  y,
285
  test_size=0.33,
286
- stratify=y,
287
- random_state=42)
288
 
289
  X_sample, y_sample = resample(X_train,
290
  y_train,
291
  n_samples = 3500,
292
  stratify = y_train,
293
- random_state = 42)
294
 
295
  pipeline = Pipeline([('scaler', StandardScaler()),
296
  ('svm', svm.SVC())])
@@ -334,7 +334,7 @@ def randomSearch(X: np.ndarray, y: np.ndarray) -> dict:
334
  y_train,
335
  n_samples = 3500,
336
  stratify = y_train,
337
- random_state = 42)
338
 
339
  param_grid = {
340
  'n_estimators': list(np.arange(500,4000, 400)),
@@ -579,7 +579,7 @@ def esm_embed(model: ESMC,
579
  arr_output = arr_output.mean(axis=0)
580
 
581
  return arr_output
582
- except ESMProteinError as e:
583
  messagebox.showerror("Error", f"Error processing {acc}: {e}")
584
  return
585
 
 
91
  class_names = np.unique(y_true)
92
  plt.figure(figsize=(6, 4))
93
  sns.heatmap(cm, annot=True, fmt='.2f', cmap='Blues',
94
+ xticklabels=class_names, yticklabels=class_names) #type: ignore
95
 
96
  plt.xlabel('Predicted Label')
97
  plt.ylabel('True Label')
 
149
 
150
 
151
  def plot_PCA(X: np.ndarray, labels: list[str], title: str, org : list[str], scale: bool) -> None:
152
+ X_array = np.vstack(X) #type: ignore
153
  pca = PCA(n_components=2, random_state=42)
154
 
155
  if scale:
 
271
 
272
  confusion(title = title,
273
  y_true = y_test,
274
+ y_pred = y_pred)#type: ignore
275
 
276
  print(classification_report(y_test, y_pred, zero_division=0))
277
 
 
283
  X_train, _, y_train, _ = train_test_split(X,
284
  y,
285
  test_size=0.33,
286
+ stratify=y,#type: ignore
287
+ random_state=42)
288
 
289
  X_sample, y_sample = resample(X_train,
290
  y_train,
291
  n_samples = 3500,
292
  stratify = y_train,
293
+ random_state = 42) #type: ignore
294
 
295
  pipeline = Pipeline([('scaler', StandardScaler()),
296
  ('svm', svm.SVC())])
 
334
  y_train,
335
  n_samples = 3500,
336
  stratify = y_train,
337
+ random_state = 42) #type: ignore
338
 
339
  param_grid = {
340
  'n_estimators': list(np.arange(500,4000, 400)),
 
579
  arr_output = arr_output.mean(axis=0)
580
 
581
  return arr_output
582
+ except (ESMProteinError, RuntimeError) as e:
583
  messagebox.showerror("Error", f"Error processing {acc}: {e}")
584
  return
585