Update project structure and remove unused notebooks
- Added `*/embeddings` to .gitignore to exclude embedding directories.
- Updated `EDA_Psort.ipynb` with new file size for version control.
- Deleted obsolete notebooks: `ESMC_300m.ipynb`, `ESMC_600m.ipynb`, and `ProstT5.ipynb`.
- Introduced `Get_embeddings.ipynb` to streamline embedding processes.
- Enhanced `my_utils.py` with error handling for embedding functions.
- .gitignore +2 -1
- notebooks/EDA_Psort.ipynb +2 -2
- notebooks/ESMC_600m.ipynb +0 -3
- notebooks/{ESMC_300m.ipynb → Get_embeddings.ipynb} +2 -2
- notebooks/ProstT5.ipynb +0 -3
- src/my_utils.py +8 -8
.gitignore
CHANGED
|
@@ -3,4 +3,5 @@
|
|
| 3 |
__pycache__/
|
| 4 |
*.pyc
|
| 5 |
/home/juan/ProteinLocationPredictor/notebooks/__pycache__
|
| 6 |
-
*.txt
|
|
|
|
|
|
| 3 |
__pycache__/
|
| 4 |
*.pyc
|
| 5 |
/home/juan/ProteinLocationPredictor/notebooks/__pycache__
|
| 6 |
+
*.txt
|
| 7 |
+
*/embeddings
|
notebooks/EDA_Psort.ipynb
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f3676927b7ed8eeef1a881f840243f931c72e429e8c5af70db1cbc4b8d82e900
|
| 3 |
+
size 15130990
|
notebooks/ESMC_600m.ipynb
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:06608effffc76bbb3ca65c67a84f06762d459b56a823bff9e61695cab83bb10c
|
| 3 |
-
size 10350
|
|
|
|
|
|
|
|
|
|
|
|
notebooks/{ESMC_300m.ipynb → Get_embeddings.ipynb}
RENAMED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0e07c094294b597ad35d3ab0bd89cf3c5708a68cc09ef7ad66f9ca77490e9461
|
| 3 |
+
size 15520
|
notebooks/ProstT5.ipynb
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:1ccbc986f147709315a8cb774f4c8523f9da34af321c111781e15e2d8c30c5f1
|
| 3 |
-
size 56006
|
|
|
|
|
|
|
|
|
|
|
|
src/my_utils.py
CHANGED
|
@@ -91,7 +91,7 @@ def confusion(title : str, y_true: np.ndarray, y_pred: np.ndarray) -> None:
|
|
| 91 |
class_names = np.unique(y_true)
|
| 92 |
plt.figure(figsize=(6, 4))
|
| 93 |
sns.heatmap(cm, annot=True, fmt='.2f', cmap='Blues',
|
| 94 |
-
xticklabels=class_names, yticklabels=class_names)
|
| 95 |
|
| 96 |
plt.xlabel('Predicted Label')
|
| 97 |
plt.ylabel('True Label')
|
|
@@ -149,7 +149,7 @@ def plot_umap(x: list[np.ndarray], y: list[str], title: str, org: list[str]) ->
|
|
| 149 |
|
| 150 |
|
| 151 |
def plot_PCA(X: np.ndarray, labels: list[str], title: str, org : list[str], scale: bool) -> None:
|
| 152 |
-
X_array = np.vstack(X)
|
| 153 |
pca = PCA(n_components=2, random_state=42)
|
| 154 |
|
| 155 |
if scale:
|
|
@@ -271,7 +271,7 @@ def train_svm(title : str, X: np.ndarray, y: np.ndarray, params:dict) -> tuple[P
|
|
| 271 |
|
| 272 |
confusion(title = title,
|
| 273 |
y_true = y_test,
|
| 274 |
-
y_pred = y_pred)
|
| 275 |
|
| 276 |
print(classification_report(y_test, y_pred, zero_division=0))
|
| 277 |
|
|
@@ -283,14 +283,14 @@ def randomSVM(X: np.ndarray, y = np.ndarray) -> dict:
|
|
| 283 |
X_train, _, y_train, _ = train_test_split(X,
|
| 284 |
y,
|
| 285 |
test_size=0.33,
|
| 286 |
-
stratify=y
|
| 287 |
-
random_state=42)
|
| 288 |
|
| 289 |
X_sample, y_sample = resample(X_train,
|
| 290 |
y_train,
|
| 291 |
n_samples = 3500,
|
| 292 |
stratify = y_train,
|
| 293 |
-
random_state = 42)
|
| 294 |
|
| 295 |
pipeline = Pipeline([('scaler', StandardScaler()),
|
| 296 |
('svm', svm.SVC())])
|
|
@@ -334,7 +334,7 @@ def randomSearch(X: np.ndarray, y: np.ndarray) -> dict:
|
|
| 334 |
y_train,
|
| 335 |
n_samples = 3500,
|
| 336 |
stratify = y_train,
|
| 337 |
-
random_state = 42)
|
| 338 |
|
| 339 |
param_grid = {
|
| 340 |
'n_estimators': list(np.arange(500,4000, 400)),
|
|
@@ -579,7 +579,7 @@ def esm_embed(model: ESMC,
|
|
| 579 |
arr_output = arr_output.mean(axis=0)
|
| 580 |
|
| 581 |
return arr_output
|
| 582 |
-
except ESMProteinError as e:
|
| 583 |
messagebox.showerror("Error", f"Error processing {acc}: {e}")
|
| 584 |
return
|
| 585 |
|
|
|
|
| 91 |
class_names = np.unique(y_true)
|
| 92 |
plt.figure(figsize=(6, 4))
|
| 93 |
sns.heatmap(cm, annot=True, fmt='.2f', cmap='Blues',
|
| 94 |
+
xticklabels=class_names, yticklabels=class_names) #type: ignore
|
| 95 |
|
| 96 |
plt.xlabel('Predicted Label')
|
| 97 |
plt.ylabel('True Label')
|
|
|
|
| 149 |
|
| 150 |
|
| 151 |
def plot_PCA(X: np.ndarray, labels: list[str], title: str, org : list[str], scale: bool) -> None:
|
| 152 |
+
X_array = np.vstack(X) #type: ignore
|
| 153 |
pca = PCA(n_components=2, random_state=42)
|
| 154 |
|
| 155 |
if scale:
|
|
|
|
| 271 |
|
| 272 |
confusion(title = title,
|
| 273 |
y_true = y_test,
|
| 274 |
+
y_pred = y_pred)#type: ignore
|
| 275 |
|
| 276 |
print(classification_report(y_test, y_pred, zero_division=0))
|
| 277 |
|
|
|
|
| 283 |
X_train, _, y_train, _ = train_test_split(X,
|
| 284 |
y,
|
| 285 |
test_size=0.33,
|
| 286 |
+
stratify=y,#type: ignore
|
| 287 |
+
random_state=42)
|
| 288 |
|
| 289 |
X_sample, y_sample = resample(X_train,
|
| 290 |
y_train,
|
| 291 |
n_samples = 3500,
|
| 292 |
stratify = y_train,
|
| 293 |
+
random_state = 42) #type: ignore
|
| 294 |
|
| 295 |
pipeline = Pipeline([('scaler', StandardScaler()),
|
| 296 |
('svm', svm.SVC())])
|
|
|
|
| 334 |
y_train,
|
| 335 |
n_samples = 3500,
|
| 336 |
stratify = y_train,
|
| 337 |
+
random_state = 42) #type: ignore
|
| 338 |
|
| 339 |
param_grid = {
|
| 340 |
'n_estimators': list(np.arange(500,4000, 400)),
|
|
|
|
| 579 |
arr_output = arr_output.mean(axis=0)
|
| 580 |
|
| 581 |
return arr_output
|
| 582 |
+
except (ESMProteinError, RuntimeError) as e:
|
| 583 |
messagebox.showerror("Error", f"Error processing {acc}: {e}")
|
| 584 |
return
|
| 585 |
|