multiclass_model.pkl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4d799eecd128c540ab311a7cb77db6ae088d9b8159a2a6d7f04238ea7859e4d6
3
- size 1178808
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6a97e0d9147fd9f3a5750bf863d4fc36eb3de0a60dd4b8952cb7daca408acdc6
3
+ size 665737
phase_1a_sample_solution_multiclass.ipynb ADDED
@@ -0,0 +1,191 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# A. Extract Features"
8
+ ]
9
+ },
10
+ {
11
+ "cell_type": "code",
12
+ "execution_count": 39,
13
+ "metadata": {},
14
+ "outputs": [
15
+ {
16
+ "data": {
17
+ "text/plain": [
18
+ "<module 'submission.utils.utils' from 'c:\\\\Users\\\\sharv\\\\Documents\\\\TUHH\\\\sem-3\\\\intelligent systems in medicine\\\\project\\\\baselines\\\\phase_1a\\\\submission\\\\utils\\\\utils.py'>"
19
+ ]
20
+ },
21
+ "execution_count": 39,
22
+ "metadata": {},
23
+ "output_type": "execute_result"
24
+ }
25
+ ],
26
+ "source": [
27
+ "# from submission.utils.utils import extract_features_from_image, perform_pca\n",
28
+ "import submission.utils.utils as utils\n",
29
+ "import importlib\n",
30
+ "importlib.reload(utils)"
31
+ ]
32
+ },
33
+ {
34
+ "cell_type": "markdown",
35
+ "metadata": {},
36
+ "source": [
37
+ "## A.1. Extract Features for Multiclass"
38
+ ]
39
+ },
40
+ {
41
+ "cell_type": "code",
42
+ "execution_count": 40,
43
+ "metadata": {},
44
+ "outputs": [
45
+ {
46
+ "name": "stdout",
47
+ "output_type": "stream",
48
+ "text": [
49
+ "Features shape: (2845, 2013)\n",
50
+ "Labels shape: (2845,)\n",
51
+ "[1 1 1 ... 1 2 1]\n"
52
+ ]
53
+ }
54
+ ],
55
+ "source": [
56
+ "from sklearn.model_selection import train_test_split\n",
57
+ "from sklearn.metrics import classification_report\n",
58
+ "import os\n",
59
+ "import pandas as pd\n",
60
+ "import cv2\n",
61
+ "import numpy as np\n",
62
+ "\n",
63
+ "BASE_PATH = \"C:/Users/sharv/Documents/TUHH/sem-3/intelligent systems in medicine/project/baselines/phase_1a\"\n",
64
+ "PATH_TO_GT = os.path.join(BASE_PATH, \"gt_for_classification_multiclass_from_filenames_0_index.csv\")\n",
65
+ "PATH_TO_IMAGES = os.path.join(BASE_PATH, \"images\")\n",
66
+ "\n",
67
+ "df = pd.read_csv(PATH_TO_GT)\n",
68
+ "\n",
69
+ "images = df[\"file_name\"].tolist()\n",
70
+ "\n",
71
+ "features = []\n",
72
+ "labels = []\n",
73
+ "\n",
74
+ "for i in range(len(df)):\n",
75
+ " \n",
76
+ " image_name = df.iloc[i][\"file_name\"]\n",
77
+ " label = df.iloc[i][\"category_id\"]\n",
78
+ "\n",
79
+ " path_to_image = os.path.join(PATH_TO_IMAGES, image_name)\n",
80
+ " image = cv2.imread(path_to_image)\n",
81
+ " \n",
82
+ " image_features = utils.extract_features_from_image(image)\n",
83
+ " \n",
84
+ " features.append(image_features)\n",
85
+ " labels.append(label)\n",
86
+ " \n",
87
+ "features_multiclass = np.array(features)\n",
88
+ "labels_multiclass = np.array(labels)\n",
89
+ "\n",
90
+ "print(\"Features shape:\", features_multiclass.shape)\n",
91
+ "print(\"Labels shape:\", labels_multiclass.shape)\n",
92
+ "print(labels_multiclass)"
93
+ ]
94
+ },
95
+ {
96
+ "cell_type": "markdown",
97
+ "metadata": {},
98
+ "source": [
99
+ "# B. Train Classification Model for Multiclass"
100
+ ]
101
+ },
102
+ {
103
+ "cell_type": "code",
104
+ "execution_count": 41,
105
+ "metadata": {},
106
+ "outputs": [
107
+ {
108
+ "name": "stdout",
109
+ "output_type": "stream",
110
+ "text": [
111
+ "Test Accuracy: 0.9666\n",
112
+ " precision recall f1-score support\n",
113
+ "\n",
114
+ " 0 0.97 0.95 0.96 167\n",
115
+ " 1 0.95 0.98 0.96 253\n",
116
+ " 2 0.99 0.97 0.98 149\n",
117
+ "\n",
118
+ " accuracy 0.97 569\n",
119
+ " macro avg 0.97 0.97 0.97 569\n",
120
+ "weighted avg 0.97 0.97 0.97 569\n",
121
+ "\n",
122
+ "Confusion matrix:\n",
123
+ " [[158 9 0]\n",
124
+ " [ 5 247 1]\n",
125
+ " [ 0 4 145]]\n"
126
+ ]
127
+ }
128
+ ],
129
+ "source": [
130
+ "multiclass_model, _, _ = utils.train_svm_model(features_multiclass, labels_multiclass)\n"
131
+ ]
132
+ },
133
+ {
134
+ "cell_type": "code",
135
+ "execution_count": null,
136
+ "metadata": {},
137
+ "outputs": [
138
+ {
139
+ "name": "stdout",
140
+ "output_type": "stream",
141
+ "text": [
142
+ "Pipeline(steps=[('scaler', StandardScaler()), ('select', SelectKBest(k=500)),\n",
143
+ " ('pca', PCA(n_components=100)),\n",
144
+ " ('svc',\n",
145
+ " SVC(class_weight='balanced', kernel='linear', probability=True,\n",
146
+ " random_state=42))])\n"
147
+ ]
148
+ }
149
+ ],
150
+ "source": [
151
+ "print(multiclass_model)"
152
+ ]
153
+ },
154
+ {
155
+ "cell_type": "code",
156
+ "execution_count": 43,
157
+ "metadata": {},
158
+ "outputs": [],
159
+ "source": [
160
+ "# save the weights of multiclass_model\n",
161
+ "import pickle\n",
162
+ "\n",
163
+ "SAVE_PATH = \"C:/Users/sharv/Documents/TUHH/sem-3/intelligent systems in medicine/project/baselines/phase_1a/submission\"\n",
164
+ "\n",
165
+ "with open(os.path.join(SAVE_PATH, \"multiclass_model.pkl\"), \"wb\") as f:\n",
166
+ " pickle.dump(multiclass_model, f)\n"
167
+ ]
168
+ }
169
+ ],
170
+ "metadata": {
171
+ "kernelspec": {
172
+ "display_name": "ism",
173
+ "language": "python",
174
+ "name": "python3"
175
+ },
176
+ "language_info": {
177
+ "codemirror_mode": {
178
+ "name": "ipython",
179
+ "version": 3
180
+ },
181
+ "file_extension": ".py",
182
+ "mimetype": "text/x-python",
183
+ "name": "python",
184
+ "nbconvert_exporter": "python",
185
+ "pygments_lexer": "ipython3",
186
+ "version": "3.9.25"
187
+ }
188
+ },
189
+ "nbformat": 4,
190
+ "nbformat_minor": 2
191
+ }
script.py CHANGED
@@ -3,39 +3,29 @@ import pickle
3
  import cv2
4
  import pandas as pd
5
  import numpy as np
6
- from utils.utils import extract_features_from_image, perform_pca, train_svm_model
7
 
8
 
9
- def run_inference(TEST_IMAGE_PATH, svm_model, k, SUBMISSION_CSV_SAVE_PATH):
10
-
11
- test_images = os.listdir(TEST_IMAGE_PATH)
12
- test_images.sort()
13
 
14
  image_feature_list = []
15
-
16
  for test_image in test_images:
17
-
18
  path_to_image = os.path.join(TEST_IMAGE_PATH, test_image)
19
-
20
  image = cv2.imread(path_to_image)
21
- image_features = extract_features_from_image(image)
22
-
23
- image_feature_list.append(image_features)
24
-
25
  features_multiclass = np.array(image_feature_list)
26
-
27
- features_multiclass_reduced = perform_pca(features_multiclass, k)
28
-
29
- multiclass_predictions = svm_model.predict(features_multiclass_reduced)
30
 
31
- df_predictions = pd.DataFrame(columns=["file_name", "category_id"])
 
 
 
 
 
32
 
33
- for i in range(len(test_images)):
34
- file_name = test_images[i]
35
- new_row = pd.DataFrame({"file_name": file_name,
36
- "category_id": multiclass_predictions[i]}, index=[0])
37
- df_predictions = pd.concat([df_predictions, new_row], ignore_index=True)
38
-
39
  df_predictions.to_csv(SUBMISSION_CSV_SAVE_PATH, index=False)
40
 
41
 
 
3
  import cv2
4
  import pandas as pd
5
  import numpy as np
6
+ from utils.utils import extract_features_from_image
7
 
8
 
9
def run_inference(TEST_IMAGE_PATH, pipeline_model, SUBMISSION_CSV_SAVE_PATH):
    """
    Predicts a category for every entry of a directory and writes the result as CSV.

    Parameters:
    - TEST_IMAGE_PATH: Directory whose entries are read as test images (processed in sorted order)
    - pipeline_model: Fitted model whose predict() accepts the raw feature matrix
    - SUBMISSION_CSV_SAVE_PATH: Path of the CSV file to write (columns: file_name, category_id)

    Raises:
    - ValueError: If a directory entry cannot be decoded as an image
    """
    test_images = sorted(os.listdir(TEST_IMAGE_PATH))

    image_feature_list = []

    for test_image in test_images:
        path_to_image = os.path.join(TEST_IMAGE_PATH, test_image)
        image = cv2.imread(path_to_image)
        # cv2.imread signals failure by returning None instead of raising;
        # fail here with the offending path rather than deep inside feature extraction.
        if image is None:
            raise ValueError(f"Could not read image: {path_to_image}")
        features = extract_features_from_image(image)
        image_feature_list.append(features)

    features_multiclass = np.array(image_feature_list)

    multiclass_predictions = pipeline_model.predict(features_multiclass)

    # One row per directory entry, in the same sorted order used for prediction
    df_predictions = pd.DataFrame({
        "file_name": test_images,
        "category_id": multiclass_predictions
    })

    df_predictions.to_csv(SUBMISSION_CSV_SAVE_PATH, index=False)
30
 
31
 
utils/__init__.py ADDED
File without changes
utils/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (214 Bytes). View file
 
utils/__pycache__/utils.cpython-39.pyc ADDED
Binary file (6.77 kB). View file
 
utils.py → utils/utils.py RENAMED
@@ -2,86 +2,87 @@ import cv2
2
  import numpy as np
3
  from skimage.feature.texture import graycomatrix, graycoprops
4
  from skimage.feature import local_binary_pattern ,hog
5
- from skimage.feature import local_binary_pattern
6
  from sklearn.decomposition import PCA
7
  from sklearn.svm import SVC
8
- from sklearn.model_selection import GridSearchCV
9
- from sklearn.model_selection import train_test_split
10
- from sklearn.metrics import accuracy_score
11
  from sklearn.preprocessing import StandardScaler
12
- from sklearn.metrics import classification_report
13
 
14
 
15
- def rgb_histogram(image, bins=64):
16
  features = []
17
-
18
- # RGB histograms (reduced bins)
 
 
 
19
  for i in range(3):
20
  hist = cv2.calcHist([image], [i], None, [bins], [0, 256])
21
  hist = cv2.normalize(hist, hist).flatten()
22
  features.extend(hist)
23
-
24
- # HSV color space (more discriminative)
25
- hsv = cv2.cvtColor(image, cv2.COLOR_RGB2HSV)
26
- for i in range(3):
27
- hist = cv2.calcHist([hsv], [i], None, [bins], [0, 256])
28
  hist = cv2.normalize(hist, hist).flatten()
29
  features.extend(hist)
30
-
31
- # Color moments (mean, std for each channel)
32
  for i in range(3):
33
- channel = image[:, :, i].astype(np.float32)
34
- features.append(np.mean(channel))
35
- features.append(np.std(channel))
36
- features.append(np.median(channel))
37
-
 
38
  return np.array(features)
39
 
 
40
  def hu_moments(image):
41
- # Convert to grayscale if the image is in RGB format
42
  gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
43
  moments = cv2.moments(gray)
44
- hu_moments = cv2.HuMoments(moments).flatten()
45
- # Apply log transform to reduce scale variance
46
- hu_moments = -np.sign(hu_moments) * np.log10(np.abs(hu_moments) + 1e-10)
47
- return hu_moments
 
 
48
 
49
- def glcm_features(image, distances=[1], angles=[0], levels=256, symmetric=True, normed=True):
50
- # Multiple distance-angle combinations for texture diversity
51
  gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
52
- glcm = graycomatrix(gray, distances=distances, angles=angles, levels=levels, symmetric=symmetric, normed=normed)
53
- contrast = graycoprops(glcm, 'contrast').flatten()
54
- dissimilarity = graycoprops(glcm, 'dissimilarity').flatten()
55
- homogeneity = graycoprops(glcm, 'homogeneity').flatten()
56
- energy = graycoprops(glcm, 'energy').flatten()
57
- correlation = graycoprops(glcm, 'correlation').flatten()
58
- asm = graycoprops(glcm, 'ASM').flatten()
59
- return np.concatenate([contrast, dissimilarity, homogeneity, energy, correlation, asm])
60
-
61
- def local_binary_pattern_features(image, P=8, R=1): #Higher P and R
 
 
 
 
 
62
  gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
63
  lbp = local_binary_pattern(gray, P, R, method='uniform')
64
- (hist, _) = np.histogram(lbp.ravel(), bins=np.arange(0, P + 3), range=(0, P + 2), density=True)
65
  return hist
66
 
67
 
68
-
69
  # Edge Density (Canny-based)
70
-
71
  def edge_density(image, low_threshold=50, high_threshold=150):
72
-
73
  gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
74
  edges = cv2.Canny(gray, low_threshold, high_threshold)
75
  density = np.sum(edges > 0) / edges.size
76
  return np.array([density])
77
 
78
 
79
-
80
-
81
- def hog_features(image, pixels_per_cell=(64, 64), cells_per_block=(1, 1), orientations=4):
82
- """
83
- Highly compressed HOG features to prevent overfitting
84
- """
85
  image_resized = cv2.resize(image, (128, 128))
86
  gray = cv2.cvtColor(image_resized, cv2.COLOR_RGB2GRAY)
87
  hog_feat = hog(gray,
@@ -89,43 +90,20 @@ def hog_features(image, pixels_per_cell=(64, 64), cells_per_block=(1, 1), orient
89
  pixels_per_cell=pixels_per_cell,
90
  cells_per_block=cells_per_block,
91
  block_norm='L2-Hys',
 
92
  feature_vector=True)
93
  return hog_feat
94
 
95
 
96
  def extract_features_from_image(image):
97
-
98
- # 1. RGB Histogram
99
- hist_features = rgb_histogram(image)
100
-
101
-
102
- # 2. Hu Moments
103
- hu_features = hu_moments(image)
104
-
105
- # 3. GLCM Features
106
- glcm_features_vector = glcm_features(image)
107
-
108
- # 4. Local Binary Pattern (LBP)
109
- lbp_features = local_binary_pattern_features(image)
110
-
111
-
112
- #### Add more feature extraction methods here ####
113
-
114
- edge_feat = edge_density(image)
115
- hog_feat = hog_features(image)
116
-
117
-
118
- ##################################################
119
-
120
-
121
- # Concatenate all feature vectors
122
- image_features = np.concatenate([hist_features, hu_features, glcm_features_vector, lbp_features
123
- ,edge_feat,hog_feat])
124
-
125
-
126
- return image_features
127
-
128
 
 
129
 
130
  def perform_pca(data, num_components):
131
  # Clean data
@@ -145,53 +123,58 @@ def perform_pca(data, num_components):
145
 
146
  return data_reduced
147
 
148
-
149
- def train_svm_model(features, labels, test_size=0.2, k=100):
 
 
 
 
 
150
  """
151
- Trains an SVM model and returns the trained model.
152
-
153
- Parameters:
154
- - features: Feature matrix of shape (B, F)
155
- - labels: Label matrix of shape (B, C) if one-hot encoded, or (B,) for single labels
156
- - test_size: Proportion of the data to use for testing (default is 0.2)
157
-
158
  Returns:
159
- - svm_model: Trained SVM model
 
 
160
  """
161
- # Check if labels are one-hot encoded, convert if needed
162
  if labels.ndim > 1 and labels.shape[1] > 1:
163
- labels = np.argmax(labels, axis=1) # Convert one-hot to single label per sample
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
164
 
165
- # Split the data into training and testing sets
166
- X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=test_size, random_state=42)
167
-
168
- # ---------- FIX 1: Standardize TRAIN ONLY ----------
169
- scaler = StandardScaler()
170
- X_train_scaled = scaler.fit_transform(X_train)
171
- X_test_scaled = scaler.transform(X_test)
172
 
173
- # ---------- FIX 2: PCA fit ONLY on TRAIN ----------
174
- pca = PCA(n_components=min(k, X_train_scaled.shape[1]))
175
- X_train_reduced = pca.fit_transform(X_train_scaled)
176
- X_test_reduced = pca.transform(X_test_scaled)
177
-
178
- # SVM GridSearch
179
- param_grid = {
180
- 'C': [0.1, 1],
181
- 'gamma': [0.001, 0.0001],
182
- 'kernel': ['rbf']
183
- }
184
- grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=3)
185
- grid.fit(X_train_reduced, y_train)
186
 
187
- # Evaluate
188
- preds = grid.predict(X_test_reduced)
189
- report = classification_report(y_test, preds)
190
-
191
- # Return EVERYTHING needed for inference
192
- return {
193
- "svm": grid,
194
- "scaler": scaler,
195
- "pca": pca,
196
- "report": report
197
- }
 
2
  import numpy as np
3
  from skimage.feature.texture import graycomatrix, graycoprops
4
  from skimage.feature import local_binary_pattern ,hog
 
5
  from sklearn.decomposition import PCA
6
  from sklearn.svm import SVC
7
+ from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
8
+ from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
9
+ from sklearn.feature_selection import SelectKBest, f_classif
10
  from sklearn.preprocessing import StandardScaler
11
+ from sklearn.pipeline import Pipeline
12
 
13
 
14
def rgb_histogram(image, bins=32):
    """
    Builds a colour descriptor from per-channel histograms and colour moments.

    Parameters:
    - image: 3-channel image array
    - bins: Number of histogram bins per channel (default is 32)

    Returns:
    - 1-D array: 3 input-channel histograms, 3 HSV histograms, then
      (mean, std, skew) for each of the 3 input channels
    """
    # Float32 copy used for the histograms and the moment arithmetic
    pixels = image.astype(np.float32)
    descriptor = []

    # Histograms of the three input channels
    for ch in range(3):
        channel_hist = cv2.calcHist([pixels], [ch], None, [bins], [0, 256])
        channel_hist = cv2.normalize(channel_hist, channel_hist).flatten()
        descriptor.extend(channel_hist)

    # Histograms of the HSV channels; the first channel uses the range [0, 180]
    hsv_pixels = cv2.cvtColor(pixels.astype(np.uint8), cv2.COLOR_RGB2HSV)
    for ch, upper in enumerate((180, 256, 256)):
        channel_hist = cv2.calcHist([hsv_pixels], [ch], None, [bins], [0, upper])
        channel_hist = cv2.normalize(channel_hist, channel_hist).flatten()
        descriptor.extend(channel_hist)

    # Colour moments: mean, standard deviation and cube root of the third central moment
    for ch in range(3):
        plane = pixels[:, :, ch]
        mu = np.mean(plane)
        sigma = np.std(plane)
        skewness = np.cbrt(np.mean((plane - mu) ** 3))
        descriptor.extend([mu, sigma, skewness])

    return np.array(descriptor)
42
 
43
+
44
def hu_moments(image):
    """
    Computes log-scaled Hu moments of the grayscale image.

    Returns:
    - 1-D array of Hu moments after a signed log10 transform, clipped to [-10, 10]
    """
    grayscale = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
    raw = cv2.HuMoments(cv2.moments(grayscale)).flatten()
    # Signed log transform compresses the scale; the epsilon avoids log10(0)
    log_scaled = -np.sign(raw) * np.log10(np.abs(raw) + 1e-10)
    # Clip extreme values to reduce sensitivity to noise
    return np.clip(log_scaled, -10, 10)
52
+
53
 
54
def glcm_features(image, distances=(1, 2), angles=(0, np.pi/4, np.pi/2), levels=64):
    """
    Computes GLCM texture properties for every distance/angle combination.

    Parameters:
    - image: 3-channel image array
    - distances: Pixel-pair distances passed to graycomatrix
    - angles: Pixel-pair angles (radians) passed to graycomatrix
    - levels: Number of gray levels after quantization, 1..256 (default is 64)

    Returns:
    - 1-D array ordered by distance, then angle, then property
      (contrast, dissimilarity, homogeneity, energy, correlation)

    Raises:
    - ValueError: If levels is outside 1..256
    """
    if not 1 <= levels <= 256:
        raise ValueError(f"levels must be between 1 and 256, got {levels}")

    gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
    # Quantize 0..255 into 0..levels-1. Multiplying before dividing keeps the result
    # below `levels` for any value of levels, not only divisors of 256; for divisors
    # of 256 it equals gray // (256 // levels). uint16 avoids overflow of the product.
    gray = (gray.astype(np.uint16) * levels // 256).astype(np.uint8)

    props = ('contrast', 'dissimilarity', 'homogeneity', 'energy', 'correlation')
    features = []

    for d in distances:
        for a in angles:
            glcm = graycomatrix(gray, distances=[d], angles=[a], levels=levels, symmetric=True, normed=True)
            for p in props:
                features.extend(graycoprops(glcm, p).flatten())

    return np.array(features)
68
+
69
+
70
def local_binary_pattern_features(image, P=8, R=1):
    """
    Computes a density-normalized histogram of uniform LBP codes of the grayscale image.

    Parameters:
    - image: 3-channel image array
    - P: Number of neighbour points passed to local_binary_pattern (default is 8)
    - R: Radius passed to local_binary_pattern (default is 1)

    Returns:
    - 1-D histogram with P + 2 bins
    """
    grayscale = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
    codes = local_binary_pattern(grayscale, P, R, method='uniform')
    # Integer bin edges 0..P+2 give one bin per code value
    bin_edges = np.arange(0, P + 3)
    histogram = np.histogram(codes.ravel(), bins=bin_edges, range=(0, P + 2), density=True)[0]
    return histogram
75
 
76
 
 
77
  # Edge Density (Canny-based)
 
78
def edge_density(image, low_threshold=50, high_threshold=150):
    """
    Computes the fraction of pixels marked as edges by the Canny detector.

    Parameters:
    - image: 3-channel image array
    - low_threshold: Lower Canny threshold (default is 50)
    - high_threshold: Upper Canny threshold (default is 150)

    Returns:
    - Array of shape (1,) holding the edge-pixel fraction
    """
    grayscale = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
    edge_map = cv2.Canny(grayscale, low_threshold, high_threshold)
    # Non-zero entries of the Canny output are edge pixels
    edge_fraction = np.sum(edge_map > 0) / edge_map.size
    return np.array([edge_fraction])
83
 
84
 
85
+ def hog_features(image, pixels_per_cell=(16,16), cells_per_block=(2,2), orientations=9):
 
 
 
 
 
86
  image_resized = cv2.resize(image, (128, 128))
87
  gray = cv2.cvtColor(image_resized, cv2.COLOR_RGB2GRAY)
88
  hog_feat = hog(gray,
 
90
  pixels_per_cell=pixels_per_cell,
91
  cells_per_block=cells_per_block,
92
  block_norm='L2-Hys',
93
+ transform_sqrt=True,
94
  feature_vector=True)
95
  return hog_feat
96
 
97
 
98
def extract_features_from_image(image):
    """
    Concatenates all per-image feature vectors into a single 1-D array.

    The order is: colour histogram/moments, Hu moments, GLCM, LBP,
    edge density, HOG.
    """
    extractors = (
        rgb_histogram,
        hu_moments,
        glcm_features,
        local_binary_pattern_features,
        edge_density,
        hog_features,
    )
    return np.concatenate([extract(image) for extract in extractors])
107
 
108
  def perform_pca(data, num_components):
109
  # Clean data
 
123
 
124
  return data_reduced
125
 
126
def train_svm_model(features, labels,
                    test_size=0.2,
                    random_state=42,
                    use_selectkbest=True,
                    k_best=500,
                    n_pca_components=100,
                    do_gridsearch=False):
    """
    Trains a scaler -> (optional SelectKBest) -> PCA -> linear SVC pipeline
    on a stratified train split and prints an evaluation on the held-out split.

    Parameters:
    - features: Feature matrix of shape (B, F)
    - labels: Labels of shape (B,), or (B, C) one-hot (converted with argmax)
    - test_size: Proportion of the data held out for evaluation (default is 0.2)
    - random_state: Seed for the split, the SVC and the CV folds (default is 42)
    - use_selectkbest: Whether to add the SelectKBest step (default is True)
    - k_best: Number of features kept by SelectKBest, capped at F (default is 500)
    - n_pca_components: Requested PCA components; capped so it never exceeds the
      number of features reaching PCA or the number of training samples (default is 100)
    - do_gridsearch: Whether to tune select__k, pca__n_components and svc__C
      with 5-fold stratified CV (default is False)

    Returns:
        pipeline: trained sklearn Pipeline (scaler -> optional SelectKBest -> PCA -> SVC)
        X_test, y_test, y_pred for quick evaluation
        grid_search (if do_gridsearch True), else None
    """
    if labels.ndim > 1 and labels.shape[1] > 1:
        labels = np.argmax(labels, axis=1)

    # stratified split
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=test_size, random_state=random_state, stratify=labels)

    n_train, n_features = X_train.shape

    # build pipeline steps
    steps = [('scaler', StandardScaler())]
    n_selected = n_features
    if use_selectkbest:
        n_selected = min(k_best, n_features)
        steps.append(('select', SelectKBest(score_func=f_classif, k=n_selected)))
    # PCA cannot produce more components than the features it receives
    # or the samples it is fitted on.
    pca_components = min(n_pca_components, n_selected, n_train)
    steps.append(('pca', PCA(n_components=pca_components)))
    steps.append(('svc', SVC(kernel='linear', probability=True, class_weight='balanced', random_state=random_state)))
    pipeline = Pipeline(steps)

    grid_search = None
    if do_gridsearch:
        n_splits = 5
        param_grid = {'svc__C': [0.1, 1, 5, 10]}

        # Approximate size of the smallest CV training fold
        # (all samples minus one test fold, rounded up).
        pca_limit = n_train - -(-n_train // n_splits)
        if use_selectkbest:
            k_options = sorted({int(min(k, n_features)) for k in (200, 500, 1000)})
            param_grid['select__k'] = k_options
            # The grid is a cartesian product, so every PCA size must be valid
            # for the smallest k candidate.
            pca_limit = min(pca_limit, k_options[0])
        else:
            pca_limit = min(pca_limit, n_features)
        pca_limit = max(1, pca_limit)
        param_grid['pca__n_components'] = sorted({min(c, pca_limit) for c in (50, 100, 200)})

        cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
        grid_search = GridSearchCV(pipeline, param_grid, cv=cv, n_jobs=-1, scoring='accuracy', verbose=2)
        grid_search.fit(X_train, y_train)
        pipeline = grid_search.best_estimator_
    else:
        pipeline.fit(X_train, y_train)

    # Evaluate
    y_pred = pipeline.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"Test Accuracy: {acc:.4f}")
    print(classification_report(y_test, y_pred))
    print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))

    return pipeline, (X_test, y_test, y_pred), grid_search
 
 
 
 
 
 
 
 
 
 
 
 
180