Divyansh Chauhan committed on
Commit
867eb64
·
1 Parent(s): 5bc6b9d

all large model files

Browse files
Sentense transformer model.py ADDED
@@ -0,0 +1,313 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # import pandas as pd
2
+ # from sentence_transformers import SentenceTransformer
3
+ # from sklearn.model_selection import train_test_split
4
+ # from sklearn.ensemble import RandomForestClassifier
5
+ # from sklearn.metrics import classification_report, confusion_matrix
6
+ # from sklearn.preprocessing import LabelEncoder
7
+ # from imblearn.over_sampling import SMOTE
8
+ # import joblib
9
+ #
10
+ # # Load dataset
11
+ # df = pd.read_csv(r"D:\Python_files\fully_merged.csv")
12
+ # df = df.dropna(subset=['article', 'label'])
13
+ # df = df[df['label'].isin(['positive', 'neutral', 'negative'])]
14
+ #
15
+ # # SBERT Embedding
16
+ # sbert_model = SentenceTransformer('all-MiniLM-L6-v2')
17
+ # embeddings = sbert_model.encode(df['article'].tolist(), show_progress_bar=True)
18
+ #
19
+ # # Encode labels
20
+ # label_encoder = LabelEncoder()
21
+ # y = label_encoder.fit_transform(df['label'])
22
+ #
23
+ # # Balance the dataset
24
+ # sm = SMOTE(random_state=42)
25
+ # X_resampled, y_resampled = sm.fit_resample(embeddings, y)
26
+ #
27
+ # # Train-test split
28
+ # X_train, X_test, y_train, y_test = train_test_split(
29
+ # X_resampled, y_resampled, test_size=0.2, stratify=y_resampled, random_state=42
30
+ # )
31
+ #
32
+ # # Train classifier
33
+ # clf = RandomForestClassifier(n_estimators=100, random_state=42)
34
+ # clf.fit(X_train, y_train)
35
+ # y_pred = clf.predict(X_test)
36
+ #
37
+ # # Results
38
+ # print("\n✅ SBERT + RandomForest Results")
39
+ # print(classification_report(y_test, y_pred, zero_division=0))
40
+ # print("\n🔍 Confusion Matrix:")
41
+ # print(confusion_matrix(y_test, y_pred))
42
+ #
43
+ # # Define SBERT wrapper for inference compatibility
44
+ # class SBERTTransformer:
45
+ # def __init__(self, model_name='all-MiniLM-L6-v2'):
46
+ # self.model = SentenceTransformer(model_name)
47
+ #
48
+ # def transform(self, sentences):
49
+ # return self.model.encode(sentences)
50
+ #
51
+ # def fit(self, X, y=None):
52
+ # return self
53
+ #
54
+ # # Save components
55
+ # vectorizer = SBERTTransformer() # Wraps SBERT model
56
+ # pipeline = {
57
+ # "vectorizer": vectorizer,
58
+ # "model": clf,
59
+ # "label_encoder": label_encoder
60
+ # }
61
+ #
62
+ # joblib.dump(pipeline, "D:/Python_files/models/sentiment_pipeline.joblib")
63
+ # print("✅ Model saved successfully to sentiment_pipeline.joblib")
64
+ #
65
+ # import pandas as pd
66
+ # from sentence_transformers import SentenceTransformer
67
+ # from sklearn.model_selection import StratifiedKFold
68
+ # from sklearn.ensemble import RandomForestClassifier
69
+ # from sklearn.metrics import classification_report, confusion_matrix
70
+ # from sklearn.preprocessing import LabelEncoder
71
+ # from imblearn.over_sampling import SMOTE
72
+ # import joblib
73
+ # import numpy as np
74
+ #
75
+ # # Load dataset
76
+ # df = pd.read_csv(r"D:\Python_files\fully_merged.csv")
77
+ # df = df.dropna(subset=['article', 'label'])
78
+ # df = df[df['label'].isin(['positive', 'neutral', 'negative'])]
79
+ #
80
+ # # SBERT Embedding
81
+ # sbert_model = SentenceTransformer('all-MiniLM-L6-v2')
82
+ # embeddings = sbert_model.encode(df['article'].tolist(), show_progress_bar=True)
83
+ #
84
+ # # Encode labels
85
+ # label_encoder = LabelEncoder()
86
+ # y = label_encoder.fit_transform(df['label'])
87
+ #
88
+ # # Balance the dataset
89
+ # sm = SMOTE(random_state=42)
90
+ # X_resampled, y_resampled = sm.fit_resample(embeddings, y)
91
+ #
92
+ # # Stratified K-Fold Cross Validation
93
+ # kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
94
+ # all_reports = []
95
+ # fold = 1
96
+ #
97
+ # for train_index, test_index in kf.split(X_resampled, y_resampled):
98
+ # print(f"\n🔁 Fold {fold}")
99
+ # X_train, X_test = X_resampled[train_index], X_resampled[test_index]
100
+ # y_train, y_test = y_resampled[train_index], y_resampled[test_index]
101
+ #
102
+ # clf = RandomForestClassifier(n_estimators=100, random_state=42)
103
+ # clf.fit(X_train, y_train)
104
+ # y_pred = clf.predict(X_test)
105
+ #
106
+ # report = classification_report(y_test, y_pred, target_names=label_encoder.classes_, zero_division=0, output_dict=True)
107
+ # all_reports.append(report)
108
+ #
109
+ # print(classification_report(y_test, y_pred, target_names=label_encoder.classes_, zero_division=0))
110
+ # print("Confusion Matrix:")
111
+ # print(confusion_matrix(y_test, y_pred))
112
+ # fold += 1
113
+ #
114
+ # # Average report (macro avg)
115
+ # avg_report = {}
116
+ # for label in label_encoder.classes_:
117
+ # avg_report[label] = {
118
+ # metric: np.mean([rep[label][metric] for rep in all_reports])
119
+ # for metric in ['precision', 'recall', 'f1-score']
120
+ # }
121
+ #
122
+ # print("\n📊 Average Classification Report across folds:")
123
+ # for label, metrics in avg_report.items():
124
+ # print(f"\nLabel: {label}")
125
+ # for metric, value in metrics.items():
126
+ # print(f"{metric}: {value:.4f}")
127
+ #
128
+ # # Save final model from last fold (or retrain on full data if preferred)
129
+ # final_clf = RandomForestClassifier(n_estimators=100, random_state=42)
130
+ # final_clf.fit(X_resampled, y_resampled)
131
+ #
132
+ # # Define SBERT wrapper
133
+ # class SBERTTransformer:
134
+ # def __init__(self, model_name='all-MiniLM-L6-v2'):
135
+ # self.model = SentenceTransformer(model_name)
136
+ #
137
+ # def transform(self, sentences):
138
+ # return self.model.encode(sentences)
139
+ #
140
+ # def fit(self, X, y=None):
141
+ # return self
142
+ #
143
+ # # Save final pipeline
144
+ # vectorizer = SBERTTransformer()
145
+ # pipeline = {
146
+ # "vectorizer": vectorizer,
147
+ # "model": final_clf,
148
+ # "label_encoder": label_encoder
149
+ # }
150
+ #
151
+ # joblib.dump(pipeline, "D:/Python_files/models/sentiment_pipeline.joblib")
152
+ # print("\n✅ Final model saved successfully to sentiment_pipeline.joblib")
153
+
154
+
155
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
import joblib
import numpy as np
from tqdm import tqdm

# Chunking parameters: chunk size is kept below the model's max sequence
# length (256 for all-MiniLM-L6-v2) so individual chunks are not truncated.
CHUNK_SIZE = 200
OVERLAP = 50


class SBERTTransformer:
    """SBERT wrapper that embeds long documents via overlapping word chunks.

    Documents longer than ``chunk_size`` words are split into overlapping
    chunks, each chunk is encoded with SBERT, and the chunk embeddings are
    mean-pooled into a single document vector.

    The class is defined BEFORE training so the exact same code path
    produces both the training embeddings and the embeddings for new data
    at inference time (the original script duplicated the chunking loop
    at top level, which could silently drift from this class).
    """

    def __init__(self, model_name='all-MiniLM-L6-v2',
                 chunk_size=CHUNK_SIZE, overlap=OVERLAP):
        self.model = SentenceTransformer(model_name)
        self.chunk_size = chunk_size
        self.overlap = overlap

    def _embed_one(self, text):
        """Return one 1-D embedding vector for a single document."""
        words = text.split()
        if len(words) <= self.chunk_size:
            # Short document: one encode call, no chunking needed.
            return self.model.encode([text])[0]
        # Overlapping chunks with stride = chunk_size - overlap.
        step = self.chunk_size - self.overlap
        chunks = [" ".join(words[i:i + self.chunk_size])
                  for i in range(0, len(words), step)]
        # Mean-pool all chunk embeddings into a single document vector.
        return np.mean(self.model.encode(chunks), axis=0)

    def transform(self, sentences):
        """Transform a list of documents into a 2-D embedding array."""
        return np.array([self._embed_one(s)
                         for s in tqdm(sentences, desc="Embedding Articles")])

    def fit(self, X, y=None):
        # SBERT is pre-trained; fit is a no-op kept for sklearn-style APIs.
        return self


# --- 1. Data Loading and Preparation ---
print("🔄 Loading and preparing data...")
df = pd.read_csv(r"D:\Python_files\fully_merged.csv")
df = df.dropna(subset=['article', 'label'])
df = df[df['label'].isin(['positive', 'neutral', 'negative'])]
print("✅ Data loaded successfully.")

# --- 2. SBERT Embedding with Chunking and Averaging ---
print("🧠 Initializing SBERT model...")
vectorizer = SBERTTransformer()
print(f"🚀 Generating embeddings with chunking (Chunk size: {CHUNK_SIZE}, Overlap: {OVERLAP})...")
embeddings = vectorizer.transform(df['article'].tolist())
print("✅ Embeddings generated successfully.")

# --- 3. Encode Labels ---
print("🏷️ Encoding labels...")
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['label'])

# --- 4. Balance the Dataset ---
# NOTE(review): SMOTE runs BEFORE the CV split below, so synthetic test
# samples are interpolated from training neighbours and fold scores are
# optimistic. Kept to preserve the original protocol — consider moving
# resampling inside each training fold.
print("⚖️ Balancing the dataset with SMOTE...")
sm = SMOTE(random_state=42)
X_resampled, y_resampled = sm.fit_resample(embeddings, y)
print(f"Dataset balanced. Original samples: {len(y)}, Resampled samples: {len(y_resampled)}")

# --- 5. Stratified K-Fold Cross Validation ---
print("🔄 Starting 5-Fold Cross-Validation...")
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
all_reports = []

for fold, (train_index, test_index) in enumerate(kf.split(X_resampled, y_resampled), start=1):
    print(f"\n--- Fold {fold} ---")
    X_train, X_test = X_resampled[train_index], X_resampled[test_index]
    y_train, y_test = y_resampled[train_index], y_resampled[test_index]

    clf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # Keep the dict form for averaging and the text form for the console.
    report = classification_report(y_test, y_pred, target_names=label_encoder.classes_,
                                   zero_division=0, output_dict=True)
    all_reports.append(report)

    print(classification_report(y_test, y_pred, target_names=label_encoder.classes_, zero_division=0))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

# --- 6. Average Report Calculation ---
avg_report = {
    label: {
        metric: np.mean([rep[label][metric] for rep in all_reports])
        for metric in ['precision', 'recall', 'f1-score']
    }
    for label in label_encoder.classes_
}

print("\n📊 Average Classification Report Across All Folds:")
for label, metrics in avg_report.items():
    print(f"\nLabel: {label}")
    for metric, value in metrics.items():
        print(f"{metric}: {value:.4f}")

# --- 7. Final Model Training ---
print("\n💪 Training final model on the full, balanced dataset...")
final_clf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
final_clf.fit(X_resampled, y_resampled)
print("✅ Final model trained.")

# --- 8. Save Final Pipeline ---
# Reuse the SAME vectorizer instance that produced the training embeddings,
# guaranteeing train/inference consistency.
# NOTE(review): joblib pickles SBERTTransformer by reference to this
# script's module; unpickling requires the class to be importable.
print("💾 Saving the final pipeline to disk...")
pipeline = {
    "vectorizer": vectorizer,
    "model": final_clf,
    "label_encoder": label_encoder
}

joblib.dump(pipeline, "D:/Python_files/models/sentiment_pipeline_chunking.joblib")
print("\n✅ Final model saved successfully to sentiment_pipeline_chunking.joblib")
Sentense transformer model_tp.py ADDED
@@ -0,0 +1,313 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # import pandas as pd
2
+ # from sentence_transformers import SentenceTransformer
3
+ # from sklearn.model_selection import train_test_split
4
+ # from sklearn.ensemble import RandomForestClassifier
5
+ # from sklearn.metrics import classification_report, confusion_matrix
6
+ # from sklearn.preprocessing import LabelEncoder
7
+ # from imblearn.over_sampling import SMOTE
8
+ # import joblib
9
+ #
10
+ # # Load dataset
11
+ # df = pd.read_csv(r"D:\Python_files\fully_merged.csv")
12
+ # df = df.dropna(subset=['article', 'label'])
13
+ # df = df[df['label'].isin(['positive', 'neutral', 'negative'])]
14
+ #
15
+ # # SBERT Embedding
16
+ # sbert_model = SentenceTransformer('all-MiniLM-L6-v2')
17
+ # embeddings = sbert_model.encode(df['article'].tolist(), show_progress_bar=True)
18
+ #
19
+ # # Encode labels
20
+ # label_encoder = LabelEncoder()
21
+ # y = label_encoder.fit_transform(df['label'])
22
+ #
23
+ # # Balance the dataset
24
+ # sm = SMOTE(random_state=42)
25
+ # X_resampled, y_resampled = sm.fit_resample(embeddings, y)
26
+ #
27
+ # # Train-test split
28
+ # X_train, X_test, y_train, y_test = train_test_split(
29
+ # X_resampled, y_resampled, test_size=0.2, stratify=y_resampled, random_state=42
30
+ # )
31
+ #
32
+ # # Train classifier
33
+ # clf = RandomForestClassifier(n_estimators=100, random_state=42)
34
+ # clf.fit(X_train, y_train)
35
+ # y_pred = clf.predict(X_test)
36
+ #
37
+ # # Results
38
+ # print("\n✅ SBERT + RandomForest Results")
39
+ # print(classification_report(y_test, y_pred, zero_division=0))
40
+ # print("\n🔍 Confusion Matrix:")
41
+ # print(confusion_matrix(y_test, y_pred))
42
+ #
43
+ # # Define SBERT wrapper for inference compatibility
44
+ # class SBERTTransformer:
45
+ # def __init__(self, model_name='all-MiniLM-L6-v2'):
46
+ # self.model = SentenceTransformer(model_name)
47
+ #
48
+ # def transform(self, sentences):
49
+ # return self.model.encode(sentences)
50
+ #
51
+ # def fit(self, X, y=None):
52
+ # return self
53
+ #
54
+ # # Save components
55
+ # vectorizer = SBERTTransformer() # Wraps SBERT model
56
+ # pipeline = {
57
+ # "vectorizer": vectorizer,
58
+ # "model": clf,
59
+ # "label_encoder": label_encoder
60
+ # }
61
+ #
62
+ # joblib.dump(pipeline, "D:/Python_files/models/sentiment_pipeline.joblib")
63
+ # print("✅ Model saved successfully to sentiment_pipeline.joblib")
64
+ #
65
+ # import pandas as pd
66
+ # from sentence_transformers import SentenceTransformer
67
+ # from sklearn.model_selection import StratifiedKFold
68
+ # from sklearn.ensemble import RandomForestClassifier
69
+ # from sklearn.metrics import classification_report, confusion_matrix
70
+ # from sklearn.preprocessing import LabelEncoder
71
+ # from imblearn.over_sampling import SMOTE
72
+ # import joblib
73
+ # import numpy as np
74
+ #
75
+ # # Load dataset
76
+ # df = pd.read_csv(r"D:\Python_files\fully_merged.csv")
77
+ # df = df.dropna(subset=['article', 'label'])
78
+ # df = df[df['label'].isin(['positive', 'neutral', 'negative'])]
79
+ #
80
+ # # SBERT Embedding
81
+ # sbert_model = SentenceTransformer('all-MiniLM-L6-v2')
82
+ # embeddings = sbert_model.encode(df['article'].tolist(), show_progress_bar=True)
83
+ #
84
+ # # Encode labels
85
+ # label_encoder = LabelEncoder()
86
+ # y = label_encoder.fit_transform(df['label'])
87
+ #
88
+ # # Balance the dataset
89
+ # sm = SMOTE(random_state=42)
90
+ # X_resampled, y_resampled = sm.fit_resample(embeddings, y)
91
+ #
92
+ # # Stratified K-Fold Cross Validation
93
+ # kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
94
+ # all_reports = []
95
+ # fold = 1
96
+ #
97
+ # for train_index, test_index in kf.split(X_resampled, y_resampled):
98
+ # print(f"\n🔁 Fold {fold}")
99
+ # X_train, X_test = X_resampled[train_index], X_resampled[test_index]
100
+ # y_train, y_test = y_resampled[train_index], y_resampled[test_index]
101
+ #
102
+ # clf = RandomForestClassifier(n_estimators=100, random_state=42)
103
+ # clf.fit(X_train, y_train)
104
+ # y_pred = clf.predict(X_test)
105
+ #
106
+ # report = classification_report(y_test, y_pred, target_names=label_encoder.classes_, zero_division=0, output_dict=True)
107
+ # all_reports.append(report)
108
+ #
109
+ # print(classification_report(y_test, y_pred, target_names=label_encoder.classes_, zero_division=0))
110
+ # print("Confusion Matrix:")
111
+ # print(confusion_matrix(y_test, y_pred))
112
+ # fold += 1
113
+ #
114
+ # # Average report (macro avg)
115
+ # avg_report = {}
116
+ # for label in label_encoder.classes_:
117
+ # avg_report[label] = {
118
+ # metric: np.mean([rep[label][metric] for rep in all_reports])
119
+ # for metric in ['precision', 'recall', 'f1-score']
120
+ # }
121
+ #
122
+ # print("\n📊 Average Classification Report across folds:")
123
+ # for label, metrics in avg_report.items():
124
+ # print(f"\nLabel: {label}")
125
+ # for metric, value in metrics.items():
126
+ # print(f"{metric}: {value:.4f}")
127
+ #
128
+ # # Save final model from last fold (or retrain on full data if preferred)
129
+ # final_clf = RandomForestClassifier(n_estimators=100, random_state=42)
130
+ # final_clf.fit(X_resampled, y_resampled)
131
+ #
132
+ # # Define SBERT wrapper
133
+ # class SBERTTransformer:
134
+ # def __init__(self, model_name='all-MiniLM-L6-v2'):
135
+ # self.model = SentenceTransformer(model_name)
136
+ #
137
+ # def transform(self, sentences):
138
+ # return self.model.encode(sentences)
139
+ #
140
+ # def fit(self, X, y=None):
141
+ # return self
142
+ #
143
+ # # Save final pipeline
144
+ # vectorizer = SBERTTransformer()
145
+ # pipeline = {
146
+ # "vectorizer": vectorizer,
147
+ # "model": final_clf,
148
+ # "label_encoder": label_encoder
149
+ # }
150
+ #
151
+ # joblib.dump(pipeline, "D:/Python_files/models/sentiment_pipeline.joblib")
152
+ # print("\n✅ Final model saved successfully to sentiment_pipeline.joblib")
153
+
154
+
155
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
import joblib
import numpy as np
from tqdm import tqdm

# Chunking parameters: chunk size is kept below the model's max sequence
# length (256 for all-MiniLM-L6-v2) so individual chunks are not truncated.
CHUNK_SIZE = 200
OVERLAP = 50


class SBERTTransformer:
    """SBERT wrapper that embeds long documents via overlapping word chunks.

    Documents longer than ``chunk_size`` words are split into overlapping
    chunks, each chunk is encoded with SBERT, and the chunk embeddings are
    mean-pooled into a single document vector.

    The class is defined BEFORE training so the exact same code path
    produces both the training embeddings and the embeddings for new data
    at inference time (the original script duplicated the chunking loop
    at top level, which could silently drift from this class).
    """

    def __init__(self, model_name='all-MiniLM-L6-v2',
                 chunk_size=CHUNK_SIZE, overlap=OVERLAP):
        self.model = SentenceTransformer(model_name)
        self.chunk_size = chunk_size
        self.overlap = overlap

    def _embed_one(self, text):
        """Return one 1-D embedding vector for a single document."""
        words = text.split()
        if len(words) <= self.chunk_size:
            # Short document: one encode call, no chunking needed.
            return self.model.encode([text])[0]
        # Overlapping chunks with stride = chunk_size - overlap.
        step = self.chunk_size - self.overlap
        chunks = [" ".join(words[i:i + self.chunk_size])
                  for i in range(0, len(words), step)]
        # Mean-pool all chunk embeddings into a single document vector.
        return np.mean(self.model.encode(chunks), axis=0)

    def transform(self, sentences):
        """Transform a list of documents into a 2-D embedding array."""
        return np.array([self._embed_one(s)
                         for s in tqdm(sentences, desc="Embedding Articles")])

    def fit(self, X, y=None):
        # SBERT is pre-trained; fit is a no-op kept for sklearn-style APIs.
        return self


# --- 1. Data Loading and Preparation ---
print("🔄 Loading and preparing data...")
df = pd.read_csv(r"D:\Python_files\fully_merged.csv")
df = df.dropna(subset=['article', 'label'])
df = df[df['label'].isin(['positive', 'neutral', 'negative'])]
print("✅ Data loaded successfully.")

# --- 2. SBERT Embedding with Chunking and Averaging ---
print("🧠 Initializing SBERT model...")
vectorizer = SBERTTransformer()
print(f"🚀 Generating embeddings with chunking (Chunk size: {CHUNK_SIZE}, Overlap: {OVERLAP})...")
embeddings = vectorizer.transform(df['article'].tolist())
print("✅ Embeddings generated successfully.")

# --- 3. Encode Labels ---
print("🏷️ Encoding labels...")
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['label'])

# --- 4. Balance the Dataset ---
# NOTE(review): SMOTE runs BEFORE the CV split below, so synthetic test
# samples are interpolated from training neighbours and fold scores are
# optimistic. Kept to preserve the original protocol — consider moving
# resampling inside each training fold.
print("⚖️ Balancing the dataset with SMOTE...")
sm = SMOTE(random_state=42)
X_resampled, y_resampled = sm.fit_resample(embeddings, y)
print(f"Dataset balanced. Original samples: {len(y)}, Resampled samples: {len(y_resampled)}")

# --- 5. Stratified K-Fold Cross Validation ---
print("🔄 Starting 5-Fold Cross-Validation...")
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
all_reports = []

for fold, (train_index, test_index) in enumerate(kf.split(X_resampled, y_resampled), start=1):
    print(f"\n--- Fold {fold} ---")
    X_train, X_test = X_resampled[train_index], X_resampled[test_index]
    y_train, y_test = y_resampled[train_index], y_resampled[test_index]

    clf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # Keep the dict form for averaging and the text form for the console.
    report = classification_report(y_test, y_pred, target_names=label_encoder.classes_,
                                   zero_division=0, output_dict=True)
    all_reports.append(report)

    print(classification_report(y_test, y_pred, target_names=label_encoder.classes_, zero_division=0))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

# --- 6. Average Report Calculation ---
avg_report = {
    label: {
        metric: np.mean([rep[label][metric] for rep in all_reports])
        for metric in ['precision', 'recall', 'f1-score']
    }
    for label in label_encoder.classes_
}

print("\n📊 Average Classification Report Across All Folds:")
for label, metrics in avg_report.items():
    print(f"\nLabel: {label}")
    for metric, value in metrics.items():
        print(f"{metric}: {value:.4f}")

# --- 7. Final Model Training ---
print("\n💪 Training final model on the full, balanced dataset...")
final_clf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
final_clf.fit(X_resampled, y_resampled)
print("✅ Final model trained.")

# --- 8. Save Final Pipeline ---
# Reuse the SAME vectorizer instance that produced the training embeddings,
# guaranteeing train/inference consistency.
# NOTE(review): joblib pickles SBERTTransformer by reference to this
# script's module; unpickling requires the class to be importable.
print("💾 Saving the final pipeline to disk...")
pipeline = {
    "vectorizer": vectorizer,
    "model": final_clf,
    "label_encoder": label_encoder
}

joblib.dump(pipeline, "D:/Python_files/models/sentiment_pipeline_chunking.joblib")
print("\n✅ Final model saved successfully to sentiment_pipeline_chunking.joblib")
TF-IDF model.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE

# --- Load dataset ---
# Keep only rows with an article, a label, and one of the three known classes.
df = pd.read_csv(r"D:\Python_files\fully_merged.csv")
df = df.dropna(subset=['article', 'label'])
df = df[df['label'].isin(['positive', 'neutral', 'negative'])]

# --- TF-IDF Vectorization ---
X = df['article'].values
y = df['label'].values

vectorizer = TfidfVectorizer(max_features=3000)
X_vec = vectorizer.fit_transform(X)

# --- Train-test split BEFORE oversampling ---
# Splitting first and applying SMOTE only to the training portion avoids
# data leakage: the original script resampled the whole dataset first, so
# synthetic test samples were interpolated from training neighbours,
# inflating the reported scores.
X_train, X_test, y_train, y_test = train_test_split(
    X_vec, y, test_size=0.2, random_state=42, stratify=y
)

# --- SMOTE Oversampling (training data only) ---
sm = SMOTE(random_state=42)
X_train, y_train = sm.fit_resample(X_train, y_train)

# --- Train classifier ---
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# --- Evaluation ---
print("\n✅ Balanced TF-IDF + RandomForestClassifier")
print(classification_report(y_test, y_pred, zero_division=0))
print("\n🔍 Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
TF-IDF model_tp.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE

# --- Load dataset ---
# Keep only rows with an article, a label, and one of the three known classes.
df = pd.read_csv(r"D:\Python_files\fully_merged.csv")
df = df.dropna(subset=['article', 'label'])
df = df[df['label'].isin(['positive', 'neutral', 'negative'])]

# --- TF-IDF Vectorization ---
X = df['article'].values
y = df['label'].values

vectorizer = TfidfVectorizer(max_features=3000)
X_vec = vectorizer.fit_transform(X)

# --- Train-test split BEFORE oversampling ---
# Splitting first and applying SMOTE only to the training portion avoids
# data leakage: the original script resampled the whole dataset first, so
# synthetic test samples were interpolated from training neighbours,
# inflating the reported scores.
X_train, X_test, y_train, y_test = train_test_split(
    X_vec, y, test_size=0.2, random_state=42, stratify=y
)

# --- SMOTE Oversampling (training data only) ---
sm = SMOTE(random_state=42)
X_train, y_train = sm.fit_resample(X_train, y_train)

# --- Train classifier ---
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# --- Evaluation ---
print("\n✅ Balanced TF-IDF + RandomForestClassifier")
print(classification_report(y_test, y_pred, zero_division=0))
print("\n🔍 Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
best_model_fold_1.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:664bf2a6783a80cde2e5617629a62678dcc053d14576084343b87dc11f6abe2b
3
+ size 439106274
best_model_fold_1_tp.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:664bf2a6783a80cde2e5617629a62678dcc053d14576084343b87dc11f6abe2b
3
+ size 439106274
sbert_rf_pipeline.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:00bbba07af657cedfbdf786fdb47f5447a5775eea554aca84a60cac522e87bfa
3
+ size 93011983
sbert_rf_pipeline_tp.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:00bbba07af657cedfbdf786fdb47f5447a5775eea554aca84a60cac522e87bfa
3
+ size 93011983
sentiment_pipeline_chunking.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d753926e21f482459bef64122a2ad54e6d640366f4596f553359603aa1864e34
3
+ size 93321113
sentiment_pipeline_chunking_tp.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d753926e21f482459bef64122a2ad54e6d640366f4596f553359603aa1864e34
3
+ size 93321113
sentiment_pipeline_lgbm.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7d3df0cf45a1af1564644b68c7478b8b5cf53958f66a62e4f64cab179f3a1717
3
+ size 92495628
sentiment_pipeline_lgbm_tp.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7d3df0cf45a1af1564644b68c7478b8b5cf53958f66a62e4f64cab179f3a1717
3
+ size 92495628