| |
|
|
| import pandas as pd |
| import numpy as np |
| import matplotlib.pyplot as plt |
| import seaborn as sns |
|
|
| from sklearn.model_selection import StratifiedKFold, GridSearchCV, RandomizedSearchCV, train_test_split |
| from sklearn.metrics import f1_score, make_scorer, classification_report, confusion_matrix |
| from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier |
| from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler |
| from sklearn.compose import ColumnTransformer |
| from sklearn.pipeline import Pipeline |
| from scipy.stats import randint |
|
|
| |
# Load the feature table from disk.
df = pd.read_csv("Feature_Extracted_Corpus.csv")

# Fold one label per categorical column into the shared "Other" bucket.
for column, folded_label in (
    ("sentence_construction_type", "Unknown"),
    ("sentence_type", "Compound-Complex"),
):
    df[column] = df[column].replace([folded_label], "Other")

# Report the category and target frequencies after the consolidation above.
for heading, column in (
    ("Sentence Construction Type Counts:\n", "sentence_construction_type"),
    ("\nSentence Type Counts:\n", "sentence_type"),
    ("\nTarget Distribution:\n", "group"),
):
    print(heading, df[column].value_counts())
|
|
| |
# Encode the `group` target as integers and show the class -> index mapping.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df["group"])
class_to_index = {label: index for index, label in enumerate(label_encoder.classes_)}
print("\nEncoded classes:", class_to_index)

# Feature matrix: everything except the id, text, group and grade columns
# (`group` is the target encoded above).
non_feature_cols = ["id", "text", "group", "grade"]
X = df.drop(columns=non_feature_cols)

# Numeric columns are detected by dtype; the categorical ones are listed by name.
numeric_frame = X.select_dtypes(include=[np.number])
numeric_cols = numeric_frame.columns.tolist()
categorical_cols = ["sentence_construction_type", "sentence_type"]

print(f"\nNumeric features: {numeric_cols}")
print(f"Categorical features: {categorical_cols}")
|
|
| |
# Shared preprocessing: one-hot encode the categorical columns (categories not
# seen during fit are ignored), standardize the numeric columns, and pass any
# remaining column through untouched.
categorical_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
numeric_scaler = StandardScaler()
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", categorical_encoder, categorical_cols),
        ("num", numeric_scaler, numeric_cols),
    ],
    remainder="passthrough",
)
|
|
| |
# Hold out 20% of the rows, stratified on the encoded target, with a fixed seed.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Report the split sizes and the per-class counts on each side.
for description, value in (
    ("\nTraining set size", len(X_train)),
    ("Test set size", len(X_test)),
    ("Training set class distribution", np.bincount(y_train)),
    ("Test set class distribution", np.bincount(y_test)),
):
    print(f"{description}: {value}")
|
|
| |
print("\n=== ANALYZING FEATURE IMPORTANCE ===")

# Hyperparameters of the baseline ("current") random forest.
best_params = dict(
    max_depth=10,
    max_features='sqrt',
    min_samples_leaf=4,
    min_samples_split=2,
    n_estimators=300,
)

# Baseline model: shared preprocessing followed by a class-weighted forest.
baseline_forest = RandomForestClassifier(
    random_state=42,
    class_weight='balanced',
    **best_params,
)
pipeline = Pipeline(
    steps=[
        ("preprocessing", preprocessor),
        ("classifier", baseline_forest),
    ]
)

pipeline.fit(X_train, y_train)
|
|
| |
# Report which features the fitted baseline forest relies on most.
#
# The importances are needed on both the success and the fallback path, so
# they are read once up front instead of once per branch.
importances = pipeline.named_steps['classifier'].feature_importances_

try:
    # Only the name lookup and the table construction are best-effort; the
    # printing/plotting below runs outside the try so that an unrelated
    # failure is not misreported as a feature-name problem.
    raw_names = (pipeline.named_steps['preprocessing']
                 .get_feature_names_out()
                 .tolist())

    # Strip only the *leading* "cat__" / "num__" transformer prefix. A plain
    # str.replace would also delete those substrings from the middle of a
    # column name.
    feature_names = []
    for raw_name in raw_names:
        transformer, separator, column_name = raw_name.partition('__')
        if separator and transformer in ('cat', 'num'):
            feature_names.append(column_name)
        else:
            feature_names.append(raw_name)

    feature_importance_df = pd.DataFrame({
        'feature': feature_names,
        'importance': importances
    }).sort_values('importance', ascending=False)

except Exception as e:
    # Fallback: without usable names, report the top features by column index.
    print(f"Could not get feature names: {e}")
    print(f"\nTop 10 feature importances (indices): {importances.argsort()[-10:][::-1]}")

else:
    print("\nTop 10 most important features:")
    print(feature_importance_df.head(10))

    # Horizontal bar chart of the 20 largest importances, largest on top.
    plt.figure(figsize=(12, 8))
    top_20 = feature_importance_df.head(20)
    bar_positions = range(len(top_20))
    plt.barh(bar_positions, top_20['importance'])
    plt.yticks(bar_positions, top_20['feature'])
    plt.xlabel('Feature Importance')
    plt.title('Top 20 Most Important Features')
    plt.gca().invert_yaxis()
    plt.tight_layout()
    plt.show()
|
|
| |
# Score the baseline pipeline on the held-out split.
y_pred = pipeline.predict(X_test)
current_f1 = f1_score(y_test, y_pred, average='macro')

print("\n=== CURRENT MODEL PERFORMANCE ===")
print("Test F1 Macro: {:.4f}".format(current_f1))
print("\nClassification Report:")
baseline_report = classification_report(
    y_test,
    y_pred,
    target_names=label_encoder.classes_,
)
print(baseline_report)

# Confusion matrix heatmap, labelled with the original class names on both axes.
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
heatmap_options = dict(
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=label_encoder.classes_,
    yticklabels=label_encoder.classes_,
)
sns.heatmap(cm, **heatmap_options)
plt.title('Confusion Matrix - Current Model')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()
|
|
| |
print("\n=== OPTION 1: FOCUSED GRID SEARCH ===")

# Wider grid around the baseline hyperparameters. Keys carry the
# "classifier__" prefix so they address the pipeline step of that name.
# NOTE(review): nothing in this script passes this grid to a search; only
# `smaller_grid` below is used.
focused_param_grid = {
    f"classifier__{param_name}": candidates
    for param_name, candidates in {
        "n_estimators": [200, 300, 400],
        "max_depth": [8, 10, 12],
        "min_samples_split": [2, 3, 4],
        "min_samples_leaf": [3, 4, 5, 6],
        "max_features": ["sqrt", "log2", 0.3, 0.4],
        "class_weight": ["balanced", "balanced_subsample"],
    }.items()
}

# Reduced grid with fewer candidate values per parameter.
smaller_grid = {
    f"classifier__{param_name}": candidates
    for param_name, candidates in {
        "n_estimators": [200, 300],
        "max_depth": [8, 10],
        "min_samples_split": [2, 4],
        "min_samples_leaf": [3, 4, 5],
        "max_features": ["sqrt", 0.4],
        "class_weight": ["balanced"],
    }.items()
}
|
|
# 5-fold stratified CV with a fixed shuffle seed, scored by macro-averaged F1.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
f1_macro_scorer = make_scorer(f1_score, average="macro")

# Exhaustive search over `smaller_grid` on a fresh preprocessing + forest pipeline.
search_pipeline = Pipeline([
    ("preprocessing", preprocessor),
    ("classifier", RandomForestClassifier(random_state=42)),
])
grid_search = GridSearchCV(
    estimator=search_pipeline,
    param_grid=smaller_grid,
    scoring=f1_macro_scorer,
    cv=cv,
    n_jobs=-1,
    verbose=1,
)

print("Running grid search (this may take a few minutes)...")
grid_search.fit(X_train, y_train)

# Report the winning configuration and its cross-validated score.
print("\n=== BEST MODEL FROM GRID SEARCH ===")
print("Best CV Score: {:.4f}".format(grid_search.best_score_))
print("Best Params:")
for param, value in grid_search.best_params_.items():
    print(f" {param}: {value}")

# Evaluate the search result on the held-out split.
y_pred_grid = grid_search.predict(X_test)
grid_f1 = f1_score(y_test, y_pred_grid, average='macro')
print("\nTest F1 Macro: {:.4f}".format(grid_f1))

print("\nClassification Report - Grid Search Model:")
grid_report = classification_report(
    y_test,
    y_pred_grid,
    target_names=label_encoder.classes_,
)
print(grid_report)
|
|
| |
print("\n=== OPTION 2: COMPARING DIFFERENT ALGORITHMS ===")

# Candidate classifiers with their default hyperparameters. The two forest
# variants are given class_weight='balanced'; GradientBoostingClassifier is
# constructed without it.
algorithms = {
    'Random Forest': RandomForestClassifier(random_state=42, class_weight='balanced'),
    'Extra Trees': ExtraTreesClassifier(random_state=42, class_weight='balanced'),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42)
}

# Test-set macro F1 per algorithm, keyed by display name.
results = {}
for name, algorithm in algorithms.items():
    print(f"\nTraining {name}...")
    pipe = Pipeline([
        ("preprocessing", preprocessor),
        ("classifier", algorithm)
    ])

    pipe.fit(X_train, y_train)
    # Loop-specific names: reusing `y_pred` here would overwrite the baseline
    # model's test predictions held in the module-level `y_pred`.
    algo_pred = pipe.predict(X_test)
    algo_f1 = f1_score(y_test, algo_pred, average='macro')
    results[name] = algo_f1
    print(f"{name} F1 Macro: {algo_f1:.4f}")
|
|
| |
# Bar chart of test macro F1 per algorithm, each bar annotated with its score.
plt.figure(figsize=(10, 6))
plt.bar(results.keys(), results.values())
plt.ylabel('F1 Macro Score')
plt.title('Algorithm Comparison')

# Bars sit at x = 0, 1, 2, ...; place each label slightly above its bar.
for bar_index, bar_score in enumerate(results.values()):
    label_height = bar_score + 0.01
    plt.text(bar_index, label_height, f'{bar_score:.3f}', ha='center')

plt.ylim(0, 1)
plt.show()
|
|
| |
print("\n=== OPTION 3: ENSEMBLE MODEL ===")

# Three random forests that differ in size, depth, leaf size, seed and
# class-weighting mode.
forest_settings = (
    dict(n_estimators=300, max_depth=10, min_samples_leaf=4,
         random_state=42, class_weight='balanced'),
    dict(n_estimators=200, max_depth=8, min_samples_leaf=5,
         random_state=43, class_weight='balanced_subsample'),
    dict(n_estimators=400, max_depth=12, min_samples_leaf=3,
         random_state=44, class_weight='balanced'),
)
rf1, rf2, rf3 = (RandomForestClassifier(**settings) for settings in forest_settings)

# Soft-voting ensemble of the three forests behind the shared preprocessing.
soft_voter = VotingClassifier(
    estimators=[('rf1', rf1), ('rf2', rf2), ('rf3', rf3)],
    voting='soft',
)
ensemble = Pipeline(
    steps=[
        ("preprocessing", preprocessor),
        ("classifier", soft_voter),
    ]
)

print("Training ensemble...")
ensemble.fit(X_train, y_train)

# Evaluate the ensemble on the held-out split.
ensemble_pred = ensemble.predict(X_test)
ensemble_f1 = f1_score(y_test, ensemble_pred, average='macro')
print("Ensemble Test F1 Macro: {:.4f}".format(ensemble_f1))

print("\nClassification Report - Ensemble:")
ensemble_report = classification_report(
    y_test,
    ensemble_pred,
    target_names=label_encoder.classes_,
)
print(ensemble_report)
|
|
| |
# Side-by-side recap of every test-set macro F1 computed above.
print("\n" + "=" * 50)
print("=== SUMMARY OF RESULTS ===")
for summary_label, summary_score in (
    ("Current Model F1", current_f1),
    ("Grid Search Model F1", grid_f1),
    ("Ensemble Model F1", ensemble_f1),
):
    print(f"{summary_label}: {summary_score:.4f}")

print("\nAlgorithm Comparison:")
for name, score in results.items():
    print(f" {name}: {score:.4f}")

# Highest score across the three headline models and the per-algorithm runs.
all_scores = [current_f1, grid_f1, ensemble_f1, *results.values()]
best_f1 = max(all_scores)
print(f"\nBest Overall F1 Macro: {best_f1:.4f}")
|
|
| |
# Pick the model whose test score equals the best one. On ties the earlier
# entry wins: ensemble, then grid search, then the current model. The model
# getters are lazy so each fitted object is only touched if it is selected.
# NOTE(review): `best_f1` is built from test-set scores, so choosing by it
# makes the reported best score an optimistic estimate -- confirm intended.
headline_candidates = (
    ("Ensemble", ensemble_f1, lambda: ensemble),
    ("Grid Search", grid_f1, lambda: grid_search.best_estimator_),
    ("Current Model", current_f1, lambda: pipeline),
)
for candidate_label, candidate_f1, get_model in headline_candidates:
    if candidate_f1 == best_f1:
        best_model = get_model()
        print(f"Best model: {candidate_label}")
        break
else:
    # None of the headline models matched, so the winner is one of the
    # compared algorithms; its pipeline was not kept, so rebuild and refit it.
    best_algo_name = max(results, key=results.get)
    best_model = Pipeline(
        steps=[
            ("preprocessing", preprocessor),
            ("classifier", algorithms[best_algo_name]),
        ]
    )
    best_model.fit(X_train, y_train)
    print(f"Best model: {best_algo_name}")

# Persist the selected pipeline (preprocessing + classifier) to disk.
import joblib
model_path = 'best_model.pkl'
joblib.dump(best_model, model_path)
print(f"\nBest model saved as '{model_path}'")