# Spaces: Jandayl / Alalay (Sleeping)
# File size: 10,232 Bytes

# THIS FILE IS NOT CRUCIAL TO THE CURRENT SYSTEM. USED ONLY AS A BENCHMARK.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold, GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.metrics import f1_score, make_scorer, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from scipy.stats import randint

# ---------------------------------------------------------------------------
# Data loading and feature/target preparation
# ---------------------------------------------------------------------------
# The CSV path is relative, so it is resolved against the working directory.
df = pd.read_csv("Feature_Extracted_Corpus.csv")

# Merge one label per categorical column into the catch-all 'Other' bucket.
for column, merged_label in (
    ("sentence_construction_type", "Unknown"),
    ("sentence_type", "Compound-Complex"),
):
    df[column] = df[column].replace([merged_label], "Other")

# Show the category and target frequencies after the merge above.
for heading, column in (
    ("Sentence Construction Type Counts:\n", "sentence_construction_type"),
    ("\nSentence Type Counts:\n", "sentence_type"),
    ("\nTarget Distribution:\n", "group"),
):
    print(heading, df[column].value_counts())

# Integer-encode the target column; the encoder is kept so later sections can
# map the integer codes back to the original class labels.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df["group"])
class_to_code = {label: code for code, label in enumerate(label_encoder.classes_)}
print("\nEncoded classes:", class_to_code)

# Everything except the identifier, raw text, target and grade columns is
# treated as a feature.
non_feature_cols = ["id", "text", "group", "grade"]
X = df.drop(columns=non_feature_cols)

# Numeric columns are discovered from the dtypes; the categorical ones are
# listed explicitly.
categorical_cols = ["sentence_construction_type", "sentence_type"]
numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()

print(f"\nNumeric features: {numeric_cols}")
print(f"Categorical features: {categorical_cols}")

# Preprocessing shared by every pipeline below: one-hot encode the categorical
# columns, standardize the numeric ones, and pass any other column through
# unchanged. Categories not seen during fitting are ignored at transform time
# (handle_unknown="ignore").
categorical_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
numeric_scaler = StandardScaler()
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", categorical_encoder, categorical_cols),
        ("num", numeric_scaler, numeric_cols),
    ],
    remainder="passthrough",
)

# Hold out 20% of the rows as a test set, stratified on the encoded target so
# both splits keep the class proportions; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Report split sizes first, then the per-class counts of each split.
n_train, n_test = len(X_train), len(X_test)
train_class_counts = np.bincount(y_train)
test_class_counts = np.bincount(y_test)

print(f"\nTraining set size: {n_train}")
print(f"Test set size: {n_test}")
print(f"Training set class distribution: {train_class_counts}")
print(f"Test set class distribution: {test_class_counts}")

# Baseline ("current") model: a random forest configured with the
# hyperparameters the original author recorded from a previous run.
print("\n=== ANALYZING FEATURE IMPORTANCE ===")
best_params = dict(
    max_depth=10,
    max_features="sqrt",
    min_samples_leaf=4,
    min_samples_split=2,
    n_estimators=300,
)

# class_weight="balanced" and a fixed seed are applied on top of best_params.
baseline_forest = RandomForestClassifier(
    random_state=42,
    class_weight="balanced",
    **best_params,
)

# Preprocessing and classifier are chained so both are fitted on the training
# split only.
pipeline = Pipeline(
    steps=[
        ("preprocessing", preprocessor),
        ("classifier", baseline_forest),
    ]
)
pipeline.fit(X_train, y_train)

# Feature-importance report for the fitted baseline random forest.
#
# The importances are read once up front because both the named report and
# the index-only fallback need them.
importances = pipeline.named_steps['classifier'].feature_importances_

# The named report and plot are best-effort: if anything in this section
# fails, the error is printed and only the indices of the ten largest
# importances are shown instead.
try:
    raw_feature_names = (pipeline.named_steps['preprocessing']
                         .get_feature_names_out()
                         .tolist())

    # ColumnTransformer labels every output column "<transformer>__<name>".
    # Strip only that leading label: this also cleans the "remainder__" label
    # of passthrough columns, and it never removes a "cat__"/"num__" substring
    # that happens to occur inside a column name.
    feature_names = [name.split('__', 1)[-1] for name in raw_feature_names]

    # Pair each display name with its importance, most important first.
    feature_importance_df = pd.DataFrame({
        'feature': feature_names,
        'importance': importances
    }).sort_values('importance', ascending=False)

    print("\nTop 10 most important features:")
    print(feature_importance_df.head(10))

    # Horizontal bar chart of the 20 most important features.
    plt.figure(figsize=(12, 8))
    top_20 = feature_importance_df.head(20)
    bar_positions = range(len(top_20))
    plt.barh(bar_positions, top_20['importance'])
    plt.yticks(bar_positions, top_20['feature'])
    plt.xlabel('Feature Importance')
    plt.title('Top 20 Most Important Features')
    plt.gca().invert_yaxis()  # Most important at the top
    plt.tight_layout()
    plt.show()

except Exception as e:
    print(f"Could not get feature names: {e}")
    # Fallback: report positions in the transformed feature matrix, largest
    # importance first, without names.
    print(f"\nTop 10 feature importances (indices): {importances.argsort()[-10:][::-1]}")

# Score the baseline pipeline on the held-out test split.
y_pred = pipeline.predict(X_test)
current_f1 = f1_score(y_test, y_pred, average="macro")

print("\n=== CURRENT MODEL PERFORMANCE ===")
print(f"Test F1 Macro: {current_f1:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

# Confusion matrix heatmap, with the original class labels on both axes
# (rows: true labels, columns: predicted labels).
class_names = label_encoder.classes_
cm = confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_title("Confusion Matrix - Current Model")
ax.set_ylabel("True Label")
ax.set_xlabel("Predicted Label")
plt.show()

# Option 1: grid search in the neighbourhood of the baseline hyperparameters.
print("\n=== OPTION 1: FOCUSED GRID SEARCH ===")


def _classifier_grid(**options):
    """Prefix each option with 'classifier__' so it targets the pipeline's classifier step."""
    return {f"classifier__{option}": choices for option, choices in options.items()}


# Wider grid. It is defined for reference but is not the grid handed to the
# search below.
focused_param_grid = _classifier_grid(
    n_estimators=[200, 300, 400],
    max_depth=[8, 10, 12],
    min_samples_split=[2, 3, 4],
    min_samples_leaf=[3, 4, 5, 6],
    max_features=["sqrt", "log2", 0.3, 0.4],
    class_weight=["balanced", "balanced_subsample"],
)

# Reduced grid that is actually searched, chosen for faster execution.
smaller_grid = _classifier_grid(
    n_estimators=[200, 300],
    max_depth=[8, 10],
    min_samples_split=[2, 4],
    min_samples_leaf=[3, 4, 5],
    max_features=["sqrt", 0.4],
    class_weight=["balanced"],
)

# 5-fold stratified CV (per the planned methodology), scored with macro F1.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
f1_macro_scorer = make_scorer(f1_score, average="macro")

search_pipeline = Pipeline(
    steps=[
        ("preprocessing", preprocessor),
        ("classifier", RandomForestClassifier(random_state=42)),
    ]
)
grid_search = GridSearchCV(
    estimator=search_pipeline,
    param_grid=smaller_grid,
    scoring=f1_macro_scorer,
    cv=cv,
    n_jobs=-1,
    verbose=1,
)

print("Running grid search (this may take a few minutes)...")
grid_search.fit(X_train, y_train)

print("\n=== BEST MODEL FROM GRID SEARCH ===")
print(f"Best CV Score: {grid_search.best_score_:.4f}")
print("Best Params:")
for param_name, chosen_value in grid_search.best_params_.items():
    print(f"  {param_name}: {chosen_value}")

# Score the best configuration found by the search on the held-out test split.
y_pred_grid = grid_search.predict(X_test)
grid_f1 = f1_score(y_test, y_pred_grid, average="macro")
print(f"\nTest F1 Macro: {grid_f1:.4f}")

print("\nClassification Report - Grid Search Model:")
print(classification_report(y_test, y_pred_grid, target_names=label_encoder.classes_))

# Option 2: quick comparison of three tree ensembles using default
# hyperparameters (apart from the seed and, where set, class weighting).
print("\n=== OPTION 2: COMPARING DIFFERENT ALGORITHMS ===")

algorithms = {
    "Random Forest": RandomForestClassifier(random_state=42, class_weight="balanced"),
    "Extra Trees": ExtraTreesClassifier(random_state=42, class_weight="balanced"),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
}

# Display name -> macro F1 on the test split.
results = {}
for algo_name, estimator in algorithms.items():
    print(f"\nTraining {algo_name}...")
    candidate = Pipeline(
        steps=[
            ("preprocessing", preprocessor),
            ("classifier", estimator),
        ]
    )

    candidate.fit(X_train, y_train)
    candidate_pred = candidate.predict(X_test)
    candidate_f1 = f1_score(y_test, candidate_pred, average="macro")

    results[algo_name] = candidate_f1
    print(f"{algo_name} F1 Macro: {candidate_f1:.4f}")

# Bar chart of the scores, each bar annotated with its value just above it.
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(results.keys(), results.values())
ax.set_ylabel("F1 Macro Score")
ax.set_title("Algorithm Comparison")
for bar_index, bar_score in enumerate(results.values()):
    ax.text(bar_index, bar_score + 0.01, f"{bar_score:.3f}", ha="center")
ax.set_ylim(0, 1)
plt.show()

# Option 3: soft-voting ensemble of three differently configured forests.
print("\n=== OPTION 3: ENSEMBLE MODEL ===")

# One settings dict per member; the members differ in size, depth, leaf size,
# seed and class-weighting mode.
forest_settings = {
    "rf1": dict(n_estimators=300, max_depth=10, min_samples_leaf=4,
                random_state=42, class_weight="balanced"),
    "rf2": dict(n_estimators=200, max_depth=8, min_samples_leaf=5,
                random_state=43, class_weight="balanced_subsample"),
    "rf3": dict(n_estimators=400, max_depth=12, min_samples_leaf=3,
                random_state=44, class_weight="balanced"),
}
rf1, rf2, rf3 = (RandomForestClassifier(**settings) for settings in forest_settings.values())

# voting="soft" combines the members' predicted probabilities.
voting_members = [("rf1", rf1), ("rf2", rf2), ("rf3", rf3)]
ensemble = Pipeline(
    steps=[
        ("preprocessing", preprocessor),
        ("classifier", VotingClassifier(estimators=voting_members, voting="soft")),
    ]
)

print("Training ensemble...")
ensemble.fit(X_train, y_train)

# Score the ensemble on the held-out test split.
ensemble_pred = ensemble.predict(X_test)
ensemble_f1 = f1_score(y_test, ensemble_pred, average="macro")
print(f"Ensemble Test F1 Macro: {ensemble_f1:.4f}")

print("\nClassification Report - Ensemble:")
print(classification_report(y_test, ensemble_pred, target_names=label_encoder.classes_))

# Final summary: print every macro-F1 score collected above, pick the
# highest-scoring model and persist it.
print("\n" + "=" * 50)
print("=== SUMMARY OF RESULTS ===")
print(f"Current Model F1: {current_f1:.4f}")
print(f"Grid Search Model F1: {grid_f1:.4f}")
print(f"Ensemble Model F1: {ensemble_f1:.4f}")
print("\nAlgorithm Comparison:")
for algo_label, algo_score in results.items():
    print(f"  {algo_label}: {algo_score:.4f}")

# NOTE(review): all of these scores were measured on the test split, so the
# winner is selected on the same data used to report its score.
best_f1 = max([current_f1, grid_f1, ensemble_f1, *results.values()])
print(f"\nBest Overall F1 Macro: {best_f1:.4f}")

# Already-fitted candidates, listed in tie-break priority order.
fitted_candidates = (
    ("Ensemble", ensemble_f1, ensemble),
    ("Grid Search", grid_f1, grid_search.best_estimator_),
    ("Current Model", current_f1, pipeline),
)
for candidate_label, candidate_score, candidate_model in fitted_candidates:
    if candidate_score == best_f1:
        best_model = candidate_model
        print(f"Best model: {candidate_label}")
        break
else:
    # None of the fitted candidates holds the top score, so the winner is one
    # of the default-parameter algorithms; rebuild its pipeline and fit it.
    best_algo_name = max(results, key=results.get)
    best_model = Pipeline(
        steps=[
            ("preprocessing", preprocessor),
            ("classifier", algorithms[best_algo_name]),
        ]
    )
    best_model.fit(X_train, y_train)
    print(f"Best model: {best_algo_name}")

# Persist the selected pipeline (preprocessing + classifier) to disk.
import joblib
joblib.dump(best_model, 'best_model.pkl')
print("\nBest model saved as 'best_model.pkl'")