"""Train and persist a Random Forest baseline classifier.

Reads pre-computed feature tables for the training and validation splits,
fits a ``RandomForestClassifier`` on the training split, prints accuracy and
a per-class report for the validation split, and saves the fitted model with
joblib.
"""

import joblib
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

TRAIN_FEATURES_PATH = "basemodel/train_features.csv"
VAL_FEATURES_PATH = "basemodel/val_features.csv"
MODEL_PATH = "basemodel/random_forest_baseline.pkl"

LABEL_COLUMN = "Label"
# Columns that must never be fed to the model as features. "language" is
# optional: it is excluded when present and ignored when absent.
NON_FEATURE_COLUMNS = (LABEL_COLUMN, "language")

train_df = pd.read_csv(TRAIN_FEATURES_PATH)
val_df = pd.read_csv(VAL_FEATURES_PATH)

# Derive the feature list once from the training split and reuse it for the
# validation split. Selecting by name (instead of dropping columns from each
# frame independently) guarantees both matrices have the same columns in the
# same order, even if the validation CSV lacks "language", carries extra
# columns, or stores its columns in a different order. A feature column that
# is missing from the validation CSV raises a KeyError naming it.
feature_columns = [
    column for column in train_df.columns if column not in NON_FEATURE_COLUMNS
]

X_train = train_df[feature_columns]
y_train = train_df[LABEL_COLUMN]

X_val = val_df[feature_columns]
y_val = val_df[LABEL_COLUMN]

rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=8,
    min_samples_split=5,
    min_samples_leaf=3,
    random_state=42,  # fixed seed so repeated runs build the same forest
    class_weight="balanced",
)

rf.fit(X_train, y_train)

val_preds = rf.predict(X_val)

accuracy = accuracy_score(y_val, val_preds)
print("\nValidation Accuracy:", round(accuracy, 4))

print("\nValidation Classification Report:\n")
# NOTE(review): target_names are matched to the sorted label values, so this
# assumes exactly two classes with the "Human" label sorting before the "AI"
# label (e.g. 0 = Human, 1 = AI) - confirm against the feature-extraction step.
print(classification_report(y_val, val_preds, target_names=["Human", "AI"]))

joblib.dump(rf, MODEL_PATH)

print("\n✅ Random Forest baseline model saved to basemodel/random_forest_baseline.pkl")