text_classificators / src /classical_classifiers.py
theformatisvalid's picture
Upload 7 files
2153792 verified
from typing import Dict, Any, Optional, Union, Tuple
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, VotingClassifier, StackingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
XGBClassifier = None
CatBoostClassifier = None
LGBMClassifier = None
try:
from xgboost import XGBClassifier
except ImportError:
pass
try:
from catboost import CatBoostClassifier
except ImportError:
pass
try:
from lightgbm import LGBMClassifier
except ImportError:
pass
def get_logistic_regression(
    penalty: Optional[str] = "l2",
    C: float = 1.0,
    max_iter: int = 1000,
    solver: str = "liblinear",  # supports l1
    random_state: int = 42,
    l1_ratio: Optional[float] = None,
) -> LogisticRegression:
    """Build a ``LogisticRegression`` whose solver is compatible with ``penalty``.

    Args:
        penalty: ``"l1"``, ``"l2"``, ``"elasticnet"`` or ``"none"``. ``None`` is
            accepted as an alias for ``"none"``.
        C: Forwarded as the estimator's ``C``.
        max_iter: Forwarded as the estimator's ``max_iter``.
        solver: Requested solver. It is replaced when it cannot handle the
            penalty: L1 falls back to ``"liblinear"``, elastic net always uses
            ``"saga"``, and an unpenalised model swaps ``"liblinear"`` for
            ``"lbfgs"``.
        random_state: Seed forwarded to the estimator.
        l1_ratio: Elastic-net mixing ratio. Only forwarded when ``penalty`` is
            ``"elasticnet"``, where it defaults to 0.5 if not given.

    Returns:
        An unfitted ``LogisticRegression``.

    Raises:
        ValueError: If ``penalty`` is not one of the supported values.
    """
    if penalty is None:
        penalty = "none"
    if penalty not in ("l1", "l2", "elasticnet", "none"):
        raise ValueError("penalty must be 'l1', 'l2', 'elasticnet', or 'none'")

    extra_params: Dict[str, Any] = {}
    if penalty == "l1":
        if solver not in ("liblinear", "saga"):
            solver = "liblinear"
    elif penalty == "elasticnet":
        # Only the saga solver implements elastic net, and it requires a
        # mixing ratio; without both the estimator fails at fit time.
        solver = "saga"
        extra_params["l1_ratio"] = 0.5 if l1_ratio is None else l1_ratio
    elif penalty == "none" and solver == "liblinear":
        # liblinear cannot fit an unpenalised model.
        solver = "lbfgs"

    return LogisticRegression(
        # scikit-learn >= 1.2 (already required by this file's use of
        # BaggingClassifier(estimator=...)) spells "no penalty" as None; the
        # "none" string was deprecated there and later removed.
        penalty=None if penalty == "none" else penalty,
        C=C,
        max_iter=max_iter,
        solver=solver,
        random_state=random_state,
        **extra_params,
    )
def get_svm_linear(C: float = 1.0, random_state: int = 42) -> SVC:
    """Return an unfitted linear-kernel ``SVC`` with probability estimates enabled.

    Args:
        C: Forwarded as the estimator's ``C``.
        random_state: Seed forwarded to the estimator.
    """
    svc_params = {
        "kernel": "linear",
        "C": C,
        "probability": True,
        "random_state": random_state,
    }
    return SVC(**svc_params)
def get_random_forest(
    n_estimators: int = 100,
    max_depth: Optional[int] = None,
    random_state: int = 42
) -> RandomForestClassifier:
    """Return an unfitted ``RandomForestClassifier``.

    Args:
        n_estimators: Forwarded as the estimator's ``n_estimators``.
        max_depth: Forwarded as the estimator's ``max_depth`` (``None`` by default).
        random_state: Seed forwarded to the estimator.
    """
    forest_params = {
        "n_estimators": n_estimators,
        "max_depth": max_depth,
        "random_state": random_state,
    }
    forest = RandomForestClassifier(**forest_params)
    return forest
def get_gradient_boosting(
    model_type: str = "xgb",
    **kwargs
) -> Union["XGBClassifier", "CatBoostClassifier", "LGBMClassifier"]:
    """Build a gradient-boosting classifier from one of the optional backends.

    Args:
        model_type: ``"xgb"`` (XGBoost), ``"cat"`` (CatBoost) or ``"lgb"``
            (LightGBM).
        **kwargs: Forwarded to the backend constructor. A seed of 42 is filled
            in when the caller supplies none, and CatBoost gets
            ``verbose=False`` unless the caller configured its logging.

    Returns:
        An unfitted classifier of the selected backend.

    Raises:
        ImportError: If the selected backend package is not installed.
        ValueError: If ``model_type`` is not one of the supported values.
    """
    # The return annotation uses forward references for all three backends:
    # each module-level name is None when its package is missing.
    if model_type == "xgb":
        if XGBClassifier is None:
            raise ImportError("XGBoost not installed. Run: pip install xgboost")
        kwargs.setdefault("random_state", 42)
        return XGBClassifier(**kwargs)
    if model_type == "cat":
        if CatBoostClassifier is None:
            raise ImportError("CatBoost not installed. Run: pip install catboost")
        # CatBoost refuses a parameter given together with one of its aliases,
        # so a default is injected only when the caller set none of them.
        logging_aliases = ("verbose", "logging_level", "verbose_eval", "silent")
        if not any(alias in kwargs for alias in logging_aliases):
            kwargs["verbose"] = False
        if "random_seed" not in kwargs and "random_state" not in kwargs:
            kwargs["random_seed"] = 42
        return CatBoostClassifier(**kwargs)
    if model_type == "lgb":
        if LGBMClassifier is None:
            raise ImportError("LightGBM not installed. Run: pip install lightgbm")
        kwargs.setdefault("random_state", 42)
        return LGBMClassifier(**kwargs)
    raise ValueError("model_type must be 'xgb', 'cat', or 'lgb'")
def get_bagging_classifier(
    base_estimator: str = "tree",
    n_estimators: int = 10,
    random_state: int = 42
) -> BaggingClassifier:
    """Build a ``BaggingClassifier`` around a decision tree or a logistic regression.

    Args:
        base_estimator: ``"tree"`` for a ``DecisionTreeClassifier`` or ``"lr"``
            for the model returned by ``get_logistic_regression``.
        n_estimators: Forwarded as the ensemble's ``n_estimators``.
        random_state: Seed applied to both the base estimator and the ensemble.

    Returns:
        An unfitted ``BaggingClassifier``.

    Raises:
        ValueError: If ``base_estimator`` is neither ``"tree"`` nor ``"lr"``.
    """
    if base_estimator == "tree":
        from sklearn.tree import DecisionTreeClassifier
        estimator = DecisionTreeClassifier(random_state=random_state)
    elif base_estimator == "lr":
        # Seed the logistic regression too, so both base estimators honour
        # the caller's random_state rather than only the tree.
        estimator = get_logistic_regression(random_state=random_state)
    else:
        raise ValueError("base_estimator must be 'tree' or 'lr'")
    return BaggingClassifier(
        estimator=estimator,
        n_estimators=n_estimators,
        random_state=random_state,
    )
def get_stacking_classifier(
    final_estimator: Optional[BaseEstimator] = None,
    cv: int = 5,
    random_state: int = 42
) -> StackingClassifier:
    """Build a ``StackingClassifier`` over logistic regression, linear SVM and CatBoost.

    The CatBoost member (100 iterations) is only added when the ``catboost``
    package could be imported.

    Args:
        final_estimator: Meta-learner. When ``None``, a logistic regression
            from ``get_logistic_regression`` is used. A caller-supplied
            estimator is passed through unchanged.
        cv: Forwarded as the ensemble's ``cv``.
        random_state: Seed applied to every estimator this function creates.

    Returns:
        An unfitted ``StackingClassifier`` with ``passthrough=False``.
    """
    # random_state was previously accepted but ignored; forward it to each
    # estimator built here so the parameter actually takes effect.
    estimators = [
        ("lr", get_logistic_regression(random_state=random_state)),
        ("svm", get_svm_linear(random_state=random_state)),
    ]
    if CatBoostClassifier is not None:
        cat_model = get_gradient_boosting(
            "cat", iterations=100, random_seed=random_state
        )
        estimators.append(("cat", cat_model))
    if final_estimator is None:
        final_estimator = get_logistic_regression(random_state=random_state)
    return StackingClassifier(
        estimators=estimators,
        final_estimator=final_estimator,
        cv=cv,
        passthrough=False,
    )
def get_voting_classifier(
    voting: str = "soft",
    use_catboost: bool = True
) -> VotingClassifier:
    """Return an unfitted ``VotingClassifier`` over the module's standard models.

    Members are a logistic regression, a linear SVM and a 50-tree random
    forest. A 50-iteration CatBoost model is added when ``use_catboost`` is
    true and the ``catboost`` package could be imported.

    Args:
        voting: Forwarded as the ensemble's ``voting``.
        use_catboost: Whether to include the CatBoost member when available.
    """
    members = [
        ("lr", get_logistic_regression()),
        ("svm", get_svm_linear()),
        ("rf", get_random_forest(n_estimators=50)),
    ]
    include_catboost = use_catboost and CatBoostClassifier is not None
    if include_catboost:
        cat_model = get_gradient_boosting("cat", iterations=50, verbose=False)
        members.append(("cat", cat_model))
    ensemble = VotingClassifier(estimators=members, voting=voting)
    return ensemble
def tpot_classifier(
    generations: int = 5,
    population_size: int = 20,
    cv: int = 5,
    random_state: int = 42,
    verbosity: int = 0,
    n_jobs: int = -1,
) -> Any:
    """Build a ``TPOTClassifier``; TPOT is imported lazily on first use.

    Args:
        generations: Forwarded as ``generations``.
        population_size: Forwarded as ``population_size``.
        cv: Forwarded as ``cv``.
        random_state: Forwarded as ``random_state``.
        verbosity: Forwarded as ``verbosity``.
        n_jobs: Forwarded as ``n_jobs``; defaults to ``-1`` as before.

    Returns:
        An unfitted ``TPOTClassifier``.

    Raises:
        ImportError: If the ``tpot`` package cannot be imported. The original
            import failure is attached as ``__cause__``.
    """
    try:
        from tpot import TPOTClassifier
    except ImportError as err:
        # Chain explicitly so the underlying import failure stays visible.
        raise ImportError("TPOT not installed. Run: pip install tpot") from err
    return TPOTClassifier(
        generations=generations,
        population_size=population_size,
        cv=cv,
        random_state=random_state,
        verbosity=verbosity,
        n_jobs=n_jobs,
    )
def h2o_classifier(
    max_runtime_secs: int = 300,
    seed: int = 42,
    exclude_algos: Optional[list] = None
) -> Any:
    """Build an ``H2OAutoML`` instance; H2O is imported lazily on first use.

    Args:
        max_runtime_secs: Forwarded as ``max_runtime_secs``.
        seed: Forwarded as ``seed``.
        exclude_algos: Forwarded as ``exclude_algos`` (``None`` by default).

    Returns:
        An untrained ``H2OAutoML`` object.

    Raises:
        ImportError: If the ``h2o`` package cannot be imported. The original
            import failure is attached as ``__cause__``.
    """
    # NOTE(review): no h2o.init() call is made here; presumably the caller
    # starts or connects to the H2O cluster first. Confirm against callers.
    try:
        # Importing the submodule imports the h2o package as well, so a
        # separate (unused) `import h2o` binding is not needed.
        from h2o.automl import H2OAutoML
    except ImportError as err:
        # Chain explicitly so the underlying import failure stays visible.
        raise ImportError("H2O not installed. Run: pip install h2o") from err
    return H2OAutoML(
        max_runtime_secs=max_runtime_secs,
        seed=seed,
        exclude_algos=exclude_algos,
    )
def train_and_evaluate(
    model: Union[BaseEstimator, Any],
    X_train: Union[np.ndarray, pd.DataFrame],
    y_train: Union[np.ndarray, pd.Series],
    X_test: Union[np.ndarray, pd.DataFrame],
    y_test: Union[np.ndarray, pd.Series],
    is_h2o: bool = False
) -> Dict[str, Any]:
    """Fit ``model`` on the training split and score it on the test split.

    Args:
        model: An estimator exposing ``fit``/``predict``, or, when ``is_h2o``
            is true, an object exposing ``train`` and ``leader``.
        X_train: Training features.
        y_train: Training target.
        X_test: Test features.
        y_test: Test target.
        is_h2o: Selects the H2O code path. In that path the four data
            arguments must support ``cbind`` and ``columns``; presumably they
            are H2OFrames, which the annotations do not reflect.

    Returns:
        For the default path: ``{"accuracy", "report", "model"}``.
        For the H2O path: ``{"accuracy", "auc", "best_model"}``.
    """
    if is_h2o:
        # Not referenced below; kept so a missing h2o install still fails
        # here with ImportError, as it did before.
        import h2o  # noqa: F401

        train_frame = X_train.cbind(y_train)
        test_frame = X_test.cbind(y_test)
        y_col = y_train.columns[0]
        # list(...) handles `columns` being a plain list as well as an index
        # object; the former has no .tolist() method.
        feature_cols = list(X_train.columns)
        model.train(x=feature_cols, y=y_col, training_frame=train_frame)
        # Score via the object itself when it can; otherwise use its leader
        # model, which is also what is reported as "best_model".
        scorer = model if hasattr(model, "model_performance") else model.leader
        perf = scorer.model_performance(test_frame)
        # NOTE(review): accuracy() is indexed with [0] only; confirm whether
        # that yields a scalar or a [threshold, value] pair for this metrics
        # type. `_has_auc` is a private hook; verify it exists on `perf`.
        return {
            "accuracy": perf.accuracy()[0],
            "auc": perf.auc() if perf._has_auc() else None,
            "best_model": model.leader
        }

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return {
        "accuracy": accuracy_score(y_test, y_pred),
        "report": classification_report(y_test, y_pred, output_dict=True),
        "model": model
    }