# NOTE(review): export-artifact header ("Spaces:" / "Runtime error") removed.
# Likely cause of the recorded runtime error: random.sample() called on a set
# in train_test_split_matchid — unsupported since Python 3.11.
| import pandas as pd | |
| import numpy as np | |
| from sklearn.model_selection import train_test_split | |
| from sklearn.preprocessing import LabelEncoder, MinMaxScaler | |
| import math | |
| import matplotlib.pyplot as plt, joblib | |
| # from sklearn.linear_model import LinearRegression | |
| from sklearn.ensemble import RandomForestRegressor | |
| # from sklearn.tree import DecisionTreeRegressor | |
| # from catboost import CatBoostRegressor | |
| import warnings, random | |
| from sklearn.metrics import mean_absolute_error as mae | |
| from sklearn.metrics import mean_squared_error as mse | |
| # from sklearn import tree | |
| # from sklearn.svm import SVR | |
| # from sklearn.ensemble import VotingRegressor | |
| import os | |
| warnings.filterwarnings("ignore") | |
# Input columns fed to the regressor (selected during feature experiments;
# candidates that were tried and dropped are not listed here).
features = [
    "batting_team",
    "wkt_last_5_overs",
    "current_RR",
    "balls_left",
    "wkts_left",
    "runrate_last_5_overs-current_RR",
]

# Column the model learns to predict: how far the final score deviates
# from the naive projected score.
target = "deviation_from_projected"
| # evaluate | |
def evaluate(model, featuresdf, x_test, fname):
    """Plot mean absolute prediction error per ball and dump a row sample.

    Saves ``result/<fname>.png`` (error curve over balls) and
    ``result/<fname>_sample.csv`` (a 0.01% random sample of scored rows).

    Parameters
    ----------
    model : fitted regressor exposing ``predict``
    featuresdf : full feature DataFrame; must contain the ``features``
        columns, ``balls`` and ``deviation_from_projected``
    x_test : test-split frame; only its index is used to select rows
    fname : basename used for the output files under ``result/``
    """
    predictdf = featuresdf.loc[x_test.index].copy()
    predictdf["h_deviation_from_projected"] = model.predict(
        featuresdf.loc[x_test.index][features]
    )
    predictdf["error"] = (
        predictdf["h_deviation_from_projected"] - predictdf["deviation_from_projected"]
    )
    predictdf["abs_error"] = predictdf["error"].abs()
    plt.plot(predictdf.groupby("balls").aggregate({"abs_error": "mean"}))
    # BUG FIX: legend() given a bare string iterates it character by character,
    # producing legend entries "A", "b", "s", ... — wrap the label in a list.
    plt.legend(["Abs deviation"])
    plt.title(type(model).__name__)
    plt.xlabel("Balls on which prediction was made")
    plt.ylabel("Mean Abs Prediction error")
    plt.savefig("result/" + fname + ".png")
    plt.clf()
    predictdf.sample(frac=0.0001).to_csv("result/" + fname + "_sample.csv")
def train_test_split_matchid(df, matchids, split=0.2):
    """Split *df* into train/test by match id so no match spans both sets.

    Parameters
    ----------
    df : DataFrame containing a ``matchid`` column plus the module-level
        ``features`` and ``target`` columns
    matchids : sequence of match ids aligned with *df* rows
    split : fraction of unique match ids assigned to the test set

    Returns
    -------
    (x_train, x_test, y_train, y_test)
    """
    unique_match_ids = set(matchids)
    print(f"{len(unique_match_ids)=}")
    # BUG FIX: random.sample() no longer accepts a set (TypeError on
    # Python >= 3.11). Sorting first also makes the split reproducible
    # under a fixed random seed.
    testids = random.sample(sorted(unique_match_ids), int(len(unique_match_ids) * split))
    trainids = list(unique_match_ids.difference(testids))
    # Compute the boolean masks once instead of four times.
    in_train = df.matchid.isin(trainids)
    in_test = df.matchid.isin(testids)
    return (
        df[features][in_train],
        df[features][in_test],
        df[target][in_train],
        df[target][in_test],
    )
def encode_teams(series):
    """Fit a LabelEncoder over all team names and persist its classes.

    The learned classes are written to ``model/team.npy`` so that
    ``transform_teams`` can reload them later.
    """
    team_encoder = LabelEncoder().fit(series)
    np.save("model/team.npy", team_encoder.classes_)
def transform_teams(series):
    """Map team names to integer labels using the persisted encoder classes.

    Loads the classes saved by ``encode_teams`` from ``model/team.npy``
    and returns a 1-D integer array aligned with *series*.
    """
    encoder = LabelEncoder()
    encoder.classes_ = np.load("model/team.npy", allow_pickle=True)
    # LabelEncoder.transform expects a 1-D sequence; the previous
    # reshape(-1, 1)/reshape(-1) round-trip fed it a column vector, which
    # modern sklearn warns about (and may reject). ravel() keeps it 1-D.
    return encoder.transform(np.asarray(series).ravel())
def plot_feature_importance(f, imp, fname):
    """Bar-plot normalized feature importances.

    Saves the figure to ``result/<fname>featureimp.png``.

    Parameters
    ----------
    f : sequence of feature names
    imp : sequence of importance scores aligned with *f*
    fname : basename for the output image
    """
    importance = (
        # zip(*[f, imp]) was an obfuscated spelling of zip(f, imp).
        pd.DataFrame(zip(f, imp), columns=["feature", "importance"])
        .sort_values("importance", ascending=False)
        .set_index("feature")
    )
    # Normalize so the bars sum to 1 (relative significance).
    importance["importance"] = importance["importance"] / importance["importance"].sum()
    fig, ax = plt.subplots()
    importance.plot.bar(ax=ax)
    # BUG FIX: the bars are drawn in importance-sorted order, but the labels
    # were passed in original call order (labels=f), mislabeling every bar.
    # Use the sorted index so each bar carries its own feature name.
    ax.bar_label(
        ax.containers[0], labels=importance.index, rotation=90, label_type="center"
    )
    ax.set_xticks([])
    ax.set_title("Feature importances for predicted score " + fname)
    ax.set_ylabel("Significance")
    ax.set_xlabel("Features")
    plt.savefig("result/" + fname + "featureimp.png")
    plt.clf()
def train(fname, max_depth=8):
    """Train a RandomForest deviation model on second-innings rows.

    Reads precomputed features from a feather file, fits on a
    match-id-disjoint train split, reports RMSE, writes evaluation plots,
    then refits on the full dataset and persists the model.

    Parameters
    ----------
    fname : path to the feather file of precomputed features
    max_depth : maximum depth of each tree. The default (8) preserves the
        previously hard-coded value; pass None for unlimited depth.

    Returns
    -------
    The final RandomForestRegressor fitted on the full dataset.
    """
    print("training on", fname, "...")
    featuresdf = pd.read_feather(fname)
    # The deviation model is only trained on the chasing (second) innings.
    featuresdf = featuresdf[featuresdf["inning"] == 2]
    encode_teams(
        featuresdf["batting_team"].to_list() + featuresdf["bowling_team"].to_list()
    )
    featuresdf["batting_team"] = transform_teams(featuresdf["batting_team"])
    featuresdf["bowling_team"] = transform_teams(featuresdf["bowling_team"])
    x_train, x_test, y_train, y_test = train_test_split_matchid(
        featuresdf, featuresdf["matchid"], 0.2
    )
    print(f"{len(x_train)=} {len(x_test)=}")
    # BUG FIX: the max_depth parameter was accepted but ignored (8 was
    # hard-coded here); the new default of 8 keeps existing callers'
    # behavior unchanged while making the knob effective.
    model = RandomForestRegressor(max_depth=max_depth)
    model.fit(x_train, y_train)
    # NOTE(review): this plots the *std* of per-tree importances rather than
    # the mean (model.feature_importances_) — confirm that is intentional.
    plot_feature_importance(
        features,
        np.std([est.feature_importances_ for est in model.estimators_], axis=0),
        os.path.basename(fname),
    )
    print("Depth:", [e.tree_.max_depth for e in model.estimators_])
    print(
        f"{mse(model.predict(x_train), y_train, squared=False)=}, {mse(model.predict(x_test), y_test, squared=False)=}"
    )
    evaluate(model, featuresdf, x_test, os.path.basename(fname))
    # Refit on the full dataset before persisting the production model.
    model.fit(featuresdf[features], featuresdf[target])
    joblib.dump(model, f"model/{os.path.basename(fname)}.joblib")
    return model
| if __name__ == "__main__": | |
| train("data/t20features.feather") | |
| train("data/odifeatures.feather") | |