Spaces:
Build error
Build error
| """ | |
| The data process is base on https://www.kaggle.com/code/sslp23/predicting-fifa-2022-world-cup-with-ml | |
| """ | |
| import os.path | |
| import pandas as pd | |
| from sklearn.model_selection import train_test_split | |
| from configs.config import cfg | |
| from configs.constants import DATA_ROOT | |
| def result_finder(home, away): | |
| """ | |
| Encode the data | |
| :param home: | |
| :param away: | |
| :return: | |
| """ | |
| if home > away: | |
| return pd.Series([0, 3, 0]) | |
| if home < away: | |
| return pd.Series([1, 0, 3]) | |
| else: | |
| return pd.Series([2, 1, 1]) | |
| def create_dataset(df: pd.DataFrame): | |
| """ | |
| Create train, test dataset | |
| :param df: | |
| :return: | |
| """ | |
| x_, y = df.iloc[:, 3:], df[["target"]] | |
| x_train, x_test, y_train, y_test = train_test_split( | |
| x_, y, test_size=0.22, random_state=100 | |
| ) | |
| return x_train, x_test, y_train, y_test | |
| def data_preparing(): | |
| """ | |
| Data preparing | |
| :return: | |
| """ | |
| try: | |
| df = pd.read_csv(cfg.data.result_url) | |
| except Exception as e: | |
| print(e) | |
| df = pd.read_csv(os.path.join(DATA_ROOT, cfg.data.result_file)) | |
| df["date"] = pd.to_datetime(df["date"]) | |
| df.dropna(inplace=True) | |
| df = df[(df["date"] >= cfg.day_get_result)].reset_index(drop=True) | |
| # RANK data prepare | |
| rank = pd.read_csv(os.path.join(DATA_ROOT, cfg.data.rank_file)) | |
| rank["rank_date"] = pd.to_datetime(rank["rank_date"]) | |
| rank = rank[(rank["rank_date"] >= cfg.day_get_rank)].reset_index(drop=True) | |
| rank["country_full"] = ( | |
| rank["country_full"] | |
| .str.replace("IR Iran", "Iran") | |
| .str.replace("Korea Republic", "South Korea") | |
| .str.replace("USA", "United States") | |
| ) | |
| # The merge is made in order to get a dataset FIFA games and its rankings. | |
| rank = ( | |
| rank.set_index(["rank_date"]) | |
| .groupby(["country_full"], group_keys=False) | |
| .resample("D") | |
| .first() | |
| .fillna(method="ffill") | |
| .reset_index() | |
| ) | |
| df_wc_ranked = df.merge( | |
| rank[ | |
| [ | |
| "country_full", | |
| "total_points", | |
| "previous_points", | |
| "rank", | |
| "rank_change", | |
| "rank_date", | |
| ] | |
| ], | |
| left_on=["date", "home_team"], | |
| right_on=["rank_date", "country_full"], | |
| ).drop(["rank_date", "country_full"], axis=1) | |
| df_wc_ranked = df_wc_ranked.merge( | |
| rank[ | |
| [ | |
| "country_full", | |
| "total_points", | |
| "previous_points", | |
| "rank", | |
| "rank_change", | |
| "rank_date", | |
| ] | |
| ], | |
| left_on=["date", "away_team"], | |
| right_on=["rank_date", "country_full"], | |
| suffixes=("_home", "_away"), | |
| ).drop(["rank_date", "country_full"], axis=1) | |
| # Featuring | |
| df = df_wc_ranked | |
| df[["result", "home_team_points", "away_team_points"]] = df.apply( | |
| lambda x: result_finder(x["home_score"], x["away_score"]), axis=1 | |
| ) | |
| # we create columns that will help in the creation of the features: ranking difference, | |
| # points won at the game vs. team faced rank, and goals difference in the game. | |
| # All features that are not differences should be created for the two teams (away and home). | |
| df["rank_dif"] = df["rank_home"] - df["rank_away"] | |
| df["sg"] = df["home_score"] - df["away_score"] | |
| df["points_home_by_rank"] = df["home_team_points"] / df["rank_away"] | |
| df["points_away_by_rank"] = df["away_team_points"] / df["rank_home"] | |
| # In order to create the features, I'll separate the dataset in home team's and away team's dataset, | |
| # unify them and calculate the past game values. | |
| # After that, I'll separate again and merge them, retrieving the original dataset. | |
| # This process optimizes the creation of the features. | |
| home_team = df[ | |
| [ | |
| "date", | |
| "home_team", | |
| "home_score", | |
| "away_score", | |
| "rank_home", | |
| "rank_away", | |
| "rank_change_home", | |
| "total_points_home", | |
| "result", | |
| "rank_dif", | |
| "points_home_by_rank", | |
| "home_team_points", | |
| ] | |
| ] | |
| away_team = df[ | |
| [ | |
| "date", | |
| "away_team", | |
| "away_score", | |
| "home_score", | |
| "rank_away", | |
| "rank_home", | |
| "rank_change_away", | |
| "total_points_away", | |
| "result", | |
| "rank_dif", | |
| "points_away_by_rank", | |
| "away_team_points", | |
| ] | |
| ] | |
| home_team.columns = [ | |
| h.replace("home_", "") | |
| .replace("_home", "") | |
| .replace("away_", "suf_") | |
| .replace("_away", "_suf") | |
| for h in home_team.columns | |
| ] | |
| away_team.columns = [ | |
| a.replace("away_", "") | |
| .replace("_away", "") | |
| .replace("home_", "suf_") | |
| .replace("_home", "_suf") | |
| for a in away_team.columns | |
| ] | |
| team_stats = home_team.append(away_team) | |
| stats_val = [] | |
| for index, row in team_stats.iterrows(): | |
| team = row["team"] | |
| date = row["date"] | |
| past_games = team_stats.loc[ | |
| (team_stats["team"] == team) & (team_stats["date"] < date) | |
| ].sort_values(by=["date"], ascending=False) | |
| last5 = past_games.head(5) | |
| goals = past_games["score"].mean() | |
| goals_l5 = last5["score"].mean() | |
| goals_suf = past_games["suf_score"].mean() | |
| goals_suf_l5 = last5["suf_score"].mean() | |
| rank = past_games["rank_suf"].mean() | |
| rank_l5 = last5["rank_suf"].mean() | |
| if len(last5) > 0: | |
| points = ( | |
| past_games["total_points"].values[0] | |
| - past_games["total_points"].values[-1] | |
| ) # amount of points earned | |
| points_l5 = ( | |
| last5["total_points"].values[0] - last5["total_points"].values[-1] | |
| ) | |
| else: | |
| points = 0 | |
| points_l5 = 0 | |
| gp = past_games["team_points"].mean() | |
| gp_l5 = last5["team_points"].mean() | |
| gp_rank = past_games["points_by_rank"].mean() | |
| gp_rank_l5 = last5["points_by_rank"].mean() | |
| stats_val.append( | |
| [ | |
| goals, | |
| goals_l5, | |
| goals_suf, | |
| goals_suf_l5, | |
| rank, | |
| rank_l5, | |
| points, | |
| points_l5, | |
| gp, | |
| gp_l5, | |
| gp_rank, | |
| gp_rank_l5, | |
| ] | |
| ) | |
| stats_cols = [ | |
| "goals_mean", | |
| "goals_mean_l5", | |
| "goals_suf_mean", | |
| "goals_suf_mean_l5", | |
| "rank_mean", | |
| "rank_mean_l5", | |
| "points_mean", | |
| "points_mean_l5", | |
| "game_points_mean", | |
| "game_points_mean_l5", | |
| "game_points_rank_mean", | |
| "game_points_rank_mean_l5", | |
| ] | |
| stats_df = pd.DataFrame(stats_val, columns=stats_cols) | |
| full_df = pd.concat( | |
| [team_stats.reset_index(drop=True), stats_df], axis=1, ignore_index=False | |
| ) | |
| home_team_stats = full_df.iloc[: int(full_df.shape[0] / 2), :] | |
| away_team_stats = full_df.iloc[int(full_df.shape[0] / 2) :, :] | |
| home_team_stats = home_team_stats[home_team_stats.columns[-12:]] | |
| away_team_stats = away_team_stats[away_team_stats.columns[-12:]] | |
| home_team_stats.columns = ["home_" + str(col) for col in home_team_stats.columns] | |
| away_team_stats.columns = ["away_" + str(col) for col in away_team_stats.columns] | |
| # In order to unify the database, is needed to add home and away suffix for each column. | |
| # After that, the data is ready to be merged. | |
| match_stats = pd.concat( | |
| [home_team_stats, away_team_stats.reset_index(drop=True)], | |
| axis=1, | |
| ignore_index=False, | |
| ) | |
| full_df = pd.concat( | |
| [df, match_stats.reset_index(drop=True)], axis=1, ignore_index=False | |
| ) | |
| # Drop friendly game | |
| full_df["is_friendly"] = full_df["tournament"].apply(lambda x: find_friendly(x)) | |
| full_df = pd.get_dummies(full_df, columns=["is_friendly"]) | |
| base_df = full_df[ | |
| [ | |
| "date", | |
| "home_team", | |
| "away_team", | |
| "rank_home", | |
| "rank_away", | |
| "home_score", | |
| "away_score", | |
| "result", | |
| "rank_dif", | |
| "rank_change_home", | |
| "rank_change_away", | |
| "home_goals_mean", | |
| "home_goals_mean_l5", | |
| "home_goals_suf_mean", | |
| "home_goals_suf_mean_l5", | |
| "home_rank_mean", | |
| "home_rank_mean_l5", | |
| "home_points_mean", | |
| "home_points_mean_l5", | |
| "away_goals_mean", | |
| "away_goals_mean_l5", | |
| "away_goals_suf_mean", | |
| "away_goals_suf_mean_l5", | |
| "away_rank_mean", | |
| "away_rank_mean_l5", | |
| "away_points_mean", | |
| "away_points_mean_l5", | |
| "home_game_points_mean", | |
| "home_game_points_mean_l5", | |
| "home_game_points_rank_mean", | |
| "home_game_points_rank_mean_l5", | |
| "away_game_points_mean", | |
| "away_game_points_mean_l5", | |
| "away_game_points_rank_mean", | |
| "away_game_points_rank_mean_l5", | |
| "is_friendly_0", | |
| "is_friendly_1", | |
| ] | |
| ] | |
| df = base_df.dropna() | |
| df["target"] = df["result"].apply(lambda x: no_draw(x)) | |
| model_db = create_db(df) | |
| return df, model_db | |
| def find_friendly(x): | |
| """ | |
| Return whether the match is friendly match or not. | |
| :param x: | |
| :return: | |
| """ | |
| if x == "Friendly": | |
| return 1 | |
| else: | |
| return 0 | |
| def create_db(df): | |
| """ | |
| :param df: | |
| :return: | |
| """ | |
| columns = [ | |
| "home_team", | |
| "away_team", | |
| "target", | |
| "rank_dif", | |
| "home_goals_mean", | |
| "home_rank_mean", | |
| "away_goals_mean", | |
| "away_rank_mean", | |
| "home_rank_mean_l5", | |
| "away_rank_mean_l5", | |
| "home_goals_suf_mean", | |
| "away_goals_suf_mean", | |
| "home_goals_mean_l5", | |
| "away_goals_mean_l5", | |
| "home_goals_suf_mean_l5", | |
| "away_goals_suf_mean_l5", | |
| "home_game_points_rank_mean", | |
| "home_game_points_rank_mean_l5", | |
| "away_game_points_rank_mean", | |
| "away_game_points_rank_mean_l5", | |
| "is_friendly_0", | |
| "is_friendly_1", | |
| ] | |
| base = df.loc[:, columns] | |
| base.loc[:, "goals_dif"] = base["home_goals_mean"] - base["away_goals_mean"] | |
| base.loc[:, "goals_dif_l5"] = ( | |
| base["home_goals_mean_l5"] - base["away_goals_mean_l5"] | |
| ) | |
| base.loc[:, "goals_suf_dif"] = ( | |
| base["home_goals_suf_mean"] - base["away_goals_suf_mean"] | |
| ) | |
| base.loc[:, "goals_suf_dif_l5"] = ( | |
| base["home_goals_suf_mean_l5"] - base["away_goals_suf_mean_l5"] | |
| ) | |
| base.loc[:, "goals_per_ranking_dif"] = ( | |
| base["home_goals_mean"] / base["home_rank_mean"] | |
| ) - (base["away_goals_mean"] / base["away_rank_mean"]) | |
| base.loc[:, "dif_rank_agst"] = base["home_rank_mean"] - base["away_rank_mean"] | |
| base.loc[:, "dif_rank_agst_l5"] = ( | |
| base["home_rank_mean_l5"] - base["away_rank_mean_l5"] | |
| ) | |
| base.loc[:, "dif_points_rank"] = ( | |
| base["home_game_points_rank_mean"] - base["away_game_points_rank_mean"] | |
| ) | |
| base.loc[:, "dif_points_rank_l5"] = ( | |
| base["home_game_points_rank_mean_l5"] - base["away_game_points_rank_mean_l5"] | |
| ) | |
| model_df = base[ | |
| [ | |
| "home_team", | |
| "away_team", | |
| "target", | |
| "rank_dif", | |
| "goals_dif", | |
| "goals_dif_l5", | |
| "goals_suf_dif", | |
| "goals_suf_dif_l5", | |
| "goals_per_ranking_dif", | |
| "dif_rank_agst", | |
| "dif_rank_agst_l5", | |
| "dif_points_rank", | |
| "dif_points_rank_l5", | |
| "is_friendly_0", | |
| "is_friendly_1", | |
| ] | |
| ] | |
| return model_df | |
| def no_draw(x): | |
| """ | |
| :param x: | |
| :return: | |
| """ | |
| if x == 2: | |
| return 1 | |
| else: | |
| return x | |