Spaces:
Running
Running
| import sys | |
| import os | |
| # Add the project root directory to sys.path so that the `src` package can be imported | |
| project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')) | |
| sys.path.insert(0, project_root) | |
| from src.utils.helper import desactivar_advertencias | |
| import soccerdata as sd | |
| import pandas as pd | |
def extract_local(game_str):
    """Return the first team name found in a game label.

    Everything up to and including the first space of ``game_str`` is
    discarded; the remainder is split on ``"-"`` and the first piece is
    returned with surrounding whitespace removed.

    Presumably the label looks like ``"<date> <home>-<away>"`` -- TODO confirm
    against the data source.

    NOTE(review): the split happens on every hyphen, so a team name that
    itself contains ``"-"`` is truncated at its first hyphen.

    Args:
        game_str: The game label. Values without a ``split`` method
            (``None``, ``NaN`` ...) are tolerated.

    Returns:
        The stripped first piece, or ``None`` when ``game_str`` has no space
        or is not string-like.
    """
    try:
        _, matchup = game_str.split(" ", 1)
    except (ValueError, AttributeError):
        # ValueError: no space, so there is nothing after the prefix.
        # AttributeError: the value is not a string (e.g. None / NaN).
        return None
    # str.split always yields at least one element, so index 0 is safe.
    first_piece = matchup.split("-")[0]
    return first_piece.strip()
def extract_away(game_str):
    """Return the second team name found in a game label.

    Everything up to and including the first space of ``game_str`` is
    discarded; the remainder is split on ``"-"`` and the second piece is
    returned with surrounding whitespace removed.

    Presumably the label looks like ``"<date> <home>-<away>"`` -- TODO confirm
    against the data source.

    NOTE(review): the split happens on every hyphen, so if either team name
    contains ``"-"`` the returned value is only the fragment between the
    first and second hyphen.

    Args:
        game_str: The game label. Values without a ``split`` method
            (``None``, ``NaN`` ...) are tolerated.

    Returns:
        The stripped second piece, or ``None`` when ``game_str`` has no
        space, has no hyphen after the first space, or is not string-like.
    """
    try:
        matchup = game_str.split(" ", 1)[1]
    except (IndexError, AttributeError):
        # IndexError: no space in the label; AttributeError: not a string.
        return None

    pieces = matchup.split("-")
    if len(pieces) < 2:
        # No hyphen at all, so there is no second team to report.
        return None
    return pieces[1].strip()
class GENERATE_DATASET():
    """Download per-match team statistics for several leagues and save them as CSV.

    Instantiating the class runs the whole pipeline immediately:

    1. ``init_variables`` sets up the league / season / column configuration.
    2. ``mergue_raw_data_all_leagues`` downloads every requested
       league-season and concatenates the results into ``self.df_database``.
    3. ``process_and_output_dataset`` keeps only "Matchweek" rounds, converts
       a few columns to ``int``, drops duplicate rows and writes the CSV.

    Args:
        current_year: Truthy to process only the last season listed in
            ``LST_YEARS_CONFIG``; falsy to process every other season.
    """

    # Stat tables requested from the scraper, in the order their columns are
    # concatenated (the order decides which copy of a duplicated column wins).
    _STAT_TYPES = (
        "shooting",
        "passing_types",
        "passing",
        "defense",
        "keeper",
        "goal_shot_creation",
        "misc",
        "possession",
    )

    def __init__(self, current_year):
        print("Clase GENERATE_DATASET Inicializada")
        desactivar_advertencias()
        self.init_variables()
        self.mergue_raw_data_all_leagues(current_year)
        self.process_and_output_dataset(current_year)

    def init_variables(self):
        """Initialise the configuration and the empty result containers."""
        # Seasons to request from the data source; the LAST entry is treated
        # as the "current" season by mergue_raw_data_all_leagues.
        self.LST_YEARS_CONFIG = [2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024, 2025]

        # Per-league cache: {league key: {season: DataFrame or None}}.
        self.dic_historic_all_leagues = {
            "ENG": {},
            "ESP": {},
            "GER": {},
            "FRA": {},
            "ITA": {},
            "NED": {},
            "ENG2": {},
            "POR": {},
            "BEL": {},
        }

        self.df_database = pd.DataFrame()

        # Leagues to download: "name" is the identifier handed to the scraper,
        # "code" is the short label stored in the output "league" column.
        self.DIC_LEAGUES_CONFIG = {
            "ENG": {
                "name": "ENG-Premier League",
                "code": "ENG",
            },
            "POR": {
                "name": "POR-Primeira Liga",
                "code": "POR",
            },
            "BEL": {
                "name": "BEL-Belgian Pro League",
                "code": "BEL",
            },
            "ESP": {
                "name": "ESP-La Liga",
                "code": "ESP",
            },
            "GER": {
                "name": "GER-Bundesliga",
                "code": "GER",
            },
            "FRA": {
                "name": "FRA-Ligue 1",
                "code": "FRA",
            },
            "ITA": {
                "name": "ITA-Serie A",
                "code": "ITA",
            },
            "NED": {
                "name": "NED-Eredivisie",
                "code": "NED",
            },
        }

        # Columns kept from the flattened stat tables, grouped by source table.
        lst_base = ['season', 'date', 'game', 'round', 'day', 'venue', 'team', 'GF', 'GA', 'opponent', "result"]
        lst_columns_shooting = ['Expected_xG', 'Standard_Sh', 'Standard_SoT', 'Standard_Dist']
        lst_columns_passing_type = ['Pass Types_CK']
        lst_columns_passing = ['Total_Att', 'Long_Att', 'Ast', '1/3', 'PrgP']
        lst_columns_defensive = ['Tackles_Att 3rd', 'Tackles_Tkl', 'Blocks_Blocks', 'Int', 'Clr']
        lst_columns_keeper = ['Performance_Save%']
        lst_columns_shot_creation = ['SCA Types_SCA']
        lst_columns_misc = ['Performance_Crs']
        lst_columns_possesion = [
            'Poss', 'Touches_Att 3rd', 'Carries_PrgC', 'Touches_Touches',
            'Touches_Att Pen', 'Carries_Carries', 'Carries_1/3', 'Carries_CPA',
        ]
        self.lst_columns_combined = (
            lst_base
            + lst_columns_passing_type
            + lst_columns_passing
            + lst_columns_defensive
            + lst_columns_shooting
            + lst_columns_keeper
            + lst_columns_shot_creation
            + lst_columns_misc
            + lst_columns_possesion
        )
        print("-Variables inicializadas")

    def get_raw_data_from_source(self, league, year):
        """Download and flatten the match statistics of one league-season.

        Args:
            league: One entry of ``DIC_LEAGUES_CONFIG`` (needs the ``"name"``
                and ``"code"`` keys).
            year: Season identifier passed to the scraper.

        Returns:
            A DataFrame restricted to ``self.lst_columns_combined`` plus the
            derived ``"local"``, ``"away"`` and ``"league"`` columns, or
            ``None`` for the league/season combinations that are skipped.

        Raises:
            KeyError: If an expected column is missing from the scraped data.
        """
        print(f"\nLiga {league}... 📅 Año {year}...", end=" ")

        # These leagues are deliberately not requested for 2017.
        # NOTE(review): presumably the source lacks these stats for that
        # season -- confirm.
        if league["name"] in ["NED-Eredivisie", "POR-Primeira Liga", "ENG-Championship"] and year == 2017:
            return None

        # Scraper bound to this specific league and season.
        fbref = sd.FBref(leagues=league["name"], seasons=year)

        # One table per stat type, joined side by side.
        stat_frames = [
            fbref.read_team_match_stats(stat_type=stat_type, opponent_stats=False)
            for stat_type in self._STAT_TYPES
        ]
        # reset_index() already returns a new frame, so no extra copy needed.
        df_reset = pd.concat(stat_frames, axis=1).reset_index()

        # Flatten tuple (MultiIndex) column labels into "level0_level1".
        df_reset.columns = [
            '_'.join(col).strip('_') if isinstance(col, tuple) else col
            for col in df_reset.columns.values
        ]

        # The stat tables share columns; keep the first occurrence of each.
        df_reset = df_reset.loc[:, ~df_reset.columns.duplicated()]

        # Explicit copy: new columns are assigned below and must not be
        # written through a slice of df_reset (chained-assignment hazard).
        df_filtered = df_reset[self.lst_columns_combined].copy()

        # Derive the two team names from the "game" label.
        df_filtered["local"] = df_filtered["game"].apply(extract_local)
        df_filtered["away"] = df_filtered["game"].apply(extract_away)

        # Tag every row with the short league code.
        df_filtered["league"] = league["code"]

        # Defensive: protects against duplicate labels should
        # lst_columns_combined ever be customised to contain repeats.
        df_filtered = df_filtered.loc[:, ~df_filtered.columns.duplicated(keep='first')]

        # Rows whose game label could not be split into two teams.
        problematic = df_filtered[df_filtered["away"].isna()]
        if len(problematic) > 0:
            print(f"⚠️ {len(problematic)} registros con formato incorrecto")
        else:
            print(f"✅ {len(df_filtered)} partidos extraídos")
        return df_filtered

    def mergue_raw_data_all_leagues(self, current_year):
        """Download the requested seasons for every league and concatenate them.

        Results are cached in ``self.dic_historic_all_leagues`` and the
        combined frame is stored in ``self.df_database``.

        Args:
            current_year: Truthy to fetch only the last season of
                ``LST_YEARS_CONFIG``; falsy to fetch every other season.

        Raises:
            ValueError: If no league-season produced any data.
        """
        # The last configured season is the "current" one in both modes, so
        # extending LST_YEARS_CONFIG does not require touching this method.
        latest_year = self.LST_YEARS_CONFIG[-1]
        if current_year:
            years = [latest_year]
        else:
            years = [year for year in self.LST_YEARS_CONFIG if year != latest_year]

        for league_key, league_info in self.DIC_LEAGUES_CONFIG.items():
            for year in years:
                self.dic_historic_all_leagues[league_key][year] = (
                    self.get_raw_data_from_source(league_info, year)
                )

        # Skipped league-seasons are stored as None; leave them out.
        all_dataframes = []
        for dic_historic in self.dic_historic_all_leagues.values():
            all_dataframes.extend(df for df in dic_historic.values() if df is not None)

        if not all_dataframes:
            raise ValueError("No data was downloaded for any league/season; nothing to combine")

        self.df_database = pd.concat(all_dataframes, ignore_index=True)
        print("Dataset conbinado")

    def process_and_output_dataset(self, current_year):
        """Clean ``self.df_database`` and write it to ``dataset/cleaned``.

        Keeps only rows whose ``round`` contains "Matchweek", turns ``round``
        into the integer week number, casts ``GF`` / ``GA`` to ``int`` and
        drops duplicate rows. The output directory is relative to the current
        working directory and is created when missing.

        Args:
            current_year: Truthy to write ``dataset_cleaned_current_year.csv``;
                falsy to write ``dataset_cleaned.csv``.
        """
        # Keep only regular "Matchweek" rounds. Copy so that the column
        # assignments below act on an independent frame.
        is_matchweek = self.df_database['round'].str.contains("Matchweek", na=False)
        df_clean = self.df_database[is_matchweek].copy()

        # "Matchweek 12" -> 12 (literal replacement, no regex needed).
        df_clean['round'] = (
            df_clean['round'].str.replace("Matchweek ", "", regex=False).astype(int)
        )
        df_clean['GF'] = df_clean['GF'].astype(int)
        df_clean['GA'] = df_clean['GA'].astype(int)

        self.df_database = df_clean.drop_duplicates()

        # Build the path portably instead of hard-coding backslashes, which
        # are invalid escape sequences and only form a path on Windows.
        output_dir = os.path.join("dataset", "cleaned")
        os.makedirs(output_dir, exist_ok=True)
        if current_year:
            file_name = "dataset_cleaned_current_year.csv"
        else:
            file_name = "dataset_cleaned.csv"
        self.df_database.to_csv(os.path.join(output_dir, file_name), index=False)
        print("Dataset cleaned and saved on dataset\\cleaned")
# Script entry point: run the full pipeline in historical mode
# (current_year=False selects the non-current seasons).
a = GENERATE_DATASET(current_year=False)