corner-forecast / src /process_data /generate_dataset.py
daniel-saed's picture
Upload 21 files
c2aaace verified
import sys
import os
# Put the project root (two levels above this file) first on sys.path so the `src.*` imports resolve
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))
sys.path.insert(0, project_root)
from src.utils.helper import desactivar_advertencias
import soccerdata as sd
import pandas as pd
def extract_local(game_str):
    """Return the first team name found in a game label, or None.

    The label is cut at its first space; the remainder is split on "-" and
    the first piece, stripped of surrounding whitespace, is returned.
    Because every "-" is a separator, a team name that itself contains a
    hyphen is truncated at that hyphen.

    Returns None when the label has no space or is not a string-like object
    (for example None or NaN).
    """
    try:
        matchup = game_str.split(" ", 1)[1]
        home_side = matchup.split("-")[0]
        return home_side.strip()
    except (AttributeError, IndexError):
        return None
def extract_away(game_str):
    """Return the second team name found in a game label, or None.

    The label is cut at its first space; the remainder is split on "-" and
    the second piece, stripped of surrounding whitespace, is returned.
    Because every "-" is a separator, hyphenated team names shift or
    truncate the result.

    Returns None when the label has no space, has no "-" after the first
    space, or is not a string-like object (for example None or NaN).
    """
    try:
        matchup = game_str.split(" ", 1)[1]
        sides = matchup.split("-")
        if len(sides) < 2:
            return None
        return sides[1].strip()
    except (AttributeError, IndexError):
        return None
class GENERATE_DATASET():
def __init__(self,current_year):
print("Clase GENERATE_DATASET Inicializada")
desactivar_advertencias()
self.init_variables()
self.mergue_raw_data_all_leagues(current_year)
self.process_and_output_dataset(current_year)
def init_variables(self):
#Years to get from datasource
self.LST_YEARS_CONFIG = [2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024, 2025]
self.dic_historic_all_leagues = {
"ENG": {},
"ESP": {},
"GER": {},
"FRA": {},
"ITA": {},
"NED": {},
"ENG2": {},
"POR": {},
"BEL": {}
}
self.df_database = pd.DataFrame()
# Diccionary to name leagues to get from datasource
self.DIC_LEAGUES_CONFIG = {
"ENG": {
"name": "ENG-Premier League",
"code": "ENG"
},
"POR": {
"name": "POR-Primeira Liga",
"code": "POR"
},
"BEL": {
"name": "BEL-Belgian Pro League",
"code": "BEL"
},
"ESP": {
"name": "ESP-La Liga",
"code": "ESP"
},
"GER": {
"name": "GER-Bundesliga",
"code": "GER"
},
"FRA": {
"name": "FRA-Ligue 1",
"code": "FRA"
},
"ITA": {
"name": "ITA-Serie A",
"code": "ITA"
},
"NED": {
"name": "NED-Eredivisie",
"code": "NED"
}
}
lst_base = ['season','date','game','round','day','venue','team','GF','GA','opponent',"result"]
lst_columns_shooting = ['Expected_xG','Standard_Sh','Standard_SoT','Standard_Dist']
lst_columns_passing_type = ['Pass Types_CK']
lst_columns_passing = ['Total_Att','Long_Att','Ast','1/3','PrgP']
lst_columns_defensive = ['Tackles_Att 3rd','Tackles_Tkl','Blocks_Blocks','Int','Clr']
lst_columns_keeper = ['Performance_Save%']
lst_columns_shot_creation = ['SCA Types_SCA']
lst_columns_misc = ['Performance_Crs']
lst_columns_possesion = ['Poss', 'Touches_Att 3rd','Carries_PrgC','Touches_Touches','Touches_Att Pen','Carries_Carries','Carries_1/3','Carries_CPA']
self.lst_columns_combined = lst_base + lst_columns_passing_type +lst_columns_passing+lst_columns_defensive+lst_columns_shooting+lst_columns_keeper+lst_columns_shot_creation+lst_columns_misc+lst_columns_possesion
print("-Variables inicializadas")
def get_raw_data_from_source(self,league,year):
print(f"\nLiga {league}... 📅 Año {year}...", end=" ")
# Extraer equipos local/visitante
if league["name"] in ["NED-Eredivisie","POR-Primeira Liga","ENG-Championship"] and year == 2017:
return
# Crear scraper para la liga específica
fbref = sd.FBref(leagues=league["name"], seasons=year)
# Leer estadísticas
team_season_shooting = fbref.read_team_match_stats(stat_type="shooting",opponent_stats = False)
team_season_passing_types = fbref.read_team_match_stats(stat_type="passing_types",opponent_stats = False)
team_season_passing = fbref.read_team_match_stats(stat_type="passing",opponent_stats = False)
team_season_defensive = fbref.read_team_match_stats(stat_type="defense",opponent_stats = False)
team_season_goalkeeping = fbref.read_team_match_stats(stat_type="keeper",opponent_stats = False)
team_season_goal_shot_creation = fbref.read_team_match_stats(stat_type="goal_shot_creation",opponent_stats = False)
team_season_goal_misc = fbref.read_team_match_stats(stat_type="misc",opponent_stats = False)
team_season_goal_possession = fbref.read_team_match_stats(stat_type="possession",opponent_stats = False)
df_concat = pd.concat([team_season_shooting,team_season_passing_types,team_season_passing,team_season_defensive,
team_season_goalkeeping,team_season_goal_shot_creation,team_season_goal_misc,team_season_goal_possession], axis=1)
# Reset index
df_reset = df_concat.copy().reset_index()
# Aplanar MultiIndex
df_reset.columns = [
'_'.join(col).strip('_') if isinstance(col, tuple) else col
for col in df_reset.columns.values
]
# Eliminar duplicados
df_reset = df_reset.loc[:, ~df_reset.columns.duplicated()]
df_filtered = df_reset[self.lst_columns_combined]
df_filtered["local"] = df_filtered["game"].apply(extract_local)
df_filtered["away"] = df_filtered["game"].apply(extract_away)
# Agregar código de liga
df_filtered["league"] = league["code"]
df_filtered = df_filtered.loc[:, ~df_filtered.columns.duplicated(keep='first')]
# Verificar valores problemáticos
problematic = df_filtered[df_filtered["away"].isna()]
if len(problematic) > 0:
print(f"⚠️ {len(problematic)} registros con formato incorrecto")
else:
print(f"✅ {len(df_filtered)} partidos extraídos")
return df_filtered
def mergue_raw_data_all_leagues(self, current_year):
all_dataframes = []
if current_year == True:
#Process only current year
for league_key, league_info in self.DIC_LEAGUES_CONFIG.items():
self.dic_historic_all_leagues[league_key][self.LST_YEARS_CONFIG[-1]] = self.get_raw_data_from_source(league_info,self.LST_YEARS_CONFIG[-1])
else:
#Process all years needed execpt for current year
for league_key, league_info in self.DIC_LEAGUES_CONFIG.items():
for year in self.LST_YEARS_CONFIG:
if year == 2025:
continue
self.dic_historic_all_leagues[league_key][year] = self.get_raw_data_from_source(league_info,year)
for league_key, dic_historic in self.dic_historic_all_leagues.items():
for year, df in dic_historic.items():
all_dataframes.append(df)
self.df_database = pd.concat(all_dataframes, ignore_index=True)
print("Dataset conbinado")
def process_and_output_dataset(self,current_year):
# Filtrar solo Matchweek
self.df_database = self.df_database[self.df_database['round'].str.contains("Matchweek", na=False)]
self.df_database['round'] = self.df_database['round'].str.replace("Matchweek ", "")
# Convertir tipos
self.df_database['round'] = self.df_database['round'].astype(int)
self.df_database['GF'] = self.df_database['GF'].astype(int)
self.df_database['GA'] = self.df_database['GA'].astype(int)
self.df_database = self.df_database.drop_duplicates()
if current_year == True:
self.df_database.to_csv("dataset\cleaned\dataset_cleaned_current_year.csv",index=False)
else:
self.df_database.to_csv("dataset\cleaned\dataset_cleaned.csv",index=False)
print("Dataset cleaned and saved on dataset\cleaned")
# Module-level entry point: constructing GENERATE_DATASET runs the whole
# pipeline immediately (its __init__ fetches, merges, cleans and writes the
# CSV). Passing False selects the non-current-year mode.
# NOTE(review): this also executes whenever the module is imported; consider
# an `if __name__ == "__main__":` guard if that is not intended.
a = GENERATE_DATASET(False)