daniel-saed committed on
Commit
90710dc
·
verified ·
1 Parent(s): ed72f48

Upload 2 files

Browse files
src/process_data/generate_dataset.py CHANGED
@@ -201,11 +201,11 @@ class GENERATE_DATASET():
201
  self.df_database = self.df_database.drop_duplicates()
202
 
203
  if current_year == True:
204
- self.df_database.to_csv("dataset\cleaned\dataset_cleaned_current_year.csv",index=False)
205
  else:
206
- self.df_database.to_csv("dataset\cleaned\dataset_cleaned.csv",index=False)
207
- print("Dataset cleaned and saved on dataset\cleaned")
208
 
209
 
210
 
211
- a = GENERATE_DATASET(False)
 
201
  self.df_database = self.df_database.drop_duplicates()
202
 
203
  if current_year == True:
204
+ self.df_database.to_csv(r"dataset/cleaned/dataset_cleaned_current_year.csv",index=False)
205
  else:
206
+ self.df_database.to_csv(r"dataset/cleaned/dataset_cleaned.csv",index=False)
207
+ print("Dataset cleaned and saved on dataset/cleaned")
208
 
209
 
210
 
211
+ #a = GENERATE_DATASET(False)
src/process_data/process_dataset.py CHANGED
@@ -1,583 +1,711 @@
1
- import pandas as pd
2
- import os
3
-
4
def get_ck(df, season, round_num, local, away, league=None):
    """Return the total corner kicks taken in one specific match.

    The match is identified by ``season`` and ``round_num`` (and ``league``
    when given); the result is the sum of the ``Pass Types_CK`` column over
    the rows whose ``team`` is ``local`` plus the rows whose ``team`` is
    ``away``.

    Args:
        df: DataFrame with ``season``, ``round``, ``team`` and
            ``Pass Types_CK`` columns (and ``league`` when filtering by it).
        season: Season value compared for equality against ``df['season']``.
        round_num: Round compared for equality against ``df['round']``.
        local: Home team name.
        away: Away team name.
        league: Optional league code; when not None only that league's rows
            are considered.

    Returns:
        Sum of corners of both teams (0 when no row matches).
    """
    match_mask = (df['season'] == season) & (df['round'] == round_num)

    if league is not None:
        match_mask = match_mask & (df['league'] == league)

    round_rows = df[match_mask]

    local_rows = round_rows[round_rows['team'] == local]
    away_rows = round_rows[round_rows['team'] == away]

    return local_rows["Pass Types_CK"].sum() + away_rows["Pass Types_CK"].sum()
19
-
20
def get_dataframes(df, season, round_num, local, away, league=None):
    """Return eight DataFrames with each side's earlier matches, split by venue.

    Only rows of ``season`` whose ``round`` is strictly below ``round_num``
    are used (the match being predicted is excluded), optionally restricted
    to ``league``.

    Args:
        df: DataFrame with ``season``, ``round``, ``team``, ``opponent`` and
            ``venue`` columns (and ``league`` when filtering by it).
        season: Season value compared for equality against ``df['season']``.
        round_num: Exclusive upper bound on ``df['round']``.
        local: Home team name.
        away: Away team name.
        league: Optional league code filter.

    Returns:
        Tuple ``(local_home, local_away, local_opp_home, local_opp_away,
        away_home, away_away, away_opp_home, away_opp_away)``. The plain
        frames hold rows where ``team`` equals the side; the ``_opp_`` frames
        hold rows where ``opponent`` equals the side. ``home``/``away`` refer
        to the row's ``venue`` value ("Home"/"Away").
    """
    in_scope = (df['season'] == season) & (df['round'] < round_num)

    if league is not None:
        in_scope = in_scope & (df['league'] == league)

    def split_by_venue(side_mask):
        # Local names deliberately differ from the ``away`` parameter so the
        # helper does not shadow it.
        rows = df[in_scope & side_mask].copy()
        home_rows = rows[rows['venue'] == "Home"]
        away_rows = rows[rows['venue'] == "Away"]
        return home_rows, away_rows

    local_home, local_away = split_by_venue(df['team'] == local)
    local_opp_home, local_opp_away = split_by_venue(df['opponent'] == local)

    away_home, away_away = split_by_venue(df['team'] == away)
    away_opp_home, away_opp_away = split_by_venue(df['opponent'] == away)

    return (local_home, local_away, local_opp_home, local_opp_away,
            away_home, away_away, away_opp_home, away_opp_away)
42
-
43
def get_head_2_head(df, local, away, seasons=None, league=None):
    """Return the most recent direct meetings between two teams.

    Args:
        df: DataFrame with ``season``, ``team`` and ``opponent`` columns (and
            ``league`` when filtering by it).
        local: Home team name.
        away: Away team name.
        seasons: Seasons to restrict the search to. ``None`` or an empty
            list means no season filter (the whole frame is searched).
        league: Optional league code filter.

    Returns:
        ``(local_h2h, away_h2h)``: rows seen from ``local``'s side
        (team == local, opponent == away) and from ``away``'s side. The last
        3 rows of each are returned when ``local`` has at least 4 such rows,
        otherwise only the last 2.

    Note:
        "Last" means last in the row order of ``df``; no sorting is done
        here, so this presumably relies on ``df`` being in chronological
        order -- confirm against the loader.
    """
    if seasons is None:
        seasons = []

    candidates = df[df['season'].isin(seasons)] if seasons else df

    if league is not None:
        candidates = candidates[candidates['league'] == league]

    local_h2h = candidates[(candidates['team'] == local) & (candidates['opponent'] == away)]
    away_h2h = candidates[(candidates['team'] == away) & (candidates['opponent'] == local)]

    # The window size is decided by the local side's history only.
    window = 2 if len(local_h2h) < 4 else 3

    return local_h2h.tail(window), away_h2h.tail(window)
60
-
61
def get_points_from_result(result):
    """Convert a match result code into league points.

    Args:
        result: ``'W'`` (win), ``'D'`` (draw) or anything else.

    Returns:
        3 for ``'W'``, 1 for ``'D'`` and 0 for every other value.
    """
    if result == 'W':
        return 3
    if result == 'D':
        return 1
    return 0
69
-
70
- # ✅ NUEVA FUNCIÓN: Calcular PPP (Puntos Por Partido)
71
def get_team_ppp(df, team, season, round_num, league=None):
    """Compute a team's points per match (PPP) before a given round.

    Args:
        df: DataFrame with ``team``, ``season``, ``round`` and ``result``
            columns (and ``league`` when filtering by it).
        team: Team name.
        season: Season value compared for equality against ``df['season']``.
        round_num: Exclusive upper bound on ``df['round']``; the round itself
            is not included.
        league: Optional league code filter.

    Returns:
        float: Points per match in the range 0-3, or 0.0 when the team has
        no matching rows.
    """
    played = (
        (df['team'] == team)
        & (df['season'] == season)
        & (df['round'] < round_num)
    )

    if league is not None:
        played = played & (df['league'] == league)

    team_matches = df[played]

    if len(team_matches) == 0:
        return 0.0

    # Same scoring rule as get_points_from_result (W=3, D=1, anything else=0),
    # applied column-wise instead of one Python call per row.
    total_points = team_matches['result'].map({'W': 3, 'D': 1}).fillna(0).sum()

    return total_points / len(team_matches)
101
-
102
- # NUEVA FUNCIÓN: Calcular diferencia de PPP
103
def get_ppp_difference(df, local, away, season, round_num, league=None):
    """Return the points-per-match gap between the home and the away team.

    Both values come from ``get_team_ppp`` with the same season, round and
    league arguments.

    Args:
        df: Full match DataFrame, passed through to ``get_team_ppp``.
        local: Home team name.
        away: Away team name.
        season: Season to evaluate.
        round_num: Current round (excluded from the computation by
            ``get_team_ppp``).
        league: Optional league code filter.

    Returns:
        float: ``PPP(local) - PPP(away)``.
    """
    local_ppp = get_team_ppp(df, local, season, round_num, league)
    away_ppp = get_team_ppp(df, away, season, round_num, league)

    return local_ppp - away_ppp
122
-
123
def get_average(df, is_team=False, lst_avg=None):
    """Summarise match statistics for a league sample or for a single team.

    Args:
        df: DataFrame with one row per team and match, using the FBref-style
            column names read below (``Pass Types_CK``, ``Expected_xG``, ...).
        is_team: ``False`` returns raw league averages; ``True`` returns the
            team feature vector, whose basic averages are expressed relative
            to the league values in ``lst_avg``.
        lst_avg: The 9-value tuple this function returns with
            ``is_team=False``. Required when ``is_team`` is True; indices
            1-6 and 8 are read.

    Returns:
        ``is_team=False`` -- 9-tuple
            ``(var_ck, avg_xg, avg_sca, avg_cross, avg_att_3rd, avg_gf,
            avg_ga, avg_sh, avg_ck)``.
        ``is_team=True`` -- 24-tuple
            ``(avg_ck, var_ck, avg_xg, avg_sca, avg_cross, avg_poss,
            avg_att_3rd, avg_gf, avg_ga, sh_accuracy, xg_shot,
            attacking_presence, possession_shot, progressive_pass_ratio,
            final_third_involvement, assist_sca, creative_efficiency,
            high_press_intensity, interception_tackle, clearance_ratio,
            progressive_carry_ratio, carry_pass_balance, offensive_index,
            transition_index)``.
        For an empty ``df`` a tuple of zeros of the matching length.
    """
    n_rows = len(df)

    if n_rows == 0:
        # The defaults must have the same length as the real return values
        # (24 for a team, 9 for the league); the earlier 23/8 zeros produced
        # short, misaligned feature rows.
        return (0,) * (24 if is_team else 9)

    # Sample variance needs at least two rows.
    var_ck = df['Pass Types_CK'].var() if n_rows > 1 else 0

    if not is_team:
        # ===========================
        # LEAGUE AVERAGES
        # ===========================
        avg_sh = df['Standard_Sh'].mean() if 'Standard_Sh' in df.columns else 0
        return (
            var_ck,                         # 0
            df['Expected_xG'].mean(),       # 1
            df['SCA Types_SCA'].mean(),     # 2
            df['Performance_Crs'].mean(),   # 3
            df['Touches_Att 3rd'].mean(),   # 4
            df['GF'].mean(),                # 5
            df['GA'].mean(),                # 6
            avg_sh,                         # 7
            df['Pass Types_CK'].mean(),     # 8
        )

    def per_match(column):
        """Column total divided by the number of rows."""
        return df[column].sum() / n_rows

    def ratio(numerator, denominator):
        """Quotient, or 0 when the denominator is not positive."""
        return (numerator / denominator) if denominator > 0 else 0

    # ===========================
    # BASIC STATISTICS (RELATIVE TO THE LEAGUE)
    # ===========================
    avg_cross = per_match('Performance_Crs') - lst_avg[3]
    avg_att_3rd = per_match('Touches_Att 3rd') - lst_avg[4]
    avg_sca = per_match('SCA Types_SCA') - lst_avg[2]
    avg_xg = per_match('Expected_xG') - lst_avg[1]
    avg_ck = per_match('Pass Types_CK') - lst_avg[8]
    avg_poss = per_match('Poss') - 50  # possession is centred on an even 50% split
    avg_gf = per_match('GF') - lst_avg[5]
    avg_ga = per_match('GA') - lst_avg[6]

    # ===========================
    # ATTACKING METRICS
    # ===========================
    total_sh = df['Standard_Sh'].sum()
    total_poss = df['Poss'].sum()

    sh_accuracy = ratio(df['Standard_SoT'].sum(), total_sh)        # shots on target per shot
    xg_shot = ratio(df['Expected_xG'].sum(), total_sh)             # xG per shot
    attacking_presence = ratio(df['Touches_Att 3rd'].sum(),
                               df['Touches_Touches'].sum())        # share of touches in the attacking third
    possession_shot = ratio(total_sh, total_poss)                  # shots per possession point

    # ===========================
    # CREATION METRICS
    # ===========================
    total_passes = df['Total_Att'].sum()
    total_prog_passes = df['PrgP'].sum()
    total_sca = df['SCA Types_SCA'].sum()

    progressive_pass_ratio = ratio(total_prog_passes, total_passes)
    final_third_involvement = ratio(df['1/3'].sum(), total_passes)
    assist_sca = ratio(df['Ast'].sum(), total_sca)
    creative_efficiency = ratio(total_sca, total_poss)

    # ===========================
    # DEFENSIVE METRICS
    # ===========================
    total_tackles = df['Tackles_Tkl'].sum()
    total_interceptions = df['Int'].sum()

    high_press_intensity = ratio(df['Tackles_Att 3rd'].sum(), total_tackles)
    interception_tackle = ratio(total_interceptions, total_tackles)
    clearance_ratio = ratio(df['Clr'].sum(), total_tackles + total_interceptions)

    # ===========================
    # BALL-CARRYING METRICS
    # ===========================
    total_prog_carries = df['Carries_PrgC'].sum()

    progressive_carry_ratio = ratio(total_prog_carries, df['Carries_Carries'].sum())
    carry_pass_balance = ratio(total_prog_carries, total_prog_passes)

    # ===========================
    # COMPOSITE INDICES
    # ===========================
    avg_sh = df['Standard_Sh'].mean()
    if avg_sh > 0:
        offensive_index = (df['GF'].mean() + df['Expected_xG'].mean()) * (df['Standard_SoT'].mean() / avg_sh)
    else:
        offensive_index = 0

    transition_index = ratio(df['PrgP'].mean() + df['Carries_PrgC'].mean(), df['Poss'].mean())

    return (
        avg_ck,                   # 0
        var_ck,                   # 1
        avg_xg,                   # 2
        avg_sca,                  # 3
        avg_cross,                # 4
        avg_poss,                 # 5
        avg_att_3rd,              # 6
        avg_gf,                   # 7
        avg_ga,                   # 8
        sh_accuracy,              # 9
        xg_shot,                  # 10
        attacking_presence,       # 11
        possession_shot,          # 12
        progressive_pass_ratio,   # 13
        final_third_involvement,  # 14
        assist_sca,               # 15
        creative_efficiency,      # 16
        high_press_intensity,     # 17
        interception_tackle,      # 18
        clearance_ratio,          # 19
        progressive_carry_ratio,  # 20
        carry_pass_balance,       # 21
        offensive_index,          # 22
        transition_index,         # 23
    )
326
-
327
-
328
-
329
class PROCESS_DATA():
    """Build the per-match training table from the cleaned match-level CSVs.

    Constructing an instance runs the whole pipeline: initialise state, load
    ``dataset/cleaned/dataset_cleaned.csv`` (plus the current-year file when
    it exists), build one feature row and one target per match, drop rows
    with missing values and write ``dataset/processed/dataset_processed.csv``.

    Attributes set along the way:
        y: target per processed match (total corners from ``get_ck``).
        lst_data: one list of feature values per processed match.
        lst_features_values: column names matching each row of ``lst_data``.
        df_data: final DataFrame (features plus ``y``) that is written out.
    """

    # League codes that receive a one-hot column, in output order.
    _LEAGUE_CODES = ('ESP', 'GER', 'FRA', 'ITA', 'NED', 'ENG', 'POR', 'BEL')

    # Forward slashes work on every platform; the previous backslash literal
    # relied on invalid escape sequences and only formed a path on Windows.
    _OUTPUT_DIR = "dataset/processed"
    _OUTPUT_FILE = "dataset/processed/dataset_processed.csv"

    def __init__(self, use_one_hot_encoding):
        """Run the full pipeline.

        Args:
            use_one_hot_encoding: When truthy, one ``league_<CODE>`` indicator
                column per known league is appended to every row.
        """
        self.USE_ONE_HOT_ENCODING = use_one_hot_encoding

        self.init_variables()
        self.load_clean_dataset()
        self.process_all_matches()
        self.clean_and_ouput_dataset()

    def init_variables(self):
        """Create the empty accumulators and the feature-name vocabularies."""
        self.y = []
        self.lst_data = []

        # Defined up front so clean_and_ouput_dataset() also works when no
        # match passes the round filter (it was previously created only
        # inside the processing loop).
        self.lst_features_values = []

        # Seasons in chronological order; used to select "seasons so far"
        # for the head-to-head lookup.
        self.lst_years = ["1819", "1920", "2021", "2122", "2223", "2324", "2425", "2526"]

        # Names for the 24-value team vector returned by get_average(is_team=True).
        self.lst_base_advanced = [
            "avg_ck", "var_ck",
            "xg", "sca", "cross", "poss", "att_3rd", "gf", "ga",
            "sh_accuracy", "xg_shot", "attacking_presence", "possession_shot",
            "progressive_pass_ratio", "final_third_involvement", "assist_sca", "creative_efficiency",
            "high_press_intensity", "interception_tackle", "clearance_ratio",
            "progressive_carry_ratio", "carry_pass_balance", "offensive_index", "transition_index"
        ]

        # Names for the 9-value opponent vectors.
        # NOTE(review): the opponent values are the FIRST 9 entries of the
        # team vector (avg_ck, var_ck, xg, sca, cross, poss, att_3rd, gf, ga),
        # so these labels are shifted by one relative to the data ("var_ck"
        # holds avg_ck, ..., "avg_ck" holds ga). Left unchanged because the
        # column names are part of the exported dataset; confirm with the
        # model owners before reordering.
        self.lst_base_original = [
            "var_ck", "xg", "sca", "cross", "poss", "att_3rd", "gf", "ga", "avg_ck"
        ]

        print("Variables inicializadas")

    def load_clean_dataset(self):
        """Load the cleaned CSVs and derive the list of unique matches."""
        # Cleaned dataset generated by generate_dataset.py.
        self.df_dataset_historic = pd.read_csv("dataset/cleaned/dataset_cleaned.csv")

        if os.path.exists(r"dataset/cleaned/dataset_cleaned_current_year.csv"):
            self.df_dataset_current_year = pd.read_csv("dataset/cleaned/dataset_cleaned_current_year.csv")

            self.df_dataset = pd.concat([self.df_dataset_historic, self.df_dataset_current_year])
        else:
            self.df_dataset = self.df_dataset_historic

        self.df_dataset["season"] = self.df_dataset["season"].astype(str)

        # fillna returns a new Series; the result has to be assigned back
        # (the bare call was a no-op).
        self.df_dataset["Performance_Save%"] = self.df_dataset["Performance_Save%"].fillna(0)

        # One row per match with just the identifying columns.
        self.df_dataset_export = self.df_dataset.copy()
        self.df_dataset_export = self.df_dataset_export.drop_duplicates(subset=["game", "league"])
        self.df_dataset_export = self.df_dataset_export[["local", "away", "round", "season", "date", "league"]]

        # Each entry is [local, away, round, season, date, league].
        self.lst_matches = self.df_dataset_export.values.tolist()

        # Season 1718 is excluded from processing.
        self.lst_matches = [row for row in self.lst_matches if row[3] != "1718"]

        print("dataset loaded")

    def _create_line(self, df, lst_avg, is_form=True, is_team=False, use_advanced=True):
        """Turn a frame of past matches into a tuple of feature values.

        Args:
            df: Past matches of one side.
            lst_avg: League averages passed through to ``get_average``.
            is_form: When True only the last 6 rows are used.
            is_team: Forwarded to ``get_average``.
            use_advanced: True keeps the whole vector (24 values for a team);
                False keeps only its first 9 values.
        """
        if is_form:
            df = df[-6:]

        values = get_average(df, is_team, lst_avg)
        return values if use_advanced else values[:9]

    def _feature_names(self, keys):
        """Expand feature-group keys into the flat list of column names."""
        names = []
        for key in keys:
            if key in ('ppp_local', 'ppp_away', 'ppp_difference') or key.startswith('league_'):
                # Single-value groups keep their key as the column name.
                names.append(key)
            elif key in ('lst_team1_opp_away', 'lst_team2_opp_home'):
                # Opponent groups: 9 values.
                names.extend(f"{key}_{col}" for col in self.lst_base_original)
            else:
                # Team groups: 24 values.
                names.extend(f"{key}_{col}" for col in self.lst_base_advanced)
        return names

    def process_all_matches(self):
        """Append one feature row to ``lst_data`` and one target to ``y`` per match.

        Matches played before round 5 are skipped. Raises ``ValueError`` when
        a match's season is not listed in ``lst_years``.
        """
        for match in self.lst_matches:
            local, away, round_num, season, _date, league_code = match

            if round_num < 5:
                continue

            # League averages over the rounds already played this season.
            league_rows = self.df_dataset[
                (self.df_dataset['season'] == season) &
                (self.df_dataset['round'] < round_num) &
                (self.df_dataset['league'] == league_code)
            ]
            lst_avg = get_average(league_rows, is_team=False)

            (team1_home, team1_away, _team1_opp_home, team1_opp_away,
             team2_home, team2_away, team2_opp_home, _team2_opp_away) = get_dataframes(
                self.df_dataset, season, round_num, local, away, league=league_code
            )

            # Target: actual corners of this match.
            ck = get_ck(self.df_dataset, season, round_num, local, away, league=league_code)
            self.y.append(ck)

            # Head to head over every season up to and including this one.
            seasons_so_far = self.lst_years[:self.lst_years.index(season) + 1]
            team1_h2h, team2_h2h = get_head_2_head(
                self.df_dataset, local, away, seasons=seasons_so_far, league=league_code
            )

            # Points per match.
            local_ppp = get_team_ppp(self.df_dataset, local, season, round_num, league=league_code)
            away_ppp = get_team_ppp(self.df_dataset, away, season, round_num, league=league_code)

            # Insertion order defines the column order of the output.
            dic_df = {
                'ppp_local': (local_ppp,),
                'ppp_away': (away_ppp,),
                'ppp_difference': (local_ppp - away_ppp,),
            }

            # Team groups with the full 24-value vector.
            dic_df['lst_team1_home_form'] = self._create_line(team1_home, lst_avg, True, True, use_advanced=True)
            dic_df['lst_team1_home_general'] = self._create_line(team1_home, lst_avg, False, True, use_advanced=True)
            dic_df['lst_team1_away_form'] = self._create_line(team1_away, lst_avg, True, True, use_advanced=True)
            dic_df['lst_team1_away_general'] = self._create_line(team1_away, lst_avg, False, True, use_advanced=True)

            dic_df['lst_team2_home_form'] = self._create_line(team2_home, lst_avg, True, True, use_advanced=True)
            dic_df['lst_team2_home_general'] = self._create_line(team2_home, lst_avg, False, True, use_advanced=True)
            dic_df['lst_team2_away_form'] = self._create_line(team2_away, lst_avg, True, True, use_advanced=True)
            dic_df['lst_team2_away_general'] = self._create_line(team2_away, lst_avg, False, True, use_advanced=True)

            dic_df['lst_team1_h2h'] = self._create_line(team1_h2h, lst_avg, False, True, use_advanced=True)
            dic_df['lst_team2_h2h'] = self._create_line(team2_h2h, lst_avg, False, True, use_advanced=True)

            # Opponent groups with the 9-value vector.
            dic_df['lst_team1_opp_away'] = self._create_line(team1_opp_away, lst_avg, False, True, use_advanced=False)
            dic_df['lst_team2_opp_home'] = self._create_line(team2_opp_home, lst_avg, False, True, use_advanced=False)

            # One-hot league indicators.
            if self.USE_ONE_HOT_ENCODING:
                for code in self._LEAGUE_CODES:
                    dic_df[f'league_{code}'] = (1 if league_code == code else 0,)

            row_values = []
            for values in dic_df.values():
                row_values.extend(list(values))

            self.lst_features_values = self._feature_names(dic_df)
            self.lst_data.append(row_values)

        print("Dataset processed")

    def clean_and_ouput_dataset(self):
        """Drop rows with missing features or target and write the final CSV."""
        import numpy as np

        self.df_data = pd.DataFrame(data=self.lst_data, columns=self.lst_features_values)

        print(f"\n✅ PROCESAMIENTO COMPLETADO:")
        print(f"   Shape inicial: {self.df_data.shape}")
        print(f"   Total partidos: {len(self.df_data)}")
        print(f"   Features totales: {self.df_data.shape[1]}")

        # ===========================
        # NULL CLEANING
        # ===========================

        print(f"\n🧹 LIMPIANDO DATOS NULOS...")

        nulos_antes_X = self.df_data.isnull().sum().sum()
        nulos_antes_y = np.isnan(self.y).sum() if isinstance(self.y, np.ndarray) else sum(pd.isna(self.y))

        print(f"   Nulos en X (antes): {nulos_antes_X}")
        print(f"   Nulos en Y (antes): {nulos_antes_y}")

        y_array = np.array(self.y).flatten()

        # A row survives only when both its features and its target are present.
        mask_valid_X = ~self.df_data.isnull().any(axis=1)
        mask_valid_y = ~np.isnan(y_array)
        mask_combined = mask_valid_X & mask_valid_y

        self.df_data = self.df_data[mask_combined].reset_index(drop=True)
        y_array = y_array[mask_combined]

        print(f"\n✅ LIMPIEZA COMPLETADA:")
        print(f"   Nulos en X (después): {self.df_data.isnull().sum().sum()}")
        print(f"   Nulos en Y (después): {np.isnan(y_array).sum()}")
        print(f"   Filas eliminadas: {len(mask_combined) - mask_combined.sum()}")
        print(f"   Shape final: {self.df_data.shape}")

        # ===========================
        # FINAL CHECK
        # ===========================

        print(f"\n🔍 VERIFICACIÓN DE NUEVAS FEATURES:")
        print(f"   ✅ Features con 'var_ck': {len([c for c in self.df_data.columns if 'var_ck' in c])}")
        print(f"   ✅ Features con métricas avanzadas: {len([c for c in self.df_data.columns if any(m in c for m in ['sh_accuracy', 'offensive_index'])])}")
        print(f"   ✅ Features de oponentes (8 valores): {len([c for c in self.df_data.columns if 'opp' in c])}")

        print("\n" + "=" * 80)
        print("✅ PROCESO COMPLETADO - DATOS LISTOS PARA ENTRENAMIENTO")
        print("=" * 80)

        self.y = y_array.tolist()

        self.df_data["y"] = self.y

        # Make sure the target folder exists before writing.
        os.makedirs(self._OUTPUT_DIR, exist_ok=True)
        self.df_data.to_csv(self._OUTPUT_FILE, index=False)
        print("Dataset")
581
-
582
- #a = PROCESS_DATA(True)
583
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import os
3
+
4
+
5
def get_ck(df, season, round_num, local, away, league=None):
    """Return corner, goal, xG and shots-on-target totals for one match.

    The match is identified by ``season`` and ``round_num`` (and ``league``
    when given). For each statistic the home value is the column sum over
    rows whose ``team`` is ``local``, the away value the sum over rows whose
    ``team`` is ``away``, and the total is their sum.

    Args:
        df: DataFrame with ``season``, ``round``, ``team``,
            ``Pass Types_CK``, ``GF``, ``Expected_xG`` and ``Standard_SoT``
            columns (and ``league`` when filtering by it).
        season: Season value compared for equality against ``df['season']``.
        round_num: Round compared for equality against ``df['round']``.
        local: Home team name.
        away: Away team name.
        league: Optional league code filter.

    Returns:
        12-tuple ``(total_ck, local_ck, visit_ck, total_gol, local_gol,
        visit_gol, total_eg, local_eg, visit_eg, total_st, local_st,
        visit_st)``.
    """
    match_mask = (df['season'] == season) & (df['round'] == round_num)

    if league is not None:
        match_mask = match_mask & (df['league'] == league)

    round_rows = df[match_mask]

    local_rows = round_rows[round_rows['team'] == local]
    away_rows = round_rows[round_rows['team'] == away]

    # Corners, goals, expected goals, shots on target -- in output order.
    stats = []
    for column in ("Pass Types_CK", "GF", "Expected_xG", "Standard_SoT"):
        # Each per-team sum is computed once and reused for the total.
        local_total = local_rows[column].sum()
        visit_total = away_rows[column].sum()
        stats.extend((local_total + visit_total, local_total, visit_total))

    return tuple(stats)
34
+
35
def get_dataframes(df, season, round_num, local, away, league=None):
    """Return eight DataFrames with each side's earlier matches, split by venue.

    Only rows of ``season`` whose ``round`` is strictly below ``round_num``
    are used (the match being predicted is excluded), optionally restricted
    to ``league``.

    Args:
        df: DataFrame with ``season``, ``round``, ``team``, ``opponent`` and
            ``venue`` columns (and ``league`` when filtering by it).
        season: Season value compared for equality against ``df['season']``.
        round_num: Exclusive upper bound on ``df['round']``.
        local: Home team name.
        away: Away team name.
        league: Optional league code filter.

    Returns:
        Tuple ``(local_home, local_away, local_opp_home, local_opp_away,
        away_home, away_away, away_opp_home, away_opp_away)``. The plain
        frames hold rows where ``team`` equals the side; the ``_opp_`` frames
        hold rows where ``opponent`` equals the side. ``home``/``away`` refer
        to the row's ``venue`` value ("Home"/"Away").
    """
    in_scope = (df['season'] == season) & (df['round'] < round_num)

    if league is not None:
        in_scope = in_scope & (df['league'] == league)

    def split_by_venue(side_mask):
        # Local names deliberately differ from the ``away`` parameter so the
        # helper does not shadow it.
        rows = df[in_scope & side_mask].copy()
        home_rows = rows[rows['venue'] == "Home"]
        away_rows = rows[rows['venue'] == "Away"]
        return home_rows, away_rows

    local_home, local_away = split_by_venue(df['team'] == local)
    local_opp_home, local_opp_away = split_by_venue(df['opponent'] == local)

    away_home, away_away = split_by_venue(df['team'] == away)
    away_opp_home, away_opp_away = split_by_venue(df['opponent'] == away)

    return (local_home, local_away, local_opp_home, local_opp_away,
            away_home, away_away, away_opp_home, away_opp_away)
57
+
58
def get_head_2_head(df, local, away, seasons=None, league=None):
    """Return the most recent direct meetings between two teams.

    Args:
        df: DataFrame with ``season``, ``team`` and ``opponent`` columns (and
            ``league`` when filtering by it).
        local: Home team name.
        away: Away team name.
        seasons: Seasons to restrict the search to. ``None`` or an empty
            list means no season filter (the whole frame is searched).
        league: Optional league code filter.

    Returns:
        ``(local_h2h, away_h2h)``: rows seen from ``local``'s side
        (team == local, opponent == away) and from ``away``'s side. The last
        3 rows of each are returned when ``local`` has at least 4 such rows,
        otherwise only the last 2.

    Note:
        "Last" means last in the row order of ``df``; no sorting is done
        here, so this presumably relies on ``df`` being in chronological
        order -- confirm against the loader.
    """
    if seasons is None:
        seasons = []

    candidates = df[df['season'].isin(seasons)] if seasons else df

    if league is not None:
        candidates = candidates[candidates['league'] == league]

    local_h2h = candidates[(candidates['team'] == local) & (candidates['opponent'] == away)]
    away_h2h = candidates[(candidates['team'] == away) & (candidates['opponent'] == local)]

    # The window size is decided by the local side's history only.
    window = 2 if len(local_h2h) < 4 else 3

    return local_h2h.tail(window), away_h2h.tail(window)
75
+
76
def get_points_from_result(result):
    """Convert a match result code into league points.

    Args:
        result: ``'W'`` (win), ``'D'`` (draw) or anything else.

    Returns:
        3 for ``'W'``, 1 for ``'D'`` and 0 for every other value.
    """
    if result == 'W':
        return 3
    if result == 'D':
        return 1
    return 0
84
+
85
+ # NUEVA FUNCIÓN: Calcular PPP (Puntos Por Partido)
86
def get_team_ppp(df, team, season, round_num, league=None):
    """Compute a team's points per match (PPP) before a given round.

    Args:
        df: DataFrame with ``team``, ``season``, ``round`` and ``result``
            columns (and ``league`` when filtering by it).
        team: Team name.
        season: Season value compared for equality against ``df['season']``.
        round_num: Exclusive upper bound on ``df['round']``; the round itself
            is not included.
        league: Optional league code filter.

    Returns:
        float: Points per match in the range 0-3, or 0.0 when the team has
        no matching rows.
    """
    played = (
        (df['team'] == team)
        & (df['season'] == season)
        & (df['round'] < round_num)
    )

    if league is not None:
        played = played & (df['league'] == league)

    team_matches = df[played]

    if len(team_matches) == 0:
        return 0.0

    # Same scoring rule as get_points_from_result (W=3, D=1, anything else=0),
    # applied column-wise instead of one Python call per row.
    total_points = team_matches['result'].map({'W': 3, 'D': 1}).fillna(0).sum()

    return total_points / len(team_matches)
116
+
117
+ # ✅ NUEVA FUNCIÓN: Calcular diferencia de PPP
118
def get_ppp_difference(df, local, away, season, round_num, league=None):
    """Return the points-per-match gap between the home and the away team.

    Both values come from ``get_team_ppp`` with the same season, round and
    league arguments.

    Args:
        df: Full match DataFrame, passed through to ``get_team_ppp``.
        local: Home team name.
        away: Away team name.
        season: Season to evaluate.
        round_num: Current round (excluded from the computation by
            ``get_team_ppp``).
        league: Optional league code filter.

    Returns:
        float: ``PPP(local) - PPP(away)``.
    """
    local_ppp = get_team_ppp(df, local, season, round_num, league)
    away_ppp = get_team_ppp(df, away, season, round_num, league)

    return local_ppp - away_ppp
137
+
138
+ def get_average(df, is_team=False, lst_avg=None):
139
+ """Calcula promedios de estadísticas"""
140
+
141
+ if len(df) == 0:
142
+ # Retornar valores por defecto si el DataFrame está vacío
143
+ if is_team:
144
+ return (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
145
+ return (0, 0, 0, 0, 0, 0, 0, 0)
146
+
147
+ if is_team:
148
+ # ===========================
149
+ # ESTADÍSTICAS BÁSICAS (NORMALIZADAS)
150
+ # ===========================
151
+ avg_cross = (df['Performance_Crs'].sum() / len(df)) - lst_avg[3]
152
+ avg_att_3rd = (df['Touches_Att 3rd'].sum() / len(df)) - lst_avg[4]
153
+ avg_sca = (df['SCA Types_SCA'].sum() / len(df)) - lst_avg[2]
154
+ avg_xg = (df['Expected_xG'].sum() / len(df)) - lst_avg[1]
155
+
156
+ # ✅ CAMBIO: VARIANZA EN VEZ DE PROMEDIO DE CK
157
+ var_ck = df['Pass Types_CK'].var() if len(df) > 1 else 0
158
+ avg_ck = (df['Pass Types_CK'].sum() / len(df)) - lst_avg[8]
159
+
160
+ avg_poss = (df['Poss'].sum() / len(df)) - 50
161
+ avg_gf = (df['GF'].sum() / len(df)) - lst_avg[5]
162
+ avg_ga = (df['GA'].sum() / len(df)) - lst_avg[6]
163
+
164
+ # ===========================
165
+ # MÉTRICAS OFENSIVAS AVANZADAS
166
+ # ===========================
167
+
168
+ # Precisión de tiros
169
+ total_sh = df['Standard_Sh'].sum()
170
+ sh_accuracy = (df['Standard_SoT'].sum() / total_sh) if total_sh > 0 else 0
171
+
172
+ # Eficiencia xG por tiro
173
+ xg_shot = (df['Expected_xG'].sum() / total_sh) if total_sh > 0 else 0
174
+
175
+ # Presencia atacante (% toques en área rival)
176
+ total_touches = df['Touches_Touches'].sum()
177
+ attacking_presence = (df['Touches_Att 3rd'].sum() / total_touches) if total_touches > 0 else 0
178
+
179
+ # Tiros por posesión
180
+ total_poss = df['Poss'].sum()
181
+ possession_shot = (total_sh / total_poss) if total_poss > 0 else 0
182
+
183
+ # Distancia promedio de tiros
184
+ standard_dist = df['Standard_Dist'].mean() if 'Standard_Dist' in df.columns else 0
185
+
186
+ # ===========================
187
+ # MÉTRICAS DE CREACIÓN
188
+ # ===========================
189
+
190
+ # Ratio de pases progresivos
191
+ total_passes = df['Total_Att'].sum()
192
+ progressive_pass_ratio = (df['PrgP'].sum() / total_passes) if total_passes > 0 else 0
193
+
194
+ # Participación en último tercio
195
+ final_third_passes = df['1/3'].sum()
196
+ final_third_involvement = (final_third_passes / total_passes) if total_passes > 0 else 0
197
+
198
+ # Ratio de pases largos
199
+ long_ball_ratio = (df['Long_Att'].sum() / total_passes) if total_passes > 0 else 0
200
+
201
+ # Asistencias por SCA
202
+ total_sca = df['SCA Types_SCA'].sum()
203
+ assist_sca = (df['Ast'].sum() / total_sca) if total_sca > 0 else 0
204
+
205
+ # Dependencia de centros
206
+ cross_dependency = (df['Performance_Crs'].sum() / total_passes) if total_passes > 0 else 0
207
+
208
+ # Eficiencia creativa
209
+ creative_efficiency = (total_sca / total_poss) if total_poss > 0 else 0
210
+
211
+ # ===========================
212
+ # MÉTRICAS DEFENSIVAS
213
+ # ===========================
214
+
215
+ # Intensidad de presión alta
216
+ total_tackles = df['Tackles_Tkl'].sum()
217
+ high_press_intensity = (df['Tackles_Att 3rd'].sum() / total_tackles) if total_tackles > 0 else 0
218
+
219
+ # Ratio intercepciones/tackles
220
+ interception_tackle = (df['Int'].sum() / total_tackles) if total_tackles > 0 else 0
221
+
222
+ # Ratio bloqueos/tackles
223
+ blocks_tackle = (df['Blocks_Blocks'].sum() / total_tackles) if total_tackles > 0 else 0
224
+
225
+ # Ratio de despejes
226
+ total_defensive_actions = total_tackles + df['Int'].sum()
227
+ clearance_ratio = (df['Clr'].sum() / total_defensive_actions) if total_defensive_actions > 0 else 0
228
+
229
+ # ===========================
230
+ # MÉTRICAS DE PORTERÍA
231
+ # ===========================
232
+
233
+ # Rendimiento del portero normalizado
234
+ avg_save_pct = df['Performance_Save%'].mean() if 'Performance_Save%' in df.columns else 0
235
+ avg_xg_against = df['Expected_xG'].mean() if len(df) > 0 else 1
236
+ performance_save = (avg_save_pct / (1 / avg_xg_against)) if avg_xg_against > 0 else 0
237
+
238
+ # ===========================
239
+ # MÉTRICAS DE POSESIÓN
240
+ # ===========================
241
+
242
+ # Ratio de conducciones progresivas
243
+ total_carries = df['Carries_Carries'].sum()
244
+ progressive_carry_ratio = (df['Carries_PrgC'].sum() / total_carries) if total_carries > 0 else 0
245
+
246
+ # Ratio de conducciones al área
247
+ penalty_carry_ratio = (df['Carries_CPA'].sum() / total_carries) if total_carries > 0 else 0
248
+
249
+ # Balance conducción/pase progresivo
250
+ total_prog_passes = df['PrgP'].sum()
251
+ carry_pass_balance = (df['Carries_PrgC'].sum() / total_prog_passes) if total_prog_passes > 0 else 0
252
+
253
+ # ===========================
254
+ # ÍNDICES COMPUESTOS
255
+ # ===========================
256
+
257
+ # Índice ofensivo
258
+ avg_gf_raw = df['GF'].mean()
259
+ avg_xg_raw = df['Expected_xG'].mean()
260
+ avg_sot = df['Standard_SoT'].mean()
261
+ avg_sh = df['Standard_Sh'].mean()
262
+ offensive_index = (avg_gf_raw + avg_xg_raw) * (avg_sot / avg_sh) if avg_sh > 0 else 0
263
+
264
+ # Índice defensivo
265
+ avg_int = df['Int'].mean()
266
+ avg_tkl = df['Tackles_Tkl'].mean()
267
+ avg_clr = df['Clr'].mean()
268
+ defensive_index = avg_save_pct * (avg_int / (avg_tkl + avg_clr)) if (avg_tkl + avg_clr) > 0 else 0
269
+
270
+ # Índice de control de posesión
271
+ avg_touches_att = df['Touches_Att 3rd'].mean()
272
+ avg_carries_third = df['Carries_1/3'].mean() if 'Carries_1/3' in df.columns else 0
273
+ avg_touches_total = df['Touches_Touches'].mean()
274
+ possession_control_index = ((avg_touches_att + avg_carries_third) / avg_touches_total) if avg_touches_total > 0 else 0
275
+
276
+ # Índice de transición
277
+ avg_prgp = df['PrgP'].mean()
278
+ avg_prgc = df['Carries_PrgC'].mean()
279
+ avg_poss_raw = df['Poss'].mean()
280
+ transition_index = ((avg_prgp + avg_prgc) / avg_poss_raw) if avg_poss_raw > 0 else 0
281
+
282
+ # ✅ RETORNAR TODAS LAS MÉTRICAS (23 valores)
283
+ return (
284
+ avg_ck,
285
+ var_ck, # 0 - ✅ CAMBIADO: varianza en vez de promedio
286
+ avg_xg, # 1
287
+ avg_sca, # 2
288
+ avg_cross, # 3
289
+ avg_poss, # 4
290
+ avg_att_3rd, # 5
291
+ avg_gf, # 6
292
+ avg_ga, # 7
293
+ sh_accuracy, # 8
294
+ xg_shot, # 9
295
+ attacking_presence, # 10
296
+ possession_shot, # 11
297
+ progressive_pass_ratio, # 12
298
+ final_third_involvement, # 13
299
+ assist_sca, # 14
300
+ creative_efficiency, # 15
301
+ high_press_intensity, # 16
302
+ interception_tackle, # 17
303
+ clearance_ratio, # 18
304
+ progressive_carry_ratio, # 19
305
+ carry_pass_balance, # 20
306
+ offensive_index, # 21
307
+ transition_index # 22
308
+ )
309
+
310
+ # ===========================
311
+ # PROMEDIOS DE LIGA (is_team=False)
312
+ # ===========================
313
+
314
+ avg_cross = df['Performance_Crs'].mean()
315
+ avg_att_3rd = df['Touches_Att 3rd'].mean()
316
+ avg_sca = df['SCA Types_SCA'].mean()
317
+ avg_xg = df['Expected_xG'].mean()
318
+
319
+ # ✅ CAMBIO: VARIANZA EN VEZ DE PROMEDIO DE CK
320
+ var_ck = df['Pass Types_CK'].var() if len(df) > 1 else 0
321
+ avg_ck = df['Pass Types_CK'].mean()
322
+
323
+ avg_gf = df['GF'].mean()
324
+ avg_ga = df['GA'].mean()
325
+
326
+ # ✅ AGREGAR MÉTRICAS BÁSICAS PARA NORMALIZACIÓN
327
+ avg_sh = df['Standard_Sh'].mean() if 'Standard_Sh' in df.columns else 0
328
+
329
+ return (
330
+
331
+ var_ck, # 0 - ✅ CAMBIADO
332
+ avg_xg, # 1
333
+ avg_sca, # 2
334
+ avg_cross, # 3
335
+ avg_att_3rd, # 4
336
+ avg_gf, # 5
337
+ avg_ga, # 6
338
+ avg_sh, # 7 - NUEVO
339
+ avg_ck
340
+ )
341
+
342
+
343
+
344
class PROCESS_DATA():
    """Turn the cleaned per-team match rows into one feature row per match.

    Instantiating the class runs the whole pipeline: initialise state, load the
    cleaned CSV(s) and drop IQR outliers, build one feature vector per match,
    drop rows with missing values and write
    ``dataset/processed/dataset_processed.csv``.

    Relies on the module-level helpers ``get_average``, ``get_dataframes``,
    ``get_ck``, ``get_head_2_head`` and ``get_team_ppp``.
    """

    # Matches played before this round are skipped in process_all_matches().
    _MIN_ROUND = 5

    # Size of the recent-form window (number of most recent rows kept).
    _FORM_WINDOW = 6

    # Leagues that get a one-hot column, in output column order.
    _LEAGUE_CODES = ('ESP', 'GER', 'FRA', 'ITA', 'NED', 'ENG', 'POR', 'BEL')

    # Keys of the per-match dict filled from get_ck() positions 1..11, in order.
    _TARGET_KEYS = (
        'y_home', 'y_away',
        'gol_total', 'gol_home', 'gol_away',
        'eg_total', 'eg_home', 'eg_away',
        'st_total', 'st_home', 'st_away',
    )

    # Keys whose value is a single scalar and whose column name is the key itself.
    _SCALAR_KEYS = frozenset(
        _TARGET_KEYS + ('ppp_local', 'ppp_away', 'ppp_difference', 'round')
    )

    # Feature groups that only keep the leading (basic) metrics.
    _OPPONENT_KEYS = frozenset(('lst_team1_opp_away', 'lst_team2_opp_home'))

    def __init__(self,use_one_hot_encoding):
        """Run the full processing pipeline.

        Args:
            use_one_hot_encoding: If truthy, add one ``league_<CODE>`` 0/1
                column per entry of ``_LEAGUE_CODES`` to every feature row.
        """
        self.USE_ONE_HOT_ENCODING = use_one_hot_encoding

        self.init_variables()
        self.load_clean_dataset()
        self.process_all_matches()
        self.clean_and_ouput_dataset()

    def init_variables(self):
        """Reset accumulators and define the feature-name suffixes."""
        self.y = []
        self.y_home = []
        self.y_away = []

        self.lst_data = []

        # Defined up front so clean_and_ouput_dataset() does not hit an
        # AttributeError when no match qualifies for processing.
        self.lst_features_values = []

        self.lst_years = ["1819", "1920", "2021", "2122", "2223", "2324", "2425", "2526"]

        # Suffixes for the 24 values returned by get_average(..., is_team=True),
        # in the order that function returns them.
        self.lst_base_advanced = [
            "avg_ck", "var_ck",
            "xg", "sca", "cross", "poss", "att_3rd", "gf", "ga",
            "sh_accuracy", "xg_shot", "attacking_presence", "possession_shot",
            "progressive_pass_ratio", "final_third_involvement", "assist_sca", "creative_efficiency",
            "high_press_intensity", "interception_tackle", "clearance_ratio",
            "progressive_carry_ratio", "carry_pass_balance", "offensive_index", "transition_index"
        ]

        # Opponent groups keep the first 9 values of that same tuple, so their
        # labels must be the first 9 advanced labels. The previous hand-written
        # list ("var_ck", "xg", ..., "avg_ck") shifted every label by one.
        self.lst_base_original = self.lst_base_advanced[:9]

        print("Variables inicializadas")

    @staticmethod
    def _round_bucket(round_num):
        """Map a round number to a coarse season phase.

        Returns 1 for rounds below 15, 2 for rounds 15-24 and 3 otherwise.
        The original middle condition (``< 15 and > 25``) could never be true,
        so phase 2 was never emitted.
        NOTE(review): the 25 boundary is assumed to belong to phase 3 - confirm.
        """
        if round_num < 15:
            return 1
        if round_num < 25:
            return 2
        return 3

    @staticmethod
    def _drop_iqr_outliers(df, columns, iqr_multiplier):
        """Return ``df`` without rows outside the IQR fence of any column.

        Columns are processed one after another and the quartiles of each
        column are computed on the rows that survived the previous columns.
        Rows holding NaN in a processed column are dropped too, because the
        range comparison is False for NaN.
        NOTE(review): when a column's IQR is 0 only rows equal to Q1 survive,
        whatever the multiplier - confirm this is intended for sparse columns.
        """
        for col in columns:
            q1 = df[col].quantile(0.25)
            q3 = df[col].quantile(0.75)
            spread = q3 - q1

            lower_bound = q1 - iqr_multiplier * spread
            upper_bound = q3 + iqr_multiplier * spread

            df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]
        return df

    def load_clean_dataset(self, iqr_multiplier=4.5, by_league=True):
        """Load the cleaned dataset, remove outliers and list the matches.

        Args:
            iqr_multiplier: Width of the IQR fence (1.5 is the textbook value).
            by_league: If True the fence is computed separately per league,
                otherwise once over the whole dataset.
        """
        # Load the historic data and, when present, the current-season data.
        self.df_dataset_historic = pd.read_csv("dataset/cleaned/dataset_cleaned.csv")

        current_year_path = r"dataset/cleaned/dataset_cleaned_current_year.csv"
        if os.path.exists(current_year_path):
            self.df_dataset_current_year = pd.read_csv(current_year_path)
            self.df_dataset = pd.concat([self.df_dataset_historic, self.df_dataset_current_year])
        else:
            self.df_dataset = self.df_dataset_historic

        self.df_dataset["season"] = self.df_dataset["season"].astype(str)
        # Assign the result back: an in-place fillna on a column selection is
        # deprecated and is a no-op under pandas Copy-on-Write.
        self.df_dataset["Performance_Save%"] = self.df_dataset["Performance_Save%"].fillna(0)

        print(f"✅ Dataset cargado: {self.df_dataset.shape}")

        # ===========================
        # OUTLIER REMOVAL
        # ===========================

        print(f"\n🧹 ELIMINANDO OUTLIERS (IQR × {iqr_multiplier})...")
        if by_league:
            print(" Método: IQR por liga (más preciso)")
        else:
            print(" Método: IQR global")

        # Identifier-like columns never take part in outlier detection.
        exclude_cols = ['date', 'season', 'league', 'team', 'opponent', 'venue',
                        'round', 'game', 'result', 'local', 'away']

        numeric_cols = [
            col
            for col in self.df_dataset.select_dtypes(include=['float64', 'int64']).columns.tolist()
            if col not in exclude_cols
        ]

        print(f" Columnas numéricas: {len(numeric_cols)}")

        filas_antes = len(self.df_dataset)

        if by_league:
            dfs_limpios = []

            for league in self.df_dataset['league'].unique():
                df_league = self.df_dataset[self.df_dataset['league'] == league]
                filas_liga_antes = len(df_league)

                df_league = self._drop_iqr_outliers(df_league, numeric_cols, iqr_multiplier)

                filas_liga_despues = len(df_league)
                eliminadas = filas_liga_antes - filas_liga_despues

                print(f" {league}: {filas_liga_antes} {filas_liga_despues} (-{eliminadas})")

                dfs_limpios.append(df_league)

            self.df_dataset = pd.concat(dfs_limpios, ignore_index=True)
        else:
            self.df_dataset = self._drop_iqr_outliers(self.df_dataset, numeric_cols, iqr_multiplier)

        filas_despues = len(self.df_dataset)
        filas_eliminadas = filas_antes - filas_despues
        # Guard against an empty input file.
        porcentaje_eliminado = (filas_eliminadas / filas_antes) * 100 if filas_antes else 0.0

        print(f"\n✅ RESUMEN:")
        print(f" Filas antes: {filas_antes:,}")
        print(f" Filas después: {filas_despues:,}")
        print(f" Eliminadas: {filas_eliminadas:,} ({porcentaje_eliminado:.2f}%)")
        print(f" Shape final: {self.df_dataset.shape}")

        # ===========================
        # MATCH LIST
        # ===========================

        # One row per (game, league), ordered chronologically.
        self.df_dataset_export = self.df_dataset.copy()
        self.df_dataset_export = self.df_dataset_export.drop_duplicates(subset=["game", "league"])
        self.df_dataset_export = self.df_dataset_export.sort_values(by='date', ascending=True)
        print(self.df_dataset_export.head(10))
        self.df_dataset_export = self.df_dataset_export[["local", "away", "round", "season", "date", "league"]]

        # Season "1718" is excluded from the matches to process.
        self.lst_matches = [
            row for row in self.df_dataset_export.values.tolist() if row[3] != "1718"
        ]

        print(f"✅ Partidos a procesar: {len(self.lst_matches)}")

    def _create_line(self, df, lst_avg, is_form=True, is_team=False, use_advanced=True):
        """Compute one feature group through ``get_average``.

        Args:
            df: Rows of the team (or opponents) to aggregate.
            lst_avg: League averages forwarded to ``get_average``.
            is_form: If True only the last ``_FORM_WINDOW`` rows are used.
            is_team: Forwarded to ``get_average`` as its second argument.
            use_advanced: If True return every value (24 for ``is_team=True``);
                otherwise only the leading ``len(self.lst_base_original)`` (9).
        """
        if is_form:
            df = df.iloc[-self._FORM_WINDOW:]

        result = get_average(df, is_team, lst_avg)
        if use_advanced:
            return result
        return result[:len(self.lst_base_original)]

    def process_all_matches(self):
        """Build one feature row (and target) per match in ``self.lst_matches``.

        Appends the total-corner target to ``self.y``, the feature values to
        ``self.lst_data`` and stores the matching column names in
        ``self.lst_features_values``.
        """
        for match in self.lst_matches:
            local, away, round_num, season, _date, league_code = match

            # Early rounds are skipped entirely.
            if round_num < self._MIN_ROUND:
                continue

            dic_df = {}

            # League averages over the rounds already played this season.
            lst_avg = get_average(
                self.df_dataset[
                    (self.df_dataset['season'] == season) &
                    (self.df_dataset['round'] < round_num) &
                    (self.df_dataset['league'] == league_code)
                ],
                is_team=False
            )

            (team1_home, team1_away, _team1_opp_home, team1_opp_away,
             team2_home, team2_away, team2_opp_home, _team2_opp_away) = get_dataframes(
                self.df_dataset, season, round_num, local, away, league=league_code
            )

            # Actual match outcomes: position 0 is the training target, the
            # remaining positions are stored as extra columns.
            ck = get_ck(self.df_dataset, season, round_num, local, away, league=league_code)
            self.y.append(ck[0])
            for position, key in enumerate(self._TARGET_KEYS, start=1):
                dic_df[key] = (ck[position],)

            # Head to head over every season up to and including this one.
            seasons_so_far = self.lst_years[:self.lst_years.index(season) + 1]
            team1_h2h, team2_h2h = get_head_2_head(
                self.df_dataset, local, away, seasons=seasons_so_far, league=league_code
            )

            # PPP of both teams, as returned by get_team_ppp.
            local_ppp = get_team_ppp(self.df_dataset, local, season, round_num, league=league_code)
            away_ppp = get_team_ppp(self.df_dataset, away, season, round_num, league=league_code)

            dic_df['ppp_local'] = (local_ppp,)
            dic_df['ppp_away'] = (away_ppp,)
            dic_df['ppp_difference'] = (local_ppp - away_ppp,)
            dic_df['round'] = (self._round_bucket(round_num),)

            # Feature groups carrying every metric: (key, rows, recent form only?).
            advanced_groups = (
                ('lst_team1_home_form', team1_home, True),
                ('lst_team1_home_general', team1_home, False),
                ('lst_team1_away_form', team1_away, True),
                ('lst_team1_away_general', team1_away, False),
                ('lst_team2_home_form', team2_home, True),
                ('lst_team2_home_general', team2_home, False),
                ('lst_team2_away_form', team2_away, True),
                ('lst_team2_away_general', team2_away, False),
                ('lst_team1_h2h', team1_h2h, False),
                ('lst_team2_h2h', team2_h2h, False),
            )
            for key, frame, is_form in advanced_groups:
                dic_df[key] = self._create_line(frame, lst_avg, is_form, True, use_advanced=True)

            # Opponent groups only keep the leading basic metrics.
            dic_df['lst_team1_opp_away'] = self._create_line(
                team1_opp_away, lst_avg, False, True, use_advanced=False
            )
            dic_df['lst_team2_opp_home'] = self._create_line(
                team2_opp_home, lst_avg, False, True, use_advanced=False
            )

            if self.USE_ONE_HOT_ENCODING:
                for code in self._LEAGUE_CODES:
                    dic_df[f'league_{code}'] = (1 if league_code == code else 0,)

            # Flatten the dict into one row and derive the column names in the
            # same order (the names are identical for every match).
            lst_features_values = []
            self.lst_features_values = []

            for key, values in dic_df.items():
                lst_features_values.extend(list(values))

                if key in self._SCALAR_KEYS or key.startswith('league_'):
                    self.lst_features_values.append(key)
                elif key in self._OPPONENT_KEYS:
                    self.lst_features_values.extend(f"{key}_{col}" for col in self.lst_base_original)
                else:
                    self.lst_features_values.extend(f"{key}_{col}" for col in self.lst_base_advanced)

            self.lst_data.append(lst_features_values)

        print("Dataset processed")

    def clean_and_ouput_dataset(self):
        """Drop rows with missing features or target and write the final CSV."""
        import numpy as np

        self.df_data = pd.DataFrame(data=self.lst_data, columns=self.lst_features_values)

        print(f"\n✅ PROCESAMIENTO COMPLETADO:")
        print(f" Shape inicial: {self.df_data.shape}")
        print(f" Total partidos: {len(self.df_data)}")
        print(f" Features totales: {self.df_data.shape[1]}")

        # ===========================
        # NULL CLEANUP
        # ===========================

        print(f"\n🧹 LIMPIANDO DATOS NULOS...")

        y_array = np.array(self.y).flatten()

        # pd.isna is used instead of np.isnan so that a target list holding
        # None (object dtype) does not raise.
        nulos_antes_X = self.df_data.isnull().sum().sum()
        nulos_antes_y = int(pd.isna(y_array).sum())

        print(f" Nulos en X (antes): {nulos_antes_X}")
        print(f" Nulos en Y (antes): {nulos_antes_y}")

        mask_valid_X = ~self.df_data.isnull().any(axis=1).to_numpy(dtype=bool)
        mask_valid_y = ~pd.isna(y_array)
        mask_combined = mask_valid_X & mask_valid_y

        self.df_data = self.df_data[mask_combined].reset_index(drop=True)
        y_array = y_array[mask_combined]

        print(f"\n✅ LIMPIEZA COMPLETADA:")
        print(f" Nulos en X (después): {self.df_data.isnull().sum().sum()}")
        print(f" Nulos en Y (después): {int(pd.isna(y_array).sum())}")
        print(f" Filas eliminadas: {len(mask_combined) - mask_combined.sum()}")
        print(f" Shape final: {self.df_data.shape}")

        # ===========================
        # FINAL CHECK
        # ===========================

        columns = list(self.df_data.columns)
        n_var_ck = len([c for c in columns if 'var_ck' in c])
        n_advanced = len([c for c in columns if any(m in c for m in ['sh_accuracy', 'offensive_index'])])
        n_opponent = len([c for c in columns if 'opp' in c])

        print(f"\n🔍 VERIFICACIÓN DE NUEVAS FEATURES:")
        print(f" ✅ Features con 'var_ck': {n_var_ck}")
        print(f" ✅ Features con métricas avanzadas: {n_advanced}")
        print(f" ✅ Features de oponentes (8 valores): {n_opponent}")

        print("\n" + "=" * 80)
        print("✅ PROCESO COMPLETADO - DATOS LISTOS PARA ENTRENAMIENTO")
        print("=" * 80)

        self.y = y_array.tolist()

        self.df_data["y"] = self.y

        output_path = r"dataset/processed/dataset_processed.csv"
        # Create the target folder on first run instead of failing in to_csv.
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        self.df_data.to_csv(output_path,index=False)
        print("Dataset")
709
+
710
+ #a = PROCESS_DATA(True)
711
+