daniel-saed committed on
Commit
c2aaace
·
verified ·
1 Parent(s): 315039f

Upload 21 files

Browse files
Dockerfile_STREAMLIT ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.11-slim
2
+
3
+ # Establecer directorio de trabajo
4
+ WORKDIR /app
5
+
6
+ # Copiar requirements
7
+ COPY requirements.txt .
8
+
9
+ # Instalar dependencias
10
+ RUN pip install --no-cache-dir -r requirements.txt
11
+
12
+ # Copiar todo el código
13
+ COPY . .
14
+
15
+ # Exponer el puerto que usa Hugging Face Spaces
16
+ EXPOSE 7860
17
+
18
+ # ✅ COMANDO PARA FASTAPI EN HUGGING FACE SPACES
19
+ CMD ["uvicorn", "src.api.api:app", "--host", "0.0.0.0", "--port", "7860"]
requirements.txt ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Core Data Processing
2
+ pandas>=2.0.0
3
+ numpy>=1.24.0
4
+
5
+ # Machine Learning
6
+ scikit-learn>=1.3.0
7
+ xgboost>=2.0.0
8
+
9
+ # Statistics
10
+ scipy>=1.11.0
11
+
12
+ # Data Collection
13
+ soccerdata>=1.4.0
14
+
15
+ # Experiment Tracking & Model Management
16
+ mlflow>=2.8.0
17
+
18
+ # Model Persistence
19
+ joblib>=1.3.0
20
+
21
+ fastapi>=0.115.4
22
+
23
+ # Security
24
+ python-dotenv>=1.0.0
25
+
26
+ # Model
27
+ joblib>=1.3.0
28
+
29
+ streamlit>=1.28.0
30
+
31
+ plotly
32
+ requests
src/api/__init__.py ADDED
File without changes
src/api/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (162 Bytes). View file
 
src/api/__pycache__/api.cpython-311.pyc ADDED
Binary file (7.16 kB). View file
 
src/api/__pycache__/load.cpython-311.pyc ADDED
Binary file (48.8 kB). View file
 
src/api/api.py ADDED
@@ -0,0 +1,191 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ===========================
2
+ # SISTEMA DE PREDICCIÓN DE CORNERS - OPTIMIZADO PARA APUESTAS (VERSIÓN COMPLETA)
3
+ # ===========================
4
+
5
+ import numpy as np
6
+ import pandas as pd
7
+ import os
8
+ from fastapi.responses import JSONResponse
9
+ from fastapi import Depends, FastAPI, HTTPException
10
+ from fastapi.security.api_key import APIKeyHeader
11
+ from fastapi import Security
12
+ from fastapi.responses import JSONResponse
13
+ from dotenv import load_dotenv
14
+ from src.api.load import USE_MODEL
15
+ #from load import USE_MODEL
16
+
17
+ load_dotenv()
18
+
19
+ model = USE_MODEL()
20
+
21
+ app = FastAPI()
22
+
23
+ # ===========================
24
+ # CONFIGURACIÓN API KEY
25
+ # ===========================
26
+
27
+ API_KEY = os.getenv("API_KEY") # ⚠️ CÁMBIALA POR UNA SEGURA
28
+ api_key_header = APIKeyHeader(name="X-API-Key", auto_error=False)
29
+
30
async def get_api_key(api_key: str = Security(api_key_header)):
    """Validate the ``X-API-Key`` request header against the configured key.

    Args:
        api_key: Value of the ``X-API-Key`` header as extracted by
            ``api_key_header``; ``None`` when the header is absent, because
            the header scheme is declared with ``auto_error=False``.

    Returns:
        The API key that was presented, once it has been accepted.

    Raises:
        HTTPException: 401 when the header is missing, when no ``API_KEY`` is
            configured on the server, or when the two values differ.
    """
    # Local import keeps this block self-contained (the module does not
    # import ``secrets`` at the top level).
    import secrets

    # Fail closed. Without this guard an unset API_KEY (None) compared with a
    # missing header (None) would be "equal" and every request would pass.
    if not API_KEY or not api_key:
        raise HTTPException(
            status_code=401,
            detail="API Key inválida o faltante"
        )

    # Constant-time comparison; bytes are used because compare_digest only
    # accepts ASCII-only ``str`` arguments.
    if not secrets.compare_digest(api_key.encode("utf-8"), API_KEY.encode("utf-8")):
        raise HTTPException(
            status_code=401,
            detail="API Key inválida o faltante"
        )

    return api_key
38
+
39
+ # ===========================
40
+ # HELPER: CONVERTIR NUMPY/PANDAS A TIPOS NATIVOS
41
+ # ===========================
42
def convert_to_native(val):
    """Recursively convert NumPy/pandas values into JSON-friendly Python types.

    Args:
        val: Any value; NumPy scalars and arrays, pandas ``Series`` and
            ``DataFrame`` objects, dicts, lists and tuples are converted
            recursively. Anything else is returned unchanged.

    Returns:
        ``bool``/``int``/``float`` for NumPy scalars, lists for arrays, lists
        and tuples, dicts for dicts and ``Series``, a list of row dicts for a
        ``DataFrame``, and ``None`` for missing or non-finite numbers
        (NaN, +/-inf, ``pd.NA``, ``pd.NaT``).
    """
    # Local import keeps this block self-contained (``math`` is not imported
    # at module level).
    import math

    if isinstance(val, np.bool_):
        return bool(val)
    if isinstance(val, np.integer):
        return int(val)
    if isinstance(val, (float, np.floating)):
        # NaN and infinities are not valid JSON numbers, so map them to null
        # instead of letting the response serializer fail.
        as_float = float(val)
        return as_float if math.isfinite(as_float) else None
    if isinstance(val, np.ndarray):
        return [convert_to_native(item) for item in val.tolist()]
    if isinstance(val, dict):
        # NumPy scalar keys are unwrapped too; json cannot encode them.
        return {
            (key.item() if isinstance(key, np.generic) else key): convert_to_native(value)
            for key, value in val.items()
        }
    if isinstance(val, (list, tuple)):
        return [convert_to_native(item) for item in val]
    if isinstance(val, pd.Series):
        return convert_to_native(val.to_dict())
    if isinstance(val, pd.DataFrame):
        return convert_to_native(val.to_dict(orient='records'))

    try:
        missing = pd.isna(val)
    except (TypeError, ValueError):
        return val
    # pd.isna returns an array for array-like inputs; only a scalar answer
    # can be used as a truth value.
    if isinstance(missing, (bool, np.bool_)) and missing:
        return None
    return val
62
+
63
+
64
+
65
+
66
+ # ===========================
67
+ # ENDPOINTS
68
+ # ===========================
69
+
70
@app.get("/")
def read_root():
    """Root endpoint: return static descriptive information about the API."""
    # Route -> human-readable description, as shown to API consumers.
    endpoint_catalog = {
        "/": "Información de la API",
        "/items/": "Predicción de corners (requiere API Key)",
        "/health": "Estado de salud",
    }

    info = {
        "api": "Corners Prediction API",
        "version": "1.0.0",
        "status": "active",
    }
    info["endpoints"] = endpoint_catalog
    info["auth"] = "Requiere header: X-API-Key"
    return info
84
+
85
+
86
+
87
@app.get("/items/")
def predict_corners(
    local: str,
    visitante: str,
    jornada: int,
    league_code: str,
    temporada: str = "2526",
    api_key: str = Depends(get_api_key),  # endpoint is protected by the API key
):
    """Predict corners for a football match.

    Args:
        local: Home team name (required).
        visitante: Away team name (required).
        jornada: Matchday number (required, minimum 1).
        league_code: League code (one of ESP, GER, FRA, ITA, ENG, NED, POR, BEL).
        temporada: Season in AABB format (default ``"2526"``).
        api_key: Injected by the ``get_api_key`` dependency; not used directly.

    Returns:
        A ``JSONResponse`` with the model output plus a ``metadata`` entry on
        success (200), or with error details on an unexpected failure (500).

    Raises:
        HTTPException: 400 for invalid parameters, 422 when the model result
            carries an ``error`` entry.

    Example:
        GET /items/?local=Barcelona&visitante=Real%20Madrid&jornada=15&league_code=ESP&temporada=2526
        Headers: X-API-Key: <your key>
    """
    # ---- Input validation -------------------------------------------------
    if not (local and visitante):
        raise HTTPException(
            status_code=400,
            detail="Los parámetros 'local' y 'visitante' son obligatorios",
        )

    if jornada < 1:
        raise HTTPException(
            status_code=400,
            detail="La jornada debe ser mayor o igual a 1",
        )

    valid_leagues = ("ESP", "GER", "FRA", "ITA", "ENG", "NED", "POR", "BEL")
    if league_code not in valid_leagues:
        raise HTTPException(
            status_code=400,
            detail=f"Liga inválida. Ligas válidas: {', '.join(valid_leagues)}",
        )

    # ---- Prediction -------------------------------------------------------
    try:
        resultado = model.consume_model_single(
            local=local,
            visitante=visitante,
            jornada=jornada,
            temporada=temporada,
            league_code=league_code,
        )

        # The model reports failures through an "error" entry in its result.
        if resultado.get("error"):
            raise HTTPException(
                status_code=422,
                detail=f"Error en predicción: {resultado['error']}",
            )

        # NumPy/pandas values must become native types before serialization.
        payload = convert_to_native(resultado)
        metadata = {
            "api_version": "1.0.0",
            "model_version": "v4",
            "timestamp": pd.Timestamp.now().isoformat(),
        }
        payload["metadata"] = metadata

        return JSONResponse(status_code=200, content=payload)

    except HTTPException:
        # HTTP errors raised above must reach the client untouched.
        raise

    except Exception as e:
        # Any other failure is reported as a 500 with a description of the
        # exception; the traceback is only included in debug mode.
        import traceback

        trace = traceback.format_exc() if app.debug else None
        error_detail = {
            "error": str(e),
            "type": type(e).__name__,
            "traceback": trace,
        }
        return JSONResponse(status_code=500, content=error_detail)
src/api/load.py ADDED
@@ -0,0 +1,1208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ===========================
2
+ # SISTEMA DE PREDICCIÓN DE CORNERS - OPTIMIZADO PARA APUESTAS (VERSIÓN COMPLETA)
3
+ # ===========================
4
+
5
+ import requests
6
+ import tempfile
7
+ import numpy as np
8
+ import pandas as pd
9
+ import joblib
10
+ from scipy.stats import poisson
11
+ from scipy import stats
12
+ import os
13
+ import sys
14
+ from src.process_data.process_dataset import get_dataframes,get_head_2_head,get_points_from_result,get_team_ppp,get_ppp_difference,get_average
15
+ #from process_data.process_dataset import get_dataframes,get_head_2_head,get_points_from_result,get_team_ppp,get_ppp_difference,get_average
16
+ #project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))
17
+ #sys.path.insert(0, project_root)
18
+ # ===========================
19
+ # 1. FUNCIONES FIABILIDAD
20
+ # ===========================
21
+
22
def analizar_fiabilidad_equipos(df_database, temporada="2526", min_partidos=5):
    """Score how predictable each team's corner counts are in one season.

    For every team with at least ``min_partidos`` rows (as ``team``) in the
    given season, the ``'Pass Types_CK'`` series is summarised with
    variability, consistency, trend, outlier and range metrics, which are
    combined into a 0-100 ``Score_Fiabilidad`` and a textual level.

    Args:
        df_database: DataFrame with at least the columns ``season``,
            ``team``, ``opponent`` and ``'Pass Types_CK'``.
        temporada: Season value to filter ``season`` on.
        min_partidos: Minimum number of matches a team needs to be analysed.

    Returns:
        DataFrame with one row per analysed team, sorted by
        ``Score_Fiabilidad`` descending. When no team qualifies, an empty
        DataFrame with the same columns is returned.
    """
    columnas = [
        'Equipo', 'Partidos',
        'Media_CK', 'Mediana_CK', 'Std_CK', 'CV_%',
        'Pct_Cerca_Media', 'Cambios_Bruscos_%', 'IQR',
        'Rango', 'Rango_Norm', 'Min', 'Max',
        'Outliers', 'Pct_Outliers', 'Max_ZScore',
        'Tendencia_Slope', 'Autocorr',
        'Score_Fiabilidad', 'Nivel', 'Color',
    ]
    # (minimum score, label, color), checked from best to worst.
    niveles = [
        (70, "EXCELENTE ⭐⭐⭐", "#27ae60"),
        (55, "BUENO ✅", "#2ecc71"),
        (40, "ACEPTABLE 🟡", "#f39c12"),
        (25, "REGULAR ⚠️", "#e67e22"),
    ]

    df_temp = df_database[df_database['season'] == temporada].copy()
    resultados = []
    equipos = pd.concat([df_temp['team'], df_temp['opponent']]).unique()

    for equipo in equipos:
        partidos_equipo = df_temp[df_temp['team'] == equipo]
        n_partidos = len(partidos_equipo)

        # Teams that only appear as opponents have no rows of their own.
        if n_partidos == 0 or n_partidos < min_partidos:
            continue

        ck_sacados = partidos_equipo['Pass Types_CK'].values

        # --- 1. Variability ---
        media = ck_sacados.mean()
        std = ck_sacados.std()
        cv = (std / media * 100) if media > 0 else 0

        # --- 2. Consistency ---
        # Share of matches within +/-2 corners of the mean.
        cerca_media = np.sum(np.abs(ck_sacados - media) <= 2) / n_partidos * 100

        # Share of consecutive matches whose corner count jumps by more than 4.
        cambios_bruscos = np.sum(np.abs(np.diff(ck_sacados)) > 4)
        if n_partidos > 1:
            pct_cambios_bruscos = cambios_bruscos / (n_partidos - 1) * 100
        else:
            pct_cambios_bruscos = 0.0  # no consecutive pair to compare

        q1, q2, q3 = np.percentile(ck_sacados, [25, 50, 75])
        iqr = q3 - q1

        # --- 3. Trend ---
        if n_partidos >= 2:
            jornadas = np.arange(n_partidos)
            slope = stats.linregress(jornadas, ck_sacados)[0]
        else:
            slope = 0.0  # a regression needs at least two points

        # Lag-1 autocorrelation; undefined (NaN) when either half is constant,
        # so fall back to 0 in that case.
        if (
            n_partidos > 2
            and np.std(ck_sacados[:-1]) > 0
            and np.std(ck_sacados[1:]) > 0
        ):
            autocorr = np.corrcoef(ck_sacados[:-1], ck_sacados[1:])[0, 1]
        else:
            autocorr = 0

        # --- 4. Outliers (IQR fences) ---
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr
        outliers = np.sum((ck_sacados < lower_bound) | (ck_sacados > upper_bound))
        pct_outliers = outliers / n_partidos * 100

        # z-scores divide by std, so a constant series has no meaningful value.
        max_z = np.abs(stats.zscore(ck_sacados)).max() if std > 0 else 0.0

        # --- 5. Range ---
        rango = ck_sacados.max() - ck_sacados.min()
        rango_normalizado = rango / media if media > 0 else 0

        # --- 6. Global reliability score (each component is 0-100) ---
        score_cv = max(0, 100 - cv * 2)
        score_consistencia = cerca_media
        score_cambios = max(0, 100 - pct_cambios_bruscos * 2)
        score_outliers = max(0, 100 - pct_outliers * 3)
        score_iqr = max(0, 100 - iqr * 10)

        score_fiabilidad = (
            score_cv * 0.25 +
            score_consistencia * 0.30 +
            score_cambios * 0.20 +
            score_outliers * 0.15 +
            score_iqr * 0.10
        )

        # --- 7. Level classification ---
        nivel, color = "EVITAR ⛔", "#e74c3c"
        for umbral, etiqueta, tono in niveles:
            if score_fiabilidad >= umbral:
                nivel, color = etiqueta, tono
                break

        resultados.append({
            'Equipo': equipo,
            'Partidos': n_partidos,

            'Media_CK': round(media, 2),
            'Mediana_CK': round(q2, 2),
            'Std_CK': round(std, 2),
            'CV_%': round(cv, 1),

            'Pct_Cerca_Media': round(cerca_media, 1),
            'Cambios_Bruscos_%': round(pct_cambios_bruscos, 1),
            'IQR': round(iqr, 2),

            'Rango': int(rango),
            'Rango_Norm': round(rango_normalizado, 2),
            'Min': int(ck_sacados.min()),
            'Max': int(ck_sacados.max()),

            'Outliers': int(outliers),
            'Pct_Outliers': round(pct_outliers, 1),
            'Max_ZScore': round(max_z, 2),

            'Tendencia_Slope': round(slope, 3),
            'Autocorr': round(autocorr, 3),

            'Score_Fiabilidad': round(score_fiabilidad, 1),
            'Nivel': nivel,
            'Color': color
        })

    # Passing the column list keeps the schema stable even with zero rows, so
    # the sort below cannot fail with a KeyError on an empty result.
    df_resultado = pd.DataFrame(resultados, columns=columnas)
    df_resultado = df_resultado.sort_values('Score_Fiabilidad', ascending=False)

    return df_resultado
180
+
181
def mostrar_analisis_fiabilidad(df_analisis, top_n=10):
    """Print a console report of the reliability analysis.

    The report lists the first ``top_n`` rows of ``df_analisis`` (most
    reliable), the last ``top_n`` rows (least reliable), the count of teams
    per level and summary statistics of the score.

    Args:
        df_analisis: DataFrame with the columns read below (``Equipo``,
            ``Nivel``, ``Score_Fiabilidad``, ``Media_CK``, ...).
        top_n: Number of rows shown in each ranking.
    """
    linea_doble = "=" * 120
    linea_simple = "-" * 120

    def _cabecera_equipo(fila):
        # Two lines shared by both rankings: identity/score and central stats.
        print(f"\n{fila['Equipo']:25s} | {fila['Nivel']:20s} | Score: {fila['Score_Fiabilidad']:.1f}")
        print(f" 📊 Media: {fila['Media_CK']:.1f} | Mediana: {fila['Mediana_CK']:.1f} | CV: {fila['CV_%']:.1f}%")

    def _alertas_equipo(fila):
        print(f" ⚠️ Cambios bruscos: {fila['Cambios_Bruscos_%']:.1f}% | Outliers: {fila['Pct_Outliers']:.1f}%")

    print("\n" + linea_doble)
    print("🎯 ANÁLISIS DE FIABILIDAD PARA APUESTAS - CORNERS")
    print(linea_doble)

    # Most reliable teams
    print(f"\n⭐ TOP {top_n} EQUIPOS MÁS FIABLES")
    print(linea_simple)
    for _, fila in df_analisis.head(top_n).iterrows():
        _cabecera_equipo(fila)
        print(f" ✅ {fila['Pct_Cerca_Media']:.1f}% cerca de media | IQR: {fila['IQR']:.1f}")
        _alertas_equipo(fila)
        print(f" 📈 Rango: {fila['Min']}-{fila['Max']} ({fila['Rango']} corners)")

    # Least reliable teams
    print(f"\n\n⛔ TOP {top_n} EQUIPOS MENOS FIABLES")
    print(linea_simple)
    for _, fila in df_analisis.tail(top_n).iterrows():
        _cabecera_equipo(fila)
        print(f" ❌ Solo {fila['Pct_Cerca_Media']:.1f}% cerca de media | IQR: {fila['IQR']:.1f}")
        _alertas_equipo(fila)

    # Overall statistics
    print(f"\n\n📊 DISTRIBUCIÓN POR NIVEL DE FIABILIDAD")
    print(linea_simple)
    print(df_analisis['Nivel'].value_counts())

    scores = df_analisis['Score_Fiabilidad']
    print(f"\n📈 ESTADÍSTICAS DE SCORE:")
    print(f" Media: {scores.mean():.1f}")
    print(f" Mediana: {scores.median():.1f}")
    print(f" Score máximo: {scores.max():.1f}")
    print(f" Score mínimo: {scores.min():.1f}")
225
+
226
def obtener_fiabilidad_partido(local, visitante, df_analisis):
    """Rate how reliable a single fixture is, from both teams' scores.

    Args:
        local: Home team name, matched against the ``Equipo`` column.
        visitante: Away team name, matched against the ``Equipo`` column.
        df_analisis: Per-team reliability table (``Equipo``,
            ``Score_Fiabilidad``, ``Nivel``, ``CV_%``, ``Pct_Cerca_Media``).

    Returns:
        A dict with the fixture rating, both team scores and their average,
        the team levels, a message and extra per-team figures. When either
        team is missing from the table, a shorter dict with
        ``fiabilidad='DESCONOCIDO'`` and ``score=0`` is returned instead.
    """
    fila_local = df_analisis[df_analisis['Equipo'] == local]
    fila_away = df_analisis[df_analisis['Equipo'] == visitante]

    if fila_local.empty or fila_away.empty:
        return {
            'fiabilidad': 'DESCONOCIDO',
            'score': 0,
            'mensaje': '⚠️ Datos insuficientes'
        }

    def _primero(tabla, columna):
        # First matching row's value for the given column.
        return tabla[columna].values[0]

    score_local = _primero(fila_local, 'Score_Fiabilidad')
    score_away = _primero(fila_away, 'Score_Fiabilidad')
    score_promedio = (score_local + score_away) / 2

    # (minimum average score, rating, message), checked from best to worst.
    escalones = (
        (65, "MUY ALTA ⭐⭐⭐", "✅ EXCELENTE PARTIDO PARA APOSTAR"),
        (50, "ALTA ✅", "✅ BUEN PARTIDO PARA APOSTAR"),
        (35, "MEDIA 🟡", "🟡 APOSTAR CON PRECAUCIÓN"),
    )
    fiabilidad, mensaje = "BAJA ⛔", "⛔ EVITAR APUESTA"
    for minimo, etiqueta, texto in escalones:
        if score_promedio >= minimo:
            fiabilidad, mensaje = etiqueta, texto
            break

    return {
        'fiabilidad': fiabilidad,
        'score_local': score_local,
        'score_away': score_away,
        'score_promedio': score_promedio,
        'nivel_local': _primero(fila_local, 'Nivel'),
        'nivel_away': _primero(fila_away, 'Nivel'),
        'mensaje': mensaje,

        # Additional per-team figures
        'cv_local': _primero(fila_local, 'CV_%'),
        'cv_away': _primero(fila_away, 'CV_%'),
        'consistencia_local': _primero(fila_local, 'Pct_Cerca_Media'),
        'consistencia_away': _primero(fila_away, 'Pct_Cerca_Media')
    }
274
+
275
def calcular_probabilidades_poisson(lambda_pred, rango_inferior=5, rango_superior=5, lineas=None):
    """Compute exact, over and under probabilities from a Poisson model.

    Args:
        lambda_pred: Poisson rate (the predicted count).
        rango_inferior: How many integers below ``round(lambda_pred)`` get an
            exact probability (never going below 0).
        rango_superior: How many integers above ``round(lambda_pred)`` get an
            exact probability.
        lineas: Optional iterable of over/under lines. Defaults to
            ``7.5, 8.5, 9.5, 10.5, 11.5, 12.5``.

    Returns:
        Dict with three sub-dicts, all expressed in percent:
        ``'exactas'`` maps k -> P(X = k); ``'over'`` maps line -> P(X > line);
        ``'under'`` maps line -> P(X <= line).
    """
    if lineas is None:
        lineas = (7.5, 8.5, 9.5, 10.5, 11.5, 12.5)

    valor_central = int(round(lambda_pred))
    valores_analizar = range(
        max(0, valor_central - rango_inferior),
        valor_central + rango_superior + 1
    )

    probabilidades_exactas = {
        k: poisson.pmf(k, lambda_pred) * 100 for k in valores_analizar
    }

    # Over and under share the same lines and the same CDF value, so the CDF
    # is evaluated once per line.
    probabilidades_over = {}
    probabilidades_under = {}
    for linea in lineas:
        acumulada = poisson.cdf(linea, lambda_pred)
        probabilidades_over[linea] = (1 - acumulada) * 100
        probabilidades_under[linea] = acumulada * 100

    return {
        'exactas': probabilidades_exactas,
        'over': probabilidades_over,
        'under': probabilidades_under
    }
307
+
308
def clasificar_confianza(prob):
    """Map a probability (in percent) to a confidence label.

    Returns ``"ALTA ✅"`` from 66 upwards, ``"MEDIA ⚠️"`` from 55 upwards and
    ``"BAJA ❌"`` otherwise.
    """
    # Thresholds are checked from highest to lowest; the first one met wins.
    for umbral, etiqueta in ((66, "ALTA ✅"), (55, "MEDIA ⚠️")):
        if prob >= umbral:
            return etiqueta
    return "BAJA ❌"
316
+
317
+ '''
318
+ def get_dataframes(df, season, round_num, local, away, league=None):
319
+ """Retorna 8 DataFrames filtrados por equipo, venue y liga"""
320
+
321
+ season_round = (df['season'] == season) & (df['round'] < round_num)
322
+
323
+ if league is not None:
324
+ season_round = season_round & (df['league'] == league)
325
+
326
+ def filter_and_split(team_filter):
327
+ filtered = df[season_round & team_filter].copy()
328
+ home = filtered[filtered['venue'] == "Home"]
329
+ away = filtered[filtered['venue'] == "Away"]
330
+ return home, away
331
+
332
+ local_home, local_away = filter_and_split(df['team'] == local)
333
+ local_opp_home, local_opp_away = filter_and_split(df['opponent'] == local)
334
+
335
+ away_home, away_away = filter_and_split(df['team'] == away)
336
+ away_opp_home, away_opp_away = filter_and_split(df['opponent'] == away)
337
+
338
+ return (local_home, local_away, local_opp_home, local_opp_away,
339
+ away_home, away_away, away_opp_home, away_opp_away)
340
+
341
+ def get_head_2_head(df, local, away, seasons=None, league=None):
342
+ """Obtiene últimos 3 enfrentamientos directos"""
343
+ if seasons is None:
344
+ seasons = []
345
+
346
+ df_filtered = df[df['season'].isin(seasons)] if seasons else df
347
+
348
+ if league is not None:
349
+ df_filtered = df_filtered[df_filtered['league'] == league]
350
+
351
+ local_h2h = df_filtered[(df_filtered['team'] == local) & (df_filtered['opponent'] == away)]
352
+ away_h2h = df_filtered[(df_filtered['team'] == away) & (df_filtered['opponent'] == local)]
353
+
354
+ if len(local_h2h) < 4:
355
+ return local_h2h.tail(2), away_h2h.tail(2)
356
+
357
+ return local_h2h.tail(3), away_h2h.tail(3)
358
+
359
+ def get_average(df, is_team=False, lst_avg=None):
360
+ """Calcula promedios de estadísticas (VERSIÓN COMPLETA)"""
361
+
362
+ if len(df) == 0:
363
+ if is_team:
364
+ # ✅ Retornar 23 valores (métricas avanzadas)
365
+ return (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
366
+ return (0, 0, 0, 0, 0, 0, 0, 0, 0)
367
+
368
+ if is_team:
369
+ # ===========================
370
+ # ESTADÍSTICAS BÁSICAS (NORMALIZADAS)
371
+ # ===========================
372
+ avg_cross = (df['Performance_Crs'].sum() / len(df)) - lst_avg[3]
373
+ avg_att_3rd = (df['Touches_Att 3rd'].sum() / len(df)) - lst_avg[4]
374
+ avg_sca = (df['SCA Types_SCA'].sum() / len(df)) - lst_avg[2]
375
+ avg_xg = (df['Expected_xG'].sum() / len(df)) - lst_avg[1]
376
+
377
+ # ✅ VARIANZA DE CORNERS
378
+ var_ck = df['Pass Types_CK'].var() if len(df) > 1 else 0
379
+ avg_ck = (df['Pass Types_CK'].sum() / len(df)) - lst_avg[8]
380
+
381
+ avg_poss = (df['Poss'].sum() / len(df)) - 50
382
+ avg_gf = (df['GF'].sum() / len(df)) - lst_avg[5]
383
+ avg_ga = (df['GA'].sum() / len(df)) - lst_avg[6]
384
+
385
+ # ===========================
386
+ # MÉTRICAS OFENSIVAS AVANZADAS
387
+ # ===========================
388
+ total_sh = df['Standard_Sh'].sum()
389
+ sh_accuracy = (df['Standard_SoT'].sum() / total_sh) if total_sh > 0 else 0
390
+ xg_shot = (df['Expected_xG'].sum() / total_sh) if total_sh > 0 else 0
391
+
392
+ total_touches = df['Touches_Touches'].sum()
393
+ attacking_presence = (df['Touches_Att 3rd'].sum() / total_touches) if total_touches > 0 else 0
394
+
395
+ total_poss = df['Poss'].sum()
396
+ possession_shot = (total_sh / total_poss) if total_poss > 0 else 0
397
+
398
+ # ===========================
399
+ # MÉTRICAS DE CREACIÓN
400
+ # ===========================
401
+ total_passes = df['Total_Att'].sum()
402
+ progressive_pass_ratio = (df['PrgP'].sum() / total_passes) if total_passes > 0 else 0
403
+ final_third_involvement = (df['1/3'].sum() / total_passes) if total_passes > 0 else 0
404
+
405
+ total_sca = df['SCA Types_SCA'].sum()
406
+ assist_sca = (df['Ast'].sum() / total_sca) if total_sca > 0 else 0
407
+ creative_efficiency = (total_sca / total_poss) if total_poss > 0 else 0
408
+
409
+ # ===========================
410
+ # MÉTRICAS DEFENSIVAS
411
+ # ===========================
412
+ total_tackles = df['Tackles_Tkl'].sum()
413
+ high_press_intensity = (df['Tackles_Att 3rd'].sum() / total_tackles) if total_tackles > 0 else 0
414
+ interception_tackle = (df['Int'].sum() / total_tackles) if total_tackles > 0 else 0
415
+
416
+ total_defensive_actions = total_tackles + df['Int'].sum()
417
+ clearance_ratio = (df['Clr'].sum() / total_defensive_actions) if total_defensive_actions > 0 else 0
418
+
419
+ # ===========================
420
+ # MÉTRICAS DE POSESIÓN
421
+ # ===========================
422
+ total_carries = df['Carries_Carries'].sum()
423
+ progressive_carry_ratio = (df['Carries_PrgC'].sum() / total_carries) if total_carries > 0 else 0
424
+
425
+ total_prog_passes = df['PrgP'].sum()
426
+ carry_pass_balance = (df['Carries_PrgC'].sum() / total_prog_passes) if total_prog_passes > 0 else 0
427
+
428
+ # ===========================
429
+ # ÍNDICES COMPUESTOS
430
+ # ===========================
431
+ avg_gf_raw = df['GF'].mean()
432
+ avg_xg_raw = df['Expected_xG'].mean()
433
+ avg_sot = df['Standard_SoT'].mean()
434
+ avg_sh = df['Standard_Sh'].mean()
435
+ offensive_index = (avg_gf_raw + avg_xg_raw) * (avg_sot / avg_sh) if avg_sh > 0 else 0
436
+
437
+ avg_prgp = df['PrgP'].mean()
438
+ avg_prgc = df['Carries_PrgC'].mean()
439
+ avg_poss_raw = df['Poss'].mean()
440
+ transition_index = ((avg_prgp + avg_prgc) / avg_poss_raw) if avg_poss_raw > 0 else 0
441
+
442
+ # ✅ RETORNAR 23 VALORES
443
+ return (
444
+ avg_ck, # 0
445
+ var_ck, # 1 - ✅ NUEVO
446
+ avg_xg, # 2
447
+ avg_sca, # 3
448
+ avg_cross, # 4
449
+ avg_poss, # 5
450
+ avg_att_3rd, # 6
451
+ avg_gf, # 7
452
+ avg_ga, # 8
453
+ sh_accuracy, # 9
454
+ xg_shot, # 10
455
+ attacking_presence, # 11
456
+ possession_shot, # 12
457
+ progressive_pass_ratio, # 13
458
+ final_third_involvement, # 14
459
+ assist_sca, # 15
460
+ creative_efficiency, # 16
461
+ high_press_intensity, # 17
462
+ interception_tackle, # 18
463
+ clearance_ratio, # 19
464
+ progressive_carry_ratio, # 20
465
+ carry_pass_balance, # 21
466
+ offensive_index, # 22
467
+ transition_index # 23
468
+ )
469
+
470
+ # ===========================
471
+ # PROMEDIOS DE LIGA (is_team=False)
472
+ # ===========================
473
+ avg_cross = df['Performance_Crs'].mean()
474
+ avg_att_3rd = df['Touches_Att 3rd'].mean()
475
+ avg_sca = df['SCA Types_SCA'].mean()
476
+ avg_xg = df['Expected_xG'].mean()
477
+ var_ck = df['Pass Types_CK'].var() if len(df) > 1 else 0
478
+ avg_ck = df['Pass Types_CK'].mean()
479
+ avg_gf = df['GF'].mean()
480
+ avg_ga = df['GA'].mean()
481
+ avg_sh = df['Standard_Sh'].mean() if 'Standard_Sh' in df.columns else 0
482
+
483
+ return (
484
+ var_ck, # 0
485
+ avg_xg, # 1
486
+ avg_sca, # 2
487
+ avg_cross, # 3
488
+ avg_att_3rd, # 4
489
+ avg_gf, # 5
490
+ avg_ga, # 6
491
+ avg_sh, # 7
492
+ avg_ck # 8
493
+ )
494
+
495
+ def get_points_from_result(result):
496
+ """Convierte resultado (W/D/L) a puntos"""
497
+ if result == 'W':
498
+ return 3
499
+ elif result == 'D':
500
+ return 1
501
+ else:
502
+ return 0
503
+
504
+ def get_team_ppp(df, team, season, round_num, league=None):
505
+ """Calcula puntos por partido (PPP) de un equipo"""
506
+ team_matches = df[
507
+ (df['team'] == team) &
508
+ (df['season'] == season) &
509
+ (df['round'] < round_num)
510
+ ]
511
+
512
+ if league is not None:
513
+ team_matches = team_matches[team_matches['league'] == league]
514
+
515
+ if len(team_matches) == 0:
516
+ return 0.0
517
+
518
+ total_points = team_matches['result'].apply(get_points_from_result).sum()
519
+ ppp = total_points / len(team_matches)
520
+
521
+ return ppp
522
+
523
+ def get_ppp_difference(df, local, away, season, round_num, league=None):
524
+ """Calcula diferencia de PPP entre local y visitante"""
525
+ local_ppp = get_team_ppp(df, local, season, round_num, league)
526
+ away_ppp = get_team_ppp(df, away, season, round_num, league)
527
+ return local_ppp - away_ppp
528
+
529
+ '''
530
+
531
+ def predecir_corners(local, visitante, jornada, temporada="2526", league_code="ESP",df_database=pd.DataFrame(),xgb_model="",scaler="",lst_years=[]):
532
+ """
533
+ Predice corners totales con análisis completo para apuestas
534
+
535
+ Args:
536
+ local: Equipo local
537
+ visitante: Equipo visitante
538
+ jornada: Número de jornada
539
+ temporada: Temporada (formato "2526")
540
+ league_code: Código de liga ("ESP", "GER", "FRA", "ITA", "NED")
541
+ """
542
+
543
+ print(f"\n{'='*80}")
544
+ print(f"🏟️ {local} vs {visitante}")
545
+ print(f"📅 Temporada {temporada} | Jornada {jornada} | Liga: {league_code}")
546
+ print(f"{'='*80}")
547
+
548
+ if jornada < 5:
549
+ return {
550
+ "error": "❌ Se necesitan al menos 5 jornadas previas",
551
+ "prediccion": None
552
+ }
553
+
554
+ try:
555
+ # ===========================
556
+ # EXTRAER FEATURES (igual que antes)
557
+ # ===========================
558
+
559
+ lst_avg = get_average(
560
+ df_database[
561
+ (df_database['season'] == temporada) &
562
+ (df_database['round'] < jornada) &
563
+ (df_database['league'] == league_code)
564
+ ],
565
+ is_team=False
566
+ )
567
+
568
+ (team1_home, team1_away, team1_opp_home, team1_opp_away,
569
+ team2_home, team2_away, team2_opp_home, team2_opp_away) = get_dataframes(
570
+ df_database, temporada, jornada, local, visitante, league=league_code
571
+ )
572
+
573
+ index = lst_years.index(temporada)
574
+ result = lst_years[:index+1]
575
+ team1_h2h, team2_h2h = get_head_2_head(
576
+ df_database, local, visitante, seasons=result, league=league_code
577
+ )
578
+
579
+ local_ppp = get_team_ppp(df_database, local, temporada, jornada, league=league_code)
580
+ away_ppp = get_team_ppp(df_database, visitante, temporada, jornada, league=league_code)
581
+ ppp_diff = local_ppp - away_ppp
582
+
583
+ # ===========================
584
+ # CONSTRUIR DICCIONARIO DE FEATURES (igual que antes)
585
+ # ===========================
586
+
587
+ def create_line(df, is_form=True, is_team=False, use_advanced=True):
588
+ if is_form:
589
+ df = df[-6:]
590
+ if use_advanced:
591
+ return get_average(df, is_team, lst_avg)
592
+ else:
593
+ result = get_average(df, is_team, lst_avg)
594
+ return result[:9]
595
+
596
+ dic_features = {}
597
+
598
+ dic_features['ppp_local'] = (local_ppp,)
599
+ dic_features['ppp_away'] = (away_ppp,)
600
+ dic_features['ppp_difference'] = (ppp_diff,)
601
+
602
+ dic_features['lst_team1_home_form'] = create_line(team1_home, True, True, use_advanced=True)
603
+ dic_features['lst_team1_home_general'] = create_line(team1_home, False, True, use_advanced=True)
604
+ dic_features['lst_team1_away_form'] = create_line(team1_away, True, True, use_advanced=True)
605
+ dic_features['lst_team1_away_general'] = create_line(team1_away, False, True, use_advanced=True)
606
+
607
+ dic_features['lst_team2_home_form'] = create_line(team2_home, True, True, use_advanced=True)
608
+ dic_features['lst_team2_home_general'] = create_line(team2_home, False, True, use_advanced=True)
609
+ dic_features['lst_team2_away_form'] = create_line(team2_away, True, True, use_advanced=True)
610
+ dic_features['lst_team2_away_general'] = create_line(team2_away, False, True, use_advanced=True)
611
+
612
+ dic_features['lst_team1_h2h'] = create_line(team1_h2h, False, True, use_advanced=True)
613
+ dic_features['lst_team2_h2h'] = create_line(team2_h2h, False, True, use_advanced=True)
614
+
615
+ dic_features['lst_team1_opp_away'] = create_line(team1_opp_away, False, True, use_advanced=False)
616
+ dic_features['lst_team2_opp_home'] = create_line(team2_opp_home, False, True, use_advanced=False)
617
+
618
+ league_dummies = {
619
+ 'league_ESP': 1 if league_code == 'ESP' else 0,
620
+ 'league_GER': 1 if league_code == 'GER' else 0,
621
+ 'league_FRA': 1 if league_code == 'FRA' else 0,
622
+ 'league_ITA': 1 if league_code == 'ITA' else 0,
623
+ 'league_NED': 1 if league_code == 'NED' else 0,
624
+ 'league_ENG': 1 if league_code == 'ENG' else 0,
625
+ 'league_POR': 1 if league_code == 'POR' else 0,
626
+ 'league_BEL': 1 if league_code == 'BEL' else 0
627
+ }
628
+
629
+ for key, value in league_dummies.items():
630
+ dic_features[key] = (value,)
631
+
632
+ # ===========================
633
+ # CONSTRUIR VECTOR DE FEATURES
634
+ # ===========================
635
+
636
+ lst_base_advanced = [
637
+ "avg_ck", "var_ck", "xg", "sca", "cross", "poss", "att_3rd", "gf", "ga",
638
+ "sh_accuracy", "xg_shot", "attacking_presence", "possession_shot",
639
+ "progressive_pass_ratio", "final_third_involvement", "assist_sca", "creative_efficiency",
640
+ "high_press_intensity", "interception_tackle", "clearance_ratio",
641
+ "progressive_carry_ratio", "carry_pass_balance", "offensive_index", "transition_index"
642
+ ]
643
+
644
+ lst_base_original = [
645
+ "var_ck", "xg", "sca", "cross", "poss", "att_3rd", "gf", "ga", "avg_ck"
646
+ ]
647
+
648
+ lst_features_values = []
649
+ lst_features_names = []
650
+
651
+ for key in dic_features:
652
+ lst_features_values.extend(list(dic_features[key]))
653
+
654
+ if key in ['ppp_local', 'ppp_away', 'ppp_difference']:
655
+ lst_features_names.append(key)
656
+ elif key.startswith('league_'):
657
+ lst_features_names.append(key)
658
+ elif key in ['lst_team1_opp_away', 'lst_team2_opp_home']:
659
+ lst_features_names.extend([f"{key}_{col}" for col in lst_base_original])
660
+ else:
661
+ lst_features_names.extend([f"{key}_{col}" for col in lst_base_advanced])
662
+
663
+ df_input = pd.DataFrame([lst_features_values], columns=lst_features_names)
664
+
665
+ expected_features = scaler.feature_names_in_
666
+
667
+ if len(df_input.columns) != len(expected_features):
668
+ print(f"\n⚠️ ERROR: Número de features no coincide")
669
+ print(f" Esperadas: {len(expected_features)}")
670
+ print(f" Recibidas: {len(df_input.columns)}")
671
+ return {"error": "Desajuste de features", "prediccion": None}
672
+
673
+ df_input = df_input[expected_features]
674
+
675
+ X_input_scaled = pd.DataFrame(
676
+ scaler.transform(df_input),
677
+ columns=df_input.columns
678
+ )
679
+
680
+ # ===========================
681
+ # PREDICCIÓN
682
+ # ===========================
683
+
684
+ prediccion = xgb_model.predict(X_input_scaled)[0]
685
+
686
+ # ===========================
687
+ # ✅ ANÁLISIS PROBABILÍSTICO CON POISSON
688
+ # ===========================
689
+
690
+ analisis = calcular_probabilidades_poisson(prediccion, rango_inferior=5, rango_superior=5)
691
+
692
+ # ===========================
693
+ # ESTADÍSTICAS DETALLADAS
694
+ # ===========================
695
+
696
+ local_ck_home = team1_home['Pass Types_CK'].mean() if len(team1_home) > 0 else 0
697
+ local_xg_home = team1_home['Expected_xG'].mean() if len(team1_home) > 0 else 0
698
+ local_poss_home = team1_home['Poss'].mean() if len(team1_home) > 0 else 0
699
+
700
+ away_ck_away = team2_away['Pass Types_CK'].mean() if len(team2_away) > 0 else 0
701
+ away_xg_away = team2_away['Expected_xG'].mean() if len(team2_away) > 0 else 0
702
+ away_poss_away = team2_away['Poss'].mean() if len(team2_away) > 0 else 0
703
+
704
+ local_ck_received = team1_opp_home['Pass Types_CK'].mean() if len(team1_opp_home) > 0 else 0
705
+ away_ck_received = team2_opp_away['Pass Types_CK'].mean() if len(team2_opp_away) > 0 else 0
706
+
707
+ partido_ck_esperado = local_ck_home + away_ck_away
708
+
709
+ h2h_ck_local = team1_h2h['Pass Types_CK'].mean() if len(team1_h2h) > 0 else 0
710
+ h2h_ck_away = team2_h2h['Pass Types_CK'].mean() if len(team2_h2h) > 0 else 0
711
+ h2h_total = h2h_ck_local + h2h_ck_away
712
+
713
+ # ===========================
714
+ # ✅ MOSTRAR RESULTADOS CON PROBABILIDADES
715
+ # ===========================
716
+
717
+ print(f"\n🎲 PREDICCIÓN MODELO: {prediccion:.2f} corners totales")
718
+ print(f" PPP: {local} ({local_ppp:.2f}) vs {visitante} ({away_ppp:.2f}) | Diff: {ppp_diff:+.2f}")
719
+
720
+ print(f"\n📊 ESTADÍSTICAS HISTÓRICAS:")
721
+ print(f" {local} (Casa): {local_ck_home:.1f} CK/partido | xG: {local_xg_home:.2f} | Poss: {local_poss_home:.1f}%")
722
+ print(f" {visitante} (Fuera): {away_ck_away:.1f} CK/partido | xG: {away_xg_away:.2f} | Poss: {away_poss_away:.1f}%")
723
+ print(f" Corners recibidos: {local} ({local_ck_received:.1f}) | {visitante} ({away_ck_received:.1f})")
724
+ print(f" Total esperado (suma): {partido_ck_esperado:.1f} corners")
725
+
726
+ if len(team1_h2h) > 0 or len(team2_h2h) > 0:
727
+ print(f"\n🔄 HEAD TO HEAD (últimos {max(len(team1_h2h), len(team2_h2h))} partidos):")
728
+ print(f" {local}: {h2h_ck_local:.1f} CK/partido")
729
+ print(f" {visitante}: {h2h_ck_away:.1f} CK/partido")
730
+ print(f" Promedio total: {h2h_total:.1f} corners")
731
+
732
+ # ===========================
733
+ # ✅ MOSTRAR PROBABILIDADES EXACTAS
734
+ # ===========================
735
+
736
+ valor_mas_probable = max(analisis['exactas'].items(), key=lambda x: x[1])
737
+
738
+ print(f"\n📈 PROBABILIDADES EXACTAS (Poisson):")
739
+ for k in sorted(analisis['exactas'].keys()):
740
+ prob = analisis['exactas'][k]
741
+ bar = '█' * int(prob / 2)
742
+ marca = ' ⭐' if k == valor_mas_probable[0] else ''
743
+ print(f" {k:2d} corners: {prob:5.2f}% {bar}{marca}")
744
+
745
+ print(f"\n✅ Valor más probable: {valor_mas_probable[0]} corners ({valor_mas_probable[1]:.2f}%)")
746
+
747
+ # ✅ RANGO DE 80% CONFIANZA
748
+ probs_sorted = sorted(analisis['exactas'].items(), key=lambda x: x[1], reverse=True)
749
+ cumsum = 0
750
+ rango_80 = []
751
+ for val, prob in probs_sorted:
752
+ cumsum += prob
753
+ rango_80.append(val)
754
+ if cumsum >= 80:
755
+ break
756
+
757
+ print(f"📊 Rango 80% confianza: {min(rango_80)}-{max(rango_80)} corners")
758
+
759
+ # ===========================
760
+ # ✅ MOSTRAR OVER/UNDER CON CUOTAS IMPLÍCITAS
761
+ # ===========================
762
+
763
+ print(f"\n🎯 ANÁLISIS OVER/UNDER:")
764
+ print(f"{'Línea':<10} {'Prob Over':<12} {'Cuota Impl':<12} {'Confianza':<15} {'Prob Under':<12} {'Cuota Impl':<12}")
765
+ print("-" * 85)
766
+
767
+ for linea in [7.5, 8.5, 9.5, 10.5, 11.5, 12.5]:
768
+ prob_over = analisis['over'][linea]
769
+ prob_under = analisis['under'][linea]
770
+
771
+ # Cuotas implícitas (inverso de probabilidad en decimal)
772
+ cuota_impl_over = 100 / prob_over if prob_over > 0 else 999
773
+ cuota_impl_under = 100 / prob_under if prob_under > 0 else 999
774
+
775
+ conf_over = clasificar_confianza(prob_over)
776
+
777
+ print(f"O/U {linea:<5} {prob_over:6.2f}% @{cuota_impl_over:5.2f} {conf_over:<15} {prob_under:6.2f}% @{cuota_impl_under:5.2f}")
778
+
779
+ # ===========================
780
+ # ✅ RECOMENDACIONES CON CUOTAS
781
+ # ===========================
782
+
783
+ print(f"\n💡 RECOMENDACIONES DE APUESTA:")
784
+
785
+ mejores_over = [(l, p) for l, p in analisis['over'].items() if p >= 55]
786
+ mejores_under = [(l, p) for l, p in analisis['under'].items() if p >= 55]
787
+
788
+ if mejores_over:
789
+ print(f"\n✅ OVER con confianza MEDIA/ALTA:")
790
+ for linea, prob in sorted(mejores_over, key=lambda x: x[1], reverse=True):
791
+ cuota_impl = 100 / prob
792
+ conf = clasificar_confianza(prob)
793
+ print(f" • Over {linea}: {prob:.2f}% (Cuota justa: @{cuota_impl:.2f}) - {conf}")
794
+
795
+ if mejores_under:
796
+ print(f"\n✅ UNDER con confianza MEDIA/ALTA:")
797
+ for linea, prob in sorted(mejores_under, key=lambda x: x[1], reverse=True):
798
+ cuota_impl = 100 / prob
799
+ conf = clasificar_confianza(prob)
800
+ print(f" • Under {linea}: {prob:.2f}% (Cuota justa: @{cuota_impl:.2f}) - {conf}")
801
+
802
+ if not mejores_over and not mejores_under:
803
+ print(f" ⚠️ No hay apuestas con confianza MEDIA o superior")
804
+
805
+ # ===========================
806
+ # ✅ ANÁLISIS DE RIESGO
807
+ # ===========================
808
+
809
+ df_varianza_temp = analizar_fiabilidad_equipos(df_database, temporada=temporada, min_partidos=3)
810
+ riesgo = obtener_fiabilidad_partido(local, visitante, df_varianza_temp)
811
+
812
+ print(f"\n⚠️ ANÁLISIS DE RIESGO:")
813
+ print(f" Local ({local}): {riesgo['nivel_local']} (CV: {riesgo['cv_local']:.1f}%)")
814
+ print(f" Away ({visitante}): {riesgo['nivel_away']} (CV: {riesgo['cv_away']:.1f}%)")
815
+ print(f" 🎲 FIABILIDAD PARTIDO: {riesgo['fiabilidad']} (Score: {riesgo['score_promedio']:.1f})")
816
+ print(f" 💡 {riesgo['mensaje']}")
817
+
818
+ # ===========================
819
+ # RETORNAR DICCIONARIO COMPLETO
820
+ # ===========================
821
+
822
+ return {
823
+ "prediccion": round(prediccion, 2),
824
+ "local": local,
825
+ "visitante": visitante,
826
+ "ppp_local": local_ppp,
827
+ "ppp_away": away_ppp,
828
+ "ppp_diff": ppp_diff,
829
+ "riesgo": riesgo,
830
+ "stats": {
831
+ "local_ck": local_ck_home,
832
+ "away_ck": away_ck_away,
833
+ "local_ck_received": local_ck_received,
834
+ "away_ck_received": away_ck_received,
835
+ "h2h_total": h2h_total,
836
+ "partido_esperado": partido_ck_esperado
837
+ },
838
+ "probabilidades_exactas": analisis['exactas'],
839
+ "probabilidades_over": analisis['over'],
840
+ "probabilidades_under": analisis['under'],
841
+ "valor_mas_probable": valor_mas_probable[0],
842
+ "prob_mas_probable": valor_mas_probable[1],
843
+ "rango_80": (min(rango_80), max(rango_80))
844
+ }
845
+
846
+ except Exception as e:
847
+ print(f"\n❌ ERROR: {str(e)}")
848
+ import traceback
849
+ traceback.print_exc()
850
+ return {"error": str(e), "prediccion": None}
851
+
852
def predecir_partidos_batch(partidos, jornada, temporada="2526", league_code="ESP", export_csv=True, filename=None,df_database=pd.DataFrame(),xgb_model="",scaler="",lst_years=[]):
    """
    Predict total corners for several matches and optionally export them to CSV.

    Each match is forwarded to ``predecir_corners``; matches whose result
    carries an ``"error"`` entry are reported and skipped.

    Args:
        partidos: List of tuples [(home1, away1), (home2, away2), ...].
        jornada: Round number.
        temporada: Season code (format "2526").
        league_code: League code, e.g. "ESP", "GER", "FRA", "ITA", "NED".
        export_csv: If True (and at least one match succeeded), write a CSV.
        filename: CSV path; defaults to
            ``predicciones_{league_code}_J{jornada}_{temporada}.csv``.
        df_database: Match database forwarded to ``predecir_corners``.
        xgb_model: Trained model forwarded to ``predecir_corners``.
        scaler: Fitted scaler forwarded to ``predecir_corners``.
        lst_years: Ordered season codes forwarded to ``predecir_corners``.

    Returns:
        DataFrame with one row per successfully predicted match. It is empty
        (no columns) when no match could be predicted.
    """
    # NOTE: the mutable defaults (df_database, lst_years) are kept for
    # interface compatibility; this function never mutates them.
    import os  # local import so the block does not depend on file-level imports

    resultados = []

    print("\n" + "=" * 120)
    print(f"🎯 PROCESANDO {len(partidos)} PARTIDOS - {league_code} | J{jornada} | Temporada {temporada}")
    print("=" * 120)

    for idx, (local, visitante) in enumerate(partidos, 1):
        print(f"\n[{idx}/{len(partidos)}] Procesando: {local} vs {visitante}...")

        resultado = predecir_corners(
            local=local,
            visitante=visitante,
            jornada=jornada,
            temporada=temporada,
            league_code=league_code,
            df_database=df_database,
            xgb_model=xgb_model,
            scaler=scaler,
            lst_years=lst_years)

        if resultado.get("error"):
            print(f" ❌ Error: {resultado['error']}")
            continue

        riesgo = resultado['riesgo']
        stats_partido = resultado['stats']

        # ===========================
        # BUILD THE DATA ROW
        # ===========================

        fila = {
            'Partido': f"{local} vs {visitante}",
            'Local': local,
            'Visitante': visitante,
            'Liga': league_code,
            'Jornada': jornada,
            'Temporada': temporada,

            # Prediction
            'Prediccion': resultado['prediccion'],
            'Valor_Mas_Probable': resultado['valor_mas_probable'],
            'Prob_Valor_Mas_Probable_%': round(resultado['prob_mas_probable'], 2),
            'Rango_80%_Min': resultado['rango_80'][0],
            'Rango_80%_Max': resultado['rango_80'][1],

            # Points per game
            'PPP_Local': round(resultado['ppp_local'], 2),
            'PPP_Away': round(resultado['ppp_away'], 2),
            'PPP_Diferencia': round(resultado['ppp_diff'], 2),

            # Historical statistics
            'CK_Local_Casa': round(stats_partido['local_ck'], 1),
            'CK_Away_Fuera': round(stats_partido['away_ck'], 1),
            'CK_Local_Recibidos': round(stats_partido['local_ck_received'], 1),
            'CK_Away_Recibidos': round(stats_partido['away_ck_received'], 1),
            'CK_Esperado_Suma': round(stats_partido['partido_esperado'], 1),
            'CK_H2H_Total': round(stats_partido['h2h_total'], 1) if stats_partido['h2h_total'] > 0 else 'N/A',

            # Risk
            'Fiabilidad_Partido': riesgo['fiabilidad'],
            'Score_Fiabilidad': round(riesgo['score_promedio'], 1),
            'Nivel_Local': riesgo['nivel_local'],
            'Nivel_Away': riesgo['nivel_away'],
            'CV_Local_%': round(riesgo['cv_local'], 1),
            'CV_Away_%': round(riesgo['cv_away'], 1),
        }

        # ===========================
        # OVER 6.5 to 10.5
        # ===========================
        for linea in [6.5, 7.5, 8.5, 9.5, 10.5]:
            prob = resultado['probabilidades_over'].get(linea, 0)
            # Implied (fair) decimal odds; 999 is the sentinel for probability 0.
            cuota_impl = round(100 / prob, 2) if prob > 0 else 999
            conf = clasificar_confianza(prob)

            fila[f'Over_{linea}_Prob_%'] = round(prob, 2)
            fila[f'Over_{linea}_Cuota'] = cuota_impl
            fila[f'Over_{linea}_Confianza'] = conf

        # ===========================
        # UNDER 12.5 to 9.5
        # ===========================
        for linea in [12.5, 11.5, 10.5, 9.5]:
            prob = resultado['probabilidades_under'].get(linea, 0)
            cuota_impl = round(100 / prob, 2) if prob > 0 else 999
            conf = clasificar_confianza(prob)

            fila[f'Under_{linea}_Prob_%'] = round(prob, 2)
            fila[f'Under_{linea}_Cuota'] = cuota_impl
            fila[f'Under_{linea}_Confianza'] = conf

        # ===========================
        # RECOMMENDATIONS
        # ===========================

        # Lines whose probability (in percent) reaches the 55% threshold.
        mejores_over = [(l, p) for l, p in resultado['probabilidades_over'].items() if p >= 55]
        mejores_under = [(l, p) for l, p in resultado['probabilidades_under'].items() if p >= 55]

        if riesgo['score_promedio'] < 35:
            fila['Recomendacion'] = "⛔ EVITAR - Baja fiabilidad"
            fila['Es_Apostable'] = "NO"
        elif not mejores_over and not mejores_under:
            fila['Recomendacion'] = "⚠️ NO RECOMENDADO - Sin confianza suficiente"
            fila['Es_Apostable'] = "NO"
        else:
            recomendaciones = []

            if mejores_over:
                mejor_over = max(mejores_over, key=lambda x: x[1])
                cuota_over = round(100 / mejor_over[1], 2)
                recomendaciones.append(f"Over {mejor_over[0]} ({mejor_over[1]:.1f}% @{cuota_over})")

            if mejores_under:
                mejor_under = max(mejores_under, key=lambda x: x[1])
                cuota_under = round(100 / mejor_under[1], 2)
                recomendaciones.append(f"Under {mejor_under[0]} ({mejor_under[1]:.1f}% @{cuota_under})")

            fila['Recomendacion'] = " | ".join(recomendaciones)

            if riesgo['score_promedio'] >= 65:
                fila['Es_Apostable'] = "SÍ ⭐⭐⭐"
            elif riesgo['score_promedio'] >= 50:
                fila['Es_Apostable'] = "SÍ ✅"
            else:
                fila['Es_Apostable'] = "PRECAUCIÓN 🟡"

        fila['Mensaje_Riesgo'] = riesgo['mensaje']

        resultados.append(fila)
        print(f" ✅ Completado")

    # ===========================
    # BUILD DATAFRAME
    # ===========================

    df_resultados = pd.DataFrame(resultados)

    print("\n" + "=" * 120)
    print(f"✅ PROCESAMIENTO COMPLETADO: {len(df_resultados)} partidos analizados")
    print("=" * 120)

    # ===========================
    # EXPORT TO CSV
    # ===========================

    if export_csv and len(df_resultados) > 0:
        if filename is None:
            filename = f"predicciones_{league_code}_J{jornada}_{temporada}.csv"

        # to_csv does not create missing directories, so create them first.
        directorio = os.path.dirname(filename)
        if directorio:
            os.makedirs(directorio, exist_ok=True)

        df_resultados.to_csv(filename, index=False, encoding='utf-8-sig')
        print(f"\n💾 Resultados exportados a: {filename}")

    # ===========================
    # SUMMARY
    # ===========================

    # With no successful prediction the frame has no columns at all; fall back
    # to an empty series so the summary prints zeros instead of raising KeyError.
    if 'Es_Apostable' in df_resultados.columns:
        es_apostable = df_resultados['Es_Apostable']
    else:
        es_apostable = pd.Series(dtype="object")

    print(f"\n📊 RESUMEN DE APUESTAS:")
    print(f" Partidos apostables: {int(es_apostable.str.contains('SÍ').sum())} / {len(df_resultados)}")
    print(f" Partidos ALTA confianza (⭐⭐⭐): {int((es_apostable == 'SÍ ⭐⭐⭐').sum())}")
    print(f" Partidos MEDIA confianza (✅): {int((es_apostable == 'SÍ ✅').sum())}")
    print(f" Partidos a evitar (⛔): {int((es_apostable == 'NO').sum())}")

    return df_resultados
+
1031
def mostrar_resumen_batch(df_resultados):
    """
    Print a visual summary of the best betting opportunities.

    Args:
        df_resultados: DataFrame as returned by ``predecir_partidos_batch``.
            An empty frame (or one without the ``Es_Apostable`` column) is
            reported as "no opportunities" instead of raising.

    Returns:
        None. Everything is written to stdout.
    """

    print("\n" + "=" * 120)
    print("🎯 MEJORES OPORTUNIDADES DE APUESTA")
    print("=" * 120)

    # A batch where no match was analysed has no columns at all, so the
    # column lookup below would raise KeyError without this guard.
    if len(df_resultados) == 0 or 'Es_Apostable' not in df_resultados.columns:
        print("\n⚠️ No se encontraron partidos con oportunidades de apuesta")
        return

    # Keep only matches flagged as bettable; na=False keeps the mask boolean
    # even if a row has a missing flag.
    es_apostable = df_resultados['Es_Apostable'].str.contains('SÍ', na=False)
    df_apostables = df_resultados[es_apostable].copy()

    if len(df_apostables) == 0:
        print("\n⚠️ No se encontraron partidos con oportunidades de apuesta")
        return

    # Most reliable matches first.
    df_apostables = df_apostables.sort_values('Score_Fiabilidad', ascending=False)

    for _, row in df_apostables.iterrows():
        print(f"\n{'='*120}")
        print(f"🏟️ {row['Partido']}")
        print(f"{'='*120}")
        print(f"📊 Predicción: {row['Prediccion']:.2f} corners | Valor más probable: {row['Valor_Mas_Probable']} ({row['Prob_Valor_Mas_Probable_%']:.1f}%)")
        print(f"📈 Histórico: Local {row['CK_Local_Casa']:.1f} CK | Away {row['CK_Away_Fuera']:.1f} CK | H2H: {row['CK_H2H_Total']}")
        print(f"🎲 Fiabilidad: {row['Fiabilidad_Partido']} (Score: {row['Score_Fiabilidad']:.1f}/100)")
        print(f"💡 {row['Recomendacion']}")

        # Show the lines whose probability reaches the 55% threshold.
        print(f"\n 📌 Líneas destacadas:")
        for linea in [7.5, 8.5, 9.5, 10.5]:
            # Columns that are absent for a given line count as 0%.
            over_prob = row.get(f'Over_{linea}_Prob_%', 0)
            under_prob = row.get(f'Under_{linea}_Prob_%', 0)

            if over_prob >= 55:
                cuota = row.get(f'Over_{linea}_Cuota', 0)
                conf = row.get(f'Over_{linea}_Confianza', '')
                print(f" • Over {linea}: {over_prob:.1f}% @{cuota:.2f} - {conf}")

            if under_prob >= 55:
                cuota = row.get(f'Under_{linea}_Cuota', 0)
                conf = row.get(f'Under_{linea}_Confianza', '')
                print(f" • Under {linea}: {under_prob:.1f}% @{cuota:.2f} - {conf}")
+
1073
+
1074
+
1075
+
1076
class USE_MODEL():
    """
    Facade that loads the corner-prediction model, scaler and datasets from
    GitHub and exposes single-match and batch prediction helpers.
    """

    def __init__(self):
        self.load_models()
        self.load_data()
        self.init_variables()

    def load_models(self):
        """
        Download the XGBoost model and the scaler from GitHub and load them.

        Sets ``self.xgb_model`` and ``self.scaler``.

        Raises:
            Exception: if a download fails or a file cannot be loaded. The
                original error is attached as ``__cause__``.
        """

        print("📦 Cargando modelos desde GitHub...")

        # Direct-download URLs (raw.githubusercontent.com)
        base_url = "https://raw.githubusercontent.com/danielsaed/futbol_corners_forecast/refs/heads/main/models"
        model_url = f"{base_url}/xgboost_corners_v4_retrain.pkl"
        scaler_url = f"{base_url}/scaler_corners_v4_retrain.pkl"

        # Temp files created so far; removed in ``finally`` even when loading fails.
        tmp_paths = []

        try:
            # Download model
            print(f"📥 Descargando modelo desde: {model_url}")
            response_model = requests.get(model_url, timeout=30)
            response_model.raise_for_status()

            # Download scaler
            print(f"📥 Descargando scaler desde: {scaler_url}")
            response_scaler = requests.get(scaler_url, timeout=30)
            response_scaler.raise_for_status()

            # Persist to temporary files so joblib can load them from a path
            with tempfile.NamedTemporaryFile(delete=False, suffix='.pkl') as tmp_model:
                tmp_paths.append(tmp_model.name)
                tmp_model.write(response_model.content)
                tmp_model_path = tmp_model.name

            with tempfile.NamedTemporaryFile(delete=False, suffix='.pkl') as tmp_scaler:
                tmp_paths.append(tmp_scaler.name)
                tmp_scaler.write(response_scaler.content)
                tmp_scaler_path = tmp_scaler.name

            # SECURITY NOTE: joblib.load unpickles the downloaded bytes, which
            # can execute arbitrary code. Only safe while the repository above
            # is trusted.
            self.xgb_model = joblib.load(tmp_model_path)
            self.scaler = joblib.load(tmp_scaler_path)

            print("✅ Modelos cargados correctamente desde GitHub")

        except requests.exceptions.RequestException as e:
            raise Exception(f"❌ Error descargando modelos: {str(e)}") from e
        except Exception as e:
            raise Exception(f"❌ Error cargando modelos: {str(e)}") from e
        finally:
            # Clean up temporary files on both the success and the failure path
            for tmp_path in tmp_paths:
                try:
                    os.unlink(tmp_path)
                except OSError:
                    pass  # best-effort cleanup; the file may already be gone

    def load_data(self):
        """
        Load the cleaned datasets from GitHub into ``self.df_dataset``.

        The historic dataset is required. The current-season dataset is
        optional; if it cannot be read only the historic data is used.

        Raises:
            FileNotFoundError: if the historic dataset cannot be loaded or
                the clean-up step fails.
        """

        print("📂 Cargando datos desde GitHub...")

        base_url = "https://raw.githubusercontent.com/danielsaed/futbol_corners_forecast/refs/heads/main/dataset/cleaned"
        historic_url = f"{base_url}/dataset_cleaned.csv"
        current_url = f"{base_url}/dataset_cleaned_current_year.csv"

        try:
            # Historic dataset
            print(f"📥 Descargando dataset histórico...")
            self.df_dataset_historic = pd.read_csv(historic_url)
            print(f"✅ Dataset histórico cargado: {len(self.df_dataset_historic)} registros")

            # Current season (optional)
            try:
                print(f"📥 Descargando dataset año actual...")
                self.df_dataset_current_year = pd.read_csv(current_url)
                print(f"✅ Dataset año actual cargado: {len(self.df_dataset_current_year)} registros")
                self.df_dataset = pd.concat([self.df_dataset_historic, self.df_dataset_current_year])
            except Exception:
                # Deliberate best-effort; a bare ``except`` would also swallow
                # KeyboardInterrupt/SystemExit.
                print("⚠️ No se pudo cargar dataset del año actual, usando solo histórico")
                self.df_dataset = self.df_dataset_historic

            # Clean-up
            self.df_dataset["season"] = self.df_dataset["season"].astype(str)
            # Assign the result back: ``fillna(..., inplace=True)`` on a column
            # selection is chained assignment and may not update the frame.
            self.df_dataset["Performance_Save%"] = self.df_dataset["Performance_Save%"].fillna(0)

            print(f"✅ Total registros: {len(self.df_dataset)}")

        except Exception as e:
            raise FileNotFoundError(
                f"\n❌ ERROR: No se pudieron cargar los datos desde GitHub\n"
                f" Error: {str(e)}\n\n"
                f"💡 Verifica que los archivos existan en el repositorio\n"
            ) from e

    def init_variables(self):
        """Set ``self.lst_years``, the ordered list of supported season codes."""
        self.lst_years = ["1819", "1920", "2021", "2122", "2223", "2324", "2425", "2526"]
        print("✅ Variables inicializadas")

    def consume_model_batch(self,partidos,jornada,temporada,league_code):
        """
        Predict a list of matches and export the results to
        ``results/<league_code>/<league_code>-<temporada>-<jornada>-predicciones.csv``.

        Returns:
            DataFrame returned by ``predecir_partidos_batch``.
        """

        # Build the path with os.path.join instead of hard-coded backslashes
        # (which are not directory separators outside Windows), and make sure
        # the directory exists before the CSV is written.
        output_dir = os.path.join("results", league_code)
        os.makedirs(output_dir, exist_ok=True)
        filename = os.path.join(output_dir, f"{league_code}-{temporada}-{jornada}-predicciones.csv")

        df_predict = predecir_partidos_batch(
            partidos=partidos,
            jornada=jornada,
            temporada=temporada,
            league_code=league_code,
            export_csv=True,
            filename=filename,
            df_database = self.df_dataset,
            xgb_model = self.xgb_model,
            scaler=self.scaler,
            lst_years=self.lst_years
        )

        return df_predict

    def consume_model_single(self,local,visitante,jornada,temporada,league_code):
        """Predict a single match; returns the dict produced by ``predecir_corners``."""

        return predecir_corners(
            local=local,
            visitante=visitante,
            jornada=jornada,
            temporada=temporada,
            league_code=league_code,
            df_database = self.df_dataset,
            xgb_model = self.xgb_model,
            scaler=self.scaler,
            lst_years=self.lst_years
        )

    def kelly_stats(self,p, odds, fraction=0.2):
        """
        Fractional Kelly stake for a bet.

        Args:
            p: Win probability as a fraction in [0, 1].
            odds: Decimal odds.
            fraction: Multiplier applied to the full Kelly stake
                (e.g. 0.1 for a 10% Kelly).

        Returns:
            Fraction of the bankroll to stake; 0 when the edge is negative or
            the odds pay no profit (``odds <= 1``).
        """

        b = odds - 1  # net profit per unit staked
        if b <= 0:
            # No possible profit: avoids division by zero at odds == 1 and a
            # spurious positive stake for odds < 1.
            return 0.0
        q = 1 - p
        f_star = (b * p - q) / b
        f_star = max(f_star, 0)  # never recommend a negative stake
        return f_star * fraction
src/models/__init__.py ADDED
File without changes
src/models/test_model.py ADDED
@@ -0,0 +1,1148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ===========================
2
+ # SISTEMA DE PREDICCIÓN DE CORNERS - OPTIMIZADO PARA APUESTAS (VERSIÓN COMPLETA)
3
+ # ===========================
4
+
5
+ import numpy as np
6
+ import pandas as pd
7
+ import joblib
8
+ from scipy.stats import poisson
9
+ from scipy import stats
10
+
11
+ # ===========================
12
+ # 1. FUNCIONES FIABILIDAD
13
+ # ===========================
14
+
15
def analizar_fiabilidad_equipos(df_database, temporada="2526", min_partidos=5):
    """
    Reliability analysis of each team's corner counts for betting purposes.

    Combines variability, consistency, trend and outlier metrics of the
    ``'Pass Types_CK'`` column into a 0-100 reliability score.

    Args:
        df_database: DataFrame with at least the columns ``season``, ``team``,
            ``opponent`` and ``'Pass Types_CK'``.
        temporada: Season code to analyse (compared against ``season``).
        min_partidos: Teams with fewer matches than this are skipped.

    Returns:
        DataFrame with one row per analysed team, sorted by
        ``Score_Fiabilidad`` descending. When no team qualifies, an empty
        DataFrame with the same columns is returned.
    """
    # Fixed column layout so an empty result still exposes every column.
    columnas = [
        'Equipo', 'Partidos',
        'Media_CK', 'Mediana_CK', 'Std_CK', 'CV_%',
        'Pct_Cerca_Media', 'Cambios_Bruscos_%', 'IQR',
        'Rango', 'Rango_Norm', 'Min', 'Max',
        'Outliers', 'Pct_Outliers', 'Max_ZScore',
        'Tendencia_Slope', 'Autocorr',
        'Score_Fiabilidad', 'Nivel', 'Color',
    ]

    df_temp = df_database[df_database['season'] == temporada].copy()
    resultados = []
    equipos = pd.concat([df_temp['team'], df_temp['opponent']]).unique()

    for equipo in equipos:
        # Matches of this team
        partidos_equipo = df_temp[df_temp['team'] == equipo]
        n_partidos = len(partidos_equipo)

        # At least one match is always required, otherwise there is nothing
        # to summarise (mean of an empty array is undefined).
        if n_partidos < max(min_partidos, 1):
            continue

        # NOTE(review): trend metrics below assume rows are in chronological
        # order - confirm against how the dataset is built.
        ck_sacados = partidos_equipo['Pass Types_CK'].values

        # ===========================
        # 1. VARIABILITY METRICS
        # ===========================
        media = ck_sacados.mean()
        std = ck_sacados.std()
        cv = (std / media * 100) if media > 0 else 0

        # ===========================
        # 2. CONSISTENCY METRICS
        # ===========================

        # 2.1 Share of matches within +/-2 corners of the mean
        cerca_media = np.sum(np.abs(ck_sacados - media) <= 2) / n_partidos * 100

        # 2.2 Abrupt swings (>4 corners) between consecutive matches
        if n_partidos > 1:
            cambios_bruscos = np.sum(np.abs(np.diff(ck_sacados)) > 4)
            pct_cambios_bruscos = cambios_bruscos / (n_partidos - 1) * 100
        else:
            pct_cambios_bruscos = 0.0  # no consecutive pair to compare

        # 2.3 Quartiles (Q1, Q2=median, Q3)
        q1, q2, q3 = np.percentile(ck_sacados, [25, 50, 75])
        iqr = q3 - q1  # interquartile range

        # ===========================
        # 3. TREND METRICS
        # ===========================

        # 3.1 Linear trend across the match sequence
        if n_partidos > 1:
            jornadas = np.arange(n_partidos)
            slope = stats.linregress(jornadas, ck_sacados).slope
        else:
            slope = 0.0  # a regression needs at least two points

        # 3.2 Lag-1 autocorrelation; undefined (NaN) when either slice is
        # constant, so report 0 in that case.
        autocorr = 0
        if n_partidos > 2:
            previos, siguientes = ck_sacados[:-1], ck_sacados[1:]
            if previos.std() > 0 and siguientes.std() > 0:
                autocorr = np.corrcoef(previos, siguientes)[0, 1]

        # ===========================
        # 4. OUTLIER METRICS
        # ===========================

        # 4.1 Outlier detection (IQR method)
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr
        outliers = np.sum((ck_sacados < lower_bound) | (ck_sacados > upper_bound))
        pct_outliers = outliers / n_partidos * 100

        # 4.2 Maximum z-score; a constant series has std 0, for which the
        # z-score is undefined, so use 0.
        if std > 0:
            max_z = np.abs(stats.zscore(ck_sacados)).max()
        else:
            max_z = 0.0

        # ===========================
        # 5. RANGE METRICS
        # ===========================

        rango = ck_sacados.max() - ck_sacados.min()
        rango_normalizado = rango / media if media > 0 else 0

        # ===========================
        # 6. GLOBAL RELIABILITY SCORE
        # ===========================

        # Component scores (0-100, lower = worse)
        score_cv = max(0, 100 - cv * 2)
        score_consistencia = cerca_media
        score_cambios = max(0, 100 - pct_cambios_bruscos * 2)
        score_outliers = max(0, 100 - pct_outliers * 3)
        score_iqr = max(0, 100 - iqr * 10)

        # Final score (weighted average)
        score_fiabilidad = (
            score_cv * 0.25 +
            score_consistencia * 0.30 +
            score_cambios * 0.20 +
            score_outliers * 0.15 +
            score_iqr * 0.10
        )

        # ===========================
        # 7. CLASSIFICATION
        # ===========================

        if score_fiabilidad >= 70:
            nivel = "EXCELENTE ⭐⭐⭐"
            color = "#27ae60"
        elif score_fiabilidad >= 55:
            nivel = "BUENO ✅"
            color = "#2ecc71"
        elif score_fiabilidad >= 40:
            nivel = "ACEPTABLE 🟡"
            color = "#f39c12"
        elif score_fiabilidad >= 25:
            nivel = "REGULAR ⚠️"
            color = "#e67e22"
        else:
            nivel = "EVITAR ⛔"
            color = "#e74c3c"

        resultados.append({
            'Equipo': equipo,
            'Partidos': n_partidos,

            # Basic statistics
            'Media_CK': round(media, 2),
            'Mediana_CK': round(q2, 2),
            'Std_CK': round(std, 2),
            'CV_%': round(cv, 1),

            # Consistency
            'Pct_Cerca_Media': round(cerca_media, 1),
            'Cambios_Bruscos_%': round(pct_cambios_bruscos, 1),
            'IQR': round(iqr, 2),

            # Range
            'Rango': int(rango),
            'Rango_Norm': round(rango_normalizado, 2),
            'Min': int(ck_sacados.min()),
            'Max': int(ck_sacados.max()),

            # Outliers
            'Outliers': int(outliers),
            'Pct_Outliers': round(pct_outliers, 1),
            'Max_ZScore': round(max_z, 2),

            # Trend
            'Tendencia_Slope': round(slope, 3),
            'Autocorr': round(autocorr, 3),

            # Score and classification
            'Score_Fiabilidad': round(score_fiabilidad, 1),
            'Nivel': nivel,
            'Color': color
        })

    # Passing ``columns`` keeps 'Score_Fiabilidad' present even when no team
    # qualified, so the sort below cannot raise KeyError.
    df_resultado = pd.DataFrame(resultados, columns=columnas)

    df_resultado = df_resultado.sort_values('Score_Fiabilidad', ascending=False)

    return df_resultado
173
+
174
def mostrar_analisis_fiabilidad(df_analisis, top_n=10):
    """Print a console report of the per-team corner reliability analysis.

    Args:
        df_analisis: DataFrame with one row per team. The report reads the
            columns ``Equipo``, ``Nivel``, ``Score_Fiabilidad``, ``Media_CK``,
            ``Mediana_CK``, ``CV_%``, ``Pct_Cerca_Media``, ``IQR``,
            ``Cambios_Bruscos_%``, ``Pct_Outliers``, ``Min``, ``Max`` and
            ``Rango``. The first/last rows are taken as the most/least
            reliable teams, so the frame is expected to be sorted by score
            in descending order.
        top_n: Number of teams listed in each of the two rankings.

    Returns:
        None. Everything is written to stdout.
    """
    linea_doble = "=" * 120
    linea_simple = "-" * 120

    def _imprimir_equipo(fila, fiable):
        # Lines shared by both rankings; only the "near the mean" line differs,
        # and the corner range is shown for reliable teams only.
        print(f"\n{fila['Equipo']:25s} | {fila['Nivel']:20s} | Score: {fila['Score_Fiabilidad']:.1f}")
        print(f" 📊 Media: {fila['Media_CK']:.1f} | Mediana: {fila['Mediana_CK']:.1f} | CV: {fila['CV_%']:.1f}%")
        if fiable:
            print(f" ✅ {fila['Pct_Cerca_Media']:.1f}% cerca de media | IQR: {fila['IQR']:.1f}")
        else:
            print(f" ❌ Solo {fila['Pct_Cerca_Media']:.1f}% cerca de media | IQR: {fila['IQR']:.1f}")
        print(f" ⚠️ Cambios bruscos: {fila['Cambios_Bruscos_%']:.1f}% | Outliers: {fila['Pct_Outliers']:.1f}%")
        if fiable:
            print(f" 📈 Rango: {fila['Min']}-{fila['Max']} ({fila['Rango']} corners)")

    print("\n" + linea_doble)
    print("🎯 ANÁLISIS DE FIABILIDAD PARA APUESTAS - CORNERS")
    print(linea_doble)

    # Most reliable teams: first rows of the frame.
    print(f"\n⭐ TOP {top_n} EQUIPOS MÁS FIABLES")
    print(linea_simple)
    for _, fila in df_analisis.head(top_n).iterrows():
        _imprimir_equipo(fila, fiable=True)

    # Least reliable teams: last rows of the frame.
    print(f"\n\n⛔ TOP {top_n} EQUIPOS MENOS FIABLES")
    print(linea_simple)
    for _, fila in df_analisis.tail(top_n).iterrows():
        _imprimir_equipo(fila, fiable=False)

    # Overall distribution and score statistics.
    print("\n\n📊 DISTRIBUCIÓN POR NIVEL DE FIABILIDAD")
    print(linea_simple)
    print(df_analisis['Nivel'].value_counts())

    scores = df_analisis['Score_Fiabilidad']
    print("\n📈 ESTADÍSTICAS DE SCORE:")
    print(f" Media: {scores.mean():.1f}")
    print(f" Mediana: {scores.median():.1f}")
    print(f" Score máximo: {scores.max():.1f}")
    print(f" Score mínimo: {scores.min():.1f}")
218
+
219
def obtener_fiabilidad_partido(local, visitante, df_analisis):
    """Rate how reliable a single fixture is for corner betting.

    The fixture score is the plain average of both teams'
    ``Score_Fiabilidad``; it is then mapped to a label and a message.

    Args:
        local: Home team name, matched against the ``Equipo`` column.
        visitante: Away team name, matched against the ``Equipo`` column.
        df_analisis: Per-team reliability table. The columns read here are
            ``Equipo``, ``Score_Fiabilidad``, ``Nivel``, ``CV_%`` and
            ``Pct_Cerca_Media``.

    Returns:
        dict that always carries the same keys, so callers can index it
        without checking which branch produced it: ``fiabilidad``, ``score``,
        ``mensaje``, ``score_local``, ``score_away``, ``score_promedio``,
        ``nivel_local``, ``nivel_away``, ``cv_local``, ``cv_away``,
        ``consistencia_local`` and ``consistencia_away``. When either team is
        missing from the table, ``fiabilidad`` is ``'DESCONOCIDO'``, the
        numeric fields are 0 and the levels are ``'DESCONOCIDO'``.
    """
    desconocido = {
        'fiabilidad': 'DESCONOCIDO',
        'score': 0,
        'mensaje': '⚠️ Datos insuficientes',
        # Neutral placeholders: callers format these fields unconditionally,
        # so they must exist (and be numeric where a number is expected).
        'score_local': 0.0,
        'score_away': 0.0,
        'score_promedio': 0.0,
        'nivel_local': 'DESCONOCIDO',
        'nivel_away': 'DESCONOCIDO',
        'cv_local': 0.0,
        'cv_away': 0.0,
        'consistencia_local': 0.0,
        'consistencia_away': 0.0,
    }

    # An analysis that produced no rows may have no columns at all; treat it
    # as "unknown" instead of failing on the column lookup.
    if df_analisis is None or df_analisis.empty or 'Equipo' not in df_analisis.columns:
        return desconocido

    datos_local = df_analisis[df_analisis['Equipo'] == local]
    datos_away = df_analisis[df_analisis['Equipo'] == visitante]

    if datos_local.empty or datos_away.empty:
        return desconocido

    fila_local = datos_local.iloc[0]
    fila_away = datos_away.iloc[0]

    # Cast to built-in floats so the result serialises cleanly.
    score_local = float(fila_local['Score_Fiabilidad'])
    score_away = float(fila_away['Score_Fiabilidad'])
    score_promedio = (score_local + score_away) / 2

    # Fixture classification by average score.
    if score_promedio >= 65:
        fiabilidad = "MUY ALTA ⭐⭐⭐"
        mensaje = "✅ EXCELENTE PARTIDO PARA APOSTAR"
    elif score_promedio >= 50:
        fiabilidad = "ALTA ✅"
        mensaje = "✅ BUEN PARTIDO PARA APOSTAR"
    elif score_promedio >= 35:
        fiabilidad = "MEDIA 🟡"
        mensaje = "🟡 APOSTAR CON PRECAUCIÓN"
    else:
        fiabilidad = "BAJA ⛔"
        mensaje = "⛔ EVITAR APUESTA"

    return {
        'fiabilidad': fiabilidad,
        'score': score_promedio,
        'score_local': score_local,
        'score_away': score_away,
        'score_promedio': score_promedio,
        'nivel_local': fila_local['Nivel'],
        'nivel_away': fila_away['Nivel'],
        'mensaje': mensaje,

        # Extra per-team figures used by the callers' reports.
        'cv_local': float(fila_local['CV_%']),
        'cv_away': float(fila_away['CV_%']),
        'consistencia_local': float(fila_local['Pct_Cerca_Media']),
        'consistencia_away': float(fila_away['Pct_Cerca_Media']),
    }
267
+
268
def calcular_probabilidades_poisson(lambda_pred, rango_inferior=5, rango_superior=5, lineas=None):
    """Turn a predicted corner count into Poisson probabilities.

    Args:
        lambda_pred: Expected number of corners, used as the Poisson mean.
        rango_inferior: How many integer outcomes below the rounded mean get
            an exact probability (never going below 0).
        rango_superior: How many integer outcomes above the rounded mean get
            an exact probability.
        lineas: Iterable of over/under lines to evaluate. Defaults to
            7.5 through 12.5 in steps of 1.

    Returns:
        dict with three mappings, all expressed in percent:
        ``'exactas'`` maps each integer outcome to P(X = k);
        ``'over'`` maps each line to P(X > line);
        ``'under'`` maps each line to P(X <= line).

    Raises:
        ValueError: If ``lambda_pred`` is negative, NaN or infinite. Such a
            mean would otherwise yield NaN probabilities that only fail
            further downstream.
    """
    lambda_pred = float(lambda_pred)
    # NaN fails every comparison, so this single test also rejects it.
    if not (0 <= lambda_pred < float("inf")):
        raise ValueError(f"lambda_pred debe ser un número finito >= 0, recibido: {lambda_pred}")

    if lineas is None:
        lineas = (7.5, 8.5, 9.5, 10.5, 11.5, 12.5)

    valor_central = int(round(lambda_pred))
    valores_analizar = range(
        max(0, valor_central - rango_inferior),
        valor_central + rango_superior + 1
    )

    probabilidades_exactas = {
        k: float(poisson.pmf(k, lambda_pred)) * 100 for k in valores_analizar
    }

    # Over and under use the same lines so each pair is complementary.
    probabilidades_over = {}
    probabilidades_under = {}
    for linea in lineas:
        probabilidades_under[linea] = float(poisson.cdf(linea, lambda_pred)) * 100
        # sf(x) is 1 - cdf(x), computed without the cancellation error.
        probabilidades_over[linea] = float(poisson.sf(linea, lambda_pred)) * 100

    return {
        'exactas': probabilidades_exactas,
        'over': probabilidades_over,
        'under': probabilidades_under
    }
300
+
301
def clasificar_confianza(prob):
    """Map a probability in percent to a confidence label.

    Args:
        prob: Probability expressed in percent.

    Returns:
        ``"ALTA ✅"`` for 66 or more, ``"MEDIA ⚠️"`` for 55 up to (not
        including) 66, and ``"BAJA ❌"`` otherwise.
    """
    # Thresholds are checked from the highest down; the first hit wins.
    umbrales = ((66, "ALTA ✅"), (55, "MEDIA ⚠️"))
    for minimo, etiqueta in umbrales:
        if prob >= minimo:
            return etiqueta
    return "BAJA ❌"
309
+
310
def get_dataframes(df, season, round_num, local, away, league=None):
    """Split a season's earlier matches into eight home/away views.

    Only rows of ``season`` with ``round`` strictly below ``round_num`` are
    considered (and only ``league`` when it is given).

    Args:
        df: Match table with ``season``, ``round``, ``team``, ``opponent``
            and ``venue`` columns (plus ``league`` when filtering by it).
        season: Season value to keep.
        round_num: Exclusive upper bound for ``round``.
        local: Home team of the fixture being analysed.
        away: Away team of the fixture being analysed.
        league: Optional league value to keep.

    Returns:
        Tuple of eight DataFrames, in this order:
        local as team (Home, Away), local as opponent (Home, Away),
        away as team (Home, Away), away as opponent (Home, Away).
        "Home"/"Away" refers to the row's ``venue`` value.
    """
    previos = (df['season'] == season) & (df['round'] < round_num)
    if league is not None:
        previos = previos & (df['league'] == league)

    vistas = []
    for equipo in (local, away):
        for columna in ('team', 'opponent'):
            subconjunto = df[previos & (df[columna] == equipo)].copy()
            for sede in ("Home", "Away"):
                vistas.append(subconjunto[subconjunto['venue'] == sede])

    return tuple(vistas)
332
+
333
+ def get_head_2_head(df, local, away, seasons=None, league=None):
334
+ """Obtiene últimos 3 enfrentamientos directos"""
335
+ if seasons is None:
336
+ seasons = []
337
+
338
+ df_filtered = df[df['season'].isin(seasons)] if seasons else df
339
+
340
+ if league is not None:
341
+ df_filtered = df_filtered[df_filtered['league'] == league]
342
+
343
+ local_h2h = df_filtered[(df_filtered['team'] == local) & (df_filtered['opponent'] == away)]
344
+ away_h2h = df_filtered[(df_filtered['team'] == away) & (df_filtered['opponent'] == local)]
345
+
346
+ if len(local_h2h) < 4:
347
+ return local_h2h.tail(2), away_h2h.tail(2)
348
+
349
+ return local_h2h.tail(3), away_h2h.tail(3)
350
+
351
def get_average(df, is_team=False, lst_avg=None):
    """Aggregate match statistics into a fixed-length tuple of features.

    Two layouts are produced:

    * ``is_team=False`` (league baseline), 9 values:
      ``(var_ck, avg_xg, avg_sca, avg_cross, avg_att_3rd, avg_gf, avg_ga,
      avg_sh, avg_ck)``.
    * ``is_team=True`` (team profile), 24 values: the nine basic averages
      ``(avg_ck, var_ck, avg_xg, avg_sca, avg_cross, avg_poss, avg_att_3rd,
      avg_gf, avg_ga)`` followed by 15 ratio and index metrics. The basic
      averages are expressed relative to the league baseline in ``lst_avg``
      (possession relative to 50).

    Args:
        df: Match rows to aggregate.
        is_team: Selects the team layout instead of the league layout.
        lst_avg: League baseline tuple as returned by this function with
            ``is_team=False``. Required when ``is_team`` is true and ``df``
            is not empty.

    Returns:
        Tuple of 24 values (team) or 9 values (league). An empty ``df``
        yields all zeros with the same length as the populated layout, so
        feature vectors built from it keep a constant width.
    """
    n_team_values = 24
    n_league_values = 9

    n = len(df)
    if n == 0:
        # Must match the populated layouts below value for value.
        return (0,) * (n_team_values if is_team else n_league_values)

    def suma_sobre(columna, denominador):
        # Column total divided by ``denominador``; 0 when it is not positive.
        return (df[columna].sum() / denominador) if denominador > 0 else 0

    # Sample variance needs at least two rows.
    var_ck = df['Pass Types_CK'].var() if n > 1 else 0

    if not is_team:
        # League baseline: plain per-match means.
        avg_cross = df['Performance_Crs'].mean()
        avg_att_3rd = df['Touches_Att 3rd'].mean()
        avg_sca = df['SCA Types_SCA'].mean()
        avg_xg = df['Expected_xG'].mean()
        avg_ck = df['Pass Types_CK'].mean()
        avg_gf = df['GF'].mean()
        avg_ga = df['GA'].mean()
        avg_sh = df['Standard_Sh'].mean() if 'Standard_Sh' in df.columns else 0

        return (
            var_ck,       # 0
            avg_xg,       # 1
            avg_sca,      # 2
            avg_cross,    # 3
            avg_att_3rd,  # 4
            avg_gf,       # 5
            avg_ga,       # 6
            avg_sh,       # 7
            avg_ck        # 8
        )

    # --- Basic statistics, relative to the league baseline ---
    avg_cross = (df['Performance_Crs'].sum() / n) - lst_avg[3]
    avg_att_3rd = (df['Touches_Att 3rd'].sum() / n) - lst_avg[4]
    avg_sca = (df['SCA Types_SCA'].sum() / n) - lst_avg[2]
    avg_xg = (df['Expected_xG'].sum() / n) - lst_avg[1]
    avg_ck = (df['Pass Types_CK'].sum() / n) - lst_avg[8]
    avg_poss = (df['Poss'].sum() / n) - 50
    avg_gf = (df['GF'].sum() / n) - lst_avg[5]
    avg_ga = (df['GA'].sum() / n) - lst_avg[6]

    # --- Attacking metrics ---
    total_sh = df['Standard_Sh'].sum()
    sh_accuracy = suma_sobre('Standard_SoT', total_sh)
    xg_shot = suma_sobre('Expected_xG', total_sh)

    total_touches = df['Touches_Touches'].sum()
    attacking_presence = suma_sobre('Touches_Att 3rd', total_touches)

    total_poss = df['Poss'].sum()
    possession_shot = (total_sh / total_poss) if total_poss > 0 else 0

    # --- Creation metrics ---
    total_passes = df['Total_Att'].sum()
    progressive_pass_ratio = suma_sobre('PrgP', total_passes)
    final_third_involvement = suma_sobre('1/3', total_passes)

    total_sca = df['SCA Types_SCA'].sum()
    assist_sca = suma_sobre('Ast', total_sca)
    creative_efficiency = (total_sca / total_poss) if total_poss > 0 else 0

    # --- Defensive metrics ---
    total_tackles = df['Tackles_Tkl'].sum()
    high_press_intensity = suma_sobre('Tackles_Att 3rd', total_tackles)
    interception_tackle = suma_sobre('Int', total_tackles)

    total_defensive_actions = total_tackles + df['Int'].sum()
    clearance_ratio = suma_sobre('Clr', total_defensive_actions)

    # --- Possession metrics ---
    total_carries = df['Carries_Carries'].sum()
    progressive_carry_ratio = suma_sobre('Carries_PrgC', total_carries)

    total_prog_passes = df['PrgP'].sum()
    carry_pass_balance = suma_sobre('Carries_PrgC', total_prog_passes)

    # --- Composite indices (built from raw, non-normalised means) ---
    avg_gf_raw = df['GF'].mean()
    avg_xg_raw = df['Expected_xG'].mean()
    avg_sot = df['Standard_SoT'].mean()
    avg_sh = df['Standard_Sh'].mean()
    offensive_index = (avg_gf_raw + avg_xg_raw) * (avg_sot / avg_sh) if avg_sh > 0 else 0

    avg_prgp = df['PrgP'].mean()
    avg_prgc = df['Carries_PrgC'].mean()
    avg_poss_raw = df['Poss'].mean()
    transition_index = ((avg_prgp + avg_prgc) / avg_poss_raw) if avg_poss_raw > 0 else 0

    # 24 values, indices 0..23.
    return (
        avg_ck,                   # 0
        var_ck,                   # 1
        avg_xg,                   # 2
        avg_sca,                  # 3
        avg_cross,                # 4
        avg_poss,                 # 5
        avg_att_3rd,              # 6
        avg_gf,                   # 7
        avg_ga,                   # 8
        sh_accuracy,              # 9
        xg_shot,                  # 10
        attacking_presence,       # 11
        possession_shot,          # 12
        progressive_pass_ratio,   # 13
        final_third_involvement,  # 14
        assist_sca,               # 15
        creative_efficiency,      # 16
        high_press_intensity,     # 17
        interception_tackle,      # 18
        clearance_ratio,          # 19
        progressive_carry_ratio,  # 20
        carry_pass_balance,       # 21
        offensive_index,          # 22
        transition_index          # 23
    )
486
+
487
def get_points_from_result(result):
    """Convert a match result code into league points.

    Args:
        result: Result code; ``'W'`` and ``'D'`` are recognised.

    Returns:
        3 for ``'W'``, 1 for ``'D'`` and 0 for anything else.
    """
    return 3 if result == 'W' else (1 if result == 'D' else 0)
495
+
496
def get_team_ppp(df, team, season, round_num, league=None):
    """Compute a team's points per match before a given round.

    Args:
        df: Match table with ``team``, ``season``, ``round`` and ``result``
            columns (plus ``league`` when filtering by it).
        team: Team whose rows are counted.
        season: Season value to keep.
        round_num: Exclusive upper bound for ``round``.
        league: Optional league value to keep.

    Returns:
        Total points divided by the number of matching rows, or ``0.0`` when
        no row matches.
    """
    es_equipo = df['team'] == team
    es_temporada = df['season'] == season
    es_previo = df['round'] < round_num
    partidos = df[es_equipo & es_temporada & es_previo]

    if league is not None:
        partidos = partidos[partidos['league'] == league]

    jugados = len(partidos)
    if jugados == 0:
        return 0.0

    puntos = partidos['result'].apply(get_points_from_result).sum()
    return puntos / jugados
514
+
515
def get_ppp_difference(df, local, away, season, round_num, league=None):
    """Return the home team's points per match minus the away team's.

    Both figures come from ``get_team_ppp`` with the same season, round
    bound and league filter.

    Args:
        df: Match table passed through to ``get_team_ppp``.
        local: Home team.
        away: Away team.
        season: Season value to keep.
        round_num: Exclusive upper bound for ``round``.
        league: Optional league value to keep.

    Returns:
        ``ppp(local) - ppp(away)``.
    """
    ppp_por_equipo = [
        get_team_ppp(df, equipo, season, round_num, league)
        for equipo in (local, away)
    ]
    return ppp_por_equipo[0] - ppp_por_equipo[1]
520
+
521
def predecir_corners(local, visitante, jornada, temporada="2526", league_code="ESP",
                     df_database=None, xgb_model="", scaler="", lst_years=None):
    """Predict total corners for one fixture and print a betting report.

    The function builds the feature vector from matches played before
    ``jornada``, scales it, runs the model, converts the prediction into
    Poisson probabilities and adds a reliability ("risk") assessment.

    Args:
        local: Home team.
        visitante: Away team.
        jornada: Round number; values below 5 are rejected.
        temporada: Season code (format ``"2526"``).
        league_code: League code (the one-hot features cover ESP, GER, FRA,
            ITA, NED, ENG, POR and BEL).
        df_database: Match table. Defaults to an empty DataFrame.
        xgb_model: Fitted model exposing ``predict``.
        scaler: Fitted scaler exposing ``feature_names_in_`` and
            ``transform``.
        lst_years: Ordered list of season codes; seasons up to and including
            ``temporada`` are used for head-to-head history. Defaults to an
            empty list.

    Returns:
        On success, a dict with ``prediccion`` (plain float rounded to 2
        decimals), team names, PPP figures, ``riesgo``, ``stats``, the three
        probability mappings, ``valor_mas_probable``, ``prob_mas_probable``
        and ``rango_80``. On failure, ``{"error": <message>,
        "prediccion": None}``; any exception raised while predicting is
        caught, printed with its traceback and reported this way.
    """
    # Fresh objects per call instead of defaults shared between calls.
    if df_database is None:
        df_database = pd.DataFrame()
    if lst_years is None:
        lst_years = []

    print(f"\n{'='*80}")
    print(f"🏟️ {local} vs {visitante}")
    print(f"📅 Temporada {temporada} | Jornada {jornada} | Liga: {league_code}")
    print(f"{'='*80}")

    if jornada < 5:
        return {
            "error": "❌ Se necesitan al menos 5 jornadas previas",
            "prediccion": None
        }

    def _media(datos, columna):
        # Column mean, or 0 when there are no rows.
        return datos[columna].mean() if len(datos) > 0 else 0

    try:
        # ===========================
        # FEATURE EXTRACTION
        # ===========================

        lst_avg = get_average(
            df_database[
                (df_database['season'] == temporada) &
                (df_database['round'] < jornada) &
                (df_database['league'] == league_code)
            ],
            is_team=False
        )

        (team1_home, team1_away, team1_opp_home, team1_opp_away,
         team2_home, team2_away, team2_opp_home, team2_opp_away) = get_dataframes(
            df_database, temporada, jornada, local, visitante, league=league_code
        )

        index = lst_years.index(temporada)
        temporadas_h2h = lst_years[:index + 1]
        team1_h2h, team2_h2h = get_head_2_head(
            df_database, local, visitante, seasons=temporadas_h2h, league=league_code
        )

        local_ppp = get_team_ppp(df_database, local, temporada, jornada, league=league_code)
        away_ppp = get_team_ppp(df_database, visitante, temporada, jornada, league=league_code)
        ppp_diff = local_ppp - away_ppp

        # ===========================
        # FEATURE DICTIONARY
        # ===========================

        def create_line(df, is_form=True, is_team=False, use_advanced=True):
            # "Form" lines only look at the last 6 rows.
            if is_form:
                df = df[-6:]
            valores = get_average(df, is_team, lst_avg)
            return valores if use_advanced else valores[:9]

        dic_features = {}

        dic_features['ppp_local'] = (local_ppp,)
        dic_features['ppp_away'] = (away_ppp,)
        dic_features['ppp_difference'] = (ppp_diff,)

        for etiqueta, datos in (
            ('team1_home', team1_home),
            ('team1_away', team1_away),
            ('team2_home', team2_home),
            ('team2_away', team2_away),
        ):
            dic_features[f'lst_{etiqueta}_form'] = create_line(datos, True, True, use_advanced=True)
            dic_features[f'lst_{etiqueta}_general'] = create_line(datos, False, True, use_advanced=True)

        dic_features['lst_team1_h2h'] = create_line(team1_h2h, False, True, use_advanced=True)
        dic_features['lst_team2_h2h'] = create_line(team2_h2h, False, True, use_advanced=True)

        dic_features['lst_team1_opp_away'] = create_line(team1_opp_away, False, True, use_advanced=False)
        dic_features['lst_team2_opp_home'] = create_line(team2_opp_home, False, True, use_advanced=False)

        # One-hot league indicator.
        for codigo in ('ESP', 'GER', 'FRA', 'ITA', 'NED', 'ENG', 'POR', 'BEL'):
            dic_features[f'league_{codigo}'] = (1 if league_code == codigo else 0,)

        # ===========================
        # FEATURE VECTOR
        # ===========================

        lst_base_advanced = [
            "avg_ck", "var_ck", "xg", "sca", "cross", "poss", "att_3rd", "gf", "ga",
            "sh_accuracy", "xg_shot", "attacking_presence", "possession_shot",
            "progressive_pass_ratio", "final_third_involvement", "assist_sca", "creative_efficiency",
            "high_press_intensity", "interception_tackle", "clearance_ratio",
            "progressive_carry_ratio", "carry_pass_balance", "offensive_index", "transition_index"
        ]

        # NOTE(review): these names are paired with the first 9 values of the
        # team layout, whose order starts with avg_ck, var_ck. The labels look
        # shifted, but they are presumably what the scaler was fitted with —
        # confirm against the training code before renaming.
        lst_base_original = [
            "var_ck", "xg", "sca", "cross", "poss", "att_3rd", "gf", "ga", "avg_ck"
        ]

        lst_features_values = []
        lst_features_names = []

        for key, valores in dic_features.items():
            lst_features_values.extend(list(valores))

            if key in ['ppp_local', 'ppp_away', 'ppp_difference']:
                lst_features_names.append(key)
            elif key.startswith('league_'):
                lst_features_names.append(key)
            elif key in ['lst_team1_opp_away', 'lst_team2_opp_home']:
                lst_features_names.extend([f"{key}_{col}" for col in lst_base_original])
            else:
                lst_features_names.extend([f"{key}_{col}" for col in lst_base_advanced])

        df_input = pd.DataFrame([lst_features_values], columns=lst_features_names)

        expected_features = scaler.feature_names_in_

        if len(df_input.columns) != len(expected_features):
            print(f"\n⚠️ ERROR: Número de features no coincide")
            print(f" Esperadas: {len(expected_features)}")
            print(f" Recibidas: {len(df_input.columns)}")
            return {"error": "Desajuste de features", "prediccion": None}

        # Reorder columns to the order the scaler was fitted with.
        df_input = df_input[expected_features]

        X_input_scaled = pd.DataFrame(
            scaler.transform(df_input),
            columns=df_input.columns
        )

        # ===========================
        # PREDICTION
        # ===========================

        # Cast to a built-in float: the model output is a numpy scalar and
        # round() would keep that type in the returned dict.
        prediccion = float(xgb_model.predict(X_input_scaled)[0])

        # ===========================
        # POISSON ANALYSIS
        # ===========================

        analisis = calcular_probabilidades_poisson(prediccion, rango_inferior=5, rango_superior=5)

        # ===========================
        # DETAILED STATISTICS
        # ===========================

        local_ck_home = _media(team1_home, 'Pass Types_CK')
        local_xg_home = _media(team1_home, 'Expected_xG')
        local_poss_home = _media(team1_home, 'Poss')

        away_ck_away = _media(team2_away, 'Pass Types_CK')
        away_xg_away = _media(team2_away, 'Expected_xG')
        away_poss_away = _media(team2_away, 'Poss')

        local_ck_received = _media(team1_opp_home, 'Pass Types_CK')
        away_ck_received = _media(team2_opp_away, 'Pass Types_CK')

        partido_ck_esperado = local_ck_home + away_ck_away

        h2h_ck_local = _media(team1_h2h, 'Pass Types_CK')
        h2h_ck_away = _media(team2_h2h, 'Pass Types_CK')
        h2h_total = h2h_ck_local + h2h_ck_away

        # ===========================
        # REPORT: PREDICTION AND HISTORY
        # ===========================

        print(f"\n🎲 PREDICCIÓN MODELO: {prediccion:.2f} corners totales")
        print(f" PPP: {local} ({local_ppp:.2f}) vs {visitante} ({away_ppp:.2f}) | Diff: {ppp_diff:+.2f}")

        print(f"\n📊 ESTADÍSTICAS HISTÓRICAS:")
        print(f" {local} (Casa): {local_ck_home:.1f} CK/partido | xG: {local_xg_home:.2f} | Poss: {local_poss_home:.1f}%")
        print(f" {visitante} (Fuera): {away_ck_away:.1f} CK/partido | xG: {away_xg_away:.2f} | Poss: {away_poss_away:.1f}%")
        print(f" Corners recibidos: {local} ({local_ck_received:.1f}) | {visitante} ({away_ck_received:.1f})")
        print(f" Total esperado (suma): {partido_ck_esperado:.1f} corners")

        if len(team1_h2h) > 0 or len(team2_h2h) > 0:
            print(f"\n🔄 HEAD TO HEAD (últimos {max(len(team1_h2h), len(team2_h2h))} partidos):")
            print(f" {local}: {h2h_ck_local:.1f} CK/partido")
            print(f" {visitante}: {h2h_ck_away:.1f} CK/partido")
            print(f" Promedio total: {h2h_total:.1f} corners")

        # ===========================
        # REPORT: EXACT PROBABILITIES
        # ===========================

        valor_mas_probable = max(analisis['exactas'].items(), key=lambda x: x[1])

        print(f"\n📈 PROBABILIDADES EXACTAS (Poisson):")
        for k in sorted(analisis['exactas'].keys()):
            prob = analisis['exactas'][k]
            bar = '█' * int(prob / 2)
            marca = ' ⭐' if k == valor_mas_probable[0] else ''
            print(f" {k:2d} corners: {prob:5.2f}% {bar}{marca}")

        print(f"\n✅ Valor más probable: {valor_mas_probable[0]} corners ({valor_mas_probable[1]:.2f}%)")

        # 80% range: add outcomes from most to least likely until the
        # accumulated probability reaches 80%.
        probs_sorted = sorted(analisis['exactas'].items(), key=lambda x: x[1], reverse=True)
        cumsum = 0
        rango_80 = []
        for val, prob in probs_sorted:
            cumsum += prob
            rango_80.append(val)
            if cumsum >= 80:
                break

        print(f"📊 Rango 80% confianza: {min(rango_80)}-{max(rango_80)} corners")

        # ===========================
        # REPORT: OVER/UNDER WITH IMPLIED ODDS
        # ===========================

        print(f"\n🎯 ANÁLISIS OVER/UNDER:")
        print(f"{'Línea':<10} {'Prob Over':<12} {'Cuota Impl':<12} {'Confianza':<15} {'Prob Under':<12} {'Cuota Impl':<12}")
        print("-" * 85)

        # Iterate the lines the analysis actually computed rather than
        # repeating the list here.
        for linea in analisis['over']:
            prob_over = analisis['over'][linea]
            prob_under = analisis['under'][linea]

            # Implied decimal odds are the inverse of the probability.
            cuota_impl_over = 100 / prob_over if prob_over > 0 else 999
            cuota_impl_under = 100 / prob_under if prob_under > 0 else 999

            conf_over = clasificar_confianza(prob_over)

            print(f"O/U {linea:<5} {prob_over:6.2f}% @{cuota_impl_over:5.2f} {conf_over:<15} {prob_under:6.2f}% @{cuota_impl_under:5.2f}")

        # ===========================
        # REPORT: RECOMMENDATIONS
        # ===========================

        print(f"\n💡 RECOMENDACIONES DE APUESTA:")

        mejores_over = [(l, p) for l, p in analisis['over'].items() if p >= 55]
        mejores_under = [(l, p) for l, p in analisis['under'].items() if p >= 55]

        if mejores_over:
            print(f"\n✅ OVER con confianza MEDIA/ALTA:")
            for linea, prob in sorted(mejores_over, key=lambda x: x[1], reverse=True):
                cuota_impl = 100 / prob
                conf = clasificar_confianza(prob)
                print(f" • Over {linea}: {prob:.2f}% (Cuota justa: @{cuota_impl:.2f}) - {conf}")

        if mejores_under:
            print(f"\n✅ UNDER con confianza MEDIA/ALTA:")
            for linea, prob in sorted(mejores_under, key=lambda x: x[1], reverse=True):
                cuota_impl = 100 / prob
                conf = clasificar_confianza(prob)
                print(f" • Under {linea}: {prob:.2f}% (Cuota justa: @{cuota_impl:.2f}) - {conf}")

        if not mejores_over and not mejores_under:
            print(f" ⚠️ No hay apuestas con confianza MEDIA o superior")

        # ===========================
        # RISK ANALYSIS
        # ===========================

        # Every key read below (and by the batch exporter) gets a neutral
        # default, so a team without reliability data no longer discards a
        # prediction that was computed successfully.
        riesgo_por_defecto = {
            'fiabilidad': 'DESCONOCIDO',
            'score': 0,
            'score_promedio': 0.0,
            'nivel_local': 'DESCONOCIDO',
            'nivel_away': 'DESCONOCIDO',
            'cv_local': 0.0,
            'cv_away': 0.0,
            'mensaje': '⚠️ Datos insuficientes',
        }
        try:
            df_varianza_temp = analizar_fiabilidad_equipos(df_database, temporada=temporada, min_partidos=3)
            riesgo = obtener_fiabilidad_partido(local, visitante, df_varianza_temp)
        except KeyError as exc:
            # Raised when the reliability table lacks the expected columns
            # (e.g. no team reached the minimum number of matches).
            print(f"\n⚠️ No se pudo calcular la fiabilidad: {exc}")
            riesgo = {}
        riesgo = {**riesgo_por_defecto, **riesgo}

        print(f"\n⚠️ ANÁLISIS DE RIESGO:")
        print(f" Local ({local}): {riesgo['nivel_local']} (CV: {riesgo['cv_local']:.1f}%)")
        print(f" Away ({visitante}): {riesgo['nivel_away']} (CV: {riesgo['cv_away']:.1f}%)")
        print(f" 🎲 FIABILIDAD PARTIDO: {riesgo['fiabilidad']} (Score: {riesgo['score_promedio']:.1f})")
        print(f" 💡 {riesgo['mensaje']}")

        # ===========================
        # RESULT
        # ===========================

        return {
            "prediccion": round(prediccion, 2),
            "local": local,
            "visitante": visitante,
            "ppp_local": local_ppp,
            "ppp_away": away_ppp,
            "ppp_diff": ppp_diff,
            "riesgo": riesgo,
            "stats": {
                "local_ck": local_ck_home,
                "away_ck": away_ck_away,
                "local_ck_received": local_ck_received,
                "away_ck_received": away_ck_received,
                "h2h_total": h2h_total,
                "partido_esperado": partido_ck_esperado
            },
            "probabilidades_exactas": analisis['exactas'],
            "probabilidades_over": analisis['over'],
            "probabilidades_under": analisis['under'],
            "valor_mas_probable": valor_mas_probable[0],
            "prob_mas_probable": valor_mas_probable[1],
            "rango_80": (min(rango_80), max(rango_80))
        }

    except Exception as e:
        # Top-level boundary: report the failure to the caller as data and
        # keep the traceback on the console for diagnosis.
        print(f"\n❌ ERROR: {str(e)}")
        import traceback
        traceback.print_exc()
        return {"error": str(e), "prediccion": None}
841
+
842
def predecir_partidos_batch(partidos, jornada, temporada="2526", league_code="ESP", export_csv=True, filename=None,
                            df_database=None, xgb_model="", scaler="", lst_years=None):
    """Predict corners for several fixtures and optionally export a CSV.

    Each fixture is run through ``predecir_corners``; fixtures that come back
    with an error are reported and skipped.

    Args:
        partidos: List of ``(local, visitante)`` tuples.
        jornada: Round number.
        temporada: Season code (format ``"2526"``).
        league_code: League code.
        export_csv: When true and at least one fixture was analysed, write
            the results to ``filename``.
        filename: CSV path; defaults to
            ``predicciones_<liga>_J<jornada>_<temporada>.csv``.
        df_database: Match table forwarded to ``predecir_corners``. Defaults
            to an empty DataFrame.
        xgb_model: Model forwarded to ``predecir_corners``.
        scaler: Scaler forwarded to ``predecir_corners``.
        lst_years: Season list forwarded to ``predecir_corners``. Defaults to
            an empty list.

    Returns:
        DataFrame with one row per successfully analysed fixture; empty when
        none succeeded.
    """
    # Fresh objects per call instead of defaults shared between calls.
    if df_database is None:
        df_database = pd.DataFrame()
    if lst_years is None:
        lst_years = []

    # Neutral values for any risk field the prediction did not provide.
    riesgo_por_defecto = {
        'fiabilidad': 'DESCONOCIDO',
        'score_promedio': 0.0,
        'nivel_local': 'DESCONOCIDO',
        'nivel_away': 'DESCONOCIDO',
        'cv_local': 0.0,
        'cv_away': 0.0,
        'mensaje': '⚠️ Datos insuficientes',
    }

    def _prob_linea(resultado, tipo, linea):
        # Probability (in percent) for an over/under line. Lines that the
        # prediction did not precompute are derived here from the predicted
        # mean, so they are no longer exported as a fake 0 %. The stored
        # prediction is rounded to 2 decimals, so these values can differ
        # marginally from the precomputed ones.
        probs = resultado[f'probabilidades_{tipo}']
        if linea in probs:
            return probs[linea]
        from scipy.stats import poisson
        prob_under = float(poisson.cdf(linea, resultado['prediccion'])) * 100
        return prob_under if tipo == 'under' else 100 - prob_under

    resultados = []

    print("\n" + "=" * 120)
    print(f"🎯 PROCESANDO {len(partidos)} PARTIDOS - {league_code} | J{jornada} | Temporada {temporada}")
    print("=" * 120)

    for idx, (local, visitante) in enumerate(partidos, 1):
        print(f"\n[{idx}/{len(partidos)}] Procesando: {local} vs {visitante}...")

        resultado = predecir_corners(
            local=local,
            visitante=visitante,
            jornada=jornada,
            temporada=temporada,
            league_code=league_code,
            df_database=df_database,
            xgb_model=xgb_model,
            scaler=scaler,
            lst_years=lst_years)

        if resultado.get("error"):
            print(f" ❌ Error: {resultado['error']}")
            continue

        riesgo = {**riesgo_por_defecto, **resultado['riesgo']}
        stats = resultado['stats']

        # ===========================
        # ROW OF DATA
        # ===========================

        fila = {
            'Partido': f"{local} vs {visitante}",
            'Local': local,
            'Visitante': visitante,
            'Liga': league_code,
            'Jornada': jornada,
            'Temporada': temporada,

            # Prediction
            'Prediccion': resultado['prediccion'],
            'Valor_Mas_Probable': resultado['valor_mas_probable'],
            'Prob_Valor_Mas_Probable_%': round(resultado['prob_mas_probable'], 2),
            'Rango_80%_Min': resultado['rango_80'][0],
            'Rango_80%_Max': resultado['rango_80'][1],

            # Points per match
            'PPP_Local': round(resultado['ppp_local'], 2),
            'PPP_Away': round(resultado['ppp_away'], 2),
            'PPP_Diferencia': round(resultado['ppp_diff'], 2),

            # Historical statistics
            'CK_Local_Casa': round(stats['local_ck'], 1),
            'CK_Away_Fuera': round(stats['away_ck'], 1),
            'CK_Local_Recibidos': round(stats['local_ck_received'], 1),
            'CK_Away_Recibidos': round(stats['away_ck_received'], 1),
            'CK_Esperado_Suma': round(stats['partido_esperado'], 1),
            'CK_H2H_Total': round(stats['h2h_total'], 1) if stats['h2h_total'] > 0 else 'N/A',

            # Risk
            'Fiabilidad_Partido': riesgo['fiabilidad'],
            'Score_Fiabilidad': round(riesgo['score_promedio'], 1),
            'Nivel_Local': riesgo['nivel_local'],
            'Nivel_Away': riesgo['nivel_away'],
            'CV_Local_%': round(riesgo['cv_local'], 1),
            'CV_Away_%': round(riesgo['cv_away'], 1),
        }

        # ===========================
        # OVER 6.5 to 10.5 / UNDER 12.5 to 9.5
        # ===========================
        columnas_lineas = (
            ('Over', 'over', [6.5, 7.5, 8.5, 9.5, 10.5]),
            ('Under', 'under', [12.5, 11.5, 10.5, 9.5]),
        )
        for prefijo, tipo, lineas in columnas_lineas:
            for linea in lineas:
                prob = _prob_linea(resultado, tipo, linea)
                cuota_impl = round(100 / prob, 2) if prob > 0 else 999
                conf = clasificar_confianza(prob)

                fila[f'{prefijo}_{linea}_Prob_%'] = round(prob, 2)
                fila[f'{prefijo}_{linea}_Cuota'] = cuota_impl
                fila[f'{prefijo}_{linea}_Confianza'] = conf

        # ===========================
        # RECOMMENDATIONS
        # ===========================

        mejores_over = [(l, p) for l, p in resultado['probabilidades_over'].items() if p >= 55]
        mejores_under = [(l, p) for l, p in resultado['probabilidades_under'].items() if p >= 55]

        if riesgo['score_promedio'] < 35:
            fila['Recomendacion'] = "⛔ EVITAR - Baja fiabilidad"
            fila['Es_Apostable'] = "NO"
        elif not mejores_over and not mejores_under:
            fila['Recomendacion'] = "⚠️ NO RECOMENDADO - Sin confianza suficiente"
            fila['Es_Apostable'] = "NO"
        else:
            recomendaciones = []

            if mejores_over:
                mejor_over = max(mejores_over, key=lambda x: x[1])
                cuota_over = round(100 / mejor_over[1], 2)
                recomendaciones.append(f"Over {mejor_over[0]} ({mejor_over[1]:.1f}% @{cuota_over})")

            if mejores_under:
                mejor_under = max(mejores_under, key=lambda x: x[1])
                cuota_under = round(100 / mejor_under[1], 2)
                recomendaciones.append(f"Under {mejor_under[0]} ({mejor_under[1]:.1f}% @{cuota_under})")

            fila['Recomendacion'] = " | ".join(recomendaciones)

            if riesgo['score_promedio'] >= 65:
                fila['Es_Apostable'] = "SÍ ⭐⭐⭐"
            elif riesgo['score_promedio'] >= 50:
                fila['Es_Apostable'] = "SÍ ✅"
            else:
                fila['Es_Apostable'] = "PRECAUCIÓN 🟡"

        fila['Mensaje_Riesgo'] = riesgo['mensaje']

        resultados.append(fila)
        print(f" ✅ Completado")

    # ===========================
    # DATAFRAME
    # ===========================

    df_resultados = pd.DataFrame(resultados)

    print("\n" + "=" * 120)
    print(f"✅ PROCESAMIENTO COMPLETADO: {len(df_resultados)} partidos analizados")
    print("=" * 120)

    # With no rows the frame has no columns either, so the summary below
    # would fail on 'Es_Apostable'; there is nothing to export or summarise.
    if df_resultados.empty:
        print("\n⚠️ Ningún partido pudo analizarse; no hay resumen que mostrar")
        return df_resultados

    # ===========================
    # CSV EXPORT
    # ===========================

    if export_csv:
        if filename is None:
            filename = f"predicciones_{league_code}_J{jornada}_{temporada}.csv"

        df_resultados.to_csv(filename, index=False, encoding='utf-8-sig')
        print(f"\n💾 Resultados exportados a: {filename}")

    # ===========================
    # SUMMARY
    # ===========================

    es_apostable = df_resultados['Es_Apostable']

    print(f"\n📊 RESUMEN DE APUESTAS:")
    print(f" Partidos apostables: {int(es_apostable.str.contains('SÍ').sum())} / {len(df_resultados)}")
    print(f" Partidos ALTA confianza (⭐⭐⭐): {int((es_apostable == 'SÍ ⭐⭐⭐').sum())}")
    print(f" Partidos MEDIA confianza (✅): {int((es_apostable == 'SÍ ✅').sum())}")
    print(f" Partidos a evitar (⛔): {int((es_apostable == 'NO').sum())}")

    return df_resultados
1020
+
1021
def mostrar_resumen_batch(df_resultados):
    """Print a console summary of the bettable matches of a batch prediction.

    Only rows whose ``Es_Apostable`` value contains ``'SÍ'`` are shown, sorted
    by ``Score_Fiabilidad`` in descending order. For each of them the headline
    figures are printed, followed by every Over/Under line in
    ``[7.5, 8.5, 9.5, 10.5]`` whose probability is at least 55%.

    Args:
        df_resultados: DataFrame with one row per analysed match. The columns
            read here are ``Es_Apostable``, ``Score_Fiabilidad``, ``Partido``,
            ``Prediccion``, ``Valor_Mas_Probable``,
            ``Prob_Valor_Mas_Probable_%``, ``CK_Local_Casa``,
            ``CK_Away_Fuera``, ``CK_H2H_Total``, ``Fiabilidad_Partido``,
            ``Recomendacion`` and the optional ``Over_*`` / ``Under_*``
            columns. It may be empty or ``None``.

    Returns:
        None. Everything is written to stdout.
    """
    print("\n" + "=" * 120)
    print("🎯 MEJORES OPORTUNIDADES DE APUESTA")
    print("=" * 120)

    # A batch with no analysed matches produces a DataFrame without columns;
    # indexing 'Es_Apostable' on it would raise KeyError, so report it as
    # "nothing to bet on" instead.
    if df_resultados is None or 'Es_Apostable' not in df_resultados.columns:
        print("\n⚠️ No se encontraron partidos con oportunidades de apuesta")
        return

    # Keep bettable rows only; na=False keeps the mask boolean if a value is missing.
    mask_apostable = df_resultados['Es_Apostable'].str.contains('SÍ', na=False)
    df_apostables = df_resultados[mask_apostable].copy()

    if df_apostables.empty:
        print("\n⚠️ No se encontraron partidos con oportunidades de apuesta")
        return

    # Most reliable matches first.
    df_apostables = df_apostables.sort_values('Score_Fiabilidad', ascending=False)

    for _, row in df_apostables.iterrows():
        print(f"\n{'='*120}")
        print(f"🏟️ {row['Partido']}")
        print(f"{'='*120}")
        print(f"📊 Predicción: {row['Prediccion']:.2f} corners | Valor más probable: {row['Valor_Mas_Probable']} ({row['Prob_Valor_Mas_Probable_%']:.1f}%)")
        print(f"📈 Histórico: Local {row['CK_Local_Casa']:.1f} CK | Away {row['CK_Away_Fuera']:.1f} CK | H2H: {row['CK_H2H_Total']}")
        print(f"🎲 Fiabilidad: {row['Fiabilidad_Partido']} (Score: {row['Score_Fiabilidad']:.1f}/100)")
        print(f"💡 {row['Recomendacion']}")

        # Highlight the lines with a probability of at least 55%.
        print(f"\n 📌 Líneas destacadas:")
        for linea in [7.5, 8.5, 9.5, 10.5]:
            # Columns that do not exist for a given line count as 0% probability.
            over_prob = row.get(f'Over_{linea}_Prob_%', 0)
            under_prob = row.get(f'Under_{linea}_Prob_%', 0)

            if over_prob >= 55:
                cuota = row.get(f'Over_{linea}_Cuota', 0)
                conf = row.get(f'Over_{linea}_Confianza', '')
                print(f" • Over {linea}: {over_prob:.1f}% @{cuota:.2f} - {conf}")

            if under_prob >= 55:
                cuota = row.get(f'Under_{linea}_Cuota', 0)
                conf = row.get(f'Under_{linea}_Confianza', '')
                print(f" • Under {linea}: {under_prob:.1f}% @{cuota:.2f} - {conf}")
1062
+
1063
+
1064
+
1065
+
1066
class USE_MODEL():
    """Load the trained corners model plus its datasets and run batch predictions.

    Construction loads the pickled model and scaler, then the cleaned CSV
    datasets, then the list of season codes handed to the batch predictor.
    """

    def __init__(self):
        self.load_models()
        self.load_data()
        self.init_variables()

    def init_variables(self):
        """Define the season codes passed to the batch predictor as ``lst_years``."""
        self.lst_years = ["1819", "1920", "2021", "2122", "2223", "2324", "2425", "2526"]
        print("Variables Loaded...")

    def load_data(self):
        """Read the cleaned datasets into ``self.df_dataset``.

        The historic CSV is always read. When the current-year CSV exists it
        is read as well and concatenated after the historic rows.
        """
        import os

        # Clean dataset generated by generate_dataset.py.
        self.df_dataset_historic = pd.read_csv("dataset/cleaned/dataset_cleaned.csv")

        current_year_path = "dataset/cleaned/dataset_cleaned_current_year.csv"
        if os.path.exists(current_year_path):
            self.df_dataset_current_year = pd.read_csv(current_year_path)
            self.df_dataset = pd.concat([self.df_dataset_historic, self.df_dataset_current_year])
        else:
            self.df_dataset = self.df_dataset_historic

        self.df_dataset["season"] = self.df_dataset["season"].astype(str)
        # fillna returns a new Series; assign it back so the fill actually applies.
        self.df_dataset["Performance_Save%"] = self.df_dataset["Performance_Save%"].fillna(0)

        print("Data Loaded...")

    def load_models(self):
        """Load the pickled XGBoost model and its scaler from ``models/``."""
        self.xgb_model = joblib.load('models/xgboost_corners_optimized_v2_6_leagues.pkl')
        self.scaler = joblib.load('models/scaler_corners_xgb_v2_6_leagues.pkl')
        print("Models Ready...")

    def consume_model(self, partidos, jornada, temporada, league_code):
        """Predict a batch of matches, export the result to CSV and print a summary.

        Args:
            partidos: Matches to analyse, forwarded to ``predecir_partidos_batch``.
            jornada: Matchweek number; also used in the CSV file name.
            temporada: Season code; also used in the CSV file name.
            league_code: League code; also used as the output sub-directory.

        Returns:
            The DataFrame returned by ``predecir_partidos_batch``.
        """
        import os

        # Build the path portably (the previous literal used backslashes, which
        # are not directory separators outside Windows) and make sure the
        # directory exists before the CSV export.
        output_dir = os.path.join("results", str(league_code))
        os.makedirs(output_dir, exist_ok=True)
        filename = os.path.join(
            output_dir,
            f"{league_code}-{temporada}-{jornada}-predicciones.csv",
        )

        df_predict = predecir_partidos_batch(
            partidos=partidos,
            jornada=jornada,
            temporada=temporada,
            league_code=league_code,
            export_csv=True,
            filename=filename,
            df_database=self.df_dataset,
            xgb_model=self.xgb_model,
            scaler=self.scaler,
            lst_years=self.lst_years
        )

        # Console summary of the bettable matches.
        mostrar_resumen_batch(df_predict)

        return df_predict

    def kelly_stats(self, p, odds, fraction=0.2):
        """Return the fractional Kelly stake for a bet.

        Args:
            p: Win probability, in the range 0-1.
            odds: Decimal odds offered.
            fraction: Share of the full Kelly stake to use (0.1 = "Kelly 10%").

        Returns:
            The stake as a fraction of the bankroll; 0 when the bet has no
            positive edge or when the odds pay no profit (``odds <= 1``).
        """
        b = odds - 1  # net profit per unit staked
        if b <= 0:
            # No profit is possible; this also avoids dividing by zero.
            return 0.0
        q = 1 - p
        f_star = (b * p - q) / b
        f_star = max(f_star, 0)  # never suggest a negative stake
        return f_star * fraction
1126
+
1127
# Module-level driver: constructing USE_MODEL loads the models and datasets,
# and the consume_model call below runs a batch prediction.
# NOTE(review): these statements execute whenever this module is imported,
# not only when it is run as a script — confirm that is intended.
a = USE_MODEL()

# Fixtures to predict, as pairs of team names.
# NOTE(review): presumably (home team, away team) — verify against
# predecir_partidos_batch.
partidos = [
    ("Werder Bremen", "Wolfsburg"),
    ("Hoffenheim", "RB Leipzig"),
    ("Leverkusen", "Heidenheim"),
    ("Hamburger SV", "Dortmund"),
    ("Union Berlin", "Bayern"),
    ("Gladbach", "Köln"),
    ("Freiburg", "St. Pauli"),
    ("Stuttgart", "Augsburg"),
    ("Eint Frankfurt", "Mainz 05")
]

# Predict matchweek 10 of season "2526" for league code "GER".
a.consume_model(
    partidos=partidos,
    jornada=10,
    temporada="2526",
    league_code="GER"
)
1147
+
1148
+
src/models/train_model.py ADDED
@@ -0,0 +1,425 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import pandas as pd
3
+ import json
4
+ import os
5
+ from datetime import datetime
6
+
7
+ # MLflow
8
+ import mlflow
9
+ import mlflow.sklearn
10
+ import mlflow.xgboost
11
+
12
+ from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
13
+ from sklearn.preprocessing import StandardScaler
14
+ from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, make_scorer
15
+ from xgboost import XGBRegressor
16
+ import joblib
17
+
18
+
19
class TRAIN_MODEL():
    """Train, evaluate and persist the XGBoost corners regressor with MLflow tracking.

    Instantiating the class runs the whole pipeline inside a single MLflow
    run: load the processed dataset, split and scale it, obtain the
    hyperparameters (GridSearchCV or a saved JSON config), fit the final
    model, log metrics and feature importances, and save model and scaler.
    """

    def __init__(self, nombre, use_grid_search=False, config_path="config/model_config.json"):
        """
        Train a model with MLflow tracking.

        Args:
            nombre: Model identifier (e.g. "v3_production"); used in the run
                name and in the output file names.
            use_grid_search: True = search hyperparameters and save them,
                False = load them from ``config_path``.
            config_path: Path of the JSON file holding the hyperparameters.

        Raises:
            Exception: any pipeline error is re-raised after the run has been
                tagged ``status=FAILED``.
        """
        # ===========================
        # MLFLOW CONFIGURATION
        # ===========================
        mlflow.set_tracking_uri("file:./mlruns")
        mlflow.set_experiment("corners_prediction")

        self.nombre = nombre
        self.use_grid_search = use_grid_search
        self.config_path = config_path
        self.timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

        # Everything below is logged under one MLflow run.
        with mlflow.start_run(run_name=f"{nombre}_{self.timestamp}") as run:
            self.run_id = run.info.run_id

            print(f"\n{'='*80}")
            print(f"🚀 Entrenamiento iniciado con MLflow")
            print(f" Run ID: {self.run_id}")
            print(f" Nombre: {nombre}")
            print(f" GridSearch: {'SÍ' if use_grid_search else 'NO (usando config)'}")
            print(f"{'='*80}\n")

            # Basic tags
            mlflow.set_tags({
                "model_name": nombre,
                "timestamp": self.timestamp,
                "grid_search_used": str(use_grid_search),
                "framework": "XGBoost",
                "task": "regression"
            })

            # Training pipeline
            try:
                self.init_variables()
                self.load_dataset()
                self.split_train_test(0.15)
                self.define_model()

                if use_grid_search:
                    print("🔍 Ejecutando GridSearch (puede tardar)...")
                    self.train_grid_search()
                    self.save_best_params()  # keep them for later retraining
                else:
                    print("⚡ Usando hiperparámetros guardados (rápido)")
                    self.load_best_params()

                self.train_model()
                self.test_and_eval()
                self.top_features()
                self.save_models(nombre)

                mlflow.set_tag("status", "SUCCESS")
                print(f"\n✅ Entrenamiento completado")
                print(f"📊 Ver en MLflow UI: mlflow ui")

            except Exception as e:
                # Tag the run as failed, then let the error propagate.
                mlflow.set_tag("status", "FAILED")
                print(f"\n❌ Error: {e}")
                raise

    @staticmethod
    def _safe_metric_name(name):
        """Replace characters outside ``/ \\w . - space :`` with "_".

        NOTE(review): MLflow validates metric names; the allowed character
        set used here should be confirmed against the installed version.
        """
        import re

        return re.sub(r"[^/\w.\- :]", "_", str(name))

    def init_variables(self):
        """Define the GridSearch search space and log it when it will be used."""
        self.param_grid = {
            'n_estimators': [200],            # 1 value
            'max_depth': [3, 4, 5],           # 3 values
            'learning_rate': [0.02, 0.03],    # 2 values
            'reg_alpha': [3.0, 5.0],          # 2 values
            'reg_lambda': [5.0, 8.0],         # 2 values
            'gamma': [0.5, 1.0],              # 2 values
            'subsample': [0.7],               # 1 value
            'colsample_bytree': [0.7],        # 1 value
            'colsample_bylevel': [0.6],       # 1 value
            'min_child_weight': [5, 7]        # 2 values
        }
        # Combinations: 1 x 3 x 2 x 2 x 2 x 2 x 1 x 1 x 1 x 2 = 192

        # Log the grid configuration
        if self.use_grid_search:
            for param, values in self.param_grid.items():
                mlflow.log_param(f"grid_{param}", str(values))

        print("✅ Variables inicializadas")

    def load_dataset(self):
        """Load the processed dataset, drop target outliers and fill nulls.

        Sets ``self.df_data`` (features) and ``self.y_array`` (target taken
        from column ``y``); rows with a target outside 3-17 are removed.
        """
        self.df_data = pd.read_csv("dataset/processed/dataset_processed.csv")
        self.y = self.df_data["y"]
        self.df_data = self.df_data.drop(["y"], axis=1)
        self.y_array = np.array(self.y).flatten()

        # Filter outliers (keep 3-17 corners)
        mask = (self.y_array >= 3) & (self.y_array <= 17)
        self.df_data = self.df_data[mask].copy()
        self.y_array = self.y_array[mask]

        # Replace nulls with 0
        if self.df_data.isnull().any().any():
            self.df_data = self.df_data.fillna(0)

        # Log dataset information
        mlflow.log_params({
            "dataset_samples": len(self.df_data),
            "dataset_features": self.df_data.shape[1],
            "target_min": float(self.y_array.min()),
            "target_max": float(self.y_array.max()),
            "target_mean": float(self.y_array.mean()),
            "target_std": float(self.y_array.std())
        })

        print(f"✅ Dataset cargado: {self.df_data.shape}")

    def split_train_test(self, test_size_):
        """Split the data into train/validation/test sets and scale the features.

        Args:
            test_size_: Fraction of the data held out as the test set.

        The scaler is fitted on the training portion before the validation
        split is taken from it.
        """
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.df_data, self.y_array,
            test_size=test_size_,
            random_state=42,
            shuffle=True
        )

        # Scale: fit on train, apply the same transform to test
        self.scaler = StandardScaler()
        self.X_train = pd.DataFrame(
            self.scaler.fit_transform(self.X_train),
            columns=self.X_train.columns
        )
        self.X_test = pd.DataFrame(
            self.scaler.transform(self.X_test),
            columns=self.X_test.columns
        )

        # Validation split taken from the (already scaled) training data
        self.X_train_fit, self.X_val, self.y_train_fit, self.y_val = train_test_split(
            self.X_train, self.y_train,
            test_size=0.15,
            random_state=43
        )

        # Log the split sizes
        mlflow.log_params({
            "train_samples": len(self.X_train_fit),
            "val_samples": len(self.X_val),
            "test_samples": len(self.X_test),
            "test_size": test_size_
        })

        print(f"✅ Train: {len(self.X_train_fit)} | Val: {len(self.X_val)} | Test: {len(self.X_test)}")

    def define_model(self):
        """Create the base regressor and, when requested, the GridSearchCV object."""
        self.xgb_base = XGBRegressor(
            objective="reg:squarederror",
            tree_method="hist",
            random_state=42,
            n_jobs=-1,
            verbosity=0
        )

        if self.use_grid_search:
            self.kfold = KFold(n_splits=5, shuffle=True, random_state=42)
            # greater_is_better=False: the scorer returns the negated MAE
            self.mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)

            self.grid_search = GridSearchCV(
                estimator=self.xgb_base,
                param_grid=self.param_grid,
                cv=self.kfold,
                scoring=self.mae_scorer,
                n_jobs=-1,
                verbose=2,
                return_train_score=True
            )

    def train_grid_search(self):
        """Run GridSearch and keep the best hyperparameters in ``self.best_params``."""
        print("\n🔍 Buscando mejores hiperparámetros...")
        self.grid_search.fit(self.X_train_fit, self.y_train_fit)

        # Best parameters
        self.best_params = self.grid_search.best_params_

        # Log to MLflow
        for param, value in self.best_params.items():
            mlflow.log_param(f"best_{param}", value)

        # best_score_ is a negated MAE, so flip the sign
        mlflow.log_metric("cv_best_mae", -self.grid_search.best_score_)

        print(f"\n✅ Mejores hiperparámetros encontrados:")
        for param, value in self.best_params.items():
            print(f" {param}: {value}")
        print(f" CV MAE: {-self.grid_search.best_score_:.4f}")

    def save_best_params(self):
        """Write the best hyperparameters to ``self.config_path`` as JSON."""
        # Create the directory of the configured path (not a fixed "config/"),
        # so a custom config_path works too.
        config_dir = os.path.dirname(self.config_path)
        if config_dir:
            os.makedirs(config_dir, exist_ok=True)

        config = {
            "model_name": self.nombre,
            "timestamp": self.timestamp,
            "best_params": self.best_params,
            "cv_mae": float(-self.grid_search.best_score_),
            "run_id": self.run_id
        }

        with open(self.config_path, 'w') as f:
            json.dump(config, f, indent=4)

        # Log the file to MLflow
        mlflow.log_artifact(self.config_path)

        print(f"💾 Hiperparámetros guardados en: {self.config_path}")

    def load_best_params(self):
        """Load the hyperparameters from ``self.config_path``.

        Raises:
            FileNotFoundError: if the config file does not exist.
        """
        if not os.path.exists(self.config_path):
            raise FileNotFoundError(
                f"No se encontró {self.config_path}. "
                "Ejecuta primero con use_grid_search=True"
            )

        with open(self.config_path, 'r') as f:
            config = json.load(f)

        self.best_params = config["best_params"]

        # Log the parameters to MLflow
        for param, value in self.best_params.items():
            mlflow.log_param(f"loaded_{param}", value)

        mlflow.log_param("config_source", self.config_path)
        mlflow.log_param("previous_cv_mae", config.get("cv_mae", "N/A"))

        print(f"✅ Hiperparámetros cargados desde: {self.config_path}")
        print(f" Origen: {config.get('model_name', 'unknown')} ({config.get('timestamp', 'unknown')})")

    def train_model(self):
        """Fit the final model with ``self.best_params``."""
        self.xgb_model = XGBRegressor(
            **self.best_params,
            objective="reg:squarederror",
            tree_method="hist",
            random_state=42,
            n_jobs=-1,
            verbosity=0
        )

        self.xgb_model.fit(
            self.X_train_fit,
            self.y_train_fit,
            eval_set=[(self.X_val, self.y_val)],
            verbose=False
        )

        print("✅ Modelo entrenado")

    def test_and_eval(self):
        """Evaluate the model on train/val/test and log the metrics to MLflow."""
        # Predictions
        y_train_pred = self.xgb_model.predict(self.X_train_fit)
        y_val_pred = self.xgb_model.predict(self.X_val)
        y_test_pred = self.xgb_model.predict(self.X_test)

        # Metrics per data split
        metrics = {
            'train': {
                'mae': mean_absolute_error(self.y_train_fit, y_train_pred),
                'rmse': np.sqrt(mean_squared_error(self.y_train_fit, y_train_pred)),
                'r2': r2_score(self.y_train_fit, y_train_pred)
            },
            'val': {
                'mae': mean_absolute_error(self.y_val, y_val_pred),
                'rmse': np.sqrt(mean_squared_error(self.y_val, y_val_pred)),
                'r2': r2_score(self.y_val, y_val_pred)
            },
            'test': {
                'mae': mean_absolute_error(self.y_test, y_test_pred),
                'rmse': np.sqrt(mean_squared_error(self.y_test, y_test_pred)),
                'r2': r2_score(self.y_test, y_test_pred)
            }
        }

        # Log every metric to MLflow
        for set_name, set_metrics in metrics.items():
            for metric_name, value in set_metrics.items():
                mlflow.log_metric(f"{set_name}_{metric_name}", value)

        # Cross-validation on the full (scaled) training data
        cv_mae = cross_val_score(
            self.xgb_model, self.X_train, self.y_train,
            cv=5, scoring='neg_mean_absolute_error'
        )
        cv_r2 = cross_val_score(
            self.xgb_model, self.X_train, self.y_train,
            cv=5, scoring='r2'
        )

        # cv_mae holds negated MAE values, hence the sign flip on the mean
        mlflow.log_metric("cv_mae_mean", -cv_mae.mean())
        mlflow.log_metric("cv_mae_std", cv_mae.std())
        mlflow.log_metric("cv_r2_mean", cv_r2.mean())
        mlflow.log_metric("cv_r2_std", cv_r2.std())

        # Error analysis on the test set
        test_errors = np.abs(self.y_test - y_test_pred)
        mlflow.log_metric("test_error_median", float(np.median(test_errors)))
        mlflow.log_metric("test_error_p90", float(np.percentile(test_errors, 90)))
        mlflow.log_metric("test_pct_error_lt_2", float((test_errors < 2.0).sum() / len(test_errors) * 100))

        # Overfitting gap: train R² minus test R²
        gap = metrics['train']['r2'] - metrics['test']['r2']
        mlflow.log_metric("overfitting_gap", gap)

        print(f"\n📊 MÉTRICAS:")
        print(f" Train MAE: {metrics['train']['mae']:.4f} | R²: {metrics['train']['r2']:.4f}")
        print(f" Val MAE: {metrics['val']['mae']:.4f} | R²: {metrics['val']['r2']:.4f}")
        print(f" Test MAE: {metrics['test']['mae']:.4f} | R²: {metrics['test']['r2']:.4f}")
        print(f" CV MAE: {-cv_mae.mean():.4f} ± {cv_mae.std():.4f}")
        print(f" Overfitting Gap: {gap:.4f}")

    def top_features(self):
        """Save the feature importances to CSV and log the top 10 to MLflow."""
        feature_importance = pd.DataFrame({
            'feature': self.df_data.columns,
            'importance': self.xgb_model.feature_importances_
        }).sort_values('importance', ascending=False)

        # This method runs before save_models(), so "models/" may not exist yet.
        os.makedirs("models", exist_ok=True)

        # Save the CSV
        importance_path = f"models/feature_importance_{self.nombre}.csv"
        feature_importance.to_csv(importance_path, index=False)
        mlflow.log_artifact(importance_path)

        # Log the top 10; feature names are sanitised for use as metric names
        for _, row in feature_importance.head(10).iterrows():
            metric_name = f"feat_imp_{self._safe_metric_name(row['feature'])}"
            mlflow.log_metric(metric_name, row['importance'])

        print(f"\n🔍 Top 5 features:")
        for _, row in feature_importance.head(5).iterrows():
            print(f" {row['feature']}: {row['importance']:.4f}")

    def save_models(self, nombre):
        """Save model and scaler under ``models/`` and log them to MLflow.

        Args:
            nombre: Suffix used in the two pickle file names.
        """
        os.makedirs("models", exist_ok=True)

        # Paths
        model_path = f'models/xgboost_corners_{nombre}.pkl'
        scaler_path = f'models/scaler_corners_{nombre}.pkl'

        # Save the files
        joblib.dump(self.xgb_model, model_path)
        joblib.dump(self.scaler, scaler_path)

        # Log to MLflow
        mlflow.xgboost.log_model(
            self.xgb_model,
            artifact_path="model",
            registered_model_name="corners_predictor"
        )
        mlflow.log_artifact(scaler_path, artifact_path="preprocessing")

        print(f"\n💾 Modelos guardados:")
        print(f" {model_path}")
        print(f" {scaler_path}")
        print(f" MLflow Model Registry ✓")
401
+
402
+
403
+ # ===========================
404
+ # USO
405
+ # ===========================
406
+
407
if __name__ == "__main__":

    # ========================================
    # OPTION 1: first run, or every 3-6 months.
    # Runs GridSearch (slow) and saves the best hyperparameters.
    # ========================================
    # model = TRAIN_MODEL(
    #     nombre="v4_grid_search",
    #     use_grid_search=True  # searches for the best hyperparameters
    # )

    # ========================================
    # OPTION 2: regular retraining.
    # Reuses the saved hyperparameters (fast); requires that OPTION 1 has
    # been run once so that config/model_config.json exists.
    # ========================================
    model = TRAIN_MODEL(
        nombre="v4_retrain",
        use_grid_search=False  # loads config/model_config.json
    )
src/process_data/__init__.py ADDED
File without changes
src/process_data/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (171 Bytes). View file
 
src/process_data/__pycache__/process_dataset.cpython-311.pyc ADDED
Binary file (27.7 kB). View file
 
src/process_data/generate_dataset.py ADDED
@@ -0,0 +1,211 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import os
3
+
4
+ # Añadir la ruta raíz del proyecto al PYTHONPATH
5
+ project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))
6
+ sys.path.insert(0, project_root)
7
+
8
+ from src.utils.helper import desactivar_advertencias
9
+
10
+ import soccerdata as sd
11
+ import pandas as pd
12
+
13
+
14
def extract_local(game_str):
    """Return the first team name of a game label, or None if it cannot be parsed.

    Everything after the first space of ``game_str`` is split on "-" and the
    first piece is returned with surrounding whitespace removed.

    NOTE(review): a team name that itself contains "-" is cut at its first
    hyphen — confirm whether such names occur in the source data.
    """
    try:
        matchup = game_str.split(" ", 1)[1]
    except (IndexError, AttributeError):
        # No space in the label, or the value is not a string.
        return None
    first_piece = matchup.split("-")[0]
    return first_piece.strip()
20
+
21
def extract_away(game_str):
    """Return the second team name of a game label, or None if it cannot be parsed.

    Everything after the first space of ``game_str`` is split on "-" and the
    second piece is returned with surrounding whitespace removed; None is
    returned when there is no second piece.

    NOTE(review): a team name that itself contains "-" shifts the pieces, so
    the wrong fragment is returned — confirm whether such names occur.
    """
    try:
        pieces = game_str.split(" ", 1)[1].split("-")
    except (IndexError, AttributeError):
        # No space in the label, or the value is not a string.
        return None
    if len(pieces) < 2:
        return None
    return pieces[1].strip()
27
+
28
+
29
class GENERATE_DATASET():
    """Download per-match team statistics from FBref and write a cleaned CSV.

    Instantiating the class runs the whole job: configure leagues, years and
    columns, download the raw data, then filter, convert and export it.
    """

    def __init__(self, current_year):
        """
        Args:
            current_year: Truthy to process only the last configured season
                (written to ``dataset_cleaned_current_year.csv``); falsy to
                process every other season (written to ``dataset_cleaned.csv``).
        """
        print("Clase GENERATE_DATASET Inicializada")

        desactivar_advertencias()
        self.init_variables()
        self.mergue_raw_data_all_leagues(current_year)
        self.process_and_output_dataset(current_year)

    def init_variables(self):
        """Set up the seasons, leagues and column names used by the other methods."""
        # Seasons to request from the data source; the last entry is treated
        # as the current season.
        self.LST_YEARS_CONFIG = [2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024, 2025]

        # Downloaded frames, stored as {league code: {year: DataFrame}}.
        self.dic_historic_all_leagues = {
            "ENG": {},
            "ESP": {},
            "GER": {},
            "FRA": {},
            "ITA": {},
            "NED": {},
            "ENG2": {},
            "POR": {},
            "BEL": {}
        }

        self.df_database = pd.DataFrame()

        # Leagues to request: data-source league name plus the short code
        # stored in the "league" column.
        self.DIC_LEAGUES_CONFIG = {
            "ENG": {
                "name": "ENG-Premier League",
                "code": "ENG"
            },
            "POR": {
                "name": "POR-Primeira Liga",
                "code": "POR"
            },
            "BEL": {
                "name": "BEL-Belgian Pro League",
                "code": "BEL"
            },
            "ESP": {
                "name": "ESP-La Liga",
                "code": "ESP"
            },
            "GER": {
                "name": "GER-Bundesliga",
                "code": "GER"
            },
            "FRA": {
                "name": "FRA-Ligue 1",
                "code": "FRA"
            },
            "ITA": {
                "name": "ITA-Serie A",
                "code": "ITA"
            },
            "NED": {
                "name": "NED-Eredivisie",
                "code": "NED"
            }
        }

        # Columns kept from the flattened data, grouped by statistics table.
        lst_base = ['season', 'date', 'game', 'round', 'day', 'venue', 'team', 'GF', 'GA', 'opponent', "result"]
        lst_columns_shooting = ['Expected_xG', 'Standard_Sh', 'Standard_SoT', 'Standard_Dist']
        lst_columns_passing_type = ['Pass Types_CK']
        lst_columns_passing = ['Total_Att', 'Long_Att', 'Ast', '1/3', 'PrgP']
        lst_columns_defensive = ['Tackles_Att 3rd', 'Tackles_Tkl', 'Blocks_Blocks', 'Int', 'Clr']
        lst_columns_keeper = ['Performance_Save%']
        lst_columns_shot_creation = ['SCA Types_SCA']
        lst_columns_misc = ['Performance_Crs']
        lst_columns_possesion = ['Poss', 'Touches_Att 3rd', 'Carries_PrgC', 'Touches_Touches', 'Touches_Att Pen', 'Carries_Carries', 'Carries_1/3', 'Carries_CPA']

        self.lst_columns_combined = (
            lst_base
            + lst_columns_passing_type
            + lst_columns_passing
            + lst_columns_defensive
            + lst_columns_shooting
            + lst_columns_keeper
            + lst_columns_shot_creation
            + lst_columns_misc
            + lst_columns_possesion
        )
        print("-Variables inicializadas")

    def get_raw_data_from_source(self, league, year):
        """Download and flatten the match statistics of one league and season.

        Args:
            league: Entry of ``DIC_LEAGUES_CONFIG`` (keys ``name`` and ``code``).
            year: Season passed to the FBref scraper.

        Returns:
            DataFrame restricted to ``lst_columns_combined`` plus the columns
            ``local``, ``away`` and ``league``, or None for the league/year
            combinations that are skipped.
        """
        print(f"\nLiga {league}... 📅 Año {year}...", end=" ")

        # These leagues are skipped for 2017.
        if league["name"] in ["NED-Eredivisie", "POR-Primeira Liga", "ENG-Championship"] and year == 2017:
            return

        # Scraper for this specific league and season
        fbref = sd.FBref(leagues=league["name"], seasons=year)

        # Read one statistics table per stat type
        team_season_shooting = fbref.read_team_match_stats(stat_type="shooting", opponent_stats=False)
        team_season_passing_types = fbref.read_team_match_stats(stat_type="passing_types", opponent_stats=False)
        team_season_passing = fbref.read_team_match_stats(stat_type="passing", opponent_stats=False)
        team_season_defensive = fbref.read_team_match_stats(stat_type="defense", opponent_stats=False)
        team_season_goalkeeping = fbref.read_team_match_stats(stat_type="keeper", opponent_stats=False)
        team_season_goal_shot_creation = fbref.read_team_match_stats(stat_type="goal_shot_creation", opponent_stats=False)
        team_season_goal_misc = fbref.read_team_match_stats(stat_type="misc", opponent_stats=False)
        team_season_goal_possession = fbref.read_team_match_stats(stat_type="possession", opponent_stats=False)

        df_concat = pd.concat([team_season_shooting, team_season_passing_types, team_season_passing, team_season_defensive,
                               team_season_goalkeeping, team_season_goal_shot_creation, team_season_goal_misc, team_season_goal_possession], axis=1)

        # Move the index levels into regular columns
        df_reset = df_concat.copy().reset_index()

        # Flatten MultiIndex column names into "level1_level2"
        df_reset.columns = [
            '_'.join(col).strip('_') if isinstance(col, tuple) else col
            for col in df_reset.columns.values
        ]

        # Drop duplicated columns (the tables share some of them)
        df_reset = df_reset.loc[:, ~df_reset.columns.duplicated()]

        # .copy() so the column assignments below write to an independent
        # frame instead of a slice of df_reset.
        df_filtered = df_reset[self.lst_columns_combined].copy()

        # Team names parsed from the "game" label
        df_filtered["local"] = df_filtered["game"].apply(extract_local)
        df_filtered["away"] = df_filtered["game"].apply(extract_away)

        # Add the league code
        df_filtered["league"] = league["code"]

        df_filtered = df_filtered.loc[:, ~df_filtered.columns.duplicated(keep='first')]

        # Report rows whose "game" label could not be parsed
        problematic = df_filtered[df_filtered["away"].isna()]
        if len(problematic) > 0:
            print(f"⚠️ {len(problematic)} registros con formato incorrecto")
        else:
            print(f"✅ {len(df_filtered)} partidos extraídos")

        return df_filtered

    def mergue_raw_data_all_leagues(self, current_year):
        """Download the requested seasons for every league into ``self.df_database``.

        Args:
            current_year: Truthy = only the last configured season;
                falsy = every configured season except the last one.
        """
        # The last configured year is the current season.
        current_season = self.LST_YEARS_CONFIG[-1]

        if current_year:
            # Process only the current season
            for league_key, league_info in self.DIC_LEAGUES_CONFIG.items():
                self.dic_historic_all_leagues[league_key][current_season] = self.get_raw_data_from_source(league_info, current_season)
        else:
            # Process every season except the current one
            for league_key, league_info in self.DIC_LEAGUES_CONFIG.items():
                for year in self.LST_YEARS_CONFIG:
                    if year == current_season:
                        continue
                    self.dic_historic_all_leagues[league_key][year] = self.get_raw_data_from_source(league_info, year)

        all_dataframes = []
        for dic_historic in self.dic_historic_all_leagues.values():
            for df in dic_historic.values():
                # Skipped league/year combinations are stored as None.
                if df is not None:
                    all_dataframes.append(df)

        self.df_database = pd.concat(all_dataframes, ignore_index=True)

        print("Dataset conbinado")

    def process_and_output_dataset(self, current_year):
        """Keep matchweek rows, convert column types and write the cleaned CSV.

        Args:
            current_year: Truthy writes ``dataset_cleaned_current_year.csv``;
                falsy writes ``dataset_cleaned.csv``. Both go to
                ``dataset/cleaned``.
        """
        # Keep only rows whose round is a "Matchweek N" label; .copy() so the
        # assignments below do not write to a slice of the previous frame.
        is_matchweek = self.df_database['round'].str.contains("Matchweek", na=False)
        self.df_database = self.df_database[is_matchweek].copy()
        self.df_database['round'] = self.df_database['round'].str.replace("Matchweek ", "")

        # Convert types
        self.df_database['round'] = self.df_database['round'].astype(int)
        self.df_database['GF'] = self.df_database['GF'].astype(int)
        self.df_database['GA'] = self.df_database['GA'].astype(int)

        self.df_database = self.df_database.drop_duplicates()

        # Build the path with os.path.join: backslash literals are not
        # directory separators outside Windows, and the files are read back
        # elsewhere as "dataset/cleaned/...".
        output_dir = os.path.join("dataset", "cleaned")
        os.makedirs(output_dir, exist_ok=True)

        if current_year:
            output_name = "dataset_cleaned_current_year.csv"
        else:
            output_name = "dataset_cleaned.csv"

        self.df_database.to_csv(os.path.join(output_dir, output_name), index=False)
        print(f"Dataset cleaned and saved on {output_dir}")
208
+
209
+
210
+
211
# Module-level run: constructing the class executes the whole job. False
# selects every configured season except the last one.
# NOTE(review): this executes whenever the module is imported — confirm intended.
a = GENERATE_DATASET(False)
src/process_data/process_dataset.py ADDED
@@ -0,0 +1,584 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import os
3
+
4
+
5
def get_ck(df, season, round_num, local, away, league=None):
    """Return the combined "Pass Types_CK" total of two teams in one round.

    Args:
        df: DataFrame with the columns ``season``, ``round``, ``team``,
            ``Pass Types_CK`` and, when ``league`` is given, ``league``.
        season: Season value to match exactly.
        round_num: Round value to match exactly.
        local: Name of the first team.
        away: Name of the second team.
        league: Optional league code used as an additional filter.

    Returns:
        Sum of ``Pass Types_CK`` over the rows of ``local`` plus the sum over
        the rows of ``away`` (0 when nothing matches).
    """
    selector = (df['season'] == season) & (df['round'] == round_num)
    if league is not None:
        selector = selector & (df['league'] == league)

    fixture_rows = df[selector]
    corners = fixture_rows["Pass Types_CK"]
    teams = fixture_rows['team']

    local_ck = corners[teams == local].sum()
    away_ck = corners[teams == away].sum()
    return local_ck + away_ck
20
+
21
def get_dataframes(df, season, round_num, local, away, league=None):
    """Return eight DataFrames of earlier-round rows, split by team role and venue.

    Only rows of ``season`` with ``round`` strictly below ``round_num`` (and of
    ``league`` when given) are considered.

    Returns:
        Tuple ``(local_home, local_away, local_opp_home, local_opp_away,
        away_home, away_away, away_opp_home, away_opp_away)``. The plain
        entries hold rows where the club is ``team``, the ``*_opp_*`` entries
        rows where it is ``opponent``; ``home``/``away`` refer to the row's
        ``venue`` value ("Home" / "Away").
    """
    earlier_rounds = (df['season'] == season) & (df['round'] < round_num)
    if league is not None:
        earlier_rounds = earlier_rounds & (df['league'] == league)

    # (column to match, club) in the order of the returned tuple.
    selections = (
        ('team', local),
        ('opponent', local),
        ('team', away),
        ('opponent', away),
    )

    frames = []
    for column, club in selections:
        subset = df[earlier_rounds & (df[column] == club)].copy()
        frames.append(subset[subset['venue'] == "Home"])
        frames.append(subset[subset['venue'] == "Away"])

    return tuple(frames)
43
+
44
def get_head_2_head(df, local, away, seasons=None, league=None):
    """Return the most recent direct meetings between two teams.

    Args:
        df: DataFrame with 'season', 'team' and 'opponent' columns (and
            'league' when a league is given).
        local: Home team name.
        away: Away team name.
        seasons: Optional collection of seasons to restrict the search to;
            when empty or None every season is used.
        league: Optional league code used as an extra filter.

    Returns:
        (local_rows, away_rows): the trailing rows seen from each team's
        side. Three rows are kept when the local side has at least four
        meetings, otherwise two.
    """
    pool = df
    if seasons:
        pool = pool[pool['season'].isin(seasons)]
    if league is not None:
        pool = pool[pool['league'] == league]

    local_rows = pool[(pool['team'] == local) & (pool['opponent'] == away)]
    away_rows = pool[(pool['team'] == away) & (pool['opponent'] == local)]

    keep = 2 if len(local_rows) < 4 else 3
    return local_rows.tail(keep), away_rows.tail(keep)
61
+
62
def get_points_from_result(result):
    """Map a match result code to league points: 'W' -> 3, 'D' -> 1, else 0."""
    if result == 'W':
        return 3
    return 1 if result == 'D' else 0
70
+
71
+ # ✅ NUEVA FUNCIÓN: Calcular PPP (Puntos Por Partido)
72
def get_team_ppp(df, team, season, round_num, league=None):
    """Compute a team's points per match (PPP) before a given round.

    Args:
        df: Full DataFrame with 'team', 'season', 'round' and 'result'
            columns (and 'league' when a league is given).
        team: Team name.
        season: Season value to match against the 'season' column.
        round_num: Round number; only rounds strictly below it are counted.
        league: Optional league code used as an extra filter.

    Returns:
        Points per match (3 per 'W', 1 per 'D', 0 otherwise), or 0.0 when
        the team has no matching rows.
    """
    selector = (
        (df['team'] == team)
        & (df['season'] == season)
        & (df['round'] < round_num)
    )
    played = df[selector]
    if league is not None:
        played = played[played['league'] == league]

    n_played = len(played)
    if n_played == 0:
        return 0.0

    outcomes = played['result']
    points = 3 * (outcomes == 'W').sum() + (outcomes == 'D').sum()
    return points / n_played
102
+
103
+ # ✅ NUEVA FUNCIÓN: Calcular diferencia de PPP
104
def get_ppp_difference(df, local, away, season, round_num, league=None):
    """Return the points-per-match gap between the home and the away team.

    Args:
        df: Full DataFrame, passed through to get_team_ppp.
        local: Home team name.
        away: Away team name.
        season: Season value.
        round_num: Current round (only earlier rounds are counted).
        league: Optional league code.

    Returns:
        PPP of the home team minus PPP of the away team.
    """
    home_ppp, visitor_ppp = (
        get_team_ppp(df, club, season, round_num, league)
        for club in (local, away)
    )
    return home_ppp - visitor_ppp
123
+
124
_TEAM_FEATURE_COUNT = 24    # length of the tuple returned when is_team=True
_LEAGUE_FEATURE_COUNT = 9   # length of the tuple returned when is_team=False


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0 when the denominator is not > 0."""
    return (numerator / denominator) if denominator > 0 else 0


def get_average(df, is_team=False, lst_avg=None):
    """Aggregate per-match statistics into a fixed-length feature tuple.

    Args:
        df: DataFrame with one row per team and match.
        is_team: When True, return the 24 team features, with the basic
            averages expressed relative to the league baseline in
            ``lst_avg``. When False, return the 9 league baseline values.
        lst_avg: League baseline tuple as returned by this function with
            ``is_team=False``. Required when ``is_team`` is True and ``df``
            is not empty.

    Returns:
        is_team=True (24 values):
            (avg_ck, var_ck, xg, sca, cross, poss, att_3rd, gf, ga,
             sh_accuracy, xg_shot, attacking_presence, possession_shot,
             progressive_pass_ratio, final_third_involvement, assist_sca,
             creative_efficiency, high_press_intensity, interception_tackle,
             clearance_ratio, progressive_carry_ratio, carry_pass_balance,
             offensive_index, transition_index)
        is_team=False (9 values):
            (var_ck, xg, sca, cross, att_3rd, gf, ga, sh, avg_ck)
        For an empty ``df`` a tuple of zeros of the matching length is
        returned, so every caller always receives the same number of values.
    """
    n_rows = len(df)
    if n_rows == 0:
        # The defaults must have the same length as the regular return
        # values; otherwise feature rows become ragged downstream.
        return (0,) * (_TEAM_FEATURE_COUNT if is_team else _LEAGUE_FEATURE_COUNT)

    # Sample variance is undefined for a single row, so fall back to 0.
    var_ck = df['Pass Types_CK'].var() if n_rows > 1 else 0

    if not is_team:
        # League baseline: plain column means.
        avg_sh = df['Standard_Sh'].mean() if 'Standard_Sh' in df.columns else 0
        return (
            var_ck,                         # 0
            df['Expected_xG'].mean(),       # 1
            df['SCA Types_SCA'].mean(),     # 2
            df['Performance_Crs'].mean(),   # 3
            df['Touches_Att 3rd'].mean(),   # 4
            df['GF'].mean(),                # 5
            df['GA'].mean(),                # 6
            avg_sh,                         # 7
            df['Pass Types_CK'].mean(),     # 8
        )

    # Column totals reused by several metrics below.
    total_cross = df['Performance_Crs'].sum()
    total_att_3rd = df['Touches_Att 3rd'].sum()
    total_sca = df['SCA Types_SCA'].sum()
    total_xg = df['Expected_xG'].sum()
    total_poss = df['Poss'].sum()
    total_sh = df['Standard_Sh'].sum()
    total_touches = df['Touches_Touches'].sum()
    total_passes = df['Total_Att'].sum()
    total_prog_passes = df['PrgP'].sum()
    total_tackles = df['Tackles_Tkl'].sum()
    total_int = df['Int'].sum()
    total_carries = df['Carries_Carries'].sum()
    total_prog_carries = df['Carries_PrgC'].sum()

    # Basic statistics, expressed relative to the league baseline.
    avg_cross = (total_cross / n_rows) - lst_avg[3]
    avg_att_3rd = (total_att_3rd / n_rows) - lst_avg[4]
    avg_sca = (total_sca / n_rows) - lst_avg[2]
    avg_xg = (total_xg / n_rows) - lst_avg[1]
    avg_ck = (df['Pass Types_CK'].sum() / n_rows) - lst_avg[8]
    avg_poss = (total_poss / n_rows) - 50  # possession is centred on 50
    avg_gf = (df['GF'].sum() / n_rows) - lst_avg[5]
    avg_ga = (df['GA'].sum() / n_rows) - lst_avg[6]

    # Attacking metrics.
    sh_accuracy = _safe_ratio(df['Standard_SoT'].sum(), total_sh)
    xg_shot = _safe_ratio(total_xg, total_sh)
    attacking_presence = _safe_ratio(total_att_3rd, total_touches)
    possession_shot = _safe_ratio(total_sh, total_poss)

    # Chance-creation metrics.
    progressive_pass_ratio = _safe_ratio(total_prog_passes, total_passes)
    final_third_involvement = _safe_ratio(df['1/3'].sum(), total_passes)
    assist_sca = _safe_ratio(df['Ast'].sum(), total_sca)
    creative_efficiency = _safe_ratio(total_sca, total_poss)

    # Defensive metrics.
    high_press_intensity = _safe_ratio(df['Tackles_Att 3rd'].sum(), total_tackles)
    interception_tackle = _safe_ratio(total_int, total_tackles)
    clearance_ratio = _safe_ratio(df['Clr'].sum(), total_tackles + total_int)

    # Ball-carrying metrics.
    progressive_carry_ratio = _safe_ratio(total_prog_carries, total_carries)
    carry_pass_balance = _safe_ratio(total_prog_carries, total_prog_passes)

    # Composite indices built from raw (non-normalised) means.
    avg_sh = df['Standard_Sh'].mean()
    if avg_sh > 0:
        offensive_index = (
            (df['GF'].mean() + df['Expected_xG'].mean())
            * (df['Standard_SoT'].mean() / avg_sh)
        )
    else:
        offensive_index = 0
    transition_index = _safe_ratio(
        df['PrgP'].mean() + df['Carries_PrgC'].mean(), df['Poss'].mean()
    )

    return (
        avg_ck,                   # 0
        var_ck,                   # 1
        avg_xg,                   # 2
        avg_sca,                  # 3
        avg_cross,                # 4
        avg_poss,                 # 5
        avg_att_3rd,              # 6
        avg_gf,                   # 7
        avg_ga,                   # 8
        sh_accuracy,              # 9
        xg_shot,                  # 10
        attacking_presence,       # 11
        possession_shot,          # 12
        progressive_pass_ratio,   # 13
        final_third_involvement,  # 14
        assist_sca,               # 15
        creative_efficiency,      # 16
        high_press_intensity,     # 17
        interception_tackle,      # 18
        clearance_ratio,          # 19
        progressive_carry_ratio,  # 20
        carry_pass_balance,       # 21
        offensive_index,          # 22
        transition_index,         # 23
    )
327
+
328
+
329
+
330
class PROCESS_DATA():
    """Build the model-ready training table from the cleaned match dataset.

    Instantiating the class runs the whole pipeline: initialise the feature
    name lists, load the cleaned CSV files, compute one feature row per
    match, drop rows with missing values and write the result to
    dataset/processed/dataset_processed.csv.

    Args:
        use_one_hot_encoding: When truthy, one 0/1 column per league code is
            appended to every feature row.
    """

    # League codes that get a one-hot column, in output column order.
    _LEAGUE_CODES = ('ESP', 'GER', 'FRA', 'ITA', 'NED', 'ENG', 'POR', 'BEL')
    # Number of most recent matches used for the "form" features.
    _FORM_WINDOW = 6
    # Feature groups that only keep the first nine aggregated values.
    _OPPONENT_KEYS = ('lst_team1_opp_away', 'lst_team2_opp_home')
    # Built with os.path.join so the path is valid on every platform.
    _OUTPUT_PATH = os.path.join("dataset", "processed", "dataset_processed.csv")

    def __init__(self, use_one_hot_encoding):
        self.USE_ONE_HOT_ENCODING = use_one_hot_encoding

        self.init_variables()
        self.load_clean_dataset()
        self.process_all_matches()
        self.clean_and_ouput_dataset()

    def init_variables(self):
        """Create the empty containers and the feature-name templates."""
        self.y = []
        self.lst_data = []
        # Defined up front so the output step also works when no match
        # produced a feature row.
        self.lst_features_values = []

        self.lst_years = ["1819", "1920", "2021", "2122", "2223", "2324", "2425", "2526"]

        # Suffixes for the 24 values get_average returns with is_team=True.
        self.lst_base_advanced = [
            "avg_ck", "var_ck",
            "xg", "sca", "cross", "poss", "att_3rd", "gf", "ga",
            "sh_accuracy", "xg_shot", "attacking_presence", "possession_shot",
            "progressive_pass_ratio", "final_third_involvement", "assist_sca", "creative_efficiency",
            "high_press_intensity", "interception_tackle", "clearance_ratio",
            "progressive_carry_ratio", "carry_pass_balance", "offensive_index", "transition_index"
        ]

        # Suffixes used for the nine-value opponent feature groups.
        # NOTE(review): the opponent values are the first nine entries of the
        # team tuple (avg_ck, var_ck, xg, sca, cross, poss, att_3rd, gf, ga),
        # so these labels do not line up with the values they name. They are
        # left untouched because consumers of the CSV may depend on the
        # column names; confirm before renaming.
        self.lst_base_original = [
            "var_ck", "xg", "sca", "cross", "poss", "att_3rd", "gf", "ga", "avg_ck"
        ]

        print("Variables inicializadas")

    def load_clean_dataset(self):
        """Load the cleaned dataset(s) and list the unique matches to process."""
        # Cleaned dataset generated by generate_dataset.py.
        self.df_dataset_historic = pd.read_csv("dataset/cleaned/dataset_cleaned.csv")

        current_year_path = "dataset/cleaned/dataset_cleaned_current_year.csv"
        if os.path.exists(current_year_path):
            self.df_dataset_current_year = pd.read_csv(current_year_path)
            # ignore_index avoids duplicated row labels after concatenation.
            self.df_dataset = pd.concat(
                [self.df_dataset_historic, self.df_dataset_current_year],
                ignore_index=True,
            )
        else:
            self.df_dataset = self.df_dataset_historic

        self.df_dataset["season"] = self.df_dataset["season"].astype(str)
        # fillna returns a new Series; assign it back so it takes effect.
        self.df_dataset["Performance_Save%"] = self.df_dataset["Performance_Save%"].fillna(0)

        # Keep one row per match with only the identifying columns.
        self.df_dataset_export = self.df_dataset.copy()
        self.df_dataset_export = self.df_dataset_export.drop_duplicates(subset=["game", "league"])
        self.df_dataset_export = self.df_dataset_export[["local", "away", "round", "season", "date", "league"]]

        # Every unique match becomes one list entry; season 1718 is excluded.
        self.lst_matches = [
            row for row in self.df_dataset_export.values.tolist() if row[3] != "1718"
        ]

        print("dataset loaded")

    def _summarize(self, df, lst_avg, is_form=True, use_advanced=True):
        """Aggregate one team frame against the league baseline.

        Args:
            df: Rows of one team (or of its opponents).
            lst_avg: League baseline tuple from get_average(is_team=False).
            is_form: When True only the last _FORM_WINDOW rows are used.
            use_advanced: When True return all 24 values, otherwise only
                the first nine.
        """
        if is_form:
            df = df[-self._FORM_WINDOW:]

        values = get_average(df, True, lst_avg)
        return values if use_advanced else values[:9]

    def _feature_names(self, dic_df):
        """Return the flat list of column names matching ``dic_df``'s values."""
        names = []
        for key in dic_df:
            if key in ('ppp_local', 'ppp_away', 'ppp_difference') or key.startswith('league_'):
                names.append(key)
            elif key in self._OPPONENT_KEYS:
                names.extend(f"{key}_{col}" for col in self.lst_base_original)
            else:
                names.extend(f"{key}_{col}" for col in self.lst_base_advanced)
        return names

    def process_all_matches(self):
        """Compute one feature row and one target (total corners) per match."""
        for match in self.lst_matches:
            local, away, round_num, season, _date, league_code = match

            # The first rounds do not have enough history to build features.
            if round_num < 5:
                continue

            dic_df = {}

            # League baseline from the earlier rounds of the same season.
            lst_avg = get_average(
                self.df_dataset[
                    (self.df_dataset['season'] == season) &
                    (self.df_dataset['round'] < round_num) &
                    (self.df_dataset['league'] == league_code)
                ],
                is_team=False
            )

            (team1_home, team1_away, team1_opp_home, team1_opp_away,
             team2_home, team2_away, team2_opp_home, team2_opp_away) = get_dataframes(
                self.df_dataset, season, round_num, local, away, league=league_code
            )

            # Target: corners actually taken in this match.
            ck = get_ck(self.df_dataset, season, round_num, local, away, league=league_code)
            self.y.append(ck)

            # Head to head. Only rows that precede this match are offered:
            # earlier seasons plus earlier rounds of the current season.
            # Without this restriction the match being predicted (and later
            # meetings) would leak its own corner count into the features.
            season_idx = self.lst_years.index(season)
            seasons_so_far = self.lst_years[:season_idx + 1]
            earlier_seasons = self.lst_years[:season_idx]
            history = self.df_dataset[
                self.df_dataset['season'].isin(earlier_seasons)
                | ((self.df_dataset['season'] == season)
                   & (self.df_dataset['round'] < round_num))
            ]
            team1_h2h, team2_h2h = get_head_2_head(
                history, local, away, seasons=seasons_so_far, league=league_code
            )

            # Points per match of both teams and their difference.
            local_ppp = get_team_ppp(self.df_dataset, local, season, round_num, league=league_code)
            away_ppp = get_team_ppp(self.df_dataset, away, season, round_num, league=league_code)

            dic_df['ppp_local'] = (local_ppp,)
            dic_df['ppp_away'] = (away_ppp,)
            dic_df['ppp_difference'] = (local_ppp - away_ppp,)

            # Team features with the full 24 values each.
            dic_df['lst_team1_home_form'] = self._summarize(team1_home, lst_avg, True)
            dic_df['lst_team1_home_general'] = self._summarize(team1_home, lst_avg, False)
            dic_df['lst_team1_away_form'] = self._summarize(team1_away, lst_avg, True)
            dic_df['lst_team1_away_general'] = self._summarize(team1_away, lst_avg, False)

            dic_df['lst_team2_home_form'] = self._summarize(team2_home, lst_avg, True)
            dic_df['lst_team2_home_general'] = self._summarize(team2_home, lst_avg, False)
            dic_df['lst_team2_away_form'] = self._summarize(team2_away, lst_avg, True)
            dic_df['lst_team2_away_general'] = self._summarize(team2_away, lst_avg, False)

            dic_df['lst_team1_h2h'] = self._summarize(team1_h2h, lst_avg, False)
            dic_df['lst_team2_h2h'] = self._summarize(team2_h2h, lst_avg, False)

            # Opponent features keep only the first nine values.
            dic_df['lst_team1_opp_away'] = self._summarize(
                team1_opp_away, lst_avg, False, use_advanced=False)
            dic_df['lst_team2_opp_home'] = self._summarize(
                team2_opp_home, lst_avg, False, use_advanced=False)

            if self.USE_ONE_HOT_ENCODING:
                for code in self._LEAGUE_CODES:
                    dic_df[f'league_{code}'] = (1 if league_code == code else 0,)

            lst_features_values = []
            for values in dic_df.values():
                lst_features_values.extend(list(values))

            # Every match yields the same keys, so the last computed name
            # list describes all rows.
            self.lst_features_values = self._feature_names(dic_df)
            self.lst_data.append(lst_features_values)

        print("Dataset processed")

    def clean_and_ouput_dataset(self):
        """Drop rows with missing values and write the processed CSV."""
        # Local import: numpy is only needed by this step.
        import numpy as np

        self.df_data = pd.DataFrame(data=self.lst_data, columns=self.lst_features_values)

        print(f"\n✅ PROCESAMIENTO COMPLETADO:")
        print(f" Shape inicial: {self.df_data.shape}")
        print(f" Total partidos: {len(self.df_data)}")
        print(f" Features totales: {self.df_data.shape[1]}")

        print(f"\n🧹 LIMPIANDO DATOS NULOS...")

        nulos_antes_X = self.df_data.isnull().sum().sum()
        nulos_antes_y = np.isnan(self.y).sum() if isinstance(self.y, np.ndarray) else sum(pd.isna(self.y))

        print(f" Nulos en X (antes): {nulos_antes_X}")
        print(f" Nulos en Y (antes): {nulos_antes_y}")

        y_array = np.array(self.y).flatten()

        # Keep only the rows where both the features and the target are set.
        mask_valid_X = ~self.df_data.isnull().any(axis=1)
        mask_valid_y = ~np.isnan(y_array)
        mask_combined = mask_valid_X & mask_valid_y

        self.df_data = self.df_data[mask_combined].reset_index(drop=True)
        y_array = y_array[mask_combined]

        print(f"\n✅ LIMPIEZA COMPLETADA:")
        print(f" Nulos en X (después): {self.df_data.isnull().sum().sum()}")
        print(f" Nulos en Y (después): {np.isnan(y_array).sum()}")
        print(f" Filas eliminadas: {len(mask_combined) - mask_combined.sum()}")
        print(f" Shape final: {self.df_data.shape}")

        print(f"\n🔍 VERIFICACIÓN DE NUEVAS FEATURES:")
        print(f" ✅ Features con 'var_ck': {len([c for c in self.df_data.columns if 'var_ck' in c])}")
        print(f" ✅ Features con métricas avanzadas: {len([c for c in self.df_data.columns if any(m in c for m in ['sh_accuracy', 'offensive_index'])])}")
        print(f" ✅ Features de oponentes (8 valores): {len([c for c in self.df_data.columns if 'opp' in c])}")

        print("\n" + "=" * 80)
        print("✅ PROCESO COMPLETADO - DATOS LISTOS PARA ENTRENAMIENTO")
        print("=" * 80)

        self.y = y_array.tolist()
        self.df_data["y"] = self.y

        # Make sure the target directory exists before writing.
        os.makedirs(os.path.dirname(self._OUTPUT_PATH), exist_ok=True)
        self.df_data.to_csv(self._OUTPUT_PATH, index=False)
        print("Dataset")
582
+
583
+ #a = PROCESS_DATA(True)
584
+
src/utils/__init__.py ADDED
File without changes
src/utils/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (164 Bytes). View file
 
src/utils/__pycache__/helper.cpython-311.pyc ADDED
Binary file (1 kB). View file
 
src/utils/helper.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import warnings
3
+ import os
4
+
5
+
6
def desactivar_advertencias():
    """Silence Python warnings for the current process and print a notice.

    Installs an 'ignore' filter for all warnings plus explicit ones for the
    common library categories, sets the PYTHONWARNINGS environment variable
    to 'ignore', and turns off pandas' chained-assignment warning.
    """
    warnings.filterwarnings('ignore')

    # Explicit filters for the categories raised by common libraries.
    for category in (DeprecationWarning, FutureWarning, UserWarning):
        warnings.filterwarnings('ignore', category=category)

    os.environ['PYTHONWARNINGS'] = 'ignore'

    # Disable SettingWithCopyWarning.
    pd.options.mode.chained_assignment = None

    print("Advertencias desactivadas...")
streamlit_app.py ADDED
@@ -0,0 +1,812 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ from datetime import datetime
4
+ import requests
5
+ import plotly.graph_objects as go
6
+ import plotly.express as px
7
+ import numpy as np
8
+ from scipy import stats as scipy_stats
9
+ from dotenv import load_dotenv
10
+ import os
11
+
12
+ load_dotenv()
13
+ API_KEY = os.getenv("API_KEY") # ⚠️ CÁMBIALA POR UNA SEGURA
14
+ # --- CONFIGURACIÓN INICIAL ---
15
+ st.set_page_config(layout="wide", page_title="Corners Forecast", page_icon="⚽")
16
+
17
+ # 👈 AÑADIR MARGEN AL LAYOUT WIDE
18
+ st.markdown("""
19
+ <style>
20
+ .block-container {
21
+ padding-left: 5rem;
22
+ padding-right: 5rem;
23
+ max-width: 1400px;
24
+ margin: 0 auto;
25
+ }
26
+ </style>
27
+ """, unsafe_allow_html=True)
28
+
29
+ # --- CONSTANTES DEL MODELO ---
30
+ MSE_MODELO = 1.9
31
+ RMSE_MODELO = 2.42
32
+ R2_MODELO = 0.39
33
+ N_SIMULACIONES = 5000 # 👈 REDUCIDO A 5000
34
+
35
+ # --- FUNCIONES AUXILIARES ---
36
def probabilidad_a_momio(probabilidad):
    """Convert a probability given in percent into decimal odds.

    Returns 0 for probabilities that are not positive; otherwise the odds
    rounded to two decimals.
    """
    return round(100 / probabilidad, 2) if probabilidad > 0 else 0
41
+
42
def clasificar_valor_apuesta(momio_real, momio_modelo):
    """Label a bet by comparing the offered odds with the model's odds.

    More than 10% above the model odds is excellent value, anything above
    the model odds is good value, everything else is no value.
    """
    umbral_excelente = momio_modelo * 1.1
    if momio_real > umbral_excelente:
        return "🟢 EXCELENTE VALOR"
    if momio_real > momio_modelo:
        return "🟡 BUEN VALOR"
    return "🔴 SIN VALOR"
50
+
51
@st.cache_data(ttl=3600)  # cache results for one hour
def simular_lambda_montecarlo(lambda_pred, sigma=RMSE_MODELO, n_sims=N_SIMULACIONES):
    """Draw Monte Carlo samples of the Poisson rate around the prediction.

    Samples come from a normal distribution centred on ``lambda_pred`` with
    standard deviation ``sigma`` and are floored at 0.1 so every simulated
    rate stays positive. Results are cached by Streamlit.
    """
    draws = np.random.normal(lambda_pred, sigma, n_sims)
    return np.maximum(draws, 0.1)
57
+
58
@st.cache_data(ttl=3600)  # cache results for one hour
def calcular_probabilidades_con_incertidumbre(lambda_pred, linea, tipo='over', sigma=RMSE_MODELO, n_sims=N_SIMULACIONES):
    """Estimate an over/under probability with its uncertainty band.

    The Poisson rate is sampled with simular_lambda_montecarlo and the
    over/under probability is evaluated for every sampled rate.

    Args:
        lambda_pred: Predicted number of corners (Poisson rate).
        linea: Betting line; may be a whole number or a half line (e.g. 9.5).
        tipo: 'over' for P(X > linea); any other value for P(X < linea).
        sigma: Standard deviation used to sample the rate.
        n_sims: Number of simulated rates.

    Returns:
        dict with the mean probability ('prob_media'), the 5th and 95th
        percentiles ('prob_low', 'prob_high'), the standard deviation
        ('prob_std') and the full array of simulated probabilities
        ('distribucion'); all probabilities are in percent.
    """
    lambdas_sim = simular_lambda_montecarlo(lambda_pred, sigma, n_sims)

    # poisson.cdf broadcasts over the array of rates, so no Python loop is
    # needed.
    if tipo == 'over':
        # P(X > linea) = 1 - P(X <= floor(linea))
        probs = (1 - scipy_stats.poisson.cdf(np.floor(linea), lambdas_sim)) * 100
    else:
        # P(X < linea) = P(X <= ceil(linea) - 1). Using ceil keeps half
        # lines correct: under 9.5 is P(X <= 9), while under 9 stays
        # P(X <= 8).
        probs = scipy_stats.poisson.cdf(np.ceil(linea) - 1, lambdas_sim) * 100

    probs = np.asarray(probs)

    return {
        'prob_media': np.mean(probs),
        'prob_low': np.percentile(probs, 5),
        'prob_high': np.percentile(probs, 95),
        'prob_std': np.std(probs),
        'distribucion': probs
    }
82
+
83
def calcular_expected_value(prob_media, momio_casa):
    """Return the expected value of a bet, in percent of the stake.

    Args:
        prob_media: Estimated win probability in percent.
        momio_casa: Decimal odds offered by the bookmaker.
    """
    win_prob = prob_media / 100
    return ((win_prob * momio_casa) - 1) * 100
88
+
89
def calcular_kelly_criterion(prob_media, momio_casa):
    """Return the Kelly stake fraction, clamped to the range [0, 0.25].

    Args:
        prob_media: Estimated win probability in percent.
        momio_casa: Decimal odds offered by the bookmaker; odds of 1 or
            less yield 0.
    """
    if momio_casa <= 1:
        return 0

    win_prob = prob_media / 100
    fraction = (win_prob * momio_casa - 1) / (momio_casa - 1)

    # A negative fraction means no edge; large edges are capped at 25%.
    return 0 if fraction < 0 else min(fraction, 0.25)
102
+
103
def recomendar_apuesta_avanzada(prob_media, prob_low, prob_high, momio_casa):
    """Rate a bet from the model's probability band and the offered odds.

    Args:
        prob_media: Mean model probability in percent.
        prob_low: Lower bound of the model probability in percent.
        prob_high: Upper bound of the model probability in percent.
        momio_casa: Decimal odds offered by the bookmaker.

    Returns:
        dict with the recommendation flag, its level and emoji, the expected
        value, the full and quarter Kelly stakes (percent), the implied
        bookmaker probability, the probability band, the safety margin
        (percent) and the two boolean criteria used for the rating.
    """
    prob_casa = (1 / momio_casa) * 100
    ev = calcular_expected_value(prob_media, momio_casa)
    kelly = calcular_kelly_criterion(prob_media, momio_casa)
    kelly_conservador = kelly * 0.25

    margen_seguridad = (prob_media - prob_casa) / prob_casa
    # High confidence: even the lower bound beats the implied probability.
    confianza_alta = prob_low > prob_casa
    ev_positivo = ev > 0

    if confianza_alta and ev > 5 and margen_seguridad > 0.1:
        nivel, emoji, recomendar = "EXCELENTE", "🟢", True
    elif confianza_alta and ev > 0:
        nivel, emoji, recomendar = "BUENA", "🟡", True
    elif ev > 0:
        nivel, emoji, recomendar = "MODERADA", "🟠", False
    else:
        nivel, emoji, recomendar = "MALA", "🔴", False

    return {
        'recomendar': recomendar,
        'nivel': nivel,
        'emoji': emoji,
        'ev': ev,
        'kelly': kelly * 100,
        'kelly_conservador': kelly_conservador * 100,
        'prob_casa': prob_casa,
        'prob_media': prob_media,
        'prob_low': prob_low,
        'prob_high': prob_high,
        'margen_seguridad': margen_seguridad * 100,
        'ev_positivo': ev_positivo,
        'confianza_alta': confianza_alta
    }
146
+
147
+ # --- DICCIONARIO DE LIGAS ---
148
+ LEAGUES_DICT = {
149
+ "Ligue 1": "FRA",
150
+ "La Liga": "ESP",
151
+ "Premier League": "ENG",
152
+ "Eredivisie": "NED",
153
+ "Liga NOS": "POR",
154
+ "Pro League": "BEL",
155
+ "Bundesliga": "GER",
156
+ "Serie A": "ITA"
157
+ }
158
+
159
+ # --- HEADER ---
160
+ st.markdown("<h1 style='text-align: center;'>Corners Forecast</h1>", unsafe_allow_html=True)
161
+
162
+
163
+ # --- CARGAR DATOS ---
164
@st.cache_data  # cached without a TTL
def cargar_datos():
    """Download the cleaned dataset and return its unique (local, league) pairs."""
    dataset = pd.read_csv(r"https://raw.githubusercontent.com/danielsaed/futbol_corners_forecast/refs/heads/main/dataset/cleaned/dataset_cleaned.csv")
    team_league = dataset[['local', 'league']]
    return team_league.drop_duplicates()
168
+
169
+ df = cargar_datos()
170
+
171
+ # --- INICIALIZAR SESSION STATE ---
172
+ if 'prediccion_realizada' not in st.session_state:
173
+ st.session_state.prediccion_realizada = False
174
+ if 'resultado_api' not in st.session_state:
175
+ st.session_state.resultado_api = None
176
+
177
+ st.markdown("")
178
+
179
+ # --- SELECCIÓN DE PARÁMETROS ---
180
+ col1, col2, col3 = st.columns([1, 1, 1])
181
+
182
+
183
+
184
+ with col2:
185
+ option = st.selectbox(
186
+ "🏆 Liga",
187
+ ["La Liga", "Premier League", "Ligue 1", "Serie A", "Eredivisie", "Liga NOS", "Pro League", "Bundesliga"],
188
+ index=None,
189
+ placeholder="Selecciona liga",
190
+ )
191
+
192
+ st.write("")
193
+
194
+ col_jornada1, col_jornada2, col_jornada3, col_jornada4 = st.columns([2, 1, 1, 2])
195
+ with col_jornada2:
196
+ if option:
197
+ jornada = st.number_input("📅 Jornada", min_value=5, max_value=42, value=15, step=1)
198
+ with col_jornada3:
199
+ if option:
200
+ temporada = st.selectbox(
201
+ "Temporada",
202
+ [2526, 2425, 2324, 2223, 2122],
203
+ index=0
204
+ )
205
+
206
+ st.write("")
207
+
208
+ cl2, cl3, cl4 = st.columns([ 4, 1, 4])
209
+
210
+ with cl2:
211
+ if option:
212
+ if jornada:
213
+ option_local = st.selectbox(
214
+ "🏠 Equipo Local",
215
+ list(df["local"][df["league"] == LEAGUES_DICT[option]]),
216
+ index=None,
217
+ placeholder="Equipo local",
218
+ )
219
+
220
+ with cl3:
221
+ if option:
222
+ st.write("")
223
+ st.write("")
224
+ st.markdown("<h3 style='text-align: center'>VS</h3>", unsafe_allow_html=True)
225
+
226
+ with cl4:
227
+ if option:
228
+ if jornada:
229
+ option_away = st.selectbox(
230
+ "✈️ Equipo Visitante",
231
+ list(df["local"][df["league"] == LEAGUES_DICT[option]]),
232
+ index=None,
233
+ placeholder="Equipo visitante",
234
+ )
235
+
236
+ # --- BOTÓN PARA GENERAR PREDICCIÓN ---
237
+ if option and option_local and option_away:
238
+
239
+ st.markdown("---")
240
+
241
+ col_btn1, col_btn2, col_btn3 = st.columns([1, 1, 1])
242
+
243
+ with col_btn2:
244
+ # 👈 BOTÓN PARA EJECUTAR PREDICCIÓN
245
+ if st.button("Generar Predicción", type="secondary", use_container_width=True):
246
+ st.session_state.prediccion_realizada = True
247
+ st.session_state.resultado_api = None # Reset resultado
248
+
249
+ st.write("")
250
+ st.write("")
251
+
252
# --- RUN THE PREDICTION (ONLY AFTER THE BUTTON WAS PRESSED) ---
# The forecast is requested from the API once and its JSON payload is stored
# in st.session_state.resultado_api; every results panel below is rendered
# from that stored payload, so interacting with widgets (which re-runs the
# script) does not issue a new HTTP request.


def _filas_incertidumbre(lambda_pred, lineas_str, tipo):
    """Build one row per betting line for the given market.

    Args:
        lambda_pred: Predicted corner count taken from the API response.
        lineas_str: Iterable of line labels as strings (e.g. ``"8.5"``).
        tipo: Market passed to the Monte Carlo helper, ``'over'`` or ``'under'``.

    Returns:
        A list of dicts, ordered from the highest line to the lowest, holding
        the formatted display strings plus the raw numeric values.
    """
    etiqueta = tipo.capitalize()
    filas = []
    for linea_str in sorted(lineas_str, key=float, reverse=True):
        linea = float(linea_str)

        resultado_inc = calcular_probabilidades_con_incertidumbre(
            lambda_pred, linea, tipo=tipo
        )
        prob_media = resultado_inc['prob_media']
        prob_low = resultado_inc['prob_low']
        prob_high = resultado_inc['prob_high']

        momio_medio = probabilidad_a_momio(prob_media)
        # The bounds are swapped on purpose: the upper probability bound is
        # converted into the lower end of the odds range and vice versa.
        momio_low = probabilidad_a_momio(prob_high)
        momio_high = probabilidad_a_momio(prob_low)

        filas.append({
            'Línea': f"{etiqueta} {linea_str}",
            'Prob. Media': f"{prob_media:.1f}%",
            'IC 90%': f"[{prob_low:.1f}%, {prob_high:.1f}%]",
            'Momio Justo': f"@{momio_medio:.2f}",
            'Rango Momio': f"[@{momio_low:.2f} - @{momio_high:.2f}]",
            'linea_num': linea,
            'prob_media_raw': prob_media,
            'prob_low_raw': prob_low,
            'prob_high_raw': prob_high,
            'tipo': etiqueta
        })
    return filas


def _grafico_banda(filas, etiqueta, color_linea, color_banda):
    """Return a Plotly figure with the mean probability and its interval band.

    Args:
        filas: Rows produced by ``_filas_incertidumbre``.
        etiqueta: Market label used on the x axis and in the title.
        color_linea: Colour of the mean-probability line.
        color_banda: Fill colour of the interval band.
    """
    # Sort once by numeric line so every series shares the same x order.
    ordenadas = sorted(filas, key=lambda fila: fila['linea_num'])
    etiquetas_x = [f"{etiqueta} {fila['linea_num']}" for fila in ordenadas]
    probs_medias = [fila['prob_media_raw'] for fila in ordenadas]
    probs_low = [fila['prob_low_raw'] for fila in ordenadas]
    probs_high = [fila['prob_high_raw'] for fila in ordenadas]

    fig = go.Figure()

    # Closed polygon: upper bound left-to-right, then lower bound back.
    fig.add_trace(go.Scatter(
        x=etiquetas_x + etiquetas_x[::-1],
        y=probs_high + probs_low[::-1],
        fill='toself',
        fillcolor=color_banda,
        line=dict(color='rgba(255,255,255,0)'),
        showlegend=True,
        name='IC 90%',
        hoverinfo='skip'
    ))

    fig.add_trace(go.Scatter(
        x=etiquetas_x,
        y=probs_medias,
        mode='lines+markers',
        name='Probabilidad Media',
        line=dict(color=color_linea, width=3),
        marker=dict(size=10)
    ))

    fig.update_layout(
        title=f"Probabilidades {etiqueta} con Banda de Incertidumbre (Monte Carlo)",
        xaxis_title="Línea",
        yaxis_title="Probabilidad (%)",
        height=500,
        hovermode='x unified'
    )
    return fig


def _render_mercado(lambda_pred, probs, tipo, emoji, color_linea, color_banda):
    """Render heading, table and chart for one market tab; return its rows.

    Both markets show the same columns (including the 90% interval and the
    odds range), so the table matches what the heading announces.
    """
    etiqueta = tipo.capitalize()
    columnas = ['Línea', 'Prob. Media', 'IC 90%', 'Momio Justo', 'Rango Momio']

    st.markdown(f"### {emoji} Probabilidades {etiqueta} (con Intervalos de Confianza 90%)")

    with st.spinner(f'Calculando incertidumbres {etiqueta}...'):
        filas = _filas_incertidumbre(lambda_pred, probs, tipo)

    # Passing columns= (instead of selecting afterwards) keeps an empty
    # result from raising on the column selection.
    st.dataframe(
        pd.DataFrame(filas, columns=columnas),
        hide_index=True,
        use_container_width=True,
        column_config={
            'Línea': st.column_config.TextColumn('🎯 Línea', width='small'),
            'Prob. Media': st.column_config.TextColumn('📊 Probabilidad', width='small'),
            'IC 90%': st.column_config.TextColumn('📉 Intervalo 90%', width='medium'),
            'Momio Justo': st.column_config.TextColumn('💰 Momio', width='small'),
            'Rango Momio': st.column_config.TextColumn('📈 Rango Momios', width='medium')
        }
    )

    st.write("")

    st.plotly_chart(
        _grafico_banda(filas, etiqueta, color_linea, color_banda),
        use_container_width=True
    )
    return filas


if option and option_local and option_away and st.session_state.prediccion_realizada:

    # No stored result yet: request the prediction from the API.
    if st.session_state.resultado_api is None:

        with st.spinner('🔮 Generando predicción con análisis de incertidumbre...'):

            url = "https://daniel-saed-futbol-corners-forecast-api.hf.space/items/"
            # Local development endpoint: "http://localhost:7860/items/"
            headers = {"X-API-Key": API_KEY}
            params = {
                "local": option_local,
                "visitante": option_away,
                "jornada": jornada,
                "league_code": LEAGUES_DICT[option],
                "temporada": str(temporada)
            }

            try:
                response = requests.get(url, headers=headers, params=params, timeout=30)

                if response.status_code == 200:
                    # Keep the payload in the session for later re-runs.
                    st.session_state.resultado_api = response.json()
                    st.success("✅ Predicción generada")
                elif response.status_code == 401:
                    st.error("❌ Error de Autenticación - API Key inválida")
                    st.stop()
                elif response.status_code == 400:
                    # The error body may not be JSON (or not a JSON object);
                    # fall back to the generic message instead of a traceback.
                    try:
                        detalle = response.json().get('detail', 'Parámetros inválidos')
                    except (ValueError, AttributeError):
                        detalle = 'Parámetros inválidos'
                    st.error(f"❌ Error: {detalle}")
                    st.stop()
                else:
                    st.error(f"❌ Error {response.status_code}")
                    st.stop()

            except requests.exceptions.Timeout:
                st.error("⏱️ Timeout - Intenta de nuevo")
                st.stop()
            except requests.exceptions.ConnectionError:
                st.error("🌐 Error de conexión")
                st.stop()
            except Exception as e:
                # Last-resort handler: show the error and its traceback.
                st.error(f"❌ Error: {str(e)}")
                import traceback
                st.code(traceback.format_exc())
                st.stop()

    # --- SHOW RESULTS (FROM SESSION STATE) ---
    if st.session_state.resultado_api:
        resultado = st.session_state.resultado_api
        lambda_pred = resultado['prediccion']

        st.write("")
        st.write("")

        # ============================================
        # 1. MAIN PREDICTION
        # ============================================

        # Interval of +/- 1.96 * RMSE around the prediction, floored at zero.
        lambda_low = max(0, lambda_pred - 1.96 * RMSE_MODELO)
        lambda_high = lambda_pred + 1.96 * RMSE_MODELO

        st.markdown("## 🎯 Predicción de Corners")

        st.write("")

        # Headline metrics using native Streamlit widgets.
        col_pred1, col_pred2, col_pred3 = st.columns(3)

        with col_pred1:
            st.metric(
                label="Corners Esperados",
                value=f"{lambda_pred:.1f}",
                help="Valor esperado (λ) del modelo"
            )

        with col_pred2:
            st.metric(
                label="Límite Inferior",
                value=f"{lambda_low:.1f}",
                delta=f"{lambda_low - lambda_pred:.1f}",
                help="Intervalo de confianza 95% (inferior)"
            )

        with col_pred3:
            st.metric(
                label="Límite Superior",
                value=f"{lambda_high:.1f}",
                delta=f"{lambda_high - lambda_pred:.1f}",
                help="Intervalo de confianza 95% (superior)"
            )

        st.write("")

        st.write("")
        st.write("")
        st.markdown("---")
        st.write("")
        st.write("")

        # ============================================
        # 2. TEAM ANALYSIS (TABLES)
        # ============================================

        stats_data = resultado['stats']
        local_ck = stats_data['local_ck']
        away_ck = stats_data['away_ck']
        local_ck_received = stats_data['local_ck_received']
        away_ck_received = stats_data['away_ck_received']
        h2h_total = stats_data['h2h_total']
        # Not displayed in this section; the lookup is kept so a missing key
        # still fails here as it did before.
        partido_esperado = stats_data['partido_esperado']

        riesgo = resultado['riesgo']

        # Table of corners generated and conceded per team.
        st.markdown("### Análisis de Corners")

        df_corners = pd.DataFrame({
            'Métrica': ['Corners Generados ⚽', 'Corners Concedidos 🛡️', 'Head to Head'],
            f'🏠 {option_local}': [f'{local_ck:.2f}', f'{local_ck_received:.2f}', '---'],
            f'✈️ {option_away}': [f'{away_ck:.2f}', f'{away_ck_received:.2f}', '---'],
            '🎯 Total': [
                f'{(local_ck + away_ck):.2f}',
                f'{(local_ck_received + away_ck_received):.2f}',
                f"{h2h_total:.2f}"
            ]
        })

        st.dataframe(
            df_corners,
            hide_index=True,
            use_container_width=True,
            column_config={
                'Métrica': st.column_config.TextColumn('📊 Métrica', width='medium'),
                f'🏠 {option_local}': st.column_config.TextColumn(f'🏠 {option_local}', width='medium'),
                f'✈️ {option_away}': st.column_config.TextColumn(f'✈️ {option_away}', width='medium'),
                '🎯 Total': st.column_config.TextColumn('🎯 Total', width='medium')
            }
        )

        st.write("")
        st.write("")

        # --- RELIABILITY ---
        st.markdown("### Fiabilidad")

        col_fiab1, col_fiab2, col_fiab3 = st.columns(3)

        with col_fiab1:
            st.markdown(f"**🏠 {option_local}**")
            st.write(f"**Score:** {riesgo['score_local']:.0f}/100")
            st.write(f"**Nivel:** {riesgo['nivel_local']}")
            st.write(f"**CV:** {riesgo['cv_local']:.1f}%")
            # st.progress rejects floats outside [0.0, 1.0]; clamp the score.
            st.progress(min(max(riesgo['score_local'] / 100, 0.0), 1.0))

        with col_fiab2:
            st.markdown("**📊 Fiabilidad Global**")
            score_promedio = riesgo['score_promedio']
            st.write(f"**Score:** {score_promedio:.0f}/100")
            st.write("")

            if score_promedio >= 65:
                st.success("🟢 Fiabilidad MUY ALTA")
            elif score_promedio >= 50:
                st.info("🟡 Fiabilidad ALTA")
            elif score_promedio >= 35:
                st.warning("🟠 Fiabilidad MEDIA")
            else:
                st.error("🔴 Fiabilidad BAJA")

        with col_fiab3:
            st.markdown(f"**✈️ {option_away}**")
            st.write(f"**Score:** {riesgo['score_away']:.0f}/100")
            st.write(f"**Nivel:** {riesgo['nivel_away']}")
            st.write(f"**CV:** {riesgo['cv_away']:.1f}%")
            # Same clamping as for the home team.
            st.progress(min(max(riesgo['score_away'] / 100, 0.0), 1.0))

        st.write("")
        st.write("")
        st.markdown("---")
        st.write("")
        st.write("")

        # ============================================
        # 3. PROBABILITIES WITH MONTE CARLO
        # ============================================

        st.info(f"🔬 **Análisis con {N_SIMULACIONES:,} simulaciones Monte Carlo** considerando RMSE={RMSE_MODELO}")

        tab_over, tab_under = st.tabs(["⬆️ OVER", "⬇️ UNDER"])

        # OVER TAB
        with tab_over:
            probs_over = resultado['probabilidades_over']
            df_over_incertidumbre = _render_mercado(
                lambda_pred, probs_over, 'over', '📈',
                '#2ecc71', 'rgba(46, 204, 113, 0.2)'
            )

        # UNDER TAB
        with tab_under:
            probs_under = resultado['probabilidades_under']
            df_under_incertidumbre = _render_mercado(
                lambda_pred, probs_under, 'under', '📉',
                '#e74c3c', 'rgba(231, 76, 60, 0.2)'
            )

        st.write("")
        st.write("")
        st.markdown("---")
        st.write("")
        st.write("")

        # ============================================
        # 4. ADVANCED CALCULATOR
        # ============================================
        st.markdown("## 💰 Calculadora de Valor")

        st.write("")

        # Merge both markets, keyed by the display label ("Over 8.5", ...).
        todas_lineas_datos = {}

        for item in df_over_incertidumbre:
            todas_lineas_datos[item['Línea']] = item

        for item in df_under_incertidumbre:
            todas_lineas_datos[item['Línea']] = item

        # With reverse=True the Under lines come first, each market sorted
        # from the highest line to the lowest.
        todas_lineas_ordenadas = sorted(
            todas_lineas_datos.keys(),
            key=lambda x: (0 if 'Over' in x else 1, float(x.split()[1])),
            reverse=True
        )

        col_calc1, col_calc2 = st.columns(2)

        with col_calc1:
            linea_calc = st.selectbox(
                "🎯 Selecciona línea",
                todas_lineas_ordenadas,
                key="calc_linea"
            )

        with col_calc2:
            momio_casa = st.number_input(
                "💰 Momio del casino",
                min_value=1.01,
                max_value=20.0,
                value=2.0,
                step=0.01,
                key="calc_momio",
                help="Ingresa el momio decimal que ofrece la casa de apuestas"
            )

        st.write("")

        datos_linea = todas_lineas_datos[linea_calc]

        prob_media = datos_linea['prob_media_raw']
        prob_low = datos_linea['prob_low_raw']
        prob_high = datos_linea['prob_high_raw']

        recomendacion = recomendar_apuesta_avanzada(
            prob_media, prob_low, prob_high, momio_casa
        )

        st.markdown("### 📊 Métricas de la Apuesta")

        col_m1, col_m2, col_m3, col_m4 = st.columns(4)

        with col_m1:
            st.metric(
                "Prob. Media",
                f"{prob_media:.1f}%",
                help="Probabilidad media según Monte Carlo"
            )

        with col_m2:
            momio_justo = probabilidad_a_momio(prob_media)
            st.metric(
                "Momio Justo",
                f"@{momio_justo:.2f}",
                help="Momio que refleja la probabilidad real"
            )

        with col_m3:
            ev_positivo = recomendacion['ev'] > 0
            delta_ev = "📈 Positivo" if ev_positivo else "📉 Negativo"
            st.metric(
                "Expected Value",
                f"{recomendacion['ev']:+.2f}%",
                delta=delta_ev,
                # The delta text never starts with "-", so Streamlit would
                # always colour it as a gain; invert the colour when the EV
                # is not positive so it is shown in red.
                delta_color="normal" if ev_positivo else "inverse",
                help="Ganancia esperada por cada $1 apostado"
            )

        with col_m4:
            st.metric(
                "Prob. Casino",
                f"{recomendacion['prob_casa']:.1f}%",
                help="Probabilidad implícita del momio del casino"
            )

        st.write("")
        st.write("")

        st.markdown("### 💵 Gestión de Bankroll (Kelly Criterion)")

        col_kelly1, col_kelly2 = st.columns(2)

        with col_kelly1:
            if recomendacion['kelly'] > 0:
                st.write(f"**Kelly Completo:** {recomendacion['kelly']:.2f}% del bankroll")
                st.write(f"**Kelly Conservador (1/4):** {recomendacion['kelly_conservador']:.2f}% del bankroll ⭐")

                st.write("")
                st.markdown("**Ejemplo con Bankroll de $1,000:**")
                apuesta_kelly = (recomendacion['kelly'] / 100) * 1000
                apuesta_conservador = (recomendacion['kelly_conservador'] / 100) * 1000

                st.write(f"- Kelly Completo: **${apuesta_kelly:.2f}**")
                st.write(f"- Conservador: **${apuesta_conservador:.2f}**")

                # Profit shown is for the conservative stake at the casino odds.
                ganancia_potencial = apuesta_conservador * (momio_casa - 1)
                st.write(f"- Ganancia potencial: **${ganancia_potencial:.2f}**")
            else:
                st.error("❌ Kelly = 0 - No apostar")

        with col_kelly2:
            st.write(f"**EV:** {recomendacion['ev']:+.2f}%")
            st.write(f"**Margen de Seguridad:** {recomendacion['margen_seguridad']:+.1f}%")
            st.write(f"**IC 90%:** [{prob_low:.1f}%, {prob_high:.1f}%]")

            st.write("")

            if recomendacion['confianza_alta']:
                st.success("✅ Alta confianza: IC inferior supera prob. casino")
            else:
                st.warning("⚠️ Baja confianza: IC inferior NO supera prob. casino")

            if recomendacion['ev'] > 10:
                st.success("🟢 EV excelente (>10%)")
            elif recomendacion['ev'] > 5:
                st.info("🟡 EV bueno (5-10%)")
            elif recomendacion['ev'] > 0:
                st.warning("🟠 EV positivo pero bajo (<5%)")
            else:
                st.error("🔴 EV negativo")

        # Footer
        st.write("")
        st.write("")
        st.markdown("---")
        st.caption(f"🤖 XGBoost v4.2 + Monte Carlo | 🎲 {N_SIMULACIONES:,} simulaciones | 📊 RMSE: {RMSE_MODELO} | ⏰ {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

elif not option:
    st.info("👆 Selecciona una liga para comenzar")
elif not (option_local and option_away):
    st.info("👆 Selecciona ambos equipos")
# Otherwise league and both teams are chosen: nothing to show until the
# predict button is pressed.
782
+
783
# Sidebar: project links, the list of supported leagues and a reset button.
with st.sidebar:
    st.markdown("## Corners Forecast")

    st.markdown("---")

    st.markdown("### 🔗 Enlaces")
    st.markdown("""
    [![GitHub](https://img.shields.io/badge/GitHub-Repository-181717?style=flat&logo=github)](https://github.com/danielsaed/futbol_corners_forecast)

    [![Hugging Face](https://img.shields.io/badge/🤗_Hugging_Face-API-FFD21E?style=flat)](https://huggingface.co/spaces/daniel-saed/futbol-corners-forecast-api)
    """)

    st.markdown("---")

    st.markdown("### Ligas")
    # One bullet per league name (the keys of LEAGUES_DICT).
    for nombre_liga in LEAGUES_DICT:
        st.write(f"• {nombre_liga}")

    # Button that clears the data cache and the stored prediction, then
    # restarts the script.
    limpiar_cache = st.button("🗑️ Limpiar Cache", use_container_width=True)
    if limpiar_cache:
        st.cache_data.clear()
        st.session_state.prediccion_realizada = False
        st.session_state.resultado_api = None
        st.success("✅ Cache limpiado")
        st.rerun()

    st.markdown("---")