nananie143 committed on
Commit
a3dc3ee
·
verified ·
1 Parent(s): 3057638

feat: Add data source integration module

Browse files
Files changed (1) hide show
  1. src/data/integration.py +283 -0
src/data/integration.py ADDED
@@ -0,0 +1,283 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Data Source Integration
3
+ Connects all blueprint data collectors to the main application.
4
+ """
5
+
6
+ import logging
7
+ from typing import Dict, List, Optional
8
+ import pandas as pd
9
+ from datetime import datetime, timedelta
10
+ import asyncio
11
+ from concurrent.futures import ThreadPoolExecutor
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
# Optional collector backends.
#
# Each collector lives in its own module and is imported defensively: when the
# import fails, the factory name is bound to None and a warning is logged, so
# the rest of this module can test the name for truthiness before calling it.

# Football-Data.co.uk
try:
    from src.data.collectors.football_data import get_collector as get_fdcouk_collector
except ImportError:
    get_fdcouk_collector = None
    logger.warning("Football-data collector not available")

# FBRef
try:
    from src.data.collectors.fbref_scraper import get_scraper as get_fbref_scraper
except ImportError:
    get_fbref_scraper = None
    logger.warning("FBRef scraper not available")

# Understat
try:
    from src.data.collectors.understat_api import get_api as get_understat_api
except ImportError:
    get_understat_api = None
    logger.warning("Understat API not available")

# Sofascore
try:
    from src.data.collectors.sofascore_api import get_api as get_sofascore_api
except ImportError:
    get_sofascore_api = None
    logger.warning("Sofascore API not available")

# StatsBomb
try:
    from src.data.collectors.statsbomb_loader import get_loader as get_statsbomb_loader
except ImportError:
    get_statsbomb_loader = None
    logger.warning("StatsBomb loader not available")
45
+
46
+
47
class DataSourceManager:
    """
    Manages all data sources and provides unified data access.

    Connects:
    - Football-Data.co.uk (historical results, odds)
    - FBRef (advanced stats)
    - Understat (xG data)
    - Sofascore (live data)
    - StatsBomb (open data)

    Collectors are stored in ``self.collectors`` keyed by the names listed in
    ``KNOWN_SOURCES``. A collector whose module failed to import, or whose
    factory raised, is simply absent from the dict; every fetch method checks
    for the key first and degrades to an empty DataFrame.
    """

    # Registry keys of every source this manager knows how to initialise.
    # Also fixes the key order of the 'available' map in get_status().
    KNOWN_SOURCES = ('football_data', 'fbref', 'understat', 'sofascore', 'statsbomb')

    # Season codes requested by refresh_all_data() when the caller gives none.
    DEFAULT_REFRESH_SEASONS = ('2425', '2324')

    def __init__(self):
        """Create the manager and initialise every collector that is available."""
        self.collectors = {}
        self._initialize_collectors()

    def _initialize_collectors(self):
        """Initialize all available collectors.

        A factory name bound to a falsy value (None when its import failed)
        is skipped. A factory that raises is logged and skipped so one broken
        source cannot prevent the others from loading.
        """
        # (registry key, factory or None, name used in the success log,
        #  name used in the failure log)
        factories = (
            ('football_data', get_fdcouk_collector, "Football-Data.co.uk collector", "football-data"),
            ('fbref', get_fbref_scraper, "FBRef scraper", "fbref"),
            ('understat', get_understat_api, "Understat API", "understat"),
            ('sofascore', get_sofascore_api, "Sofascore API", "sofascore"),
            ('statsbomb', get_statsbomb_loader, "StatsBomb loader", "statsbomb"),
        )

        for key, factory, display_name, short_name in factories:
            if not factory:
                continue
            try:
                self.collectors[key] = factory()
                logger.info(f"✅ {display_name} initialized")
            except Exception as e:
                logger.error(f"Failed to init {short_name}: {e}")

    def get_status(self) -> Dict:
        """Get status of all data sources.

        Returns:
            Dict with 'sources' (keys of the initialised collectors), 'count'
            (how many there are) and 'available' (a bool per known source).
        """
        return {
            'sources': list(self.collectors.keys()),
            'count': len(self.collectors),
            'available': {
                name: name in self.collectors for name in self.KNOWN_SOURCES
            },
        }

    def fetch_upcoming_fixtures(
        self,
        days_ahead: int = 7,
        leagues: Optional[List[str]] = None
    ) -> pd.DataFrame:
        """
        Fetch upcoming fixtures from all sources.

        Args:
            days_ahead: Number of days to look ahead. Only forwarded to the
                Sofascore collector; the Football-Data call takes no window.
            leagues: Accepted for interface compatibility but currently not
                applied to the result.
                NOTE(review): implement filtering once the league column of
                the collector frames is confirmed.

        Returns:
            Combined DataFrame of upcoming fixtures with a 'source' column
            naming the collector each row came from, or an empty DataFrame
            when no collector produced any rows.
        """
        all_fixtures = []

        # Try Sofascore first (best for live data)
        if 'sofascore' in self.collectors:
            try:
                fixtures = self.collectors['sofascore'].get_fixtures(days=days_ahead)
                if fixtures is not None and len(fixtures) > 0:
                    # assign() tags a copy, so the frame owned by the
                    # collector is never modified.
                    all_fixtures.append(fixtures.assign(source='sofascore'))
                    logger.info(f"Got {len(fixtures)} fixtures from Sofascore")
            except Exception as e:
                logger.error(f"Sofascore fixtures error: {e}")

        # Try Football-Data
        if 'football_data' in self.collectors:
            try:
                fixtures = self.collectors['football_data'].get_upcoming_fixtures()
                if fixtures is not None and len(fixtures) > 0:
                    all_fixtures.append(fixtures.assign(source='football_data'))
                    logger.info(f"Got {len(fixtures)} fixtures from Football-Data")
            except Exception as e:
                logger.error(f"Football-Data fixtures error: {e}")

        if not all_fixtures:
            return pd.DataFrame()

        combined = pd.concat(all_fixtures, ignore_index=True)
        # Keep the first row per (home_team, away_team). Sofascore frames are
        # appended first, so its row wins when both sources list a pairing.
        # NOTE(review): the match date is not part of the key, so two meetings
        # of the same pairing inside the window collapse into one row.
        # Confirm that is intended.
        if 'home_team' in combined.columns and 'away_team' in combined.columns:
            combined = combined.drop_duplicates(
                subset=['home_team', 'away_team'],
                keep='first'
            )
        return combined

    def fetch_historical_data(
        self,
        seasons: Optional[List[str]] = None,
        leagues: Optional[List[str]] = None
    ) -> pd.DataFrame:
        """Fetch historical match data from all sources.

        Args:
            seasons: Forwarded unchanged to the Football-Data collector.
            leagues: Forwarded unchanged to the Football-Data collector.

        Returns:
            Row-wise concatenation of every non-empty frame returned by the
            Football-Data and StatsBomb collectors, or an empty DataFrame.
        """
        all_data = []

        # Football-Data.co.uk (primary source)
        if 'football_data' in self.collectors:
            try:
                data = self.collectors['football_data'].fetch_all_leagues(
                    seasons=seasons,
                    leagues=leagues
                )
                if data is not None and len(data) > 0:
                    all_data.append(data)
                    logger.info(f"Got {len(data)} matches from Football-Data")
            except Exception as e:
                logger.error(f"Football-Data historical error: {e}")

        # StatsBomb (free open data)
        # NOTE(review): the method is named load_competitions() but its rows
        # are logged and merged as matches. Verify against the loader.
        if 'statsbomb' in self.collectors:
            try:
                data = self.collectors['statsbomb'].load_competitions()
                if data is not None and len(data) > 0:
                    all_data.append(data)
                    logger.info(f"Got {len(data)} matches from StatsBomb")
            except Exception as e:
                logger.error(f"StatsBomb error: {e}")

        if all_data:
            return pd.concat(all_data, ignore_index=True)

        return pd.DataFrame()

    def fetch_xg_data(
        self,
        league: Optional[str] = None,
        team: Optional[str] = None
    ) -> pd.DataFrame:
        """Fetch expected goals data from Understat.

        Args:
            league: Forwarded to the collector's get_team_xg().
            team: Forwarded to the collector's get_team_xg().

        Returns:
            The collector's result, or an empty DataFrame when the collector
            is missing, raises, or returns None.
        """
        if 'understat' not in self.collectors:
            logger.warning("Understat API not available")
            return pd.DataFrame()

        try:
            xg = self.collectors['understat'].get_team_xg(
                league=league,
                team=team
            )
        except Exception as e:
            logger.error(f"Understat xG error: {e}")
            return pd.DataFrame()

        # Honour the declared return type even if the collector yields None.
        return pd.DataFrame() if xg is None else xg

    def fetch_advanced_stats(
        self,
        league: Optional[str] = None,
        season: Optional[str] = None
    ) -> pd.DataFrame:
        """Fetch advanced statistics from FBRef.

        Args:
            league: Forwarded to the collector's get_league_stats().
            season: Forwarded to the collector's get_league_stats().

        Returns:
            The collector's result, or an empty DataFrame when the collector
            is missing, raises, or returns None.
        """
        if 'fbref' not in self.collectors:
            logger.warning("FBRef scraper not available")
            return pd.DataFrame()

        try:
            stats = self.collectors['fbref'].get_league_stats(
                league=league,
                season=season
            )
        except Exception as e:
            logger.error(f"FBRef stats error: {e}")
            return pd.DataFrame()

        # Honour the declared return type even if the collector yields None.
        return pd.DataFrame() if stats is None else stats

    def refresh_all_data(
        self,
        days_ahead: int = 14,
        seasons: Optional[List[str]] = None
    ) -> Dict:
        """Refresh data from all sources.

        Args:
            days_ahead: Fixture look-ahead window passed to
                fetch_upcoming_fixtures().
            seasons: Season codes passed to fetch_historical_data(). Defaults
                to DEFAULT_REFRESH_SEASONS when omitted.

        Returns:
            Dict with 'timestamp' (local ISO-8601 time the refresh started),
            'sources' (the get_status() report), 'total_fixtures' and
            'total_historical' (row counts of the two fetches).
        """
        if seasons is None:
            seasons = list(self.DEFAULT_REFRESH_SEASONS)

        results = {
            'timestamp': datetime.now().isoformat(),
            'sources': {},
            'total_fixtures': 0,
            'total_historical': 0
        }

        # Fetch fixtures
        fixtures = self.fetch_upcoming_fixtures(days_ahead=days_ahead)
        results['total_fixtures'] = len(fixtures)

        # Fetch historical
        historical = self.fetch_historical_data(seasons=seasons)
        results['total_historical'] = len(historical)

        # Source status
        results['sources'] = self.get_status()

        logger.info(f"Data refresh complete: {results['total_fixtures']} fixtures, {results['total_historical']} historical")

        return results
262
+
263
+
264
# Module-level slot for the shared manager; stays None until first requested.
_manager: Optional[DataSourceManager] = None


def get_data_manager() -> DataSourceManager:
    """Return the shared DataSourceManager, building it on the first call."""
    global _manager
    if _manager is not None:
        return _manager
    _manager = DataSourceManager()
    return _manager
274
+
275
+
276
def fetch_all_fixtures(days: int = 7) -> pd.DataFrame:
    """Shortcut: fetch upcoming fixtures for the next ``days`` days via the shared manager."""
    manager = get_data_manager()
    return manager.fetch_upcoming_fixtures(days_ahead=days)
279
+
280
+
281
def get_data_status() -> Dict:
    """Shortcut: return the shared manager's data source status report."""
    manager = get_data_manager()
    return manager.get_status()