sghorbal committed on
Commit
1fa825b
·
1 Parent(s): 8a553ae

add year matches ingestion endpoint

Browse files
Files changed (2) hide show
  1. src/main.py +39 -5
  2. src/service/match.py +56 -1
src/main.py CHANGED
@@ -36,7 +36,11 @@ from src.entity.player import (
36
  PlayerApiDetail,
37
  )
38
  from src.repository.common import get_session
39
- from src.service.match import insert_new_match
 
 
 
 
40
 
41
  from contextlib import asynccontextmanager
42
  from src.api_factory import create_forward_endpoint, get_remote_params
@@ -263,7 +267,8 @@ async def get_match(
263
  @app.post("/match/insert", tags=["match"], description="Insert a match into the database")
264
  async def insert_match(
265
  raw_match: RawMatch,
266
- session: Session = Depends(provide_connection)
 
267
  ):
268
  """
269
  Insert a match into the database
@@ -271,7 +276,8 @@ async def insert_match(
271
  try:
272
  match = insert_new_match(
273
  db=session,
274
- raw_match=raw_match.model_dump(exclude_unset=True)
 
275
  )
276
  except IntegrityError as e:
277
  logger.error(f"Error inserting match: {e}")
@@ -282,15 +288,42 @@ async def insert_match(
282
 
283
  output = {
284
  "status": "ok",
285
- "match_id": match.id,
286
  }
287
 
288
  return JSONResponse(content=output, status_code=HTTP_200_OK)
289
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
290
 
291
  @app.post("/batch/match/insert", tags=["match"], description="Insert a batch of matches into the database")
292
  async def insert_batch_match(
293
  raw_matches: list[RawMatch],
 
294
  session: Session = Depends(provide_connection)
295
  ):
296
  """
@@ -302,7 +335,8 @@ async def insert_batch_match(
302
  try:
303
  match = insert_new_match(
304
  db=session,
305
- raw_match=raw_match.model_dump(exclude_unset=True)
 
306
  )
307
  matches.append(match)
308
  except IntegrityError as e:
 
36
  PlayerApiDetail,
37
  )
38
  from src.repository.common import get_session
39
+ from src.service.match import (
40
+ insert_new_match,
41
+ fetch_raw_data,
42
+ get_cleaned_data,
43
+ )
44
 
45
  from contextlib import asynccontextmanager
46
  from src.api_factory import create_forward_endpoint, get_remote_params
 
267
  @app.post("/match/insert", tags=["match"], description="Insert a match into the database")
268
  async def insert_match(
269
  raw_match: RawMatch,
270
+ session: Session = Depends(provide_connection),
271
+ on_conflict_do_nothing: bool = False,
272
  ):
273
  """
274
  Insert a match into the database
 
276
  try:
277
  match = insert_new_match(
278
  db=session,
279
+ raw_match=raw_match.model_dump(exclude_unset=True),
280
+ on_conflict_do_nothing=on_conflict_do_nothing,
281
  )
282
  except IntegrityError as e:
283
  logger.error(f"Error inserting match: {e}")
 
288
 
289
  output = {
290
  "status": "ok",
291
+ "match_id": match.id if match else None,
292
  }
293
 
294
  return JSONResponse(content=output, status_code=HTTP_200_OK)
295
 
296
+ @app.post("/match/ingest_year", tags=["match"], description="Ingest matches from tennis-data.co.uk for a given year")
297
+ async def ingest_matches(
298
+ year: Optional[int] = None,
299
+ session: Session = Depends(provide_connection)
300
+ ):
301
+ """
302
+ Ingest matches from tennis-data.co.uk for a given year
303
+ """
304
+ fetch_raw_data(db=session, year=year)
305
+ # Get the cleaned data
306
+ df = get_cleaned_data(year=year)
307
+
308
+ # Send requests of 100 matches
309
+ for i in range(0, len(df), 100):
310
+ start = i
311
+ end = start + 99
312
+ df_small = df.loc[start:end]
313
+
314
+ response = await insert_batch_match(
315
+ raw_matches=df_small.to_dict(orient='records'),
316
+ on_conflict_do_nothing=True,
317
+ session=session
318
+ )
319
+
320
+ if response.status_code != HTTP_200_OK:
321
+ logger.error(f"Batch insert failed: {response.status_code} - {response.text}")
322
 
323
  @app.post("/batch/match/insert", tags=["match"], description="Insert a batch of matches into the database")
324
  async def insert_batch_match(
325
  raw_matches: list[RawMatch],
326
+ on_conflict_do_nothing: bool = False,
327
  session: Session = Depends(provide_connection)
328
  ):
329
  """
 
335
  try:
336
  match = insert_new_match(
337
  db=session,
338
+ raw_match=raw_match.model_dump(exclude_unset=True) if isinstance(raw_match, RawMatch) else raw_match,
339
+ on_conflict_do_nothing=on_conflict_do_nothing,
340
  )
341
  matches.append(match)
342
  except IntegrityError as e:
src/service/match.py CHANGED
@@ -1,7 +1,13 @@
 
 
 
 
 
1
  from typing import Dict, List, Tuple
 
2
  from sqlalchemy.exc import IntegrityError
3
  from sqlalchemy.orm import Session
4
- import logging
5
  from src.entity.match import Match
6
  from src.entity.odds import Odds
7
  from src.entity.player import Player
@@ -119,3 +125,52 @@ def _should_fetch_details(player: Player) -> bool:
119
  Check if player details should be fetched
120
  """
121
  return player.tennis_id is None or player.caracteristics is None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import logging
3
+ import requests
4
+ import numpy as np
5
+ import pandas as pd
6
  from typing import Dict, List, Tuple
7
+ from datetime import datetime
8
  from sqlalchemy.exc import IntegrityError
9
  from sqlalchemy.orm import Session
10
+
11
  from src.entity.match import Match
12
  from src.entity.odds import Odds
13
  from src.entity.player import Player
 
125
  Check if player details should be fetched
126
  """
127
  return player.tennis_id is None or player.caracteristics is None
128
+
129
def fetch_raw_data(db: Session, year: int) -> None:
    """
    Fetch data from tennis-data.co.uk for a given year and save it to a file.

    Args:
        db (Session): Database session. Unused here; kept so the service
            functions share a uniform signature. -- NOTE(review): confirm
            whether this parameter can be dropped at the call sites.
        year (int, optional): Year to retrieve. If None, fetch current year data.

    Raises:
        requests.HTTPError: If the remote server answers with a 4xx/5xx.
    """
    current_year = datetime.now().year

    if not year:
        year = current_year

    filename = f"{year}.xlsx"
    # Interpolate the computed filename: previously the path was a fixed
    # literal, so every year clobbered the same file on disk.
    file_path = f"./data/atp/{filename}"

    # Re-use the cached file for past years only; the current year's
    # spreadsheet is still being updated upstream, so always re-fetch it.
    if os.path.exists(file_path) and year != current_year:
        logging.info(f"File {file_path} already exists. Skipping download.")
        return

    logging.info(f"Fetching data from tennis-data.co.uk for year {year}")

    url = f"http://www.tennis-data.co.uk/{year}/{filename}"

    # Ensure the target directory exists before opening the output file.
    os.makedirs(os.path.dirname(file_path), exist_ok=True)

    # Stream the download in chunks; the context manager guarantees the
    # HTTP connection is released even when raise_for_status() throws.
    with requests.get(url, stream=True) as response:
        # Check response status code
        response.raise_for_status()

        with open(file_path, "wb") as file:
            for chunk in response.iter_content(chunk_size=8192):
                file.write(chunk)

    logging.info(f"Data fetched from {url} 👍 and saved to {file_path}")
164
+
165
def get_cleaned_data(year: int) -> pd.DataFrame:
    """
    Load and clean the downloaded tennis-data file for a given year.

    Rows missing either player ranking are dropped, missing set counts
    default to 0, and the remaining NaN values become None so the rows
    serialize cleanly via ``to_dict(orient='records')``.

    NOTE(review): fetch_raw_data saves ``<year>.xlsx`` but this reads
    ``<year>.csv`` — confirm a conversion step exists between the two.

    Args:
        year (int): Year whose ./data/atp/<year>.csv file to load.

    Returns:
        pd.DataFrame: Cleaned match rows. The original index labels are
        preserved (dropna does not reset the index).
    """
    df = pd.read_csv(f'./data/atp/{year}.csv')
    # Remove rows where LRank or WRank is NaN
    df = df.dropna(subset=['LRank', 'WRank'])
    # A missing set count is treated as zero sets won.
    df['Lsets'] = df['Lsets'].fillna(0)
    df['Wsets'] = df['Wsets'].fillna(0)
    # Replace NaN values with None. (The former second pass,
    # df.where(pd.notnull(df), None), was a duplicate no-op.)
    df = df.replace({np.nan: None})

    return df