"""Clean the ODI ball-by-ball CSV and save the result as a new CSV file."""

import pandas as pd

INPUT_CSV = 'ODI_Match_Data.csv'
OUTPUT_CSV = 'cleaned_ball_data.csv'

# Run/extras columns: missing values become 0 and the column is cast to int.
_COUNT_COLUMNS = (
    'runs_off_bat',
    'extras',
    'wides',
    'noballs',
    'byes',
    'legbyes',
    'penalty',
)

# Team and player name columns: whitespace-trimmed and title-cased.
_NAME_COLUMNS = (
    'batting_team',
    'bowling_team',
    'striker',
    'non_striker',
    'bowler',
    'player_dismissed',
    'other_player_dismissed',
)

# Captures everything after the first ", " in a raw start_date string.
_VENUE_PATTERN = r', (.+)$'


def _clean_text(column, *, title_case=False):
    """Return *column* as stripped (optionally title-cased) strings.

    Missing values become '' instead of the literal text 'nan' / 'Nan' that a
    bare ``astype(str)`` would produce, so they stay empty when written to CSV.
    """
    text = column.astype(str).str.strip()
    if title_case:
        text = text.str.title()
    # Mask on the *original* column: after astype(str) a missing value may
    # already have been turned into the string 'nan'.
    return text.where(column.notna(), '')


def _derive_venue(raw_start_date):
    """Extract the text after ', ' from raw start_date values, else 'N/A'."""
    # expand=False yields a Series (the default returns a one-column DataFrame).
    venue = raw_start_date.astype(str).str.extract(_VENUE_PATTERN, expand=False)
    return venue.fillna('N/A')


def clean_ball_data(ball_df):
    """Normalise dtypes and text columns of a ball-by-ball DataFrame.

    The frame is modified in place and also returned. Expected columns are
    ``match_id``, ``season``, ``start_date``, ``wicket_type``,
    ``other_wicket_type`` plus everything in ``_COUNT_COLUMNS`` and
    ``_NAME_COLUMNS``. A ``venue`` column is added only when absent, and a
    ``total_runs`` column (``runs_off_bat + extras``) is always added.

    Raises:
        KeyError: if an expected column is missing.
        ValueError: if ``match_id`` or a count column cannot be cast to int.
    """
    # The venue fallback needs the raw start_date text, so compute it before
    # start_date is converted to datetime below.
    derived_venue = None
    if 'venue' not in ball_df.columns:
        derived_venue = _derive_venue(ball_df['start_date'])

    ball_df['match_id'] = ball_df['match_id'].astype(int)
    ball_df['season'] = ball_df['season'].astype(str).str.strip()
    # Unparseable dates become NaT rather than raising.
    ball_df['start_date'] = pd.to_datetime(ball_df['start_date'], errors='coerce')

    for name in _COUNT_COLUMNS:
        ball_df[name] = ball_df[name].fillna(0).astype(int)

    # NOTE: this replaces the dismissal description with a 0/1 "wicket fell"
    # flag; the original text is not kept.
    ball_df['wicket_type'] = ball_df['wicket_type'].notna().astype(int)

    for name in _NAME_COLUMNS:
        ball_df[name] = _clean_text(ball_df[name], title_case=True)

    ball_df['other_wicket_type'] = _clean_text(ball_df['other_wicket_type'])

    if derived_venue is not None:
        ball_df['venue'] = derived_venue

    ball_df['total_runs'] = ball_df['runs_off_bat'] + ball_df['extras']
    return ball_df


def main():
    """Read the raw CSV, clean it, and write the cleaned CSV."""
    ball_df = pd.read_csv(INPUT_CSV, low_memory=False)
    clean_ball_data(ball_df)
    ball_df.to_csv(OUTPUT_CSV, index=False)
    print("Cleaned ball-by-ball dataset saved as 'cleaned_ball_data.csv'")


if __name__ == "__main__":
    main()