File size: 2,841 Bytes
36da710
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import pandas as pd

# Per-ball integer count columns; a missing value means "none on this ball".
COUNT_COLUMNS = ('runs_off_bat', 'extras', 'wides', 'noballs', 'byes', 'legbyes', 'penalty')

# Team / player name columns, normalised to stripped title case.
NAME_COLUMNS = (
    'batting_team',
    'bowling_team',
    'striker',
    'non_striker',
    'bowler',
    'player_dismissed',
    'other_player_dismissed',
)


def _clean_text(column, title_case=False):
    """Return *column* as stripped strings, with missing values as ''.

    A plain ``astype(str)`` turns NaN into the literal text 'nan' (and
    ``.str.title()`` then makes it 'Nan'), which is indistinguishable from a
    real value once written to CSV. Missing entries are therefore mapped to
    the empty string instead, regardless of the column's original dtype.

    Args:
        column: The pandas Series to normalise (any dtype).
        title_case: When True, also apply ``str.title()`` to each value.

    Returns:
        A new Series of strings aligned with *column*.
    """
    text = column.astype(str).str.strip()
    if title_case:
        text = text.str.title()
    # Decide "missing" from the ORIGINAL column, before it was stringified.
    return text.where(column.notna(), '')


def clean_ball_data(ball_df):
    """Clean a ball-by-ball ODI DataFrame in place and return it.

    Steps performed:
      * ``match_id`` is cast to int (raises if an id is missing or non-numeric).
      * ``season`` and ``other_wicket_type`` become stripped strings.
      * ``start_date`` is parsed to datetime; unparseable values become NaT.
      * The run/extras count columns have NaN replaced by 0 and are cast to int.
      * ``wicket_type`` is replaced by a 1/0 indicator (1 if a value was present).
      * Team and player name columns become stripped, title-cased strings.
      * Missing text values become '' rather than the string 'nan' / 'Nan'.
      * ``venue`` is added with the value 'N/A' if the column is absent.
      * ``total_runs`` is added as ``runs_off_bat + extras``.

    Args:
        ball_df: DataFrame containing the ball-by-ball columns listed above.

    Returns:
        The same DataFrame object, modified in place.

    Raises:
        KeyError: If a required column is missing.
        ValueError: If ``match_id`` or a count column cannot be cast to int.
    """
    ball_df['match_id'] = ball_df['match_id'].astype(int)
    ball_df['season'] = _clean_text(ball_df['season'])
    # errors='coerce' turns invalid dates into NaT instead of raising.
    ball_df['start_date'] = pd.to_datetime(ball_df['start_date'], errors='coerce')

    for name in COUNT_COLUMNS:
        ball_df[name] = ball_df[name].fillna(0).astype(int)

    # NOTE: this overwrites the dismissal kind with a flag: 1 if wicket, 0 if not.
    ball_df['wicket_type'] = ball_df['wicket_type'].notna().astype(int)

    for name in NAME_COLUMNS:
        ball_df[name] = _clean_text(ball_df[name], title_case=True)

    ball_df['other_wicket_type'] = _clean_text(ball_df['other_wicket_type'])

    # start_date has already been converted to datetime above, so there is no
    # free text left to extract a venue from; fall back to a placeholder.
    if 'venue' not in ball_df.columns:
        ball_df['venue'] = 'N/A'

    # Total runs for the ball, including extras.
    ball_df['total_runs'] = ball_df['runs_off_bat'] + ball_df['extras']
    return ball_df


if __name__ == '__main__':
    # low_memory=False makes pandas read the whole file before inferring
    # dtypes, which avoids mixed-type columns.
    ball_df = pd.read_csv('ODI_Match_Data.csv', low_memory=False)
    ball_df = clean_ball_data(ball_df)

    # Save cleaned ball-by-ball dataset
    ball_df.to_csv('cleaned_ball_data.csv', index=False)
    print("Cleaned ball-by-ball dataset saved as 'cleaned_ball_data.csv'")