"""Clean the ODI ball-by-ball dataset and write the result to a new CSV."""

import pandas as pd

INPUT_CSV = 'ODI_Match_Data.csv'
OUTPUT_CSV = 'cleaned_ball_data.csv'

# Per-delivery run counters; a missing value is treated as zero.
_COUNT_COLUMNS = (
    'runs_off_bat',
    'extras',
    'wides',
    'noballs',
    'byes',
    'legbyes',
    'penalty',
)

# Team and player names; trimmed and title-cased.
_NAME_COLUMNS = (
    'batting_team',
    'bowling_team',
    'striker',
    'non_striker',
    'bowler',
    'player_dismissed',
    'other_player_dismissed',
)


def _clean_text(values: pd.Series, *, title_case: bool = False) -> pd.Series:
    """Return *values* as stripped strings, with missing entries as ''.

    ``astype(str)`` alone turns a missing value into the literal text
    "nan" (and ``.str.title()`` then makes it "Nan"), which is
    indistinguishable from a real name. The missing-value mask is taken
    before the conversion and re-applied afterwards so those cells end up
    empty instead.
    """
    present = values.notna()
    text = values.astype(str).str.strip()
    if title_case:
        text = text.str.title()
    return text.where(present, '')


def clean_ball_data(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of a ball-by-ball DataFrame.

    The input frame is not modified. The result has:

    * ``match_id`` cast to int;
    * ``season`` and ``other_wicket_type`` as stripped strings;
    * ``start_date`` parsed to datetime (unparseable values become NaT);
    * the run-counter columns with missing values replaced by 0, as int;
    * ``wicket_type`` replaced by a 0/1 flag (1 when a value was present);
    * team/player name columns stripped and title-cased, missing as '';
    * a ``venue`` column set to 'N/A' when the input has none;
    * ``total_runs`` = ``runs_off_bat`` + ``extras``.

    Raises:
        KeyError: if any of the columns listed above (other than
            ``venue``) is absent from *df*.
    """
    out = df.copy()

    out['match_id'] = out['match_id'].astype(int)
    out['season'] = _clean_text(out['season'])
    # errors='coerce' maps invalid dates to NaT instead of raising.
    out['start_date'] = pd.to_datetime(out['start_date'], errors='coerce')

    for column in _COUNT_COLUMNS:
        out[column] = out[column].fillna(0).astype(int)

    # The dismissal kind is reduced to a flag: 1 if a wicket fell, else 0.
    out['wicket_type'] = out['wicket_type'].notna().astype(int)

    for column in _NAME_COLUMNS:
        out[column] = _clean_text(out[column], title_case=True)
    out['other_wicket_type'] = _clean_text(out['other_wicket_type'])

    # The earlier version tried to pull a venue out of start_date with a
    # regex, but only after start_date had already been converted to
    # datetime, so that path could never yield a venue. Fall back to the
    # default directly.
    if 'venue' not in out.columns:
        out['venue'] = 'N/A'

    # Total runs for the delivery, including extras.
    out['total_runs'] = out['runs_off_bat'] + out['extras']
    return out


if __name__ == "__main__":
    # low_memory=False makes pandas read the file in one pass rather than in
    # chunks, so each column gets a single inferred dtype.
    ball_df = clean_ball_data(pd.read_csv(INPUT_CSV, low_memory=False))
    ball_df.to_csv(OUTPUT_CSV, index=False)
    print("Cleaned ball-by-ball dataset saved as 'cleaned_ball_data.csv'")