# sivapriya175
# deploy backend files
# 36da710
import pandas as pd
# Per-delivery run counters; a blank cell is treated as zero runs of that kind.
_RUN_COLUMNS = (
    'runs_off_bat', 'extras', 'wides', 'noballs', 'byes', 'legbyes', 'penalty',
)

# Team and player name columns, normalised to stripped title case.
_NAME_COLUMNS = (
    'batting_team', 'bowling_team', 'striker', 'non_striker', 'bowler',
    'player_dismissed', 'other_player_dismissed',
)


def _clean_text(column: pd.Series, title: bool = False) -> pd.Series:
    """Return *column* as stripped strings, with missing values as ''.

    Args:
        column: Series of any dtype.
        title: When true, also convert each value to title case.

    Returns:
        A new Series of strings; entries that were missing in *column*
        become the empty string.
    """
    text = column.astype(str).str.strip()
    if title:
        text = text.str.title()
    # astype(str) can render a missing value as the literal text 'nan'
    # ('Nan' after title-casing). Blank those rows out using the original
    # column's null mask so no fake value reaches the output.
    return text.where(column.notna(), '')


def clean_ball_data(ball_df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of the ball-by-ball frame.

    The input frame is not modified. The following is applied to the copy:

    * ``match_id`` is cast to int (raises if a value cannot be converted).
    * ``season`` and ``other_wicket_type`` become stripped strings.
    * ``start_date`` is parsed to datetime; unparseable values become NaT.
    * The run-count columns have blanks replaced by 0 and are cast to int.
    * ``wicket_type`` is replaced by a 1/0 flag: 1 where a value was
      present, 0 where it was missing.
    * Team and player name columns become stripped, title-cased strings.
    * ``venue`` is added with the value 'N/A' if the column is absent.
    * ``total_runs`` is added as ``runs_off_bat + extras``.

    Missing text values are written as '' rather than 'nan'/'Nan'.

    Args:
        ball_df: Raw ball-by-ball data containing the columns named above.

    Returns:
        The cleaned DataFrame.

    Raises:
        KeyError: If a required column is absent.
    """
    df = ball_df.copy()

    df['match_id'] = df['match_id'].astype(int)
    df['season'] = _clean_text(df['season'])
    # errors='coerce' turns invalid dates into NaT instead of raising.
    df['start_date'] = pd.to_datetime(df['start_date'], errors='coerce')

    for name in _RUN_COLUMNS:
        df[name] = df[name].fillna(0).astype(int)

    # 1 if a wicket fell on this delivery, 0 if not.
    df['wicket_type'] = df['wicket_type'].notna().astype(int)

    for name in _NAME_COLUMNS:
        df[name] = _clean_text(df[name], title=True)
    df['other_wicket_type'] = _clean_text(df['other_wicket_type'])

    # start_date has already been parsed to datetime at this point, so it
    # cannot supply a venue string; fall back to a placeholder when the
    # dataset has no venue column.
    if 'venue' not in df.columns:
        df['venue'] = 'N/A'

    # Total runs for the delivery, including extras.
    df['total_runs'] = df['runs_off_bat'] + df['extras']
    return df


def main(source_path: str = 'ODI_Match_Data.csv',
         output_path: str = 'cleaned_ball_data.csv') -> pd.DataFrame:
    """Load the raw CSV, clean it, save the result and return it.

    Args:
        source_path: Path of the raw ball-by-ball CSV to read.
        output_path: Path the cleaned CSV is written to.

    Returns:
        The cleaned DataFrame that was written to *output_path*.
    """
    # low_memory=False reads each column in one pass so that columns with
    # mixed types get a single dtype.
    raw_df = pd.read_csv(source_path, low_memory=False)
    cleaned_df = clean_ball_data(raw_df)
    cleaned_df.to_csv(output_path, index=False)
    print(f"Cleaned ball-by-ball dataset saved as '{output_path}'")
    return cleaned_df


if __name__ == "__main__":
    # Keep the cleaned frame available under its original module-level name.
    ball_df = main()