# Block 2: Load and Prepare Data
# ------------------------------
# This block loads the data from your Excel file, fixes the header,
# and prepares it for the model.

import pandas as pd

# Column names assigned to the sheet, in order. The file's own header row is
# skipped on load and replaced with these, which avoids a 'KeyError' when
# the original header text does not match.
correct_column_names = ['Id', 'Review', 'Rating']

# Bounds of the rating scale that is mapped onto the 0-1 range.
RATING_MIN = 1
RATING_MAX = 10


def prepare_regression_data(raw, rating_min=RATING_MIN, rating_max=RATING_MAX):
    """Clean a raw review table and add a 0-1 normalized rating column.

    Args:
        raw: DataFrame with exactly ``len(correct_column_names)`` columns,
            in the order Id, Review, Rating. It is not modified.
        rating_min: Lowest valid rating; maps to 0.0.
        rating_max: Highest valid rating; maps to 1.0.

    Returns:
        A new DataFrame with columns 'Id', 'Review', 'Rating' (numeric) and
        'normalized_rating'. Rows are removed when the review is missing or
        blank, or when the rating is missing, non-numeric, or outside
        ``[rating_min, rating_max]``. The original row index is kept.

    Raises:
        ValueError: If ``raw`` has the wrong number of columns, or if
            ``rating_max`` is not greater than ``rating_min``.
    """
    if rating_max <= rating_min:
        raise ValueError(
            f"rating_max ({rating_max}) must be greater than "
            f"rating_min ({rating_min})"
        )
    if raw.shape[1] != len(correct_column_names):
        raise ValueError(
            f"Expected {len(correct_column_names)} columns "
            f"{correct_column_names}, but the data has {raw.shape[1]}"
        )

    cleaned = raw.copy()
    cleaned.columns = correct_column_names

    # Values that cannot be parsed as numbers become NaN and are dropped
    # together with rows that have no review at all.
    cleaned['Rating'] = pd.to_numeric(cleaned['Rating'], errors='coerce')
    cleaned = cleaned.dropna(subset=['Rating', 'Review'])

    # A whitespace-only review carries no text for the model, so treat it
    # as missing too.
    has_text = cleaned['Review'].astype(str).str.strip().ne('')

    # Ratings outside the scale would normalize to values outside 0-1,
    # so they are treated as invalid rather than passed to the model.
    in_range = cleaned['Rating'].between(rating_min, rating_max)

    cleaned = cleaned[has_text & in_range].copy()

    # Map [rating_min, rating_max] linearly onto [0, 1].
    span = float(rating_max - rating_min)
    cleaned['normalized_rating'] = (cleaned['Rating'] - rating_min) / span
    return cleaned


# Running this file as a script or as a notebook cell executes the block
# below (``__name__`` is "__main__" in both cases), so ``df`` and
# ``df_regression`` are still defined as module-level names for later blocks.
if __name__ == "__main__":
    print("--- Loading and Preparing Data ---")

    # 1. Load the Excel file, skipping the bad header row. We explicitly
    #    tell pandas there is no header to read.
    raw_df = pd.read_excel('train_best.xlsx', header=None, skiprows=1)

    # 2-4. Assign the column names, clean the rows, and normalize 'Rating'.
    df = prepare_regression_data(raw_df)

    # 5. Create our final, clean DataFrame for the model.
    df_regression = df[['Review', 'normalized_rating']].copy()

    print("✅ Data loaded and prepared successfully!")
    print("\nHere's a sample of the prepared data:")
    print(df_regression.head())