# Block 2: Load and Prepare Data
# ------------------------------
# This block loads the data from your Excel file, fixes the header,
# and prepares it for the model.

import pandas as pd

print("--- Loading and Preparing Data ---")

# Column labels applied to the sheet, in left-to-right order.
correct_column_names = ['Id', 'Review', 'Rating']

# Step 1 - read the workbook.
#   skiprows=1 discards the first row of the sheet (the unusable header),
#   and header=None stops pandas from promoting the next row to a header,
#   so every remaining row is treated as data.
df = pd.read_excel(
    'train_best.xlsx',
    header=None,
    skiprows=1,
)

# Step 2 - replace the default integer column labels with our own names so
#   that later lookups such as df['Rating'] do not fail with a KeyError.
#   (pandas raises ValueError here if the sheet does not have exactly
#   three columns.)
df.columns = correct_column_names

# Step 3 - clean up.
#   Ratings that cannot be parsed as numbers are coerced to NaN, and then
#   every row lacking either a rating or a review is removed. The drop is
#   done in place, so the surviving rows keep their original index labels.
df['Rating'] = pd.to_numeric(df['Rating'], errors='coerce')
df.dropna(subset=['Review', 'Rating'], inplace=True)

# Step 4 - rescale the rating: subtract 1, then divide by 9, so a rating
#   of 1 maps to 0.0 and a rating of 10 maps to 1.0 (ratings are assumed
#   to lie on a 1-10 scale; values outside it are not clipped).
df['normalized_rating'] = df['Rating'].sub(1).div(9.0)

# Step 5 - keep only what the model needs, as an independent copy so that
#   later edits to df_regression never touch df.
df_regression = df.loc[:, ['Review', 'normalized_rating']].copy()

print("✅ Data loaded and prepared successfully!")
print("\nHere's a sample of the prepared data:")
print(df_regression.head())