Spaces:
Sleeping
Sleeping
# Block 2: Load and Prepare Data
# ------------------------------
# Reads the review spreadsheet, replaces its header with known column
# names, removes unusable rows, and builds the frame used for regression.

import pandas as pd

print("--- Loading and Preparing Data ---")

# Labels applied to the columns once the sheet has been read.
correct_column_names = ['Id', 'Review', 'Rating']

# Read the sheet with header inference turned off; the first row of the
# file is skipped so it never ends up among the data rows.
df = pd.read_excel('train_best.xlsx', skiprows=1, header=None)

# Assign the labels by hand so that later lookups by name ('Review',
# 'Rating') do not depend on whatever the file's own header contains.
df.columns = correct_column_names

# Coerce 'Rating' to a numeric dtype (unparseable values become NaN), then
# discard every row that lacks either a rating or a review.
df['Rating'] = pd.to_numeric(df['Rating'], errors='coerce')
df.dropna(subset=['Review', 'Rating'], inplace=True)

# Rescale the rating so that 1 maps to 0.0 and 10 maps to 1.0.
# NOTE(review): assumes ratings lie on a 1-10 scale; values outside that
# range would land outside [0, 1] -- confirm against the data.
df['normalized_rating'] = df['Rating'].sub(1).div(9.0)

# Independent copy holding only the model's input text and its target.
df_regression = df.loc[:, ['Review', 'normalized_rating']].copy()

print("✅ Data loaded and prepared successfully!")
print("\nHere's a sample of the prepared data:")
print(df_regression.head())