Spaces:

Agrannya
/

app_sentiment_analysis

Sleeping

app_sentiment_analysis / model_traing.py

Upload 9 files

ce676fe verified 7 months ago

1.37 kB

	# Block 2: Load and Prepare Data
	# ------------------------------
	# Loads the review data from the Excel file, replaces its header with known
	# column names, removes rows that cannot be used for training, and builds
	# `df_regression` (review text plus a 0-1 regression target).

	import pandas as pd

	print("--- Loading and Preparing Data ---")

	# Column names assigned to the sheet, in positional order.
	correct_column_names = ['Id', 'Review', 'Rating']

	# Bounds of the rating scale. Normalization maps RATING_MIN -> 0.0 and
	# RATING_MAX -> 1.0; ratings outside these bounds are treated as invalid.
	RATING_MIN = 1
	RATING_MAX = 10

	# 1. Load the Excel file, skipping the bad header row.
	# header=None tells pandas not to interpret any row as a header.
	df = pd.read_excel('train_best.xlsx', header=None, skiprows=1)

	# 2. Assign the known column names. Check the column count first so that a
	# sheet with an unexpected layout fails with a message that names the
	# problem instead of pandas' generic "Length mismatch" error.
	if df.shape[1] != len(correct_column_names):
		raise ValueError(
			f"Expected {len(correct_column_names)} columns {correct_column_names} "
			f"in 'train_best.xlsx', but found {df.shape[1]}."
		)
	df.columns = correct_column_names

	# 3. Clean the data:
	# - Convert 'Rating' to a number; values that cannot be converted become NaN.
	# - Keep only rows that have a review and a rating inside the scale.
	#   `between` is inclusive and evaluates to False for NaN, so missing and
	#   out-of-range ratings are removed by the same mask.
	df['Rating'] = pd.to_numeric(df['Rating'], errors='coerce')
	rows_loaded = len(df)
	is_usable = df['Review'].notna() & df['Rating'].between(RATING_MIN, RATING_MAX)
	# .copy() gives an independent frame so the column assignment below never
	# writes into a view of the unfiltered data.
	df = df.loc[is_usable].copy()
	rows_dropped = rows_loaded - len(df)
	if rows_dropped:
		print(f"Dropped {rows_dropped} row(s) with a missing review or an invalid rating.")

	# 4. Normalize 'Rating' from the RATING_MIN-RATING_MAX scale to a 0-1 scale.
	# Step 3 guarantees every remaining rating is in range, so the result is
	# always within [0, 1].
	df['normalized_rating'] = (df['Rating'] - RATING_MIN) / float(RATING_MAX - RATING_MIN)

	# 5. Create the final, clean DataFrame for the model.
	df_regression = df[['Review', 'normalized_rating']].copy()

	print("✅ Data loaded and prepared successfully!")
	print("\nHere's a sample of the prepared data:")
	print(df_regression.head())