# Viewer residue captured together with the file (not part of the script):
# Spaces status: Sleeping | File size: 2,098 Bytes | revision 3ccf31a
from importlib import reload
import pandas as pd
import nltk
import sys
import os
# Append the directory one level above this file to the import search path
# (presumably so the `utils` package imported further down can be resolved).
sys.path.append(
    os.path.abspath(
        os.path.join(os.path.dirname(__file__), os.pardir)
    )
)
# Location of the reviews CSV inside the dataset folder (where kagglehub put
# it). The path is relative, so it is resolved against the working directory.
csv_file = "./code/dataset/Datafiniti_Amazon_Consumer_Reviews_of_Amazon_Products_May19.csv"
def load_data(path=None) -> pd.DataFrame:
    """Load the reviews dataset into a DataFrame.

    Args:
        path: Optional CSV source; anything ``pandas.read_csv`` accepts
            (for example a path string or an open text buffer). When
            omitted, the module-level ``csv_file`` path is used, which is
            the original zero-argument behaviour.

    Returns:
        The parsed CSV contents as a ``pandas.DataFrame``.

    Raises:
        FileNotFoundError: If the target file does not exist. The default
            ``csv_file`` is a relative path, so it is resolved against the
            current working directory.
    """
    # Resolve csv_file at call time rather than as a default argument, so a
    # later rebinding of the module-level constant is still honoured.
    source = csv_file if path is None else path
    return pd.read_csv(source)
# Import the utility modules as module objects first: reload() takes the
# module itself, not the individual names imported from it.
import utils.data_process
import utils.metadata
import utils.text_cleaning
# Re-execute each module so that the latest code on disk is used
# (presumably for notebook-style re-runs in a long-lived session).
reload(utils.data_process)
reload(utils.metadata)
reload(utils.text_cleaning)
# Bind the helper names only after the reloads, so they refer to the
# reloaded definitions rather than to stale ones.
# NOTE(review): in the lines visible here only preprocess_reviews is called;
# the other imported names appear unused - confirm before removing any.
from utils.data_process import tokenize, clean_text, filter_data, remove_empty, remove_duplicates
from utils.text_cleaning import preprocess_reviews
from utils.metadata import unique_brands, unique_categories, unique_primary_categories, unique_manufacturers, unique_product_names
# Optional pre-check: only meaningful when `df` already exists from an
# earlier run in the same session; on a fresh run `df` has not been assigned
# yet at this point, so the check is skipped.
if 'df' in locals():
    print("\nChecking 'reviews.text' column...")
    if "reviews.text" in df.columns:
        print("✅ 'reviews.text' column found.")
    else:
        print("❌ 'reviews.text' column is missing. Check dataset format.")
else:
    print("\n⚠️ Skipping column check — df not loaded.")
# Check load_data: read the CSV and report its shape and column names.
print("\nChecking load_data...")
try:
    df = load_data()
    print("✅ load_data() succeeded.")
    print("Shape:", df.shape)
    print("Columns:", df.columns.tolist())
except Exception as e:
    # The broad catch is deliberate: this is a smoke test that reports any
    # failure and lets the remaining checks run.
    print("❌ load_data() failed:", e)
# Check 'reviews.text' column
print("\nChecking 'reviews.text' column...")
# `df` only exists if load_data() succeeded (now or earlier in the same
# session). Without this guard, a failed load made the lookup below raise
# NameError and abort the rest of the script.
if 'df' not in locals():
    print("⚠️ Skipping column check — df not loaded.")
elif "reviews.text" in df.columns:
    print("✅ 'reviews.text' column found.")
else:
    print("❌ 'reviews.text' column is missing. Check dataset format.")
# Check preprocess_reviews: run it on a three-row sample of the dataset.
print("\nChecking preprocess_reviews...")
if 'df' not in locals():
    # Without a loaded frame there is nothing to preprocess. Report a skip
    # instead of blaming preprocess_reviews() for a NameError on `df`.
    print("⚠️ Skipping preprocess_reviews check — df not loaded.")
else:
    try:
        # Work on a copy so the check cannot modify the loaded frame.
        df_sample = df.head(3).copy()
        df_cleaned = preprocess_reviews(df_sample, column="reviews.text")
        print("✅ preprocess_reviews() succeeded.")
        print("Sample cleaned text:\n", df_cleaned["reviews.text"])
    except Exception as e:
        # Smoke test: report any failure rather than crash.
        print("❌ preprocess_reviews() failed:", e)
# End of script.