# amazon_product / preprocess.py
# d-e-e-k-11: Upload folder using huggingface_hub (commit 4416e3b, verified)
import pandas as pd
import json
def _field_or_default(row, column, default):
    """Return ``row[column]``, or *default* if the column is absent or the cell is NaN."""
    value = row.get(column, default)
    # row.get() only covers a missing column; an empty CSV cell is read as NaN.
    return default if pd.isna(value) else value


def _parse_prices(prices_raw):
    """Decode the JSON text of a ``prices`` cell; return a list, or None if unusable."""
    if not isinstance(prices_raw, str):
        return None
    # Try the text as-is first: collapsing doubled quotes unconditionally would
    # corrupt valid JSON containing empty strings (e.g. "merchant": "").
    # The collapsed form is only a fallback for cells with escaped quotes.
    for candidate in (prices_raw, prices_raw.replace('""', '"')):
        try:
            parsed = json.loads(candidate)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            continue
        return parsed if isinstance(parsed, list) else None
    return None


def _as_finite_number(value):
    """Return *value* as a float when it is a finite number, otherwise None."""
    if isinstance(value, bool):
        return None
    try:
        number = float(value)
    except (TypeError, ValueError):
        return None
    # The chained comparison is False for NaN as well as for +/- infinity.
    return number if float('-inf') < number < float('inf') else None


def _best_price_text(prices_raw):
    """Format the lowest ``amountMin`` in a ``prices`` cell as "<amount> <currency>".

    Entries that are not dicts or have no usable ``amountMin`` are skipped
    instead of invalidating the whole list. The currency is taken from the
    entry that supplies the lowest price (defaulting to "USD").
    """
    best_entry = None
    best_amount = None
    for entry in _parse_prices(prices_raw) or []:
        if not isinstance(entry, dict):
            continue
        amount = _as_finite_number(entry.get('amountMin'))
        if amount is None:
            continue
        if best_amount is None or amount < best_amount:
            best_entry, best_amount = entry, amount
    if best_entry is None:
        return "Price info not available"
    currency = best_entry.get('currency') or 'USD'
    return f"{best_entry['amountMin']} {currency}"


def preprocess_data(file_path):
    """Turn a product-review CSV into a list of text documents with metadata.

    Rows lacking ``reviews.text`` or ``name`` are dropped. The optional columns
    ``brand``, ``categories``, ``reviews.title``, ``reviews.rating`` and
    ``prices`` fall back to placeholder values when the column is missing or
    the cell is empty.

    Args:
        file_path: Path (or any source accepted by ``pandas.read_csv``) of the CSV.

    Returns:
        A list of dicts, one per kept row, each with:
          * ``"content"``: a multi-line text block describing product and review;
          * ``"metadata"``: a dict with ``name``, ``brand``, ``rating`` (as str)
            and ``price``.

    Raises:
        KeyError: if the CSV has no ``reviews.text`` or ``name`` column.
    """
    df = pd.read_csv(file_path)

    # A document without review text or product name is useless downstream.
    df = df.dropna(subset=['reviews.text', 'name'])

    documents = []
    for _, row in df.iterrows():
        name = row['name']
        text = row['reviews.text']
        brand = _field_or_default(row, 'brand', 'Unknown')
        categories = _field_or_default(row, 'categories', 'N/A')
        title = _field_or_default(row, 'reviews.title', '')
        rating = _field_or_default(row, 'reviews.rating', 'N/A')
        price_str = _best_price_text(row.get('prices'))

        doc_content = (
            f"Product: {name}\n"
            f"Brand: {brand}\n"
            f"Categories: {categories}\n"
            f"Price: {price_str}\n"
            f"Review Title: {title}\n"
            f"Rating: {rating}\n"
            f"Review Content: {text}"
        )
        metadata = {
            "name": name,
            "brand": brand,
            "rating": str(rating),
            "price": price_str,
        }
        documents.append({"content": doc_content, "metadata": metadata})
    return documents
if __name__ == "__main__":
    # Script entry point: convert the raw review CSV into documents, persist
    # them as pretty-printed JSON, then report how many were produced.
    records = preprocess_data("7817_1.csv")
    with open("preprocessed_docs.json", "w") as out_file:
        json.dump(records, out_file, indent=2)
    record_count = len(records)
    print(f"Preprocessed {record_count} documents.")