d-e-e-k-11
/

amazon_product

Model card Files Files and versions

amazon_product / preprocess.py

d-e-e-k-11's picture

Upload folder using huggingface_hub

4416e3b verified 3 months ago

history blame contribute delete

2.12 kB

	import pandas as pd
	import json

	def _value_or_default(row, column, default):
	    """Return ``row[column]``, or *default* when the column is absent or the cell is empty.

	    ``Series.get`` only falls back to its default when the column does not
	    exist; an empty CSV cell is still returned as NaN, so it is checked here.
	    """
	    value = row.get(column, default)
	    return default if pd.isna(value) else value


	def _parse_prices(prices_raw):
	    """Decode the JSON text of a ``prices`` cell; return None if it cannot be decoded."""
	    if not isinstance(prices_raw, str):
	        return None
	    # Try the text as-is first: collapsing doubled quotes unconditionally
	    # would turn a legitimate empty string value ("") into broken JSON.
	    # The collapsed form is kept only as a fallback for still-escaped input.
	    for candidate in (prices_raw, prices_raw.replace('""', '"')):
	        try:
	            return json.loads(candidate)
	        except ValueError:  # json.JSONDecodeError is a ValueError subclass
	            continue
	    return None


	def _format_best_price(prices_raw, unavailable="Price info not available"):
	    """Return ``"<amountMin> <currency>"`` for the cheapest offer, else *unavailable*."""
	    prices_data = _parse_prices(prices_raw)
	    if not isinstance(prices_data, list):
	        return unavailable

	    best_amount = None
	    best_offer = None
	    for offer in prices_data:
	        if not isinstance(offer, dict):
	            continue
	        try:
	            amount = float(offer.get('amountMin'))
	        except (TypeError, ValueError):
	            # Missing or non-numeric amountMin: this offer cannot be compared.
	            continue
	        # Skip NaN / infinite amounts, which json.loads accepts as literals.
	        if amount != amount or abs(amount) == float('inf'):
	            continue
	        if best_amount is None or amount < best_amount:
	            best_amount = amount
	            best_offer = offer

	    if best_offer is None:
	        return unavailable
	    # Take the currency from the same offer as the amount so they always match.
	    currency = best_offer.get('currency') or 'USD'
	    return f"{best_offer['amountMin']} {currency}"


	def preprocess_data(file_path):
	    """Turn a product-review CSV into a list of text documents with metadata.

	    Rows lacking ``reviews.text`` or ``name`` are dropped. For every remaining
	    row one document is produced; empty ``brand``, ``categories``,
	    ``reviews.title`` and ``reviews.rating`` cells fall back to the same
	    defaults used when the column is missing altogether.

	    Args:
	        file_path: Path (or anything ``pandas.read_csv`` accepts) of the CSV.

	    Returns:
	        A list of dicts, each with a ``"content"`` string and a ``"metadata"``
	        dict holding ``name``, ``brand``, ``rating`` (as a string) and ``price``.
	    """
	    df = pd.read_csv(file_path)

	    # A document is only useful with both a product name and review text.
	    df = df.dropna(subset=['reviews.text', 'name'])

	    documents = []
	    for _, row in df.iterrows():
	        name = row['name']
	        text = row['reviews.text']
	        brand = _value_or_default(row, 'brand', 'Unknown')
	        categories = _value_or_default(row, 'categories', 'N/A')
	        title = _value_or_default(row, 'reviews.title', '')
	        rating = _value_or_default(row, 'reviews.rating', 'N/A')
	        price_str = _format_best_price(row.get('prices'))

	        doc_content = f"Product: {name}\nBrand: {brand}\nCategories: {categories}\nPrice: {price_str}\nReview Title: {title}\nRating: {rating}\nReview Content: {text}"

	        metadata = {
	            "name": name,
	            "brand": brand,
	            "rating": str(rating),
	            "price": price_str,
	        }

	        documents.append({"content": doc_content, "metadata": metadata})

	    return documents

	if __name__ == "__main__":
	    # Script entry point: convert the review export into JSON documents
	    # and report how many were written.
	    documents = preprocess_data("7817_1.csv")
	    with open("preprocessed_docs.json", "w") as output_file:
	        json.dump(documents, output_file, indent=2)
	    document_count = len(documents)
	    print(f"Preprocessed {document_count} documents.")