import json
import math

import pandas as pd

# Placeholder used whenever no usable price can be extracted from a row.
_PRICE_UNAVAILABLE = "Price info not available"


def _cell(row, column, default):
    """Return ``row[column]``, or *default* if the column is absent or NaN.

    ``row.get(column, default)`` alone only covers an absent column; an empty
    CSV cell is read as NaN and would otherwise be rendered as the text "nan".
    """
    value = row.get(column, default)
    return default if pd.isna(value) else value


def _parse_prices(prices_raw):
    """Decode the raw ``prices`` cell as JSON; return None if it cannot be."""
    if not isinstance(prices_raw, str):
        return None
    # Try the text unchanged first: collapsing doubled quotes up front would
    # corrupt valid JSON containing empty strings (e.g. '"condition": ""').
    # The quote repair is kept only as a fallback for still-escaped cells.
    for candidate in (prices_raw, prices_raw.replace('""', '"')):
        try:
            return json.loads(candidate)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            continue
    return None


def _format_best_price(prices_data):
    """Return "<amountMin> <currency>" for the lowest-priced entry.

    Entries that are not dicts, or whose ``amountMin`` is missing or not a
    finite number, are skipped individually. The currency is read from the
    same entry as the amount (defaulting to "USD"). Amounts are compared
    numerically as-is, with no currency conversion.
    """
    if not isinstance(prices_data, list):
        return _PRICE_UNAVAILABLE

    best = None  # (numeric amount, amount as stored, currency)
    for entry in prices_data:
        if not isinstance(entry, dict):
            continue
        amount = entry.get("amountMin")
        if isinstance(amount, bool):
            continue  # float(True) == 1.0 would pass as a price
        try:
            numeric = float(amount)
        except (TypeError, ValueError, OverflowError):
            continue
        if not math.isfinite(numeric):
            continue
        if best is None or numeric < best[0]:
            best = (numeric, amount, entry.get("currency", "USD"))

    if best is None:
        return _PRICE_UNAVAILABLE
    _, amount, currency = best
    return f"{amount} {currency}"


def preprocess_data(file_path):
    """Turn a product-review CSV into a list of text documents with metadata.

    Rows with a missing ``reviews.text`` or ``name`` are dropped. The columns
    ``brand``, ``categories``, ``reviews.title``, ``reviews.rating`` and
    ``prices`` are optional; absent or empty values fall back to defaults.

    Args:
        file_path: Path (or anything ``pandas.read_csv`` accepts) of the CSV.

    Returns:
        A list of dicts, each with a ``"content"`` string and a ``"metadata"``
        dict holding ``name``, ``brand``, ``rating`` (as a string) and
        ``price``.

    Raises:
        KeyError: If the CSV has no ``reviews.text`` or ``name`` column.
    """
    df = pd.read_csv(file_path)

    # A document is useless without a product name and review body.
    df = df.dropna(subset=['reviews.text', 'name'])

    documents = []
    for _, row in df.iterrows():
        name = row['name']
        text = row['reviews.text']
        brand = _cell(row, 'brand', 'Unknown')
        categories = _cell(row, 'categories', 'N/A')
        title = _cell(row, 'reviews.title', '')
        rating = _cell(row, 'reviews.rating', 'N/A')
        price_str = _format_best_price(_parse_prices(row.get('prices')))

        doc_content = (
            f"Product: {name}\n"
            f"Brand: {brand}\n"
            f"Categories: {categories}\n"
            f"Price: {price_str}\n"
            f"Review Title: {title}\n"
            f"Rating: {rating}\n"
            f"Review Content: {text}"
        )
        metadata = {
            "name": name,
            "brand": brand,
            "rating": str(rating),
            "price": price_str,
        }
        documents.append({"content": doc_content, "metadata": metadata})

    return documents


if __name__ == "__main__":
    docs = preprocess_data("7817_1.csv")
    with open("preprocessed_docs.json", "w", encoding="utf-8") as f:
        json.dump(docs, f, indent=2)
    print(f"Preprocessed {len(docs)} documents.")