Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| import json | |
def _clean_field(value, default):
    """Return *value*, or *default* when pandas reports it as missing (NaN/None)."""
    return default if pd.isna(value) else value


def _is_finite_amount(amount):
    """Return True for real, finite numbers (rejects bool, NaN and infinities)."""
    if isinstance(amount, bool) or not isinstance(amount, (int, float)):
        return False
    # Both comparisons are False for NaN, so NaN is rejected as well.
    return float('-inf') < amount < float('inf')


def _format_best_price(prices_raw):
    """Return ``"<amount> <currency>"`` for the cheapest offer in a JSON price list.

    Returns ``None`` when *prices_raw* is not a string, cannot be parsed as
    JSON, is not a list, or contains no entry with a finite numeric
    ``amountMin``.
    """
    if not isinstance(prices_raw, str):
        return None

    prices_data = None
    # Parse the cell as-is first: blindly collapsing '""' corrupts JSON that
    # contains legitimately empty strings (e.g. "merchant": ""). Only fall
    # back to un-doubling quotes when the raw text does not parse.
    for candidate in (prices_raw, prices_raw.replace('""', '"')):
        try:
            prices_data = json.loads(candidate)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            continue
        break

    if not isinstance(prices_data, list):
        return None

    offers = [
        entry for entry in prices_data
        if isinstance(entry, dict) and _is_finite_amount(entry.get('amountMin'))
    ]
    if not offers:
        return None

    # Take the currency from the same entry as the amount so the two always
    # belong together.
    cheapest = min(offers, key=lambda offer: offer['amountMin'])
    currency = cheapest.get('currency') or 'USD'
    return f"{cheapest['amountMin']} {currency}"


def preprocess_data(file_path):
    """Turn a product-review CSV into a list of text documents with metadata.

    The CSV must contain ``name`` and ``reviews.text`` columns; rows missing
    either are dropped. The optional columns ``brand``, ``categories``,
    ``reviews.title``, ``reviews.rating`` and ``prices`` are used when
    present, and missing or empty cells fall back to a placeholder instead
    of the literal text ``nan``.

    Args:
        file_path: Anything ``pandas.read_csv`` accepts (path or file-like).

    Returns:
        A list of dicts, one per kept row, each with:
        ``"content"`` - a multi-line string describing product and review;
        ``"metadata"`` - a dict with ``name``, ``brand``, ``rating`` (as a
        string) and ``price``.
    """
    df = pd.read_csv(file_path)

    # A document without a product name or review body is useless downstream.
    df = df.dropna(subset=['reviews.text', 'name'])

    documents = []
    for _, row in df.iterrows():
        name = row['name']
        text = row['reviews.text']

        # row.get() only covers an absent column; _clean_field also covers
        # a column that exists but holds NaN for this row.
        brand = _clean_field(row.get('brand'), 'Unknown')
        categories = _clean_field(row.get('categories'), 'N/A')
        title = _clean_field(row.get('reviews.title'), '')
        rating = _clean_field(row.get('reviews.rating'), 'N/A')

        price_str = _format_best_price(row.get('prices')) or "Price info not available"

        doc_content = (
            f"Product: {name}\n"
            f"Brand: {brand}\n"
            f"Categories: {categories}\n"
            f"Price: {price_str}\n"
            f"Review Title: {title}\n"
            f"Rating: {rating}\n"
            f"Review Content: {text}"
        )
        metadata = {
            "name": name,
            "brand": brand,
            "rating": str(rating),
            "price": price_str,
        }
        documents.append({"content": doc_content, "metadata": metadata})

    return documents
if __name__ == "__main__":
    # Script entry point: convert the raw review export into documents and
    # store them as pretty-printed JSON next to the script.
    processed = preprocess_data("7817_1.csv")

    with open("preprocessed_docs.json", "w") as out_file:
        json.dump(processed, out_file, indent=2)

    # Report how many rows survived preprocessing.
    print(f"Preprocessed {len(processed)} documents.")