File size: 2,115 Bytes

4416e3b

import pandas as pd
import json

_PRICE_UNAVAILABLE = "Price info not available"


def _field_or_default(row, column, default):
    """Return ``row[column]``, or *default* if the column or its value is missing."""
    value = row.get(column, default)
    # ``Series.get`` only falls back when the column is absent; an empty CSV
    # cell is read as NaN and would otherwise surface as "nan" in the output.
    return default if pd.isna(value) else value


def _decode_prices(prices_raw):
    """Parse the raw ``prices`` cell as JSON; return ``None`` if it cannot be parsed."""
    if not isinstance(prices_raw, str):
        return None
    # Try the text unchanged first: collapsing '""' unconditionally corrupts
    # valid JSON that contains empty strings. The collapsed form is only a
    # fallback for cells that still carry CSV-style doubled quotes.
    for candidate in (prices_raw, prices_raw.replace('""', '"')):
        try:
            return json.loads(candidate)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            continue
    return None


def _finite_number(value):
    """Return *value* as a finite float, or ``None`` if it is not numeric."""
    if isinstance(value, bool) or not isinstance(value, (int, float, str)):
        return None
    try:
        number = float(value)
    except (ValueError, OverflowError):
        return None
    # The chained comparison is False for NaN and for +/- infinity.
    return number if float('-inf') < number < float('inf') else None


def _format_best_price(prices_raw):
    """Format the lowest ``amountMin`` offer as ``"<amount> <currency>"``.

    Returns the "Price info not available" placeholder when the cell is
    empty, is not a JSON list, or holds no offer with a numeric ``amountMin``.
    """
    prices_data = _decode_prices(prices_raw)
    if not isinstance(prices_data, list):
        return _PRICE_UNAVAILABLE

    best = None  # (numeric amount, amount as stored, currency)
    for offer in prices_data:
        if not isinstance(offer, dict):
            continue
        amount = offer.get('amountMin')
        number = _finite_number(amount)
        if number is None:
            continue
        if best is None or number < best[0]:
            # Keep the currency of the winning offer so the label always
            # matches the amount. Amounts are compared as plain numbers;
            # no currency conversion is attempted.
            best = (number, amount, offer.get('currency') or 'USD')

    if best is None:
        return _PRICE_UNAVAILABLE
    _, amount, currency = best
    return f"{amount} {currency}"


def preprocess_data(file_path):
    """Turn a product-review CSV into a list of text documents with metadata.

    Rows without a ``name`` or ``reviews.text`` value are dropped. Missing
    optional fields fall back to placeholders: brand ``'Unknown'``,
    categories ``'N/A'``, review title ``''``, rating ``'N/A'``.

    Args:
        file_path: Anything ``pandas.read_csv`` accepts (a path or a
            file-like object).

    Returns:
        A list of dicts, one per retained row, each with:
        ``"content"`` - a multi-line string describing product and review;
        ``"metadata"`` - a dict with ``name``, ``brand``, ``rating`` (as a
        string) and ``price``.

    Raises:
        KeyError: If the CSV has no ``name`` or ``reviews.text`` column
            (raised by ``DataFrame.dropna``).
    """
    df = pd.read_csv(file_path)

    # A document is only useful with a product name and a review body.
    df = df.dropna(subset=['reviews.text', 'name'])

    documents = []
    for _, row in df.iterrows():
        name = row['name']
        text = row['reviews.text']
        brand = _field_or_default(row, 'brand', 'Unknown')
        categories = _field_or_default(row, 'categories', 'N/A')
        title = _field_or_default(row, 'reviews.title', '')
        rating = _field_or_default(row, 'reviews.rating', 'N/A')
        price_str = _format_best_price(row.get('prices'))

        doc_content = f"Product: {name}\nBrand: {brand}\nCategories: {categories}\nPrice: {price_str}\nReview Title: {title}\nRating: {rating}\nReview Content: {text}"

        metadata = {
            "name": name,
            "brand": brand,
            "rating": str(rating),
            "price": price_str
        }

        documents.append({"content": doc_content, "metadata": metadata})

    return documents

if __name__ == "__main__":
    # Script entry point: convert the bundled review CSV into a JSON file of
    # documents and report how many were produced.
    source_csv, target_json = "7817_1.csv", "preprocessed_docs.json"
    records = preprocess_data(source_csv)
    with open(target_json, "w") as sink:
        json.dump(records, sink, indent=2)
    print(f"Preprocessed {len(records)} documents.")