#!/usr/bin/env python3
"""
Kaggle 'Amazon Books Reviews' Processor (Ratings Only Mode)
Adaptable to missing metadata file.
"""
import os
import pandas as pd
import json
import zipfile
from tqdm import tqdm
class KaggleBooksProcessor:
    """Convert the Kaggle 'Amazon Books Reviews' ratings CSV into JSON-lines
    item-metadata and interaction files.

    Designed to work without the optional books_data.csv metadata file: the
    review 'Title' column doubles as both item id and description.
    """

    def __init__(self, data_dir='amazon_data'):
        """Set up input/output paths under *data_dir*; create the output dir."""
        self.data_dir = data_dir
        self.output_dir = os.path.join(data_dir, 'processed')
        os.makedirs(self.output_dir, exist_ok=True)
        self.zip_file = os.path.join(data_dir, 'Books_rating.csv.zip')
        self.rating_file = os.path.join(data_dir, 'Books_rating.csv')
        self.meta_file = os.path.join(data_dir, 'books_data.csv')  # Optional

    def check_and_unzip(self):
        """Ensure the ratings CSV exists, extracting the zip if necessary.

        Returns:
            True when the CSV is (now) available, False otherwise.
        """
        if os.path.exists(self.rating_file):
            return True
        if os.path.exists(self.zip_file):
            print(f"Unzipping {self.zip_file}...")
            with zipfile.ZipFile(self.zip_file, 'r') as zip_ref:
                zip_ref.extractall(self.data_dir)
            # Verify extraction actually produced the expected CSV instead of
            # unconditionally reporting success.
            return os.path.exists(self.rating_file)
        print(f"❌ File not found: {self.rating_file} or {self.zip_file}")
        return False

    def run(self, sample_size=200000):
        """Process the ratings CSV into metadata/interaction JSONL files.

        Args:
            sample_size: number of CSV rows to read; falsy reads the full file.
        """
        print(f"Processing Data in {self.data_dir}...")
        if not self.check_and_unzip():
            return
        # tqdm is only a progress bar; degrade gracefully when it is absent.
        try:
            from tqdm import tqdm
        except ImportError:
            def tqdm(iterable, **kwargs):
                return iterable
        # 1. Load ratings.
        # Columns: Id, Title, Price, User_id, profileName, review/helpfulness,
        # review/score, review/time, review/summary, review/text
        print("Loading Ratings (Books_rating.csv)...")
        if sample_size:
            # Sampling keeps demo runs fast on the multi-GB full dataset.
            df = pd.read_csv(self.rating_file, nrows=sample_size)
        else:
            df = pd.read_csv(self.rating_file)
        print(f"Loaded {len(df)} records.")
        # 2. Build item metadata and interactions in a single pass; without
        # books_data.csv the review 'Title' is the only metadata we have.
        print("Extracting Metadata & Interactions...")
        interactions = []
        items_dict = {}
        for _, row in tqdm(df.iterrows(), total=len(df)):
            try:
                title = str(row['Title']).strip()
                if not title or title.lower() == 'nan':
                    continue
                item_id = title  # Title doubles as the item identifier.
                score = float(row['review/score'])  # hoisted: used twice below
                if item_id not in items_dict:
                    price = row.get('Price', 'Unknown')
                    items_dict[item_id] = {
                        'item_id': item_id,
                        'title': title,
                        'category': 'Books',  # default; real categories need books_data.csv
                        # Title text serves as 'description' for basic semantic matching.
                        'description': f"Title: {title}. Price: {price}.",
                        'price': price,
                    }
                interactions.append({
                    'user_id': str(row['User_id']),
                    'item_id': item_id,
                    'rating': score,
                    'interested': 'Yes' if score >= 4.0 else 'No',
                    'timestamp': row.get('review/time', 0),
                })
            except (KeyError, ValueError, TypeError):
                # Skip malformed rows (missing columns, unparsable scores)
                # without the original bare `except:` that hid real bugs.
                continue
        # 3. Save as JSON-lines.
        meta_out = pd.DataFrame(list(items_dict.values()))
        inter_out = pd.DataFrame(interactions)
        m_path = os.path.join(self.output_dir, 'kaggle_books_metadata.json')
        i_path = os.path.join(self.output_dir, 'kaggle_books_interactions.json')
        meta_out.to_json(m_path, orient='records', lines=True)
        inter_out.to_json(i_path, orient='records', lines=True)
        print(f"Done! Saved {len(meta_out)} items and {len(inter_out)} interactions.")
        print(f" -> {m_path}")
        print(f" -> {i_path}")
if __name__ == "__main__":
    # Cap the read at 200k rows so a demo run finishes quickly.
    processor = KaggleBooksProcessor()
    processor.run(sample_size=200000)