#!/usr/bin/env python3
"""
Kaggle 'Amazon Books Reviews' Processor (Ratings Only Mode)
Adaptable to missing metadata file.
"""
import os
import pandas as pd
import json
import zipfile
from tqdm import tqdm
class KaggleBooksProcessor:
    """Process the Kaggle 'Amazon Books Reviews' ratings CSV into
    item-metadata and interaction JSON-lines files.

    Works without the optional books_data.csv metadata file by deriving
    per-item metadata from the 'Title' column of the ratings file.
    """

    def __init__(self, data_dir='amazon_data'):
        # Root directory holding the raw Kaggle files; processed output
        # goes into a 'processed' subdirectory created on demand.
        self.data_dir = data_dir
        self.output_dir = os.path.join(data_dir, 'processed')
        os.makedirs(self.output_dir, exist_ok=True)
        self.zip_file = os.path.join(data_dir, 'Books_rating.csv.zip')
        self.rating_file = os.path.join(data_dir, 'Books_rating.csv')
        self.meta_file = os.path.join(data_dir, 'books_data.csv')  # Optional

    def check_and_unzip(self):
        """Ensure the ratings CSV exists, extracting the zip if needed.

        Returns:
            bool: True when the CSV is available, False otherwise.
        """
        if not os.path.exists(self.rating_file):
            if os.path.exists(self.zip_file):
                print(f"Unzipping {self.zip_file}...")
                with zipfile.ZipFile(self.zip_file, 'r') as zip_ref:
                    zip_ref.extractall(self.data_dir)
            else:
                print(f"❌ File not found: {self.rating_file} or {self.zip_file}")
                return False
        return True

    def _clean(self, df):
        """Drop rows with unusable titles or non-numeric scores and
        default the optional 'Price' / 'review/time' columns."""
        df = df.copy()
        # Normalize titles; a NaN title stringifies to the literal 'nan'.
        df['Title'] = df['Title'].astype(str).str.strip()
        df = df[(df['Title'] != '') & (df['Title'].str.lower() != 'nan')]
        # Coerce scores; rows whose score is missing or non-numeric are
        # dropped explicitly (the old bare-except loop hid such rows or
        # let NaN ratings through).
        df['review/score'] = pd.to_numeric(df['review/score'], errors='coerce')
        df = df.dropna(subset=['review/score'])
        if 'Price' not in df.columns:
            df['Price'] = 'Unknown'
        if 'review/time' not in df.columns:
            df['review/time'] = 0
        return df

    def _build_items(self, df):
        """One metadata record per unique title (first occurrence wins),
        using the title as both item id and description text."""
        first = df.drop_duplicates(subset='Title', keep='first')
        return pd.DataFrame({
            'item_id': first['Title'],
            'title': first['Title'],
            'category': 'Books',  # Default: this dataset is books-only
            # Title doubles as a description for basic semantic matching.
            'description': ('Title: ' + first['Title'] + '. Price: '
                            + first['Price'].astype(str) + '.'),
            'price': first['Price'],
        })

    def _build_interactions(self, df):
        """One interaction record per rating row."""
        return pd.DataFrame({
            'user_id': df['User_id'].astype(str),
            'item_id': df['Title'],
            'rating': df['review/score'].astype(float),
            # Scores >= 4.0 are treated as a positive ("interested") signal.
            'interested': df['review/score'].ge(4.0).map({True: 'Yes', False: 'No'}),
            'timestamp': df['review/time'],
        })

    def run(self, sample_size=200000):
        """Load (a sample of) the ratings CSV and write metadata and
        interaction JSON-lines files into the output directory.

        Args:
            sample_size: Maximum number of rows to read, or None/0 to
                read the full file.
        """
        print(f"Processing Data in {self.data_dir}...")
        if not self.check_and_unzip():
            return
        # 1. Load Ratings
        print("Loading Ratings (Books_rating.csv)...")
        # Columns: Id, Title, Price, User_id, profileName,
        # review/helpfulness, review/score, review/time,
        # review/summary, review/text
        df = pd.read_csv(self.rating_file, nrows=sample_size or None)
        print(f"Loaded {len(df)} records.")
        # 2. Extract Items & Interactions. Vectorized pandas operations
        # replace the former per-row iterrows() loop (much faster on
        # 200k rows) and its bare 'except: continue', which silently
        # swallowed every error.
        print("Extracting Metadata & Interactions...")
        df = self._clean(df)
        meta_out = self._build_items(df)
        inter_out = self._build_interactions(df)
        # 3. Save
        m_path = os.path.join(self.output_dir, 'kaggle_books_metadata.json')
        i_path = os.path.join(self.output_dir, 'kaggle_books_interactions.json')
        meta_out.to_json(m_path, orient='records', lines=True)
        inter_out.to_json(i_path, orient='records', lines=True)
        print(f"Done! Saved {len(meta_out)} items and {len(inter_out)} interactions.")
        print(f" -> {m_path}")
        print(f" -> {i_path}")
| if __name__ == "__main__": | |
| p = KaggleBooksProcessor() | |
| # Process 200k rows for efficiency | |
| p.run(sample_size=200000) | |