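"""
Data loading for the book recommender.

Loads the processed books CSV if it exists; otherwise aggregates the raw
Amazon `Books_rating.csv` reviews into one row per book, saves the processed
CSV, and writes a descriptions file for the vector DB.
"""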
import pandas as pd
import numpy as np
import os
from pathlib import Path
from src.config import DATA_DIR, COVER_NOT_FOUND
from src.utils import setup_logger
logger = setup_logger(__name__)
RAW_DATA_PATH = DATA_DIR / "raw" / "Books_rating.csv"
PROCESSED_DATA_PATH = DATA_DIR / "books_processed.csv"
REVIEW_HIGHLIGHTS_PATH = DATA_DIR / "review_highlights.txt"
DESCRIPTIONS_PATH = DATA_DIR / "books_descriptions.txt"
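
# DATA_DIR and COVER_NOT_FOUND are imported from src.config, which is not part
# of this file. A minimal sketch of what this module appears to assume (the
# concrete paths are illustrative, not confirmed by the source):
#
#     # src/config.py
#     from pathlib import Path
#     DATA_DIR = Path("data")
#     COVER_NOT_FOUND = DATA_DIR / "cover-not-found.jpg"
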
def load_books_data() -> pd.DataFrame:
    """
    Load and preprocess the Amazon books dataset.

    If the processed file exists, load it; otherwise, process the raw data.
    """
    try:
        # Check if processed data exists
        if PROCESSED_DATA_PATH.exists():
            logger.info(f"Loading processed data from {PROCESSED_DATA_PATH}")
            books = pd.read_csv(PROCESSED_DATA_PATH)
            # Ensure thumbnails are present, falling back to the placeholder cover
            books["large_thumbnail"] = books["thumbnail"].fillna(str(COVER_NOT_FOUND))
            return books
        # Process raw data
        if not RAW_DATA_PATH.exists():
            logger.warning(f"Raw data file not found at {RAW_DATA_PATH}. Returning an empty DataFrame.")
            # Return an empty DataFrame with the expected columns to avoid downstream crashes
            return pd.DataFrame(columns=[
                'isbn13', 'title', 'description', 'average_rating',
                'authors', 'thumbnail', 'simple_categories',
                'joy', 'sadness', 'fear', 'anger', 'surprise',
                'large_thumbnail', 'tags', 'review_highlights',
            ])
        logger.info(f"Processing raw data from {RAW_DATA_PATH}...")

        # Load raw data (chunking is unnecessary: at ~200 MB the file fits in memory).
        # Columns: Id, Title, Price, User_id, profileName, review/helpfulness,
        #          review/score, review/time, review/summary, review/text
        df = pd.read_csv(RAW_DATA_PATH)

        # Data cleaning: fill missing titles and review text
        df['Title'] = df['Title'].fillna("Unknown Title")
        df['review/text'] = df['review/text'].fillna("")
        df['review/summary'] = df['review/summary'].fillna("")
        # Aggregation strategy: group by book ID (Id).
        # The dataset contains only reviews, so a "description" has to be
        # synthesized; here the first three reviews per book stand in for it.
        logger.info("Grouping reviews by book...")

        def aggregate_text(series):
            # Sorting by helpfulness or length would favour more detailed
            # reviews; for now, simply concatenate the first three.
            texts = series.head(3).tolist()
            return " ".join([str(t) for t in texts])[:1000]  # Cap at 1000 chars

        grouped = df.groupby('Id').agg({
            'Title': 'first',
            'review/text': aggregate_text,
            'review/score': 'mean'
        }).reset_index()
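
        # After this groupby there is exactly one row per Id: `Title` keeps the
        # first value seen, `review/text` holds up to 1000 characters of
        # concatenated review text, and `review/score` is the mean rating
        # across that book's reviews.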
        # Rename columns to match the schema expected by the recommender
        grouped = grouped.rename(columns={
            'Id': 'isbn13',
            'Title': 'title',
            'review/text': 'description',
            'review/score': 'average_rating'
        })

        # Add missing columns with defaults (to be filled by future upgrades).
        # 'tags' and 'review_highlights' are included so the processed frame
        # matches the empty-frame schema returned above.
        grouped['authors'] = "Unknown"
        grouped['thumbnail'] = str(COVER_NOT_FOUND)
        grouped['simple_categories'] = "General"  # Default category
        grouped['tags'] = ""
        grouped['review_highlights'] = ""

        # Add emotion columns (placeholders)
        for emotion in ['joy', 'sadness', 'fear', 'anger', 'surprise']:
            grouped[emotion] = 0.0

        # Save processed data
        logger.info(f"Saving processed data to {PROCESSED_DATA_PATH}")
        grouped.to_csv(PROCESSED_DATA_PATH, index=False)
        # Generate the descriptions TXT for the vector DB.
        # Format: one "<isbn13> <description>" pair per line.
        logger.info(f"Generating descriptions file at {DESCRIPTIONS_PATH}")
        with open(DESCRIPTIONS_PATH, 'w', encoding='utf-8') as f:
            for _, row in grouped.iterrows():
                # Strip newlines so each record stays on a single line
                clean_desc = str(row['description']).replace('\n', ' ')
                f.write(f"{row['isbn13']} {clean_desc}\n")

        # Final processing for return
        grouped["large_thumbnail"] = grouped["thumbnail"]
        return grouped
    except Exception as e:
        logger.error(f"Error processing books data: {str(e)}")
        raise
if __name__ == "__main__":
    load_books_data()
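
# How the descriptions file is consumed downstream is not shown in this
# module. A minimal sketch of a reader, assuming the "<isbn13> <description>"
# line format written above (the function name is illustrative, not from the
# source):
#
#     def iter_descriptions(path=DESCRIPTIONS_PATH):
#         with open(path, encoding="utf-8") as f:
#             for line in f:
#                 isbn13, _, desc = line.rstrip("\n").partition(" ")
#                 yield isbn13, desc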