Spaces:

meetkss
/

Book-Recommendation-Engine

Running

Suriya

First commit

8807f0d 4 months ago

1.69 kB

	import pandas as pd
	import os

	class DataLoader:
	    """Load a book dataset from CSV and prepare it for text-based use.

	    The CSV is expected to provide the columns ``book_name``,
	    ``summaries`` and ``categories``; these are the only columns this
	    class reads.
	    """

	    # Row-label distance at which two rows of the same book are treated
	    # as a duplicated pair by ``load_data``.
	    # NOTE(review): presumably a quirk of how the source dataset was
	    # assembled -- confirm against the data before changing.
	    DUPLICATE_ROW_GAP = 9

	    def __init__(self, data_path: str):
	        """Remember the location of the CSV file; nothing is read yet.

	        Args:
	            data_path: Path to the CSV dataset.
	        """
	        self.data_path = data_path

	    def load_data(self) -> pd.DataFrame:
	        """Read the CSV and drop unusable and duplicated rows.

	        Rows without a summary are discarded and the index is renumbered
	        from zero. Then, within each book, two rows whose labels differ
	        by exactly ``DUPLICATE_ROW_GAP`` and whose ``categories`` and
	        ``summaries`` are equal are both removed, provided the book still
	        has at least one other row left.

	        Returns:
	            The cleaned frame with a fresh zero-based index.

	        Raises:
	            FileNotFoundError: If ``data_path`` does not exist.
	        """
	        if not os.path.exists(self.data_path):
	            raise FileNotFoundError(
	                f"Dataset not found at {self.data_path}"
	            )

	        df = pd.read_csv(self.data_path)
	        df = df.dropna(subset=["summaries"]).reset_index(drop=True)

	        to_remove = self._find_duplicate_rows(df)
	        return df.drop(index=sorted(to_remove)).reset_index(drop=True)

	    def _find_duplicate_rows(self, df: pd.DataFrame) -> set:
	        """Return the row labels that ``load_data`` should discard."""
	        gap = self.DUPLICATE_ROW_GAP
	        to_remove = set()

	        for _, group in df.groupby("book_name"):
	            # A pair is only removed when the book keeps another row, so
	            # groups of one or two rows can never contribute anything.
	            if len(group) < 3:
	                continue

	            labels = group.index
	            categories = group["categories"]
	            summaries = group["summaries"]

	            for i in labels:
	                # Only the row exactly ``gap`` labels further on can pair
	                # with row ``i``; look it up directly instead of scanning
	                # every pair of rows in the group.
	                j = i + gap
	                if j not in labels:
	                    continue
	                # Plain ``==`` keeps NaN != NaN, so rows with missing
	                # categories are never treated as duplicates.
	                if (
	                    categories.at[i] == categories.at[j]
	                    and summaries.at[i] == summaries.at[j]
	                ):
	                    to_remove.update((i, j))

	        return to_remove

	    def preprocess(self, df: pd.DataFrame) -> pd.DataFrame:
	        """Collapse the frame to one row per book and build its text.

	        For each ``book_name`` the first non-missing summary is kept and
	        the distinct categories are joined with ``", "`` in order of
	        first appearance. Missing categories are skipped, so a book with
	        no category gets an empty string. A ``combined_text`` column is
	        then built from the summary and the joined categories.

	        Args:
	            df: Frame with ``book_name``, ``summaries`` and
	                ``categories`` columns.

	        Returns:
	            A frame with the columns ``book_name``, ``summaries``,
	            ``categories`` and ``combined_text``, one row per book.
	        """

	        def join_categories(values: pd.Series) -> str:
	            # dict.fromkeys de-duplicates while keeping a stable order; a
	            # set would make the text differ from run to run. Dropping
	            # NaN first avoids str.join failing on a float.
	            unique = dict.fromkeys(values.dropna())
	            return ", ".join(str(value) for value in unique)

	        df = (
	            df.groupby("book_name")
	            .agg({"summaries": "first", "categories": join_categories})
	            .reset_index()
	        )

	        df["combined_text"] = (
	            "Summary of the book: " + df["summaries"].fillna("") + " "
	            + "Categories/Genre of the book: " + df["categories"].fillna("")
	        )

	        return df