Spaces:

joshstrupp
/

Self-Help-Book-Recommendation-Engine

Sleeping

Self-Help-Book-Recommendation-Engine / app.py

Josh Strupp

update app

077a7f8 9 months ago

9.17 kB

	import gradio as gr
	import pandas as pd
	import numpy as np
	from pathlib import Path
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.metrics.pairwise import cosine_similarity

	# ---------------------------------------------------------------------------
	# 0. LOAD DATA PRE-GENERATED BY THE OFFLINE PIPELINE
	# ---------------------------------------------------------------------------
	# Locations of the CSV files produced by the offline pipeline.
	BOOKS_CSV = Path("self_help_books.csv")
	REVIEWS_CSV = Path("self_help_reviews.csv")  # may be absent - optional

	# The book catalogue is mandatory; a missing file fails loudly at start-up.
	df_books = pd.read_csv(BOOKS_CSV)

	# Reviews are optional: fall back to an empty frame when the file is absent.
	if REVIEWS_CSV.exists():
	    df_reviews = pd.read_csv(REVIEWS_CSV)
	else:
	    df_reviews = pd.DataFrame()

	# ---------------------------------------------------------------------------
	# 1. VERY LIGHT TEXT PRE-PROCESSING + TF-IDF FEATURES
	# ---------------------------------------------------------------------------
	def _prep(text: str) -> str:
	"""Lower-case & cast NaNs to an empty string."""
	return str(text).lower() if pd.notnull(text) else ""

	# Assemble one searchable text per book from its summary, genres and primary
	# category - skipped when the pipeline already shipped a "combined_text" column.
	if "combined_text" not in df_books.columns:
	    df_books["combined_text"] = (
	        df_books["summary"].apply(_prep)
	        .add(" ")
	        .add(df_books["genres"].apply(_prep))
	        .add(" ")
	        .add(df_books["key_cat_primary"].apply(_prep))
	    )

	# Fit the TF-IDF vocabulary on the catalogue once; queries are later projected
	# into the same space with vectorizer.transform().
	vectorizer = TfidfVectorizer(max_features=50_000, stop_words="english")
	X_BOOKS = vectorizer.fit_transform(df_books["combined_text"])

	# ---------------------------------------------------------------------------
	# 2. AUTHOR-LEVEL AGGREGATION (fallbacks if columns are missing)
	# ---------------------------------------------------------------------------
	# Per-author statistics: mean helpful ratio and summed review count over all
	# of an author's books.
	if {"helpful_ratio", "total_reviews"}.issubset(df_books.columns):
	    author_stats = (
	        df_books.groupby("author_clean")
	        .agg(
	            helpful_ratio=("helpful_ratio", "mean"),
	            total_reviews=("total_reviews", "sum"),
	        )
	        .reset_index()
	    )
	else:  # keep the code functional even without those columns
	    # Give the empty fallback explicit numeric dtypes. A bare
	    # pd.DataFrame(columns=[...]) yields object-dtype columns, which after a
	    # merge cannot be ranked with nlargest() unless pandas silently downcasts.
	    author_stats = pd.DataFrame({
	        "author_clean": pd.Series(dtype="object"),
	        "helpful_ratio": pd.Series(dtype="float64"),
	        "total_reviews": pd.Series(dtype="float64"),
	    })

	# ---------------------------------------------------------------------------
	# 3. MAIN RECOMMENDATION FUNCTIONS
	# ---------------------------------------------------------------------------
	def _sample_reviews(book_name, flag_column: str, limit: int) -> list:
	    """Return up to `limit` review texts of `book_name` whose `flag_column` is true."""
	    # .eq(True) yields a clean boolean mask even when the flag column holds
	    # NaN or 0/1 values instead of strict booleans.
	    mask = (df_reviews["name"] == book_name) & df_reviews[flag_column].eq(True)
	    available = int(mask.sum())
	    if available == 0 or limit <= 0:
	        return []
	    return (
	        df_reviews[mask]
	        .sample(min(limit, available), random_state=42)["review_text"]
	        .tolist()
	    )


	def recommend_books(user_issue: str,
	                    top_n: int = 5,
	                    reviews_per_book: int = 2,
	                    min_reviews: int = 10) -> pd.DataFrame:
	    """
	    Blend topical similarity (70 %) with helpfulness (30 %)
	    and return the `top_n` books best suited to `user_issue`.

	    Args:
	        user_issue: Free-text description of the concern; None is treated as "".
	        top_n: Maximum number of books to return.
	        reviews_per_book: Maximum helpful / harmful reviews sampled per book.
	        min_reviews: Books with fewer `total_reviews` are dropped (only applied
	            when the catalogue has a `total_reviews` column).

	    Returns:
	        DataFrame with one row per book and the columns Book, Author,
	        Star_Rating, Price, Helpful_Ratio, Similarity, Helpful Reviews and
	        Harmful Reviews (the last two hold lists of review texts).
	    """
	    # UI sliders may hand over floats; pandas requires integer counts.
	    top_n = int(top_n)
	    reviews_per_book = int(reviews_per_book)

	    # ---- similarity -------------------------------------------------------
	    query_vec = vectorizer.transform([str(user_issue or "").lower()])
	    similarity = cosine_similarity(query_vec, X_BOOKS).ravel()

	    df_temp = df_books.copy()
	    df_temp["similarity"] = similarity
	    # Without a helpful_ratio column the helpfulness term contributes nothing.
	    if "helpful_ratio" in df_temp.columns:
	        df_temp["helpful_ratio_filled"] = df_temp["helpful_ratio"].fillna(0)
	    else:
	        df_temp["helpful_ratio_filled"] = 0.0

	    if "total_reviews" in df_temp.columns:
	        # .copy() so the score assignment below writes to an independent frame.
	        df_temp = df_temp[df_temp["total_reviews"] >= min_reviews].copy()

	    df_temp["score"] = (
	        0.70 * df_temp["similarity"] +
	        0.30 * df_temp["helpful_ratio_filled"]
	    )

	    top_books = df_temp.nlargest(top_n, "score").reset_index(drop=True)

	    # ---- representative reviews ------------------------------------------
	    # Sample reviews only if every column the sampling touches is present.
	    needed_columns = {"name", "review_text", "is_helpful", "is_harmful"}
	    have_reviews = (
	        not df_reviews.empty and needed_columns.issubset(df_reviews.columns)
	    )

	    results = []
	    for _, row in top_books.iterrows():
	        name = row.get("name", row.get("Book", ""))
	        author = row.get("author_clean", row.get("Author", ""))
	        if have_reviews:
	            helpful_reviews = _sample_reviews(name, "is_helpful", reviews_per_book)
	            harmful_reviews = _sample_reviews(name, "is_harmful", reviews_per_book)
	        else:
	            helpful_reviews, harmful_reviews = [], []

	        results.append({
	            "Book": name,
	            "Author": author,
	            "Star_Rating": row.get("star_rating", np.nan),
	            "Price": row.get("kindle_price_clean", np.nan),
	            "Helpful_Ratio": round(row.get("helpful_ratio", 0), 3),
	            "Similarity": round(row["similarity"], 3),
	            "Helpful Reviews": helpful_reviews,
	            "Harmful Reviews": harmful_reviews,
	        })

	    # Name the columns explicitly so an empty result keeps the same shape.
	    return pd.DataFrame(results, columns=[
	        "Book", "Author", "Star_Rating", "Price", "Helpful_Ratio",
	        "Similarity", "Helpful Reviews", "Harmful Reviews",
	    ])


	def recommend_authors(user_issue: str,
	                      top_n: int = 5,
	                      min_reviews: int = 30):
	    """
	    Return two DataFrames:
	    • authors likely to be helpful (mean helpful ratio >= 0.5)
	    • authors you might approach with caution (mean helpful ratio < 0.5)
	    Ranking = 70 % topical relevance + 30 % helpfulness.

	    Args:
	        user_issue: Free-text description of the concern; None is treated as "".
	        top_n: Maximum number of authors in each returned frame.
	        min_reviews: Authors with fewer total reviews are excluded. Authors
	            without any statistics count as having 0 reviews.

	    Returns:
	        Tuple ``(helpful_authors, risky_authors)``; each frame has the columns
	        author_clean, max_sim, helpful_ratio, total_reviews and score.
	    """
	    top_n = int(top_n)  # UI sliders may hand over floats

	    query_vec = vectorizer.transform([str(user_issue or "").lower()])
	    similarity = cosine_similarity(query_vec, X_BOOKS).ravel()

	    rel_df = pd.DataFrame({
	        "author_clean": df_books["author_clean"],
	        "sim_to_issue": similarity,
	    })

	    # An author is as relevant as their single most relevant book.
	    author_relevance = (
	        rel_df.groupby("author_clean")
	        .agg(max_sim=("sim_to_issue", "max"))
	        .reset_index()
	    )

	    merged = author_relevance.merge(author_stats, on="author_clean", how="left")
	    # Force numeric dtypes: an empty fallback `author_stats` can leave these
	    # columns as object dtype, which nlargest() refuses to rank.
	    for stat_column in ("helpful_ratio", "total_reviews"):
	        merged[stat_column] = (
	            pd.to_numeric(merged[stat_column], errors="coerce").fillna(0)
	        )

	    # .copy() so the score assignment below writes to an independent frame.
	    merged = merged[merged["total_reviews"] >= min_reviews].copy()

	    merged["score"] = 0.70 * merged["max_sim"] + 0.30 * merged["helpful_ratio"]

	    is_helpful = merged["helpful_ratio"] >= 0.5

	    helpful_authors = (
	        merged[is_helpful]
	        .nlargest(top_n, "score")
	        .reset_index(drop=True)
	    )

	    risky_authors = (
	        merged[~is_helpful]
	        .nlargest(top_n, "score")
	        .reset_index(drop=True)
	    )

	    return helpful_authors, risky_authors


	# ---------------------------------------------------------------------------
	# 4. GRADIO GLUE – format nicely & expose a simple interface
	# ---------------------------------------------------------------------------
	def _format_output(books_df, good_authors, bad_authors) -> str:
	txt = "=== RECOMMENDED BOOKS ===\n\n"
	for _, bk in books_df.iterrows():
	txt += f"📚 {bk['Book']}\n"
	txt += f"👤 Author: {bk['Author']}\n"
	txt += f"⭐ Rating: {bk['Star_Rating']}\n"
	txt += f"💰 Price: ${bk['Price']}\n"
	txt += f"📊 Helpful Ratio: {bk['Helpful_Ratio']:.2f}\n"
	if bk["Helpful Reviews"]:
	txt += "\n✅ Helpful Reviews:\n"
	for rv in bk["Helpful Reviews"]:
	txt += f"• {rv}\n"
	if bk["Harmful Reviews"]:
	txt += "\n⚠️ Critical Reviews:\n"
	for rv in bk["Harmful Reviews"]:
	txt += f"• {rv}\n"
	txt += "\n" + "-" * 50 + "\n\n"

	txt += "=== RECOMMENDED AUTHORS ===\n\n"
	txt += "✅ Authors Likely to be Helpful:\n"
	for _, au in good_authors.iterrows():
	txt += f"• {au['author_clean']} (Helpful ratio: {au['helpful_ratio']:.2f})\n"
	txt += "\n⚠️ Authors to Approach with Caution:\n"
	for _, au in bad_authors.iterrows():
	txt += f"• {au['author_clean']} (Helpful ratio: {au['helpful_ratio']:.2f})\n"
	return txt


	def recommend_for_concern(concern: str,
	                          num_books: int = 5,
	                          num_reviews: int = 2) -> str:
	    """Gradio entry point: build the full text report for one concern.

	    `num_books` caps both the number of books and the number of authors per
	    list; `num_reviews` caps the sampled reviews per book.
	    """
	    book_table = recommend_books(
	        concern, top_n=num_books, reviews_per_book=num_reviews
	    )
	    author_tables = recommend_authors(concern, top_n=num_books)
	    # author_tables is the pair (helpful authors, authors to be cautious about).
	    return _format_output(book_table, *author_tables)


	# ---------------------------------------------------------------------------
	# 5. LAUNCH GRADIO
	# ---------------------------------------------------------------------------
	# Input widgets, listed in the positional order recommend_for_concern expects.
	_concern_box = gr.Textbox(
	    placeholder="e.g. I'm a lonely teenager",
	    label="What concern or fear would you like help with?",
	)
	_book_count_slider = gr.Slider(
	    minimum=1, maximum=10, step=1, value=5,
	    label="Number of recommendations",
	)
	_review_count_slider = gr.Slider(
	    minimum=1, maximum=5, step=1, value=2,
	    label="Reviews per book",
	)
	_report_box = gr.Textbox(label="Recommendations", lines=20)

	# Clickable example queries, each paired with the default slider settings.
	_example_rows = [
	    [query, 5, 2]
	    for query in (
	        "I'm a lonely teenager",
	        "I'm worried about my career",
	        "I have anxiety about the future",
	    )
	]

	iface = gr.Interface(
	    fn=recommend_for_concern,
	    inputs=[_concern_box, _book_count_slider, _review_count_slider],
	    outputs=_report_box,
	    title="Self-Help Book Recommendation Engine",
	    description="Personalised, review-aware book & author suggestions.",
	    examples=_example_rows,
	)

	# Start the web server as soon as the script runs.
	iface.launch()