|
|
import gradio as gr |
|
|
import pandas as pd |
|
|
import numpy as np |
|
|
from pathlib import Path |
|
|
from sklearn.feature_extraction.text import TfidfVectorizer |
|
|
from sklearn.metrics.pairwise import cosine_similarity |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Input data files, expected next to this script.
BOOKS_CSV = Path("self_help_books.csv")

REVIEWS_CSV = Path("self_help_reviews.csv")


# The book catalogue is mandatory — this raises at import time if the CSV
# is absent, which is intentional: nothing below works without it.
df_books = pd.read_csv(BOOKS_CSV)

# Reviews are optional: fall back to an empty frame so downstream code can
# simply check `df_reviews.empty` instead of handling a missing file.
df_reviews = pd.read_csv(REVIEWS_CSV) if REVIEWS_CSV.exists() else pd.DataFrame()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _prep(text: str) -> str: |
|
|
"""Lower-case & cast NaNs to an empty string.""" |
|
|
return str(text).lower() if pd.notnull(text) else "" |
|
|
|
|
|
|
|
|
# Build the text field the TF-IDF model indexes: summary + genres + primary
# category, lower-cased via `_prep`.  Skipped when the CSV already ships a
# precomputed `combined_text` column.
if "combined_text" not in df_books.columns:
    df_books["combined_text"] = (
        df_books["summary"].apply(_prep) + " " +
        df_books["genres"].apply(_prep) + " " +
        df_books["key_cat_primary"].apply(_prep)
    )


# Fit once at import time; every user query is projected into this same
# vocabulary space by the recommenders below.
vectorizer = TfidfVectorizer(stop_words="english", max_features=50_000)

# One TF-IDF row per book, aligned with df_books' row order.
X_BOOKS = vectorizer.fit_transform(df_books["combined_text"])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Aggregate per-author helpfulness once at import time.  Only possible when
# the catalogue carries both source columns; otherwise fall back to an empty
# frame with the same schema so `recommend_authors` can merge against it
# without special-casing.
if {"helpful_ratio", "total_reviews"}.issubset(df_books.columns):
    author_stats = (
        df_books.groupby("author_clean")
        # mean ratio across an author's books; total reviews summed
        .agg(helpful_ratio=("helpful_ratio", "mean"),
             total_reviews=("total_reviews", "sum"))
        .reset_index()
    )
else:
    author_stats = pd.DataFrame(
        columns=["author_clean", "helpful_ratio", "total_reviews"]
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def recommend_books(user_issue: str, |
|
|
top_n: int = 5, |
|
|
reviews_per_book: int = 2, |
|
|
min_reviews: int = 10) -> pd.DataFrame: |
|
|
""" |
|
|
Blend topical similarity (70 %) with helpfulness (30 %) |
|
|
and return the `top_n` books best suited to `user_issue`. |
|
|
""" |
|
|
|
|
|
query_vec = vectorizer.transform([user_issue.lower()]) |
|
|
similarity = cosine_similarity(query_vec, X_BOOKS).ravel() |
|
|
|
|
|
df_temp = df_books.copy() |
|
|
df_temp["similarity"] = similarity |
|
|
df_temp["helpful_ratio_filled"] = df_temp.get("helpful_ratio", 0).fillna(0) |
|
|
|
|
|
if "total_reviews" in df_temp.columns: |
|
|
df_temp = df_temp[df_temp["total_reviews"] >= min_reviews] |
|
|
|
|
|
df_temp["score"] = ( |
|
|
0.70 * df_temp["similarity"] + |
|
|
0.30 * df_temp["helpful_ratio_filled"] |
|
|
) |
|
|
|
|
|
top_books = df_temp.nlargest(top_n, "score").reset_index(drop=True) |
|
|
|
|
|
|
|
|
results = [] |
|
|
for _, row in top_books.iterrows(): |
|
|
name = row.get("name", row.get("Book", "")) |
|
|
author = row.get("author_clean", row.get("Author", "")) |
|
|
|
|
|
if not df_reviews.empty and {"is_helpful", "is_harmful"}.issubset(df_reviews.columns): |
|
|
helpful_mask = (df_reviews["name"] == name) & (df_reviews["is_helpful"]) |
|
|
harmful_mask = (df_reviews["name"] == name) & (df_reviews["is_harmful"]) |
|
|
|
|
|
helpful_reviews = ( |
|
|
df_reviews[helpful_mask] |
|
|
.sample(min(reviews_per_book, helpful_mask.sum()), random_state=42) |
|
|
["review_text"].tolist() |
|
|
if helpful_mask.any() else [] |
|
|
) |
|
|
harmful_reviews = ( |
|
|
df_reviews[harmful_mask] |
|
|
.sample(min(reviews_per_book, harmful_mask.sum()), random_state=42) |
|
|
["review_text"].tolist() |
|
|
if harmful_mask.any() else [] |
|
|
) |
|
|
else: |
|
|
helpful_reviews, harmful_reviews = [], [] |
|
|
|
|
|
results.append({ |
|
|
"Book" : name, |
|
|
"Author" : author, |
|
|
"Star_Rating" : row.get("star_rating", np.nan), |
|
|
"Price" : row.get("kindle_price_clean", np.nan), |
|
|
"Helpful_Ratio" : round(row.get("helpful_ratio", 0), 3), |
|
|
"Similarity" : round(row["similarity"], 3), |
|
|
"Helpful Reviews" : helpful_reviews, |
|
|
"Harmful Reviews" : harmful_reviews |
|
|
}) |
|
|
|
|
|
return pd.DataFrame(results) |
|
|
|
|
|
|
|
|
def recommend_authors(user_issue: str,
                      top_n: int = 5,
                      min_reviews: int = 30):
    """
    Return two DataFrames:
        • authors likely to be helpful      (helpful_ratio >= 0.5)
        • authors to approach with caution  (helpful_ratio < 0.5)
    Ranking = 70 % topical relevance + 30 % helpfulness.

    Parameters
    ----------
    user_issue : free-text description of the reader's concern.
    top_n : number of authors per list.
    min_reviews : authors with fewer total reviews are excluded.
    """
    # Project the query into the TF-IDF space fitted at import time.
    query_vec = vectorizer.transform([user_issue.lower()])
    similarity = cosine_similarity(query_vec, X_BOOKS).ravel()

    # Per-author relevance = the best similarity among that author's books.
    rel_df = pd.DataFrame({
        "author_clean": df_books["author_clean"],
        "sim_to_issue": similarity
    })
    author_relevance = (
        rel_df.groupby("author_clean")
        .agg(max_sim=("sim_to_issue", "max"))
        .reset_index()
    )

    merged = author_relevance.merge(author_stats, on="author_clean", how="left")
    # Authors missing from author_stats get neutral / zero defaults.
    merged["helpful_ratio"] = merged["helpful_ratio"].fillna(0)
    merged["total_reviews"] = merged["total_reviews"].fillna(0)
    # FIX: .copy() so the "score" assignment below works on an owned frame
    # instead of a filtered view (avoids SettingWithCopyWarning).
    merged = merged[merged["total_reviews"] >= min_reviews].copy()

    merged["score"] = 0.70 * merged["max_sim"] + 0.30 * merged["helpful_ratio"]

    helpful_authors = (
        merged[merged["helpful_ratio"] >= 0.5]
        .nlargest(top_n, "score")
        .reset_index(drop=True)
    )

    risky_authors = (
        merged[merged["helpful_ratio"] < 0.5]
        .nlargest(top_n, "score")
        .reset_index(drop=True)
    )

    return helpful_authors, risky_authors
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _format_output(books_df, good_authors, bad_authors) -> str: |
|
|
txt = "=== RECOMMENDED BOOKS ===\n\n" |
|
|
for _, bk in books_df.iterrows(): |
|
|
txt += f"📚 {bk['Book']}\n" |
|
|
txt += f"👤 Author: {bk['Author']}\n" |
|
|
txt += f"⭐ Rating: {bk['Star_Rating']}\n" |
|
|
txt += f"💰 Price: ${bk['Price']}\n" |
|
|
txt += f"📊 Helpful Ratio: {bk['Helpful_Ratio']:.2f}\n" |
|
|
if bk["Helpful Reviews"]: |
|
|
txt += "\n✅ Helpful Reviews:\n" |
|
|
for rv in bk["Helpful Reviews"]: |
|
|
txt += f"• {rv}\n" |
|
|
if bk["Harmful Reviews"]: |
|
|
txt += "\n⚠️ Critical Reviews:\n" |
|
|
for rv in bk["Harmful Reviews"]: |
|
|
txt += f"• {rv}\n" |
|
|
txt += "\n" + "-" * 50 + "\n\n" |
|
|
|
|
|
txt += "=== RECOMMENDED AUTHORS ===\n\n" |
|
|
txt += "✅ Authors Likely to be Helpful:\n" |
|
|
for _, au in good_authors.iterrows(): |
|
|
txt += f"• {au['author_clean']} (Helpful ratio: {au['helpful_ratio']:.2f})\n" |
|
|
txt += "\n⚠️ Authors to Approach with Caution:\n" |
|
|
for _, au in bad_authors.iterrows(): |
|
|
txt += f"• {au['author_clean']} (Helpful ratio: {au['helpful_ratio']:.2f})\n" |
|
|
return txt |
|
|
|
|
|
|
|
|
def recommend_for_concern(concern: str,
                          num_books: int = 5,
                          num_reviews: int = 2) -> str:
    """Entry point wired to the Gradio UI.

    Runs both the book recommender and the author recommender for
    `concern` and renders the combined result as one text report.
    """
    good_authors, bad_authors = recommend_authors(concern, top_n=num_books)
    book_recs = recommend_books(concern,
                                top_n=num_books,
                                reviews_per_book=num_reviews)
    return _format_output(book_recs, good_authors, bad_authors)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Gradio UI: one textbox for the user's concern plus two sliders, returning
# a single plain-text report produced by `recommend_for_concern`.
iface = gr.Interface(
    fn=recommend_for_concern,
    # NOTE: input order must match recommend_for_concern's positional
    # parameters (concern, num_books, num_reviews).
    inputs=[
        gr.Textbox(label="What concern or fear would you like help with?",
                   placeholder="e.g. I'm a lonely teenager"),
        gr.Slider(label="Number of recommendations",
                  minimum=1, maximum=10, step=1, value=5),
        gr.Slider(label="Reviews per book",
                  minimum=1, maximum=5, step=1, value=2),
    ],
    outputs=gr.Textbox(label="Recommendations", lines=20),
    title="Self-Help Book Recommendation Engine",
    description="Personalised, review-aware book & author suggestions.",
    # Clickable example rows shown beneath the inputs.
    examples=[
        ["I'm a lonely teenager", 5, 2],
        ["I'm worried about my career", 5, 2],
        ["I have anxiety about the future", 5, 2],
    ],
)


# Blocks until the server is stopped.
iface.launch()
|
|
|