File size: 9,168 Bytes
ea2b4f2 7edf494 077a7f8 7edf494 ea2b4f2 077a7f8 7edf494 077a7f8 7edf494 077a7f8 7edf494 077a7f8 7edf494 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 |
import gradio as gr
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# ---------------------------------------------------------------------------
# 0. LOAD DATA PRE-GENERATED BY THE OFFLINE PIPELINE
# ---------------------------------------------------------------------------
# Input files produced by the offline pipeline (paths are relative to the
# current working directory).
BOOKS_CSV = Path("self_help_books.csv")
REVIEWS_CSV = Path("self_help_reviews.csv")  # optional - may be absent

# The book catalogue is required, so it is read unconditionally.
df_books = pd.read_csv(BOOKS_CSV)

# Reviews are optional: an empty frame stands in when the file does not exist.
if REVIEWS_CSV.exists():
    df_reviews = pd.read_csv(REVIEWS_CSV)
else:
    df_reviews = pd.DataFrame()
# ---------------------------------------------------------------------------
# 1. VERY LIGHT TEXT PRE-PROCESSING + TF-IDF FEATURES
# ---------------------------------------------------------------------------
def _prep(text: str) -> str:
"""Lower-case & cast NaNs to an empty string."""
return str(text).lower() if pd.notnull(text) else ""
# Build the text that summarises each book (only if not already present).
# Source columns that are missing from the CSV are skipped instead of raising
# KeyError; when all three exist the result is "summary genres key_cat_primary".
_TEXT_COLUMNS = ("summary", "genres", "key_cat_primary")

if "combined_text" in df_books.columns:
    # A pre-generated column may still contain NaN or non-string cells, which
    # TfidfVectorizer rejects as documents; coerce them to plain strings.
    df_books["combined_text"] = df_books["combined_text"].fillna("").astype(str)
else:
    _text_parts = [
        df_books[col].apply(_prep)
        for col in _TEXT_COLUMNS
        if col in df_books.columns
    ]
    if _text_parts:
        _combined = _text_parts[0]
        for _part in _text_parts[1:]:
            _combined = _combined + " " + _part
    else:
        _combined = pd.Series("", index=df_books.index, dtype="object")
    df_books["combined_text"] = _combined

# TF-IDF over the combined text; X_BOOKS has one row per row of df_books.
vectorizer = TfidfVectorizer(stop_words="english", max_features=50_000)
X_BOOKS = vectorizer.fit_transform(df_books["combined_text"])
# ---------------------------------------------------------------------------
# 2. AUTHOR-LEVEL AGGREGATION (fallbacks if columns are missing)
# ---------------------------------------------------------------------------
# Per-author aggregates over all of an author's books: mean helpful ratio and
# summed review count. `author_clean` is part of the guard because the groupby
# below needs it just as much as the two metric columns.
_AUTHOR_STAT_COLUMNS = {"author_clean", "helpful_ratio", "total_reviews"}

if _AUTHOR_STAT_COLUMNS.issubset(df_books.columns):
    author_stats = (
        df_books.groupby("author_clean")
        .agg(helpful_ratio=("helpful_ratio", "mean"),
             total_reviews=("total_reviews", "sum"))
        .reset_index()
    )
else:  # keep the code functional even without those columns
    # Give the empty fallback numeric dtypes so that downstream merges do not
    # end up with object-dtype metric columns.
    author_stats = pd.DataFrame({
        "author_clean": pd.Series(dtype="object"),
        "helpful_ratio": pd.Series(dtype="float64"),
        "total_reviews": pd.Series(dtype="float64"),
    })
# ---------------------------------------------------------------------------
# 3. MAIN RECOMMENDATION FUNCTIONS
# ---------------------------------------------------------------------------
def _sample_reviews(book_name, flag_column: str, count: int) -> list:
    """Return up to `count` review texts for `book_name` flagged by `flag_column`.

    Returns an empty list when no reviews are loaded, when `df_reviews` lacks
    the `name`, `review_text` or flag column, or when nothing matches.
    Sampling uses a fixed seed so repeated queries show the same reviews.
    """
    required = {"name", "review_text", flag_column}
    if count <= 0 or df_reviews.empty or not required.issubset(df_reviews.columns):
        return []
    # `.eq(True)` produces a strict boolean mask even if the flag column holds
    # 0/1 integers or NaN, which a bare `&` would not turn into a valid mask.
    mask = (df_reviews["name"] == book_name) & df_reviews[flag_column].eq(True)
    matches = df_reviews.loc[mask, "review_text"]
    if matches.empty:
        return []
    return matches.sample(min(count, len(matches)), random_state=42).tolist()


def recommend_books(user_issue: str,
                    top_n: int = 5,
                    reviews_per_book: int = 2,
                    min_reviews: int = 10) -> pd.DataFrame:
    """
    Blend topical similarity (70 %) with helpfulness (30 %)
    and return the `top_n` books best suited to `user_issue`.

    Args:
        user_issue: Free-text description of the concern; matched against the
            TF-IDF features of the books.
        top_n: Number of books to return.
        reviews_per_book: Maximum number of helpful and of harmful reviews to
            attach to each book.
        min_reviews: Books with fewer `total_reviews` are dropped (only applied
            when that column exists).

    Returns:
        DataFrame with columns Book, Author, Star_Rating, Price,
        Helpful_Ratio, Similarity, "Helpful Reviews" and "Harmful Reviews"
        (the last two hold lists of review texts). The columns are present
        even when no book qualifies.
    """
    # UI sliders may deliver floats; nlargest()/sample() expect integers.
    top_n = int(top_n)
    reviews_per_book = int(reviews_per_book)

    # ---- similarity -------------------------------------------------------
    query_vec = vectorizer.transform([str(user_issue or "").lower()])
    similarity = cosine_similarity(query_vec, X_BOOKS).ravel()

    df_temp = df_books.copy()
    df_temp["similarity"] = similarity
    # Missing column -> helpfulness contributes nothing to the score.
    if "helpful_ratio" in df_temp.columns:
        df_temp["helpful_ratio_filled"] = df_temp["helpful_ratio"].fillna(0)
    else:
        df_temp["helpful_ratio_filled"] = 0.0

    if "total_reviews" in df_temp.columns:
        # .copy() so the score assignment below writes to an independent frame.
        df_temp = df_temp[df_temp["total_reviews"] >= min_reviews].copy()

    df_temp["score"] = (
        0.70 * df_temp["similarity"] +
        0.30 * df_temp["helpful_ratio_filled"]
    )
    top_books = df_temp.nlargest(top_n, "score").reset_index(drop=True)

    # ---- representative reviews ------------------------------------------
    results = []
    for _, row in top_books.iterrows():
        name = row.get("name", row.get("Book", ""))
        author = row.get("author_clean", row.get("Author", ""))
        results.append({
            "Book": name,
            "Author": author,
            "Star_Rating": row.get("star_rating", np.nan),
            "Price": row.get("kindle_price_clean", np.nan),
            "Helpful_Ratio": round(row.get("helpful_ratio", 0), 3),
            "Similarity": round(row["similarity"], 3),
            "Helpful Reviews": _sample_reviews(name, "is_helpful", reviews_per_book),
            "Harmful Reviews": _sample_reviews(name, "is_harmful", reviews_per_book),
        })

    columns = [
        "Book", "Author", "Star_Rating", "Price", "Helpful_Ratio",
        "Similarity", "Helpful Reviews", "Harmful Reviews",
    ]
    return pd.DataFrame(results, columns=columns)
def recommend_authors(user_issue: str,
                      top_n: int = 5,
                      min_reviews: int = 30):
    """
    Return two DataFrames:
    • authors likely to be helpful (helpful_ratio >= 0.5)
    • authors you might approach with caution (helpful_ratio < 0.5)
    Ranking = 70 % topical relevance + 30 % helpfulness.

    Args:
        user_issue: Free-text description of the concern.
        top_n: Maximum number of authors in each returned frame.
        min_reviews: Authors whose summed `total_reviews` is lower are dropped.

    Returns:
        Tuple ``(helpful_authors, risky_authors)``; each frame has the columns
        author_clean, max_sim, helpful_ratio, total_reviews and score.
    """
    top_n = int(top_n)  # UI sliders may deliver floats

    query_vec = vectorizer.transform([str(user_issue or "").lower()])
    similarity = cosine_similarity(query_vec, X_BOOKS).ravel()

    # An author's relevance is the similarity of their best-matching book.
    rel_df = pd.DataFrame({
        "author_clean": df_books["author_clean"],
        "sim_to_issue": similarity
    })
    author_relevance = (
        rel_df.groupby("author_clean")
        .agg(max_sim=("sim_to_issue", "max"))
        .reset_index()
    )

    merged = author_relevance.merge(author_stats, on="author_clean", how="left")
    # Force numeric dtypes: if author_stats is an empty object-dtype fallback,
    # the merged columns may stay object, and nlargest() rejects an
    # object-dtype score column.
    for col in ("helpful_ratio", "total_reviews"):
        merged[col] = pd.to_numeric(merged[col], errors="coerce").fillna(0)

    # .copy() so the score assignment writes to an independent frame.
    merged = merged[merged["total_reviews"] >= min_reviews].copy()
    merged["score"] = 0.70 * merged["max_sim"] + 0.30 * merged["helpful_ratio"]

    # helpful_ratio has no NaN after the fill above, so the negated mask is
    # exactly "helpful_ratio < 0.5".
    is_helpful = merged["helpful_ratio"] >= 0.5
    helpful_authors = (
        merged[is_helpful]
        .nlargest(top_n, "score")
        .reset_index(drop=True)
    )
    risky_authors = (
        merged[~is_helpful]
        .nlargest(top_n, "score")
        .reset_index(drop=True)
    )
    return helpful_authors, risky_authors
# ---------------------------------------------------------------------------
# 4. GRADIO GLUE – format nicely & expose a simple interface
# ---------------------------------------------------------------------------
def _format_output(books_df, good_authors, bad_authors) -> str:
txt = "=== RECOMMENDED BOOKS ===\n\n"
for _, bk in books_df.iterrows():
txt += f"📚 {bk['Book']}\n"
txt += f"👤 Author: {bk['Author']}\n"
txt += f"⭐ Rating: {bk['Star_Rating']}\n"
txt += f"💰 Price: ${bk['Price']}\n"
txt += f"📊 Helpful Ratio: {bk['Helpful_Ratio']:.2f}\n"
if bk["Helpful Reviews"]:
txt += "\n✅ Helpful Reviews:\n"
for rv in bk["Helpful Reviews"]:
txt += f"• {rv}\n"
if bk["Harmful Reviews"]:
txt += "\n⚠️ Critical Reviews:\n"
for rv in bk["Harmful Reviews"]:
txt += f"• {rv}\n"
txt += "\n" + "-" * 50 + "\n\n"
txt += "=== RECOMMENDED AUTHORS ===\n\n"
txt += "✅ Authors Likely to be Helpful:\n"
for _, au in good_authors.iterrows():
txt += f"• {au['author_clean']} (Helpful ratio: {au['helpful_ratio']:.2f})\n"
txt += "\n⚠️ Authors to Approach with Caution:\n"
for _, au in bad_authors.iterrows():
txt += f"• {au['author_clean']} (Helpful ratio: {au['helpful_ratio']:.2f})\n"
return txt
def recommend_for_concern(concern: str,
                          num_books: int = 5,
                          num_reviews: int = 2) -> str:
    """Gradio entry point: turn a free-text concern into a formatted report.

    Args:
        concern: The user's description of their concern or fear.
        num_books: Number of books (and of authors per list) to recommend.
        num_reviews: Maximum helpful / harmful reviews shown per book.

    Returns:
        The text report built by `_format_output`, or a short prompt when
        `concern` is empty, whitespace-only or None.
    """
    concern = (concern or "").strip()
    if not concern:
        # An empty query has zero similarity to every book, so the ranking
        # would silently fall back to helpfulness alone; ask for input instead.
        return "Please describe a concern or fear to get recommendations."

    # Slider values may arrive as floats; the recommenders expect integers.
    num_books = int(num_books)
    num_reviews = int(num_reviews)

    books_df = recommend_books(concern,
                               top_n=num_books,
                               reviews_per_book=num_reviews)
    good_authors, bad_authors = recommend_authors(concern,
                                                  top_n=num_books)
    return _format_output(books_df, good_authors, bad_authors)
# ---------------------------------------------------------------------------
# 5. LAUNCH GRADIO
# ---------------------------------------------------------------------------
# The interface object is built at import time so other modules can reuse it;
# the three inputs map positionally onto recommend_for_concern's parameters.
iface = gr.Interface(
    fn=recommend_for_concern,
    inputs=[
        gr.Textbox(label="What concern or fear would you like help with?",
                   placeholder="e.g. I'm a lonely teenager"),
        gr.Slider(label="Number of recommendations",
                  minimum=1, maximum=10, step=1, value=5),
        gr.Slider(label="Reviews per book",
                  minimum=1, maximum=5, step=1, value=2),
    ],
    outputs=gr.Textbox(label="Recommendations", lines=20),
    title="Self-Help Book Recommendation Engine",
    description="Personalised, review-aware book & author suggestions.",
    examples=[
        ["I'm a lonely teenager", 5, 2],
        ["I'm worried about my career", 5, 2],
        ["I have anxiety about the future", 5, 2],
    ],
)

# Start the web server only when this file is executed as a script, so that
# importing the module (e.g. from a test or a notebook) does not launch it.
if __name__ == "__main__":
    iface.launch()
|