File size: 9,168 Bytes
ea2b4f2
7edf494
 
077a7f8
7edf494
 
ea2b4f2
077a7f8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7edf494
 
 
077a7f8
 
 
 
 
 
7edf494
 
077a7f8
 
7edf494
 
 
077a7f8
 
7edf494
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
import gradio as gr
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# ---------------------------------------------------------------------------
# 0. LOAD DATA PRE-GENERATED BY THE OFFLINE PIPELINE
# ---------------------------------------------------------------------------
# Input files produced by the offline pipeline. Only the book catalogue is
# mandatory; the reviews file is optional.
BOOKS_CSV = Path("self_help_books.csv")
REVIEWS_CSV = Path("self_help_reviews.csv")

# A missing catalogue raises right here, at start-up.
df_books = pd.read_csv(BOOKS_CSV)

# Without a reviews file we keep an empty frame so later code can simply
# test `df_reviews.empty` instead of special-casing a missing object.
if REVIEWS_CSV.exists():
    df_reviews = pd.read_csv(REVIEWS_CSV)
else:
    df_reviews = pd.DataFrame()

# ---------------------------------------------------------------------------
# 1. VERY LIGHT TEXT PRE-PROCESSING + TF-IDF FEATURES
# ---------------------------------------------------------------------------
def _prep(text: str) -> str:
    """Lower-case & cast NaNs to an empty string."""
    return str(text).lower() if pd.notnull(text) else ""

# Build the text that summarises each book (only if not already present).
if "combined_text" not in df_books.columns:
    df_books["combined_text"] = (
        df_books["summary"].apply(_prep) + " " +
        df_books["genres"].apply(_prep) + " " +
        df_books["key_cat_primary"].apply(_prep)
    )
else:
    # A pre-computed column can still contain missing cells after the CSV
    # round-trip; TfidfVectorizer refuses NaN documents, so blank them out
    # and make sure every document is a string.
    df_books["combined_text"] = df_books["combined_text"].fillna("").astype(str)

# Fit once at start-up; every query is later projected into this same space.
vectorizer = TfidfVectorizer(stop_words="english", max_features=50_000)
X_BOOKS = vectorizer.fit_transform(df_books["combined_text"])

# ---------------------------------------------------------------------------
# 2. AUTHOR-LEVEL AGGREGATION  (fallbacks if columns are missing)
# ---------------------------------------------------------------------------
# Per-author helpfulness statistics: mean helpful ratio and summed review
# count over all of an author's books.
if {"author_clean", "helpful_ratio", "total_reviews"}.issubset(df_books.columns):
    author_stats = (
        df_books.groupby("author_clean")
        .agg(helpful_ratio=("helpful_ratio", "mean"),
             total_reviews=("total_reviews", "sum"))
        .reset_index()
    )
else:
    # Keep the code functional even without those columns. The empty frame
    # is given explicit numeric dtypes: untyped (object) columns would
    # propagate through the later merge and make `nlargest` raise TypeError.
    author_stats = pd.DataFrame({
        "author_clean": pd.Series(dtype="object"),
        "helpful_ratio": pd.Series(dtype="float64"),
        "total_reviews": pd.Series(dtype="float64"),
    })

# ---------------------------------------------------------------------------
# 3. MAIN RECOMMENDATION FUNCTIONS
# ---------------------------------------------------------------------------
def _sample_reviews(book_name, flag_column: str, n: int) -> list:
    """Return up to `n` review texts for `book_name` whose `flag_column` is set.

    Returns an empty list when there are no reviews, when the reviews table
    lacks a required column, or when nothing matches. Sampling uses a fixed
    seed so repeated queries show the same reviews.
    """
    required = {"name", "review_text", flag_column}
    if n <= 0 or df_reviews.empty or not required.issubset(df_reviews.columns):
        return []

    # Flags may arrive as bool, 0/1 or with gaps; treat missing as "not set"
    # and force a real boolean mask so it is never mistaken for column labels.
    raw_flags = df_reviews[flag_column]
    flags = raw_flags.where(raw_flags.notna(), False).astype(bool)

    matches = df_reviews.loc[(df_reviews["name"] == book_name) & flags, "review_text"]
    if matches.empty:
        return []
    return matches.sample(min(n, len(matches)), random_state=42).tolist()


def recommend_books(user_issue: str,
                    top_n: int = 5,
                    reviews_per_book: int = 2,
                    min_reviews: int = 10) -> pd.DataFrame:
    """
    Blend topical similarity (70 %) with helpfulness (30 %)
    and return the `top_n` books best suited to `user_issue`.

    Args:
        user_issue: Free-text description of the concern; `None` is treated
            as an empty query.
        top_n: Maximum number of books to return.
        reviews_per_book: Maximum number of helpful and of harmful reviews
            sampled per book.
        min_reviews: Books with fewer `total_reviews` are dropped (only
            applied when that column exists).

    Returns:
        One row per recommended book with the columns Book, Author,
        Star_Rating, Price, Helpful_Ratio, Similarity, "Helpful Reviews" and
        "Harmful Reviews" (the last two hold lists of review texts).
    """
    # UI sliders may hand over floats (5.0); pandas needs true ints here.
    top_n = max(int(top_n), 0)
    reviews_per_book = max(int(reviews_per_book), 0)
    query = "" if user_issue is None else str(user_issue).lower()

    # ---- similarity -------------------------------------------------------
    query_vec = vectorizer.transform([query])
    similarity = cosine_similarity(query_vec, X_BOOKS).ravel()

    df_temp = df_books.copy()
    df_temp["similarity"] = similarity
    if "helpful_ratio" in df_temp.columns:
        df_temp["helpful_ratio_filled"] = df_temp["helpful_ratio"].fillna(0)
    else:
        # No helpfulness data at all: rank on similarity alone.
        df_temp["helpful_ratio_filled"] = 0.0

    if "total_reviews" in df_temp.columns:
        # .copy() so the score assignment below writes to an independent
        # frame rather than a slice of `df_temp`.
        df_temp = df_temp[df_temp["total_reviews"] >= min_reviews].copy()

    df_temp["score"] = (
        0.70 * df_temp["similarity"] +
        0.30 * df_temp["helpful_ratio_filled"]
    )

    top_books = df_temp.nlargest(top_n, "score").reset_index(drop=True)

    # ---- representative reviews ------------------------------------------
    results = []
    for _, row in top_books.iterrows():
        name = row.get("name", row.get("Book", ""))
        author = row.get("author_clean", row.get("Author", ""))
        results.append({
            "Book"            : name,
            "Author"          : author,
            "Star_Rating"     : row.get("star_rating", np.nan),
            "Price"           : row.get("kindle_price_clean", np.nan),
            # Report the same (NaN -> 0) value that went into the score.
            "Helpful_Ratio"   : round(float(row["helpful_ratio_filled"]), 3),
            "Similarity"      : round(row["similarity"], 3),
            "Helpful Reviews" : _sample_reviews(name, "is_helpful", reviews_per_book),
            "Harmful Reviews" : _sample_reviews(name, "is_harmful", reviews_per_book),
        })

    # Explicit columns keep the schema stable even when nothing qualifies.
    return pd.DataFrame(results, columns=[
        "Book", "Author", "Star_Rating", "Price",
        "Helpful_Ratio", "Similarity", "Helpful Reviews", "Harmful Reviews",
    ])


def recommend_authors(user_issue: str,
                      top_n: int = 5,
                      min_reviews: int = 30):
    """
    Return two DataFrames:
        • authors likely to be helpful (mean helpful ratio >= 0.5)
        • authors you might approach with caution (mean helpful ratio < 0.5)
    Ranking = 70 % topical relevance + 30 % helpfulness.

    Args:
        user_issue: Free-text description of the concern; `None` is treated
            as an empty query.
        top_n: Maximum number of authors in each returned frame.
        min_reviews: Authors whose summed `total_reviews` is below this are
            excluded; authors without statistics count as 0 reviews.

    Returns:
        `(helpful_authors, risky_authors)`, each with the columns
        author_clean, max_sim, helpful_ratio, total_reviews and score.
    """
    # UI sliders may hand over floats (5.0); nlargest needs a true int.
    top_n = max(int(top_n), 0)
    query = "" if user_issue is None else str(user_issue).lower()

    query_vec = vectorizer.transform([query])
    similarity = cosine_similarity(query_vec, X_BOOKS).ravel()

    rel_df = pd.DataFrame({
        "author_clean": df_books["author_clean"],
        "sim_to_issue": similarity
    })

    # An author is as relevant as their single most relevant book.
    author_relevance = (
        rel_df.groupby("author_clean")
        .agg(max_sim=("sim_to_issue", "max"))
        .reset_index()
    )

    merged = author_relevance.merge(author_stats, on="author_clean", how="left")
    # Force numeric dtypes: an empty/untyped `author_stats` yields object
    # columns, which would make the score column non-numeric and cause
    # `nlargest` to raise TypeError.
    for column in ("helpful_ratio", "total_reviews"):
        merged[column] = pd.to_numeric(merged[column], errors="coerce").fillna(0)

    # .copy() so the score assignment below does not write into a slice.
    merged = merged[merged["total_reviews"] >= min_reviews].copy()

    merged["score"] = 0.70 * merged["max_sim"] + 0.30 * merged["helpful_ratio"]

    # No NaNs remain after the fill above, so the negated mask is exactly
    # "helpful_ratio < 0.5".
    is_helpful = merged["helpful_ratio"] >= 0.5

    helpful_authors = (
        merged[is_helpful]
        .nlargest(top_n, "score")
        .reset_index(drop=True)
    )

    risky_authors = (
        merged[~is_helpful]
        .nlargest(top_n, "score")
        .reset_index(drop=True)
    )

    return helpful_authors, risky_authors


# ---------------------------------------------------------------------------
# 4. GRADIO GLUE – format nicely & expose a simple interface
# ---------------------------------------------------------------------------
def _format_output(books_df, good_authors, bad_authors) -> str:
    txt = "=== RECOMMENDED BOOKS ===\n\n"
    for _, bk in books_df.iterrows():
        txt += f"📚 {bk['Book']}\n"
        txt += f"👤 Author: {bk['Author']}\n"
        txt += f"⭐ Rating: {bk['Star_Rating']}\n"
        txt += f"💰 Price: ${bk['Price']}\n"
        txt += f"📊 Helpful Ratio: {bk['Helpful_Ratio']:.2f}\n"
        if bk["Helpful Reviews"]:
            txt += "\n✅ Helpful Reviews:\n"
            for rv in bk["Helpful Reviews"]:
                txt += f"• {rv}\n"
        if bk["Harmful Reviews"]:
            txt += "\n⚠️ Critical Reviews:\n"
            for rv in bk["Harmful Reviews"]:
                txt += f"• {rv}\n"
        txt += "\n" + "-" * 50 + "\n\n"

    txt += "=== RECOMMENDED AUTHORS ===\n\n"
    txt += "✅ Authors Likely to be Helpful:\n"
    for _, au in good_authors.iterrows():
        txt += f"• {au['author_clean']} (Helpful ratio: {au['helpful_ratio']:.2f})\n"
    txt += "\n⚠️ Authors to Approach with Caution:\n"
    for _, au in bad_authors.iterrows():
        txt += f"• {au['author_clean']} (Helpful ratio: {au['helpful_ratio']:.2f})\n"
    return txt


def recommend_for_concern(concern: str,
                          num_books: int = 5,
                          num_reviews: int = 2) -> str:
    """Gradio callback: build the full text report for one user concern.

    Args:
        concern: Free-text concern typed by the user; `None` is treated as
            an empty query.
        num_books: Number of books, and of authors per list, to recommend.
        num_reviews: Maximum number of sampled reviews of each kind per book.

    Returns:
        The formatted books-and-authors report.
    """
    # Normalise UI values up front: the textbox may deliver None and the
    # sliders may deliver floats, while the recommenders need str / int.
    concern = "" if concern is None else str(concern)
    num_books = int(num_books)
    num_reviews = int(num_reviews)

    books_df = recommend_books(concern,
                               top_n=num_books,
                               reviews_per_book=num_reviews)
    good_authors, bad_authors = recommend_authors(concern,
                                                  top_n=num_books)
    return _format_output(books_df, good_authors, bad_authors)


# ---------------------------------------------------------------------------
# 5. LAUNCH GRADIO
# ---------------------------------------------------------------------------
# Input widgets, listed in the positional order of `recommend_for_concern`'s
# parameters (concern, num_books, num_reviews).
_concern_box = gr.Textbox(
    label="What concern or fear would you like help with?",
    placeholder="e.g. I'm a lonely teenager",
)
_book_count_slider = gr.Slider(
    label="Number of recommendations",
    minimum=1, maximum=10, step=1, value=5,
)
_review_count_slider = gr.Slider(
    label="Reviews per book",
    minimum=1, maximum=5, step=1, value=2,
)

# Example rows: one value per input widget, in the same order.
_example_rows = [
    ["I'm a lonely teenager", 5, 2],
    ["I'm worried about my career", 5, 2],
    ["I have anxiety about the future", 5, 2],
]

iface = gr.Interface(
    fn=recommend_for_concern,
    inputs=[_concern_box, _book_count_slider, _review_count_slider],
    outputs=gr.Textbox(label="Recommendations", lines=20),
    title="Self-Help Book Recommendation Engine",
    description="Personalised, review-aware book & author suggestions.",
    examples=_example_rows,
)

# Start the web app as soon as this module is executed.
iface.launch()