# File size: 7,276 Bytes
# Revision: 0aee734
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
import html
from collections import defaultdict

import gradio as gr
import pandas as pd
from tabulate import tabulate
from textblob import TextBlob
from transformers import pipeline

# Initialize summarization pipeline
# NOTE(review): no function in the visible code references `summarizer`, yet
# the pipeline is constructed as soon as this module is imported. Confirm it
# is still needed (e.g. by another module) before keeping the startup cost.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

def generate_category_summaries(df):
    """Build the summary tables for every category that has enough data.

    A category (a distinct value of ``df['cluster_name']``) is summarised only
    when it has at least 10 rows and at least 3 products survive
    ``get_product_stats``; other categories are left out of the result.

    Returns:
        dict mapping each summarised category to the list of table
        descriptions produced by ``format_tables``.
    """
    category_tables = {}
    cluster_column = df['cluster_name']

    for cluster in cluster_column.unique():
        rows = df[cluster_column == cluster]

        # Too few reviews in the category to say anything meaningful.
        if len(rows) >= 10:
            stats = get_product_stats(rows)

            # Need at least three products to rank.
            if len(stats) >= 3:
                best, worst = get_top_and_worst_products(stats)
                details = analyze_top_products(best)
                category_tables[cluster] = format_tables(cluster, details, worst)

    return category_tables

def format_tables(category, product_details, worst_product):
    """Describe every report section of one category as a table.

    Args:
        category: Category name; upper-cased for the first section title.
        product_details: List of dicts with the keys ``name``, ``rating``,
            ``review_count``, ``pros`` and ``cons`` (the last two are lists of
            strings). May be empty, in which case the product sections are
            skipped instead of raising ``IndexError``.
        worst_product: DataFrame whose first row (if any) is the product to
            avoid; that row must provide ``reviews`` and ``avg_rating`` and is
            labelled by the product name.

    Returns:
        List of dicts, each with ``section`` (title), ``headers`` (column
        names) and ``data`` (list of rows).
    """
    tables = []

    if product_details:
        # Top Products Table
        tables.append({
            'section': f"TOP PRODUCTS IN {category.upper()}",
            'headers': ["Product", "Rating", "Reviews", "Pros", "Cons"],
            'data': [
                [
                    product['name'],
                    f"★{product['rating']:.1f}",
                    product['review_count'],
                    "\n".join(product['pros']),
                    "\n".join(product['cons']),
                ]
                for product in product_details
            ],
        })

        # Key Differences Table: a pro is "unique" when at least one of the
        # listed products does not share it.
        common_pros = set(product_details[0]['pros']).intersection(
            *(product['pros'] for product in product_details[1:])
        )
        diff_table = []
        for product in product_details:
            unique_pros = [p for p in product['pros'] if p not in common_pros]
            if unique_pros:
                diff_table.append([product['name'], ", ".join(unique_pros)])

        if diff_table:
            tables.append({
                'section': "KEY DIFFERENCES",
                'headers': ["Product", "Unique Features"],
                'data': diff_table,
            })

    # Worst Product Table
    if not worst_product.empty:
        worst = worst_product.iloc[0]
        _, cons = analyze_sentiment(worst['reviews'])
        tables.append({
            'section': "PRODUCT TO AVOID",
            'headers': ["Product", "Rating", "Reasons to Avoid"],
            'data': [[
                worst_product.index[0],
                f"★{worst['avg_rating']:.1f}",
                ", ".join(cons[:3]) if cons else "Consistently poor ratings",
            ]],
        })

    return tables

def get_product_stats(category_df, min_reviews=5):
    """Aggregate per-product rating statistics for one category.

    Args:
        category_df: DataFrame with the columns ``name``, ``rating`` and
            ``text``.
        min_reviews: Minimum ``review_count`` a product needs to be kept.
            Defaults to 5, the threshold this function always used.

    Returns:
        DataFrame indexed by product name with the columns ``avg_rating``
        (mean rating), ``review_count`` (number of ratings) and ``reviews``
        (list of the product's review texts), restricted to products with at
        least ``min_reviews`` ratings.
    """
    # Named aggregation ties each output column to its source explicitly
    # instead of renaming a MultiIndex by position.
    stats = category_df.groupby('name').agg(
        avg_rating=('rating', 'mean'),
        review_count=('rating', 'count'),
        reviews=('text', list),
    )
    return stats[stats['review_count'] >= min_reviews]

def get_top_and_worst_products(product_stats):
    """Identify the best and the worst performing products.

    Args:
        product_stats: DataFrame indexed by product with an ``avg_rating``
            column.

    Returns:
        Tuple ``(top_products, worst_product)``: the (up to) three rows with
        the highest ``avg_rating``, and a frame holding the single
        lowest-rated row among the *remaining* products. ``worst_product`` is
        empty when every product is already in the top list, so a product is
        never reported as both recommended and "to avoid".
    """
    top_products = product_stats.nlargest(3, 'avg_rating')

    # Only products outside the top list are candidates for "worst".
    remaining = product_stats.drop(index=top_products.index)
    if remaining.empty:
        return top_products, remaining

    return top_products, remaining.nsmallest(1, 'avg_rating')

def analyze_top_products(top_products):
    """Summarise each top product with its leading pros and cons.

    For every row of ``top_products`` (indexed by product name) the reviews
    are run through ``analyze_sentiment`` and at most three pros and three
    cons are kept; a placeholder entry is used when a list comes back empty.

    Returns:
        List of dicts with the keys ``name``, ``rating``, ``review_count``,
        ``pros`` and ``cons``, in the row order of ``top_products``.
    """
    def describe(name, stats_row):
        positives, negatives = analyze_sentiment(stats_row['reviews'])
        return {
            'name': name,
            'rating': stats_row['avg_rating'],
            'review_count': stats_row['review_count'],
            'pros': positives[:3] or ["no significant positive feedback"],
            'cons': negatives[:3] or ["no major complaints"],
        }

    return [describe(name, stats_row) for name, stats_row in top_products.iterrows()]

def analyze_sentiment(reviews):
    """Collect the nouns and adjectives used in strongly opinionated sentences.

    Each review is split into sentences with TextBlob. A sentence whose
    polarity is above 0.3 contributes its nouns/adjectives to the "pros"
    counts, one below -0.3 to the "cons" counts; sentences in between are
    ignored.

    Args:
        reviews: Iterable of review texts. Entries that are not strings
            (e.g. ``None`` or NaN for a missing review) are skipped.

    Returns:
        Tuple ``(pros, cons)`` of word lists, each ordered by descending
        count; words with equal counts keep first-seen order.
    """
    keyword_tags = ('NN', 'NNS', 'JJ', 'JJR', 'JJS')
    pros = defaultdict(int)
    cons = defaultdict(int)

    for review in reviews:
        # Missing review text cannot be parsed; skip it rather than crash.
        if not isinstance(review, str):
            continue

        for sentence in TextBlob(review).sentences:
            polarity = sentence.sentiment.polarity
            if polarity > 0.3:  # Positive
                counts = pros
            elif polarity < -0.3:  # Negative
                counts = cons
            else:
                continue  # Neutral sentence: nothing to record, skip tagging.

            # Tag the sentence itself, not the whole review, so only the words
            # that actually occur in this sentence are credited with its
            # sentiment.
            for word, tag in sentence.tags:
                if tag in keyword_tags:
                    counts[word] += 1

    # Most frequent first; the sort is stable, so ties keep insertion order.
    pros_sorted = sorted(pros, key=pros.get, reverse=True)
    cons_sorted = sorted(cons, key=cons.get, reverse=True)

    return pros_sorted, cons_sorted

def format_for_gradio(summaries):
    """Convert summary tables to an HTML string for Gradio display.

    Args:
        summaries: Mapping of category name to a list of table dicts, each
            with the keys ``section``, ``headers`` and ``data``.

    Returns:
        One HTML fragment per category (an ``<h2>`` heading followed by an
        ``<h3>`` heading and a styled table per section), joined with
        ``<hr>``. An empty mapping yields an empty string.
    """
    # Inline styles substituted for the bare tags in tabulate's output.
    tag_styles = (
        ('<table>', '<table style="width:100%; border-collapse: collapse; margin-bottom: 20px;">'),
        ('<th>', '<th style="background-color: #f2f2f2; padding: 8px; text-align: left; border: 1px solid #ddd;">'),
        ('<td>', '<td style="padding: 8px; border: 1px solid #ddd;">'),
    )

    outputs = []
    for category, tables in summaries.items():
        # Headings are interpolated straight into markup, so escape them:
        # a name such as "Home & Garden" must not end up as raw HTML.
        parts = [f"<h2 style='color: #4a6baf;'>{html.escape(category.upper())}</h2>"]

        for table in tables:
            table_html = tabulate(
                table['data'],
                headers=table['headers'],
                tablefmt="html",
                stralign="left",
                numalign="center"
            )
            for bare_tag, styled_tag in tag_styles:
                table_html = table_html.replace(bare_tag, styled_tag)

            parts.append(f"<h3 style='color: #3a5a8a;'>{html.escape(table['section'])}</h3>")
            parts.append(table_html)

        outputs.append("".join(parts))

    return "<hr>".join(outputs)

def analyze_reviews(df):
    """Run the full pipeline: summarise ``df`` per category and render as HTML.

    Returns the HTML string produced by ``format_for_gradio`` for the
    summaries that ``generate_category_summaries`` builds from ``df``.
    """
    return format_for_gradio(generate_category_summaries(df))

# Create Gradio interface
# NOTE(review): `df` is read below (dropdown choices and both click handlers)
# but is not defined or loaded anywhere in the visible source. Unless it is
# injected from elsewhere, building the UI raises NameError -- confirm where
# the review DataFrame is supposed to come from.
with gr.Blocks(title="Amazon Product Review Analyzer", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# Amazon Product Review Analyzer")
    gr.Markdown("Analyzing top products and reviews across categories")

    with gr.Row():
        # Left column: pick one category and analyze only that one.
        with gr.Column():
            gr.Markdown("### Product Categories Found")
            category_dropdown = gr.Dropdown(
                # One choice per distinct value of the 'cluster_name' column.
                choices=df['cluster_name'].unique().tolist(),
                label="Select a Category",
                interactive=True
            )
            analyze_btn = gr.Button("Analyze Selected Category", variant="primary")

        # Right column: analyze every category at once.
        with gr.Column():
            gr.Markdown("### All Categories Summary")
            all_categories_btn = gr.Button("Analyze All Categories", variant="secondary")

    # Shared output area that both buttons write their HTML report into.
    output_html = gr.HTML(label="Analysis Results")

    # Button actions
    # Make the "analyze selected" button interactive only while the dropdown
    # holds a truthy value.
    category_dropdown.change(
        fn=lambda x: gr.update(interactive=bool(x)),
        inputs=category_dropdown,
        outputs=analyze_btn
    )

    # Restrict the data to the rows of the chosen category before analyzing.
    analyze_btn.click(
        fn=lambda cat: analyze_reviews(df[df['cluster_name'] == cat]),
        inputs=category_dropdown,
        outputs=output_html
    )

    # Analyze the whole DataFrame; takes no inputs from the UI.
    all_categories_btn.click(
        fn=lambda: analyze_reviews(df),
        outputs=output_html
    )

# Launch the interface
# This is a module-level call, so it runs whenever the module is executed or
# imported (there is no `if __name__ == "__main__"` guard).
demo.launch()