File size: 4,536 Bytes
3ccf31a
cc63aca
 
 
 
 
 
3ccf31a
 
 
 
 
 
cc63aca
 
 
 
 
 
 
 
 
 
 
 
 
3ccf31a
cc63aca
3ccf31a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cc63aca
 
3ccf31a
 
cc63aca
3ccf31a
 
 
 
 
 
 
cc63aca
 
 
 
 
 
 
 
 
3ccf31a
 
 
 
cc63aca
 
 
 
3ccf31a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cc63aca
3ccf31a
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
from collections import Counter
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# ✅ Load model and tokenizer directly (no caching)
MODEL_NAME = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)


def build_summary_prompt(text):
    return f"summarize: {text}"


def summarize_with_t5(text: str) -> str:
    """
    Uses a T5 model to generate a summary from input text.
    """
    prompt = build_summary_prompt(text)
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True)

    outputs = model.generate(**inputs, max_length=100, num_beams=4, early_stopping=True)
    summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return summary


# ⬇️ Optional helper/debug functions
def example_prompt(data):
    return build_summary_prompt(data["cleaned_text"].iloc[0])


def get_top_products(data):
    top_products = data["name"].value_counts().head(3).index.tolist()
    top_product_dfs = [data[data["name"] == prod] for prod in top_products]
    for idx, prod in enumerate(top_products):
        print(f"\nProduct {idx+1}: {prod}")
        print(f"Number of reviews: {len(top_product_dfs[idx])}")
        print("Average rating:", top_product_dfs[idx]["reviews.rating"].mean())
        print("Brands:", top_product_dfs[idx]["brand"].unique())
        print("Primary Categories:", top_product_dfs[idx]["primaryCategories"].unique())
        print("Sample review:", top_product_dfs[idx]["cleaned_text"].iloc[0][:200])


def compare_differences(data):
    top_products = data["name"].value_counts().head(3).index.tolist()
    top_product_dfs = [data[data["name"] == prod] for prod in top_products]
    print("\nDifferences between top products:")
    for i in range(2):
        for j in range(i + 1, 3):
            print(f"\nProduct {i+1} vs Product {j+1}:")
            brand_diff = set(top_product_dfs[i]["brand"].unique()) ^ set(top_product_dfs[j]["brand"].unique())
            cat_diff = set(top_product_dfs[i]["primaryCategories"].unique()) ^ set(top_product_dfs[j]["primaryCategories"].unique())
            print("Brand difference:", brand_diff)
            print("Primary category difference:", cat_diff)
            avg_rating_diff = top_product_dfs[i]["reviews.rating"].mean() - top_product_dfs[j]["reviews.rating"].mean()
            print("Average rating difference:", avg_rating_diff)


def extract_common_complaints(data):
    try:
        stopwords
    except NameError:
        stopwords = set([
            "the", "and", "to", "a", "of", "is", "it", "in", "i", "this", "that", "was", "for", "with",
            "my", "on", "but", "have", "so", "not", "as", "are", "had", "at", "be", "they", "you", "we",
            "all", "if", "just", "or", "me", "very", "from", "by", "an", "has", "were", "would", "when",
            "which", "one", "about", "out", "up", "what", "there", "their", "can", "more", "will", "no",
            "do", "he", "she", "them", "too", "than", "who", "after", "because", "did", "been", "our",
            "also", "could"
        ])

    if "sentiment" in data.columns:
        negative_reviews = data[data["sentiment"] == "Negative"]["cleaned_text"]
    else:
        def map_sentiment(rating):
            if rating in [1, 2]: return "Negative"
            elif rating == 3: return "Neutral"
            elif rating in [4, 5]: return "Positive"
            else: return None
        data["sentiment"] = data["reviews.rating"].apply(map_sentiment)
        negative_reviews = data[data["sentiment"] == "Negative"]["cleaned_text"]

    complaint_words = []
    for review in negative_reviews:
        complaint_words.extend([w for w in review.split() if w not in stopwords])

    common_complaints = Counter(complaint_words).most_common(20)
    print("\nMost common complaint words and example reviews:")
    shown = set()
    for word, count in common_complaints[:10]:
        for review in negative_reviews:
            if word in review.split() and (word, review) not in shown:
                print(f"\nWord: {word} (count: {count})")
                print("Example review:", review)
                shown.add((word, review))
                break


def get_worst_products(data):
    worst_products = (
        data.groupby(["primaryCategories", "name"])["reviews.rating"]
        .mean().reset_index()
        .sort_values(["primaryCategories", "reviews.rating"])
        .groupby("primaryCategories")
        .first()
        .reset_index()
    )
    return worst_products