Spaces:
Sleeping
Sleeping
File size: 5,217 Bytes
3ccf31a cc63aca 3ccf31a cc63aca 3ccf31a cc63aca 3ccf31a cc63aca 3ccf31a cc63aca 3ccf31a cc63aca 3ccf31a cc63aca 3ccf31a cc63aca 3ccf31a cc63aca 3ccf31a cc63aca 3ccf31a cc63aca 3ccf31a cc63aca 3ccf31a cc63aca 3ccf31a cc63aca 3ccf31a cc63aca 3ccf31a cc63aca 3ccf31a cc63aca 3ccf31a cc63aca 3ccf31a cc63aca 3ccf31a cc63aca 3ccf31a cc63aca 3ccf31a cc63aca 3ccf31a cc63aca 3ccf31a cc63aca 3ccf31a cc63aca | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 | from collections import Counter
from IPython.display import display
from transformers import BartForConditionalGeneration, BartTokenizer, pipeline
# ✅ Load model + tokenizer directly
bart_model_name = "sshleifer/distilbart-cnn-12-6"
bart_tokenizer = BartTokenizer.from_pretrained(bart_model_name)
bart_model = BartForConditionalGeneration.from_pretrained(bart_model_name)
# ✅ Load pipeline separately for summary function
bart_summarizer = pipeline("summarization", model=bart_model, tokenizer=bart_tokenizer, framework="pt")
def build_bart_summary_prompt(text):
return f"Summarize this review: {text}"
def summarize_with_bart(text: str) -> str:
"""
Takes long review text and returns a short summary using DistilBART.
"""
result = bart_summarizer(text, max_length=60, min_length=15, do_sample=False)
return result[0]["summary_text"]
def get_top_products(data):
top_products = data["name"].value_counts().head(3).index.tolist()
top_product_dfs = [data[data["name"] == prod] for prod in top_products]
for idx, prod in enumerate(top_products):
print(f"\nProduct {idx+1}: {prod}")
print(f"Number of reviews: {len(top_product_dfs[idx])}")
print("Average rating:", top_product_dfs[idx]["reviews.rating"].mean())
print("Brands:", top_product_dfs[idx]["brand"].unique())
print("Primary Categories:", top_product_dfs[idx]["primaryCategories"].unique())
text = top_product_dfs[idx]["cleaned_text"].iloc[0]
summary = summarize_with_bart(text)
print("BART summary:", summary)
def compare_differences(data):
top_products = data["name"].value_counts().head(3).index.tolist()
top_product_dfs = [data[data["name"] == prod] for prod in top_products]
print("\nDifferences between top products:")
for i in range(2):
for j in range(i + 1, 3):
print(f"\nProduct {i+1} vs Product {j+1}:")
brand_diff = set(top_product_dfs[i]["brand"].unique()) ^ set(top_product_dfs[j]["brand"].unique())
cat_diff = set(top_product_dfs[i]["primaryCategories"].unique()) ^ set(top_product_dfs[j]["primaryCategories"].unique())
print("Brand difference:", brand_diff)
print("Primary category difference:", cat_diff)
avg_rating_diff = top_product_dfs[i]["reviews.rating"].mean() - top_product_dfs[j]["reviews.rating"].mean()
print("Average rating difference:", avg_rating_diff)
def extract_common_complaints(data):
try:
stopwords
except NameError:
stopwords = set([
"the", "and", "to", "a", "of", "is", "it", "in", "i", "this", "that", "was", "for", "with",
"my", "on", "but", "have", "so", "not", "as", "are", "had", "at", "be", "they", "you", "we",
"all", "if", "just", "or", "me", "very", "from", "by", "an", "has", "were", "would", "when",
"which", "one", "about", "out", "up", "what", "there", "their", "can", "more", "will", "no",
"do", "he", "she", "them", "too", "than", "who", "after", "because", "did", "been", "our",
"also", "could"
])
if "sentiment" not in data.columns:
def map_sentiment(rating):
if rating in [1, 2]: return "Negative"
elif rating == 3: return "Neutral"
elif rating in [4, 5]: return "Positive"
else: return None
data["sentiment"] = data["reviews.rating"].apply(map_sentiment)
negative_reviews = data[data["sentiment"] == "Negative"]["cleaned_text"]
complaint_words = []
for review in negative_reviews:
complaint_words.extend([w for w in review.split() if w not in stopwords])
common = Counter(complaint_words).most_common(20)
print("BART summaries of reviews containing common complaint words:")
shown = set()
for word, count in common[:10]:
for review in negative_reviews:
if word in review.split() and (word, review) not in shown:
summary = summarize_with_bart(review)
print(f"\nWord: {word} (count: {count})")
print("Original review:", review)
print("BART summary:", summary)
shown.add((word, review))
break
def get_worst_products(data):
worst = (
data.groupby(["primaryCategories", "name"])["reviews.rating"]
.mean().reset_index()
.sort_values(["primaryCategories", "reviews.rating"])
.groupby("primaryCategories")
.first()
.reset_index()
)
print("Worst product by primary category (BART):")
display(worst[["primaryCategories", "name", "reviews.rating"]])
for _, row in worst.iterrows():
prod_reviews = data[
(data["primaryCategories"] == row["primaryCategories"]) &
(data["name"] == row["name"])
]
neg_review = prod_reviews[prod_reviews["sentiment"] == "Negative"]["cleaned_text"]
if not neg_review.empty:
summary = summarize_with_bart(neg_review.iloc[0])
print(f"\nCategory: {row['primaryCategories']}")
print(f"Product: {row['name']}")
print("BART summary of a negative review:", summary)
|