Spaces:

Julseb42
/

nlp-project

Sleeping

File size: 5,217 Bytes

3ccf31a
 
cc63aca
3ccf31a
cc63aca
 
3ccf31a
 
 
cc63aca
 
 
3ccf31a
 
 
 
 
cc63aca
 
 
 
 
 
 
 
3ccf31a
cc63aca
 
3ccf31a
cc63aca
3ccf31a
cc63aca
 
 
 
 
 
 
3ccf31a
 
 
 
cc63aca
 
 
3ccf31a
 
 
 
cc63aca
 
3ccf31a
 
cc63aca
3ccf31a
 
 
 
 
 
 
cc63aca
 
 
 
 
 
 
 
3ccf31a
cc63aca
3ccf31a
cc63aca
 
 
 
3ccf31a
 
cc63aca
3ccf31a
cc63aca
 
3ccf31a
 
cc63aca
3ccf31a
 
cc63aca
 
 
 
 
3ccf31a
 
 
cc63aca
3ccf31a
 
 
 
cc63aca
3ccf31a
cc63aca
3ccf31a
 
 
 
 
 
 
cc63aca
3ccf31a
cc63aca
3ccf31a
cc63aca
 
3ccf31a
cc63aca
3ccf31a
cc63aca

from collections import Counter
from IPython.display import display
from transformers import BartForConditionalGeneration, BartTokenizer, pipeline

# Load the tokenizer and model for the checkpoint named below. These calls
# run at import time, so importing this module triggers the model load.
bart_model_name = "sshleifer/distilbart-cnn-12-6"
bart_tokenizer = BartTokenizer.from_pretrained(bart_model_name)
bart_model = BartForConditionalGeneration.from_pretrained(bart_model_name)

# Wrap the already-loaded model and tokenizer in a summarization pipeline;
# summarize_with_bart() below calls this object.
bart_summarizer = pipeline("summarization", model=bart_model, tokenizer=bart_tokenizer, framework="pt")


def build_bart_summary_prompt(text):
    """Return the summarization prompt: a fixed instruction followed by *text*."""
    instruction = "Summarize this review: "
    return "{}{}".format(instruction, text)


def summarize_with_bart(text: str) -> str:
    """Summarize a review with the module-level DistilBART pipeline.

    Args:
        text: Review text to condense.

    Returns:
        The generated summary (requested with ``min_length=15`` and
        ``max_length=60``), or an empty string when *text* is not a string
        or contains only whitespace.
    """
    # Blank or non-string input (e.g. a NaN from an empty DataFrame cell) has
    # nothing to summarize; skip the model call instead of failing inside it.
    if not isinstance(text, str) or not text.strip():
        return ""

    # truncation=True clips reviews longer than the model's maximum input
    # length rather than letting the forward pass fail on them.
    result = bart_summarizer(
        text,
        max_length=60,
        min_length=15,
        do_sample=False,
        truncation=True,
    )
    return result[0]["summary_text"]


def get_top_products(data):
    """Print an overview of the three most frequently reviewed products.

    Products are ranked by how often their ``name`` occurs in *data*. For
    each one this prints the review count, the mean ``reviews.rating``, the
    distinct brands and primary categories, and a BART summary of the
    product's first ``cleaned_text`` entry.
    """
    most_reviewed = data["name"].value_counts().head(3).index.tolist()

    for rank, product in enumerate(most_reviewed, start=1):
        rows = data[data["name"] == product]

        print(f"\nProduct {rank}: {product}")
        print(f"Number of reviews: {len(rows)}")
        print("Average rating:", rows["reviews.rating"].mean())
        print("Brands:", rows["brand"].unique())
        print("Primary Categories:", rows["primaryCategories"].unique())

        # Only the first review of the product is summarized.
        first_review = rows["cleaned_text"].iloc[0]
        print("BART summary:", summarize_with_bart(first_review))


def compare_differences(data, top_n=3):
    """Print pairwise differences between the most frequently reviewed products.

    Products are ranked by how often their ``name`` occurs in *data*. For
    every pair among the top *top_n* this prints the symmetric difference of
    their brands, the symmetric difference of their primary categories, and
    the difference of their mean ``reviews.rating`` (first minus second).

    Args:
        data: DataFrame with ``name``, ``brand``, ``primaryCategories`` and
            ``reviews.rating`` columns.
        top_n: How many of the most-reviewed products to compare.

    When *data* holds fewer than *top_n* distinct product names, only the
    products that exist are compared; with fewer than two, only the header
    line is printed.
    """
    from itertools import combinations

    top_products = data["name"].value_counts().head(top_n).index.tolist()

    # Gather each product's brands, categories and mean rating once, so the
    # pair loop below does no repeated filtering or aggregation.
    profiles = []
    for prod in top_products:
        rows = data[data["name"] == prod]
        profiles.append((
            set(rows["brand"].unique()),
            set(rows["primaryCategories"].unique()),
            rows["reviews.rating"].mean(),
        ))

    print("\nDifferences between top products:")
    # Pair up whatever products actually exist; fixed index ranges would
    # raise IndexError when there are fewer products than expected.
    for (i, first), (j, second) in combinations(enumerate(profiles, start=1), 2):
        first_brands, first_cats, first_mean = first
        second_brands, second_cats, second_mean = second

        print(f"\nProduct {i} vs Product {j}:")
        print("Brand difference:", first_brands ^ second_brands)
        print("Primary category difference:", first_cats ^ second_cats)
        print("Average rating difference:", first_mean - second_mean)


# Filler words ignored when counting complaint vocabulary.
_COMPLAINT_STOPWORDS = frozenset({
    "the", "and", "to", "a", "of", "is", "it", "in", "i", "this", "that", "was", "for", "with",
    "my", "on", "but", "have", "so", "not", "as", "are", "had", "at", "be", "they", "you", "we",
    "all", "if", "just", "or", "me", "very", "from", "by", "an", "has", "were", "would", "when",
    "which", "one", "about", "out", "up", "what", "there", "their", "can", "more", "will", "no",
    "do", "he", "she", "them", "too", "than", "who", "after", "because", "did", "been", "our",
    "also", "could",
})


def extract_common_complaints(data):
    """Print BART summaries of negative reviews for the top complaint words.

    Counts the non-stopword tokens (whitespace-split) of every negative
    review's ``cleaned_text``, takes the ten most frequent words, and for
    each word prints the first negative review containing it together with
    that review's BART summary.

    Args:
        data: DataFrame with ``reviews.rating`` and ``cleaned_text`` columns
            and, optionally, a ``sentiment`` column.

    Side effects:
        When *data* has no ``sentiment`` column, one is added in place:
        ratings 1-2 map to "Negative", 3 to "Neutral", 4-5 to "Positive",
        and anything else to ``None``.
    """
    if "sentiment" not in data.columns:
        def map_sentiment(rating):
            if rating in (1, 2):
                return "Negative"
            if rating == 3:
                return "Neutral"
            if rating in (4, 5):
                return "Positive"
            return None

        # Written onto the caller's frame on purpose: get_worst_products()
        # reads this column from the same DataFrame.
        data["sentiment"] = data["reviews.rating"].apply(map_sentiment)

    # Rows with a missing review text have nothing to tokenize or summarize.
    negative_reviews = data.loc[data["sentiment"] == "Negative", "cleaned_text"].dropna()

    # Tokenize each review once, counting complaint words and remembering
    # the first review in which every word appears.
    word_counts = Counter()
    first_review_with = {}
    for review in negative_reviews:
        for token in review.split():
            first_review_with.setdefault(token, review)
            if token not in _COMPLAINT_STOPWORDS:
                word_counts[token] += 1

    print("BART summaries of reviews containing common complaint words:")
    # One review may be the example for several words; summarize it only
    # once, since every model call is expensive.
    summaries = {}
    for word, count in word_counts.most_common(10):
        review = first_review_with[word]
        if review not in summaries:
            summaries[review] = summarize_with_bart(review)
        print(f"\nWord: {word} (count: {count})")
        print("Original review:", review)
        print("BART summary:", summaries[review])


def get_worst_products(data):
    """Show the lowest-rated product per primary category with a sample complaint.

    Averages ``reviews.rating`` per (``primaryCategories``, ``name``) pair,
    keeps the product with the lowest mean in each category, displays that
    table, and then prints a BART summary of one negative review for each
    of those products that has any.

    Args:
        data: DataFrame with ``primaryCategories``, ``name``,
            ``reviews.rating`` and ``cleaned_text`` columns. A ``sentiment``
            column is used when present; otherwise ratings of 1 or 2 count
            as negative. *data* is not modified.
    """
    worst = (
        data.groupby(["primaryCategories", "name"])["reviews.rating"]
        .mean().reset_index()
        # Ascending sort puts each category's lowest mean first, so
        # first() below picks the worst product per category.
        .sort_values(["primaryCategories", "reviews.rating"])
        .groupby("primaryCategories")
        .first()
        .reset_index()
    )

    print("Worst product by primary category (BART):")
    display(worst[["primaryCategories", "name", "reviews.rating"]])

    # The sentiment column only exists if something added it beforehand;
    # fall back to the rating so this function also works on its own
    # instead of raising KeyError.
    if "sentiment" in data.columns:
        is_negative = data["sentiment"] == "Negative"
    else:
        is_negative = data["reviews.rating"].isin([1, 2])

    for _, row in worst.iterrows():
        is_product = (
            (data["primaryCategories"] == row["primaryCategories"]) &
            (data["name"] == row["name"])
        )
        # Skip rows whose review text is missing; there is nothing to summarize.
        neg_reviews = data.loc[is_product & is_negative, "cleaned_text"].dropna()
        if neg_reviews.empty:
            continue

        summary = summarize_with_bart(neg_reviews.iloc[0])
        print(f"\nCategory: {row['primaryCategories']}")
        print(f"Product: {row['name']}")
        print("BART summary of a negative review:", summary)