Spaces:
Sleeping
Sleeping
| # mca_comment_analyzer.py | |
| import os | |
| import pandas as pd | |
| import torch | |
| from transformers import pipeline | |
| from wordcloud import WordCloud | |
| import matplotlib.pyplot as plt | |
| from collections import Counter | |
| import nltk | |
| import random | |
| from datetime import datetime, timedelta | |
# -----------------------------
# Configs
# -----------------------------
# NOTE(review): matplotlib and nltk are both imported before these variables
# are set, and both appear to read them at import time -- confirm whether
# these assignments need to move ahead of the imports to take effect.
os.environ["MPLCONFIGDIR"] = "/tmp/.matplotlib"
os.environ["NLTK_DATA"] = "/tmp/nltk_data"

# NLTK Stopwords
# Register the download directory on nltk's search path explicitly, so the
# corpus lookup below finds the data even though NLTK_DATA was set after
# nltk had already been imported.
if "/tmp/nltk_data" not in nltk.data.path:
    nltk.data.path.append("/tmp/nltk_data")
nltk.download('stopwords', download_dir="/tmp/nltk_data", quiet=True)
from nltk.corpus import stopwords

# English stop words, excluded from keyword extraction.
STOPWORDS = set(stopwords.words('english'))
| # ----------------------------- | |
| # MCA Comment Analyzer | |
| # ----------------------------- | |
class MCACommentAnalyzer:
    """Tag comments with a sentiment label, a short summary and keywords.

    Labels come from keyword rules first and fall back to a Hugging Face
    sentiment pipeline; summaries of longer comments come from a
    ``t5-small`` summarization pipeline.
    """

    # (label, trigger substrings) pairs, tested in order; the first hit wins.
    # Strict priority: Negative -> Violation -> Suggestion -> Positive.
    # Matching is by substring, so e.g. "unclear" also contains "clear";
    # that is why the Negative rule has to be tested before the Positive one.
    _KEYWORD_RULES = (
        ("Negative", ("confusing", "unclear", "bad", "problem")),
        ("Violation", ("violation", "violates", "illegal", "non-compliant")),
        ("Suggestion", ("should", "recommend", "suggest", "advise", "better if")),
        ("Positive", ("clear", "helpful", "good", "appreciate", "support")),
    )

    def __init__(self):
        """Load both pipelines on the GPU when CUDA is available, else CPU."""
        device = 0 if torch.cuda.is_available() else -1
        print("Using device:", "GPU" if device == 0 else "CPU")

        # Sentiment model
        self.sentiment_model = pipeline(
            "sentiment-analysis",
            model="distilbert-base-uncased-finetuned-sst-2-english",
            device=device
        )

        # Summarizer
        self.summarizer = pipeline(
            "summarization",
            model="t5-small",
            device=device
        )

        self.stop_words = STOPWORDS

    def map_sentiment(self, pred, text):
        """Return the label for *text*.

        Keyword rules are applied first (case-insensitive substring match,
        in the priority order of ``_KEYWORD_RULES``). When no rule matches,
        the model prediction decides.

        Args:
            pred: Mapping with a ``'label'`` entry from the sentiment model.
            text: The raw comment.

        Returns:
            One of ``"Negative"``, ``"Violation"``, ``"Suggestion"``,
            ``"Positive"`` or ``"Neutral"``.
        """
        text_lower = text.lower()
        for label, triggers in self._KEYWORD_RULES:
            if any(trigger in text_lower for trigger in triggers):
                return label

        # Fallback to the sentiment model.
        model_label = pred['label'].upper()
        if model_label == "POSITIVE":
            return "Positive"
        if model_label == "NEGATIVE":
            return "Negative"
        return "Neutral"

    def process_comment(self, comment):
        """Analyse a single comment.

        Args:
            comment: The comment text.

        Returns:
            A tuple ``(sentiment, summary_text, keywords, top_keywords)``:
            the label from ``map_sentiment``; the summary (the
            whitespace-normalised comment itself when it has fewer than ten
            words); the distinct non-stop-word keywords ordered by
            descending frequency, ties in order of first appearance; and
            the first three of those joined with ``", "``.
        """
        import string  # function-scope import: only needed for punctuation stripping

        # truncation=True keeps comments longer than the model's maximum
        # sequence length from raising inside the pipeline.
        pred = self.sentiment_model(comment, truncation=True)[0]
        sentiment = self.map_sentiment(pred, comment)

        tokens = comment.split()
        if len(tokens) < 10:
            # Too short to be worth summarizing.
            summary_text = " ".join(tokens)
        else:
            try:
                summary_text = self.summarizer(
                    comment, max_length=30, min_length=5, do_sample=False
                )[0]['summary_text']
            except Exception:
                # Best effort: keep the full comment when summarization
                # fails, without swallowing KeyboardInterrupt/SystemExit.
                summary_text = comment

        # Strip surrounding punctuation first so that a word such as
        # "confusing." is still counted instead of failing isalpha().
        stripped = (tok.strip(string.punctuation) for tok in comment.lower().split())
        words = [w for w in stripped if w.isalpha() and w not in self.stop_words]

        # most_common() is a stable sort, so equally frequent words keep
        # their order of first appearance.
        keywords = [word for word, _ in Counter(words).most_common()]
        top_keywords = ", ".join(keywords[:3])
        return sentiment, summary_text, keywords, top_keywords

    def process_comments(self, comments_list):
        """Analyse every comment and collect the results in two tables.

        Args:
            comments_list: Iterable of comment strings.

        Returns:
            A tuple ``(df, keyword_freq)``. ``df`` has one row per comment
            with the columns ``Timestamp``, ``Comment``, ``Summary``,
            ``Sentiment`` and ``Top Keywords``, sorted by ``Timestamp``.
            ``keyword_freq`` has the columns ``Keyword`` and ``Frequency``
            (the number of comments containing the keyword), sorted by
            descending frequency.
        """
        # Materialise once: the input is iterated here and reused as a
        # DataFrame column below, which a one-shot iterator cannot support.
        comments = list(comments_list)

        sentiments, summaries, all_keywords, top_keywords_list, timestamps = [], [], [], [], []
        start_date = datetime.now() - timedelta(days=30)

        for comment in comments:
            sentiment, summary, keywords, top_kw = self.process_comment(comment)
            sentiments.append(sentiment)
            summaries.append(summary)
            all_keywords.extend(keywords)
            top_keywords_list.append(top_kw)
            # Timestamps are synthetic: a random day within the last 30 days.
            timestamps.append(start_date + timedelta(days=random.randint(0, 30)))

        df = pd.DataFrame({
            "Timestamp": timestamps,
            "Comment": comments,
            "Summary": summaries,
            "Sentiment": sentiments,
            "Top Keywords": top_keywords_list
        })
        df.sort_values(by='Timestamp', inplace=True, ascending=True)

        keyword_freq = pd.DataFrame(
            list(Counter(all_keywords).items()),
            columns=['Keyword', 'Frequency']
        ).sort_values(by='Frequency', ascending=False)

        return df, keyword_freq

    def generate_wordcloud(self, keyword_freq, filename=None):
        """Draw a word cloud of the keyword frequencies on a new figure.

        Args:
            keyword_freq: Table with ``Keyword`` and ``Frequency`` columns.
            filename: Optional path; when given the figure is also saved.

        Returns:
            The ``matplotlib.pyplot`` module, with the new figure current.
        """
        wc_dict = dict(zip(keyword_freq['Keyword'], keyword_freq['Frequency']))

        plt.figure(figsize=(10, 5))
        # WordCloud refuses an empty frequency table; leave the figure blank
        # in that case rather than raising.
        if wc_dict:
            wc = WordCloud(
                width=800, height=400, background_color="white"
            ).generate_from_frequencies(wc_dict)
            plt.imshow(wc, interpolation="bilinear")
        plt.axis("off")

        if filename:
            plt.savefig(filename, bbox_inches='tight')
        return plt