# mca_comment_analyzer.py
"""Analyze free-text consultation comments for the MCA pipeline.

Combines a DistilBERT sentiment classifier (with keyword-based business
overrides), a T5-small summarizer, and Counter-based keyword extraction,
producing a per-comment DataFrame plus a keyword-frequency table and an
optional word cloud.
"""
import os
import pandas as pd
import torch
from transformers import pipeline
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from collections import Counter
import nltk
import random
from datetime import datetime, timedelta

# -----------------------------
# Configs
# -----------------------------
# Redirect matplotlib/NLTK caches to /tmp (writable in sandboxed deploys).
os.environ["MPLCONFIGDIR"] = "/tmp/.matplotlib"
os.environ["NLTK_DATA"] = "/tmp/nltk_data"

# NLTK Stopwords
nltk.download('stopwords', download_dir="/tmp/nltk_data", quiet=True)
from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))


# -----------------------------
# MCA Comment Analyzer
# -----------------------------
class MCACommentAnalyzer:
    """Classifies, summarizes and extracts keywords from comment text."""

    def __init__(self):
        # HF pipelines use device index 0 for the first GPU, -1 for CPU.
        device = 0 if torch.cuda.is_available() else -1
        print("Using device:", "GPU" if device == 0 else "CPU")
        # Sentiment model
        self.sentiment_model = pipeline(
            "sentiment-analysis",
            model="distilbert-base-uncased-finetuned-sst-2-english",
            device=device
        )
        # Summarizer
        self.summarizer = pipeline(
            "summarization",
            model="t5-small",
            device=device
        )
        self.stop_words = STOPWORDS

    def map_sentiment(self, pred, text):
        """Map a model prediction + raw text to a business label.

        Keyword overrides take strict priority over the model output:
        Negative -> Violation -> Suggestion -> Positive. If no keyword
        matches, fall back to the DistilBERT label, defaulting to
        "Neutral" for anything other than POSITIVE/NEGATIVE.

        Args:
            pred: one prediction dict from the sentiment pipeline
                  (must contain a 'label' key).
            text: the original comment string.

        Returns:
            One of "Negative", "Violation", "Suggestion", "Positive",
            "Neutral".
        """
        text_lower = text.lower()

        # -----------------------------
        # Strict priority: Negative -> Violation -> Suggestion -> Positive
        # -----------------------------
        negative_keywords = ["confusing", "unclear", "bad", "problem"]
        violation_keywords = ["violation", "violates", "illegal", "non-compliant"]
        suggestion_keywords = ["should", "recommend", "suggest", "advise", "better if"]
        positive_keywords = ["clear", "helpful", "good", "appreciate", "support"]

        if any(w in text_lower for w in negative_keywords):
            return "Negative"
        if any(w in text_lower for w in violation_keywords):
            return "Violation"
        if any(w in text_lower for w in suggestion_keywords):
            return "Suggestion"
        if any(w in text_lower for w in positive_keywords):
            return "Positive"

        # fallback to sentiment model
        label = pred['label'].upper()
        if label == "POSITIVE":
            return "Positive"
        elif label == "NEGATIVE":
            return "Negative"
        else:
            return "Neutral"

    def process_comment(self, comment):
        """Analyze a single comment.

        Returns:
            (sentiment, summary_text, keywords, top_keywords) where
            ``keywords`` is the list of unique non-stopword alphabetic
            tokens in first-seen order, and ``top_keywords`` is a
            comma-joined string of the three most frequent of them.
        """
        pred = self.sentiment_model(comment)[0]
        sentiment = self.map_sentiment(pred, comment)

        # Short comments are not worth a T5 round-trip; use the text itself.
        if len(comment.split()) < 10:
            summary_text = " ".join(comment.split()[:10])
        else:
            try:
                summary_text = self.summarizer(
                    comment, max_length=30, min_length=5, do_sample=False
                )[0]['summary_text']
            except Exception:
                # BUGFIX: was a bare `except:` — keep the best-effort
                # fallback, but no longer swallow SystemExit /
                # KeyboardInterrupt.
                summary_text = comment

        words = [w for w in comment.lower().split()
                 if w.isalpha() and w not in self.stop_words]
        word_counts = Counter(words)
        keywords = list(word_counts.keys())
        # BUGFIX: previously this took the first three *distinct* words
        # (Counter preserves insertion order), not the most frequent ones;
        # "Top Keywords" now reflects actual frequency.
        top_keywords = ", ".join(w for w, _ in word_counts.most_common(3))
        return sentiment, summary_text, keywords, top_keywords

    def process_comments(self, comments_list):
        """Analyze a batch of comments.

        Args:
            comments_list: iterable of comment strings.

        Returns:
            (df, keyword_freq): ``df`` has columns Timestamp / Comment /
            Summary / Sentiment / "Top Keywords" sorted by Timestamp;
            ``keyword_freq`` has Keyword / Frequency sorted descending.

        NOTE(review): timestamps are synthetic — each comment gets a
        random day within the last 30 days (demo data, not real arrival
        times).
        """
        sentiments, summaries, all_keywords, top_keywords_list, timestamps = [], [], [], [], []
        start_date = datetime.now() - timedelta(days=30)

        for comment in comments_list:
            sentiment, summary, keywords, top_kw = self.process_comment(comment)
            sentiments.append(sentiment)
            summaries.append(summary)
            all_keywords.extend(keywords)
            top_keywords_list.append(top_kw)
            timestamps.append(start_date + timedelta(days=random.randint(0, 30)))

        df = pd.DataFrame({
            "Timestamp": timestamps,
            "Comment": comments_list,
            "Summary": summaries,
            "Sentiment": sentiments,
            "Top Keywords": top_keywords_list
        })
        df.sort_values(by='Timestamp', inplace=True, ascending=True)

        # Frequency here is per-comment (document frequency): each comment
        # contributes its unique keywords once.
        keyword_freq = pd.DataFrame(
            Counter(all_keywords).items(),
            columns=['Keyword', 'Frequency']
        ).sort_values(by='Frequency', ascending=False)

        return df, keyword_freq

    def generate_wordcloud(self, keyword_freq, filename=None):
        """Render a word cloud from a Keyword/Frequency DataFrame.

        Args:
            keyword_freq: DataFrame with 'Keyword' and 'Frequency' columns.
            filename: optional path; when given, the figure is saved there.

        Returns:
            The ``matplotlib.pyplot`` module, with the figure drawn, so the
            caller can show or further customize it.
        """
        wc_dict = dict(zip(keyword_freq['Keyword'], keyword_freq['Frequency']))
        wc = WordCloud(
            width=800, height=400, background_color="white"
        ).generate_from_frequencies(wc_dict)
        plt.figure(figsize=(10, 5))
        plt.imshow(wc, interpolation="bilinear")
        plt.axis("off")
        if filename:
            plt.savefig(filename, bbox_inches='tight')
        return plt